diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000000..93aeded4aa9 --- /dev/null +++ b/.clang-format @@ -0,0 +1,135 @@ +# C +BasedOnStyle: LLVM +AlignEscapedNewlines: Indent +AlignConsecutiveAssignments: true +AlignConsecutiveDeclarations: false +AlignConsecutiveStructMembers: true +AlignConsecutiveMacros: true +AlignDeclarationByPointer: true +AlignAfterOpenBracket: true +AlignOperands: true +PointerAlignment: Right +DerivePointerAlignment: false +AlignTrailingComments: false +AllowAllArgumentsOnNextLine: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: false +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AllowShortEnumsOnASingleLine: false +AllowDesignatedInitializersOnASingleLine: false +AlwaysBreakAfterReturnType: None +PenaltyReturnTypeOnItsOwnLine: 20 +PenaltyBreakAssignment: 100 +PenaltyExcessCharacter: 100 +PenaltyBreakBeforeFirstCallParameter: 100 +PenaltyBreakMemberAccess: 250 +PenaltyBreakLastMemberAccess: 300 +PenaltyIndentedWhitespace: 0 +ColumnLimit: 80 +AlwaysBreakBeforeMultilineStrings: false +BinPackArguments: true +BinPackParameters: true +BreakBeforeBraces: Custom +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: true + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: false +BreakBeforeTernaryOperators: false +BreakStringLiterals: true +ContinuationIndentWidth: 8 +IncludeBlocks: Regroup +IndentCaseLabels: false +IndentWidth: 4 +KeepEmptyLinesAtTheStartOfBlocks: false +IndentPPDirectives: None +MaxEmptyLinesToKeep: 2 +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceInEmptyParentheses: false +SpaceBeforeParens: ControlStatementsExceptForEachMacros +SpaceBeforeAssignmentOperators: true +SpaceAfterCStyleCast: false +SortIncludes: false +ForEachMacros: ['_UCS_BITMAP_FOR_EACH_WORD', + 'FOR_EACH_ENTITY', + 'kh_foreach', + 'kh_foreach_key', + 'kh_foreach_value', + 'ucp_unpacked_address_for_each', + 'ucs_array_for_each', + 'UCS_BITMAP_FOR_EACH_BIT', + 'ucs_for_each_bit', + 'ucs_for_each_submask', + 'ucs_hlist_for_each', + 'ucs_hlist_for_each_extract', + 'ucs_hlist_for_each_extract_if', + 'ucs_list_for_each', + 'ucs_list_for_each_safe', + 'ucs_memory_type_for_each', + 'UCS_PP_FOREACH', + 'UCS_PP_FOREACH_SEP', + 'ucs_profile_for_each_location', + 'ucs_ptr_array_for_each', + 'ucs_ptr_array_locked_for_each', + 'ucs_queue_for_each', + 'ucs_queue_for_each_extract', + 'ucs_queue_for_each_safe', + 'ucs_timerq_for_each_expired', + 'UCT_IB_IFACE_VERBS_FOREACH_RXWQE', + 'UCT_RC_VERBS_IFACE_FOREACH_TXWQE', + 'UCS_INIT_ONCE', + 'UCS_TEST_F', + 'UCX_PERF_TEST_FOREACH'] +StatementMacros : [] +TypenameMacros: ['khash_t', 'ucs_array_t'] +WhitespaceSensitiveMacros: [] + +# CPP +Standard: Cpp11 +AccessModifierOffset: -4 +AlwaysBreakTemplateDeclarations: false +BreakBeforeInheritanceComma: false +BreakInheritanceList: AfterColon +BreakConstructorInitializers: AfterColon +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +Cpp11BracedListStyle: true +Cpp11BracedListLineBreak: true +FixNamespaceComments: true 
+NamespaceIndentation: None +UseTab: Never +ReflowComments: false +SortIncludes: false +IncludeCategories: + - Regex: '^"' + Priority: 1 + - Regex: '^<' + Priority: 2 +SortUsingDeclarations: true +TabWidth: 4 +SpacesInAngles: false +SpacesBeforeTrailingComments: 1 +SpaceAfterTemplateKeyword: false +SpacesInContainerLiterals: false +--- +# Java +Language: Java +DisableFormat: true diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index a1bcd11741f..7af5d16c0d9 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -18,6 +18,7 @@ A clear and concise description of what the bug is. ### Setup and versions - OS version (e.g Linux distro) + CPU architecture (x86_64/aarch64/ppc64le/...) - `cat /etc/issue` or `cat /etc/redhat-release` + `uname -a` + - For Nvidia Bluefield SmartNIC include `cat /etc/mlnx-release` (the string identifies software and firmware setup) - For RDMA/IB/RoCE related issues: - Driver version: - `rpm -q rdma-core` or `rpm -q libibverbs` diff --git a/.gitignore b/.gitignore index 0ffccf4851a..c712c60b0b4 100644 --- a/.gitignore +++ b/.gitignore @@ -88,3 +88,8 @@ GTAGS *.swp compile_commands.json .idea/ +.externalToolBuilders +.classpath +.vscode +src/tools/vfs/ucx_vfs +test/apps/test_init_mt diff --git a/Makefile.am b/Makefile.am index 64364aa4fbf..073ead37243 100644 --- a/Makefile.am +++ b/Makefile.am @@ -13,7 +13,12 @@ EXTRA_DIST = ACLOCAL_AMFLAGS = -I config/m4 -noinst_HEADERS = src/uct/api/uct.h src/uct/api/uct_def.h src/uct/api/tl.h +noinst_HEADERS = \ + src/uct/api/uct.h \ + src/uct/api/v2/uct_v2.h \ + src/uct/api/uct_def.h \ + src/uct/api/tl.h + doxygen_doc_files = $(noinst_HEADERS) doc_dir = $(pkgdatadir)/doc @@ -37,6 +42,7 @@ SUBDIRS += $(UCG_SUBDIR) endif SUBDIRS += \ + src/tools/vfs \ src/tools/info \ src/tools/perf \ src/tools/profile \ diff --git a/NEWS b/NEWS index 720173b1e94..d6f490cceb2 100644 --- a/NEWS +++ b/NEWS @@ -1,17 +1,186 @@ # -## Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. +## Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. ## Copyright (C) UT-Battelle, LLC. 2014-2019. ALL RIGHTS RESERVED. -## Copyright (C) ARM Ltd. 2017-2020. ALL RIGHTS RESERVED. +## Copyright (C) ARM Ltd. 2017-2021. ALL RIGHTS RESERVED. ## ## See file LICENSE for terms. 
 ##
 #
 ## Current
-### Features: TBD
-#### UCX Core TBD
-#### UCX Java (API Preview) TBD
-### Bugfixes: TBD
+### Features:
+#### UCP
+* Added API for querying UCP library attributes
+### Bugfixes:
+
+## 1.10.0 (March 9, 2021)
+### Features:
+#### Core
+* Added support for Nvidia HPC SDK
+* Added support for latest PGI and Clang
+* Added support for ROCM-3.7+ (a warning is generated if an older version is detected)
+* Added support for GCC11
+#### Architecture
+* Added Arm SVE memcpy()
+* Redesigned Arm WFE support
+* Improved clear_cache performance for Arm
+* Added architecture detection for Zhaoxin CPU
+#### CI
+* Added release builds on CUDA 11
+* Enabled performance validation in gtest
+* Added new OS for release CI
+#### UCP
+* Added locality awareness to the transport selection logic for GPU devices
+* Added put/offload/short and put/offload/zcopy protocols
+* Added receive message nbx routine
+* Reworked AM implementation and API, which adds support for RNDV semantics
+* Added support for multi-lane connection manager over TCP
+* Added support for printing AM tls with info log level
+* Implemented flush and destroy for UCT EPs on UCP worker
+* Reduced UCP request size
+* Added support for keepalive protocol
+* Added support for multi-fragment protocol
+* Added implementation of protocol progress for eager, bcopy, and multicopy
+* Improved protocol selection logic
+* Added new protocols for UCP get operation
+* Added bcopy protocols with support for GPU memory
+* Added RNDV protocol implementation for GPU devices (CUDA, ROCm)
+* Set SOCKADDR_CM_ENABLE=y by default
+* Added support for fast-path short with new tag protocols
+* Added a new parameter to control the CM listener's backlog
+* Added support for sending AM RTS over short message protocol
+* Added support for shared memory multi-lane when CM is used
+* Added missing async locks
+#### UCT
+* Added API for keepalive_timeout value
+* Added uct_completion.status
+* Allowed transports to access multiple mem_types
+* Removed status arg from uct_completion_callback_t
+* Restructured uct_mem_alloc/uct_md_mem_alloc to use mem_type
+* Updated documentation for uct_listener_params
+* Lowered the log level for certain network errors
+* Added cuda_copy wakeup feature
+* Added wakeup support for shared memory
+#### UCS
+* Added "inf" and "auto" values to time units
+* Added on-stack constructors for array and string buffer
+* Added ucs_ptr_map_t data structure
+* Added bool CSWAP
+* Improved logging
+* Added optimization for namespace processing
+* Fixes for connection matching functionality
+#### CUDA
+* Added support for global IPC cache
+#### RDMA CORE (IB, ROCE, etc.)
+* Added support for auto-detection of adaptive routing settings
+* Added an option to poll TX CQ every progress iteration
+* Added local and remote addresses to the reject error message
+* Added support for UAR allocation with non-cacheable memory type
+* Added support for multiple flush cancel without completion
+* Added async events callback support
+* Added detection for ConnectX-6, ConnectX-7 and BlueField-1/2 devices
+* Added support for connection matching for UD
+* Added a check for AM ordering
+* Added better support for non-4K MTU values
+#### Java (preview)
+* Added support for a different javadoc executable path for different Java versions
+* Added UCS memory type constants
+* Added support for building on Java 10+
+* Added support for io-vector datatype
+* Removed libjucx from packages
+#### Tests
+* Added CI for CUDA 11
+* Added test_ucp_sockaddr_protocols.stream_short
+* Reimplemented tests using NBX API
+* Added flush(cancel) test
+* Added memory_wait mode to perftest
+* Added support for clang 10
+* Refactored RMA and atomic tests, added memtype support
+* Added test for uct_md_mem_query()
+* Added request interrupt support
+* Added support for connection manager fallbacks
+* Added new ucp request test checking for leaks from the ptr_map
+#### Documentation
+* Added glossaries
+
+### Bugfixes:
+#### Portability
+* Fixes in print functions to use format strings like PRIx64, etc.
+* Fixes for Arm v8 cross-compilation support
+#### Continuous Integration:
+* Fixes in Github release flow
+* Fixes in docker image
+#### Packaging
+* Removed deb package dependencies
+* Fixes in SPEC to make the RPM relocatable
+#### Documentation
+* Fixes in documentation for ucp_am_recv_data_nbx
+* Fixes in quick start example
+* Fixes in installation instructions
+* Fixes and updates in author list
+#### Tests
+* Fixes for failures under valgrind runtime
+* Fixes in mmap tests for 0-length RMA
+* Fixes in definition of LAST_WQE wait timeout
+* Fixes in ROCm for mem_buffer test
+* Fixes in test name printing format
+* Fixes in tcp_sockcm test
+#### UCP
+* Fixes in worker cleanup flow
+* Fixes in RNDV RTS flow
+* Fix in length check condition for RMA PUT short
+* Fixes in handling failures from AM Bcopy
+* Fix in a release flow of deferred data
+* Fixes for invalid ID and handling of status in RNDV
+* Fixes in short active message reply protocol
+#### CUDA
+* Fixes in managed memory support
+* Fixes in topology detection
+#### RDMA CORE (IB, ROCE, etc.)
+* Fixes in assert definitions
+* Fixes in printing an error about invalid AM Bcopy length for UD
+* Fixes for thread safety support
+* Fixes to get ROCE device name according to GID
+* Fixes for SL selection
+* Fixes in create STRICT_ORDER key
+* Fixes addressing performance degradation in UD transport due to excess async events
+* Fixes in QP destroy
+* Fixes for CQ creation failure using old Verbs API
+#### UGNI
+* Fixes in disable logic in config
+* Fixes for clang 11 warnings
+#### Java
+* Fixes in build dependencies
+* Fixes in constructing UcpRequest object on error
+* Fixes in exception handling on endpoint closure request
+* Fixes for segfault in UcpErrorHandler
+#### UCP
+* Fixes in datatype support for get_zcopy RNDV
+* Fixes in connection manager disconnect
+* Fixes in assert definitions
+* Fixes in completion flow for failed EP
+* Fixes in flush error handling flow
+* Fixes in latency calculations for wireup protocol
+* Fixes in offload completion with inlined data
+* Fixes in unpacking flow
+* Fixes in error handling for various protocols
+#### UCT
+* Fixes in flush TX
+* Fixes in checks for enabling GPU Direct RDMA
+#### UCS
+* Fixes for crashes on incorrect value set in config
+* Fixes in ptr_array
+* Fixes in maximal size for ucs_snprintf_safe()
+* Fixes in compilation warnings
+* Fixes in ucs_aarch64_dsb(_op) definition
+#### TCP
+* Fixes in default route interface confirmation flow
+* Fixes in PUT protocol
+* Fixes in max connection limit and improved error reporting
+#### UCM
+* Fixes for crash on prevent unload
+* Fixes in libucm_rocm
+* Fixes for a few race conditions

 ## 1.9.0 (September 19, 2020)
 ### Features:
diff --git a/README b/README
deleted file mode 100644
index 231ac05a56a..00000000000
--- a/README
+++ /dev/null
@@ -1,184 +0,0 @@
-
-[project badges: follow on Twitter, Documentation Status]
- - * [Unified Communication X](#unified-communication-x) - * [Using UCX](#using-ucx) - * [Building and Running Internal Unit Tests](#building-and-running-internal-unit-tests) - * [UCX Performance Test](#ucx-performance-test) - * [Our Community](#our-community) - * [Licenses](#licenses) - * [Contributor Agreement and Guidelines](#contributor-agreement-and-guidelines) - * [UCX Publications](#ucx-publications) - * [UCX Architecture](#ucx-architecture) - * [Supported Transports](#supported-transports) - * [Supported CPU Architectures](#supported-cpu-architectures) - -
- -# Unified Communication X - -Unified Communication X (UCX) provides an optimized communication -layer for Message Passing ([MPI](https://www.mpi-forum.org/)), -[PGAS](http://www.pgas.org/)/[OpenSHMEM](http://www.openshmem.org/) -libraries and RPC/data-centric applications. - -UCX utilizes high-speed networks for inter-node communication, and -shared memory mechanisms for efficient intra-node communication. - -## Using UCX - -### Release Builds - -Building UCX is typically a combination of running "configure" and "make". -Execute the following commands to install the UCX system from within the -directory at the top of the tree: - -```sh -$ ./autogen.sh -$ ./contrib/configure-release --prefix=/where/to/install -$ make -j8 -$ make install -``` - -NOTE: Compiling support for various networks or other specific hardware may -require additional command line flags when running configure. - -### Developer Builds - -```bash -$ ./autogen.sh -$ ./contrib/configure-devel --prefix=$PWD/install-debug -``` - -*** NOTE: Developer builds of UCX typically include a large performance -penalty at run-time because of extra debugging code. - -### Running internal unit tests - -```sh -$ make -C test/gtest test -``` - -### Build RPM package -```bash -$ contrib/buildrpm.sh -s -b -``` - -### Build DEB package -```bash -$ dpkg-buildpackage -us -uc -``` - -### Build Doxygen documentation -```bash -$ make docs -``` - -### OpenMPI and OpenSHMEM installation with UCX -[Wiki page](http://github.com/openucx/ucx/wiki/OpenMPI-and-OpenSHMEM-installation-with-UCX) - -### MPICH installation with UCX -[Wiki page](http://github.com/openucx/ucx/wiki/MPICH-installation-with-UCX) - -### UCX Performance Test - -Start server: - -```sh -$ ./src/tools/perf/ucx_perftest -c 0 -``` - -Connect client: - -```sh -$ ./src/tools/perf/ucx_perftest -t tag_lat -c 1 -``` -Note: the `-c` flag sets CPU affinity. If running both commands on same host, make sure you set the affinity to different CPU cores. - -## Our Community - -* [Project Website](http://www.openucx.org/) -* [ReadTheDocs](https://openucx.readthedocs.io/en/master/) -* [Github](http://www.github.com/openucx/ucx/) -* [Software Releases](http://www.github.com/openucx/ucx/releases) -* [Mailing List](https://elist.ornl.gov/mailman/listinfo/ucx-group) -* [Twitter](https://twitter.com/openucx) - -## Licenses - -UCX is licensed as: - -* [BSD3](LICENSE) - -## Contributor Agreement and Guidelines - -In order to contribute to UCX, please sign up with an appropriate -[Contributor Agreement](http://www.openucx.org/license/). - -Follow these -[instructions](https://github.com/openucx/ucx/wiki/Guidance-for-contributors) -when submitting contributions and changes. 
- -## UCX Publications - -To reference UCX in a publication, please use the following entry: - -```bibtex -@inproceedings{shamis2015ucx, - title={UCX: an open source framework for HPC network APIs and beyond}, - author={Shamis, Pavel and Venkata, Manjunath Gorentla and Lopez, M Graham and Baker, Matthew B and Hernandez, Oscar and Itigin, Yossi and Dubman, Mike and Shainer, Gilad and Graham, Richard L and Liss, Liran and others}, - booktitle={2015 IEEE 23rd Annual Symposium on High-Performance Interconnects}, - pages={40--43}, - year={2015}, - organization={IEEE} -} -``` - -To reference the UCX website: - -```bibtex -@misc{openucx-website, - title = {{The Unified Communication X Library}}, - key = {{{The Unified Communication X Library}}, - howpublished = {{\url{http://www.openucx.org}}} -} -``` - -## UCX Architecture - -![](docs/doxygen/Architecture.png) - -| Component | Role | Description | -| :---: | :---: | --- | -| UCP | Protocol | Implements high-level abstractions such as tag-matching, streams, connection negotiation and establishment, multi-rail, and handling different memory types | -| UCT | Transport | Implements low-level communication primitives such as active messages, remote memory access, and atomic operations | -| UCS | Services | A collection of data structures, algorithms, and system utilities for common use | -| UCM | Memory | Intercepts memory allocation and release events, used by the memory registration cache | - -## Supported Transports - -* [Infiniband](https://www.infinibandta.org/) -* [Omni-Path](https://www.intel.com/content/www/us/en/high-performance-computing-fabrics/omni-path-driving-exascale-computing.html) -* [RoCE](http://www.roceinitiative.org/) -* [Cray Gemini and Aries](https://www.cray.com/) -* [CUDA](https://developer.nvidia.com/cuda-zone) -* [ROCm](https://rocm.github.io/) -* Shared Memory - * posix, sysv, [cma](https://dl.acm.org/citation.cfm?id=2616532), [knem](http://knem.gforge.inria.fr/), and [xpmem](https://github.com/hjelmn/xpmem) -* TCP/IP - -## Supported CPU Architectures - -* [x86_64](https://en.wikipedia.org/wiki/X86-64) -* [Power8/9](https://www.ibm.com/support/knowledgecenter/en/POWER9/p9hdx/POWER9welcome.htm) -* [Arm v8](https://www.arm.com/products/silicon-ip-cpu) diff --git a/README b/README new file mode 120000 index 00000000000..42061c01a1c --- /dev/null +++ b/README @@ -0,0 +1 @@ +README.md \ No newline at end of file diff --git a/README.md b/README.md deleted file mode 120000 index 100b93820ad..00000000000 --- a/README.md +++ /dev/null @@ -1 +0,0 @@ -README \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 00000000000..f02d791cb21 --- /dev/null +++ b/README.md @@ -0,0 +1,220 @@ +
+[project badges: follow on Twitter, Documentation Status]
+
+# Unified Communication X
+
+Unified Communication X (UCX) is an
+[award-winning](https://losalamosreporter.com/2019/11/07/nine-los-alamos-national-laboratory-projects-win-rd-100-awards),
+optimized, production-proven communication framework for modern, high-bandwidth
+and low-latency networks.
+
+UCX exposes a set of abstract communication primitives that utilize the best of
+available hardware resources and offloads. These include RDMA (InfiniBand and RoCE),
+TCP, GPUs, shared memory, and network atomic operations.
+
+Please visit our [documentation site](https://openucx.readthedocs.io/en/master)
+for more details.
+
+ + + +* [Using UCX](#using-ucx) +* [Known issues](#known-issues) +* [Architecture](#architecture) +* [Supported Transports](#supported-transports) +* [Supported CPU Architectures](#supported-cpu-architectures) +* [Licenses](#licenses) +* [Our Community](#our-community) +* [Contributor Agreement and Guidelines](#contributor-agreement-and-guidelines) +* [Publications](#publications) + + +
+
+## Using UCX
+
+### Release Builds
+
+Building UCX is typically a combination of running "configure" and "make".
+Execute the following commands to install the UCX system from within the
+directory at the top of the tree:
+
+```sh
+$ ./autogen.sh
+$ ./contrib/configure-release --prefix=/where/to/install
+$ make -j8
+$ make install
+```
+
+NOTE: Compiling support for various networks or other specific hardware may
+require additional command line flags when running configure.
+
+### Developer Builds
+
+```bash
+$ ./autogen.sh
+$ ./contrib/configure-devel --prefix=$PWD/install-debug
+```
+
+*** NOTE: Developer builds of UCX typically include a large performance
+penalty at run-time because of extra debugging code.
+
+### Build RPM package
+```bash
+$ contrib/buildrpm.sh -s -b
+```
+
+### Build DEB package
+```bash
+$ dpkg-buildpackage -us -uc
+```
+
+### Build Doxygen documentation
+```bash
+$ make docs
+```
+
+### OpenMPI and OpenSHMEM installation with UCX
+[Wiki page](http://github.com/openucx/ucx/wiki/OpenMPI-and-OpenSHMEM-installation-with-UCX)
+
+### MPICH installation with UCX
+[Wiki page](http://github.com/openucx/ucx/wiki/MPICH-installation-with-UCX)
+
+### UCX Performance Test
+
+Start server:
+
+```sh
+$ ./src/tools/perf/ucx_perftest -c 0
+```
+
+Connect client:
+
+```sh
+$ ./src/tools/perf/ucx_perftest -t tag_lat -c 1
+```
+> NOTE: The `-c` flag sets CPU affinity. If running both commands on the
+> same host, make sure you set the affinity to different CPU cores.
+
+
+### Running internal unit tests
+
+```sh
+$ make -C test/gtest test
+```
+
+
+
+## Known issues
+* UCX version 1.8.0 has a bug that may cause data corruption when the TCP
+  transport is used in conjunction with the shared memory transport. It is
+  advised to upgrade to UCX version 1.9.0 or above. UCX versions released
+  before 1.8.0 don't have this bug.
+
+* UCX may hang with glibc versions 2.25-2.29 due to known bugs in the
+  pthread_rwlock functions. When such hangs occur, one of the UCX threads gets
+  stuck in pthread_rwlock_rdlock (which is called by ucs_rcache_get), even
+  though no other thread holds the lock. A related issue is reported in
+  [glibc Bug 23844](https://sourceware.org/bugzilla/show_bug.cgi?id=23844).
+  If this issue occurs, it is advised to use the glibc version provided with
+  your OS distribution, or to build glibc from source using a version below
+  2.25 or above 2.29.
+
+ + +## Architecture + +![](docs/doxygen/Architecture.png) + +| Component | Role | Description | +| :---: | :---: | --- | +| UCP | Protocol | Implements high-level abstractions such as tag-matching, streams, connection negotiation and establishment, multi-rail, and handling different memory types | +| UCT | Transport | Implements low-level communication primitives such as active messages, remote memory access, and atomic operations | +| UCS | Services | A collection of data structures, algorithms, and system utilities for common use | +| UCM | Memory | Intercepts memory allocation and release events, used by the memory registration cache | + +
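+As an illustration of how the layers compose, below is a minimal sketch that
+brings up the UCP layer through the Java bindings in this tree (UCT/UCS/UCM are
+used underneath). `UcpWorkerParams` is assumed from the existing JUCX API, and
+error handling is omitted:
+
+```java
+import org.openucx.jucx.ucp.*;
+
+public class UcxHello {
+    public static void main(String[] args) {
+        // UCP context: selects transports/resources for the requested features
+        UcpContext context = new UcpContext(new UcpParams().requestTagFeature()
+                                                           .requestAmFeature());
+        // UCP worker: the progress engine that drives all communication
+        UcpWorker worker = context.newWorker(new UcpWorkerParams());
+        // ... create endpoints and post non-blocking operations here ...
+        worker.close();
+        context.close();
+    }
+}
+```
+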
+ +## Supported Transports + +* [Infiniband](https://www.infinibandta.org/) +* [Omni-Path](https://www.intel.com/content/www/us/en/high-performance-computing-fabrics/omni-path-driving-exascale-computing.html) +* [RoCE](http://www.roceinitiative.org/) +* [Cray Gemini and Aries](https://www.cray.com/) +* [CUDA](https://developer.nvidia.com/cuda-zone) +* [ROCm](https://rocm.github.io/) +* Shared Memory + * posix, sysv, [cma](https://dl.acm.org/citation.cfm?id=2616532), [knem](http://knem.gforge.inria.fr/), and [xpmem](https://github.com/hjelmn/xpmem) +* TCP/IP + +
+ +## Supported CPU Architectures + +* [x86_64](https://en.wikipedia.org/wiki/X86-64) +* [Power8/9](https://www.ibm.com/support/knowledgecenter/en/POWER9/p9hdx/POWER9welcome.htm) +* [Arm v8](https://www.arm.com/products/silicon-ip-cpu) + +
+ +## Licenses + +UCX is licensed as: + +* [BSD3](LICENSE) + +
+ +## Our Community + +* [Project Website](http://www.openucx.org/) +* [ReadTheDocs](https://openucx.readthedocs.io/en/master/) +* [Github](http://www.github.com/openucx/ucx/) +* [Software Releases](http://www.github.com/openucx/ucx/releases) +* [Mailing List](https://elist.ornl.gov/mailman/listinfo/ucx-group) +* [Twitter](https://twitter.com/openucx) + +
+ +## Contributor Agreement and Guidelines + +In order to contribute to UCX, please sign up with an appropriate +[Contributor Agreement](http://www.openucx.org/license/). + +Follow these +[instructions](https://github.com/openucx/ucx/wiki/Guidance-for-contributors) +when submitting contributions and changes. + +## Publications + +To reference UCX in a publication, please use the following entry: + +```bibtex +@inproceedings{shamis2015ucx, + title={UCX: an open source framework for HPC network APIs and beyond}, + author={Shamis, Pavel and Venkata, Manjunath Gorentla and Lopez, M Graham and Baker, Matthew B and Hernandez, Oscar and Itigin, Yossi and Dubman, Mike and Shainer, Gilad and Graham, Richard L and Liss, Liran and others}, + booktitle={2015 IEEE 23rd Annual Symposium on High-Performance Interconnects}, + pages={40--43}, + year={2015}, + organization={IEEE} +} +``` + +To reference the UCX website: + +```bibtex +@misc{openucx-website, + title = {{The Unified Communication X Library}}, + key = {{{The Unified Communication X Library}}, + howpublished = {{\url{http://www.openucx.org}}} +} +``` diff --git a/bindings/java/pom.xml.in b/bindings/java/pom.xml.in index b4311840aab..5bdeb959e4a 100644 --- a/bindings/java/pom.xml.in +++ b/bindings/java/pom.xml.in @@ -58,7 +58,7 @@ 1.8 - + ${java.home}/../bin/javadoc @@ -78,7 +78,7 @@ 1.9 - + ${java.home}/../bin/javadoc @@ -105,12 +105,11 @@ maven-compiler-plugin - 1.9 - 1.9 + 1.8 + 1.8 -h ${native.dir} - --add-exportsjava.base/sun.nio.ch=ALL-UNNAMED @@ -395,6 +394,7 @@ maven-javadoc-plugin 3.2.0 + 8 true all,-missing diff --git a/bindings/java/src/main/java/org/openucx/jucx/UcxException.java b/bindings/java/src/main/java/org/openucx/jucx/UcxException.java index 8fb3554473c..4686d96d9e1 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/UcxException.java +++ b/bindings/java/src/main/java/org/openucx/jucx/UcxException.java @@ -10,6 +10,8 @@ */ public class UcxException extends RuntimeException { + private int status; + public UcxException() { super(); } @@ -17,4 +19,16 @@ public UcxException() { public UcxException(String message) { super(message); } + + public UcxException(String message, int status) { + super(message); + this.status = status; + } + + /** + * Status of exception to compare with {@link org.openucx.jucx.ucs.UcsConstants.STATUS} + */ + public int getStatus() { + return status; + } } diff --git a/bindings/java/src/main/java/org/openucx/jucx/UcxNativeStruct.java b/bindings/java/src/main/java/org/openucx/jucx/UcxNativeStruct.java index 2fd71cbfd54..51d0cf3151c 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/UcxNativeStruct.java +++ b/bindings/java/src/main/java/org/openucx/jucx/UcxNativeStruct.java @@ -10,6 +10,30 @@ */ public abstract class UcxNativeStruct { private Long nativeId; + /** + * To use for hashCode and equals + */ + private Long nativeIdCached; + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + + if (o == null || getClass() != o.getClass()) { + return false; + } + + UcxNativeStruct that = (UcxNativeStruct) o; + + return this.nativeIdCached.equals(that.nativeIdCached); + } + + @Override + public int hashCode() { + return nativeIdCached.hashCode(); + } /** * Getter for native pointer as long. 
@@ -19,11 +43,24 @@ public Long getNativeId() { return nativeId; } + private void setNativeId(long nativeId) { + if (nativeId > 0) { + this.nativeId = nativeId; + this.nativeIdCached = nativeId; + } else { + this.nativeId = null; + } + } + protected void setNativeId(Long nativeId) { if (nativeId != null && nativeId < 0) { throw new UcxException("UcxNativeStruct.setNativeId: invalid native pointer: " + nativeId); } + + if (nativeIdCached == null) { + this.nativeIdCached = nativeId; + } this.nativeId = nativeId; } } diff --git a/bindings/java/src/main/java/org/openucx/jucx/UcxUtils.java b/bindings/java/src/main/java/org/openucx/jucx/UcxUtils.java index 8f43bf0be82..5f7a4639b9c 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/UcxUtils.java +++ b/bindings/java/src/main/java/org/openucx/jucx/UcxUtils.java @@ -5,38 +5,27 @@ package org.openucx.jucx; -import java.lang.reflect.Constructor; -import java.lang.reflect.InvocationTargetException; import java.nio.ByteBuffer; public class UcxUtils { - private static final Constructor directBufferConstructor; - - static { - try { - Class classDirectByteBuffer = Class.forName("java.nio.DirectByteBuffer"); - directBufferConstructor = classDirectByteBuffer.getDeclaredConstructor(long.class, - int.class); - directBufferConstructor.setAccessible(true); - } catch (Exception e) { - throw new UcxException(e.getMessage()); - } - } + private UcxUtils() { } /** * Returns view of underlying memory region as a ByteBuffer. * @param address - address of start of memory region */ - public static ByteBuffer getByteBufferView(long address, int length) - throws IllegalAccessException, InvocationTargetException, InstantiationException { - return (ByteBuffer)directBufferConstructor.newInstance(address, length); + public static ByteBuffer getByteBufferView(long address, long length) { + return getByteBufferViewNative(address, length); } /** * Returns native address of the current position of a direct byte buffer. 
*/ public static long getAddress(ByteBuffer buffer) { - return ((sun.nio.ch.DirectBuffer) buffer).address() + buffer.position(); + return getAddressNative(buffer) + buffer.position(); } + + private static native long getAddressNative(ByteBuffer buffer); + private static native ByteBuffer getByteBufferViewNative(long address, long length); } diff --git a/bindings/java/src/main/java/org/openucx/jucx/examples/UcxBenchmark.java b/bindings/java/src/main/java/org/openucx/jucx/examples/UcxBenchmark.java index 26636e7f7aa..acb4ec525ec 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/examples/UcxBenchmark.java +++ b/bindings/java/src/main/java/org/openucx/jucx/examples/UcxBenchmark.java @@ -92,9 +92,7 @@ protected static void createContextAndWorker() { } protected static double getBandwithGbits(long nanoTimeDelta, long size) { - double sizeInGigabits = (double)size * 8.0 / 1e9; - double secondsElapsed = nanoTimeDelta / 1e9; - return sizeInGigabits / secondsElapsed; + return (double)size * 8.0 / nanoTimeDelta; } protected static void closeResources() throws IOException { diff --git a/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkReceiver.java b/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkReceiver.java index cc1b79c9558..288d9e6339b 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkReceiver.java +++ b/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkReceiver.java @@ -42,14 +42,6 @@ public static void main(String[] args) throws Exception { .setConnectionRequest(connRequest.get()) .setPeerErrorHandlingMode()); - // Temporary workaround until new connection establishment protocol in UCX. - for (int i = 0; i < 10; i++) { - worker.progress(); - try { - Thread.sleep(10); - } catch (Exception ignored) { } - } - ByteBuffer recvBuffer = ByteBuffer.allocateDirect(4096); UcpRequest recvRequest = worker.recvTaggedNonBlocking(recvBuffer, null); @@ -72,13 +64,13 @@ public static void main(String[] args) throws Exception { UcpMemory recvMemory = context.memoryMap(allocationParams); resources.push(recvMemory); ByteBuffer data = UcxUtils.getByteBufferView(recvMemory.getAddress(), - (int)Math.min(Integer.MAX_VALUE, totalSize)); + Math.min(Integer.MAX_VALUE, totalSize)); for (int i = 0; i < numIterations; i++) { final int iterNum = i; UcpRequest getRequest = endpoint.getNonBlocking(remoteAddress, remoteKey, - recvMemory.getAddress(), totalSize, + recvMemory.getAddress(), remoteSize, new UcxCallback() { - long startTime = System.nanoTime(); + final long startTime = System.nanoTime(); @Override public void onSuccess(UcpRequest request) { @@ -95,16 +87,8 @@ public void onSuccess(UcpRequest request) { data.put(0, (byte)1); } - ByteBuffer sendBuffer = ByteBuffer.allocateDirect(100); - sendBuffer.asCharBuffer().put("DONE"); - - UcpRequest sent = endpoint.sendTaggedNonBlocking(sendBuffer, null); - worker.progressRequest(sent); - UcpRequest closeRequest = endpoint.closeNonBlockingFlush(); worker.progressRequest(closeRequest); - // Close request won't be return to pull automatically, since there's no callback. 
- resources.push(closeRequest); closeResources(); } diff --git a/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkSender.java b/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkSender.java index 9c60206c11a..9aab66a521b 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkSender.java +++ b/bindings/java/src/main/java/org/openucx/jucx/examples/UcxReadBWBenchmarkSender.java @@ -5,13 +5,12 @@ package org.openucx.jucx.examples; -import org.openucx.jucx.UcxCallback; -import org.openucx.jucx.ucp.UcpRequest; +import org.openucx.jucx.UcxException; +import org.openucx.jucx.ucp.*; import org.openucx.jucx.UcxUtils; -import org.openucx.jucx.ucp.UcpEndpoint; -import org.openucx.jucx.ucp.UcpEndpointParams; -import org.openucx.jucx.ucp.UcpMemory; +import org.openucx.jucx.ucs.UcsConstants; +import java.net.ConnectException; import java.net.InetSocketAddress; import java.nio.ByteBuffer; @@ -28,12 +27,19 @@ public static void main(String[] args) throws Exception { String serverHost = argsMap.get("s"); UcpEndpoint endpoint = worker.newEndpoint(new UcpEndpointParams() .setPeerErrorHandlingMode() + .setErrorHandler((ep, status, errorMsg) -> { + if (status == UcsConstants.STATUS.UCS_ERR_CONNECTION_RESET) { + throw new ConnectException(errorMsg); + } else { + throw new UcxException(errorMsg); + } + }) .setSocketAddress(new InetSocketAddress(serverHost, serverPort))); UcpMemory memory = context.memoryMap(allocationParams); resources.push(memory); ByteBuffer data = UcxUtils.getByteBufferView(memory.getAddress(), - (int)Math.min(Integer.MAX_VALUE, totalSize)); + Math.min(Integer.MAX_VALUE, totalSize)); // Send worker and memory address and Rkey to receiver. ByteBuffer rkeyBuffer = memory.getRemoteKeyBuffer(); @@ -49,22 +55,23 @@ public static void main(String[] args) throws Exception { // Send memory metadata and wait until receiver will finish benchmark. endpoint.sendTaggedNonBlocking(sendData, null); - ByteBuffer recvBuffer = ByteBuffer.allocateDirect(4096); - UcpRequest recvRequest = worker.recvTaggedNonBlocking(recvBuffer, - new UcxCallback() { - @Override - public void onSuccess(UcpRequest request) { - System.out.println("Received a message:"); - System.out.println(recvBuffer.asCharBuffer().toString().trim()); - } - }); - - worker.progressRequest(recvRequest); - UcpRequest closeRequest = endpoint.closeNonBlockingFlush(); - worker.progressRequest(closeRequest); - resources.push(closeRequest); + try { + while (true) { + if (worker.progress() == 0) { + worker.waitForEvents(); + } + } + } catch (ConnectException ignored) { + } catch (Exception ex) { + System.err.println(ex.getMessage()); + } - closeResources(); + try { + worker.progressRequest(endpoint.closeNonBlockingForce()); + } catch (Exception ignored) { + } finally { + closeResources(); + } } } diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpAmData.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpAmData.java new file mode 100755 index 00000000000..fb0a8588609 --- /dev/null +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpAmData.java @@ -0,0 +1,80 @@ +/* + * Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ +package org.openucx.jucx.ucp; + +import org.openucx.jucx.UcxCallback; +import org.openucx.jucx.UcxException; +import org.openucx.jucx.ucs.UcsConstants; + +import java.io.Closeable; +import java.io.IOException; + +/** + * Wrapper over received active message data. 
It can be one of:
+ * - An internal UCX data descriptor: call {@link UcpAmData#receive} to receive the actual data.
+ * - The actual data: call {@link UcpAmData#close()} when it is no longer needed.
+ */
+public class UcpAmData implements Closeable {
+    private final UcpWorker worker;
+    private final long address;
+    private final long length;
+    private final long flags;
+
+    private UcpAmData(UcpWorker worker, long address, long length, long flags) {
+        this.worker = worker;
+        this.address = address;
+        this.length = length;
+        this.flags = flags;
+    }
+
+    @Override
+    public String toString() {
+        return "UcpAmData{" +
+               "address=" + Long.toHexString(address) +
+               ", length=" + length +
+               ", received=" + isDataValid() +
+               '}';
+    }
+
+    /**
+     * Whether the actual data has already been received, or
+     * {@link UcpAmData#receive(long, UcxCallback)} has to be called to receive it.
+     */
+    public boolean isDataValid() {
+        return (flags & UcpConstants.UCP_AM_RECV_ATTR_FLAG_DATA) != 0;
+    }
+
+    /**
+     * Get the address of the received data.
+     */
+    public long getDataAddress() {
+        if (!isDataValid()) {
+            throw new UcxException("Data is not received yet.");
+        }
+        return address;
+    }
+
+    public long getLength() {
+        return length;
+    }
+
+    /**
+     * Get the UCX data handle descriptor to pass to {@link UcpWorker#recvAmDataNonBlocking}.
+     */
+    public long getDataHandle() {
+        return address;
+    }
+
+    public UcpRequest receive(long resultAddress, UcxCallback callback) {
+        return worker.recvAmDataNonBlocking(getDataHandle(), resultAddress,
+               length, callback, UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_UNKNOWN);
+    }
+
+    @Override
+    public void close() throws IOException {
+        if (isDataValid()) {
+            worker.amDataRelease(address);
+        }
+    }
+}
diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpAmRecvCallback.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpAmRecvCallback.java
new file mode 100755
index 00000000000..cbb1a78cae9
--- /dev/null
+++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpAmRecvCallback.java
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
+ */
+package org.openucx.jucx.ucp;
+
+/**
+ * Callback to process an incoming Active Message sent by the
+ * {@link UcpEndpoint#sendAmNonBlocking} routine.
+ *
+ * The callback is always called from the progress context, therefore calling
+ * {@link UcpWorker#progress()} is not allowed. It is recommended to define
+ * callbacks with relatively short execution time to avoid blocking of
+ * communication progress.
+ */
+public interface UcpAmRecvCallback {
+
+    /**
+     * @param headerAddress - User defined active message header. Can be 0.
+     * @param headerSize    - Active message header length in bytes. If this
+     *                        value is 0, the headerAddress is undefined and should not be accessed.
+     * @param amData        - {@link UcpAmData} wrapper that holds either the received data or
+     *                        a data descriptor to be received via
+     *                        {@link UcpWorker#recvAmDataNonBlocking}.
+     * @param replyEp       - Endpoint, which can be used to reply to this message.
+     * @return - {@link org.openucx.jucx.ucs.UcsConstants.STATUS#UCS_OK} -
+     *           the data will not persist after the callback returns.
+     *           {@link org.openucx.jucx.ucs.UcsConstants.STATUS#UCS_INPROGRESS} -
+     *           the data will persist after the callback has returned.
+ *           To free the memory, call {@link UcpAmData#close()}.
+     */
+    int onReceive(long headerAddress, long headerSize,
+                  UcpAmData amData, UcpEndpoint replyEp);
+}
diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConnectionRequest.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConnectionRequest.java
index f0e7529accf..db0ebd97155 100644
--- a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConnectionRequest.java
+++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConnectionRequest.java
@@ -6,13 +6,25 @@
 import org.openucx.jucx.UcxNativeStruct;
 
+import java.net.InetSocketAddress;
+
 /**
  * A server-side handle to incoming connection request. Can be used to create an
  * endpoint which connects back to the client.
  */
 public class UcpConnectionRequest extends UcxNativeStruct {
 
-    private UcpConnectionRequest(long nativeId) {
+    private InetSocketAddress clientAddress;
+
+    /**
+     * The address of the remote client that sent the connection request to the server.
+     */
+    public InetSocketAddress getClientAddress() {
+        return clientAddress;
+    }
+
+    private UcpConnectionRequest(long nativeId, InetSocketAddress clientAddress) {
         setNativeId(nativeId);
+        this.clientAddress = clientAddress;
     }
 }
diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConstants.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConstants.java
index e47a25af70b..800d45f71f8 100644
--- a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConstants.java
+++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpConstants.java
@@ -38,6 +38,7 @@ public class UcpConstants {
     static long UCP_FEATURE_AMO64;
     static long UCP_FEATURE_WAKEUP;
     static long UCP_FEATURE_STREAM;
+    static long UCP_FEATURE_AM;
 
     /**
      * UCP worker parameters field mask.
@@ -96,8 +97,7 @@ public class UcpConstants {
     /**
      * The enumeration is used to specify the behavior of UcpEndpoint closeNonBlocking.
      */
-    static int UCP_EP_CLOSE_MODE_FORCE;
-    static int UCP_EP_CLOSE_MODE_FLUSH;
+    static int UCP_EP_CLOSE_FLAG_FORCE;
 
     /**
      * UCP memory mapping parameters field mask.
@@ -105,6 +105,8 @@ public class UcpConstants {
     static long UCP_MEM_MAP_PARAM_FIELD_ADDRESS;
     static long UCP_MEM_MAP_PARAM_FIELD_LENGTH;
     static long UCP_MEM_MAP_PARAM_FIELD_FLAGS;
+    static long UCP_MEM_MAP_PARAM_FIELD_PROT;
+    static long UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE;
 
     /**
      * The enumeration list describes the memory mapping flags.
@@ -113,11 +115,45 @@ public class UcpConstants {
     static long UCP_MEM_MAP_ALLOCATE;
     static long UCP_MEM_MAP_FIXED;
 
+    /**
+     * The enumeration list describes the memory mapping protections supported by
+     * {@link UcpContext#memoryMap(UcpMemMapParams)}
+     */
+    public static long UCP_MEM_MAP_PROT_LOCAL_READ;
+    public static long UCP_MEM_MAP_PROT_LOCAL_WRITE;
+    public static long UCP_MEM_MAP_PROT_REMOTE_READ;
+    public static long UCP_MEM_MAP_PROT_REMOTE_WRITE;
+
     /**
      * The enumeration defines behavior of
      * {@link UcpEndpoint#recvStreamNonBlocking(long, long, long, UcxCallback)} function.
      */
     public static long UCP_STREAM_RECV_FLAG_WAITALL;
 
+    /**
+     * Indicates that the data provided in the {@link UcpAmRecvCallback} callback
+     * can be held by the user. If {@link org.openucx.jucx.ucs.UcsConstants.STATUS#UCS_INPROGRESS}
+     * is returned from the callback, the data parameter will persist and the user has to call
+     * {@link UcpWorker#amDataRelease} when the data is no longer needed. This flag is
+     * mutually exclusive with {@link UcpConstants#UCP_AM_RECV_ATTR_FLAG_RNDV}.
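+     *
+     * A sketch of the receive-side pattern (illustrative only; {@code AM_ID},
+     * {@code resultAddress} and {@code callback} are application-provided placeholders):
+     * <pre>{@code
+     * worker.setAmRecvHandler(AM_ID, (headerAddr, headerSize, amData, replyEp) -> {
+     *     if (amData.isDataValid()) {
+     *         // Eager data: keep it past the callback by returning UCS_INPROGRESS,
+     *         // then call amData.close() when done with it.
+     *         return UcsConstants.STATUS.UCS_INPROGRESS;
+     *     }
+     *     // RNDV descriptor: fetch the actual data asynchronously.
+     *     amData.receive(resultAddress, callback);
+     *     return UcsConstants.STATUS.UCS_OK;
+     * });
+     * }</pre>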
+ */ + public static long UCP_AM_RECV_ATTR_FLAG_DATA; + + /** + * Indicates that the arriving data was sent using rendezvous protocol. + * In this case dataAddress parameter of the {@link UcpAmRecvCallback#onReceive} points + * to the internal UCP descriptor, which can be used for obtaining the actual + * data by calling {@link UcpWorker#recvAmDataNonBlocking} routine. This flag is mutually + * exclusive with {@link UcpConstants#UCP_AM_RECV_ATTR_FLAG_DATA}. + */ + public static long UCP_AM_RECV_ATTR_FLAG_RNDV; + + /** + * Flags dictate the behavior of {@link UcpEndpoint#sendAmNonBlocking} routine. + */ + public static long UCP_AM_SEND_FLAG_REPLY; + public static long UCP_AM_SEND_FLAG_EAGER; + public static long UCP_AM_SEND_FLAG_RNDV; + private static native void loadConstants(); } diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpContext.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpContext.java index 50cf4de6df5..767817cc43b 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpContext.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpContext.java @@ -44,6 +44,14 @@ public void close() { this.setNativeId(null); } + /** + * @return - mask which memory types are supported, for supported memory types + * please see {@link org.openucx.jucx.ucs.UcsConstants.MEMORY_TYPE#isMemTypeSupported} + */ + public long getMemoryTypesMask() { + return queryMemTypesNative(getNativeId()); + } + /** * Creates new UcpWorker on current context. */ @@ -83,6 +91,8 @@ public UcpMemory memoryMap(UcpMemMapParams params) { private static native long createContextNative(UcpParams params); + private static native long queryMemTypesNative(long contextId); + private static native void cleanupContextNative(long contextId); private native UcpMemory memoryMapNative(long conetxtId, UcpMemMapParams params); diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpoint.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpoint.java index cdb5c751d27..c3fabe3c113 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpoint.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpoint.java @@ -9,11 +9,20 @@ import java.io.Closeable; import java.nio.ByteBuffer; +import static org.openucx.jucx.ucs.UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_UNKNOWN; + public class UcpEndpoint extends UcxNativeStruct implements Closeable { - private final String paramsString; + private String paramsString; // Keep a reference to errorHandler to prevent it from GC and have valid ref // from JNI error handler. - private final UcpEndpointErrorHandler errorHandler; + private UcpEndpointErrorHandler errorHandler; + + /** + * To construct reply endpoint for Active Messages from JNI. 
+ */ + private UcpEndpoint(long nativeId) { + setNativeId(nativeId); + } @Override public String toString() { @@ -81,9 +90,15 @@ public UcpRequest putNonBlocking(ByteBuffer src, long remoteAddress, UcpRemoteKe public UcpRequest putNonBlocking(long localAddress, long size, long remoteAddress, UcpRemoteKey remoteKey, UcxCallback callback) { + return putNonBlocking(localAddress, size, remoteAddress, remoteKey, callback, + UCS_MEMORY_TYPE_UNKNOWN); + } + public UcpRequest putNonBlocking(long localAddress, long size, + long remoteAddress, UcpRemoteKey remoteKey, + UcxCallback callback, int memoryType) { return putNonBlockingNative(getNativeId(), localAddress, - size, remoteAddress, remoteKey.getNativeId(), callback); + size, remoteAddress, remoteKey.getNativeId(), callback, memoryType); } /** @@ -136,8 +151,16 @@ public UcpRequest getNonBlocking(long remoteAddress, UcpRemoteKey remoteKey, public UcpRequest getNonBlocking(long remoteAddress, UcpRemoteKey remoteKey, long localAddress, long size, UcxCallback callback) { + return getNonBlocking(remoteAddress, remoteKey, localAddress, size, callback, + UCS_MEMORY_TYPE_UNKNOWN); + } + + public UcpRequest getNonBlocking(long remoteAddress, UcpRemoteKey remoteKey, + long localAddress, long size, UcxCallback callback, + int memoryType) { + return getNonBlockingNative(getNativeId(), remoteAddress, remoteKey.getNativeId(), - localAddress, size, callback); + localAddress, size, callback, memoryType); } /** @@ -192,7 +215,13 @@ public UcpRequest sendTaggedNonBlocking(ByteBuffer sendBuffer, long tag, UcxCall public UcpRequest sendTaggedNonBlocking(long localAddress, long size, long tag, UcxCallback callback) { - return sendTaggedNonBlockingNative(getNativeId(), localAddress, size, tag, callback); + return sendTaggedNonBlocking(localAddress, size, tag, callback, UCS_MEMORY_TYPE_UNKNOWN); + } + + public UcpRequest sendTaggedNonBlocking(long localAddress, long size, + long tag, UcxCallback callback, int memoryType) { + return sendTaggedNonBlockingNative(getNativeId(), localAddress, size, tag, callback, + memoryType); } /** @@ -207,11 +236,17 @@ public UcpRequest sendTaggedNonBlocking(ByteBuffer sendBuffer, UcxCallback callb * Iov version of non blocking send operaation */ public UcpRequest sendTaggedNonBlocking(long[] localAddresses, long[] sizes, - long tag, UcxCallback callback) { + long tag, UcxCallback callback, int memoryType) { UcxParams.checkArraySizes(localAddresses, sizes); return sendTaggedIovNonBlockingNative(getNativeId(), localAddresses, sizes, - tag, callback); + tag, callback, memoryType); + } + + public UcpRequest sendTaggedNonBlocking(long[] localAddresses, long[] sizes, + long tag, UcxCallback callback) { + + return sendTaggedNonBlocking(localAddresses, sizes, tag, callback, UCS_MEMORY_TYPE_UNKNOWN); } /** @@ -222,14 +257,27 @@ public UcpRequest sendTaggedNonBlocking(long[] localAddresses, long[] sizes, * completion of the send operation. 
     */
    public UcpRequest sendStreamNonBlocking(long localAddress, long size, UcxCallback callback) {
-        return sendStreamNonBlockingNative(getNativeId(), localAddress, size, callback);
+        return sendStreamNonBlocking(localAddress, size, callback, UCS_MEMORY_TYPE_UNKNOWN);
+    }
+
+    public UcpRequest sendStreamNonBlocking(long localAddress, long size, UcxCallback callback,
+                                            int memoryType) {
+        return sendStreamNonBlockingNative(getNativeId(), localAddress, size, callback, memoryType);
     }
 
     public UcpRequest sendStreamNonBlocking(long[] localAddresses, long[] sizes,
                                             UcxCallback callback) {
         UcxParams.checkArraySizes(localAddresses, sizes);
 
-        return sendStreamIovNonBlockingNative(getNativeId(), localAddresses, sizes, callback);
+        return sendStreamNonBlocking(localAddresses, sizes, callback, UCS_MEMORY_TYPE_UNKNOWN);
+    }
+
+    public UcpRequest sendStreamNonBlocking(long[] localAddresses, long[] sizes,
+                                            UcxCallback callback, int memoryType) {
+        UcxParams.checkArraySizes(localAddresses, sizes);
+
+        return sendStreamIovNonBlockingNative(getNativeId(), localAddresses, sizes, callback,
+                                              memoryType);
     }
 
     public UcpRequest sendStreamNonBlocking(ByteBuffer buffer, UcxCallback callback) {
@@ -244,17 +292,30 @@ public UcpRequest sendStreamNonBlocking(ByteBuffer buffer, UcxCallback callback)
      * the UCP library will invoke the call-back when data is in the receive buffer
      * and ready for application access.
      */
+    public UcpRequest recvStreamNonBlocking(long localAddress, long size, long flags,
+                                            UcxCallback callback, int memoryType) {
+        return recvStreamNonBlockingNative(getNativeId(), localAddress, size, flags, callback,
+                                           memoryType);
+    }
+
     public UcpRequest recvStreamNonBlocking(long localAddress, long size, long flags,
                                             UcxCallback callback) {
-        return recvStreamNonBlockingNative(getNativeId(), localAddress, size, flags, callback);
+        return recvStreamNonBlocking(localAddress, size, flags, callback, UCS_MEMORY_TYPE_UNKNOWN);
     }
 
     public UcpRequest recvStreamNonBlocking(long[] localAddresses, long[] sizes, long flags,
-                                            UcxCallback callback) {
+                                            UcxCallback callback, int memoryType) {
         UcxParams.checkArraySizes(localAddresses, sizes);
 
         return recvStreamIovNonBlockingNative(getNativeId(), localAddresses, sizes, flags,
-                                              callback);
+                                              callback, memoryType);
+    }
+
+    public UcpRequest recvStreamNonBlocking(long[] localAddresses, long[] sizes, long flags,
+                                            UcxCallback callback) {
+
+        return recvStreamNonBlocking(localAddresses, sizes, flags, callback,
+                                     UCS_MEMORY_TYPE_UNKNOWN);
     }
 
     public UcpRequest recvStreamNonBlocking(ByteBuffer buffer, long flags, UcxCallback callback) {
@@ -262,6 +323,32 @@ public UcpRequest recvStreamNonBlocking(ByteBuffer buffer, long flags, UcxCallba
                                                 callback);
     }
 
+    /**
+     * Send Active Message.
+     * @param activeMessageId - Active Message id. Specifies which callback, registered by
+     *                          {@link UcpWorker#setAmRecvHandler(int, UcpAmRecvCallback)}, to run.
+     * @param headerAddress   - User defined Active Message header. NULL value is
+     *                          allowed if no header is needed. In this case
+     *                          headerLength should be set to 0.
+     * @param headerLength    - Active message header length in bytes.
+     * @param dataAddress     - Pointer to the data to be sent to the target node
+     *                          of the Active Message.
+     * @param dataLength      - Data length in bytes.
+     * @param callback        - Callback to call on completion.
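+     *
+     * A minimal sending sketch (illustrative only; {@code AM_ID}, {@code headerAddr},
+     * {@code headerLen}, {@code dataAddr} and {@code dataLen} are application-provided
+     * placeholders, e.g. obtained via {@link org.openucx.jucx.UcxUtils#getAddress}):
+     * <pre>{@code
+     * endpoint.sendAmNonBlocking(AM_ID, headerAddr, headerLen, dataAddr, dataLen,
+     *                            UcpConstants.UCP_AM_SEND_FLAG_EAGER,
+     *                            new UcxCallback() {
+     *                                @Override
+     *                                public void onSuccess(UcpRequest request) {
+     *                                    // send buffers may be reused from here on
+     *                                }
+     *                            });
+     * }</pre>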
+ */ + public UcpRequest sendAmNonBlocking(int activeMessageId, long headerAddress, long headerLength, + long dataAddress, long dataLength, long flags, + UcxCallback callback, int memoryType) { + return sendAmNonBlockingNative(getNativeId(), activeMessageId, + headerAddress, headerLength, dataAddress, dataLength, flags, callback, memoryType); + } + + public UcpRequest sendAmNonBlocking(int activeMessageId, long headerAddress, long headerLength, + long dataAddress, long dataLength, long flags, + UcxCallback callback) { + return sendAmNonBlocking(activeMessageId, headerAddress, headerLength, + dataAddress, dataLength, flags, callback, UCS_MEMORY_TYPE_UNKNOWN); + } + /** * This routine flushes all outstanding AMO and RMA communications on this endpoint. * All the AMO and RMA operations issued on this endpoint prior to this call @@ -279,14 +366,14 @@ public UcpRequest flushNonBlocking(UcxCallback callback) { * both (local and remote) sides to avoid undefined behavior. */ public UcpRequest closeNonBlockingForce() { - return closeNonBlockingNative(getNativeId(), UcpConstants.UCP_EP_CLOSE_MODE_FORCE); + return closeNonBlockingNative(getNativeId(), UcpConstants.UCP_EP_CLOSE_FLAG_FORCE); } /** * Releases the endpoint by scheduling flushes on all outstanding operations. */ public UcpRequest closeNonBlockingFlush() { - return closeNonBlockingNative(getNativeId(), UcpConstants.UCP_EP_CLOSE_MODE_FLUSH); + return closeNonBlockingNative(getNativeId(), 0); } private native long createEndpointNative(UcpEndpointParams params, long workerId); @@ -297,7 +384,8 @@ public UcpRequest closeNonBlockingFlush() { private static native UcpRequest putNonBlockingNative(long enpointId, long localAddress, long size, long remoteAddr, - long ucpRkeyId, UcxCallback callback); + long ucpRkeyId, UcxCallback callback, + int memoryType); private static native void putNonBlockingImplicitNative(long enpointId, long localAddress, long size, long remoteAddr, @@ -305,7 +393,8 @@ private static native void putNonBlockingImplicitNative(long enpointId, long loc private static native UcpRequest getNonBlockingNative(long enpointId, long remoteAddress, long ucpRkeyId, long localAddress, - long size, UcxCallback callback); + long size, UcxCallback callback, + int memoryType); private static native void getNonBlockingImplicitNative(long enpointId, long remoteAddress, long ucpRkeyId, long localAddress, @@ -313,29 +402,41 @@ private static native void getNonBlockingImplicitNative(long enpointId, long rem private static native UcpRequest sendTaggedNonBlockingNative(long enpointId, long localAddress, long size, long tag, - UcxCallback callback); + UcxCallback callback, + int memoryType); private static native UcpRequest sendTaggedIovNonBlockingNative(long enpointId, long[] localAddresses, long[] sizes, long tag, - UcxCallback callback); + UcxCallback callback, + int memoryType); private static native UcpRequest sendStreamNonBlockingNative(long enpointId, long localAddress, - long size, UcxCallback callback); + long size, UcxCallback callback, + int memoryType); private static native UcpRequest sendStreamIovNonBlockingNative(long enpointId, long[] localAddresses, long[] sizes, - UcxCallback callback); + UcxCallback callback, + int memoryType); private static native UcpRequest recvStreamNonBlockingNative(long enpointId, long localAddress, long size, long flags, - UcxCallback callback); + UcxCallback callback, + int memoryType); private static native UcpRequest recvStreamIovNonBlockingNative(long enpointId, long[] localAddresses, long[] sizes, 
long flags, - UcxCallback callback); + UcxCallback callback, + int memoryType); + + private static native UcpRequest sendAmNonBlockingNative(long enpointId, int activeMessageId, + long headerAddress, long headerLength, + long dataAddress, long dataLength, + long flags, UcxCallback callback, + int memoryType); private static native UcpRequest flushNonBlockingNative(long enpointId, UcxCallback callback); diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointErrorHandler.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointErrorHandler.java index 855e5ef5f46..e53e24d6642 100755 --- a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointErrorHandler.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointErrorHandler.java @@ -15,5 +15,5 @@ public interface UcpEndpointErrorHandler { * all subsequent operations on this ep will fail with * the error code passed in {@code status}. */ - void onError(UcpEndpoint ep, int status, String errorMsg); + void onError(UcpEndpoint ep, int status, String errorMsg) throws Exception; } diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointParams.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointParams.java index bde0f080216..1ac8a96eb39 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointParams.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpEndpointParams.java @@ -30,7 +30,8 @@ public String toString() { } if (connectionRequest != 0) { - result += "connectionRequest,"; + result += "connectionRequest" + + ((clientAddress != null) ? clientAddress.toString() : ""); } return result; } @@ -43,6 +44,7 @@ public UcpEndpointParams clear() { flags = 0; socketAddress = null; connectionRequest = 0; + clientAddress = null; errorHandler = null; return this; } @@ -55,6 +57,8 @@ public UcpEndpointParams clear() { private InetSocketAddress socketAddress; + private InetSocketAddress clientAddress; + private long connectionRequest; UcpEndpointErrorHandler errorHandler; @@ -107,6 +111,9 @@ public UcpEndpointParams setNoLoopbackMode() { public UcpEndpointParams setConnectionRequest(UcpConnectionRequest connectionRequest) { this.fieldMask |= UcpConstants.UCP_EP_PARAM_FIELD_CONN_REQUEST; this.connectionRequest = connectionRequest.getNativeId(); + if (connectionRequest.getClientAddress() != null) { + this.clientAddress = connectionRequest.getClientAddress(); + } return this; } diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListener.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListener.java index 63c0ac003b1..00ea35eda4e 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListener.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListener.java @@ -17,13 +17,18 @@ public class UcpListener extends UcxNativeStruct implements Closeable { private InetSocketAddress address; + private UcpListenerConnectionHandler connectionHandler; public UcpListener(UcpWorker worker, UcpListenerParams params) { if (params.getSockAddr() == null) { throw new UcxException("UcpListenerParams.sockAddr must be non-null."); } + if (params.connectionHandler == null) { + throw new UcxException("Connection handler must be set"); + } + this.connectionHandler = params.connectionHandler; + this.address = params.getSockAddr(); setNativeId(createUcpListener(params, worker.getNativeId())); - address = params.getSockAddr(); } /** diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListenerParams.java 
b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListenerParams.java index 28153a0772d..94fdc8c96ad 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListenerParams.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpListenerParams.java @@ -14,12 +14,13 @@ public class UcpListenerParams extends UcxParams { public UcpListenerParams clear() { super.clear(); sockAddr = null; + connectionHandler = null; return this; } private InetSocketAddress sockAddr; - private UcpListenerConnectionHandler connectionHandler; + UcpListenerConnectionHandler connectionHandler; /** * An address, on which {@link UcpListener} would bind. diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemMapParams.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemMapParams.java index 9ce96b94089..7dcbcb1a110 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemMapParams.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemMapParams.java @@ -8,6 +8,8 @@ public class UcpMemMapParams extends UcxParams { private long flags; + private long prot; + private int memType; private long address; private long length; @@ -17,6 +19,8 @@ public UcpMemMapParams clear() { address = 0; length = 0; flags = 0; + prot = 0; + memType = 0; return this; } @@ -69,4 +73,39 @@ public UcpMemMapParams fixed() { flags |= UcpConstants.UCP_MEM_MAP_FIXED; return this; } + + /** + * Memory protection mode, e.g. {@link UcpConstants#UCP_MEM_MAP_PROT_LOCAL_READ} + * This value is optional. If it's not set, the {@link UcpContext#memoryMap(UcpMemMapParams)} + * routine will consider the flags as set to + * UCP_MEM_MAP_PROT_LOCAL_READ|UCP_MEM_MAP_PROT_LOCAL_WRITE| + * UCP_MEM_MAP_PROT_REMOTE_READ|UCP_MEM_MAP_PROT_REMOTE_WRITE. + */ + public UcpMemMapParams setProtection(long protection) { + this.fieldMask |= UcpConstants.UCP_MEM_MAP_PARAM_FIELD_PROT; + this.prot = protection; + return this; + } + + /** + * Memory type (for possible memory types see + * {@link org.openucx.jucx.ucs.UcsConstants.MEMORY_TYPE}) + * It is an optimization hint to avoid memory type detection for map buffer. + * The meaning of this field depends on the operation type. + * + * - Memory allocation: ({@link UcpMemMapParams#allocate()} is set) This field + * specifies the type of memory to allocate. If it's not set + * {@link org.openucx.jucx.ucs.UcsConstants.MEMORY_TYPE#UCS_MEMORY_TYPE_HOST} + * will be assumed by default. + * + * - Memory registration: This field specifies the type of memory which is + * pointed by {@link UcpMemMapParams#setAddress(long)}. If it's not set, + * or set to {@link org.openucx.jucx.ucs.UcsConstants.MEMORY_TYPE#UCS_MEMORY_TYPE_UNKNOWN}, + * the memory type will be detected internally. + */ + public UcpMemMapParams setMemoryType(int memoryType) { + this.fieldMask |= UcpConstants.UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE; + this.memType = memoryType; + return this; + } } diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemory.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemory.java index 360b33f9e3a..1f6a9882405 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemory.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpMemory.java @@ -27,11 +27,17 @@ public class UcpMemory extends UcxNativeStruct implements Closeable { private long length; + private int memType; + /** * To prevent construct outside of JNI. 
*/ - private UcpMemory(long nativeId) { + private UcpMemory(long nativeId, UcpContext context, long address, long length, int memType) { setNativeId(nativeId); + this.address = address; + this.length = length; + this.memType = memType; + this.context = context; } /** @@ -96,6 +102,13 @@ public long getLength() { return length; } + /** + * Type of allocated memory. + */ + public int getMemType() { + return memType; + } + private static native void unmapMemoryNative(long contextId, long memoryId); private static native ByteBuffer getRkeyBufferNative(long contextId, long memoryId); diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpParams.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpParams.java index d4ace227ed7..8797279e8e0 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpParams.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpParams.java @@ -102,6 +102,15 @@ public UcpParams requestTagFeature() { return this; } + /** + * Request Active Message support feature. + */ + public UcpParams requestAmFeature() { + this.fieldMask |= UcpConstants.UCP_PARAM_FIELD_FEATURES; + this.features |= UcpConstants.UCP_FEATURE_AM; + return this; + } + /** * Request remote memory access support. */ diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpRequest.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpRequest.java index 2761f1f4559..916e92a2d2c 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpRequest.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpRequest.java @@ -7,6 +7,7 @@ import org.openucx.jucx.UcxCallback; import org.openucx.jucx.UcxNativeStruct; +import org.openucx.jucx.ucs.UcsConstants; import java.io.Closeable; import java.nio.ByteBuffer; @@ -15,15 +16,17 @@ * Request object, that returns by ucp operations (GET, PUT, SEND, etc.). * Call {@link UcpRequest#isCompleted()} to monitor completion of request. */ -public class UcpRequest extends UcxNativeStruct implements Closeable { +public class UcpRequest extends UcxNativeStruct { private long recvSize; private long senderTag; - private UcpRequest(long nativeId) { - setNativeId(nativeId); - } + private int status = UcsConstants.STATUS.UCS_INPROGRESS; + + private long iovVector; + + private UcxCallback callback; /** * To initialize for failed and immediately completed requests. @@ -49,23 +52,14 @@ public long getSenderTag() { * @return whether this request is completed. */ public boolean isCompleted() { - return (getNativeId() == null) || isCompletedNative(getNativeId()); + return status != UcsConstants.STATUS.UCS_INPROGRESS; } /** - * This routine releases the non-blocking request back to the library, regardless - * of its current state. Communications operations associated with this request - * will make progress internally, however no further notifications or callbacks - * will be invoked for this request. 
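A sketch of how the new mapping options compose, assuming an existing `UcpContext context` and that `allocate()` and `setLength()` follow the same builder pattern as the setters above:

```java
UcpMemMapParams mmapParams = new UcpMemMapParams()
    .allocate()                                   // allocate rather than register existing memory
    .setLength(1 << 20)
    .setMemoryType(UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_HOST)
    .setProtection(UcpConstants.UCP_MEM_MAP_PROT_LOCAL_READ |
                   UcpConstants.UCP_MEM_MAP_PROT_LOCAL_WRITE |
                   UcpConstants.UCP_MEM_MAP_PROT_REMOTE_READ);

UcpMemory memory = context.memoryMap(mmapParams);
// memory.getMemType() reports the type that was actually mapped.
```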
+ * @return status of the current request */ - @Override - public void close() { - if (getNativeId() != null) { - closeRequestNative(getNativeId()); - } + public int getStatus() { + return status; } - private static native boolean isCompletedNative(long ucpRequest); - - private static native void closeRequestNative(long ucpRequest); } diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpWorker.java b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpWorker.java index 8da85f2e4d2..83611a8c6d5 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpWorker.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucp/UcpWorker.java @@ -7,9 +7,12 @@ import java.io.Closeable; import java.nio.ByteBuffer; +import java.util.HashMap; import org.openucx.jucx.*; +import static org.openucx.jucx.ucs.UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_UNKNOWN; + /** * UCP worker is an opaque object representing the communication context. The * worker represents an instance of a local communication resource and the @@ -31,6 +34,13 @@ */ public class UcpWorker extends UcxNativeStruct implements Closeable { + /** + * To keep a reference to AmRecvCallback class to prevent it from GC. + */ + private final HashMap amRecvHandlers = new HashMap<>(); + + private long maxAmHeaderSize = 0L; + public UcpWorker(UcpContext context, UcpWorkerParams params) { setNativeId(createWorkerNative(params, context.getNativeId())); } @@ -53,20 +63,78 @@ public UcpListener newListener(UcpListenerParams params) { public void close() { releaseWorkerNative(getNativeId()); setNativeId(null); + amRecvHandlers.clear(); + } + + /** + * Maximal allowed header size for {@link UcpEndpoint#sendAmNonBlocking} routine. + */ + public long getMaxAmHeaderSize() { + return maxAmHeaderSize; + } + + /** + * This routine installs a user defined callback to handle incoming Active + * Messages with a specific id. This callback is called whenever an Active + * Message that was sent from the remote peer by @ref ucp_am_send_nbx is + * received on this worker. + * + * @param callback - Active Message callback. To clear the already set callback, + * this value should be set to null. + */ + public void setAmRecvHandler(int amId, UcpAmRecvCallback callback) { + if (callback == null) { + removeAmRecvHandler(amId); + return; + } + Object[] callbackAndWorker = new Object[2]; + callbackAndWorker[0] = callback; + callbackAndWorker[1] = this; + amRecvHandlers.put(amId, callbackAndWorker); + setAmRecvHandlerNative(getNativeId(), amId, callbackAndWorker); + } + + /** + * Clears Active Message callback. + */ + public void removeAmRecvHandler(int amId) { + amRecvHandlers.remove(amId); + setAmRecvHandlerNative(getNativeId(), amId, null); + } + + /** + * This routine releases data that persisted through an Active Message + * callback because that callback returned UCS_INPROGRESS. + */ + public void amDataRelease(long address) { + amDataReleaseNative(getNativeId(), address); } + /** + * This routine receives a message that is described by the data descriptor + * {@code dataDesc}, local address {@code address} and size {@code size} on a worker. + * The routine is non-blocking and therefore returns immediately. + * The receive operation is considered completed when the message is delivered to the buffer. 
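Putting the worker-side Active Message API together, a hedged sketch of a receive handler. The parameter list mirrors the JNI callback signature registered later in this patch (header address, header size, UcpAmData, reply UcpEndpoint, returning an int status), and it assumes UcpAmRecvCallback is a functional interface:

```java
worker.setAmRecvHandler(1, (headerAddress, headerSize, amData, replyEndpoint) -> {
    // Consume eager data right here and let UCX reclaim it immediately:
    //   return UcsConstants.STATUS.UCS_OK;
    // Or keep the data descriptor alive past the callback; it must later be
    // released with worker.amDataRelease(...) or drained with
    // worker.recvAmDataNonBlocking(...) for rendezvous payloads:
    return UcsConstants.STATUS.UCS_INPROGRESS;
});
```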
+ */ + public UcpRequest recvAmDataNonBlocking(long dataDesc, long address, long size, + UcxCallback callback, int memoryType) { + return recvAmDataNonBlockingNative(getNativeId(), dataDesc, address, size, callback, + memoryType); + } + + /** * This routine explicitly progresses all communication operations on a worker. * @return Non-zero if any communication was progressed, zero otherwise. */ - public int progress() { + public int progress() throws Exception { return progressWorkerNative(getNativeId()); } /** * Blocking progress for request until it's not completed. */ - public void progressRequest(UcpRequest request) { + public void progressRequest(UcpRequest request) throws Exception { while (!request.isCompleted()) { progress(); } @@ -128,14 +196,20 @@ public UcpRequest recvTaggedNonBlocking(ByteBuffer recvBuffer, long tag, long ta if (!recvBuffer.isDirect()) { throw new UcxException("Recv buffer must be direct."); } - return recvTaggedNonBlockingNative(getNativeId(), UcxUtils.getAddress(recvBuffer), + return recvTaggedNonBlocking(UcxUtils.getAddress(recvBuffer), recvBuffer.remaining(), tag, tagMask, callback); } public UcpRequest recvTaggedNonBlocking(long localAddress, long size, long tag, long tagMask, UcxCallback callback) { + return recvTaggedNonBlocking(localAddress, size, tag, tagMask, callback, + UCS_MEMORY_TYPE_UNKNOWN); + } + + public UcpRequest recvTaggedNonBlocking(long localAddress, long size, long tag, long tagMask, + UcxCallback callback, int memoryType) { return recvTaggedNonBlockingNative(getNativeId(), localAddress, size, - tag, tagMask, callback); + tag, tagMask, callback, memoryType); } /** @@ -150,10 +224,18 @@ public UcpRequest recvTaggedNonBlocking(ByteBuffer recvBuffer, UcxCallback callb public UcpRequest recvTaggedNonBlocking(long[] localAddresses, long[] sizes, long tag, long tagMask, UcxCallback callback) { + + return recvTaggedNonBlocking(localAddresses, sizes, tag, tagMask, callback, + UCS_MEMORY_TYPE_UNKNOWN); + } + + public UcpRequest recvTaggedNonBlocking(long[] localAddresses, long[] sizes, + long tag, long tagMask, + UcxCallback callback, int memoryType) { UcxParams.checkArraySizes(localAddresses, sizes); return recvTaggedIovNonBlockingNative(getNativeId(), localAddresses, sizes, tag, - tagMask, callback); + tagMask, callback, memoryType); } /** @@ -201,9 +283,15 @@ public UcpTagMessage tagProbeNonBlocking(long tag, long tagMask, boolean remove) * If the receive operation cannot be stated the routine returns an error. */ public UcpRequest recvTaggedMessageNonBlocking(long address, long size, UcpTagMessage message, - UcxCallback callback) { + UcxCallback callback, int memoryType) { return recvTaggedMessageNonBlockingNative(getNativeId(), address, size, - message.getNativeId(), callback); + message.getNativeId(), callback, memoryType); + } + + public UcpRequest recvTaggedMessageNonBlocking(long address, long size, UcpTagMessage message, + UcxCallback callback) { + return recvTaggedMessageNonBlocking(address, size, message, callback, + UCS_MEMORY_TYPE_UNKNOWN); } public UcpRequest recvTaggedMessageNonBlocking(ByteBuffer buffer, UcpTagMessage message, @@ -212,6 +300,7 @@ public UcpRequest recvTaggedMessageNonBlocking(ByteBuffer buffer, UcpTagMessage message, callback); } + /** * This routine tries to cancels an outstanding communication request. 
After * calling this routine, the request will be in completed or canceled (but @@ -222,6 +311,9 @@ public UcpRequest recvTaggedMessageNonBlocking(ByteBuffer buffer, UcpTagMessage * case it is canceled the status argument is set to UCS_ERR_CANCELED. */ public void cancelRequest(UcpRequest request) { + if (request.getNativeId() == null) { + throw new UcxException("Request is not valid"); + } cancelRequestNative(getNativeId(), request.getNativeId()); } @@ -243,7 +335,7 @@ public ByteBuffer getAddress() { return result; } - private static native long createWorkerNative(UcpWorkerParams params, long ucpContextId); + private native long createWorkerNative(UcpWorkerParams params, long ucpContextId); private static native void releaseWorkerNative(long workerId); @@ -251,7 +343,7 @@ public ByteBuffer getAddress() { private static native void releaseAddressNative(long workerId, ByteBuffer addressId); - private static native int progressWorkerNative(long workerId); + private static native int progressWorkerNative(long workerId) throws Exception; private static native UcpRequest flushNonBlockingNative(long workerId, UcxCallback callback); @@ -259,22 +351,35 @@ public ByteBuffer getAddress() { private static native void signalWorkerNative(long workerId); + private static native void setAmRecvHandlerNative(long workerId, int amId, + Object[] callbackAndWorker); + + private static native UcpRequest recvAmDataNonBlockingNative(long workerId, long dataDesc, + long address, long size, + UcxCallback callback, + int memoryType); + + private static native void amDataReleaseNative(long workerId, long dataAddress); + private static native UcpRequest recvTaggedNonBlockingNative(long workerId, long localAddress, long size, long tag, long tagMask, - UcxCallback callback); + UcxCallback callback, + int memoryType); private static native UcpRequest recvTaggedIovNonBlockingNative(long workerId, long[] localAddresses, long[] sizes, long tag, long tagMask, - UcxCallback callback); + UcxCallback callback, + int memoryType); private static native UcpTagMessage tagProbeNonBlockingNative(long workerId, long tag, long tagMask, boolean remove); private static native UcpRequest recvTaggedMessageNonBlockingNative(long workerId, long address, long size, long tagMsgId, - UcxCallback callback); + UcxCallback callback, + int memoryType); private static native void cancelRequestNative(long workerId, long requestId); } diff --git a/bindings/java/src/main/java/org/openucx/jucx/ucs/UcsConstants.java b/bindings/java/src/main/java/org/openucx/jucx/ucs/UcsConstants.java index b22f0b1da60..58a495c3a35 100644 --- a/bindings/java/src/main/java/org/openucx/jucx/ucs/UcsConstants.java +++ b/bindings/java/src/main/java/org/openucx/jucx/ucs/UcsConstants.java @@ -6,6 +6,7 @@ package org.openucx.jucx.ucs; import org.openucx.jucx.NativeLibs; +import org.openucx.jucx.ucp.UcpContext; public class UcsConstants { static { @@ -22,6 +23,79 @@ public static class ThreadMode { public static int UCS_THREAD_MODE_MULTI; } + /** + * Status codes + */ + public static class STATUS { + static { + load(); + } + + /* Operation completed successfully */ + public static int UCS_OK; + + /* Operation is queued and still in progress */ + public static int UCS_INPROGRESS; + + /* Failure codes */ + public static int UCS_ERR_NO_MESSAGE; + public static int UCS_ERR_NO_RESOURCE; + public static int UCS_ERR_IO_ERROR; + public static int UCS_ERR_NO_MEMORY; + public static int UCS_ERR_INVALID_PARAM; + public static int UCS_ERR_UNREACHABLE; + public static int 
UCS_ERR_INVALID_ADDR; + public static int UCS_ERR_NOT_IMPLEMENTED; + public static int UCS_ERR_MESSAGE_TRUNCATED; + public static int UCS_ERR_NO_PROGRESS; + public static int UCS_ERR_BUFFER_TOO_SMALL; + public static int UCS_ERR_NO_ELEM; + public static int UCS_ERR_SOME_CONNECTS_FAILED; + public static int UCS_ERR_NO_DEVICE; + public static int UCS_ERR_BUSY; + public static int UCS_ERR_CANCELED; + public static int UCS_ERR_SHMEM_SEGMENT; + public static int UCS_ERR_ALREADY_EXISTS; + public static int UCS_ERR_OUT_OF_RANGE; + public static int UCS_ERR_TIMED_OUT; + public static int UCS_ERR_EXCEEDS_LIMIT; + public static int UCS_ERR_UNSUPPORTED; + public static int UCS_ERR_REJECTED; + public static int UCS_ERR_NOT_CONNECTED; + public static int UCS_ERR_CONNECTION_RESET; + + public static int UCS_ERR_FIRST_LINK_FAILURE; + public static int UCS_ERR_LAST_LINK_FAILURE; + public static int UCS_ERR_FIRST_ENDPOINT_FAILURE; + public static int UCS_ERR_ENDPOINT_TIMEOUT; + public static int UCS_ERR_LAST_ENDPOINT_FAILURE; + + public static int UCS_ERR_LAST; + } + + public static class MEMORY_TYPE { + static { + load(); + } + + /** + * Checks whether context's memory type mask + * (received via {@link UcpContext#getMemoryTypesMask()}) + * supports particular memory type. + */ + public static boolean isMemTypeSupported(long mask, int memoryType) { + return ((1L << memoryType) & mask) != 0; + } + + public static int UCS_MEMORY_TYPE_HOST; // Default system memory + public static int UCS_MEMORY_TYPE_CUDA; // NVIDIA CUDA memory + public static int UCS_MEMORY_TYPE_CUDA_MANAGED; // NVIDIA CUDA managed (or unified) memory + public static int UCS_MEMORY_TYPE_ROCM; // AMD ROCM memory + public static int UCS_MEMORY_TYPE_ROCM_MANAGED; // AMD ROCM managed system memory + public static int UCS_MEMORY_TYPE_LAST; + public static int UCS_MEMORY_TYPE_UNKNOWN; + } + private static void load() { NativeLibs.load(); loadConstants(); diff --git a/bindings/java/src/main/native/Makefile.am b/bindings/java/src/main/native/Makefile.am index 67ce262f6a7..57f21ddd539 100644 --- a/bindings/java/src/main/native/Makefile.am +++ b/bindings/java/src/main/native/Makefile.am @@ -22,11 +22,11 @@ JUCX_GENERATED_H_FILES = org_openucx_jucx_ucp_UcpConstants.h \ org_openucx_jucx_ucp_UcpEndpoint.h \ org_openucx_jucx_ucp_UcpListener.h \ org_openucx_jucx_ucp_UcpMemory.h \ - org_openucx_jucx_ucp_UcpRequest.h \ org_openucx_jucx_ucp_UcpRemoteKey.h \ org_openucx_jucx_ucp_UcpWorker.h \ org_openucx_jucx_ucs_UcsConstants_ThreadMode.h \ - org_openucx_jucx_ucs_UcsConstants.h + org_openucx_jucx_ucs_UcsConstants.h \ + org_openucx_jucx_UcxUtils.h BUILT_SOURCES = $(JUCX_GENERATED_H_FILES) @@ -40,7 +40,8 @@ MOSTLYCLEANFILES = $(JUCX_GENERATED_H_FILES) $(STAMP_FILE) # $(STAMP_FILE): \ $(javadir)/src/main/java/org/openucx/jucx/ucs/*.java \ - $(javadir)/src/main/java/org/openucx/jucx/ucp/*.java + $(javadir)/src/main/java/org/openucx/jucx/ucp/*.java \ + $(javadir)/src/main/java/org/openucx/jucx/examples/*.java $(MVNCMD) compile touch $(STAMP_FILE) @@ -58,12 +59,11 @@ libjucx_la_SOURCES = context.cc \ jucx_common_def.cc \ listener.cc \ memory.cc \ - request.cc \ ucp_constants.cc \ ucs_constants.cc \ worker.cc -libjucx_la_CXXFLAGS = -fPIC -DPIC -Werror -std=gnu++98 +libjucx_la_CXXFLAGS = $(BASE_CXXFLAGS) -std=gnu++98 libjucx_la_LIBADD = $(topdir)/src/ucs/libucs.la \ $(topdir)/src/uct/libuct.la \ @@ -73,7 +73,7 @@ libjucx_la_LIBADD = $(topdir)/src/ucs/libucs.la \ libjucx_la_DEPENDENCIES = Makefile.am Makefile.in Makefile # Compile Java source code and pack to jar 
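The new status and memory-type constants surface in user code roughly as below (a sketch; `context`, `worker` and `endpoint` are assumed to exist, and `getMemoryTypesMask()` is the accessor the javadoc above refers to):

```java
// Poll a request to completion; progress() may now propagate exceptions
// thrown from user callbacks or error handlers.
UcpRequest request = endpoint.flushNonBlocking(null);
try {
    while (!request.isCompleted()) {
        worker.progress();
    }
} catch (Exception e) {
    // error raised while progressing
}
if (request.getStatus() != UcsConstants.STATUS.UCS_OK) {
    // inspect the UCS error code instead of relying on exceptions alone
}

// Check GPU support before passing UCS_MEMORY_TYPE_CUDA anywhere:
long mask = context.getMemoryTypesMask();
boolean cudaSupported = UcsConstants.MEMORY_TYPE.isMemTypeSupported(
    mask, UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_CUDA);
```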
-$(jarfile): +$(jarfile): libjucx.la $(MVNCMD) package -DskipTests package : $(jarfile) diff --git a/bindings/java/src/main/native/context.cc b/bindings/java/src/main/native/context.cc index e68eee74974..d9bb5e45b3d 100644 --- a/bindings/java/src/main/native/context.cc +++ b/bindings/java/src/main/native/context.cc @@ -5,9 +5,6 @@ #include "jucx_common_def.h" #include "org_openucx_jucx_ucp_UcpContext.h" -extern "C" { -#include -} /** * Iterates through entries of java's hash map and apply @@ -91,11 +88,6 @@ Java_org_openucx_jucx_ucp_UcpContext_createContextNative(JNIEnv *env, jclass cls field); } - ucp_params.field_mask |= UCP_PARAM_FIELD_REQUEST_INIT | - UCP_PARAM_FIELD_REQUEST_SIZE; - ucp_params.request_size = sizeof(struct jucx_context); - ucp_params.request_init = jucx_request_init; - ucp_config_t *config = NULL; ucs_status_t status; @@ -160,30 +152,54 @@ Java_org_openucx_jucx_ucp_UcpContext_memoryMapNative(JNIEnv *env, jobject ctx, params.flags = env->GetLongField(jucx_mmap_params, field);; } + if (params.field_mask & UCP_MEM_MAP_PARAM_FIELD_PROT) { + field = env->GetFieldID(jucx_mmap_class, "prot", "J"); + params.prot = env->GetLongField(jucx_mmap_params, field);; + } + + if (params.field_mask & UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE) { + field = env->GetFieldID(jucx_mmap_class, "memType", "I"); + params.memory_type = + static_cast(env->GetIntField(jucx_mmap_params, field)); + } + ucs_status_t status = ucp_mem_map((ucp_context_h)ucp_context_ptr, ¶ms, &memh); if (status != UCS_OK) { JNU_ThrowExceptionByStatus(env, status); } - // Construct UcpMemory class - jclass jucx_mem_cls = env->FindClass("org/openucx/jucx/ucp/UcpMemory"); - jmethodID constructor = env->GetMethodID(jucx_mem_cls, "", "(J)V"); - jobject jucx_mem = env->NewObject(jucx_mem_cls, constructor, (native_ptr)memh); + ucp_mem_attr_t attr = {0}; - // Set UcpContext pointer - field = env->GetFieldID(jucx_mem_cls, "context", "Lorg/openucx/jucx/ucp/UcpContext;"); - env->SetObjectField(jucx_mem, field, ctx); + attr.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS | UCP_MEM_ATTR_FIELD_LENGTH | + UCP_MEM_ATTR_FIELD_MEM_TYPE; - // Set address - field = env->GetFieldID(jucx_mem_cls, "address", "J"); - env->SetLongField(jucx_mem, field, (native_ptr)memh->address); + ucp_mem_query(memh, &attr); - // Set length - field = env->GetFieldID(jucx_mem_cls, "length", "J"); - env->SetLongField(jucx_mem, field, memh->length); + // Construct UcpMemory class + jclass jucx_mem_cls = env->FindClass("org/openucx/jucx/ucp/UcpMemory"); + jmethodID constructor = env->GetMethodID(jucx_mem_cls, "", + "(JLorg/openucx/jucx/ucp/UcpContext;JJI)V"); + jobject jucx_mem = env->NewObject(jucx_mem_cls, constructor, (native_ptr)memh, ctx, + attr.address, attr.length, attr.mem_type); /* Coverity thinks that memh is a leaked object here, * but it's stored in a UcpMemory object */ /* coverity[leaked_storage] */ return jucx_mem; } + +JNIEXPORT jlong JNICALL +Java_org_openucx_jucx_ucp_UcpContext_queryMemTypesNative(JNIEnv *env, jclass cls, + jlong ucp_context_ptr) +{ + ucp_context_attr_t params; + + params.field_mask = UCP_ATTR_FIELD_MEMORY_TYPES; + + ucs_status_t status = ucp_context_query((ucp_context_h)ucp_context_ptr, ¶ms); + if (status != UCS_OK) { + JNU_ThrowExceptionByStatus(env, status); + } + + return params.memory_types; +} diff --git a/bindings/java/src/main/native/endpoint.cc b/bindings/java/src/main/native/endpoint.cc index 9f1151b398e..e756e9c0355 100644 --- a/bindings/java/src/main/native/endpoint.cc +++ b/bindings/java/src/main/native/endpoint.cc @@ -110,9 +110,18 
@@ JNIEXPORT jobject JNICALL Java_org_openucx_jucx_ucp_UcpEndpoint_closeNonBlockingNative(JNIEnv *env, jclass cls, jlong ep_ptr, jint mode) { - ucs_status_ptr_t request = ucp_ep_close_nb((ucp_ep_h)ep_ptr, mode); + ucp_request_param_t param = {0}; - return process_request(request, NULL); + jobject jucx_request = jucx_request_allocate(env, NULL, ¶m, UCS_MEMORY_TYPE_UNKNOWN); + + param.op_attr_mask |= UCP_OP_ATTR_FIELD_FLAGS; + param.flags = mode; + param.cb.send = jucx_request_callback; + + ucs_status_ptr_t status = ucp_ep_close_nbx((ucp_ep_h)ep_ptr, ¶m); + process_request(env, jucx_request, status); + + return jucx_request; } JNIEXPORT jobject JNICALL @@ -138,14 +147,22 @@ JNIEXPORT jobject JNICALL Java_org_openucx_jucx_ucp_UcpEndpoint_putNonBlockingNative(JNIEnv *env, jclass cls, jlong ep_ptr, jlong laddr, jlong size, jlong raddr, - jlong rkey_ptr, jobject callback) + jlong rkey_ptr, jobject callback, + jint memory_type) { - ucs_status_ptr_t request = ucp_put_nb((ucp_ep_h)ep_ptr, (void *)laddr, size, raddr, - (ucp_rkey_h)rkey_ptr, jucx_request_callback); + ucp_request_param_t param = {0}; + jobject jucx_request = jucx_request_allocate(env, callback, ¶m, memory_type); + + param.cb.send = jucx_request_callback; + + ucs_status_ptr_t status = ucp_put_nbx((ucp_ep_h)ep_ptr, (void *)laddr, size, raddr, + (ucp_rkey_h)rkey_ptr, ¶m); - ucs_trace_req("JUCX: put_nb request %p, of size: %zu, raddr: %zu", - request, size, raddr); - return process_request(request, callback); + process_request(env, jucx_request, status); + + ucs_trace_req("JUCX: put_nb request %p, of size: %zu, raddr: %zu", status, size, raddr); + + return jucx_request; } JNIEXPORT void JNICALL @@ -166,14 +183,22 @@ JNIEXPORT jobject JNICALL Java_org_openucx_jucx_ucp_UcpEndpoint_getNonBlockingNative(JNIEnv *env, jclass cls, jlong ep_ptr, jlong raddr, jlong rkey_ptr, jlong laddr, - jlong size, jobject callback) + jlong size, jobject callback, + jint memory_type) { - ucs_status_ptr_t request = ucp_get_nb((ucp_ep_h)ep_ptr, (void *)laddr, size, - raddr, (ucp_rkey_h)rkey_ptr, jucx_request_callback); + ucp_request_param_t param = {0}; + + jobject jucx_request = jucx_request_allocate(env, callback, ¶m, memory_type); + param.cb.send = jucx_request_callback; + + ucs_status_ptr_t status = ucp_get_nbx((ucp_ep_h)ep_ptr, (void *)laddr, size, + raddr, (ucp_rkey_h)rkey_ptr, ¶m); ucs_trace_req("JUCX: get_nb request %p, raddr: %zu, size: %zu, result address: %zu", - request, raddr, size, laddr); - return process_request(request, callback); + status, raddr, size, laddr); + + process_request(env, jucx_request, status); + return jucx_request; } JNIEXPORT void JNICALL @@ -194,145 +219,201 @@ JNIEXPORT jobject JNICALL Java_org_openucx_jucx_ucp_UcpEndpoint_sendTaggedNonBlockingNative(JNIEnv *env, jclass cls, jlong ep_ptr, jlong addr, jlong size, jlong tag, - jobject callback) + jobject callback, jint memory_type) { - ucs_status_ptr_t request = ucp_tag_send_nb((ucp_ep_h)ep_ptr, (void *)addr, size, - ucp_dt_make_contig(1), tag, jucx_request_callback); + ucp_request_param_t param = {0}; - ucs_trace_req("JUCX: send_tag_nb request %p, size: %zu, tag: %ld", - request, size, tag); - return process_request(request, callback); + jobject jucx_request = jucx_request_allocate(env, callback, ¶m, memory_type); + + param.cb.send = jucx_request_callback; + + ucs_status_ptr_t status = ucp_tag_send_nbx((ucp_ep_h)ep_ptr, (void *)addr, size, tag, ¶m); + ucs_trace_req("JUCX: send_tag_nb request %p, size: %zu, tag: %ld", status, size, tag); + + process_request(env, jucx_request, 
status); + return jucx_request; } JNIEXPORT jobject JNICALL Java_org_openucx_jucx_ucp_UcpEndpoint_sendTaggedIovNonBlockingNative(JNIEnv *env, jclass cls, jlong ep_ptr, jlongArray addresses, jlongArray sizes, jlong tag, - jobject callback) + jobject callback, jint memory_type) { int iovcnt; + ucp_request_param_t param = {0}; + jobject jucx_request = jucx_request_allocate(env, callback, ¶m, memory_type); ucp_dt_iov_t* iovec = get_ucp_iov(env, addresses, sizes, iovcnt); if (iovec == NULL) { return NULL; } - ucs_status_ptr_t request = ucp_tag_send_nb((ucp_ep_h)ep_ptr, iovec, iovcnt, - ucp_dt_make_iov(), tag, jucx_request_callback); + jucx_request_set_iov(env, jucx_request, iovec); - if (UCS_PTR_IS_PTR(request)) { - struct jucx_context *ctx = (struct jucx_context *)request; - ctx->iovec = iovec; - } else { - ucs_free(iovec); - } + param.op_attr_mask |= UCP_OP_ATTR_FIELD_DATATYPE; + param.cb.send = jucx_request_callback; + param.datatype = ucp_dt_make_iov(); + + ucs_status_ptr_t status = ucp_tag_send_nbx((ucp_ep_h)ep_ptr, iovec, iovcnt, tag, ¶m); + ucs_trace_req("JUCX: send_tag_iov_nb request %p, tag: %ld", status, tag); - ucs_trace_req("JUCX: send_tag_iov_nb request %p, tag: %ld", request, tag); - return process_request(request, callback); + process_request(env, jucx_request, status); + + return jucx_request; } JNIEXPORT jobject JNICALL Java_org_openucx_jucx_ucp_UcpEndpoint_sendStreamNonBlockingNative(JNIEnv *env, jclass cls, jlong ep_ptr, jlong addr, - jlong size, jobject callback) + jlong size, jobject callback, + jint memory_type) { - ucs_status_ptr_t request = ucp_stream_send_nb((ucp_ep_h)ep_ptr, (void *)addr, size, - ucp_dt_make_contig(1), jucx_request_callback, 0); + ucp_request_param_t param = {0}; + + jobject jucx_request = jucx_request_allocate(env, callback, ¶m, memory_type); + + param.cb.send = jucx_request_callback; + + ucs_status_ptr_t status = ucp_stream_send_nbx((ucp_ep_h)ep_ptr, (void *)addr, size, ¶m); + ucs_trace_req("JUCX: send_stream_nb request %p, size: %zu", status, size); - ucs_trace_req("JUCX: send_stream_nb request %p, size: %zu", request, size); - return process_request(request, callback); + process_request(env, jucx_request, status); + return jucx_request; } JNIEXPORT jobject JNICALL Java_org_openucx_jucx_ucp_UcpEndpoint_sendStreamIovNonBlockingNative(JNIEnv *env, jclass cls, jlong ep_ptr, jlongArray addresses, - jlongArray sizes, - jobject callback) + jlongArray sizes, jobject callback, + jint memory_type) { int iovcnt; + ucp_request_param_t param = {0}; + jobject jucx_request = jucx_request_allocate(env, callback, ¶m, memory_type); ucp_dt_iov_t* iovec = get_ucp_iov(env, addresses, sizes, iovcnt); if (iovec == NULL) { return NULL; } - ucs_status_ptr_t request = ucp_stream_send_nb((ucp_ep_h)ep_ptr, iovec, iovcnt, - ucp_dt_make_iov(), jucx_request_callback, 0); + jucx_request_set_iov(env, jucx_request, iovec); - if (UCS_PTR_IS_PTR(request)) { - struct jucx_context *ctx = (struct jucx_context *)request; - ctx->iovec = iovec; - } else { - ucs_free(iovec); - } + param.op_attr_mask |= UCP_OP_ATTR_FIELD_DATATYPE; + param.cb.send = jucx_request_callback; + param.datatype = ucp_dt_make_iov(); + + ucs_status_ptr_t status = ucp_stream_send_nbx((ucp_ep_h)ep_ptr, iovec, iovcnt, ¶m); + ucs_trace_req("JUCX: send_stream_iov_nb request %p", status); - ucs_trace_req("JUCX: send_stream_iov_nb request %p", request); - return process_request(request, callback); + process_request(env, jucx_request, status); + return jucx_request; } JNIEXPORT jobject JNICALL 
Java_org_openucx_jucx_ucp_UcpEndpoint_recvStreamNonBlockingNative(JNIEnv *env, jclass cls, jlong ep_ptr, jlong addr, jlong size, jlong flags, - jobject callback) + jobject callback, + jint memory_type) { size_t rlength; - ucs_status_ptr_t request = ucp_stream_recv_nb((ucp_ep_h)ep_ptr, (void *)addr, size, - ucp_dt_make_contig(1), stream_recv_callback, - &rlength, flags); + ucp_request_param_t param = {0}; + jobject jucx_request = jucx_request_allocate(env, callback, ¶m, memory_type); + + param.op_attr_mask |= UCP_OP_ATTR_FIELD_FLAGS; + param.cb.recv_stream = stream_recv_callback; + param.flags = flags; - ucs_trace_req("JUCX: recv_stream_nb request %p, size: %zu", request, size); + ucs_status_ptr_t status = ucp_stream_recv_nbx((ucp_ep_h)ep_ptr, (void *)addr, size, + &rlength, ¶m); + ucs_trace_req("JUCX: recv_stream_nb request %p, size: %zu", status, size); - if (request == NULL) { - // If request completed immidiately. - return process_completed_stream_recv(rlength, callback); + if (status == NULL) { + jucx_request_update_recv_length(env, jucx_request, rlength); } - return process_request(request, callback); + process_request(env, jucx_request, status); + + return jucx_request; } JNIEXPORT jobject JNICALL Java_org_openucx_jucx_ucp_UcpEndpoint_recvStreamIovNonBlockingNative(JNIEnv *env, jclass cls, jlong ep_ptr, jlongArray addresses, jlongArray sizes, - jlong flags, jobject callback) + jlong flags, jobject callback, + jint memory_type) { size_t rlength; - int iovcnt; + ucp_request_param_t param = {0}; + jobject jucx_request = jucx_request_allocate(env, callback, ¶m, memory_type); ucp_dt_iov_t* iovec = get_ucp_iov(env, addresses, sizes, iovcnt); if (iovec == NULL) { return NULL; } - ucs_status_ptr_t request = ucp_stream_recv_nb((ucp_ep_h)ep_ptr, iovec, iovcnt, - ucp_dt_make_iov(), stream_recv_callback, - &rlength, flags); + jucx_request_set_iov(env, jucx_request, iovec); - ucs_trace_req("JUCX: recv_stream_iov_nb request %p", request); + param.op_attr_mask |= UCP_OP_ATTR_FIELD_FLAGS | + UCP_OP_ATTR_FIELD_DATATYPE; + param.cb.recv_stream = stream_recv_callback; + param.datatype = ucp_dt_make_iov(); + param.flags = flags; - if (UCS_PTR_IS_PTR(request)) { - struct jucx_context *ctx = (struct jucx_context *)request; - ctx->iovec = iovec; - } else { - ucs_free(iovec); - } + ucs_status_ptr_t status = ucp_stream_recv_nbx((ucp_ep_h)ep_ptr, iovec, iovcnt, &rlength, + ¶m); + ucs_trace_req("JUCX: recv_stream_iov_nb request %p", status); - if (request == NULL) { - // If request completed immidiately. 
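On the Java side, this nb-to-nbx rework is what makes the trailing memoryType parameter meaningful: it is forwarded through UCP_OP_ATTR_FIELD_MEMORY_TYPE so UCX can skip pointer-type detection. A sketch, assuming wrapper overloads mirroring the native signatures and a hypothetical registered device pointer `gpuAddress`:

```java
// 'gpuAddress' is a hypothetical CUDA device pointer; 'callback' a UcxCallback.
UcpRequest send = endpoint.sendTaggedNonBlocking(gpuAddress, size, tag, callback,
    UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_CUDA);
// The shorter overloads pass UCS_MEMORY_TYPE_UNKNOWN, i.e. UCX keeps
// detecting the memory type internally.
```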
- return process_completed_stream_recv(rlength, callback); + if (status == NULL) { + jucx_request_update_recv_length(env, jucx_request, rlength); } - return process_request(request, callback); + process_request(env, jucx_request, status); + + return jucx_request; } JNIEXPORT jobject JNICALL Java_org_openucx_jucx_ucp_UcpEndpoint_flushNonBlockingNative(JNIEnv *env, jclass cls, - jlong ep_ptr, - jobject callback) + jlong ep_ptr, jobject callback) { - ucs_status_ptr_t request = ucp_ep_flush_nb((ucp_ep_h)ep_ptr, 0, jucx_request_callback); + ucp_request_param_t param = {0}; + + jobject jucx_request = jucx_request_allocate(env, callback, ¶m, UCS_MEMORY_TYPE_UNKNOWN); + + param.cb.send = jucx_request_callback; + + ucs_status_ptr_t status = ucp_ep_flush_nbx((ucp_ep_h)ep_ptr, ¶m); + ucs_trace_req("JUCX: ucp_ep_flush_nbx request %p", status); + + process_request(env, jucx_request, status); + + return jucx_request; +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpEndpoint_sendAmNonBlockingNative(JNIEnv *env, jclass cls, + jlong ep_ptr, jint am_id, + jlong header_addr, jlong header_length, + jlong data_address, jlong data_length, + jlong flags, jobject callback, + jint memory_type) +{ + ucp_request_param_t param = {0}; + + jobject jucx_request = jucx_request_allocate(env, callback, ¶m, memory_type); + + param.op_attr_mask |= UCP_OP_ATTR_FIELD_FLAGS; + param.cb.send = jucx_request_callback; + param.flags = flags; + + ucs_status_ptr_t status = ucp_am_send_nbx((ucp_ep_h)ep_ptr, am_id, (void*)header_addr, header_length, + (void*)data_address, data_length, ¶m); + ucs_trace_req("JUCX: ucp_am_send_nbx request %p", status); - return process_request(request, callback); + process_request(env, jucx_request, status); + return jucx_request; } diff --git a/bindings/java/src/main/native/jucx_common_def.cc b/bindings/java/src/main/native/jucx_common_def.cc index 913edc66dfd..ea5407ee926 100644 --- a/bindings/java/src/main/native/jucx_common_def.cc +++ b/bindings/java/src/main/native/jucx_common_def.cc @@ -7,7 +7,7 @@ extern "C" { #include #include - #include + #include } #include /* inet_addr */ @@ -17,15 +17,27 @@ extern "C" { static JavaVM *jvm_global; static jclass jucx_request_cls; +static jclass jucx_endpoint_cls; +static jclass jucx_am_data_cls; +static jclass ucp_rkey_cls; +static jclass ucp_tag_msg_cls; + static jfieldID native_id_field; static jfieldID recv_size_field; static jfieldID sender_tag_field; -static jmethodID on_success; +static jfieldID request_callback; +static jfieldID request_status; +static jfieldID request_iov_vec; + static jmethodID jucx_request_constructor; -static jclass ucp_rkey_cls; +static jmethodID jucx_endpoint_constructor; +static jmethodID jucx_am_data_constructor; static jmethodID ucp_rkey_cls_constructor; -static jclass ucp_tag_msg_cls; static jmethodID ucp_tag_msg_cls_constructor; +static jmethodID on_success; +static jmethodID on_am_receive; +static jmethodID jucx_set_native_id; + extern "C" JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *jvm, void* reserved) { setlocale(LC_NUMERIC, "C"); @@ -37,21 +49,37 @@ extern "C" JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM *jvm, void* reserved) { } jclass jucx_request_cls_local = env->FindClass("org/openucx/jucx/ucp/UcpRequest"); - jucx_request_cls = (jclass) env->NewGlobalRef(jucx_request_cls_local); jclass jucx_callback_cls = env->FindClass("org/openucx/jucx/UcxCallback"); + jclass ucp_rkey_cls_local = env->FindClass("org/openucx/jucx/ucp/UcpRemoteKey"); + jclass ucp_tag_msg_cls_local = 
env->FindClass("org/openucx/jucx/ucp/UcpTagMessage"); + jclass jucx_endpoint_cls_local = env->FindClass("org/openucx/jucx/ucp/UcpEndpoint"); + jclass jucx_am_data_cls_local = env->FindClass("org/openucx/jucx/ucp/UcpAmData"); + jclass jucx_am_recv_callback_cls_local = env->FindClass("org/openucx/jucx/ucp/UcpAmRecvCallback"); + + jucx_request_cls = (jclass) env->NewGlobalRef(jucx_request_cls_local); + ucp_rkey_cls = (jclass) env->NewGlobalRef(ucp_rkey_cls_local); + ucp_tag_msg_cls = (jclass) env->NewGlobalRef(ucp_tag_msg_cls_local); + jucx_endpoint_cls = (jclass) env->NewGlobalRef(jucx_endpoint_cls_local); + jucx_am_data_cls = (jclass) env->NewGlobalRef(jucx_am_data_cls_local); + native_id_field = env->GetFieldID(jucx_request_cls, "nativeId", "Ljava/lang/Long;"); + request_callback = env->GetFieldID(jucx_request_cls, "callback", "Lorg/openucx/jucx/UcxCallback;"); + request_status = env->GetFieldID(jucx_request_cls, "status", "I"); recv_size_field = env->GetFieldID(jucx_request_cls, "recvSize", "J"); + request_iov_vec = env->GetFieldID(jucx_request_cls, "iovVector", "J"); sender_tag_field = env->GetFieldID(jucx_request_cls, "senderTag", "J"); + + jucx_set_native_id = env->GetMethodID(jucx_request_cls, "setNativeId", "(J)V"); on_success = env->GetMethodID(jucx_callback_cls, "onSuccess", "(Lorg/openucx/jucx/ucp/UcpRequest;)V"); - jucx_request_constructor = env->GetMethodID(jucx_request_cls, "", "(J)V"); - - jclass ucp_rkey_cls_local = env->FindClass("org/openucx/jucx/ucp/UcpRemoteKey"); - ucp_rkey_cls = (jclass) env->NewGlobalRef(ucp_rkey_cls_local); + on_am_receive = env->GetMethodID(jucx_am_recv_callback_cls_local, "onReceive", + "(JJLorg/openucx/jucx/ucp/UcpAmData;Lorg/openucx/jucx/ucp/UcpEndpoint;)I"); + jucx_endpoint_constructor = env->GetMethodID(jucx_endpoint_cls, "", "(J)V"); + jucx_am_data_constructor = env->GetMethodID(jucx_am_data_cls, "", "(Lorg/openucx/jucx/ucp/UcpWorker;JJJ)V"); + jucx_request_constructor = env->GetMethodID(jucx_request_cls, "", "()V"); ucp_rkey_cls_constructor = env->GetMethodID(ucp_rkey_cls, "", "(J)V"); - jclass ucp_tag_msg_cls_local = env->FindClass("org/openucx/jucx/ucp/UcpTagMessage"); - ucp_tag_msg_cls = (jclass) env->NewGlobalRef(ucp_tag_msg_cls_local); ucp_tag_msg_cls_constructor = env->GetMethodID(ucp_tag_msg_cls, "", "(JJJ)V"); + return JNI_VERSION_1_1; } @@ -64,6 +92,44 @@ extern "C" JNIEXPORT void JNICALL JNI_OnUnload(JavaVM *jvm, void *reserved) { if (jucx_request_cls != NULL) { env->DeleteGlobalRef(jucx_request_cls); } + + if (jucx_endpoint_cls != NULL) { + env->DeleteGlobalRef(jucx_endpoint_cls); + } + + if (jucx_am_data_cls != NULL) { + env->DeleteGlobalRef(jucx_am_data_cls); + } +} + +jobject c2jInetSockAddr(JNIEnv *env, const sockaddr_storage* ss) +{ + jbyteArray buff; + int port = 0; + + // 1. Construct InetAddress object + jclass inet_address_cls = env->FindClass("java/net/InetAddress"); + jmethodID getByAddress = env->GetStaticMethodID(inet_address_cls, "getByAddress", + "([B)Ljava/net/InetAddress;"); + if(ss->ss_family == AF_INET6) { + const sockaddr_in6* sin6 = reinterpret_cast(ss); + buff = env->NewByteArray(16); + env->SetByteArrayRegion(buff, 0, 16, (jbyte*)&sin6->sin6_addr.s6_addr); + port = ntohs(sin6->sin6_port); + } else { + const sockaddr_in* sin = reinterpret_cast(ss); + buff = env->NewByteArray(4); + env->SetByteArrayRegion(buff, 0, 4, (jbyte*)&sin->sin_addr); + port = ntohs(sin->sin_port); + } + + jobject inet_address_obj = env->CallStaticObjectMethod(inet_address_cls, getByAddress, buff); + // 2. 
Construct InetSocketAddress object from InetAddress, port + jclass inet_socket_address_cls = env->FindClass("java/net/InetSocketAddress"); + jmethodID inetSocketAddress_constructor = env->GetMethodID(inet_socket_address_cls, + "", "(Ljava/net/InetAddress;I)V"); + + return env->NewObject(inet_socket_address_cls, inetSocketAddress_constructor, inet_address_obj, port); } bool j2cInetSockAddr(JNIEnv *env, jobject sock_addr, sockaddr_storage& ss, socklen_t& sa_len) @@ -147,49 +213,33 @@ bool j2cInetSockAddr(JNIEnv *env, jobject sock_addr, sockaddr_storage& ss, sock return false; } -static inline void jucx_context_reset(struct jucx_context* ctx) +JNIEnv* get_jni_env() { - ctx->callback = NULL; - ctx->jucx_request = NULL; - ctx->status = UCS_INPROGRESS; - ctx->length = 0; - ctx->iovec = NULL; - ctx->sender_tag = 0; + void *env; + jint rs = jvm_global->AttachCurrentThread(&env, NULL); + ucs_assert_always(rs == JNI_OK); + return (JNIEnv*)env; } -void jucx_request_init(void *request) +void jucx_request_set_iov(JNIEnv *env, jobject jucx_request, ucp_dt_iov_t* iovec) { - struct jucx_context *ctx = (struct jucx_context *)request; - jucx_context_reset(ctx); - ucs_recursive_spinlock_init(&ctx->lock, 0); + env->SetLongField(jucx_request, request_iov_vec, (native_ptr)iovec); } -JNIEnv* get_jni_env() +void jucx_request_update_status(JNIEnv *env, jobject jucx_request, ucs_status_t status) { - void *env; - jint rs = jvm_global->AttachCurrentThread(&env, NULL); - ucs_assert_always(rs == JNI_OK); - return (JNIEnv*)env; + env->SetIntField(jucx_request, request_status, status); } -static inline void set_jucx_request_completed(JNIEnv *env, jobject jucx_request, - struct jucx_context *ctx) +static inline void set_jucx_request_completed(JNIEnv *env, jobject jucx_request, ucs_status_t status) { env->SetObjectField(jucx_request, native_id_field, NULL); - if (ctx != NULL) { - /* sender_tag and length are initialized to 0, - * so try to avoid the overhead of setting them again */ - if (ctx->sender_tag != 0) { - env->SetLongField(jucx_request, sender_tag_field, ctx->sender_tag); - } - - if (ctx->length > 0) { - env->SetLongField(jucx_request, recv_size_field, ctx->length); - } + jucx_request_update_status(env, jucx_request, status); + long iov_vec = env->GetLongField(jucx_request, request_iov_vec); - if (ctx->iovec != NULL) { - ucs_free(ctx->iovec); - } + if (iov_vec != 0L) { + ucp_dt_iov_t* iovec = reinterpret_cast(iov_vec); + ucs_free(iovec); } } @@ -224,124 +274,150 @@ static inline void jucx_call_callback(jobject callback, jobject jucx_request, } } -UCS_PROFILE_FUNC_VOID(jucx_request_callback, (request, status), void *request, ucs_status_t status) +UCS_PROFILE_FUNC_VOID(jucx_request_callback, (request, status, user_data), void *request, + ucs_status_t status, void *user_data) { - struct jucx_context *ctx = (struct jucx_context *)request; - ucs_recursive_spin_lock(&ctx->lock); - if (ctx->jucx_request == NULL) { - // here because 1 of 2 reasons: - // 1. progress is in another thread and got here earlier then process_request happened. - // 2. this callback is inside ucp_tag_recv_nb function. 
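c2jInetSockAddr is what lets a server observe who is connecting: the native connection handler queries UCP_CONN_REQUEST_ATTR_FIELD_CLIENT_ADDR and hands the converted InetSocketAddress to Java. A server-side sketch, assuming a `setConnectionHandler` builder matching the connectionHandler field made package-visible earlier:

```java
UcpListener listener = worker.newListener(new UcpListenerParams()
    .setSockAddr(new InetSocketAddress("0.0.0.0", 17000))
    .setConnectionHandler(connectionRequest -> {
        // Filled in from ucp_conn_request_query() on the native side;
        // may be null if the query failed.
        InetSocketAddress client = connectionRequest.getClientAddress();
        System.out.println("Accepting connection from " + client);
        worker.newEndpoint(new UcpEndpointParams()
            .setConnectionRequest(connectionRequest));
    }));
```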
- ctx->status = status; - ucs_recursive_spin_unlock(&ctx->lock); - return; - } + jobject jucx_request = reinterpret_cast(user_data); JNIEnv *env = get_jni_env(); - set_jucx_request_completed(env, ctx->jucx_request, ctx); - if (ctx->callback != NULL) { - jucx_call_callback(ctx->callback, ctx->jucx_request, status); - env->DeleteGlobalRef(ctx->callback); + set_jucx_request_completed(env, jucx_request, UCS_PTR_STATUS(status)); + ucp_request_free(request); + + jobject callback = env->GetObjectField(jucx_request, request_callback); + + if (callback != NULL) { + jucx_call_callback(callback, jucx_request, status); + // Remove callback reference from request. + env->SetObjectField(jucx_request, request_callback, NULL); } - env->DeleteGlobalRef(ctx->jucx_request); - jucx_context_reset(ctx); - ucp_request_free(request); - ucs_recursive_spin_unlock(&ctx->lock); + env->DeleteGlobalRef(jucx_request); } -void recv_callback(void *request, ucs_status_t status, ucp_tag_recv_info_t *info) +void jucx_request_update_recv_length(JNIEnv *env, jobject jucx_request, + size_t rlength) { - struct jucx_context *ctx = (struct jucx_context *)request; - ctx->length = info->length; - ctx->sender_tag = info->sender_tag; - jucx_request_callback(request, status); + env->SetLongField(jucx_request, recv_size_field, rlength); } -void stream_recv_callback(void *request, ucs_status_t status, size_t length) +void jucx_request_update_sender_tag(JNIEnv *env, jobject jucx_request, + ucp_tag_t sender_tag) { - struct jucx_context *ctx = (struct jucx_context *)request; - ctx->length = length; - jucx_request_callback(request, status); + env->SetLongField(jucx_request, sender_tag_field, sender_tag); } -UCS_PROFILE_FUNC(jobject, process_request, (request, callback), void *request, jobject callback) +void recv_callback(void *request, ucs_status_t status, + const ucp_tag_recv_info_t *info, void *user_data) { JNIEnv *env = get_jni_env(); - jobject jucx_request; - - if (UCS_PTR_IS_PTR(request)) { - jucx_request = env->NewObject(jucx_request_cls, jucx_request_constructor, - (native_ptr)request); - struct jucx_context *ctx = (struct jucx_context *)request; - ucs_recursive_spin_lock(&ctx->lock); - if (ctx->status == UCS_INPROGRESS) { - // request not completed yet, install user callback - if (callback != NULL) { - ctx->callback = env->NewGlobalRef(callback); - } - ctx->jucx_request = env->NewGlobalRef(jucx_request); - } else { - // request was completed whether by progress in other thread or inside - // ucp_tag_recv_nb function call. 
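With the jucx_context machinery gone, completion is delivered through the request's user_data: the native callback finds the Java UcpRequest, stamps its status, and invokes the stored UcxCallback. From the application's perspective the shape of the API is unchanged; a sketch:

```java
endpoint.sendTaggedNonBlocking(buffer, 42L, new UcxCallback() {
    @Override
    public void onSuccess(UcpRequest request) {
        // status was stamped UCS_OK before this is called
    }

    @Override
    public void onError(int ucsStatus, String errorMsg) {
        // invoked via jucx_call_callback when the operation fails
    }
});
```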
- set_jucx_request_completed(env, jucx_request, ctx); - if (callback != NULL) { - jucx_call_callback(callback, jucx_request, ctx->status); - } - jucx_context_reset(ctx); - ucp_request_free(request); - } - ucs_recursive_spin_unlock(&ctx->lock); - } else { - jmethodID empty_constructor = env->GetMethodID(jucx_request_cls, "<init>", "()V"); - jucx_request = env->NewObject(jucx_request_cls, empty_constructor); - set_jucx_request_completed(env, jucx_request, NULL); - if (UCS_PTR_IS_ERR(request)) { - JNU_ThrowExceptionByStatus(env, UCS_PTR_STATUS(request)); - if (callback != NULL) { - call_on_error(callback, UCS_PTR_STATUS(request)); - } - } else if (callback != NULL) { - call_on_success(callback, jucx_request); - } - } - return jucx_request; + jobject jucx_request = reinterpret_cast<jobject>(user_data); + + jucx_request_update_sender_tag(env, jucx_request, info->sender_tag); + jucx_request_update_recv_length(env, jucx_request, info->length); + jucx_request_callback(request, status, user_data); } -jobject process_completed_stream_recv(size_t length, jobject callback) +void stream_recv_callback(void *request, ucs_status_t status, size_t length, + void *user_data) { JNIEnv *env = get_jni_env(); - jobject jucx_request = env->NewObject(jucx_request_cls, jucx_request_constructor, NULL); - env->SetObjectField(jucx_request, native_id_field, NULL); - env->SetLongField(jucx_request, recv_size_field, length); + jobject jucx_request = reinterpret_cast<jobject>(user_data); + jucx_request_update_recv_length(env, jucx_request, length); + + jucx_request_callback(request, status, user_data); +} + +ucs_status_t am_recv_callback(void *arg, const void *header, size_t header_length, + void *data, size_t length, const ucp_am_recv_param_t *param) +{ + JNIEnv *env = get_jni_env(); + jobject jucx_endpoint = NULL; + + jobjectArray callback_and_worker = reinterpret_cast<jobjectArray>(arg); + + jobject callback = env->GetObjectArrayElement(callback_and_worker, 0); + jobject worker = env->GetObjectArrayElement(callback_and_worker, 1); + + jobject jucx_am_data = env->NewObject(jucx_am_data_cls, jucx_am_data_constructor, + worker, (native_ptr)data, length, param->recv_attr); + + if (param->recv_attr & UCP_AM_RECV_ATTR_FIELD_REPLY_EP) { + jucx_endpoint = env->NewObject(jucx_endpoint_cls, jucx_endpoint_constructor, param->reply_ep); + } + + + return static_cast<ucs_status_t>(env->CallIntMethod(callback, on_am_receive, (native_ptr)header, header_length, + jucx_am_data, jucx_endpoint)); +} + +jobject jucx_request_allocate(JNIEnv *env, const jobject callback, + ucp_request_param_t *param, jint memory_type) +{ + jobject jucx_request = env->NewObject(jucx_request_cls, jucx_request_constructor); + + param->op_attr_mask = UCP_OP_ATTR_FIELD_USER_DATA | + UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_MEMORY_TYPE; + param->user_data = env->NewGlobalRef(jucx_request); + param->memory_type = static_cast<ucs_memory_type_t>(memory_type); + if (callback != NULL) { - jucx_call_callback(callback, jucx_request, UCS_OK); + env->SetObjectField(jucx_request, request_callback, callback); } + return jucx_request; } +void process_request(JNIEnv *env, jobject jucx_request, ucs_status_ptr_t status) +{ + // If status is an error - throw an exception in Java. + if (UCS_PTR_IS_ERR(status)) { + JNU_ThrowExceptionByStatus(env, UCS_PTR_STATUS(status)); + } + + if (UCS_PTR_IS_PTR(status)) { + env->CallVoidMethod(jucx_request, jucx_set_native_id, (native_ptr)status); + } else { + // Request completed immediately. Call jucx callback.
+ set_jucx_request_completed(env, jucx_request, UCS_PTR_RAW_STATUS(status)); + jobject callback = env->GetObjectField(jucx_request, request_callback); + if (callback != NULL) { + jucx_call_callback(callback, jucx_request, UCS_PTR_RAW_STATUS(status)); + // Remove callback reference from request. + env->SetObjectField(jucx_request, request_callback, NULL); + } + } +} + void jucx_connection_handler(ucp_conn_request_h conn_request, void *arg) { - jobject jucx_conn_handler = reinterpret_cast(arg); + jobject client_address = NULL; + jobject jucx_conn_handler = reinterpret_cast(arg); JNIEnv *env = get_jni_env(); + ucp_conn_request_attr_t attr; + attr.field_mask = UCP_CONN_REQUEST_ATTR_FIELD_CLIENT_ADDR; + ucs_status_t status = ucp_conn_request_query(conn_request, &attr); + + if (status == UCS_OK) { + client_address = c2jInetSockAddr(env, &attr.client_address); + } // Construct connection request class instance jclass conn_request_cls = env->FindClass("org/openucx/jucx/ucp/UcpConnectionRequest"); - jmethodID conn_request_constructor = env->GetMethodID(conn_request_cls, "", "(J)V"); + jmethodID conn_request_constructor = env->GetMethodID(conn_request_cls, "", + "(JLjava/net/InetSocketAddress;)V"); jobject jucx_conn_request = env->NewObject(conn_request_cls, conn_request_constructor, - (native_ptr)conn_request); + (native_ptr)conn_request, client_address); // Call onConnectionRequest method jclass jucx_conn_hndl_cls = env->FindClass("org/openucx/jucx/ucp/UcpListenerConnectionHandler"); jmethodID on_conn_request = env->GetMethodID(jucx_conn_hndl_cls, "onConnectionRequest", - "(Lorg/openucx/jucx/ucp/UcpConnectionRequest;)V"); + "(Lorg/openucx/jucx/ucp/UcpConnectionRequest;)V"); env->CallVoidMethod(jucx_conn_handler, on_conn_request, jucx_conn_request); - env->DeleteGlobalRef(jucx_conn_handler); } - jobject new_rkey_instance(JNIEnv *env, ucp_rkey_h rkey) { return env->NewObject(ucp_rkey_cls, ucp_rkey_cls_constructor, (native_ptr)rkey); diff --git a/bindings/java/src/main/native/jucx_common_def.h b/bindings/java/src/main/native/jucx_common_def.h index 6e83266d001..e225ba65016 100644 --- a/bindings/java/src/main/native/jucx_common_def.h +++ b/bindings/java/src/main/native/jucx_common_def.h @@ -42,7 +42,12 @@ typedef uintptr_t native_ptr; } while(0) #define JNU_ThrowExceptionByStatus(_env, _status) do { \ - JNU_ThrowException(_env, ucs_status_string(_status)); \ + jclass _cls = _env->FindClass("org/openucx/jucx/UcxException"); \ + jmethodID _constr = _env->GetMethodID(_cls, "", "(Ljava/lang/String;I)V"); \ + jstring _error_msg = _env->NewStringUTF(ucs_status_string(_status)); \ + jthrowable _ex = \ + static_cast(_env->NewObject(_cls, _constr, _error_msg, _status)); \ + _env->Throw(_ex); \ } while(0) /** @@ -52,18 +57,6 @@ typedef uintptr_t native_ptr; */ bool j2cInetSockAddr(JNIEnv *env, jobject sock_addr, sockaddr_storage& ss, socklen_t& sa_len); -struct jucx_context { - jobject callback; - volatile jobject jucx_request; - ucs_status_t status; - ucs_recursive_spinlock_t lock; - size_t length; - ucp_dt_iov_t* iovec; - ucp_tag_t sender_tag; -}; - -void jucx_request_init(void *request); - /** * @brief Get the jni env object. To be able to call java methods from ucx async callbacks. */ @@ -72,29 +65,60 @@ JNIEnv* get_jni_env(); /** * @brief Send callback used to invoke java callback class on completion of ucp operations. 
*/ -void jucx_request_callback(void *request, ucs_status_t status); +void jucx_request_callback(void *request, ucs_status_t status, void *user_data); /** * @brief Recv callback used to invoke java callback class on completion of ucp tag_recv_nb operation. */ -void recv_callback(void *request, ucs_status_t status, ucp_tag_recv_info_t *info); +void recv_callback(void *request, ucs_status_t status, const ucp_tag_recv_info_t *info, + void *user_data); /** * @brief Recv callback used to invoke java callback class on completion of ucp stream_recv_nb operation. */ -void stream_recv_callback(void *request, ucs_status_t status, size_t length); +void stream_recv_callback(void *request, ucs_status_t status, size_t length, void *user_data); + +/** + * @brief Active message receive callback. + */ +ucs_status_t am_recv_callback(void *arg, const void *header, size_t header_length, void *data, size_t length, + const ucp_am_recv_param_t *param); + +/** + * @ingroup JUCX_REQ + * @brief Utility to allocate jucx request and set appropriate java callback in it. + */ +jobject jucx_request_allocate(JNIEnv *env, jobject callback, ucp_request_param_t *param, + jint memory_type); + +/** + * @ingroup JUCX_REQ + * @brief Utility to set iov vector in jucx_request, to release it on completion. + */ +void jucx_request_set_iov(JNIEnv *env, jobject request, ucp_dt_iov_t* iovec); + +/** + * @ingroup JUCX_REQ + * @brief Utility to update status of JUCX request to corresponding ucx request. + */ +void jucx_request_update_status(JNIEnv *env, jobject jucx_request, ucs_status_t status); + +/** + * @ingroup JUCX_REQ + * @brief Utility to set recv length in JUCX request. + */ +void jucx_request_update_recv_length(JNIEnv *env, jobject jucx_request, size_t rlength); /** - * @brief Utility to process request logic: if request is pointer - set callback to request context. - * If request is status - call callback directly. - * Returns jucx_request object, that could be monitored on completion. + * @ingroup JUCX_REQ + * @brief Utility to set sender tag in JUCX request. */ -jobject process_request(void *request, jobject callback); /** - * @brief Call java callback on completed stream recv operation, that didn't invoke callback. + * @brief Function to handle result of ucx function submission, to handle immediate completion.
*/ -jobject process_completed_stream_recv(size_t length, jobject callback); +void process_request(JNIEnv *env, jobject request, ucs_status_ptr_t status); void jucx_connection_handler(ucp_conn_request_h conn_request, void *arg); diff --git a/bindings/java/src/main/native/listener.cc b/bindings/java/src/main/native/listener.cc index 3114e71488f..062b08028f2 100644 --- a/bindings/java/src/main/native/listener.cc +++ b/bindings/java/src/main/native/listener.cc @@ -44,7 +44,7 @@ Java_org_openucx_jucx_ucp_UcpListener_createUcpListener(JNIEnv *env, jclass cls, field = env->GetFieldID(jucx_listener_param_class, "connectionHandler", "Lorg/openucx/jucx/ucp/UcpListenerConnectionHandler;"); jobject jucx_conn_handler = env->GetObjectField(ucp_listener_params, field); - params.conn_handler.arg = env->NewGlobalRef(jucx_conn_handler); + params.conn_handler.arg = env->NewWeakGlobalRef(jucx_conn_handler); params.conn_handler.cb = jucx_connection_handler; } diff --git a/bindings/java/src/main/native/memory.cc b/bindings/java/src/main/native/memory.cc index 8627aca89b5..239a071c2a4 100644 --- a/bindings/java/src/main/native/memory.cc +++ b/bindings/java/src/main/native/memory.cc @@ -5,6 +5,7 @@ #include "jucx_common_def.h" #include "org_openucx_jucx_ucp_UcpMemory.h" #include "org_openucx_jucx_ucp_UcpRemoteKey.h" +#include "org_openucx_jucx_UcxUtils.h" JNIEXPORT void JNICALL @@ -43,3 +44,16 @@ Java_org_openucx_jucx_ucp_UcpRemoteKey_rkeyDestroy(JNIEnv *env, jclass cls, jlon { ucp_rkey_destroy((ucp_rkey_h) rkey_ptr); } + +JNIEXPORT jlong JNICALL +Java_org_openucx_jucx_UcxUtils_getAddressNative(JNIEnv *env, jclass cls, jobject buffer) +{ + return (native_ptr)env->GetDirectBufferAddress(buffer); +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_UcxUtils_getByteBufferViewNative(JNIEnv *env, jclass cls, + jlong address, jlong size) +{ + return env->NewDirectByteBuffer((void*)address, size); +} diff --git a/bindings/java/src/main/native/request.cc b/bindings/java/src/main/native/request.cc deleted file mode 100644 index d65619b922e..00000000000 --- a/bindings/java/src/main/native/request.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#include "org_openucx_jucx_ucp_UcpRequest.h" - -#include -#include - -JNIEXPORT jboolean JNICALL -Java_org_openucx_jucx_ucp_UcpRequest_isCompletedNative(JNIEnv *env, jclass cls, - jlong ucp_req_ptr) -{ - return ucp_request_check_status((void *)ucp_req_ptr) != UCS_INPROGRESS; -} - -JNIEXPORT void JNICALL -Java_org_openucx_jucx_ucp_UcpRequest_closeRequestNative(JNIEnv *env, jclass cls, - jlong ucp_req_ptr) -{ - ucp_request_free((void *)ucp_req_ptr); -} diff --git a/bindings/java/src/main/native/ucp_constants.cc b/bindings/java/src/main/native/ucp_constants.cc index c156aae4aea..ee5eeb2642c 100644 --- a/bindings/java/src/main/native/ucp_constants.cc +++ b/bindings/java/src/main/native/ucp_constants.cc @@ -30,6 +30,7 @@ Java_org_openucx_jucx_ucp_UcpConstants_loadConstants(JNIEnv *env, jclass cls) JUCX_DEFINE_LONG_CONSTANT(UCP_FEATURE_AMO64); JUCX_DEFINE_LONG_CONSTANT(UCP_FEATURE_WAKEUP); JUCX_DEFINE_LONG_CONSTANT(UCP_FEATURE_STREAM); + JUCX_DEFINE_LONG_CONSTANT(UCP_FEATURE_AM); // UCP worker parameters JUCX_DEFINE_LONG_CONSTANT(UCP_WORKER_PARAM_FIELD_THREAD_MODE); @@ -65,8 +66,7 @@ Java_org_openucx_jucx_ucp_UcpConstants_loadConstants(JNIEnv *env, jclass cls) JUCX_DEFINE_INT_CONSTANT(UCP_ERR_HANDLING_MODE_PEER); // UCP endpoint close non blocking mode. 
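Tying the constant change below back to the Java API: closeNonBlockingForce() now passes UCP_EP_CLOSE_FLAG_FORCE while the flush variant passes 0, and native failures arrive as UcxException carrying the numeric UCS status (the two-argument constructor installed above). A sketch, assuming UcxException exposes the status it was constructed with:

```java
try {
    UcpRequest close = endpoint.closeNonBlockingFlush(); // graceful; closeNonBlockingForce() drops outstanding ops
    worker.progressRequest(close);
} catch (UcxException e) {
    if (e.getStatus() == UcsConstants.STATUS.UCS_ERR_CONNECTION_RESET) {
        // peer went away while flushing
    }
} catch (Exception e) {
    // other errors propagated out of progress()
}
```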
- JUCX_DEFINE_INT_CONSTANT(UCP_EP_CLOSE_MODE_FORCE); - JUCX_DEFINE_INT_CONSTANT(UCP_EP_CLOSE_MODE_FLUSH); + JUCX_DEFINE_INT_CONSTANT(UCP_EP_CLOSE_FLAG_FORCE); // The enumeration list describes the endpoint's parameters flags JUCX_DEFINE_LONG_CONSTANT(UCP_EP_PARAMS_FLAGS_CLIENT_SERVER); @@ -76,12 +76,30 @@ Java_org_openucx_jucx_ucp_UcpConstants_loadConstants(JNIEnv *env, jclass cls) JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_PARAM_FIELD_ADDRESS); JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_PARAM_FIELD_LENGTH); JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_PARAM_FIELD_FLAGS); + JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_PARAM_FIELD_PROT); + JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE); // The enumeration list describes the memory mapping flags JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_NONBLOCK); JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_ALLOCATE); JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_FIXED); + + // The enumeration list describes the memory mapping protections + JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_PROT_LOCAL_READ); + JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_PROT_LOCAL_WRITE); + JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_PROT_REMOTE_READ); + JUCX_DEFINE_LONG_CONSTANT(UCP_MEM_MAP_PROT_REMOTE_WRITE); + // The enumeration defines behavior of @ref ucp_stream_recv_nb function JUCX_DEFINE_LONG_CONSTANT(UCP_STREAM_RECV_FLAG_WAITALL); + + // The enumeration allows specifying which fields in @ref ucp_am_recv_param_t + // are present, and which receive operation flags are used. + JUCX_DEFINE_LONG_CONSTANT(UCP_AM_RECV_ATTR_FLAG_DATA); + JUCX_DEFINE_LONG_CONSTANT(UCP_AM_RECV_ATTR_FLAG_RNDV); + + // Flags that dictate the behavior of the @ref ucp_am_send_nbx routine. + JUCX_DEFINE_LONG_CONSTANT(UCP_AM_SEND_FLAG_REPLY); + JUCX_DEFINE_LONG_CONSTANT(UCP_AM_SEND_FLAG_EAGER); + JUCX_DEFINE_LONG_CONSTANT(UCP_AM_SEND_FLAG_RNDV); } diff --git a/bindings/java/src/main/native/ucs_constants.cc b/bindings/java/src/main/native/ucs_constants.cc index 28507b05c66..3461748959f 100644 --- a/bindings/java/src/main/native/ucs_constants.cc +++ b/bindings/java/src/main/native/ucs_constants.cc @@ -9,9 +9,59 @@ #include JNIEXPORT void JNICALL -Java_org_openucx_jucx_ucs_UcsConstants_loadConstants(JNIEnv *env, jclass cls) +Java_org_openucx_jucx_ucs_UcsConstants_loadConstants(JNIEnv *env, jclass ucs_class) { jclass thread_mode = env->FindClass("org/openucx/jucx/ucs/UcsConstants$ThreadMode"); jfieldID field = env->GetStaticFieldID(thread_mode, "UCS_THREAD_MODE_MULTI", "I"); env->SetStaticIntField(thread_mode, field, UCS_THREAD_MODE_MULTI); + + jclass cls = env->FindClass("org/openucx/jucx/ucs/UcsConstants$STATUS"); + + /* Operation completed successfully */ + JUCX_DEFINE_INT_CONSTANT(UCS_OK); + + /* Operation is queued and still in progress */ + JUCX_DEFINE_INT_CONSTANT(UCS_INPROGRESS); + /* Failure codes */ + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_NO_MESSAGE); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_NO_RESOURCE); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_IO_ERROR); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_NO_MEMORY); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_INVALID_PARAM); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_UNREACHABLE); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_INVALID_ADDR); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_NOT_IMPLEMENTED); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_MESSAGE_TRUNCATED); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_NO_PROGRESS); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_BUFFER_TOO_SMALL); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_NO_ELEM); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_SOME_CONNECTS_FAILED); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_NO_DEVICE); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_BUSY); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_CANCELED); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_SHMEM_SEGMENT); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_ALREADY_EXISTS); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_OUT_OF_RANGE); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_TIMED_OUT); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_EXCEEDS_LIMIT); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_UNSUPPORTED); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_REJECTED); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_NOT_CONNECTED); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_CONNECTION_RESET); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_FIRST_LINK_FAILURE); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_LAST_LINK_FAILURE); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_FIRST_ENDPOINT_FAILURE); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_ENDPOINT_TIMEOUT); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_LAST_ENDPOINT_FAILURE); + JUCX_DEFINE_INT_CONSTANT(UCS_ERR_LAST);
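Once loaded, these codes are readable from Java as plain int fields on UcsConstants.STATUS, so a request outcome can be inspected without parsing exception strings. A hedged sketch mirroring the cancel test later in this patch (worker is an assumed, existing UcpWorker with the tag feature enabled, and the surrounding method declares throws Exception):

    UcpRequest recv = worker.recvTaggedNonBlocking(ByteBuffer.allocateDirect(100), null);
    worker.cancelRequest(recv);
    while (!recv.isCompleted()) {
        worker.progress();
    }
    // getStatus() exposes the UCS code recorded by jucx_request_update_status.
    if (recv.getStatus() == UcsConstants.STATUS.UCS_ERR_CANCELED) {
        System.out.println("receive was canceled before being matched");
    }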
+ + // Memory type + cls = env->FindClass("org/openucx/jucx/ucs/UcsConstants$MEMORY_TYPE"); + JUCX_DEFINE_INT_CONSTANT(UCS_MEMORY_TYPE_HOST); + JUCX_DEFINE_INT_CONSTANT(UCS_MEMORY_TYPE_CUDA); + JUCX_DEFINE_INT_CONSTANT(UCS_MEMORY_TYPE_CUDA_MANAGED); + JUCX_DEFINE_INT_CONSTANT(UCS_MEMORY_TYPE_ROCM); + JUCX_DEFINE_INT_CONSTANT(UCS_MEMORY_TYPE_ROCM_MANAGED); + JUCX_DEFINE_INT_CONSTANT(UCS_MEMORY_TYPE_LAST); + JUCX_DEFINE_INT_CONSTANT(UCS_MEMORY_TYPE_UNKNOWN); } diff --git a/bindings/java/src/main/native/worker.cc b/bindings/java/src/main/native/worker.cc index cc1c4af45de..c72f49fc03b 100644 --- a/bindings/java/src/main/native/worker.cc +++ b/bindings/java/src/main/native/worker.cc @@ -10,7 +10,7 @@ * Bridge method for creating ucp_worker from java */ JNIEXPORT jlong JNICALL -Java_org_openucx_jucx_ucp_UcpWorker_createWorkerNative(JNIEnv *env, jclass cls, +Java_org_openucx_jucx_ucp_UcpWorker_createWorkerNative(JNIEnv *env, jobject jucx_worker, jobject jucx_worker_params, jlong context_ptr) { @@ -63,7 +63,20 @@ Java_org_openucx_jucx_ucp_UcpWorker_createWorkerNative(JNIEnv *env, jclass cls, ucs_status_t status = ucp_worker_create(ucp_context, &worker_params, &ucp_worker); if (status != UCS_OK) { JNU_ThrowExceptionByStatus(env, status); + return -1L; } + + ucp_worker_attr_t attr = {0}; + attr.field_mask = UCP_WORKER_ATTR_FIELD_MAX_AM_HEADER; + + status = ucp_worker_query(ucp_worker, &attr); + if (status != UCS_OK) { + JNU_ThrowExceptionByStatus(env, status); + } + + field = env->GetFieldID(env->GetObjectClass(jucx_worker), "maxAmHeaderSize", "J"); + env->SetLongField(jucx_worker, field, attr.max_am_header); + return (native_ptr)ucp_worker; } @@ -114,10 +127,18 @@ Java_org_openucx_jucx_ucp_UcpWorker_flushNonBlockingNative(JNIEnv *env, jclass c jlong ucp_worker_ptr, jobject callback) { - ucs_status_ptr_t request = ucp_worker_flush_nb((ucp_worker_h)ucp_worker_ptr, 0, - jucx_request_callback); + ucp_request_param_t param; - return process_request(request, callback); + jobject jucx_request = jucx_request_allocate(env, callback, &param, UCS_MEMORY_TYPE_UNKNOWN); + + param.cb.send = jucx_request_callback; + + ucs_status_ptr_t status = ucp_worker_flush_nbx((ucp_worker_h)ucp_worker_ptr, &param); + ucs_trace_req("JUCX: ucp_worker_flush_nbx request %p", status); + + process_request(env, jucx_request, status); + + return jucx_request; } JNIEXPORT void JNICALL @@ -145,46 +166,68 @@ Java_org_openucx_jucx_ucp_UcpWorker_recvTaggedNonBlockingNative(JNIEnv *env, jcl jlong ucp_worker_ptr, jlong laddr, jlong size, jlong tag, jlong tag_mask, - jobject callback) + jobject callback, jint memory_type) { - ucs_status_ptr_t request = ucp_tag_recv_nb((ucp_worker_h)ucp_worker_ptr, - (void *)laddr, size, -
ucp_dt_make_contig(1), tag, tag_mask, - recv_callback); + ucp_request_param_t param = {0}; + ucp_tag_recv_info_t recv_info = {0}; + + jobject jucx_request = jucx_request_allocate(env, callback, &param, memory_type); + + param.op_attr_mask |= UCP_OP_ATTR_FIELD_RECV_INFO; + param.cb.recv = recv_callback; + param.recv_info.tag_info = &recv_info; - ucs_trace_req("JUCX: tag_recv_nb request %p, msg size: %zu, tag: %ld", request, size, tag); + ucs_status_ptr_t status = ucp_tag_recv_nbx((ucp_worker_h)ucp_worker_ptr, + (void *)laddr, size, tag, tag_mask, &param); + ucs_trace_req("JUCX: tag_recv_nb request %p, msg size: %zu, tag: %ld", status, size, tag); - return process_request(request, callback); + if (UCS_PTR_STATUS(status) == UCS_OK) { + jucx_request_update_recv_length(env, jucx_request, recv_info.length); + jucx_request_update_sender_tag(env, jucx_request, recv_info.sender_tag); + } + + process_request(env, jucx_request, status); + + return jucx_request; } JNIEXPORT jobject JNICALL Java_org_openucx_jucx_ucp_UcpWorker_recvTaggedIovNonBlockingNative(JNIEnv *env, jclass cls, jlong ucp_worker_ptr, - jlongArray addresses, jlongArray sizes, - jlong tag, jlong tag_mask, - jobject callback) + jlongArray addresses, + jlongArray sizes, jlong tag, + jlong tag_mask, jobject callback, + jint memory_type) { int iovcnt; + ucp_request_param_t param = {0}; + ucp_tag_recv_info_t recv_info = {0}; + + jobject jucx_request = jucx_request_allocate(env, callback, &param, memory_type); ucp_dt_iov_t* iovec = get_ucp_iov(env, addresses, sizes, iovcnt); if (iovec == NULL) { return NULL; } - ucs_status_ptr_t request = ucp_tag_recv_nb((ucp_worker_h)ucp_worker_ptr, - iovec, iovcnt, - ucp_dt_make_iov(), tag, tag_mask, - recv_callback); + jucx_request_set_iov(env, jucx_request, iovec); - if (UCS_PTR_IS_PTR(request)) { - struct jucx_context *ctx = (struct jucx_context *)request; - ctx->iovec = iovec; - } else { - ucs_free(iovec); - } + param.op_attr_mask |= UCP_OP_ATTR_FIELD_RECV_INFO | + UCP_OP_ATTR_FIELD_DATATYPE; + param.cb.recv = recv_callback; + param.datatype = ucp_dt_make_iov(); + param.recv_info.tag_info = &recv_info; + + ucs_status_ptr_t status = ucp_tag_recv_nbx((ucp_worker_h)ucp_worker_ptr, + iovec, iovcnt, tag, tag_mask, &param); + ucs_trace_req("JUCX: tag_recv_iov_nb request %p, tag: %ld", status, tag); - ucs_trace_req("JUCX: tag_recv_iov_nb request %p, tag: %ld", request, tag); + if (UCS_PTR_STATUS(status) == UCS_OK) { + jucx_request_update_recv_length(env, jucx_request, recv_info.length); + jucx_request_update_sender_tag(env, jucx_request, recv_info.sender_tag); + } + process_request(env, jucx_request, status); - return process_request(request, callback); + return jucx_request; } JNIEXPORT jobject JNICALL @@ -210,18 +253,33 @@ Java_org_openucx_jucx_ucp_UcpWorker_recvTaggedMessageNonBlockingNative(JNIEnv *e jlong ucp_worker_ptr, jlong laddr, jlong size, jlong msg_ptr, - jobject callback) + jobject callback, + jint memory_type) { - ucs_status_ptr_t request = ucp_tag_msg_recv_nb((ucp_worker_h)ucp_worker_ptr, + ucp_request_param_t param = {0}; + ucp_tag_recv_info_t recv_info = {0}; + + jobject jucx_request = jucx_request_allocate(env, callback, &param, memory_type); + + param.op_attr_mask |= UCP_OP_ATTR_FIELD_RECV_INFO; + param.cb.recv = recv_callback; + param.recv_info.tag_info = &recv_info; + + ucs_status_ptr_t status = ucp_tag_msg_recv_nbx((ucp_worker_h)ucp_worker_ptr, (void *)laddr, size, - ucp_dt_make_contig(1), (ucp_tag_message_h)msg_ptr, - recv_callback); - - ucs_trace_req("JUCX: tag_msg_recv_nb request %p, msg size: %zu, msg:
%p", request, size, + &param); + ucs_trace_req("JUCX: tag_msg_recv_nb request %p, msg size: %zu, msg: %p", status, size, (ucp_tag_message_h)msg_ptr); - return process_request(request, callback); + if (UCS_PTR_STATUS(status) == UCS_OK) { + jucx_request_update_recv_length(env, jucx_request, recv_info.length); + jucx_request_update_sender_tag(env, jucx_request, recv_info.sender_tag); + } + + process_request(env, jucx_request, status); + + return jucx_request; } JNIEXPORT void JNICALL @@ -231,3 +289,62 @@ Java_org_openucx_jucx_ucp_UcpWorker_cancelRequestNative(JNIEnv *env, jclass cls, { ucp_request_cancel((ucp_worker_h)ucp_worker_ptr, (void *)ucp_request_ptr); } + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpWorker_setAmRecvHandlerNative(JNIEnv *env, jclass cls, + jlong ucp_worker_ptr, jint amId, + jobjectArray callbackAndWorker) +{ + ucp_am_handler_param_t param = {0}; + param.field_mask = UCP_AM_HANDLER_PARAM_FIELD_ID | + UCP_AM_HANDLER_PARAM_FIELD_FLAGS | + UCP_AM_HANDLER_PARAM_FIELD_CB | + UCP_AM_HANDLER_PARAM_FIELD_ARG; + param.id = amId; + param.flags = UCP_AM_FLAG_WHOLE_MSG; + param.cb = am_recv_callback; + param.arg = env->NewWeakGlobalRef(callbackAndWorker); + + ucs_status_t status = ucp_worker_set_am_recv_handler((ucp_worker_h)ucp_worker_ptr, &param); + + if (status != UCS_OK) { + JNU_ThrowExceptionByStatus(env, status); + } +} + +JNIEXPORT jobject JNICALL +Java_org_openucx_jucx_ucp_UcpWorker_recvAmDataNonBlockingNative(JNIEnv *env, jclass cls, + jlong ucp_worker_ptr, + jlong data_descr_ptr, + jlong address, jlong length, + jobject callback, jint memory_type) +{ + ucp_request_param_t param = {0}; + size_t recv_length; + + + jobject jucx_request = jucx_request_allocate(env, callback, &param, memory_type); + + param.op_attr_mask |= UCP_OP_ATTR_FIELD_RECV_INFO; + param.cb.recv_am = stream_recv_callback; + param.recv_info.length = &recv_length; + + ucs_status_ptr_t status = ucp_am_recv_data_nbx((ucp_worker_h)ucp_worker_ptr, (void*)data_descr_ptr, + (void*)address, length, &param); + ucs_trace_req("JUCX: ucp_am_recv_data_nbx request %p, msg size: %zu, data: %p", status, length, + (void*)data_descr_ptr); + + if (UCS_PTR_STATUS(status) == UCS_OK) { + jucx_request_update_recv_length(env, jucx_request, recv_length); + } + + process_request(env, jucx_request, status); + return jucx_request; +} + +JNIEXPORT void JNICALL +Java_org_openucx_jucx_ucp_UcpWorker_amDataReleaseNative(JNIEnv *env, jclass cls, + jlong ucp_worker_ptr, jlong data_descr_ptr) +{ + ucp_am_data_release((ucp_worker_h)ucp_worker_ptr, (void*)data_descr_ptr); } diff --git a/bindings/java/src/test/java/org/openucx/jucx/UcpContextTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcpContextTest.java index 8450604083d..1b67854bf50 100644 --- a/bindings/java/src/test/java/org/openucx/jucx/UcpContextTest.java +++ b/bindings/java/src/test/java/org/openucx/jucx/UcpContextTest.java @@ -7,28 +7,31 @@ import org.junit.Test; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - import org.openucx.jucx.ucp.UcpContext; import org.openucx.jucx.ucp.UcpParams; +import org.openucx.jucx.ucs.UcsConstants; + +import static org.junit.Assert.*; public class UcpContextTest { public static UcpContext createContext(UcpParams contextParams) { UcpContext context = new UcpContext(contextParams); assertTrue(context.getNativeId() > 0); + assertTrue(UcsConstants.MEMORY_TYPE.isMemTypeSupported(context.getMemoryTypesMask(), + UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_HOST)); return context; }
public static void closeContext(UcpContext context) { context.close(); - assertEquals(context.getNativeId(), null); + assertNull(context.getNativeId()); } @Test public void testCreateSimpleUcpContext() { - UcpParams contextParams = new UcpParams().requestTagFeature(); + UcpParams contextParams = new UcpParams().requestTagFeature() + .requestAmFeature(); UcpContext context = createContext(contextParams); closeContext(context); } diff --git a/bindings/java/src/test/java/org/openucx/jucx/UcpEndpointTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcpEndpointTest.java index ea5ffec6188..575f33afc88 100644 --- a/bindings/java/src/test/java/org/openucx/jucx/UcpEndpointTest.java +++ b/bindings/java/src/test/java/org/openucx/jucx/UcpEndpointTest.java @@ -6,17 +6,40 @@ package org.openucx.jucx; import org.junit.Test; +import org.junit.experimental.theories.DataPoints; +import org.junit.experimental.theories.Theories; +import org.junit.experimental.theories.Theory; +import org.junit.runner.RunWith; import org.openucx.jucx.ucp.*; +import org.openucx.jucx.ucs.UcsConstants; import java.nio.ByteBuffer; -import java.util.Collections; -import java.util.HashMap; +import java.util.*; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import static org.junit.Assert.*; +@RunWith(Theories.class) public class UcpEndpointTest extends UcxTest { + + @DataPoints + public static ArrayList<Integer> memTypes() { + ArrayList<Integer> result = new ArrayList<>(); + result.add(UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_HOST); + UcpContext testContext = new UcpContext(new UcpParams().requestTagFeature()); + long memTypeMask = testContext.getMemoryTypesMask(); + if (UcsConstants.MEMORY_TYPE.isMemTypeSupported(memTypeMask, + UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_CUDA)) { + result.add(UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_CUDA); + } + if (UcsConstants.MEMORY_TYPE.isMemTypeSupported(memTypeMask, + UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_CUDA_MANAGED)) { + result.add(UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_CUDA_MANAGED); + } + return result; + } + @Test public void testConnectToListenerByWorkerAddr() { UcpContext context = new UcpContext(new UcpParams().requestStreamFeature()); @@ -30,8 +53,9 @@ public void testConnectToListenerByWorkerAddr() { closeResources(); } - @Test - public void testGetNB() { + @Theory + public void testGetNB(int memType) throws Exception { + System.out.println("Running testGetNB with memType: " + memType); // Create 2 contexts + 2 workers UcpParams params = new UcpParams().requestRmaFeature(); UcpWorkerParams rdmaWorkerParams = new UcpWorkerParams().requestWakeupRMA(); @@ -46,47 +70,35 @@ public void testGetNB() { UcpEndpoint endpoint = worker1.newEndpoint(epParams); // Allocate 2 source and 2 destination buffers, to perform 2 RDMA Read operations - ByteBuffer src1 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); - ByteBuffer src2 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); - ByteBuffer dst1 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); - ByteBuffer dst2 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); - src1.asCharBuffer().put(UcpMemoryTest.RANDOM_TEXT); - src2.asCharBuffer().put(UcpMemoryTest.RANDOM_TEXT + UcpMemoryTest.RANDOM_TEXT); + MemoryBlock src1 = allocateMemory(context2, worker2, memType, UcpMemoryTest.MEM_SIZE); + MemoryBlock src2 = allocateMemory(context2, worker2, memType, UcpMemoryTest.MEM_SIZE); + MemoryBlock dst1 = allocateMemory(context1, worker1, memType, UcpMemoryTest.MEM_SIZE); + MemoryBlock dst2 = allocateMemory(context1, worker1,
memType, UcpMemoryTest.MEM_SIZE); + + src1.setData(UcpMemoryTest.RANDOM_TEXT); + src2.setData(UcpMemoryTest.RANDOM_TEXT + UcpMemoryTest.RANDOM_TEXT); // Register source buffers on context2 - UcpMemory memory1 = context2.registerMemory(src1); - UcpMemory memory2 = context2.registerMemory(src2); + UcpMemory memory1 = src1.getMemory(); + UcpMemory memory2 = src2.getMemory(); UcpRemoteKey rkey1 = endpoint.unpackRemoteKey(memory1.getRemoteKeyBuffer()); UcpRemoteKey rkey2 = endpoint.unpackRemoteKey(memory2.getRemoteKeyBuffer()); AtomicInteger numCompletedRequests = new AtomicInteger(0); - HashMap requestToData = new HashMap<>(); + UcxCallback callback = new UcxCallback() { @Override public void onSuccess(UcpRequest request) { - // Here thread safety is guaranteed since worker progress is called after - // request added to map. In multithreaded environment could be an issue that - // callback is called, but request wasn't added yet to map. - if (requestToData.get(request) == dst1) { - assertEquals(UcpMemoryTest.RANDOM_TEXT, dst1.asCharBuffer().toString().trim()); - memory1.deregister(); - } else { - assertEquals(UcpMemoryTest.RANDOM_TEXT + UcpMemoryTest.RANDOM_TEXT, - dst2.asCharBuffer().toString().trim()); - memory2.deregister(); - } numCompletedRequests.incrementAndGet(); } }; // Submit 2 get requests - UcpRequest request1 = endpoint.getNonBlocking(memory1.getAddress(), rkey1, dst1, callback); - UcpRequest request2 = endpoint.getNonBlocking(memory2.getAddress(), rkey2, dst2, callback); - - // Map each request to corresponding data buffer. - requestToData.put(request1, dst1); - requestToData.put(request2, dst2); + UcpRequest request1 = endpoint.getNonBlocking(memory1.getAddress(), rkey1, + dst1.getMemory().getAddress(), dst1.getMemory().getLength(), callback); + UcpRequest request2 = endpoint.getNonBlocking(memory2.getAddress(), rkey2, + dst2.getMemory().getAddress(), dst2.getMemory().getLength(), callback); // Wait for 2 get operations to complete while (numCompletedRequests.get() != 2) { @@ -94,15 +106,18 @@ public void onSuccess(UcpRequest request) { worker2.progress(); } + assertEquals(src1.getData().asCharBuffer(), dst1.getData().asCharBuffer()); + assertEquals(src2.getData().asCharBuffer(), dst2.getData().asCharBuffer()); assertTrue(request1.isCompleted() && request2.isCompleted()); Collections.addAll(resources, context2, context1, worker2, worker1, endpoint, rkey2, - rkey1); + rkey1, src1, src2, dst1, dst2); closeResources(); } - @Test - public void testPutNB() { + @Theory + public void testPutNB(int memType) throws Exception { + System.out.println("Running testPutNB with memType: " + memType); // Crerate 2 contexts + 2 workers UcpParams params = new UcpParams().requestRmaFeature(); UcpWorkerParams rdmaWorkerParams = new UcpWorkerParams().requestWakeupRMA(); @@ -111,28 +126,29 @@ public void testPutNB() { UcpWorker worker1 = context1.newWorker(rdmaWorkerParams); UcpWorker worker2 = context2.newWorker(rdmaWorkerParams); - ByteBuffer src = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); - ByteBuffer dst = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); - src.asCharBuffer().put(UcpMemoryTest.RANDOM_TEXT); + MemoryBlock src = allocateMemory(context1, worker1, memType, UcpMemoryTest.MEM_SIZE); + MemoryBlock dst = allocateMemory(context2, worker2, memType, UcpMemoryTest.MEM_SIZE); + + src.setData(UcpMemoryTest.RANDOM_TEXT); - // Register destination buffer on context2 - UcpMemory memory = context2.registerMemory(dst); UcpEndpoint ep = worker1.newEndpoint(new 
UcpEndpointParams().setUcpAddress(worker2.getAddress())); - UcpRemoteKey rkey = ep.unpackRemoteKey(memory.getRemoteKeyBuffer()); - ep.putNonBlocking(src, memory.getAddress(), rkey, null); + UcpRemoteKey rkey = ep.unpackRemoteKey(dst.getMemory().getRemoteKeyBuffer()); + ep.putNonBlocking(src.getMemory().getAddress(), UcpMemoryTest.MEM_SIZE, + dst.getMemory().getAddress(), rkey, null); worker1.progressRequest(worker1.flushNonBlocking(null)); - assertEquals(UcpMemoryTest.RANDOM_TEXT, dst.asCharBuffer().toString().trim()); + assertEquals(UcpMemoryTest.RANDOM_TEXT, dst.getData().asCharBuffer().toString().trim()); - Collections.addAll(resources, context2, context1, worker2, worker1, rkey, ep, memory); + Collections.addAll(resources, context2, context1, worker2, worker1, rkey, ep, src, dst); closeResources(); } - @Test - public void testSendRecv() throws Exception { + @Theory + public void testSendRecv(int memType) throws Exception { + System.out.println("Running testSendRecv with memType: " + memType); // Crerate 2 contexts + 2 workers UcpParams params = new UcpParams().requestRmaFeature().requestTagFeature(); UcpWorkerParams rdmaWorkerParams = new UcpWorkerParams().requestWakeupRMA(); @@ -141,47 +157,48 @@ public void testSendRecv() throws Exception { UcpWorker worker1 = context1.newWorker(rdmaWorkerParams); UcpWorker worker2 = context2.newWorker(rdmaWorkerParams); - // Allocate 2 source and 2 destination buffers, to perform 2 RDMA Read operations - UcpMemMapParams allocationParams = new UcpMemMapParams().allocate() - .setLength(UcpMemoryTest.MEM_SIZE); - UcpMemory memory1 = context1.memoryMap(allocationParams); - UcpMemory memory2 = context1.memoryMap(allocationParams); - ByteBuffer src1 = UcxUtils.getByteBufferView(memory1.getAddress(), UcpMemoryTest.MEM_SIZE); - ByteBuffer src2 = UcxUtils.getByteBufferView(memory1.getAddress(), UcpMemoryTest.MEM_SIZE); - ByteBuffer dst1 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); - ByteBuffer dst2 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); - src1.asCharBuffer().put(UcpMemoryTest.RANDOM_TEXT); - src2.asCharBuffer().put(UcpMemoryTest.RANDOM_TEXT + UcpMemoryTest.RANDOM_TEXT); + MemoryBlock src1 = allocateMemory(context1, worker1, memType, UcpMemoryTest.MEM_SIZE); + MemoryBlock src2 = allocateMemory(context1, worker1, memType, UcpMemoryTest.MEM_SIZE); + + MemoryBlock dst1 = allocateMemory(context2, worker2, memType, UcpMemoryTest.MEM_SIZE); + MemoryBlock dst2 = allocateMemory(context2, worker2, memType, UcpMemoryTest.MEM_SIZE); + + src1.setData(UcpMemoryTest.RANDOM_TEXT); + src2.setData(UcpMemoryTest.RANDOM_TEXT + UcpMemoryTest.RANDOM_TEXT); AtomicInteger receivedMessages = new AtomicInteger(0); - worker2.recvTaggedNonBlocking(dst1, 0, 0, new UcxCallback() { - @Override - public void onSuccess(UcpRequest request) { - assertEquals(dst1, src1); - receivedMessages.incrementAndGet(); - } - }); + worker2.recvTaggedNonBlocking(dst1.getMemory().getAddress(), UcpMemoryTest.MEM_SIZE, 0, 0, + new UcxCallback() { + @Override + public void onSuccess(UcpRequest request) { + receivedMessages.incrementAndGet(); + } + }); - worker2.recvTaggedNonBlocking(dst2, 1, -1, new UcxCallback() { - @Override - public void onSuccess(UcpRequest request) { - assertEquals(dst2, src2); - receivedMessages.incrementAndGet(); - } - }); + worker2.recvTaggedNonBlocking(dst2.getMemory().getAddress(), UcpMemoryTest.MEM_SIZE, + 1, -1, new UcxCallback() { + @Override + public void onSuccess(UcpRequest request) { + receivedMessages.incrementAndGet(); + } + }); UcpEndpoint ep = 
worker1.newEndpoint(new UcpEndpointParams() .setUcpAddress(worker2.getAddress())); - ep.sendTaggedNonBlocking(src1, 0, null); - ep.sendTaggedNonBlocking(src2, 1, null); + ep.sendTaggedNonBlocking(src1.getMemory().getAddress(), UcpMemoryTest.MEM_SIZE, 0, null); + ep.sendTaggedNonBlocking(src2.getMemory().getAddress(), UcpMemoryTest.MEM_SIZE, 1, null); while (receivedMessages.get() != 2) { worker1.progress(); worker2.progress(); } - Collections.addAll(resources, context2, context1, worker2, worker1, memory2, memory1, ep); + assertEquals(src1.getData().asCharBuffer(), dst1.getData().asCharBuffer()); + assertEquals(src2.getData().asCharBuffer(), dst2.getData().asCharBuffer()); + + Collections.addAll(resources, context2, context1, worker2, worker1, ep, + src1, src2, dst1, dst2); closeResources(); } @@ -200,6 +217,7 @@ public void testRecvAfterSend() { UcpEndpoint ep = worker1.newEndpoint(new UcpEndpointParams() .setPeerErrorHandlingMode() + .setErrorHandler((errEp, status, errorMsg) -> { }) .setUcpAddress(worker2.getAddress())); ByteBuffer src1 = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); @@ -211,8 +229,13 @@ public void testRecvAfterSend() { @Override public void run() { while (!isInterrupted()) { - worker1.progress(); - worker2.progress(); + try { + worker1.progress(); + worker2.progress(); + } catch (Exception ex) { + System.err.println(ex.getMessage()); + ex.printStackTrace(); + } } } }; @@ -248,8 +271,6 @@ public void onSuccess(UcpRequest request) { Thread.sleep(10); } catch (InterruptedException e) { e.printStackTrace(); - } finally { - closeRequest.close(); } } @@ -263,7 +284,7 @@ public void onSuccess(UcpRequest request) { } @Test - public void testBufferOffset() { + public void testBufferOffset() throws Exception { int msgSize = 200; int offset = 100; // Crerate 2 contexts + 2 workers @@ -311,7 +332,7 @@ public void testBufferOffset() { } @Test - public void testFlushEp() { + public void testFlushEp() throws Exception { int numRequests = 10; // Crerate 2 contexts + 2 workers UcpParams params = new UcpParams().requestRmaFeature(); @@ -356,7 +377,7 @@ public void onSuccess(UcpRequest request) { } @Test - public void testRecvSize() { + public void testRecvSize() throws Exception { UcpContext context1 = new UcpContext(new UcpParams().requestTagFeature()); UcpContext context2 = new UcpContext(new UcpParams().requestTagFeature()); @@ -386,7 +407,7 @@ public void testRecvSize() { } @Test - public void testStreamingAPI() { + public void testStreamingAPI() throws Exception { UcpParams params = new UcpParams().requestStreamFeature().requestRmaFeature(); UcpContext context1 = new UcpContext(params); UcpContext context2 = new UcpContext(params); @@ -401,7 +422,7 @@ public void testStreamingAPI() { new UcpEndpointParams().setUcpAddress(worker1.getAddress())); ByteBuffer sendBuffer = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE); - sendBuffer.put(0, (byte)1); + sendBuffer.put(0, (byte) 1); ByteBuffer recvBuffer = ByteBuffer.allocateDirect(UcpMemoryTest.MEM_SIZE * 2); UcpRequest[] sends = new UcpRequest[2]; @@ -421,7 +442,7 @@ public void onSuccess(UcpRequest request) { AtomicBoolean received = new AtomicBoolean(false); serverToClient.recvStreamNonBlocking( - UcxUtils.getAddress(recvBuffer), UcpMemoryTest.MEM_SIZE * 2, + UcxUtils.getAddress(recvBuffer), UcpMemoryTest.MEM_SIZE * 2L, UcpConstants.UCP_STREAM_RECV_FLAG_WAITALL, new UcxCallback() { @Override @@ -443,12 +464,12 @@ public void onSuccess(UcpRequest request) { closeResources(); } - @Test - public void testIovOperations() throws 
Exception { + @Theory + public void testIovOperations(int memType) throws Exception { + System.out.println("Running testIovOperations with memType: " + memType); int NUM_IOV = 6; long buffMultiplier = 10L; - UcpMemMapParams memMapParams = new UcpMemMapParams().allocate(); // Crerate 2 contexts + 2 workers UcpParams params = new UcpParams().requestTagFeature().requestStreamFeature(); UcpWorkerParams workerParams = new UcpWorkerParams(); @@ -463,10 +484,12 @@ public void testIovOperations() throws Exception { UcpEndpoint recvEp = worker2.newEndpoint(new UcpEndpointParams() .setUcpAddress(worker1.getAddress())); + MemoryBlock[] sendMemory = new MemoryBlock[NUM_IOV]; UcpMemory[] sendBuffers = new UcpMemory[NUM_IOV]; long[] sendAddresses = new long[NUM_IOV]; long[] sizes = new long[NUM_IOV]; + MemoryBlock[] recvMemory = new MemoryBlock[NUM_IOV]; UcpMemory[] recvBuffers = new UcpMemory[NUM_IOV]; long[] recvAddresses = new long[NUM_IOV]; @@ -475,16 +498,16 @@ public void testIovOperations() throws Exception { for (int i = 0; i < NUM_IOV; i++) { long bufferSize = (i + 1) * buffMultiplier; totalSize += bufferSize; - memMapParams.setLength(bufferSize); - sendBuffers[i] = context1.memoryMap(memMapParams); + sendMemory[i] = allocateMemory(context1, worker1, memType, bufferSize); + sendBuffers[i] = sendMemory[i].getMemory(); sendAddresses[i] = sendBuffers[i].getAddress(); sizes[i] = bufferSize; - ByteBuffer buf = UcxUtils.getByteBufferView(sendAddresses[i], (int)bufferSize); - buf.putInt(0, (i + 1)); + sendMemory[i].setData(String.valueOf(i + 1)); - recvBuffers[i] = context2.memoryMap(memMapParams); + recvMemory[i] = allocateMemory(context2, worker2, memType, bufferSize); + recvBuffers[i] = recvMemory[i].getMemory(); recvAddresses[i] = recvBuffers[i].getAddress(); } @@ -499,12 +522,13 @@ public void testIovOperations() throws Exception { assertEquals(totalSize, recv.getRecvSize()); for (int i = 0; i < NUM_IOV; i++) { - ByteBuffer buf = UcxUtils.getByteBufferView(recvAddresses[i], (int)sizes[i]); - assertEquals((i + 1), buf.getInt(0)); - recvBuffers[i].deregister(); + assertEquals(String.valueOf(i + 1), + recvMemory[i].getData().asCharBuffer().toString().trim()); + recvMemory[i].close(); } // Test 6 send IOV to 3 recv IOV + recvMemory = new MemoryBlock[NUM_IOV / 2]; recvBuffers = new UcpMemory[NUM_IOV / 2]; recvAddresses = new long[NUM_IOV / 2]; long[] recvSizes = new long[NUM_IOV / 2]; @@ -513,7 +537,8 @@ public void testIovOperations() throws Exception { for (int i = 0; i < NUM_IOV / 2; i++) { long bufferLength = (i + 1) * buffMultiplier * 2; totalSize += bufferLength; - recvBuffers[i] = context2.memoryMap(memMapParams.setLength(bufferLength)); + recvMemory[i] = allocateMemory(context2, worker2, memType, bufferLength); + recvBuffers[i] = recvMemory[i].getMemory(); recvAddresses[i] = recvBuffers[i].getAddress(); recvSizes[i] = bufferLength; } @@ -527,17 +552,16 @@ public void testIovOperations() throws Exception { } assertEquals(totalSize, recv.getRecvSize()); - ByteBuffer buf = UcxUtils.getByteBufferView(recvAddresses[0], (int)recvSizes[0]); - assertEquals(1, buf.getInt(0)); + assertEquals('1', recvMemory[0].getData().asCharBuffer().get(0)); Collections.addAll(resources, context1, context2, worker1, worker2, ep); - Collections.addAll(resources, sendBuffers); - Collections.addAll(resources, recvBuffers); + Collections.addAll(resources, sendMemory); + Collections.addAll(resources, recvMemory); closeResources(); } @Test - public void testEpErrorHandler() { + public void testEpErrorHandler() throws 
Exception { // Create 2 contexts + 2 workers UcpParams params = new UcpParams().requestTagFeature(); UcpWorkerParams workerParams = new UcpWorkerParams(); @@ -591,4 +615,124 @@ public void onError(int ucsStatus, String errorMsg) { worker1.close(); context1.close(); } + + @Theory + public void testActiveMessages(int memType) throws Exception { + System.out.println("Running testActiveMessages with memType: " + memType); + UcpParams params = new UcpParams().requestAmFeature().requestTagFeature(); + UcpContext context1 = new UcpContext(params); + UcpContext context2 = new UcpContext(params); + + UcpWorker worker1 = context1.newWorker(new UcpWorkerParams()); + UcpWorker worker2 = context2.newWorker(new UcpWorkerParams()); + + String headerString = "Hello"; + String dataString = "Active messages"; + long headerSize = headerString.length() * 2; + long dataSize = UcpMemoryTest.MEM_SIZE; + assertTrue(headerSize < worker1.getMaxAmHeaderSize()); + + ByteBuffer header = ByteBuffer.allocateDirect((int) headerSize); + header.asCharBuffer().append(headerString); + + header.rewind(); + + MemoryBlock sendData = allocateMemory(context2, worker2, memType, dataSize); + sendData.setData(dataString); + + MemoryBlock recvData = allocateMemory(context1, worker1, memType, dataSize); + MemoryBlock recvEagerData = allocateMemory(context1, worker1, memType, dataSize); + ByteBuffer recvHeader = ByteBuffer.allocateDirect((int) headerSize); + UcpRequest[] requests = new UcpRequest[6]; + + UcpEndpoint ep = worker2.newEndpoint( + new UcpEndpointParams().setUcpAddress(worker1.getAddress())); + + Set<UcpEndpoint> cachedEp = new HashSet<>(); + + // Test rndv flow + worker1.setAmRecvHandler(0, (headerAddress, headerSize12, amData, replyEp) -> { + assertFalse(amData.isDataValid()); + try { + assertEquals(headerString, + UcxUtils.getByteBufferView(headerAddress, (int) headerSize12) + .asCharBuffer().toString().trim()); + } catch (Exception e) { + e.printStackTrace(); + } + + requests[2] = replyEp.sendTaggedNonBlocking(header, null); + requests[3] = amData.receive(recvData.getMemory().getAddress(), null); + + if (!cachedEp.isEmpty()) { + assertTrue(cachedEp.contains(replyEp)); + } else { + cachedEp.add(replyEp); + } + + return UcsConstants.STATUS.UCS_OK; + }); + + // Test eager flow + worker1.setAmRecvHandler(1, (headerAddress, headerSize1, amData, replyEp) -> { + assertTrue(amData.isDataValid()); + try { + assertEquals(dataString, + UcxUtils.getByteBufferView(amData.getDataAddress(), (int) amData.getLength()) + .asCharBuffer().toString().trim()); + } catch (Exception e) { + e.printStackTrace(); + } + + if (!cachedEp.isEmpty()) { + assertTrue(cachedEp.contains(replyEp)); + } else { + cachedEp.add(replyEp); + } + + requests[5] = amData.receive(recvEagerData.getMemory().getAddress(), null); + + return UcsConstants.STATUS.UCS_OK; + }); + + requests[0] = ep.sendAmNonBlocking(0, + UcxUtils.getAddress(header), headerSize, + sendData.getMemory().getAddress(), sendData.getMemory().getLength(), + UcpConstants.UCP_AM_SEND_FLAG_REPLY | UcpConstants.UCP_AM_SEND_FLAG_RNDV, + new UcxCallback() { + @Override + public void onSuccess(UcpRequest request) { + assertTrue(request.isCompleted()); + } + }); + + requests[1] = worker2.recvTaggedNonBlocking(recvHeader, null); + requests[4] = ep.sendAmNonBlocking(1, 0L, 0L, + sendData.getMemory().getAddress(), dataSize, + UcpConstants.UCP_AM_SEND_FLAG_REPLY | UcpConstants.UCP_AM_SEND_FLAG_EAGER, null); + +
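All six requests are posted at this point; the loop below progresses both workers until every one completes. For reference, the essential registration/send pairing of the new AM API can be reduced to a short hedged sketch (eager path only; worker1 and ep as above, while dataAddress and dataSize are hypothetical names for a registered buffer; id 1 is arbitrary):

    // Receiver side: handler for AM id 1. Returning UCS_OK lets UCX release
    // the incoming data descriptor as soon as the lambda returns.
    worker1.setAmRecvHandler(1, (headerAddress, headerSize, amData, replyEp) -> {
        if (amData.isDataValid()) {
            // Eager data is already in host-accessible memory:
            ByteBuffer data = UcxUtils.getByteBufferView(amData.getDataAddress(),
                                                         (int) amData.getLength());
        }
        return UcsConstants.STATUS.UCS_OK;
    });

    // Sender side: no header (0L, 0L); the flag forces the eager protocol.
    UcpRequest send = ep.sendAmNonBlocking(1, 0L, 0L, dataAddress, dataSize,
            UcpConstants.UCP_AM_SEND_FLAG_EAGER, null);

    // Unregister once finished, as the test does after its completion loop.
    worker1.removeAmRecvHandler(1);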
+ while (!Arrays.stream(requests).allMatch(r -> (r != null) && r.isCompleted())) { + worker1.progress(); + worker2.progress(); + } + + assertEquals(dataString, + recvData.getData().asCharBuffer().toString().trim()); + + assertEquals(dataString, + recvEagerData.getData().asCharBuffer().toString().trim()); + + assertEquals(headerString, + recvHeader.asCharBuffer().toString().trim()); + + // Reset AM callback + worker1.removeAmRecvHandler(0); + worker1.removeAmRecvHandler(1); + + Collections.addAll(resources, context1, context2, worker1, worker2, ep, + cachedEp.iterator().next(), sendData, recvData, recvEagerData); + closeResources(); + cachedEp.clear(); + } } diff --git a/bindings/java/src/test/java/org/openucx/jucx/UcpListenerTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcpListenerTest.java index 658a6019700..54b4c8c9c0c 100644 --- a/bindings/java/src/test/java/org/openucx/jucx/UcpListenerTest.java +++ b/bindings/java/src/test/java/org/openucx/jucx/UcpListenerTest.java @@ -6,6 +6,7 @@ import org.junit.Test; import org.openucx.jucx.ucp.*; +import org.openucx.jucx.ucs.UcsConstants; import java.net.InetAddress; import java.net.InetSocketAddress; @@ -24,32 +25,6 @@ public class UcpListenerTest extends UcxTest { static final int port = Integer.parseInt( System.getenv().getOrDefault("JUCX_TEST_PORT", "55321")); - @Test - public void testCreateUcpListener() { - UcpContext context = new UcpContext(new UcpParams().requestStreamFeature()); - UcpWorker worker = context.newWorker(new UcpWorkerParams()); - InetSocketAddress ipv4 = new InetSocketAddress("0.0.0.0", port); - try { - UcpListener ipv4Listener = worker.newListener( - new UcpListenerParams().setSockAddr(ipv4)); - - assertNotNull(ipv4Listener); - ipv4Listener.close(); - } catch (UcxException ignored) { } - - try { - InetSocketAddress ipv6 = new InetSocketAddress("::", port); - UcpListener ipv6Listener = worker.newListener( - new UcpListenerParams().setSockAddr(ipv6)); - - assertNotNull(ipv6Listener); - ipv6Listener.close(); - } catch (UcxException ignored) { } - - worker.close(); - context.close(); - } - static Stream<NetworkInterface> getInterfaces() { try { return Collections.list(NetworkInterface.getNetworkInterfaces()).stream() @@ -74,19 +49,27 @@ static UcpListener tryBindListener(UcpWorker worker, UcpListenerParams params) { List<InetAddress> addresses = getInterfaces().flatMap(iface -> Collections.list(iface.getInetAddresses()).stream()) .collect(Collectors.toList()); + Collections.reverse(addresses); for (InetAddress address : addresses) { - try { - result = worker.newListener( - params.setSockAddr(new InetSocketAddress(address, port))); - break; - } catch (UcxException ignored) { } + for (int i = 0; i < 10; i++) { + try { + result = worker.newListener( + params.setSockAddr(new InetSocketAddress(address, port + i))); + break; + } catch (UcxException ex) { + if (ex.getStatus() != UcsConstants.STATUS.UCS_ERR_BUSY) { + break; + } + } + } } assertNotNull("Could not find socket address to start UcpListener", result); + System.out.println("Bound UcpListener on: " + result.getAddress()); return result; }
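tryBindListener above now scans the interface list and a ten-port window, retrying only while bind fails with UCS_ERR_BUSY. The same pattern applies to any sockaddr-based listener setup; a hedged sketch (worker, params, address, and basePort are assumed names, not part of this patch):

    UcpListener listener = null;
    for (int i = 0; i < 10 && listener == null; i++) {
        try {
            listener = worker.newListener(params.setSockAddr(
                    new InetSocketAddress(address, basePort + i)));
        } catch (UcxException ex) {
            // Only "port busy" is worth retrying on the next port.
            if (ex.getStatus() != UcsConstants.STATUS.UCS_ERR_BUSY) {
                throw ex;
            }
        }
    }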
@Test - public void testConnectionHandler() { + public void testConnectionHandler() throws Exception { UcpContext context1 = new UcpContext(new UcpParams().requestStreamFeature() .requestRmaFeature()); UcpContext context2 = new UcpContext(new UcpParams().requestStreamFeature() @@ -100,28 +83,58 @@ public void testConnectionHandler() { // Create listener and set connection handler UcpListenerParams listenerParams = new UcpListenerParams() .setConnectionHandler(conRequest::set); - UcpListener listener = tryBindListener(serverWorker1, listenerParams); + UcpListener serverListener = tryBindListener(serverWorker1, listenerParams); + UcpListener clientListener = tryBindListener(clientWorker, listenerParams); UcpEndpoint clientToServer = clientWorker.newEndpoint(new UcpEndpointParams() - .setSocketAddress(listener.getAddress())); + .setErrorHandler((ep, status, errorMsg) -> + System.err.println("clientToServer error: " + errorMsg)) + .setPeerErrorHandlingMode().setSocketAddress(serverListener.getAddress())); while (conRequest.get() == null) { serverWorker1.progress(); clientWorker.progress(); } + assertNotNull(conRequest.get().getClientAddress()); + UcpEndpoint serverToClientListener = serverWorker2.newEndpoint( + new UcpEndpointParams().setSocketAddress(conRequest.get().getClientAddress()) + .setPeerErrorHandlingMode() + .setErrorHandler((errEp, status, errorMsg) -> + System.err.println("serverToClientListener error: " + + errorMsg))); + serverWorker2.progressRequest(serverToClientListener.closeNonBlockingForce()); + // Create endpoint from another worker from pool. UcpEndpoint serverToClient = serverWorker2.newEndpoint( new UcpEndpointParams().setConnectionRequest(conRequest.get())); - - // Temporary workaround until new connection establishment protocol in UCX. + + // Test that the connection handler persists for (int i = 0; i < 10; i++) { - serverWorker1.progress(); - serverWorker2.progress(); - clientWorker.progress(); - try { - Thread.sleep(10); - } catch (Exception ignored) { } + conRequest.set(null); + UcpEndpoint tmpEp = clientWorker.newEndpoint(new UcpEndpointParams() + .setSocketAddress(serverListener.getAddress()).setPeerErrorHandlingMode() + .setErrorHandler((ep, status, errorMsg) -> + System.err.println("tmpEp error: " + errorMsg))); + + while (conRequest.get() == null) { + serverWorker1.progress(); + serverWorker2.progress(); + clientWorker.progress(); + } + + UcpEndpoint tmpEp2 = serverWorker2.newEndpoint( + new UcpEndpointParams().setPeerErrorHandlingMode() + .setConnectionRequest(conRequest.get())); + + UcpRequest close1 = tmpEp.closeNonBlockingFlush(); + UcpRequest close2 = tmpEp2.closeNonBlockingFlush(); + + while (!close1.isCompleted() || !close2.isCompleted()) { + serverWorker1.progress(); + serverWorker2.progress(); + clientWorker.progress(); + } } UcpRequest sent = serverToClient.sendStreamNonBlocking( @@ -142,13 +155,22 @@ public void testConnectionHandler() { while (!sent.isCompleted() || !recv.isCompleted()) { serverWorker1.progress(); + serverWorker2.progress(); clientWorker.progress(); } assertEquals(UcpMemoryTest.MEM_SIZE, recv.getRecvSize()); + UcpRequest serverClose = serverToClient.closeNonBlockingFlush(); + UcpRequest clientClose = clientToServer.closeNonBlockingFlush(); + + while (!serverClose.isCompleted() || !clientClose.isCompleted()) { + serverWorker2.progress(); + clientWorker.progress(); + } + Collections.addAll(resources, context2, context1, clientWorker, serverWorker1, - serverWorker2, listener, serverToClient, clientToServer); + serverWorker2, serverListener, clientListener); closeResources(); } } diff --git a/bindings/java/src/test/java/org/openucx/jucx/UcpMemoryTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcpMemoryTest.java index 01668d003ee..aca3c51166a 100644 --- a/bindings/java/src/test/java/org/openucx/jucx/UcpMemoryTest.java +++ b/bindings/java/src/test/java/org/openucx/jucx/UcpMemoryTest.java @@ -7,18 +7,21 @@ import org.junit.Test; import org.openucx.jucx.ucp.*; +import org.openucx.jucx.ucs.UcsConstants; import java.nio.ByteBuffer; import java.nio.MappedByteBuffer; import
java.nio.channels.FileChannel; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Collections; import java.util.UUID; import static java.nio.file.StandardOpenOption.*; import static org.junit.Assert.*; +import static org.junit.Assume.assumeTrue; -public class UcpMemoryTest { +public class UcpMemoryTest extends UcxTest { static int MEM_SIZE = 4096; static String RANDOM_TEXT = UUID.randomUUID().toString(); @@ -39,7 +42,8 @@ public void testMmapFile() throws Exception { // 3. Test allocation UcpMemory allocatedMemory = context.memoryMap(new UcpMemMapParams() - .allocate().setLength(MEM_SIZE).nonBlocking()); + .allocate().setProtection(UcpConstants.UCP_MEM_MAP_PROT_LOCAL_READ) + .setLength(MEM_SIZE).nonBlocking()); assertEquals(allocatedMemory.getLength(), MEM_SIZE); allocatedMemory.deregister(); @@ -71,11 +75,8 @@ public void testRemoteKeyUnpack() { UcpMemory mem = context.registerMemory(buf); UcpRemoteKey rkey = endpoint.unpackRemoteKey(mem.getRemoteKeyBuffer()); assertNotNull(rkey.getNativeId()); - rkey.close(); - mem.deregister(); - endpoint.close(); - worker1.close(); - worker2.close(); - context.close(); + + Collections.addAll(resources, context, worker1, worker2, endpoint, mem, rkey); + closeResources(); } } diff --git a/bindings/java/src/test/java/org/openucx/jucx/UcpRequestTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcpRequestTest.java index 0ac1fc6327c..bdca94fc67d 100644 --- a/bindings/java/src/test/java/org/openucx/jucx/UcpRequestTest.java +++ b/bindings/java/src/test/java/org/openucx/jucx/UcpRequestTest.java @@ -6,13 +6,14 @@ import org.junit.Test; import org.openucx.jucx.ucp.*; +import org.openucx.jucx.ucs.UcsConstants; import java.nio.ByteBuffer; import static org.junit.Assert.*; public class UcpRequestTest { @Test - public void testCancelRequest() { + public void testCancelRequest() throws Exception { UcpContext context = new UcpContext(new UcpParams().requestTagFeature()); UcpWorker worker = context.newWorker(new UcpWorkerParams()); UcpRequest recv = worker.recvTaggedNonBlocking(ByteBuffer.allocateDirect(100), null); @@ -22,6 +23,7 @@ public void testCancelRequest() { worker.progress(); } + assertEquals(UcsConstants.STATUS.UCS_ERR_CANCELED, recv.getStatus()); assertTrue(recv.isCompleted()); assertNull(recv.getNativeId()); diff --git a/bindings/java/src/test/java/org/openucx/jucx/UcpWorkerTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcpWorkerTest.java index d896898a038..da6596dd07f 100644 --- a/bindings/java/src/test/java/org/openucx/jucx/UcpWorkerTest.java +++ b/bindings/java/src/test/java/org/openucx/jucx/UcpWorkerTest.java @@ -19,7 +19,7 @@ public class UcpWorkerTest extends UcxTest { private static int numWorkers = Runtime.getRuntime().availableProcessors(); @Test - public void testSingleWorker() { + public void testSingleWorker() throws Exception { UcpContext context = new UcpContext(new UcpParams().requestTagFeature()); assertEquals(2, UcsConstants.ThreadMode.UCS_THREAD_MODE_MULTI); assertNotEquals(context.getNativeId(), null); @@ -99,8 +99,12 @@ public void testWorkerSleepWakeup() throws InterruptedException { @Override public void run() { while (!isInterrupted()) { - if (worker.progress() == 0) { - worker.waitForEvents(); + try { + if (worker.progress() == 0) { + worker.waitForEvents(); + } + } catch (Exception e) { + e.printStackTrace(); } } success.set(true); @@ -120,7 +124,7 @@ public void run() { } @Test - public void testFlushWorker() { + public void testFlushWorker() throws Exception { int numRequests = 10; // Crerate 
2 contexts + 2 workers UcpParams params = new UcpParams().requestRmaFeature(); @@ -166,7 +170,7 @@ public void onSuccess(UcpRequest request) { } @Test - public void testTagProbe() { + public void testTagProbe() throws Exception { UcpParams params = new UcpParams().requestTagFeature(); UcpContext context1 = new UcpContext(params); UcpContext context2 = new UcpContext(params); diff --git a/bindings/java/src/test/java/org/openucx/jucx/UcxTest.java b/bindings/java/src/test/java/org/openucx/jucx/UcxTest.java index 5d40f2da2bd..42de8ad8e2f 100644 --- a/bindings/java/src/test/java/org/openucx/jucx/UcxTest.java +++ b/bindings/java/src/test/java/org/openucx/jucx/UcxTest.java @@ -5,11 +5,72 @@ package org.openucx.jucx; +import org.openucx.jucx.ucp.*; +import org.openucx.jucx.ucs.UcsConstants; + import java.io.Closeable; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.Stack; abstract class UcxTest { + protected static class MemoryBlock implements Closeable { + private final UcpMemory memory; + private UcpEndpoint selfEp; + private ByteBuffer buffer; + private final UcpWorker worker; + private UcpRemoteKey rkey; + + protected MemoryBlock(UcpWorker worker, UcpMemory memory) { + this.memory = memory; + this.worker = worker; + if (memory.getMemType() == UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_CUDA) { + this.selfEp = worker.newEndpoint( + new UcpEndpointParams().setUcpAddress(worker.getAddress())); + rkey = selfEp.unpackRemoteKey(memory.getRemoteKeyBuffer()); + } else { + buffer = UcxUtils.getByteBufferView(memory.getAddress(), memory.getLength()); + } + } + + public UcpMemory getMemory() { + return memory; + } + + public void setData(String data) throws Exception { + if (memory.getMemType() == UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_CUDA) { + ByteBuffer srcBuffer = ByteBuffer.allocateDirect(data.length()); + srcBuffer.asCharBuffer().put(data); + worker.progressRequest(selfEp.putNonBlocking(srcBuffer, memory.getAddress(), rkey, + null)); + } else { + buffer.asCharBuffer().put(data); + } + } + + public ByteBuffer getData() throws Exception { + if (memory.getMemType() == UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_CUDA) { + ByteBuffer dstBuffer = ByteBuffer.allocateDirect((int)memory.getLength()); + worker.progressRequest(selfEp.getNonBlocking(memory.getAddress(), rkey, + dstBuffer, null)); + return dstBuffer; + } else { + return buffer; + } + } + + @Override + public void close() { + if (rkey != null) { + rkey.close(); + } + memory.close(); + if (selfEp != null) { + selfEp.close(); + } + } + } + // Stack of closable resources (context, worker, etc.) to be closed at the end. 
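The MemoryBlock helper above hides the difference between host memory, which is written through a direct ByteBuffer view, and CUDA memory, which is staged through a loopback endpoint with put/get. A brief hedged usage sketch (context and worker are assumed to exist; allocateMemory is the helper defined at the end of this class; setData/getData may throw, so the caller declares throws Exception):

    MemoryBlock block = allocateMemory(context, worker,
            UcsConstants.MEMORY_TYPE.UCS_MEMORY_TYPE_HOST, 4096);
    block.setData("hello");             // direct-buffer write for host memory
    ByteBuffer view = block.getData();  // same bytes back as a ByteBuffer
    block.close();                      // releases rkey, self-endpoint, memory

For CUDA allocations the same two calls transparently go through selfEp.putNonBlocking/getNonBlocking, which is why both are declared to throw.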
protected static Stack<Closeable> resources = new Stack<>(); @@ -22,4 +83,11 @@ protected void closeResources() { } } } + + protected static MemoryBlock allocateMemory(UcpContext context, UcpWorker worker, int memType, + long length) { + UcpMemMapParams memMapParams = new UcpMemMapParams().allocate().setLength(length) + .setMemoryType(memType); + return new MemoryBlock(worker, context.memoryMap(memMapParams)); + } } diff --git a/buildlib/az-distro-release.yml b/buildlib/az-distro-release.yml index 720ff6ad6de..8e35231ee2b 100644 --- a/buildlib/az-distro-release.yml +++ b/buildlib/az-distro-release.yml @@ -1,6 +1,12 @@ jobs: - job: distro_release + condition: eq(stageDependencies.Check_Commit.Check.outputs['Commit.Launch'], 'Yes') displayName: distro + variables: + ${{ if eq(variables['Build.Reason'], 'ResourceTrigger') }}: + POSTFIX: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }} + ${{ if eq(variables['Build.Reason'], 'PullRequest') }}: + POSTFIX: ucx-test pool: name: MLNX @@ -11,34 +17,53 @@ jobs: matrix: centos7_cuda10_1: build_container: centos7_cuda10_1 - artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-centos7-mofed5.0-cuda10.1.tar.bz2 + artifact_name: $(POSTFIX)-centos7-mofed5.x-cuda10.1.tar.bz2 centos7_cuda10_2: build_container: centos7_cuda10_2 - artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-centos7-mofed5.0-cuda10.2.tar.bz2 + artifact_name: $(POSTFIX)-centos7-mofed5.x-cuda10.2.tar.bz2 centos7_cuda11_0: build_container: centos7_cuda11_0 - artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-centos7-mofed5.0-cuda11.0.tar.bz2 + artifact_name: $(POSTFIX)-centos7-mofed5.x-cuda11.0.tar.bz2 + centos7_cuda11_2: + build_container: centos7_cuda11_2 + artifact_name: $(POSTFIX)-centos7-mofed5.x-cuda11.2.tar.bz2 + centos8_cuda11_0: + build_container: centos8_cuda11_0 + artifact_name: $(POSTFIX)-centos8-mofed5.x-cuda11.0.tar.bz2 + centos8_cuda11_2: + build_container: centos8_cuda11_2 + artifact_name: $(POSTFIX)-centos8-mofed5.x-cuda11.2.tar.bz2 ubuntu16_cuda10_1: build_container: ubuntu16_cuda10_1 - artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-ubuntu16.04-mofed5.0-cuda10.1.deb + artifact_name: $(POSTFIX)-ubuntu16.04-mofed5.x-cuda10.1.deb ubuntu16_cuda10_2: build_container: ubuntu16_cuda10_2 - artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-ubuntu16.04-mofed5.0-cuda10.2.deb + artifact_name: $(POSTFIX)-ubuntu16.04-mofed5.x-cuda10.2.deb ubuntu18_cuda10_1: build_container: ubuntu18_cuda10_1 - artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-ubuntu18.04-mofed5.0-cuda10.1.deb + artifact_name: $(POSTFIX)-ubuntu18.04-mofed5.x-cuda10.1.deb ubuntu18_cuda10_2: build_container: ubuntu18_cuda10_2 - artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-ubuntu18.04-mofed5.0-cuda10.2.deb - ubuntu18_cuda11.0: - build_container: ubuntu18_cuda11.0 - artifact_name: ucx-${{ replace(variables['Build.SourceBranch'], 'refs/tags/', '') }}-ubuntu18.04-mofed5.0-cuda11.0.deb + artifact_name: $(POSTFIX)-ubuntu18.04-mofed5.x-cuda10.2.deb + ubuntu18_cuda11_0: + build_container: ubuntu18_cuda11_0 + artifact_name: $(POSTFIX)-ubuntu18.04-mofed5.x-cuda11.0.deb + ubuntu18_cuda11_2: + build_container: ubuntu18_cuda11_2 + artifact_name: $(POSTFIX)-ubuntu18.04-mofed5.x-cuda11.2.deb + ubuntu20_cuda11_0: + build_container: ubuntu20_cuda11_0 + artifact_name:
$(POSTFIX)-ubuntu20.04-mofed5.x-cuda11.0.deb + ubuntu20_cuda11_2: + build_container: ubuntu20_cuda11_2 + artifact_name: $(POSTFIX)-ubuntu20.04-mofed5.x-cuda11.2.deb container: $[ variables['build_container'] ] steps: - checkout: self clean: true + fetchDepth: 100 path: "we/need/to/go/deeper" # ^ Avoid rpmbuild error: Dest dir longer than base dir is not supported @@ -47,13 +72,13 @@ jobs: ./autogen.sh mkdir pkg-build cd pkg-build - ../contrib/configure-release --with-cuda + ../contrib/configure-release --with-cuda --with-java=no displayName: Configure - bash: | set -eE cd pkg-build - ../contrib/buildrpm.sh -s -t -b --strict-ibverbs-dep + ../contrib/buildrpm.sh -s -t -b cd rpm-dist/`uname -m` tar -cjf "../../../${AZ_ARTIFACT_NAME}" *.rpm cd ../../.. @@ -76,6 +101,7 @@ jobs: AZ_ARTIFACT_NAME: $(artifact_name) - task: GithubRelease@0 + condition: eq(variables['Build.Reason'], 'ResourceTrigger') displayName: Upload artifacts to draft release inputs: githubConnection: release diff --git a/buildlib/az-helpers.sh b/buildlib/az-helpers.sh index 286881ef591..77e504e33a0 100644 --- a/buildlib/az-helpers.sh +++ b/buildlib/az-helpers.sh @@ -38,6 +38,30 @@ function azure_log_issue() { echo "##vso[task.complete result=Failed;]" } +# Report an error message to Azure pipeline +function azure_log_error() { + test "x$RUNNING_IN_AZURE" = "xno" && return + msg=$1 + set +x + echo "##vso[task.logissue type=error]${msg}" +} + +# Report a warning message to Azure pipeline +function azure_log_warning() { + test "x$RUNNING_IN_AZURE" = "xno" && return + msg=$1 + set +x + echo "##vso[task.logissue type=warning]${msg}" +} + +# Complete the task as "succeeded with issues" +function azure_complete_with_issues() { + test "x$RUNNING_IN_AZURE" = "xno" && return + msg=$1 + set +x + echo "##vso[task.complete result=SucceededWithIssues;]DONE${msg}" +} + # Get IPv4 address of an interface function get_ip() { iface=$1 @@ -45,6 +69,11 @@ function get_ip() { echo "$ip" } +# Get active RDMA interfaces +function get_rdma_interfaces() { + echo `ibdev2netdev | grep Up | awk '{print $5}'` +} + # Prepend each line with a timestamp function add_timestamp() { set +x @@ -52,3 +81,68 @@ function add_timestamp() { echo "$(date -u +"%Y-%m-%dT%T.%NZ") $line" done } + +function az_init_modules() { + . /etc/profile.d/modules.sh + export MODULEPATH="/hpc/local/etc/modulefiles:$MODULEPATH" +} + +# +# Test if an environment module exists and load it if so. +# Otherwise, return an error code.
+# +function az_module_load() { + module=$1 + + if module avail -t 2>&1 | grep -q "^$module\$" + then + module load $module + return 0 + else + echo "MODULEPATH='${MODULEPATH}'" + module avail || true + azure_log_warning "Module $module cannot be loaded" + return 1 + fi +} + +# +# Safe unload for env modules (even if it doesn't exist) +# +function az_module_unload() { + module=$1 + module unload "${module}" || true +} + + +# +# try load cuda modules if nvidia driver is installed +# +try_load_cuda_env() { + num_gpus=0 + have_cuda=no + have_gdrcopy=no + if [ -f "/proc/driver/nvidia/version" ]; then + have_cuda=yes + have_gdrcopy=yes + az_module_load dev/cuda11.1.1 || have_cuda=no + az_module_load dev/gdrcopy2.1_cuda11.1.1 || have_gdrcopy=no + num_gpus=$(nvidia-smi -L | wc -l) + fi +} + + +check_commit_message() { + git_id=$1 + title_mask=$2 + build_reason=$3 + echo "Get commit message target $git_id" + title=`git log -1 --format="%s" $git_id` + + if [[ ( "$build_reason" == "IndividualCI" ) || ( "$title" == "$title_mask"* && "$build_reason" == "PullRequest" ) ]] + then + echo "##vso[task.setvariable variable=Launch;isOutput=true]Yes" + else + echo "##vso[task.setvariable variable=Launch;isOutput=true]No" + fi +} diff --git a/buildlib/az-io_demo.sh b/buildlib/az-io_demo.sh deleted file mode 100755 index 32e67fa3cd5..00000000000 --- a/buildlib/az-io_demo.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash -leE - -# avoid Azure error: TERM environment variable not set -export TERM=xterm - -basedir=$(cd $(dirname $0) && pwd) -workspace=${WORKSPACE:="$basedir"} -cd "$workspace" - -echo "Running $0 $*..." -eval "$*" -source "${workspace}/az-helpers.sh" - -server_ip=${server_ip:=""} -duration=${duration:=2} -iface=${iface:="bond0"} - -export UCX_MAX_EAGER_LANES=2 -export UCX_TLS=rc -export UCX_IB_SEG_SIZE=2k -export UCX_IB_RX_QUEUE_LEN=1024 -export UCX_RC_MAX_RD_ATOMIC=16 -export UCX_RC_ROCE_PATH_FACTOR=2 -export UCX_SOCKADDR_CM_ENABLE=y -export UCX_RC_MAX_GET_ZCOPY=32k -export UCX_RC_TX_NUM_GET_BYTES=256K - -## run server -if [ "x$server_ip" = "x" ]; then - ip addr show ${iface} - server_ip=$(get_ip ${iface}) - azure_set_variable "server_ip" "$server_ip" - echo "Starting server on IP ${server_ip}" - - server_cmd="${workspace}/../test/apps/iodemo/io_demo" - if ! "${server_cmd}" |& add_timestamp &>server.log & then - cat server.log - error "Failed to run server command ${server_cmd}" - fi - - # wait for io_demo to start - echo "Waiting for server to start.." - sleep 10 - - server_pid=$(pgrep -u "$USER" -f 'apps/iodemo') - echo "Server pid is '${server_pid}'" - - num_pids=$(echo "${server_pid}" | wc -w) - if [ ${num_pids} -ne 1 ]; then - cat server.log - ps -f -U "$USER" # show all runing processes - error "Expected 1 running server, found ${num_pids}" - fi - - echo "Server is running, PID='$server_pid'" - azure_set_variable "server_pid" "$server_pid" - - # double check the process is running - sleep 5 - if ! kill -0 "$server_pid"; then - cat server.log - error "Failed to start server" - fi - - exit 0 -fi - -## run client - -timeout="$(( duration - 1 ))m" - -echo "Client connecting to server at IP $server_ip" -echo "Timeout is $timeout" - -if ! 
"${workspace}/../test/apps/iodemo/io_demo" -l $timeout -i 10000000 "$server_ip"; then - error "Failed to start client" -fi diff --git a/buildlib/az-network-corrupter.sh b/buildlib/az-network-corrupter.sh index 16ebeef8b72..eb4ffc7054b 100755 --- a/buildlib/az-network-corrupter.sh +++ b/buildlib/az-network-corrupter.sh @@ -14,6 +14,7 @@ manager_script=/hpc/noarch/git_projects/swx_infrastructure/clusters/bin/manage_h if [ "x$reset" = "xyes" ]; then echo "Resetting interface on $(hostname)..." ${manager_script} "$(hostname)" "bond-up" + sleep "$uptime" exit $? fi diff --git a/buildlib/azure-pipelines-int4.yml b/buildlib/azure-pipelines-int4.yml new file mode 100644 index 00000000000..9cff3be2173 --- /dev/null +++ b/buildlib/azure-pipelines-int4.yml @@ -0,0 +1,22 @@ +# See https://aka.ms/yaml +# This pipeline to be run on PRs + +trigger: none + +resources: + pipelines: + - pipeline: rebaseMainUcx + source: UCX snapshot + + containers: + - container: centos7 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos7:2 + options: -v /hpc/local:/hpc/local -v /auto/sw_tools:/auto/sw_tools + +stages: + - stage: Rebase + jobs: + - job: rebase_master + steps: + - bash: | + echo "Hello world" \ No newline at end of file diff --git a/buildlib/azure-pipelines-pr.yml b/buildlib/azure-pipelines-pr.yml index 1a3c9121c7e..6c2ea30ef7f 100644 --- a/buildlib/azure-pipelines-pr.yml +++ b/buildlib/azure-pipelines-pr.yml @@ -3,17 +3,61 @@ trigger: none pr: - - master - - v*.*.x + branches: + include: + - master + - v*.*.x + paths: + exclude: + - .gitignore + - docs/source + - docs/CodeStyle.md + - docs/LoggingStyle.md + - docs/OptimizationStyle.md + - README.md + - NEWS resources: containers: - container: centos7 - image: ucfconsort.azurecr.io/ucx/centos7:1 - endpoint: ucfconsort_registry + image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos7:2 + options: -v /hpc/local:/hpc/local -v /auto/sw_tools:/auto/sw_tools - container: fedora - image: ucfconsort.azurecr.io/ucx/fedora:3 - endpoint: ucfconsort_registry + image: rdmz-harbor.rdmz.labs.mlnx/ucx/fedora33:1 + options: --privileged + - container: fedora34 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/fedora34:2 + options: --privileged -v /hpc/local:/hpc/local -v /auto/sw_tools:/auto/sw_tools + - container: coverity_rh7 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/coverity:mofed-5.1-2.3.8.0 + options: -v /hpc/local:/hpc/local -v /auto/sw_tools:/auto/sw_tools + - container: rhel76 + image: rdmz-harbor.rdmz.labs.mlnx/swx-infra/x86_64/rhel7.6/builder:mofed-5.0-1.0.0.0 + options: -v /hpc/local:/hpc/local -v /auto/sw_tools:/auto/sw_tools + - container: rhel76_mofed47 + image: rdmz-harbor.rdmz.labs.mlnx/swx-infra/x86_64/rhel7.6/builder:mofed-4.7-1.0.0.1 + options: -v /hpc/local:/hpc/local -v /auto/sw_tools:/auto/sw_tools + - container: rhel74 + image: rdmz-harbor.rdmz.labs.mlnx/swx-infra/x86_64/rhel7.4/builder:mofed-5.0-1.0.0.0 + options: -v /hpc/local:/hpc/local -v /auto/sw_tools:/auto/sw_tools + - container: rhel72 + image: rdmz-harbor.rdmz.labs.mlnx/swx-infra/x86_64/rhel7.2/builder:mofed-5.0-1.0.0.0 + options: -v /hpc/local:/hpc/local -v /auto/sw_tools:/auto/sw_tools + - container: rhel82 + image: rdmz-harbor.rdmz.labs.mlnx/swx-infra/x86_64/rhel8.2/builder:mofed-5.0-1.0.0.0 + options: -v /hpc/local:/hpc/local -v /auto/sw_tools:/auto/sw_tools + - container: ubuntu2004 + image: rdmz-harbor.rdmz.labs.mlnx/swx-infra/x86_64/ubuntu20.04/builder:mofed-5.0-1.0.0.0 + options: -v /hpc/local:/hpc/local -v /auto/sw_tools:/auto/sw_tools + - container: ubuntu1804 + image: 
rdmz-harbor.rdmz.labs.mlnx/swx-infra/x86_64/ubuntu18.04/builder:mofed-5.0-1.0.0.0 + options: -v /hpc/local:/hpc/local -v /auto/sw_tools:/auto/sw_tools + - container: sles15sp2 + image: rdmz-harbor.rdmz.labs.mlnx/swx-infra/x86_64/sles15sp2/builder:mofed-5.0-1.0.0.0 + options: -v /hpc/local:/hpc/local -v /auto/sw_tools:/auto/sw_tools + - container: sles12sp5 + image: rdmz-harbor.rdmz.labs.mlnx/swx-infra/x86_64/sles12sp5/builder:mofed-5.0-1.0.0.0 + options: -v /hpc/local:/hpc/local -v /auto/sw_tools:/auto/sw_tools stages: - stage: Codestyle @@ -24,10 +68,12 @@ stages: steps: - checkout: self clean: true + fetchDepth: 100 - bash: | set -eE - range="remotes/origin/$(System.PullRequest.TargetBranch)..$(Build.SourceVersion)" + BASE_SOURCEVERSION=$(git rev-parse HEAD^) + range="$BASE_SOURCEVERSION..$(Build.SourceVersion)" ok=1 for sha1 in `git log $range --format="%h"` do @@ -48,40 +94,71 @@ stages: fi condition: eq(variables['Build.Reason'], 'PullRequest') - - stage: Build + # Check that the code is formatted according to the code style guidelines + - job: format + displayName: format code + pool: + name: MLNX + demands: + - ucx_docker -equals yes + container: fedora + steps: + - checkout: self + clean: true + fetchDepth: 100 + + - bash: | + source ./buildlib/az-helpers.sh + set -x + git log -1 HEAD + git log -1 HEAD^ + BASE_SOURCEVERSION=$(git rev-parse HEAD^) + echo "Checking code format on diff ${BASE_SOURCEVERSION}..${BUILD_SOURCEVERSION}" + git-clang-format --diff ${BASE_SOURCEVERSION} ${BUILD_SOURCEVERSION} > format.patch + echo "Generated patch file:" + cat format.patch + if [ "`cat format.patch`" = "no modified files to format" ]; then + exit + fi + git apply format.patch + if ! git diff --quiet --exit-code + then + url="https://github.com/openucx/ucx/wiki/Code-style-checking" + azure_complete_with_issues "Code is not formatted according to the code style, see $url for more info." + fi + condition: eq(variables['Build.Reason'], 'PullRequest') + + - stage: Static_check + dependsOn: [Codestyle] jobs: - job: static_checks displayName: Static checks + pool: + name: MLNX + demands: + - ucx_docker_fast -equals yes container: fedora steps: - checkout: self clean: true + fetchDepth: 100 - bash: ./autogen.sh displayName: Setup autotools - bash: | set -eE - mkdir build && cd build + . 
buildlib/tools/common.sh + prepare_build clang --version gcc --version cppcheck --version - ../contrib/configure-release - displayName: Configure - - - bash: | - set -eE - - cd build + ${WORKSPACE}/contrib/configure-release export PATH="`csclng --print-path-to-wrap`:`cscppc --print-path-to-wrap`:`cswrap --print-path-to-wrap`:$PATH" + set -o pipefail make -j`nproc` 2>&1 | tee compile.log - displayName: Build - - - bash: | - set -eE - - cd build + set +o pipefail cs_errors="cs.err" cslinker --quiet compile.log \ @@ -99,10 +176,66 @@ stages: echo "No errors reported by static checkers" fi displayName: cstools reports + env: + BUILD_ID: "$(Build.BuildId)-$(Build.BuildNumber)" + + - stage: Build + dependsOn: [Static_check] + jobs: + - job: build_source + pool: + name: MLNX + demands: + - ucx_docker -equals yes + strategy: + matrix: + rhel72: + CONTAINER: rhel72 + rhel74: + CONTAINER: rhel74 + rhel76: + CONTAINER: rhel76 + long_test: yes + rhel76_mofed47: + CONTAINER: rhel76_mofed47 + long_test: yes + ubuntu2004: + CONTAINER: ubuntu2004 + long_test: yes + ubuntu1804: + CONTAINER: ubuntu1804 + sles15sp2: + CONTAINER: sles15sp2 + rhel82: + CONTAINER: rhel82 + fedora34: + CONTAINER: fedora34 + long_test: yes + container: $[ variables['CONTAINER'] ] + timeoutInMinutes: 240 + + steps: + - checkout: self + clean: true + fetchDepth: 100 - # Perform test builds on relevant distributions + - bash: | + ./buildlib/tools/builds.sh + displayName: Build + env: + BUILD_ID: "$(Build.BuildId)-$(Build.BuildNumber)" + long_test: $(long_test) + + - stage: Distro + dependsOn: [Static_check] + jobs: + # Perform test builds on relevant distributions. - job: Distros displayName: Build for + pool: + name: MLNX + demands: + - ucx_docker -equals yes strategy: matrix: centos7: @@ -112,50 +245,36 @@ stages: steps: - checkout: self clean: true - - - bash: ./autogen.sh - displayName: Setup autotools - - - bash: | - set -eE - mkdir build && cd build - ../configure $(CONFIGURE_OPTS) - displayName: Configure + fetchDepth: 100 - bash: | set -eE - cd build + . buildlib/tools/common.sh + prepare_build + ${WORKSPACE}/configure $(CONFIGURE_OPTS) gcc -v make -s -j `nproc` - displayName: Build for $(CONTAINER) - - # Test RPM build - - job: build_rpm - displayName: Build tarball and source rpm - container: fedora - steps: - - checkout: self - clean: true - - - bash: ./autogen.sh - displayName: Setup autotools - - - bash: | - set -eE - gcc --version - ./contrib/configure-release - stdbuf -e0 -o0 ./contrib/buildrpm.sh -s -t -b |& tee rpmbuild.log - pattern='^warning: ' - if grep -q "$pattern" rpmbuild.log; then - echo "rpm build generated warnings:" - grep "$pattern" rpmbuild.log - echo "##vso[task.logissue type=error]rpm build generated warnings" - echo "##vso[task.complete result=Failed;]" + set +eE + set -x + ./src/tools/info/ucx_info -e -u t 2>&1 | tee info.txt + grep -i error info.txt + retVal=$? 
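+        # Note: grep exits with 0 when it finds a match, so a retVal of 0
+        # means ucx_info printed an error line, and the step must fail.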
+ if [ $retVal -eq 0 ]; then + exit 1; fi - displayName: Configure source and build RPM + exit 0; + displayName: Test ucx_info + + - stage: Coverity + dependsOn: [Static_check] + jobs: + - template: coverity.yml + parameters: + demands: ucx_docker -equals yes + container: coverity_rh7 - stage: Tests - dependsOn: [Codestyle] + dependsOn: [Static_check] jobs: - template: tests.yml parameters: @@ -177,4 +296,25 @@ stages: name: hwi demands: ucx_hwi -equals yes test_perf: 0 - - template: io-demo.yml + - template: tests.yml + parameters: + name: sputnik + demands: ucx_sputnik -equals yes + test_perf: 0 + + - stage: io_demo + dependsOn: [Static_check] + jobs: + - template: io_demo/io-demo.yml + + - stage: jucx + dependsOn: [Static_check] + jobs: + - template: jucx/jucx-test.yml + parameters: + name: new + demands: ucx_new -equals yes + - template: jucx/jucx-test.yml + parameters: + name: gpu + demands: ucx_gpu -equals yes diff --git a/buildlib/azure-pipelines-release.yml b/buildlib/azure-pipelines-release.yml index c83eefcaacb..25083f43f18 100644 --- a/buildlib/azure-pipelines-release.yml +++ b/buildlib/azure-pipelines-release.yml @@ -1,23 +1,28 @@ # See https://aka.ms/yaml # This pipeline to be run on tags creation -pr: none trigger: tags: include: - v* +pr: + - master + - v*.*.x resources: containers: - - container: centos7 - image: ucfconsort.azurecr.io/ucx/centos7:2 - endpoint: ucfconsort_registry - container: centos7_cuda10_1 image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos7-mofed5.0-cuda10.1:1 - container: centos7_cuda10_2 image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos7-mofed5.0-cuda10.2:1 - container: centos7_cuda11_0 - image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos7-mofed5.0-cuda11.0:1 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos7-mofed5.0-cuda11.0:2 + - container: centos7_cuda11_2 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos7-mofed5.0-cuda11.2:2 + - container: centos8_cuda11_0 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos8-mofed5.0-cuda11.0:2 + - container: centos8_cuda11_2 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos8-mofed5.1-cuda11.2:2 - container: ubuntu16_cuda10_1 image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu16.04-mofed5.0-cuda10.1:1 - container: ubuntu16_cuda10_2 @@ -26,18 +31,44 @@ resources: image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu18.04-mofed5.0-cuda10.1:1 - container: ubuntu18_cuda10_2 image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu18.04-mofed5.0-cuda10.2:1 - - container: ubuntu18_cuda11 - image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu18.04-mofed5.0-cuda11.0:1 + - container: ubuntu18_cuda11_0 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu18.04-mofed5.0-cuda11.0:2 + - container: ubuntu18_cuda11_2 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu18.04-mofed5.0-cuda11.2:2 + - container: ubuntu20_cuda11_0 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu20.04-mofed5.0-cuda11.0:2 + - container: ubuntu20_cuda11_2 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/ubuntu20.04-mofed5.0-cuda11.2:2 stages: + - stage: Check_Commit + jobs: + - job: Check + steps: + - checkout: self + clean: true + + - bash: | + set -eE + source ./buildlib/az-helpers.sh + set -x + check_commit_message $(system.pullRequest.sourceCommitId) "AZP/RELEASE: " $(Build.Reason) + name: Commit # Create an empty draft to avoid race condition in distro releases - stage: GitHubDraft + dependsOn: Check_Commit jobs: - job: DraftRelease - container: centos7 + condition: eq(stageDependencies.Check_Commit.Check.outputs['Commit.Launch'], 'Yes') + container: centos7_cuda11_2 + pool: + name: MLNX + demands: + - ucx_docker -equals yes steps: - 
checkout: self clean: true + fetchDepth: 100 path: "we/need/to/go/deeper" - bash: ./autogen.sh @@ -46,11 +77,12 @@ stages: - bash: | set -eE gcc --version - ./contrib/configure-release + ./contrib/configure-release --with-java=no ./contrib/buildrpm.sh -s -t -b displayName: Build tarball - task: GithubRelease@0 + condition: eq(variables['Build.Reason'], 'IndividualCI') displayName: Create/edit GitHub Draft Release inputs: githubConnection: release @@ -67,8 +99,14 @@ stages: ./rpm-dist/ucx-*.src.rpm - stage: Release + dependsOn: Check_Commit + variables: + ${{ if eq(variables['Build.Reason'], 'IndividualCI') }}: + TARGET: publish-release + ${{ if eq(variables['Build.Reason'], 'PullRequest') }}: + TARGET: package jobs: - template: az-distro-release.yml - - template: jucx-publish.yml + - template: jucx/jucx-publish.yml parameters: - target: publish-release + target: $(TARGET) diff --git a/buildlib/azure-pipelines.yml b/buildlib/azure-pipelines.yml index e0d833f24a3..280e1657581 100644 --- a/buildlib/azure-pipelines.yml +++ b/buildlib/azure-pipelines.yml @@ -8,13 +8,23 @@ trigger: resources: containers: - - container: centos7 - image: ucfconsort.azurecr.io/ucx/centos7:1 - endpoint: ucfconsort_registry + - container: centos7_cuda10_1 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/centos7-mofed5.0-cuda10.1:1 stages: + - stage: Check_Commit + jobs: + - job: Check + steps: + - checkout: self + clean: true + - bash: | + echo "##vso[task.setvariable variable=Launch;isOutput=true]Yes" + name: Commit - stage: Build + dependsOn: Check_Commit jobs: - - template: jucx-publish.yml + - template: jucx/jucx-publish.yml parameters: target: publish-snapshot + diff --git a/buildlib/coverity.yml b/buildlib/coverity.yml new file mode 100644 index 00000000000..86a93236132 --- /dev/null +++ b/buildlib/coverity.yml @@ -0,0 +1,43 @@ +parameters: + demands: [] + container: rhel76 + modes: ["release", "devel"] + +jobs: + - ${{each mode in parameters.modes }}: + - job: coverity_${{ mode }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ parameters.demands }} + displayName: coverity ${{ mode }} on ${{ parameters.container }} + container: ${{ parameters.container }} + timeoutInMinutes: 30 + steps: + - checkout: self + clean: true + fetchDepth: 100 + - bash: | + ./buildlib/tools/coverity.sh ${{ mode }} + res=$? + reportExists=False + set -x + cov_error_folder=$(System.DefaultWorkingDirectory)/cov_build_${{ mode }}/output/errors + echo "##vso[task.setvariable variable=cov_error_folder]$cov_error_folder" + ls -la $cov_error_folder + test -f $cov_error_folder/index.html && reportExists=True + echo "##vso[task.setvariable variable=reportExists]$reportExists" + if [[ $res -eq 0 ]] ; then + echo "##vso[task.complete result=Succeeded;]Done" + else + echo "##vso[task.complete result=Failed;]Coverity has errors" + fi + displayName: ${{ mode }} + env: + BUILD_ID: "$(Build.BuildId)-$(Build.BuildNumber)" + - task: PublishPipelineArtifact@1 + inputs: + targetPath: $(cov_error_folder) + artifactName: coverity_${{ mode }} + condition: eq(variables['reportExists'], 'True') diff --git a/buildlib/docker-compose.yml b/buildlib/docker-compose.yml deleted file mode 100644 index 312d5698781..00000000000 --- a/buildlib/docker-compose.yml +++ /dev/null @@ -1,84 +0,0 @@ -version: "3" - -services: - centos7-mofed5.0-cuda10.1: - image: centos7-mofed5.0-cuda10.1 - build: - context: .
- network: host - dockerfile: centos7-release.Dockerfile - args: - MOFED_VERSION: 5.0-1.0.0.0 - MOFED_OS: rhel7.6 - CUDA_VERSION: 10.1 - centos7-mofed5.0-cuda10.2: - image: centos7-mofed5.0-cuda10.2 - build: - context: . - network: host - dockerfile: centos7-release.Dockerfile - args: - MOFED_VERSION: 5.0-1.0.0.0 - MOFED_OS: rhel7.6 - CUDA_VERSION: 10.2 - centos7-mofed5.0-cuda11.0: - image: centos7-mofed5.0-cuda11.0 - build: - context: . - network: host - dockerfile: centos7-release.Dockerfile - args: - MOFED_VERSION: 5.0-1.0.0.0 - MOFED_OS: rhel7.6 - CUDA_VERSION: 11.0 - ubuntu16.04-mofed5.0-cuda10.1: - image: ubuntu16.04-mofed5.0-cuda10.1 - build: - context: . - network: host - dockerfile: ubuntu-release.Dockerfile - args: - MOFED_VERSION: 5.0-1.0.0.0 - UBUNTU_VERSION: 16.04 - CUDA_VERSION: 10.1 - ubuntu16.04-mofed5.0-cuda10.2: - image: ubuntu16.04-mofed5.0-cuda10.2 - build: - context: . - network: host - dockerfile: ubuntu-release.Dockerfile - args: - MOFED_VERSION: 5.0-1.0.0.0 - UBUNTU_VERSION: 16.04 - CUDA_VERSION: 10.2 - ubuntu18.04-mofed5.0-cuda10.1: - image: ubuntu18.04-mofed5.0-cuda10.1 - build: - context: . - network: host - dockerfile: ubuntu-release.Dockerfile - args: - MOFED_VERSION: 5.0-1.0.0.0 - UBUNTU_VERSION: 18.04 - CUDA_VERSION: 10.1 - ubuntu18.04-mofed5.0-cuda10.2: - image: ubuntu18.04-mofed5.0-cuda10.2 - build: - context: . - network: host - dockerfile: ubuntu-release.Dockerfile - args: - MOFED_VERSION: 5.0-1.0.0.0 - UBUNTU_VERSION: 18.04 - CUDA_VERSION: 10.2 - ubuntu18.04-mofed5.0-cuda11.0: - image: ubuntu18.04-mofed5.0-cuda11.0 - build: - context: . - network: host - dockerfile: ubuntu-release.Dockerfile - args: - MOFED_VERSION: 5.0-1.0.0.0 - UBUNTU_VERSION: 18.04 - CUDA_VERSION: 11.0 - diff --git a/buildlib/dockers/centos-release.Dockerfile b/buildlib/dockers/centos-release.Dockerfile new file mode 100644 index 00000000000..ba653433e47 --- /dev/null +++ b/buildlib/dockers/centos-release.Dockerfile @@ -0,0 +1,49 @@ +ARG CUDA_VERSION +ARG OS_VERSION +FROM nvidia/cuda:${CUDA_VERSION}-devel-centos${OS_VERSION} + +RUN yum install -y \ + autoconf \ + automake \ + doxygen \ + file \ + gcc-c++ \ + git \ + glibc-devel \ + libtool \ + make \ + maven \ + numactl-devel \ + rdma-core-devel \ + rpm-build \ + tcl \ + tcsh \ + tk \ + wget \ + libusbx \ + fuse-libs \ + && yum clean all + +# MOFED +ARG MOFED_VERSION +ARG MOFED_OS +ENV MOFED_DIR MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 +ENV MOFED_SITE_PLACE MLNX_OFED-${MOFED_VERSION} +ENV MOFED_IMAGE ${MOFED_DIR}.tgz +RUN wget --no-verbose http://content.mellanox.com/ofed/${MOFED_SITE_PLACE}/${MOFED_IMAGE} && \ + tar -xzf ${MOFED_IMAGE} && \ + ${MOFED_DIR}/mlnxofedinstall --all -q \ + --user-space-only \ + --without-fw-update \ + --skip-distro-check \ + --without-ucx \ + --without-hcoll \ + --without-openmpi \ + --without-sharp \ + && rm -rf ${MOFED_DIR} && rm -rf *.tgz + +ENV CPATH /usr/local/cuda/include:${CPATH} +ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH} +ENV LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH} +ENV PATH /usr/local/cuda/compat:${PATH} + diff --git a/buildlib/centos7-release.Dockerfile b/buildlib/dockers/centos7-release.Dockerfile similarity index 92% rename from buildlib/centos7-release.Dockerfile rename to buildlib/dockers/centos7-release.Dockerfile index 65ad6cfbac2..5e2609684cb 100644 --- a/buildlib/centos7-release.Dockerfile +++ b/buildlib/dockers/centos7-release.Dockerfile @@ -1,4 +1,4 @@ -ARG CUDA_VERSION=10.1 +ARG CUDA_VERSION FROM 
nvidia/cuda:${CUDA_VERSION}-devel-centos7 RUN yum install -y \ @@ -10,6 +10,7 @@ RUN yum install -y \ git \ glibc-devel \ libtool \ + librdmacm \ make \ maven \ numactl-devel \ @@ -19,11 +20,13 @@ RUN yum install -y \ tcsh \ tk \ wget \ + libusbx \ + fuse-libs \ && yum clean all # MOFED -ARG MOFED_VERSION=5.0-1.0.0.0 -ARG MOFED_OS=rhel7.6 +ARG MOFED_VERSION +ARG MOFED_OS ENV MOFED_DIR MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 ENV MOFED_SITE_PLACE MLNX_OFED-${MOFED_VERSION} ENV MOFED_IMAGE ${MOFED_DIR}.tgz @@ -43,4 +46,3 @@ ENV CPATH /usr/local/cuda/include:${CPATH} ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH} ENV LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH} ENV PATH /usr/local/cuda/compat:${PATH} - diff --git a/buildlib/centos7.Dockerfile b/buildlib/dockers/centos7.Dockerfile similarity index 95% rename from buildlib/centos7.Dockerfile rename to buildlib/dockers/centos7.Dockerfile index 8f997b702ae..0d484ec4a61 100644 --- a/buildlib/centos7.Dockerfile +++ b/buildlib/dockers/centos7.Dockerfile @@ -10,6 +10,7 @@ RUN yum install -y \ git \ glibc-devel \ libtool \ + librdmacm \ make \ maven \ numactl-devel \ diff --git a/buildlib/dockers/centos8-release.Dockerfile b/buildlib/dockers/centos8-release.Dockerfile new file mode 100644 index 00000000000..cf7d7b6f8c3 --- /dev/null +++ b/buildlib/dockers/centos8-release.Dockerfile @@ -0,0 +1,49 @@ +ARG CUDA_VERSION +ARG OS_VERSION +FROM nvidia/cuda:${CUDA_VERSION}-devel-centos${OS_VERSION} + +RUN yum install -y \ + autoconf \ + automake \ + file \ + gcc-c++ \ + git \ + glibc-devel \ + libtool \ + make \ + maven \ + numactl-devel \ + rdma-core-devel \ + rpm-build \ + tcl \ + tcsh \ + tk \ + wget \ + libusbx \ + fuse-libs \ + python36 \ + && yum clean all + +# MOFED +ARG MOFED_VERSION +ARG MOFED_OS +ENV MOFED_DIR MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 +ENV MOFED_SITE_PLACE MLNX_OFED-${MOFED_VERSION} +ENV MOFED_IMAGE ${MOFED_DIR}.tgz +RUN wget --no-verbose http://content.mellanox.com/ofed/${MOFED_SITE_PLACE}/${MOFED_IMAGE} && \ + tar -xzf ${MOFED_IMAGE} && \ + ${MOFED_DIR}/mlnxofedinstall --all -q \ + --user-space-only \ + --without-fw-update \ + --skip-distro-check \ + --without-ucx \ + --without-hcoll \ + --without-openmpi \ + --without-sharp \ + && rm -rf ${MOFED_DIR} && rm -rf *.tgz + +ENV CPATH /usr/local/cuda/include:${CPATH} +ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH} +ENV LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH} +ENV PATH /usr/local/cuda/compat:${PATH} + diff --git a/buildlib/dockers/docker-compose.yml b/buildlib/dockers/docker-compose.yml new file mode 100644 index 00000000000..82584563f2a --- /dev/null +++ b/buildlib/dockers/docker-compose.yml @@ -0,0 +1,67 @@ +version: "3" + +services: + centos7-mofed5.1-cuda11.1: + image: centos7-mofed5.1-cuda11.1 + build: + context: . + network: host + dockerfile: centos-release.Dockerfile + args: + MOFED_VERSION: 5.1-2.5.8.0 + MOFED_OS: rhel7.6 + CUDA_VERSION: 11.1 + OS_VERSION: 7 + centos8-mofed5.1-cuda11.1: + image: centos8-mofed5.1-cuda11.1 + build: + context: . + network: host + dockerfile: centos8-release.Dockerfile + args: + MOFED_VERSION: 5.1-2.5.8.0 + MOFED_OS: rhel8.3 + CUDA_VERSION: 11.1 + OS_VERSION: 8 + ubuntu18.04-mofed5.1-cuda11.1: + image: ubuntu18.04-mofed5.1-cuda11.1 + build: + context: . 
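+            # Illustrative local build command (hypothetical invocation; CI
+            # publishes these images separately via push-release-images.sh):
+            #   docker-compose -f buildlib/dockers/docker-compose.yml build \
+            #       ubuntu18.04-mofed5.1-cuda11.1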
+ network: host + dockerfile: ubuntu-release.Dockerfile + args: + MOFED_VERSION: 5.1-2.5.8.0 + UBUNTU_VERSION: 18.04 + CUDA_VERSION: 11.1 + ubuntu20.04-mofed5.1-cuda11.1: + image: ubuntu20.04-mofed5.1-cuda11.1 + build: + context: . + network: host + dockerfile: ubuntu-release.Dockerfile + args: + MOFED_VERSION: 5.1-2.5.8.0 + UBUNTU_VERSION: 20.04 + CUDA_VERSION: 11.1 + ubuntu20.10-mofed5.1-cuda11.1: + image: ubuntu20.10-mofed5.1-cuda11.1 + build: + context: . + network: host + dockerfile: ubuntu-release.Dockerfile + args: + MOFED_VERSION: 5.1-2.5.8.0 + UBUNTU_VERSION: 20.10 + CUDA_VERSION: 11.1 + MOFED_OS: ubuntu20.04 + fedora31-mofed5.1-cuda11.1: + image: fedora31-mofed5.1-cuda11.1 + build: + context: . + network: host + dockerfile: fedora-release.Dockerfile + args: + MOFED_VERSION: 5.1-2.5.8.0 + OS_VERSION: 31 + CUDA_VERSION: 11.1 + MOFED_OS: fc31 diff --git a/buildlib/dockers/fedora-release.Dockerfile b/buildlib/dockers/fedora-release.Dockerfile new file mode 100644 index 00000000000..6e660dda934 --- /dev/null +++ b/buildlib/dockers/fedora-release.Dockerfile @@ -0,0 +1,51 @@ +ARG OS_VERSION +FROM fedora:${OS_VERSION} + +RUN dnf install -y \ + autoconf \ + automake \ + clang \ + cppcheck \ + csclng \ + cscppc \ + csmock-common \ + doxygen \ + file \ + gcc-c++ \ + git \ + git-clang-format \ + glibc-devel \ + java-1.8.0-openjdk-devel \ + libtool \ + make \ + maven \ + numactl-devel \ + rdma-core-devel \ + rpm-build \ + libusbx \ + fuse-libs \ + && dnf clean dbcache packages + +# MOFED +ARG MOFED_VERSION +ARG MOFED_OS +ENV MOFED_DIR MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64 +ENV MOFED_SITE_PLACE MLNX_OFED-${MOFED_VERSION} +ENV MOFED_IMAGE ${MOFED_DIR}.tgz +RUN wget --no-verbose http://content.mellanox.com/ofed/${MOFED_SITE_PLACE}/${MOFED_IMAGE} && \ + tar -xzf ${MOFED_IMAGE} && \ + ${MOFED_DIR}/mlnxofedinstall --all -q \ + --user-space-only \ + --without-fw-update \ + --skip-distro-check \ + --without-ucx \ + --without-hcoll \ + --without-openmpi \ + --without-sharp \ + && rm -rf ${MOFED_DIR} && rm -rf *.tgz + +ENV CPATH /usr/local/cuda/include:${CPATH} +ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH} +ENV LIBRARY_PATH /usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH} +ENV PATH /usr/local/cuda/compat:${PATH} + diff --git a/buildlib/dockers/fedora.Dockerfile b/buildlib/dockers/fedora.Dockerfile new file mode 100644 index 00000000000..cb75a540f5b --- /dev/null +++ b/buildlib/dockers/fedora.Dockerfile @@ -0,0 +1,32 @@ +# docker build -t ucfconsort.azurecr.io/ucx/fedora:5 -f buildlib/fedora.Dockerfile buildlib/ +FROM fedora:33 + +RUN dnf install -y \ + autoconf \ + automake \ + cmake \ + cppcheck \ + csclng \ + cscppc \ + csmock-common \ + doxygen \ + file \ + gcc-c++ \ + git \ + git-clang-format \ + glibc-devel \ + java-1.8.0-openjdk-devel \ + libtool \ + make \ + maven \ + numactl-devel \ + python \ + rdma-core-devel \ + rpm-build \ + && dnf clean dbcache packages +RUN export BUILD_ROOT=/tmp/llvm-project && \ + git clone https://github.com/openucx/llvm-project.git --depth=1 -b ucx-clang-format --single-branch ${BUILD_ROOT} && \ + mkdir -p ${BUILD_ROOT}/build && cd ${BUILD_ROOT}/build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS=clang -G "Unix Makefiles" \ + ../llvm && \ + make -j$(nproc) && make install && rm -rf ${BUILD_ROOT} diff --git a/buildlib/push-release-images.sh b/buildlib/dockers/push-release-images.sh similarity index 100% rename from buildlib/push-release-images.sh rename to 
buildlib/dockers/push-release-images.sh diff --git a/buildlib/dockers/sles-release.Dockerfile b/buildlib/dockers/sles-release.Dockerfile new file mode 100644 index 00000000000..e69de29bb2d diff --git a/buildlib/ubuntu-release.Dockerfile b/buildlib/dockers/ubuntu-release.Dockerfile similarity index 93% rename from buildlib/ubuntu-release.Dockerfile rename to buildlib/dockers/ubuntu-release.Dockerfile index 61140f2a82a..53c0b262901 100644 --- a/buildlib/ubuntu-release.Dockerfile +++ b/buildlib/dockers/ubuntu-release.Dockerfile @@ -3,6 +3,7 @@ ARG UBUNTU_VERSION=16.04 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata && \ apt-get install -y \ automake \ default-jdk \ diff --git a/buildlib/fedora.Dockerfile b/buildlib/fedora.Dockerfile deleted file mode 100644 index 785d84bd445..00000000000 --- a/buildlib/fedora.Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -# docker build -t ucfconsort.azurecr.io/ucx/fedora:1 -f buildlib/fedora.Dockerfile buildlib/ -FROM fedora:32 - -RUN dnf install -y \ - autoconf \ - automake \ - clang \ - cppcheck \ - csclng \ - cscppc \ - csmock-common \ - doxygen \ - file \ - gcc-c++ \ - git \ - glibc-devel \ - java-1.8.0-openjdk-devel \ - libtool \ - make \ - maven \ - numactl-devel \ - rdma-core-devel \ - rpm-build \ - && dnf clean dbcache packages diff --git a/buildlib/io-demo.yml b/buildlib/io-demo.yml deleted file mode 100644 index eb51e4bbe15..00000000000 --- a/buildlib/io-demo.yml +++ /dev/null @@ -1,165 +0,0 @@ -parameters: - - name: test_duration - type: number - default: 2 - - name: tests - type: object - default: - base: - initial_delay: 20 - cycles: 100 - downtime: 5 - uptime: 20 - -jobs: - - job: io_build - displayName: Build io_demo - - pool: - name: MLNX - demands: - - ucx_roce -equals yes - - steps: - # address permissions issue when some files created as read-only - - bash: chmod u+rwx ./ -R - - - checkout: self - clean: true - displayName: Checkout - - - bash: | - set -eEx - - ./autogen.sh - ./contrib/configure-release --prefix=$PWD/__install - make -j`nproc` - - echo "##vso[task.setvariable variable=workspace;isOutput=true]$(Build.Repository.LocalPath)/buildlib" - displayName: Build - name: build - - - job: test - dependsOn: io_build - - pool: - name: MLNX - demands: - - ucx_roce -equals yes - - strategy: - matrix: - ${{ each test in parameters.tests }}: - ${{ test.Key }}: - test_name: ${{ test.Key }} - initial_delay: ${{ test.Value.initial_delay }} - cycles: ${{ test.Value.cycles }} - downtime: ${{ test.Value.downtime }} - uptime: ${{ test.Value.uptime }} - maxParallel: 1 - - variables: - workspace: $[ dependencies.io_build.outputs['build.workspace'] ] - - displayName: "io_demo: " - steps: - - checkout: none - - - bash: | - set -eEx - # set UCX environment variables - export UCX_SOCKADDR_CM_ENABLE=y - # TODO get hostname of 'ucx-roce-client' SSH service endpoint, and run on it - $(workspace)/../test/apps/iodemo/run_io_demo.sh \ - -H swx-rdmz-ucx-roce-01,swx-rdmz-ucx-roce-02 \ - --tasks-per-node 1 \ - --duration 60 \ - -v \ - --num-clients 1 \ - --num-servers 1 \ - --map-by slot \ - $(workspace)/../test/apps/iodemo/io_demo \ - -d 512:524288 \ - -o read,write \ - -i 0 \ - -w 16 \ - -t 10 - displayName: Launch with run_io_demo.sh - timeoutInMinutes: 2 - - - bash: | - set -eEx - source ./buildlib/az-helpers.sh - ./buildlib/az-network-corrupter.sh \ - initial_delay=$(initial_delay) \ - cycles=$(cycles) \ - downtime=$(downtime) \ - 
uptime=$(uptime) \ - |& add_timestamp &>corrupter.log & - - pgrep -u "$USER" -f 'network-corrupter' - corrupter_pid=$(pgrep -u "$USER" -f 'network-corrupter') - echo "corrupter_pid=$corrupter_pid" - azure_set_variable "corrupter_pid" "$corrupter_pid" - displayName: Start network corrupter - timeoutInMinutes: 5 - - - bash: | - set -eEx - source ./buildlib/az-helpers.sh - ./buildlib/az-io_demo.sh workspace=$(workspace) - displayName: Start server - name: server - timeoutInMinutes: 5 - - - task: SSH@0 - inputs: - sshEndpoint: ucx-roce-client - runOptions: inline - inline: | - set -eEx - $(workspace)/az-io_demo.sh \ - workspace=$(workspace) \ - server_ip=$(server_ip) - duration=${{ parameters.test_duration }} - failOnStdErr: false - displayName: Test - timeoutInMinutes: ${{ parameters.test_duration }} - - - bash: | - set -eEx - cd $(workspace) - pid=$(server_pid) - echo "Stopping the server, PID=${pid}" - if ! kill ${pid}; then - echo "##vso[task.logissue type=error]Can't stop server: process doesn't exist" - echo "##vso[task.complete result=Failed;]" - else - echo "Server stopped successfully" - fi - cat $(workspace)/server.log - displayName: Kill the server - condition: always() - timeoutInMinutes: 5 - - - bash: | - set -eEx - pid=$(corrupter_pid) - echo "Stopping corrupter, PID=${pid}" - if ! kill ${pid}; then - echo "##vso[task.logissue type=warning]Can't stop corrupter: process doesn't exist" - echo "##vso[task.complete result=Failed;]" - else - echo "Corrupter stopped successfully" - fi - cat corrupter.log - displayName: Kill corrupter - condition: always() - timeoutInMinutes: 10 - - - bash: | - set -eEx - ./buildlib/az-network-corrupter.sh reset=yes - displayName: Restore port state - condition: always() - timeoutInMinutes: 1 diff --git a/buildlib/io_demo/az-stage-io-demo.yaml b/buildlib/io_demo/az-stage-io-demo.yaml new file mode 100644 index 00000000000..f251f02c30c --- /dev/null +++ b/buildlib/io_demo/az-stage-io-demo.yaml @@ -0,0 +1,86 @@ +parameters: +- name: name # defaults for any parameters that aren't specified + default: 'test' +- name: iodemo_args + default: '' +- name: iodemo_tls + default: 'rc_x' +- name: duration + default: 60 + +steps: +- bash: | + set -eEx + source $(workspace)/buildlib/az-helpers.sh + $(workspace)/buildlib/az-network-corrupter.sh \ + initial_delay=$(initial_delay) \ + cycles=$(cycles) \ + downtime=$(downtime) \ + uptime=$(uptime) \ + |& add_timestamp &>corrupter.log & + while ! 
pgrep -u "$USER" -f 'network-corrupter' + do + sleep 1 + done + pgrep -u "$USER" -f 'network-corrupter' + corrupter_pid=$(pgrep -u "$USER" -f 'network-corrupter') + echo "corrupter_pid=$corrupter_pid" + azure_set_variable "corrupter_pid" "$corrupter_pid" + displayName: Start network corrupter + timeoutInMinutes: 2 + +- bash: | + set -eEx + sudo /hpc/local/bin/lshca + mkdir -p $(workspace)/${{ parameters.name }} + # set UCX environment variables + export UCX_NET_DEVICES=$(ibdev2netdev | sed -ne 's/\(\w*\) port \([0-9]\) ==> '${roce_iface}' .*/\1:\2/p') + export UCX_TLS=${{ parameters.iodemo_tls }} + export LD_LIBRARY_PATH=$(workspace)/install/lib:$LD_LIBRARY_PATH + $(workspace)/test/apps/iodemo/run_io_demo.sh \ + -H $(agent_hosts) \ + --tasks-per-node 1 \ + --duration ${{ parameters.duration }} \ + -v \ + --num-clients 1 \ + --num-servers 1 \ + --map-by slot \ + --log-dir $(workspace)/${{ parameters.name }} \ + -i $(roce_iface) \ + $(io_demo_exe) \ + -d 512:524288 \ + -P 2 \ + -o read,write \ + -i 0 \ + -w 16 \ + -t 10 \ + ${{ parameters.iodemo_args }} + displayName: Launch with run_io_demo.sh ( ${{ parameters.name }} ) + timeoutInMinutes: 10 + +- bash: | + python $(workspace)/buildlib/io_demo/iodemo_analyzer.py -d $(workspace)/${{ parameters.name }} --duration ${{ parameters.duration }} + displayName: Analyze for ${{ parameters.name }} + timeoutInMinutes: 1 + +- bash: | + set -eEx + pid=$(corrupter_pid) + echo "Stopping corrupter, PID=${pid}" + if ! kill ${pid}; then + echo "##vso[task.logissue type=warning]Can't stop corrupter: process doesn't exist" + echo "##vso[task.complete result=Failed;]" + else + echo "Corrupter stopped successfully" + fi + cat corrupter.log + displayName: Kill corrupter + condition: always() + timeoutInMinutes: 10 + +- bash: | + set -eEx + $(workspace)/buildlib/az-network-corrupter.sh reset=yes + displayName: Restore port state + condition: always() + timeoutInMinutes: 2 diff --git a/buildlib/io_demo/io-demo.yml b/buildlib/io_demo/io-demo.yml new file mode 100644 index 00000000000..51c5141ea58 --- /dev/null +++ b/buildlib/io_demo/io-demo.yml @@ -0,0 +1,104 @@ +parameters: + - name: demands + type: string + default: "ucx_iodemo -equals yes" + - name: initial_delay + type: number + default: 20 + - name: cycles + type: number + default: 100 + - name: downtime + type: number + default: 5 + - name: uptime + type: number + default: 40 + - name: tests + type: object + default: + tag: + args: "" + duration: 480 + active: + args: "-q -A" + duration: 480 + +jobs: + - job: io_build + displayName: Build io_demo + + pool: + name: MLNX + demands: ${{ parameters.demands }} + + steps: + # address permissions issue when some files created as read-only + - bash: chmod u+rwx ./ -R + + - checkout: self + clean: true + fetchDepth: 100 + displayName: Checkout + - bash: | + set -eEx + ./autogen.sh + ./contrib/configure-release --prefix=$(Build.Repository.LocalPath)/install + make -j`nproc` + make install + displayName: Build + name: build + - task: CopyFiles@2 + inputs: + sourceFolder: '$(Build.Repository.LocalPath)' + contents: | + buildlib/az-helpers.sh + buildlib/az-network-corrupter.sh + buildlib/io_demo/iodemo_analyzer.py + install/** + test/apps/iodemo/run_io_demo.sh + targetFolder: '$(Build.ArtifactStagingDirectory)' + - task: PublishBuildArtifacts@1 + inputs: + pathToPublish: '$(Build.ArtifactStagingDirectory)' + artifactName: drop_$(Build.BuildId) + + - job: test + dependsOn: io_build + + pool: + name: MLNX + demands: ${{ parameters.demands }} + + strategy: + matrix: + ${{ each 
test in parameters.tests }}: + ${{ test.Key }}: + test_name: ${{ test.Key }} + test_args: ${{ test.Value.args }} + test_time: ${{ test.Value.duration }} + maxParallel: 1 + + variables: + workspace: drop_$(Build.BuildId) + io_demo_exe: drop_$(Build.BuildId)/install/bin/io_demo + initial_delay: ${{ parameters.initial_delay }} + cycles: ${{ parameters.cycles }} + downtime: ${{ parameters.downtime }} + uptime: ${{ parameters.uptime }} + + + displayName: "io_demo: " + steps: + - checkout: none + - task: DownloadBuildArtifacts@0 + displayName: 'Download Build Artifacts' + inputs: + artifactName: drop_$(Build.BuildId) + downloadPath: $(System.DefaultWorkingDirectory) + - bash: chmod u+rwx $(workspace) -R + - template: az-stage-io-demo.yaml + parameters: + name: $(test_name) + iodemo_args: $(test_args) + duration: $(test_time) diff --git a/buildlib/io_demo/iodemo_analyzer.py b/buildlib/io_demo/iodemo_analyzer.py new file mode 100644 index 00000000000..f898163c499 --- /dev/null +++ b/buildlib/io_demo/iodemo_analyzer.py @@ -0,0 +1,246 @@ +import subprocess +import os +import argparse +import re +import datetime,time +import traceback,sys + +allow_error_list = [ + 'Connection reset by remote peer', + 'UCX-connection.*detected error:', + 'ERROR Remote QP on mlx', + 'UCX ERROR RC QP', + 'ERROR IB Async event on', + 'setting error flag on connection', + 'Operation rejected by remote peer', + 'got error event RDMA_CM_EVENT_ADDR_ERROR', + 'rdma_accept', + 'UCX ERROR Remote access on', + 'UCX ERROR Transport retry count exceeded on', + 'UCX WARN failed to disconnect CM lane', + 'ucp_ep_create\(\) failed: Input/output error', + 'terminate connection.*due to Input/output error', + 'UCX ERROR Local QP operation on', + 'conn_id send request.*failed: Input/output error', + 'deleting connection with status Input/output error', + 'UCX WARN failed to disconnect CM lane .* Operation rejected by remote peer', + 'ucp_ep_query\(\) failed: Endpoint timeout', + 'UCX ERROR rdma_reject.*failed with error: Invalid argument', + 'UCX ERROR rdma_init_qp_attr.*failed: Invalid argument', + 'UCX ERROR rdma_establish on ep.*failed: Invalid argument', + 'UCX ERROR .*client.*failed to process a connect response' + ] + + +re_allow_list = re.compile("|".join(allow_error_list), re.I) +re_timestamp = re.compile(r"\[(\d+\.\d+)\].*") +re_traffic = re.compile(r"\[(\d+\.\d+)\].*read (\d+.\d+).*min:(\d+).*write (\d+.\d+).*min:(\d+).*") +re_traffic_read = re.compile(r"\[(\d+\.\d+)\].*read (\d+.\d+) MB\/s min:(\d+).*") +re_traffic_write = re.compile(r"\[(\d+\.\d+)\].*write (\d+.\d+) MB\/s min:(\d+).*") +re_error = re.compile(r".*(error|assert|backtrace|segmentation).*", re.I) +re_warning = re.compile(r".*warn.*", re.I) + + +def in_allow_list(line, is_allow_list): + if is_allow_list: + s = re_allow_list.search(line) + if s: + return True + return False + + +def process_seek(seek_file): + data = {} + if not seek_file or not os.path.exists(seek_file): + return data + + with open(seek_file) as f: + for line in f.readlines(): + if line: + # [log name] \t [position] \t [previous timestamp] \t [rx timestamp] \t [tx timestamp] + d = line.split('\t') + ts = datetime.datetime.fromtimestamp(float(d[2])) + rx_ts = float(d[3]) + tx_ts = float(d[4]) + data[d[0]] = { + 'pos': int(d[1]), + 'timestamp': ts, + 'timestamp_rx': rx_ts, + 'timestamp_tx': tx_ts, + } + # Burn After Reading + open(seek_file, 'w').close() + return data + + +def get_logs(directory): + client_list = [] + server_list = [] + for f in os.listdir(directory): + filename = os.path.join(directory, f) + if
os.path.isfile(filename) and "_client_" in f: + client_list.append(filename) + if os.path.isfile(filename) and "_server_" in f: + server_list.append(filename) + return client_list, server_list + + +def process_server(files, is_allow_list): + for log in files: + with open(log) as f: + while True: + line = f.readline() + if not line: + break + + # The patterns are pre-compiled with re.I; match() takes a position, not flags + m = re_error.match(line) or re_warning.match(line) + if m and not in_allow_list(line, is_allow_list): + raise Exception("Contains error: {}\nLog {}:\nLine {}".format(line, log, line)) + + +def process_client(files, threshold, seek_file, is_allow_list, duration): + seek_data = process_seek(seek_file) + for log in files: + with open(log) as f: + curr_ts = 0 + curr_traffic_ts = 0 + start_traffic_ts = 0 + cur_traffic_date = "" + prev_traffic_ts = seek_data.get(log, {}).get('timestamp', 0) + zero_rx_ts = seek_data.get(log, {}).get('timestamp_rx', 0) + zero_tx_ts = seek_data.get(log, {}).get('timestamp_tx', 0) + pos_prev = seek_data.get(log, {}).get('pos', 0) + f.seek(pos_prev) + i = 0 + while True: + line = f.readline() + if not line: + if seek_file and cur_traffic_date: + pos = f.tell() + with open(seek_file, 'a+') as s: + s.write("{}\t{}\t{}\t{}\t{}\n".format( + log, pos, cur_traffic_date, zero_rx_ts, zero_tx_ts)) + break + + timestamp_match = re_timestamp.match(line) + if timestamp_match: + date = float(timestamp_match.group(1)) + curr_ts = datetime.datetime.fromtimestamp(date) + if not prev_traffic_ts: + prev_traffic_ts = curr_ts + if not start_traffic_ts: + start_traffic_ts = curr_ts + + i += 1 + read_match = re_traffic_read.match(line) + write_match = re_traffic_write.match(line) + + current_match = None + + if read_match: + current_match = read_match + cur_traffic_date = current_match.group(1) + date_traffic = float(cur_traffic_date) + curr_traffic_ts = datetime.datetime.fromtimestamp(date_traffic) + rx = float(current_match.group(2)) + min_server_rx = int(current_match.group(3)) + + if min_server_rx == 0 and zero_rx_ts: + delta = curr_traffic_ts - datetime.datetime.fromtimestamp(zero_rx_ts) + if delta.total_seconds() > threshold * 60: + raise Exception("Read min:0 from servers for {} minutes \ (exceeds threshold: {} minutes)\nLog {}:\nLine {}".format( + delta.total_seconds()/60.0, threshold, log, line)) + else: + zero_rx_ts = date_traffic + + if not rx: + raise Exception("Read speed is zero:\nLog {}:\nLine {}".format(log, line)) + prev_traffic_ts = curr_traffic_ts + + if write_match: + current_match = write_match + cur_traffic_date = current_match.group(1) + date_traffic = float(cur_traffic_date) + curr_traffic_ts = datetime.datetime.fromtimestamp(date_traffic) + tx = float(current_match.group(2)) + min_server_tx = int(current_match.group(3)) + + if min_server_tx == 0 and zero_tx_ts: + delta = curr_traffic_ts - datetime.datetime.fromtimestamp(zero_tx_ts) + if delta.total_seconds() > threshold * 60: + raise Exception("Write min:0 from servers for {} minutes \ (exceeds threshold: {} minutes)\nLog {}:\nLine {}".format( + delta.total_seconds()/60.0, threshold, log, line)) + else: + zero_tx_ts = date_traffic + + if not tx: + raise Exception("Write speed is zero:\nLog {}:\nLine {}".format(log, line)) + + prev_traffic_ts = curr_traffic_ts + + + if current_match and prev_traffic_ts: + delta = curr_traffic_ts - prev_traffic_ts + if delta.total_seconds() > threshold * 60: + raise Exception("Traffic gap of {} minutes exceeds threshold of {} minutes\nLog {}:\nLine {}".format( + delta.total_seconds()/60.0, threshold, log, line)) + + if not current_match: + current_match =
re_error.match(line) + if current_match: + if not in_allow_list(line, is_allow_list): + raise Exception("Contains error: {}\nLog {}:\nLine {}".format(line, log, line)) + else: + current_match = re_warning.match(line) + if current_match: + print("log {} [{}] contains warning: {}".format(log, i, line)) + + if curr_ts and (curr_ts - prev_traffic_ts).total_seconds() > threshold * 60: + raise Exception("No traffic\n{}\nLog {}".format(line, log)) + if duration and curr_traffic_ts and start_traffic_ts: + traffic_duration = curr_traffic_ts - start_traffic_ts + delta = duration - traffic_duration.total_seconds() + if delta > threshold * 60: + raise Exception("No traffic for more than {} minutes at the end of the test".format( + delta/60.0)) + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--filename', type=str, + help='Log filename') + parser.add_argument('-d', '--directory', type=str, + help='Directory name with Logs') + parser.add_argument('-t', '--no_traffic_in_sec', type=int, default=1, + help='No-traffic threshold, in minutes') + parser.add_argument('-s', '--seek', type=str, default="", + help='path to seek file') + parser.add_argument('--duration', type=int, default=0, + help='io_demo run duration, in seconds') + parser.add_argument('-r', '--role', type=str, default="client", choices=['client', 'server'], + help='role of the log when --filename is used') + parser.add_argument('--no-allow-list', dest='allow_list', action='store_false') + + args = parser.parse_args() + + clients = [] + servers = [] + if args.filename: + if args.role == "client": + clients.append(args.filename) + elif args.role == "server": + servers.append(args.filename) + + if args.directory: + clients, servers = get_logs(args.directory) + + try: + process_client(clients, args.no_traffic_in_sec, args.seek, args.allow_list, args.duration) + process_server(servers, args.allow_list) + except Exception as e: + print("Error in iodemo analyzer: {}\n".format(e)) + traceback.print_exc(file=sys.stdout) + exit(1) diff --git a/buildlib/jucx-publish.yml b/buildlib/jucx/jucx-publish.yml similarity index 89% rename from buildlib/jucx-publish.yml rename to buildlib/jucx/jucx-publish.yml index de4bef44960..9dbdff32821 100644 --- a/buildlib/jucx-publish.yml +++ b/buildlib/jucx/jucx-publish.yml @@ -5,12 +5,18 @@ parameters: jobs: - job: jucx_release - - container: centos7 + condition: eq(stageDependencies.Check_Commit.Check.outputs['Commit.Launch'], 'Yes') + # we need to use the lowest version for compatibility + container: centos7_cuda10_1 + pool: + name: MLNX + demands: + - ucx_docker -equals yes steps: - checkout: self clean: true + fetchDepth: 100 - bash: | set -eE diff --git a/buildlib/jucx/jucx-test.yml b/buildlib/jucx/jucx-test.yml new file mode 100755 index 00000000000..41709a5bf3f --- /dev/null +++ b/buildlib/jucx/jucx-test.yml @@ -0,0 +1,91 @@ +parameters: + name: java test + demands: [] + +jobs: + - job: ${{ parameters.name }} + + pool: + name: MLNX + demands: ${{ parameters.demands }} + + strategy: + matrix: + java8: + JAVA_VERSION: 1.8 + java11: + JAVA_VERSION: 1.11 + + steps: + - checkout: self + fetchDepth: 100 + clean: true + displayName: Checkout + - bash: | + set -x + source buildlib/az-helpers.sh + az_init_modules + res=0 + az_module_load dev/mvn + res=$(($res+$?)) + az_module_load dev/jdk-${JAVA_VERSION} + res=$(($res+$?)) + if [ $res -ne 0 ]; then + exit 0; + fi + try_load_cuda_env + set -eE + ./autogen.sh + ./contrib/configure-devel
--prefix=$(Build.Repository.LocalPath)/install \ + --with-java --enable-gtest=no --with-cuda=$have_cuda + make -j`nproc` + make install + displayName: Build UCX + - bash: | + set -x + source buildlib/az-helpers.sh + az_init_modules + try_load_cuda_env + res=0 + az_module_load dev/mvn + res=$(($res+$?)) + az_module_load dev/jdk-${JAVA_VERSION} + res=$(($res+$?)) + if [ $res -ne 0 ]; then + exit 0; + fi + set -eE + ifaces=`get_rdma_interfaces` + if [ -z "$ifaces" ]; then + azure_log_warning "No active RDMA interfaces on machine" + exit 0; + fi + jucx_port=$((20000 + $RANDOM % 10000)) + export JUCX_TEST_PORT=$jucx_port + make -C bindings/java/src/main/native test + make -C bindings/java/src/main/native package + ipv4_found=0 + for iface in $ifaces + do + server_ip=$(get_ip ${iface}) + if [ -z "$server_ip" ]; then + continue + fi + echo "Running standalone benchmark on $iface:$jucx_port" + java_cmd='java -XX:ErrorFile=$(Build.ArtifactStagingDirectory)/hs_err_$(Build.BuildId)_%p.log \ + -XX:OnError="cat $(Build.ArtifactStagingDirectory)/hs_err_$(Build.BuildId)_%p.log" \ + -cp "bindings/java/resources/:bindings/java/src/main/native/build-java/*" \ + org.openucx.jucx.examples.$bench_class s=$server_ip p=$jucx_port t=1000000' + bench_class=UcxReadBWBenchmarkReceiver + eval "$java_cmd &" + java_pid=$! + sleep 10 + bench_class=UcxReadBWBenchmarkSender + eval "$java_cmd" + wait $java_pid + ipv4_found=1 + done + if [[ $ipv4_found -eq 0 ]]; then + azure_log_warning "No IPv4 address on any of $ifaces" + fi + displayName: Run jucx tests diff --git a/buildlib/tests.yml b/buildlib/tests.yml index 040d7024e76..bef13f02548 100644 --- a/buildlib/tests.yml +++ b/buildlib/tests.yml @@ -10,7 +10,7 @@ jobs: name: MLNX demands: ${{ parameters.demands }} displayName: ${{ parameters.name }} on worker - timeoutInMinutes: 360 + timeoutInMinutes: 300 strategy: matrix: ${{ each wid in parameters.worker_ids }}: @@ -22,8 +22,10 @@ jobs: - checkout: self clean: true + fetchDepth: 100 - bash: | + source ./buildlib/az-helpers.sh ./contrib/test_jenkins.sh displayName: Run ./contrib/test_jenkins.sh env: @@ -37,4 +39,5 @@ jobs: EXECUTOR_NUMBER: $(AZP_AGENT_ID) RUN_TESTS: yes JENKINS_TEST_PERF: ${{ parameters.test_perf }} - + JENKINS_NO_VALGRIND: ${{ parameters.valgrind_disable }} + RUNNING_IN_AZURE: yes diff --git a/buildlib/tools/builds.sh b/buildlib/tools/builds.sh new file mode 100755 index 00000000000..cc4e4eba6c3 --- /dev/null +++ b/buildlib/tools/builds.sh @@ -0,0 +1,356 @@ +#!/bin/bash -eExl + +realdir=$(realpath $(dirname $0)) +source ${realdir}/common.sh +source ${realdir}/../az-helpers.sh +long_test=${long_test:-no} + +# +# Build documentation +# +build_docs() { + if [ `cat /etc/system-release | grep -i "fedora release 34" | wc -l` -gt 0 ]; then + azure_log_warning "Skip build docs on Fedora 34" + return 0 + fi + doxy_ready=0 + doxy_target_version="1.8.11" + doxy_version="$(doxygen --version)" || true + + # Try load newer doxygen if native is older than 1.8.11 + if !
(echo $doxy_target_version; echo $doxy_version) | sort -CV + then + if az_module_load tools/doxygen-1.8.11 + then + doxy_ready=1 + fi + else + doxy_ready=1 + fi + + if [ $doxy_ready -eq 1 ] + then + echo " ==== Build docs only ====" + ${WORKSPACE}/contrib/configure-release --prefix=$ucx_inst --with-docs-only + $MAKE docs + fi +} + +# +# Build without verbs +# +build_no_verbs() { + echo "==== Build without IB verbs ====" + ${WORKSPACE}/contrib/configure-release --prefix=$ucx_inst --without-verbs + $MAKEP +} + +# +# Build without numa support check +# +build_disable_numa() { + echo "==== Check --disable-numa compilation option ====" + ${WORKSPACE}/contrib/configure-release --prefix=$ucx_inst --disable-numa + $MAKEP + # Make sure the config.h file undefines the HAVE_NUMA preprocessor macro + grep 'undef HAVE_NUMA' config.h || exit 1 +} + +# +# Build a package in release mode +# +build_release_pkg() { + echo "==== Build release ====" + ${WORKSPACE}/contrib/configure-release + $MAKEP distcheck + + if [ -f /etc/redhat-release -o -f /etc/fedora-release ]; then + rpm_based=yes + elif [ `cat /etc/os-release | grep -i "ubuntu\|mint"|wc -l` -gt 0 ]; then + rpm_based=no + else + # try rpm tool to detect distro + set +e + out=$(rpm -q rpm 2>/dev/null) + rc=$? + set -e + rpm_based=yes + if [[ $rc != 0 || "$out" == *"not installed"* ]]; then + rpm_based=no + fi + fi + + if [[ "$rpm_based" == "no" && -x /usr/bin/dpkg-buildpackage ]]; then + echo "==== Build debian package ====" + dpkg-buildpackage -us -uc + else + echo "==== Build RPM ====" + echo "$PWD" + ${WORKSPACE}/contrib/buildrpm.sh -s -b --nodeps --define "_topdir $PWD" + fi + + # check that UCX version is present in spec file + cd ${WORKSPACE} + # extract version from configure.ac and convert to MAJOR.MINOR.PATCH representation + version=$(grep -P "define\S+ucx_ver" configure.ac | awk '{print $2}' | sed 's,),,' | xargs echo | tr ' ' '.') + if ! grep -q "$version" ucx.spec.in; then + azure_log_error "Current UCX version ($version) is not present in ucx.spec.in changelog" + exit 1 + fi + cd - +} + +# +# Build with Intel compiler +# +build_icc() { + if az_module_load $INTEL_MODULE && icc -v + then + echo "==== Build with Intel compiler ====" + ${WORKSPACE}/contrib/configure-devel --prefix=$ucx_inst CC=icc CXX=icpc + $MAKEP + make_clean distclean + + echo "==== Build with Intel compiler (clang) ====" + ${WORKSPACE}/contrib/configure-devel --prefix=$ucx_inst CC=clang CXX=clang++ + $MAKEP + make_clean distclean + else + azure_log_warning "Not building with Intel compiler" + fi + az_module_unload $INTEL_MODULE +} + +# +# Build with PGI compiler +# +build_pgi() { + if az_module_load $PGI_MODULE + then + # The add_network_host utility from $PGI_MODULE creates a config file for the machine + # Doc: https://docs.nvidia.com/hpc-sdk/hpc-sdk-install-guide/index.html + add_network_host + echo "==== Build with PGI compiler ====" + # PGI failed to build valgrind headers, disable it for now + # TODO: Using non-default PGI compiler - pgcc18 which is going to be default + # in next versions.
+ # Switch to default CC compiler after pgcc18 is default for pgi module + ${WORKSPACE}/contrib/configure-devel --prefix=$ucx_inst --without-valgrind + $MAKEP + # TODO: Check why "make distclean" is needed to cleanup after PGI compiler + make_clean distclean + else + azure_log_warning "Not building with PGI compiler" + fi + az_module_unload $PGI_MODULE +} + +# +# Build debug version +# +build_debug() { + echo "==== Build with --enable-debug option ====" + ${WORKSPACE}/contrib/configure-devel --prefix=$ucx_inst --enable-debug --enable-examples + $MAKEP + + # Show UCX info + ./src/tools/info/ucx_info -s -f -c -v -y -d -b -p -w -e -uart -m 20M +} + +# +# Build prof +# +build_prof() { + echo "==== Build configure-prof ====" + ${WORKSPACE}/contrib/configure-prof --prefix=$ucx_inst + $MAKEP +} + +# +# Build UGNI +# +build_ugni() { + echo "==== Build with cray-ugni ====" + # + # Point pkg-config to contrib/cray-ugni-mock, and replace + # PKG_CONFIG_TOP_BUILD_DIR with source dir, since the mock .pc files contain + # relative paths. + # + ${WORKSPACE}/contrib/configure-devel --prefix=$ucx_inst --with-ugni \ + PKG_CONFIG_PATH=$PKG_CONFIG_PATH:/${WORKSPACE}/contrib/cray-ugni-mock \ + PKG_CONFIG_TOP_BUILD_DIR=${WORKSPACE} + $MAKEP + + # make sure UGNI transport is enabled + grep '#define HAVE_TL_UGNI 1' config.h + + $MAKEP distcheck +} + +# +# Build CUDA +# +build_cuda() { + if az_module_load $CUDA_MODULE + then + if az_module_load $GDRCOPY_MODULE + then + echo "==== Build with enable cuda, gdr_copy ====" + ${WORKSPACE}/contrib/configure-devel --prefix=$ucx_inst --with-cuda --with-gdrcopy + $MAKEP + make_clean distclean + + ${WORKSPACE}/contrib/configure-release --prefix=$ucx_inst --with-cuda --with-gdrcopy + $MAKEP + make_clean distclean + az_module_unload $GDRCOPY_MODULE + fi + + echo "==== Build with enable cuda, w/o gdr_copy ====" + ${WORKSPACE}/contrib/configure-devel --prefix=$ucx_inst --with-cuda --without-gdrcopy + $MAKEP + + az_module_unload $CUDA_MODULE + + echo "==== Running test_link_map with cuda build but no cuda module ====" + env UCX_HANDLE_ERRORS=bt ./test/apps/test_link_map + else + echo "==== Not building with cuda flags ====" + fi +} + +# +# Build with clang compiler +# +build_clang() { + if which clang > /dev/null 2>&1 + then + echo "==== Build with clang compiler ====" + ${WORKSPACE}/contrib/configure-devel --prefix=$ucx_inst CC=clang CXX=clang++ + $MAKEP + $MAKEP install + else + echo "==== Not building with clang compiler ====" + fi +} + +# +# Build with gcc-latest module +# +build_gcc() { + #If the glibc version on the host is older than 2.14, don't run + #check the glibc version with the ldd version since it comes with glibc + #see https://www.linuxquestions.org/questions/linux-software-2/how-to-check-glibc-version-263103/ + #see https://benohead.com/linux-check-glibc-version/ + #see https://stackoverflow.com/questions/9705660/check-glibc-version-for-a-particular-gcc-compiler + if [ `cat /etc/os-release | grep -i "ubuntu\|mint"|wc -l` -gt 0 ]; then + azure_log_warning "Not building with latest gcc compiler on Ubuntu" + return 0 + fi + + ldd_ver="$(ldd --version | awk '/ldd/{print $NF}')" + if (echo "2.14"; echo $ldd_ver) | sort -CV + then + if az_module_load $GCC_MODULE + then + echo "==== Build with GCC compiler ($(gcc --version|head -1)) ====" + ${WORKSPACE}/contrib/configure-devel --prefix=$ucx_inst + $MAKEP + $MAKEP install + az_module_unload $GCC_MODULE + fi + else + azure_log_warning "Not building with gcc compiler, glibc version is too old ($ldd_ver)" + fi +} + +# +# 
Build with armclang compiler +# +build_armclang() { + arch=$(uname -m) + if [ "${arch}" != "aarch64" ] + then + echo "==== Not building with armclang compiler on ${arch} ====" + return 0 + fi + + armclang_test_file=$(mktemp ./XXXXXX).c + echo "int main() {return 0;}" > ${armclang_test_file} + if az_module_load $ARM_MODULE && armclang --version && armclang ${armclang_test_file} -o ${armclang_test_file}.out + then + echo "==== Build with armclang compiler ====" + ${WORKSPACE}/contrib/configure-devel --prefix=$ucx_inst CC=armclang CXX=armclang++ + $MAKEP + $MAKEP install + fi + + rm -rf ${armclang_test_file} ${armclang_test_file}.out + az_module_unload $ARM_MODULE +} + +check_inst_headers() { + echo "==== Testing installed headers ====" + + ${WORKSPACE}/contrib/configure-release --prefix=${ucx_inst} + $MAKEP install + ${WORKSPACE}/contrib/check_inst_headers.sh ${ucx_inst}/include +} + +check_config_h() { + srcdir=${WORKSPACE}/src + + # Check if all .c files include config.h + echo "==== Checking for config.h files in directory $srcdir ====" + + missing=`find $srcdir \( -name "*.c" -o -name "*.cc" \) -type f -exec grep -LP '\#\s*include\s+"config.h"' {} \;` + + if [ `echo $missing | wc -w` -eq 0 ] + then + echo "Check successful " + else + azure_log_error "Missing include config.h in files: $missing" + exit 1 + fi +} + +# +# Do a given task and update progress indicator +# +do_task() { + amount=$1 + shift + # cleanup build dir before the task + [ -n "${ucx_build_dir}" ] && rm -rf "${ucx_build_dir}/*" + + $@ + + echo "##vso[task.setprogress value=$PROGRESS;]Progress Indicator" + PROGRESS=$((PROGRESS+amount)) +} + + +az_init_modules +prepare_build + +[ "${long_test}" = "yes" ] && prog=5 || prog=12 + +do_task "${prog}" build_docs +do_task "${prog}" build_debug +do_task "${prog}" build_prof +do_task "${prog}" build_ugni +do_task "${prog}" build_disable_numa +do_task "${prog}" build_cuda +do_task "${prog}" build_no_verbs +do_task "${prog}" build_release_pkg + +if [ "${long_test}" = "yes" ] +then + do_task 5 check_config_h + do_task 5 check_inst_headers + do_task 10 build_icc + do_task 10 build_pgi + do_task 10 build_gcc + do_task 10 build_clang + do_task 10 build_armclang +fi diff --git a/buildlib/tools/common.sh b/buildlib/tools/common.sh new file mode 100644 index 00000000000..121165e8f82 --- /dev/null +++ b/buildlib/tools/common.sh @@ -0,0 +1,56 @@ +#!/bin/bash -eExl + +WORKSPACE=${WORKSPACE:=$PWD} +# build in local directory which goes away when docker exits +ucx_build_dir=$HOME/${BUILD_ID}/build +ucx_inst=$ucx_build_dir/install +CUDA_MODULE="dev/cuda11.1.1" +GDRCOPY_MODULE="dev/gdrcopy2.1_cuda11.1.1" +JDK_MODULE="dev/jdk" +MVN_MODULE="dev/mvn" +XPMEM_MODULE="dev/xpmem-90a95a4" +PGI_MODULE="hpc-sdk/nvhpc/21.2" +GCC_MODULE="dev/gcc-10.1.0" +ARM_MODULE="arm-compiler/armcc-19.0" +INTEL_MODULE="intel/ics-19.1.1" + +# +# Parallel build command runs with 4 tasks, or number of cores on the system, +# whichever is lowest +# +num_cpus=$(lscpu -p | grep -v '^#' | wc -l) +[ -z $num_cpus ] && num_cpus=1 +parallel_jobs=4 +[ $parallel_jobs -gt $num_cpus ] && parallel_jobs=$num_cpus +num_pinned_threads=$(nproc) +[ $parallel_jobs -gt $num_pinned_threads ] && parallel_jobs=$num_pinned_threads + +MAKE="make V=1" +MAKEP="make V=1 -j${parallel_jobs}" +export AUTOMAKE_JOBS=$parallel_jobs + +# +# cleanup ucx +# +make_clean() { + rm -rf ${ucx_inst} + $MAKEP ${1:-clean} +} + +# +# Prepare build environment +# +prepare_build() { + echo " ==== Prepare ====" + env + cd ${WORKSPACE} + if [ -d ${ucx_build_dir} ] + then + 
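+        # Previous builds may leave read-only files (the io_demo jobs work
+        # around the same issue with chmod), so restore write permission
+        # before removing the old build directory.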
chmod u+rwx ${ucx_build_dir} -R + rm -rf ${ucx_build_dir} + fi + ./autogen.sh + mkdir -p ${ucx_build_dir} + cd ${ucx_build_dir} + export PROGRESS=0 +} diff --git a/buildlib/tools/coverity.sh b/buildlib/tools/coverity.sh new file mode 100755 index 00000000000..fe3acc01546 --- /dev/null +++ b/buildlib/tools/coverity.sh @@ -0,0 +1,80 @@ +#!/bin/bash -eExl + +realdir=$(realpath $(dirname $0)) +source ${realdir}/common.sh +source ${realdir}/../az-helpers.sh + +COV_MODULE="tools/cov" + +# +# Run Coverity and report errors +# The argument is a UCX build type: devel or release +# +modules_for_coverity() { + res=0 + az_module_load $COV_MODULE + res=$(($res+$?)) + az_module_load $CUDA_MODULE + res=$(($res+$?)) + az_module_load $GDRCOPY_MODULE + res=$(($res+$?)) + az_module_load $JDK_MODULE + res=$(($res+$?)) + az_module_load $MVN_MODULE + res=$(($res+$?)) + az_module_load $XPMEM_MODULE + res=$(($res+$?)) + return $res +} + +modules_for_coverity_unload() { + res=0 + az_module_unload $COV_MODULE + res=$(($res+$?)) + az_module_unload $CUDA_MODULE + res=$(($res+$?)) + az_module_unload $GDRCOPY_MODULE + res=$(($res+$?)) + az_module_unload $JDK_MODULE + res=$(($res+$?)) + az_module_unload $MVN_MODULE + res=$(($res+$?)) + az_module_unload $XPMEM_MODULE + res=$(($res+$?)) + return $res +} + +run_coverity() { + + az_init_modules + modules_for_coverity + + ucx_build_type=$1 + + xpmem_root=$(module show $XPMEM_MODULE 2>&1 | awk '/CPATH/ {print $3}' | sed -e 's,/include,,') + with_xpmem="--with-xpmem=$xpmem_root" + + ${WORKSPACE}/contrib/configure-$ucx_build_type --prefix=$ucx_inst --with-cuda --with-gdrcopy --with-java $with_xpmem + cov_build_id="cov_build_${ucx_build_type}" + cov_build="$ucx_build_dir/$cov_build_id" + rm -rf $cov_build + mkdir -p $cov_build + cov-build --dir $cov_build $MAKEP all + cov-analyze --jobs $parallel_jobs $COV_OPT --security --concurrency --dir $cov_build + nerrors=$(cov-format-errors --dir $cov_build | awk '/Processing [0-9]+ errors?/ { print $2 }') + rc=$(($rc+$nerrors)) + + if [ $nerrors -gt 0 ]; then + cov-format-errors --dir $cov_build --emacs-style + cp -ar $cov_build $WORKSPACE/$cov_build_id + echo "not ok 1 Coverity Detected $nerrors failures" + else + echo "ok 1 Coverity found no issues" + rm -rf $cov_build + fi + modules_for_coverity_unload + return $rc +} + +prepare_build +run_coverity "$@" diff --git a/config/m4/compiler.m4 b/config/m4/compiler.m4 index 8e40b335d40..4d27c0053a9 100644 --- a/config/m4/compiler.m4 +++ b/config/m4/compiler.m4 @@ -205,7 +205,7 @@ AC_DEFUN([DETECT_UARCH], # CHECK_COMPILER_FLAG # Usage: CHECK_COMPILER_FLAG([name], [flag], [program], [if-true], [if-false]) # -# The macro checks if program may be compiled using specified flag +# The macro checks if program may be compiled and linked using specified flag # AC_DEFUN([CHECK_COMPILER_FLAG], [ @@ -214,15 +214,15 @@ AC_DEFUN([CHECK_COMPILER_FLAG], SAVE_CXXFLAGS="$CFLAGS" CFLAGS="$BASE_CFLAGS $CFLAGS $2" CXXFLAGS="$BASE_CXXFLAGS $CXXFLAGS $2" - AC_COMPILE_IFELSE([$3], - [AC_MSG_RESULT([yes]) - CFLAGS="$SAVE_CFLAGS" - CXXFLAGS="$SAVE_CXXFLAGS" - $4], - [AC_MSG_RESULT([no]) - CFLAGS="$SAVE_CFLAGS" - CXXFLAGS="$SAVE_CXXFLAGS" - $5]) + AC_LINK_IFELSE([$3], + [AC_MSG_RESULT([yes]) + CFLAGS="$SAVE_CFLAGS" + CXXFLAGS="$SAVE_CXXFLAGS" + $4], + [AC_MSG_RESULT([no]) + CFLAGS="$SAVE_CFLAGS" + CXXFLAGS="$SAVE_CXXFLAGS" + $5]) ]) @@ -314,7 +314,7 @@ ADD_COMPILER_FLAG_IF_SUPPORTED([-diag-disable 269], # Set default datatype alignment to 16 bytes. 
 # Some compilers (LLVM based, clang) expects allocation of datatypes by 32 bytes
 # to optimize operations memset/memcpy/etc using vectorized processor instructions
-# which requires aligment of memory buffer by 32 or higer bytes. Default malloc method
+# which requires alignment of memory buffer by 32 or higher bytes. Default malloc method
 # guarantee alignment for 16 bytes only. Force using compiler 16-bytes alignment
 # by default if option is supported.
 #
@@ -322,7 +322,7 @@ UCX_ALLOC_ALIGN=16
 ADD_COMPILER_FLAG_IF_SUPPORTED([-fmax-type-align=$UCX_ALLOC_ALIGN],
                                [-fmax-type-align=$UCX_ALLOC_ALIGN],
                                [AC_LANG_SOURCE([[int main(int argc, char** argv){return 0;}]])],
-                               [AC_DEFINE_UNQUOTED([UCX_ALLOC_ALIGN], $UCX_ALLOC_ALIGN, [Set aligment assumption for compiler])],
+                               [AC_DEFINE_UNQUOTED([UCX_ALLOC_ALIGN], $UCX_ALLOC_ALIGN, [Set alignment assumption for compiler])],
                                [])
@@ -467,13 +467,23 @@ AC_LANG_POP
 #
 # PGI specific switches
 #
+# --diag_suppress 1    - Suppress last line ends without a newline
+# --diag_suppress 68   - Suppress integer conversion resulted in a change of sign
+# --diag_suppress 111  - Suppress statement is unreachable
+# --diag_suppress 167  - Suppress int* incompatible with unsigned int*
 # --diag_suppress 181  - Suppress incorrect printf format for PGI18 compiler. TODO: remove it after compiler fix
+# --diag_suppress 188  - Suppress enumerated type mixed with another type
 # --diag_suppress 381  - Suppress extra ";" ignored
 # --diag_suppress 1215 - Suppress deprecated API warning for PGI18 compiler
 # --diag_suppress 1901 - Use of a const variable in a constant expression is nonstandard in C
 # --diag_suppress 1902 - Use of a const variable in a constant expression is nonstandard in C (same as 1901)
 ADD_COMPILER_FLAGS_IF_SUPPORTED([[--display_error_number],
+                                 [--diag_suppress 1],
+                                 [--diag_suppress 68],
+                                 [--diag_suppress 111],
+                                 [--diag_suppress 167],
                                  [--diag_suppress 181],
+                                 [--diag_suppress 188],
                                  [--diag_suppress 381],
                                  [--diag_suppress 1215],
                                  [--diag_suppress 1901],
@@ -519,7 +529,8 @@ ADD_COMPILER_FLAGS_IF_SUPPORTED([[-Wno-pointer-sign],
                                  [-Werror-implicit-function-declaration],
                                  [-Wno-format-zero-length],
                                  [-Wnested-externs],
-                                 [-Wshadow]],
+                                 [-Wshadow],
+                                 [-Werror=declaration-after-statement]],
                                 [AC_LANG_SOURCE([[int main(int argc, char **argv){return 0;}]])])
diff --git a/config/m4/cuda.m4 b/config/m4/cuda.m4
index 1862eb6148c..74bf0e56435 100644
--- a/config/m4/cuda.m4
+++ b/config/m4/cuda.m4
@@ -12,13 +12,19 @@ AS_IF([test "x$cuda_checked" != "xyes"],
    [],
    [with_cuda=guess])
 
 AS_IF([test "x$with_cuda" = "xno"],
-    [cuda_happy=no],
+    [
+     cuda_happy=no
+     have_cuda_static=no
+    ],
     [
      save_CPPFLAGS="$CPPFLAGS"
      save_LDFLAGS="$LDFLAGS"
+     save_LIBS="$LIBS"
 
      CUDA_CPPFLAGS=""
      CUDA_LDFLAGS=""
+     CUDA_LIBS=""
+     CUDA_STATIC_LIBS=""
 
     AS_IF([test ! 
-z "$with_cuda" -a "x$with_cuda" != "xyes" -a "x$with_cuda" != "xguess"], [ucx_check_cuda_dir="$with_cuda" @@ -40,18 +46,31 @@ AS_IF([test "x$cuda_checked" != "xyes"], # Check cuda libraries AS_IF([test "x$cuda_happy" = "xyes"], - [AC_CHECK_LIB([cuda], [cuDeviceGetUuid], - [CUDA_LDFLAGS="$CUDA_LDFLAGS -lcuda"], [cuda_happy="no"])]) + [AC_CHECK_LIB([cuda], [cuDeviceGetUuid], + [CUDA_LIBS="$CUDA_LIBS -lcuda"], [cuda_happy="no"])]) + AS_IF([test "x$cuda_happy" = "xyes"], + [AC_CHECK_LIB([cudart], [cudaGetDeviceCount], + [CUDA_LIBS="$CUDA_LIBS -lcudart"], [cuda_happy="no"])]) + + LDFLAGS="$save_LDFLAGS" + + # Check for cuda static library + have_cuda_static="no" AS_IF([test "x$cuda_happy" = "xyes"], - [AC_CHECK_LIB([cudart], [cudaGetDeviceCount], - [CUDA_LDFLAGS="$CUDA_LDFLAGS -lcudart"], [cuda_happy="no"])]) + [AC_CHECK_LIB([cudart_static], [cudaGetDeviceCount], + [CUDA_STATIC_LIBS="$CUDA_STATIC_LIBS -lcudart_static" + have_cuda_static="yes"], + [], [-ldl -lrt -lpthread])]) CPPFLAGS="$save_CPPFLAGS" LDFLAGS="$save_LDFLAGS" + LIBS="$save_LIBS" AS_IF([test "x$cuda_happy" = "xyes"], [AC_SUBST([CUDA_CPPFLAGS], ["$CUDA_CPPFLAGS"]) AC_SUBST([CUDA_LDFLAGS], ["$CUDA_LDFLAGS"]) + AC_SUBST([CUDA_LIBS], ["$CUDA_LIBS"]) + AC_SUBST([CUDA_STATIC_LIBS], ["$CUDA_STATIC_LIBS"]) AC_DEFINE([HAVE_CUDA], 1, [Enable CUDA support])], [AS_IF([test "x$with_cuda" != "xguess"], [AC_MSG_ERROR([CUDA support is requested but cuda packages cannot be found])], @@ -61,6 +80,7 @@ AS_IF([test "x$cuda_checked" != "xyes"], cuda_checked=yes AM_CONDITIONAL([HAVE_CUDA], [test "x$cuda_happy" != xno]) + AM_CONDITIONAL([HAVE_CUDA_STATIC], [test "X$have_cuda_static" = "Xyes"]) ]) # "x$cuda_checked" != "xyes" diff --git a/config/m4/fuse3.m4 b/config/m4/fuse3.m4 new file mode 100644 index 00000000000..c79b31a5019 --- /dev/null +++ b/config/m4/fuse3.m4 @@ -0,0 +1,51 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. +# See file LICENSE for terms. 
+#
+
+fuse3_happy="no"
+
+AC_ARG_WITH([fuse3],
+            [AS_HELP_STRING([--with-fuse3=(DIR)],
+                            [Enable the use of FUSEv3 (default is guess).])],
+            [], [with_fuse3=guess])
+
+AS_IF([test "x$with_fuse3" != xno],
+      [
+       AS_IF([test "x$with_fuse3" = "xguess" \
+                 -o "x$with_fuse3" = "xyes" \
+                 -o "x$with_fuse3" = "x"],
+             [FUSE3_CPPFLAGS=$(pkg-config --cflags fuse3)
+              FUSE3_LIBS=$(pkg-config --libs fuse3)],
+             [FUSE3_CPPFLAGS="-I${with_fuse3}/include/fuse3"
+              FUSE3_LIBS="-L${with_fuse3}/lib -L${with_fuse3}/lib64"])
+
+       save_CPPFLAGS="$CPPFLAGS"
+       save_LDFLAGS="$LDFLAGS"
+
+       CPPFLAGS="$FUSE3_CPPFLAGS $CPPFLAGS"
+       LDFLAGS="$FUSE3_LIBS $LDFLAGS"
+
+       fuse3_happy="yes"
+       AC_CHECK_DECLS([fuse_open_channel, fuse_mount, fuse_unmount],
+                      [AC_SUBST([FUSE3_CPPFLAGS], [$FUSE3_CPPFLAGS])
+                       AC_DEFINE([FUSE_USE_VERSION], 30, [Fuse API version])],
+                      [fuse3_happy="no"],
+                      [[#define FUSE_USE_VERSION 30
+                        #include <fuse.h>]])
+
+       AC_CHECK_FUNCS([fuse_open_channel fuse_mount fuse_unmount],
+                      [AC_SUBST([FUSE3_LIBS], [$FUSE3_LIBS])],
+                      [fuse3_happy="no"])
+
+       AS_IF([test "x$fuse3_happy" != "xyes" -a "x$with_fuse3" != "xguess"],
+             [AC_MSG_ERROR([FUSEv3 requested but could not be found])])
+
+       CPPFLAGS="$save_CPPFLAGS"
+       LDFLAGS="$save_LDFLAGS"
+      ],
+      [AC_MSG_WARN([FUSEv3 was explicitly disabled])]
+)
+
+AM_CONDITIONAL([HAVE_FUSE3], [test "x$fuse3_happy" != xno])
+vfs_enable=$fuse3_happy
diff --git a/config/m4/sysdep.m4 b/config/m4/sysdep.m4
index 9a8d5d8f4c6..cec8cc585ac 100644
--- a/config/m4/sysdep.m4
+++ b/config/m4/sysdep.m4
@@ -134,21 +134,42 @@ AS_IF([test "x$with_valgrind" = xno],
 #
 AC_ARG_ENABLE([numa],
               AC_HELP_STRING([--disable-numa], [Disable NUMA support]),
+              [],
+              [enable_numa=guess])
+AS_IF([test "x$enable_numa" = xno],
     [
-        AC_MSG_NOTICE([NUMA support is disabled])
+        AC_MSG_NOTICE([NUMA support is explicitly disabled])
+        numa_enable=disabled
     ],
     [
-        AC_DEFUN([NUMA_W1], [not found. Please reconfigure with --disable-numa. ])
-        AC_DEFUN([NUMA_W2], [Warning: this may have negative impact on library performance. It is better to install])
-        AC_CHECK_HEADERS([numa.h numaif.h], [],
-                         [AC_MSG_ERROR([NUMA headers NUMA_W1 NUMA_W2 libnuma-devel package])])
-        AC_CHECK_LIB(numa, mbind,
-                     [AC_SUBST(NUMA_LIBS, [-lnuma])],
-                     [AC_MSG_ERROR([NUMA library NUMA_W1 NUMA_W2 libnuma package])])
-        AC_DEFINE([HAVE_NUMA], 1, [Define to 1 to enable NUMA support])
-        AC_CHECK_TYPES([struct bitmask], [], [], [[#include <numa.h>]])
-    ]
-)
+        save_LDFLAGS="$LDFLAGS"
+
+        numa_happy=yes
+        AC_CHECK_HEADERS([numa.h numaif.h], [], [numa_happy=no])
+        AC_CHECK_LIB(numa, mbind,
+                     [AC_SUBST(NUMA_LIBS, [-lnuma])],
+                     [numa_happy=no])
+        AC_CHECK_TYPES([struct bitmask], [], [numa_happy=no], [[#include <numa.h>]])
+
+        LDFLAGS="$save_LDFLAGS"
+
+        AS_IF([test "x$numa_happy" = xyes],
+              [
+               AC_DEFINE([HAVE_NUMA], 1, [Define to 1 to enable NUMA support])
+               numa_enable=enabled
+              ],
+              [
+               AC_DEFUN([NUMA_W1], [NUMA support not found])
+               AC_DEFUN([NUMA_W2], [Please consider installing libnuma-devel package.])
+               AS_IF([test "x$enable_numa" = xyes],
+                     [AC_MSG_ERROR([NUMA_W1. NUMA_W2])],
+                     [
+                      AC_MSG_WARN([NUMA_W1; this may impact library performance.])
+                      AC_MSG_WARN([NUMA_W2])
+                     ])
+               numa_enable=disabled
+              ])
+    ])
 
 #
diff --git a/config/m4/ucm.m4 b/config/m4/ucm.m4
index 9c7c820d9ff..1e229edc51f 100644
--- a/config/m4/ucm.m4
+++ b/config/m4/ucm.m4
@@ -5,19 +5,6 @@
 #
 
-#
-# Enable overriding library symbols
-#
-AC_ARG_ENABLE([symbol-override],
-    AS_HELP_STRING([--disable-symbol-override], [Disable overriding library symbols, default: NO]),
-    [],
-    [enable_symbol_override=yes])
-
-AS_IF([test "x$enable_symbol_override" = xyes],
-    [AC_DEFINE([ENABLE_SYMBOL_OVERRIDE], [1], [Enable symbol override])]
-    [:]
-)
-
 #
 # Memory allocator selection
 #
diff --git a/configure.ac b/configure.ac
index 3c4483cae2f..6621985a8db 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,5 +1,5 @@
 #
-# Copyright (C) Mellanox Technologies Ltd. 2001-2011. ALL RIGHTS RESERVED.
+# Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED.
 # Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED.
 # Copyright (C) The University of Tennessee and The University
 # of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED.
@@ -9,7 +9,7 @@ AC_PREREQ([2.63])
 
 define([ucx_ver_major], 1)
-define([ucx_ver_minor], 10)
+define([ucx_ver_minor], 11)
 define([ucx_ver_patch], 0)
 define([ts], esyscmd([sh -c "date +%Y%m%d%H%M%S"]))
 
@@ -109,6 +109,10 @@ AS_IF([${LN_S} --relative symlinktest 2>/dev/null],
 #
 AC_DEFINE_UNQUOTED([UCX_CONFIGURE_FLAGS], ["$config_flags"], [UCX configure flags])
 
+#
+# Define path of ucx.conf configuration file
+#
+AC_SUBST([ucx_conf_dir], [${sysconfdir}/ucx])
 
 #
 # Provide the functionality of AS_VAR_APPEND if Autoconf does not have it.
@@ -182,9 +186,9 @@ AS_IF([test "x$with_docs_only" = xyes],
     AM_CONDITIONAL([HAVE_DC_DV], [false])
     AM_CONDITIONAL([HAVE_DC_EXP], [false])
     AM_CONDITIONAL([HAVE_TL_UD], [false])
-    AM_CONDITIONAL([HAVE_TL_CM], [false])
     AM_CONDITIONAL([HAVE_CRAY_UGNI], [false])
     AM_CONDITIONAL([HAVE_CUDA], [false])
+    AM_CONDITIONAL([HAVE_CUDA_STATIC], [false])
     AM_CONDITIONAL([HAVE_GDR_COPY], [false])
     AM_CONDITIONAL([HAVE_ROCM], [false])
     AM_CONDITIONAL([HAVE_HIP], [false])
@@ -192,7 +196,6 @@ AS_IF([test "x$with_docs_only" = xyes],
     AM_CONDITIONAL([HAVE_CMA], [false])
     AM_CONDITIONAL([HAVE_KNEM], [false])
     AM_CONDITIONAL([HAVE_RDMACM], [false])
-    AM_CONDITIONAL([HAVE_RDMACM_QP_LESS], [false])
    AM_CONDITIONAL([HAVE_MPI], [false])
    AM_CONDITIONAL([HAVE_MPIRUN], [false])
    AM_CONDITIONAL([HAVE_MPICC], [false])
@@ -209,20 +212,22 @@ AS_IF([test "x$with_docs_only" = xyes],
    AM_CONDITIONAL([HAVE_AARCH64_THUNDERX2], [false])
    AM_CONDITIONAL([HAVE_AARCH64_THUNDERX1], [false])
    AM_CONDITIONAL([HAVE_AARCH64_HI1620], [false])
+    AM_CONDITIONAL([HAVE_FUSE3], [false])
 ],
 [
    AM_CONDITIONAL([DOCS_ONLY], [false])
    m4_include([config/m4/compiler.m4])
    m4_include([config/m4/sysdep.m4])
-    m4_include([config/m4/ucs.m4])
    m4_include([config/m4/ucm.m4])
    m4_include([config/m4/mpi.m4])
    m4_include([config/m4/rte.m4])
+    m4_include([config/m4/fuse3.m4])
    m4_include([config/m4/java.m4])
    m4_include([config/m4/cuda.m4])
    m4_include([config/m4/rocm.m4])
    m4_include([config/m4/gdrcopy.m4])
    m4_include([src/ucm/configure.m4])
+    m4_include([src/ucs/configure.m4])
    m4_include([src/uct/configure.m4])
    m4_include([src/tools/perf/configure.m4])
    m4_include([test/gtest/configure.m4])
@@ -328,6 +333,7 @@ AS_IF([test "x$with_docs_only" = xyes],
 # Print which transports are built
 #
 build_modules="${uct_modules}"
+build_modules="${build_modules}${ucs_modules}"
 build_modules="${build_modules}${uct_ib_modules}"
build_modules="${build_modules}${uct_cuda_modules}" build_modules="${build_modules}${ucm_modules}" @@ -357,10 +363,10 @@ AC_CONFIG_FILES([ debian/rules debian/control debian/changelog - src/ucs/Makefile src/ucp/Makefile src/ucp/api/ucp_version.h src/ucp/core/ucp_version.c + src/tools/vfs/Makefile src/tools/info/Makefile src/tools/profile/Makefile test/apps/Makefile @@ -392,13 +398,17 @@ AC_MSG_NOTICE([Building documents only]) [ AC_MSG_NOTICE([UCX build configuration:]) AC_MSG_NOTICE([ Build prefix: ${prefix}]) +AC_MSG_NOTICE([ Configuration dir: ${ucx_conf_dir}]) AC_MSG_NOTICE([Preprocessor flags: ${BASE_CPPFLAGS}]) AC_MSG_NOTICE([ C compiler: ${CC} ${BASE_CFLAGS}]) AC_MSG_NOTICE([ C++ compiler: ${CXX} ${BASE_CXXFLAGS}]) AC_MSG_NOTICE([ Multi-thread: ${mt_enable}]) +AC_MSG_NOTICE([ NUMA support: ${numa_enable}]) AC_MSG_NOTICE([ MPI tests: ${mpi_enable}]) +AC_MSG_NOTICE([ VFS support: ${vfs_enable}]) AC_MSG_NOTICE([ Devel headers: ${enable_devel_headers}]) AC_MSG_NOTICE([ Bindings: <$(echo ${build_bindings}|tr ':' ' ') >]) +AC_MSG_NOTICE([ UCS modules: <$(echo ${ucs_modules}|tr ':' ' ') >]) AC_MSG_NOTICE([ UCT modules: <$(echo ${uct_modules}|tr ':' ' ') >]) AC_MSG_NOTICE([ CUDA modules: <$(echo ${uct_cuda_modules}|tr ':' ' ') >]) AC_MSG_NOTICE([ ROCM modules: <$(echo ${uct_rocm_modules}|tr ':' ' ') >]) diff --git a/contrib/buildrpm.sh b/contrib/buildrpm.sh index 46d4187b8fb..6f45f63af7f 100755 --- a/contrib/buildrpm.sh +++ b/contrib/buildrpm.sh @@ -100,12 +100,12 @@ if [ $opt_binrpm -eq 1 ]; then with_args+=" $(with_arg cuda)" with_args+=" $(with_arg gdrcopy)" with_args+=" $(with_arg ib)" - with_args+=" $(with_arg cm ib_cm)" with_args+=" $(with_arg knem)" with_args+=" $(with_arg rdmacm)" with_args+=" $(with_arg rocm)" with_args+=" $(with_arg ugni)" with_args+=" $(with_arg xpmem)" + with_args+=" $(with_arg vfs)" with_args+=" $(with_arg java)" echo rpmbuild -bb $rpmmacros $rpmopts $rpmspec $defines $with_args | bash -eEx diff --git a/contrib/check_inst_headers.sh b/contrib/check_inst_headers.sh index 865fd37a075..fcf4f90a795 100755 --- a/contrib/check_inst_headers.sh +++ b/contrib/check_inst_headers.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/sh -eE # # Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. # @@ -12,6 +12,7 @@ # CC=${CC:-gcc} +CXX=${CXX:-g++} cd ${1:-.} @@ -27,9 +28,12 @@ do fi # try to compile a test program (from stdin) which includes hfile - ${CC} -I. -x c -c - -o /dev/null -DHAVE_CONFIG_H=1 <&/dev/null + then + return + fi + + # get the interface of the ip address that is the default gateway (pure Ethernet IPv4 address). + eth_iface=$(ip route show| sed -n 's/default via \(\S*\) dev \(\S*\).*/\2/p') + + # the pure Ethernet interface should not appear in the ibdev2netdev output. it should not be an IPoIB or + # RoCE interface. + if ibdev2netdev|grep -qw "${eth_iface}" + then + echo "Failed to retrieve an IP of a non IPoIB/RoCE interface" + exit 1 + fi + + get_ifaddr ${eth_iface} +} + # # Prepare build environment # @@ -285,387 +343,6 @@ prepare() { cd build-test } -# -# Build documentation -# -build_docs() { - doxy_ready=0 - doxy_target_version="1.8.11" - doxy_version="$(doxygen --version)" || true - - # Try load newer doxygen if native is older than 1.8.11 - if ! 
(echo $doxy_target_version; echo $doxy_version) | sort -CV - then - if module_load tools/doxygen-1.8.11 - then - doxy_ready=1 - else - echo " doxygen was not found" - fi - else - doxy_ready=1 - fi - - if [ $doxy_ready -eq 1 ] - then - echo " ==== Build docs only ====" - ../configure --prefix=$ucx_inst --with-docs-only - make_clean - $MAKE docs - make_clean # FIXME distclean does not work with docs-only - fi -} - -# -# Building java docs -# -build_java_docs() { - echo " ==== Building java docs ====" - if module_load dev/jdk && module_load dev/mvn - then - ../configure --prefix=$ucx_inst --with-java - $MAKE -C ../build-test/bindings/java/src/main/native docs - module unload dev/jdk - module unload dev/mvn - else - echo "No jdk and mvn module, failed to build docs". - fi -} - -# -# Build without verbs -# -build_no_verbs() { - echo "==== Build without IB verbs ====" - ../contrib/configure-release --prefix=$ucx_inst --without-verbs - make_clean - $MAKEP - make_clean distclean -} - -# -# Build without numa support check -# -build_disable_numa() { - echo "==== Check --disable-numa compilation option ====" - ../contrib/configure-release --prefix=$ucx_inst --disable-numa - make_clean - $MAKEP - make_clean distclean -} - -# -# Build a package in release mode -# -build_release_pkg() { - echo "==== Build release ====" - ../contrib/configure-release - make_clean - $MAKEP - $MAKEP distcheck - - # Show UCX info - ./src/tools/info/ucx_info -s -f -c -v -y -d -b -p -w -e -uart -m 20M - - if [ -f /etc/redhat-release -o -f /etc/fedora-release ]; then - rpm_based=yes - elif [ `cat /etc/os-release | grep -i "ubuntu\|mint"|wc -l` -gt 0 ]; then - rpm_based=no - else - # try rpm tool to detect distro - set +e - out=$(rpm -q rpm 2>/dev/null) - rc=$? - set -e - rpm_based=yes - if [[ $rc != 0 || "$out" == *"not installed"* ]]; then - rpm_based=no - fi - fi - - if [[ "$rpm_based" == "no" && -x /usr/bin/dpkg-buildpackage ]]; then - echo "==== Build debian package ====" - dpkg-buildpackage -us -uc - else - echo "==== Build RPM ====" - ../contrib/buildrpm.sh -s -b --nodeps --define "_topdir $PWD" - fi - - # check that UCX version is present in spec file - cd ${WORKSPACE} - # extract version from configure.ac and convert to MAJOR.MINOR.PATCH representation - version=$(grep -P "define\S+ucx_ver" configure.ac | awk '{print $2}' | sed 's,),,' | xargs echo | tr ' ' '.') - if ! 
grep -q "$version" ucx.spec.in; then - echo "Current UCX version ($version) is not present in ucx.spec.in changelog" - exit 1 - fi - cd - - - make_clean distclean -} - -# -# Build with Intel compiler -# -build_icc() { - echo 1..1 > build_icc.tap - if module_load intel/ics && icc -v - then - echo "==== Build with Intel compiler ====" - ../contrib/configure-devel --prefix=$ucx_inst CC=icc CXX=icpc - make_clean - $MAKEP - make_clean distclean - echo "==== Build with Intel compiler (clang) ====" - ../contrib/configure-devel --prefix=$ucx_inst CC=clang CXX=clang++ - make_clean - $MAKEP - make_clean distclean - echo "ok 1 - build successful " >> build_icc.tap - else - echo "==== Not building with Intel compiler ====" - echo "ok 1 - # SKIP because Intel compiler not installed" >> build_icc.tap - fi - module_unload intel/ics -} - -# -# Build with PGI compiler -# -build_pgi() { - echo 1..1 > build_pgi.tap - pgi_test_file=$(mktemp ./XXXXXX).c - echo "int main() {return 0;}" > ${pgi_test_file} - - if module_load pgi/latest && pgcc18 --version && pgcc18 ${pgi_test_file} -o ${pgi_test_file}.out - then - echo "==== Build with PGI compiler ====" - # PGI failed to build valgrind headers, disable it for now - # TODO: Using non-default PGI compiler - pgcc18 which is going to be default - # in next versions. - # Switch to default CC compiler after pgcc18 is default for pgi module - ../contrib/configure-devel --prefix=$ucx_inst CC=pgcc18 --without-valgrind - make_clean - $MAKEP - make_clean distclean - echo "ok 1 - build successful " >> build_pgi.tap - else - echo "==== Not building with PGI compiler ====" - echo "ok 1 - # SKIP because PGI compiler not installed" >> build_pgi.tap - fi - - rm -rf ${pgi_test_file} ${pgi_test_file}.out - module_unload pgi/latest -} - -# -# Build debug version -# -build_debug() { - echo "==== Build with --enable-debug option ====" - ../contrib/configure-devel --prefix=$ucx_inst --enable-debug --enable-examples - make_clean - $MAKEP - make_clean distclean -} - -# -# Build prof -# -build_prof() { - echo "==== Build configure-prof ====" - ../contrib/configure-prof --prefix=$ucx_inst - make_clean - $MAKEP - make_clean distclean -} - -# -# Build UGNI -# -build_ugni() { - echo 1..1 > build_ugni.tap - - echo "==== Build with cray-ugni ====" - # - # Point pkg-config to contrib/cray-ugni-mock, and replace - # PKG_CONFIG_TOP_BUILD_DIR with source dir, since the mock .pc files contain - # relative paths. - # - ../contrib/configure-devel --prefix=$ucx_inst --with-ugni \ - PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$PWD/../contrib/cray-ugni-mock \ - PKG_CONFIG_TOP_BUILD_DIR=$PWD/.. 
- make_clean - $MAKEP - - # make sure UGNI transport is enabled - grep '#define HAVE_TL_UGNI 1' config.h - - $MAKE distcheck - make_clean distclean - - module_unload dev/cray-ugni - echo "ok 1 - build successful " >> build_ugni.tap -} - -# -# Build CUDA -# -build_cuda() { - echo 1..1 > build_cuda.tap - if module_load $CUDA_MODULE - then - if module_load $GDRCOPY_MODULE - then - echo "==== Build with enable cuda, gdr_copy ====" - ../contrib/configure-devel --prefix=$ucx_inst --with-cuda --with-gdrcopy - make_clean - $MAKEP - make_clean distclean - - ../contrib/configure-release --prefix=$ucx_inst --with-cuda --with-gdrcopy - make_clean - $MAKEP - make_clean distclean - module unload $GDRCOPY_MODULE - fi - - echo "==== Build with enable cuda, w/o gdr_copy ====" - ../contrib/configure-devel --prefix=$ucx_inst --with-cuda --without-gdrcopy - make_clean - $MAKEP - - module unload $CUDA_MODULE - - echo "==== Running test_link_map with cuda build but no cuda module ====" - env UCX_HANDLE_ERRORS=bt ./test/apps/test_link_map - - make_clean distclean - echo "ok 1 - build successful " >> build_cuda.tap - else - echo "==== Not building with cuda flags ====" - echo "ok 1 - # SKIP because cuda not installed" >> build_cuda.tap - fi - unload_cuda_env -} - -# -# Build with clang compiler -# -build_clang() { - echo 1..1 > build_clang.tap - if which clang > /dev/null 2>&1 - then - echo "==== Build with clang compiler ====" - ../contrib/configure-devel --prefix=$ucx_inst CC=clang CXX=clang++ - make_clean - $MAKEP - $MAKEP install - UCX_HANDLE_ERRORS=bt,freeze UCX_LOG_LEVEL_TRIGGER=ERROR $ucx_inst/bin/ucx_info -d - make_clean distclean - echo "ok 1 - build successful " >> build_clang.tap - else - echo "==== Not building with clang compiler ====" - echo "ok 1 - # SKIP because clang not installed" >> build_clang.tap - fi -} - -# -# Build with gcc-latest module -# -build_gcc_latest() { - echo 1..1 > build_gcc_latest.tap - #If the glibc version on the host is older than 2.14, don't run - #check the glibc version with the ldd version since it comes with glibc - #see https://www.linuxquestions.org/questions/linux-software-2/how-to-check-glibc-version-263103/ - #see https://benohead.com/linux-check-glibc-version/ - #see https://stackoverflow.com/questions/9705660/check-glibc-version-for-a-particular-gcc-compiler - ldd_ver="$(ldd --version | awk '/ldd/{print $NF}')" - if (echo "2.14"; echo $ldd_ver) | sort -CV - then - if module_load dev/gcc-latest - then - echo "==== Build with GCC compiler ($(gcc --version|head -1)) ====" - ../contrib/configure-devel --prefix=$ucx_inst - make_clean - $MAKEP - $MAKEP install - UCX_HANDLE_ERRORS=bt,freeze UCX_LOG_LEVEL_TRIGGER=ERROR $ucx_inst/bin/ucx_info -d - make_clean distclean - echo "ok 1 - build successful " >> build_gcc_latest.tap - module unload dev/gcc-latest - else - echo "==== Not building with latest gcc compiler ====" - echo "ok 1 - # SKIP because dev/gcc-latest module is not available" >> build_gcc_latest.tap - fi - else - echo "==== Not building with gcc compiler ====" - echo "Required glibc version is too old ($ldd_ver)" - echo "ok 1 - # SKIP because glibc version is older than 2.14" >> build_gcc_latest.tap - fi -} - -# -# Builds jucx -# -build_jucx() { - echo 1..1 > build_jucx.tap - if module_load dev/jdk && module_load dev/mvn - then - echo "==== Building JUCX bindings (java api for ucx) ====" - ../contrib/configure-release --prefix=$ucx_inst --with-java - make_clean - $MAKEP - $MAKEP install - make_clean distclean - echo "ok 1 - build successful " >> build_jucx.tap 
- module unload dev/jdk - module unload dev/mvn - else - echo "==== No jdk and mvn modules ==== " - echo "ok 1 - # SKIP because dev/jdk and dev/mvn modules are not available" >> build_jucx.tap - fi -} - -# -# Build with armclang compiler -# -build_armclang() { - echo 1..1 > build_armclang.tap - armclang_test_file=$(mktemp ./XXXXXX).c - echo "int main() {return 0;}" > ${armclang_test_file} - if module_load arm-compiler/latest && armclang --version && armclang ${armclang_test_file} -o ${armclang_test_file}.out - then - echo "==== Build with armclang compiler ====" - ../contrib/configure-devel --prefix=$ucx_inst CC=armclang CXX=armclang++ - make_clean - $MAKEP - $MAKEP install - UCX_HANDLE_ERRORS=bt,freeze UCX_LOG_LEVEL_TRIGGER=ERROR $ucx_inst/bin/ucx_info -d - make_clean distclean - echo "ok 1 - build successful " >> build_armclang.tap - else - echo "==== Not building with armclang compiler ====" - echo "ok 1 - # SKIP because armclang not installed" >> build_armclang.tap - fi - - rm -rf ${armclang_test_file} ${armclang_test_file}.out - module_unload arm-compiler/latest -} - -check_inst_headers() { - echo 1..1 > inst_headers.tap - echo "==== Testing installed headers ====" - - ../contrib/configure-release --prefix=$PWD/install - make_clean - $MAKEP install - ../contrib/check_inst_headers.sh $PWD/install/include - make_clean distclean - - echo "ok 1 - build successful " >> inst_headers.tap -} - check_make_distcheck() { echo 1..1 > make_distcheck.tap @@ -679,26 +356,7 @@ check_make_distcheck() { ../contrib/configure-release --prefix=$PWD/install $MAKEP DISTCHECK_CONFIGURE_FLAGS="--enable-gtest" distcheck else - echo "Not testing make distcheck: GCC version is too old ($(gcc --version|head -1))" - fi -} - -check_config_h() { - echo 1..1 > check_config_h.tap - - srcdir=$PWD/../src - - # Check if all .c files include config.h - echo "==== Checking for config.h files in directory $srcdir ====" - - missing=`find $srcdir \( -name "*.c" -o -name "*.cc" \) -type f -exec grep -LP '\#\s*include\s+"config.h"' {} \;` - - if [ `echo $missing | wc -w` -eq 0 ] - then - echo "ok 1 - check successful " >> check_config_h.tap - else - echo "Error: missing include config.h in files: $missing" - exit 1 + log_warning "Not testing make distcheck: GCC version is too old ($(gcc --version|head -1))" fi } @@ -751,7 +409,7 @@ rename_files() { } run_client_server_app() { - test_name=$1 + test_exe=$1 test_args=$2 server_addr_arg=$3 kill_server=$4 @@ -763,7 +421,7 @@ run_client_server_app() { affinity_server=$(slice_affinity 0) affinity_client=$(slice_affinity 1) - taskset -c $affinity_server ${test_name} ${test_args} ${server_port_arg} & + taskset -c $affinity_server ${test_exe} ${test_args} ${server_port_arg} & server_pid=$! sleep 15 @@ -773,7 +431,7 @@ run_client_server_app() { set +Ee fi - taskset -c $affinity_client ${test_name} ${test_args} ${server_addr_arg} ${server_port_arg} & + taskset -c $affinity_client ${test_exe} ${test_args} ${server_addr_arg} ${server_port_arg} & client_pid=$! 
wait ${client_pid} @@ -802,21 +460,21 @@ run_hello() { fi # set smaller timeouts so the test will complete faster - if [[ ${test_args} == *"-e"* ]] + if [[ ${test_args} =~ "-e" ]] then export UCX_UD_TIMEOUT=15s export UCX_RC_TIMEOUT=1ms export UCX_RC_RETRY_COUNT=4 fi - if [[ ${test_args} == *"-e"* ]] + if [[ ${test_args} =~ "-e" ]] then error_emulation=1 else error_emulation=0 fi - run_client_server_app "./examples/${test_name}" "${test_args}" "-n $(hostname)" 0 $error_emulation + run_client_server_app "./examples/${test_name}" "${test_args}" "-n $(hostname)" 0 ${error_emulation} if [[ ${test_args} == *"-e"* ]] then @@ -842,7 +500,10 @@ run_ucp_hello() { mem_types_list+="cuda cuda-managed " fi - for test_mode in -w -f -b -e + export UCX_KEEPALIVE_INTERVAL=1s + export UCX_KEEPALIVE_NUM_EPS=10 + + for test_mode in -w -f -b -erecv -esend -ekeepalive do for mem_type in $mem_types_list do @@ -851,6 +512,9 @@ run_ucp_hello() { done done rm -f ./ucp_hello_world + + unset UCX_KEEPALIVE_INTERVAL + unset UCX_KEEPALIVE_NUM_EPS } # @@ -891,50 +555,76 @@ run_uct_hello() { run_client_server() { test_name=ucp_client_server + mem_types_list="host" + + if [ "X$have_cuda" == "Xyes" ] + then + mem_types_list+=" cuda cuda-managed " + fi + if [ ! -x ${test_name} ] then - gcc -o ${test_name} ${ucx_inst}/share/ucx/examples/${test_name}.c \ - -lucp -lucs -I${ucx_inst}/include -L${ucx_inst}/lib \ - -Wl,-rpath=${ucx_inst}/lib + $MAKEP -C examples ${test_name} fi - server_ip=$(get_rdma_device_ip_addr) + server_ip=$1 if [ "$server_ip" == "" ] then return fi - run_client_server_app "./${test_name}" "" "-a ${server_ip}" 1 0 + for mem_type in ${mem_types_list} + do + echo "==== Running UCP client-server with \"${mem_type}\" memory type ====" + run_client_server_app "./examples/${test_name}" "-m ${mem_type}" "-a ${server_ip}" 1 0 + done } run_ucp_client_server() { echo "==== Running UCP client-server ====" - run_client_server - - rm -f ./ucp_client_server + run_client_server $(get_rdma_device_ip_addr) + run_client_server $(get_non_rdma_ip_addr) + run_client_server "127.0.0.1" } run_io_demo() { - server_ip=$(get_rdma_device_ip_addr) - if [ "$server_ip" == "" ] + server_rdma_addr=$(get_rdma_device_ip_addr) + server_nonrdma_addr=$(get_non_rdma_ip_addr) + server_loopback_addr="127.0.0.1" + mem_types_list="host " + + if [ "X$have_cuda" == "Xyes" ] + then + mem_types_list+="cuda cuda-managed " + fi + + if [ -z "$server_rdma_addr" ] && [ -z "$server_nonrdma_addr" ] then return fi - echo "==== Running UCP IO demo ====" + for mem_type in $mem_types_list + do + echo "==== Running UCP IO demo with \"${mem_type}\" memory type ====" - test_args="$@ -o write,read -d 128:4194304 -i 10000 -w 10" - test_name=io_demo + test_args="$@ -o write,read -d 128:4194304 -P 2 -i 10000 -w 10 -m ${mem_type}" + test_name=io_demo - if [ ! -x ${test_name} ] - then - $MAKEP -C test/apps/iodemo ${test_name} - fi + if [ ! 
-x ${test_name} ]
+        then
+            $MAKEP -C test/apps/iodemo ${test_name}
+        fi
 
-    export UCX_SOCKADDR_CM_ENABLE=y
-    run_client_server_app "./test/apps/iodemo/${test_name}" "${test_args}" "${server_ip}" 1 0
+        for server_ip in $server_rdma_addr $server_nonrdma_addr $server_loopback_addr
+        do
+            run_client_server_app "./test/apps/iodemo/${test_name}" "${test_args}" "${server_ip}" 1 0
+        done
+    done
 
-    unset UCX_SOCKADDR_CM_ENABLE
     make_clean
 }
@@ -1096,16 +786,21 @@ run_ucx_perftest() {
 # Test malloc hooks with mpi
 #
 test_malloc_hooks_mpi() {
-    for tname in malloc_hooks malloc_hooks_unmapped external_events flag_no_install
+    for mode in reloc bistro
     do
-        echo "==== Running memory hook (${tname}) on MPI ===="
-        $MPIRUN -np 1 $AFFINITY ./test/mpi/test_memhooks -t $tname
-    done
+        for tname in malloc_hooks malloc_hooks_unmapped external_events flag_no_install
+        do
+            echo "==== Running memory hook (${tname} mode ${mode}) on MPI ===="
+            $MPIRUN -np 1 $AFFINITY \
+                ./test/mpi/test_memhooks -t $tname -m ${mode}
+        done
 
-    echo "==== Running memory hook (malloc_hooks) on MPI with LD_PRELOAD ===="
-    ucm_lib=$PWD/src/ucm/.libs/libucm.so
-    ls -l $ucm_lib
-    $MPIRUN -np 1 -x LD_PRELOAD=$ucm_lib $AFFINITY ./test/mpi/test_memhooks -t malloc_hooks
+        echo "==== Running memory hook (malloc_hooks mode ${mode}) on MPI with LD_PRELOAD ===="
+        ucm_lib=$PWD/src/ucm/.libs/libucm.so
+        ls -l $ucm_lib
+        $MPIRUN -np 1 -x LD_PRELOAD=$ucm_lib $AFFINITY \
+            ./test/mpi/test_memhooks -t malloc_hooks -m ${mode}
+    done
 }
 
 #
@@ -1222,6 +917,15 @@ test_ucp_dlopen() {
     fi
 }
 
+test_init_mt() {
+    echo "==== Running multi-thread init ===="
+    $MAKEP
+    for ((i=0;i<50;++i))
+    do
+        $AFFINITY timeout 1m ./test/apps/test_init_mt
+    done
+}
+
 test_memtrack() {
     ../contrib/configure-devel --prefix=$ucx_inst
     make_clean
@@ -1234,7 +938,7 @@ test_memtrack() {
 test_unused_env_var() {
     # We must create a UCP worker to get the warning about unused variables
     echo "==== Running ucx_info env vars test ===="
-    UCX_SOCKADDR_CM_ENABLE=y UCX_IB_PORTS=mlx5_0:1 ./src/tools/info/ucx_info -epw -u t | grep "unused" | grep -q -E "UCX_IB_PORTS"
+    UCX_IB_PORTS=mlx5_0:1 ./src/tools/info/ucx_info -epw -u t | grep "unused" | grep -q -E "UCX_IB_PORTS"
 }
 
 test_env_var_aliases() {
@@ -1280,113 +984,42 @@ test_malloc_hook() {
         then
             ./test/apps/test_tcmalloc
         fi
-}
 
-test_jucx() {
-    echo "==== Running jucx test ===="
-    echo "1..2" > jucx_tests.tap
-    iface=`ibdev2netdev | grep Up | awk '{print $5}' | head -1`
-    if [ -z "$iface" ]
-    then
-        echo "Failed to find active ib devices." >> jucx_tests.tap
-        return
-    elif module_load dev/jdk && module_load dev/mvn
+    if [ "X$have_cuda" == "Xyes" ]
     then
-        jucx_port=$((20000 + EXECUTOR_NUMBER))
-        export JUCX_TEST_PORT=$jucx_port
-        export UCX_MEM_EVENTS=no
-        $MAKE -C bindings/java/src/main/native test
-        ifaces=`ibdev2netdev | grep Up | awk '{print $5}'`
-        if [ -n "$ifaces" ]
-        then
-            $MAKE -C bindings/java/src/main/native package
-        fi
-        for iface in $ifaces
+        cuda_dynamic_exe=./test/apps/test_cuda_hook_dynamic
+        cuda_static_exe=./test/apps/test_cuda_hook_static
+
+        for mode in reloc bistro
         do
-            if [ -n "$iface" ]
-            then
-                server_ip=$(get_ifaddr ${iface})
-            fi
+            export UCX_MEM_CUDA_HOOK_MODE=${mode}
+
+            # Run cuda memory hooks with dynamic link
+            ${cuda_dynamic_exe}
 
-            if [ -z "$server_ip" ]
+            # Run cuda memory hooks with static link, if it exists. 
If the static + # library 'libcudart_static.a' is not present, static test will not + # be built. + if [ -x ${cuda_static_exe} ] then - echo "Interface $iface has no IPv4" - continue + ${cuda_static_exe} && status="pass" || status="fail" + [ ${mode} == "bistro" ] && exp_status="pass" || exp_status="fail" + if [ ${status} == ${exp_status} ] + then + echo "Static link with cuda ${status}, as expected" + else + echo "Static link with cuda is expected to ${exp_status}, actual: ${status}" + exit 1 + fi fi - echo "Running standalone benchamrk on $iface" - - java -XX:ErrorFile=$WORKSPACE/hs_err_${BUILD_NUMBER}_%p.log \ - -XX:OnError="cat $WORKSPACE/hs_err_${BUILD_NUMBER}_%p.log" \ - -cp "bindings/java/resources/:bindings/java/src/main/native/build-java/*" \ - org.openucx.jucx.examples.UcxReadBWBenchmarkReceiver \ - s=$server_ip p=$JUCX_TEST_PORT & - java_pid=$! - - sleep 10 + # Test that driver API hooks work in both reloc and bistro modes, + # since we call them directly from the test + ${cuda_dynamic_exe} -d + [ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d - java -XX:ErrorFile=$WORKSPACE/hs_err_${BUILD_NUMBER}_%p.log \ - -XX:OnError="cat $WORKSPACE/hs_err_${BUILD_NUMBER}_%p.log" \ - -cp "bindings/java/resources/:bindings/java/src/main/native/build-java/*" \ - org.openucx.jucx.examples.UcxReadBWBenchmarkSender \ - s=$server_ip p=$JUCX_TEST_PORT t=10000000 - wait $java_pid + unset UCX_MEM_CUDA_HOOK_MODE done - - unset JUCX_TEST_PORT - unset UCX_MEM_EVENTS - module unload dev/jdk - module unload dev/mvn - echo "ok 1 - jucx test" >> jucx_tests.tap - else - echo "Failed to load dev/jdk and dev/mvn modules." >> jucx_tests.tap - fi -} - -# -# Run Coverity and report errors -# The argument is a UCX build type: devel or release -# -run_coverity() { - echo 1..1 > coverity.tap - if module_load tools/cov - then - ucx_build_type=$1 - - echo "==== Running coverity ====" - ../contrib/configure-$ucx_build_type --prefix=$ucx_inst - make_clean - cov_build_id="cov_build_${ucx_build_type}_${BUILD_NUMBER}" - cov_build="$WORKSPACE/$cov_build_id" - rm -rf $cov_build - cov-build --dir $cov_build $MAKEP all - cov-analyze --jobs $parallel_jobs $COV_OPT --security --concurrency --dir $cov_build - nerrors=$(cov-format-errors --dir $cov_build | awk '/Processing [0-9]+ errors?/ { print $2 }') - rc=$(($rc+$nerrors)) - - index_html=$(cd $cov_build && find . -name index.html | cut -c 3-) - if [ -z "$BUILD_URL" ]; then - cov_url="${WS_URL}/${cov_build_id}/${index_html}" - else - cov_url="${BUILD_URL}/artifact/${cov_build_id}/${index_html}" - fi - rm -f jenkins_sidelinks.txt - if [ $nerrors -gt 0 ]; then - cov-format-errors --dir $cov_build --emacs-style - echo "not ok 1 Coverity Detected $nerrors failures # $cov_url" >> coverity.tap - else - echo "ok 1 Coverity found no issues" >> coverity.tap - rm -rf $cov_build - fi - - echo Coverity report: $cov_url - printf "%s\t%s\n" Coverity $cov_url >> jenkins_sidelinks.txt - module unload tools/cov - - return $rc - else - echo "==== Not running Coverity ====" - echo "ok 1 - # SKIP because Coverity not installed" >> coverity.tap fi } @@ -1504,12 +1137,12 @@ run_gtest() { # Load newer valgrind if naative is older than 3.10 if ! 
(echo "valgrind-3.10.0"; valgrind --version) | sort -CV then - module load tools/valgrind-latest + module load tools/valgrind-3.12.0 fi $AFFINITY $TIMEOUT_VALGRIND make -C test/gtest test_valgrind (cd test/gtest && rename_files .tap _vg.tap *.tap && mv *.tap $GTEST_REPORT_DIR) - module unload tools/valgrind-latest + module unload tools/valgrind-3.12.0 else echo "==== Not running valgrind tests with $compiler_name compiler ====" echo "1..1" > vg_skipped.tap @@ -1586,8 +1219,15 @@ run_ucx_tl_check() { echo "1..1" > ucx_tl_check.tap + # Test transport selection ../test/apps/test_ucx_tls.py -p $ucx_inst + # Test setting many lanes + UCX_IB_NUM_PATHS=8 \ + UCX_MAX_EAGER_LANES=4 \ + UCX_MAX_RNDV_LANES=4 \ + ./src/tools/info/ucx_info -u t -e + if [ $? -ne 0 ]; then echo "not ok 1" >> ucx_tl_check.tap else @@ -1604,24 +1244,11 @@ run_tests() { export UCX_ERROR_MAIL_TO=$ghprbActualCommitAuthorEmail export UCX_ERROR_MAIL_FOOTER=$JOB_URL/$BUILD_NUMBER/console export UCX_TCP_PORT_RANGE="$((33000 + EXECUTOR_NUMBER * 100))"-"$((34000 + EXECUTOR_NUMBER * 100))" - export UCX_TCP_CM_ALLOW_ADDR_INUSE=y - - # test cuda build if cuda modules available - do_distributed_task 2 4 build_cuda + export UCX_TCP_CM_REUSEADDR=y # load cuda env only if GPU available for remaining tests try_load_cuda_env - do_distributed_task 0 4 build_icc - do_distributed_task 0 4 build_pgi - do_distributed_task 1 4 build_debug - do_distributed_task 1 4 build_prof - do_distributed_task 1 4 build_ugni - do_distributed_task 3 4 build_clang - do_distributed_task 0 4 build_armclang - do_distributed_task 1 4 build_gcc_latest - do_distributed_task 0 4 build_jucx - # all are running mpi tests run_mpi_tests @@ -1634,43 +1261,33 @@ run_tests() { $MAKEP $MAKEP install - run_ucx_tl_check - + do_distributed_task 2 4 run_ucx_tl_check do_distributed_task 1 4 run_ucp_hello do_distributed_task 2 4 run_uct_hello do_distributed_task 1 4 run_ucp_client_server do_distributed_task 2 4 run_ucx_perftest do_distributed_task 1 4 run_io_demo do_distributed_task 3 4 test_profiling - do_distributed_task 0 3 test_jucx do_distributed_task 1 4 test_ucs_dlopen do_distributed_task 3 4 test_ucs_load do_distributed_task 3 4 test_memtrack do_distributed_task 0 4 test_unused_env_var do_distributed_task 2 4 test_env_var_aliases - do_distributed_task 1 3 test_malloc_hook + do_distributed_task 1 4 test_malloc_hook do_distributed_task 0 4 test_ucp_dlopen + do_distributed_task 1 4 test_init_mt # all are running gtest run_gtest_default run_gtest_armclang - do_distributed_task 3 4 run_coverity release - do_distributed_task 0 4 run_coverity devel do_distributed_task 1 4 run_gtest_release } prepare try_load_cuda_env -do_distributed_task 0 4 build_docs -do_distributed_task 0 4 build_java_docs -do_distributed_task 0 4 build_disable_numa -do_distributed_task 1 4 build_no_verbs -do_distributed_task 2 4 build_release_pkg -do_distributed_task 3 4 check_inst_headers -do_distributed_task 1 4 check_make_distcheck -do_distributed_task 2 4 check_config_h if [ -n "$JENKINS_RUN_TESTS" ] || [ -n "$RUN_TESTS" ] then + check_machine run_tests fi diff --git a/contrib/ucx_perftest_config/test_types_ucp b/contrib/ucx_perftest_config/test_types_ucp index 2a9ecfa44d6..2c28996fe65 100644 --- a/contrib/ucx_perftest_config/test_types_ucp +++ b/contrib/ucx_perftest_config/test_types_ucp @@ -1,36 +1,48 @@ -# UCP -ucp_iov_contig_tag_lat -t tag_lat -D iov,contig -ucp_iov_iov_tag_lat -t tag_lat -D iov,iov -ucp_contig_contig_tag_lat -t tag_lat -D contig,contig +# +# UCP basic +# +ucp_iov_contig_tag_lat -t 
tag_lat -D iov,contig +ucp_iov_iov_tag_lat -t tag_lat -D iov,iov +ucp_contig_tag_lat -t tag_lat -D contig,contig #IOV with RNDV is not yet supported -#ucp_contig_iov_tag_lat -t tag_lat -D contig,iov -ucp_iov_contig_tag_bw -t tag_bw -D iov,contig -ucp_iov_iov_tag_bw -t tag_bw -D iov,iov -ucp_contig_contig_tag_bw -t tag_bw -D contig,contig +#ucp_contig_iov_tag_lat -t tag_lat -D contig,iov +ucp_iov_contig_tag_bw -t tag_bw -D iov,contig +ucp_iov_iov_tag_bw -t tag_bw -D iov,iov +ucp_contig_tag_bw -t tag_bw -D contig,contig #IOV with RNDV is not yet supported -#ucp_contig_iov_tag_bw -t tag_bw -D contig,iov -ucp_sync_tag_lat -t tag_sync_lat -ucp_unexp_tag_lat -t tag_lat -U -ucp_wild_tag_lat -t tag_lat -C -ucp_contig_stream_bw -t stream_bw -r recv_data -ucp_contig_stream_lat -t stream_lat -r recv_data -ucp_contig_stream_bw -t stream_bw -r recv -ucp_contig_stream_lat -t stream_lat -r recv -#CUDA -ucp_contig_contig_cuda_tag_lat -t tag_lat -D contig,contig -m cuda,cuda -ucp_contig_contig_cuda_tag_lat -t tag_lat -D contig,contig -m cuda,host -ucp_contig_contig_cuda_tag_lat -t tag_lat -D contig,contig -m host,cuda -ucp_contig_contig_cuda_tag_bw -t tag_bw -D contig,contig -m cuda,cuda -ucp_contig_contig_cuda_tag_bw -t tag_bw -D contig,contig -m cuda,host -ucp_contig_contig_cuda_tag_bw -t tag_bw -D contig,contig -m host,cuda -ucp_contig_cuda_stream_bw -t stream_bw -r recv_data -m cuda -ucp_contig_cuda_stream_lat -t stream_lat -r recv_data -m cuda -ucp_contig_cuda_stream_bw -t stream_bw -r recv -m cuda -ucp_contig_cuda_stream_lat -t stream_lat -r recv -m cuda -ucp_contig_contig_cuda_mng_tag_lat -t tag_lat -D contig,contig -m cuda-managed -ucp_contig_contig_cuda_mng_tag_bw -t tag_bw -D contig,contig -m cuda-managed -ucp_contig_cuda_mng_stream_bw -t stream_bw -r recv_data -m cuda-managed -ucp_contig_cuda_mng_stream_lat -t stream_lat -r recv_data -m cuda-managed +#ucp_contig_iov_tag_bw -t tag_bw -D contig,iov +ucp_sync_tag_lat -t tag_sync_lat +ucp_unexp_tag_lat -t tag_lat -U +ucp_wild_tag_lat -t tag_lat -C +ucp_contig_stream_data_bw -t stream_bw -r recv_data +ucp_contig_stream_data_lat -t stream_lat -r recv_data +ucp_contig_stream_bw -t stream_bw -r recv +ucp_contig_stream_lat -t stream_lat -r recv +# +# CUDA +# +ucp_contig_cuda_tag_lat -t tag_lat -D contig,contig -m cuda,cuda +ucp_contig_cuda_host_tag_lat -t tag_lat -D contig,contig -m cuda,host +ucp_contig_host_cuda_tag_lat -t tag_lat -D contig,contig -m host,cuda +ucp_contig_cuda_tag_bw -t tag_bw -D contig,contig -m cuda,cuda +ucp_contig_cuda_host_tag_bw -t tag_bw -D contig,contig -m cuda,host +ucp_contig_host_cuda_tag_bw -t tag_bw -D contig,contig -m host,cuda +ucp_contig_cuda_stream_bw -t stream_bw -r recv -m cuda +ucp_contig_cuda_stream_lat -t stream_lat -r recv -m cuda +ucp_contig_cuda_stream_data_bw -t stream_bw -r recv_data -m cuda +ucp_contig_cuda_stream_data_lat -t stream_lat -r recv_data -m cuda +ucp_contig_cuda_mng_tag_lat -t tag_lat -D contig,contig -m cuda-managed +ucp_contig_cuda_mng_tag_bw -t tag_bw -D contig,contig -m cuda-managed +ucp_contig_cuda_mng_stream_data_bw -t stream_bw -r recv_data -m cuda-managed +ucp_contig_cuda_mng_stream_data_lat -t stream_lat -r recv_data -m cuda-managed ucp_contig_cuda_mng_stream_bw -t stream_bw -r recv -m cuda-managed ucp_contig_cuda_mng_stream_lat -t stream_lat -r recv -m cuda-managed - +# +# CUDA wakeup mode +# +ucp_contig_cuda_tag_lat_sleep -I -E sleep -t tag_lat -D contig,contig -m cuda,cuda +ucp_contig_cuda_host_tag_lat_sleep -I -E sleep -t tag_lat -D contig,contig -m cuda,host 
+ucp_contig_host_cuda_tag_lat_sleep -I -E sleep -t tag_lat -D contig,contig -m host,cuda +ucp_contig_cuda_tag_bw_sleep -I -E sleep -t tag_bw -D contig,contig -m cuda,cuda +ucp_contig_cuda_host_tag_bw_sleep -I -E sleep -t tag_bw -D contig,contig -m cuda,host +ucp_contig_host_cuda_tag_bw_sleep -I -E sleep -t tag_bw -D contig,contig -m host,cuda diff --git a/contrib/ucx_perftest_config/test_types_uct b/contrib/ucx_perftest_config/test_types_uct index 2769ee481de..40edda3cb20 100644 --- a/contrib/ucx_perftest_config/test_types_uct +++ b/contrib/ucx_perftest_config/test_types_uct @@ -6,12 +6,14 @@ put_short_lat -t put_lat -D short put_bcopy_lat -t put_lat -D bcopy put_zcopy_lat -t put_lat -D zcopy # AM -am_short_lat -t am_lat -D short -am_bcopy_lat -t am_lat -D bcopy -am_zcopy_lat -t am_lat -D zcopy -am_short_bw -t am_bw -D short -am_bcopy_bw -t am_bw -D bcopy -am_zcopy_bw -t am_bw -D zcopy +am_short_lat -t am_lat -D short +am_short_iov_lat -t am_lat -D shortiov +am_bcopy_lat -t am_lat -D bcopy +am_zcopy_lat -t am_lat -D zcopy +am_short_bw -t am_bw -D short +am_short_iov_bw -t am_bw -D shortiov +am_bcopy_bw -t am_bw -D bcopy +am_zcopy_bw -t am_bw -D zcopy # GET get_bcopy -t get -D bcopy get_zcopy -t get -D zcopy diff --git a/contrib/valgrind.supp b/contrib/valgrind.supp index a6ad76fc0a3..37718a5d7a3 100644 --- a/contrib/valgrind.supp +++ b/contrib/valgrind.supp @@ -287,3 +287,20 @@ ... fun:cudaGetDeviceCount } +{ + rdmacm_event_channel + Memcheck:Leak + ... + fun:rdma_create_event_channel +} +{ + rdmacm_bind_addr + Memcheck:Leak + ... + fun:rdma_bind_addr +} +{ + xpmem_get + Memcheck:Cond + fun:xpmem_get +} diff --git a/debian/control.in b/debian/control.in index 767f02fe054..8575b7d0ef7 100644 --- a/debian/control.in +++ b/debian/control.in @@ -14,6 +14,7 @@ Homepage: http://www.openucx.org Package: @PACKAGE@ Section: libs +Depends: libc6, libgomp1, libnuma1 Architecture: any Description: Unified Communication X UCX is a communication library implementing high-performance messaging. diff --git a/debian/rules.in b/debian/rules.in index a2e812d695b..9f85dfecb7d 100755 --- a/debian/rules.in +++ b/debian/rules.in @@ -13,7 +13,7 @@ dh $@ override_dh_auto_configure: - @top_top_srcdir@/contrib/configure-release --prefix=/usr --enable-examples + @top_top_srcdir@/contrib/configure-release --prefix=/usr --enable-examples --with-java=no chmod +x debian/rules override_dh_shlibdeps: diff --git a/docs/doxygen/design.md b/docs/doxygen/design.md index dc88dccbe3f..94d008db512 100644 --- a/docs/doxygen/design.md +++ b/docs/doxygen/design.md @@ -22,8 +22,8 @@ the differences across various hardware architectures and provides a low-level API that enables the implementation of communication protocols. The primary goal of the layer is to provide direct and efficient access to hardware network functionality. For this purpose, -UCT relies on vendor provided low-level drivers such as InfiniBand -Verbs, Cray's uGNI, libfabrics, etc. In addition, the layer provides +UCT relies on vendor provided low-level drivers such as uGNI, Verbs, +shared memory, ROCM, CUDA. In addition, the layer provides constructs for communication context management (thread-based and application level), and allocation and management of device-specific memories including those found in accelerators. 
In terms of communication APIs, UCT defines interfaces for diff --git a/docs/doxygen/intro.md b/docs/doxygen/intro.md index 35404bbfd67..178849929d2 100644 --- a/docs/doxygen/intro.md +++ b/docs/doxygen/intro.md @@ -42,8 +42,8 @@ communications (one-sided and two-sided), collective communication, and remote atomic operations required for popular parallel programming models. Also, the initial UCX reference implementation is targeted to support current network technologies such as: -+ Open Fabrics - InfiniBand (Mellanox, Qlogic, IBM), libfabrics, iWARP, RoCE -+ Cray GEMINI \& ARIES ++ Open Fabrics - InfiniBand (Mellanox, Qlogic, IBM), iWARP, RoCE ++ Cray uGNI - GEMINI and ARIES interconnects + Shared memory (MMAP, Posix, CMA, KNEM, XPMEM, etc.) + Ethernet (TCP/UDP) diff --git a/docs/source/faq.md b/docs/source/faq.md index 9a0d2a70b17..a5fc4fef490 100644 --- a/docs/source/faq.md +++ b/docs/source/faq.md @@ -10,12 +10,12 @@ and relatively easy way to construct widely used HPC protocols: MPI tag matching RMA operations, rendezvous protocols, stream, fragmentation, remote atomic operations, etc. #### What is UCP, UCT, UCS? -* **UCT** is a transport layer that abstracts the differences across various hardware architectures and provides a low-level API that enables the implementation of communication protocols. The primary goal of the layer is to provide direct and efficient access to hardware network resources with minimal software overhead. For this purpose UCT relies on low-level drivers provided by vendors such as InfiniBand Verbs, Cray's uGNI, libfabrics, etc. In addition, the layer provides constructs for communication context management (thread-based and ap- plication level), and allocation and management of device- specific memories including those found in accelerators. In terms of communication APIs, UCT defines interfaces for immediate (short), buffered copy-and-send (bcopy), and zero- copy (zcopy) communication operations. The short operations are optimized for small messages that can be posted and completed in place. The bcopy operations are optimized for medium size messages that are typically sent through a so- called bouncing-buffer. Finally, the zcopy operations expose zero-copy memory-to-memory communication semantics. +* **UCT** is a transport layer that abstracts the differences across various hardware architectures and provides a low-level API that enables the implementation of communication protocols. The primary goal of the layer is to provide direct and efficient access to hardware network resources with minimal software overhead. For this purpose, UCT relies on low-level drivers such as uGNI, Verbs, shared memory, ROCM, CUDA. In addition, the layer provides constructs for communication context management (thread-based and application level), and allocation and management of device-specific memories including those found in accelerators. In terms of communication APIs, UCT defines interfaces for immediate (short), buffered copy-and-send (bcopy), and zero-copy (zcopy) communication operations. The short operations are optimized for small messages that can be posted and completed in place. The bcopy operations are optimized for medium size messages that are typically sent through a so-called bouncing-buffer. Finally, the zcopy operations expose zero-copy memory-to-memory communication semantics. 
* **UCP** implements higher-level protocols that are typically used by message passing (MPI) and PGAS programming models by using lower-level capabilities exposed through the UCT layer. UCP is responsible for the following functionality: initialization of the library, selection of transports for communication, message fragmentation, and multi-rail communication. Currently, the API has the following classes of interfaces: Initialization, Remote Memory Access (RMA) communication, Atomic Memory Operations (AMO), Active Message, Tag-Matching, and Collectives. 
-* **UCS** is a service layer that provides the necessary func- tionality for implementing portable and efficient utilities. 
+* **UCS** is a service layer that provides the necessary functionality for implementing portable and efficient utilities. 
 
 #### How can I contribute?
 1. Fork
@@ -35,20 +35,20 @@ submit issues on github: https://github.com/openucx/ucx/issues
 The UCX framework is maintained and supported by hardware vendors in addition to the open source community. Every pull-request is tested and multiple hardware platforms supported by vendors community.
 
 * **Performance, performance, performance!** 
-The framework design, data structures, and components are design to provide highly optimized access to the network hardware. 
+The framework architecture, data structures, and components are designed to provide optimized access to the network hardware. 
 
 * **High level API for a broad range HPC programming models.** 
-UCX provides a high level API implemented in software 'UCP' to fill in the gaps across interconnects. This allows to use a single set of APIs in a library to implement multiple interconnects. This reduces the level of complexities when implementing libraries such as Open MPI or OpenSHMEM. Because of this, UCX performance portable because a single implementation (in Open MPI or OpenSHMEM) will work efficiently on multiple interconnects. (e.g. uGNI, Verbs, libfabrics, etc). 
+UCX provides a high-level and performance-portable network API. The API targets a variety of programming models ranging from high-performance MPI implementation to Apache Spark. UCP API abstracts differences and fills in the gaps across interconnects implemented in the UCT layer. As a result, implementations of programming models and libraries (MPI, OpenSHMEM, Apache Spark, RAPIDS, etc.) are simplified while providing efficient support for multiple interconnects (uGNI, Verbs, TCP, shared memory, ROCM, CUDA, etc.). 
 
 * **Support for interaction between multiple transports (or providers) to deliver messages.** 
-For example, UCX has the logic (in UCP) to make 'GPUDirect', IB' and share memory work together efficiently to deliver the data where is needed without the user dealing with this. 
+For example, UCX has the logic (in UCP) to make GPUDirect, IB, and shared memory work together efficiently to deliver the data where it is needed without the user dealing with this. 
 
 * **Cross-transport multi-rail capabilities.** UCX protocol layer can utilize multiple transports, event on different types of hardware, to deliver messages faster, without the need for any special tuning.
 
 * **Utilizing hardware offloads for optimized performance**, such as RDMA, Hardware tag-matching 
-  hardware atomic operations, etc. 
+  hardware atomic operations, etc.
 
 #### What protocols are supported by UCX?
 UCP implements RMA put/get, send/receive with tag matching, Active messages, atomic operations. In near future we plan to add support for commonly used collective operations. 
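As a quick way to exercise the protocols listed above, the `ucx_perftest` entries added to `contrib/ucx_perftest_config` in this patch can also be run by hand. A minimal sketch, assuming `ucx_perftest` is in `$PATH` and a second terminal or host plays the server; the host name `myserver` is illustrative, while the `-t`/`-D` values come from the config entries in this patch:

```
# Terminal 1 (server): start ucx_perftest without a peer address
ucx_perftest -t tag_lat -D contig,contig

# Terminal 2 (client): run the matching test against the server
# (restart the server side before switching to another test type)
ucx_perftest myserver -t tag_lat -D contig,contig
```
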
@@ -61,10 +61,10 @@ Instead, GASNET can leverage UCX framework for fast end efficient implementation UCX framework does not provide drivers, instead it relies on the drivers provided by vendors. Currently we use: OFA VERBs, Cray's UGNI, NVIDIA CUDA. #### What is the relation between UCX and OFA Verbs or Libfabrics? -UCX, is a middleware communication layer that relies on vendors provided user level drivers including OFA Verbs or libfabrics (or any other drivers provided by another communities or vendors) to implement high-level protocols which can be used to close functionality gaps between various vendors drivers including various libfabrics providers: coordination across various drivers, multi-rail capabilities, software based RMA, AMOs, tag-matching for transports and drivers that do not support such capabilities natively. +UCX is a middleware communication framework that relies on device drivers, e.g. RDMA, CUDA, ROCM. RDMA and OS-bypass network devices typically implement device drivers using the RDMA-core Linux subsystem that is supported by UCX. Support for other network abstractions can be added based on requests and contributions from the community. -#### Is UCX a user level driver? -No. Typically, Drivers aim to expose fine-grain access to the network architecture specific features. +#### Is UCX a user-level driver? +UCX is not a user-level driver. Typically, drivers aim to expose fine-grained access to the network architecture-specific features. UCX abstracts the differences across various drivers and fill-in the gaps using software protocols for some of the architectures that don't provide hardware level support for all the operations.
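In practice, which drivers and transports UCX actually picked up on a node can be inspected with `ucx_info`, the same tool the CI scripts in this patch invoke. A small sketch; `UCX_TLS` is a standard UCX selection variable, and the value shown here is only an example:

```
# List the devices and transports detected through the installed drivers
ucx_info -d

# Pin the selection to TCP and shared memory for a single invocation
UCX_TLS=tcp,sysv,posix ucx_info -epw
```
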
@@ -99,15 +99,21 @@ UCX does not depend on an external runtime environment. UCX takes parameters from specific **environment variables**, which start with the prefix `UCX_`. -> **IMPORTANT NOTE:** Changing the values of UCX environment variables to non-default -may lead to undefined behavior. The environment variables are mostly indented for - dvanced users, or for specific tunings or workarounds recommended by UCX community. +> **IMPORTANT NOTE:** Setting UCX environment variables to non-default values +may lead to undefined behavior. The environment variables are mostly intended for +advanced users, or for specific tunings or workarounds recommended by the UCX community. -#### 2. Where can I see all UCX environment variables? +#### Where can I see all UCX environment variables? * Running `ucx_info -c` prints all environment variables and their default values. * Running `ucx_info -cf` prints the documentation for all environment variables. +#### UCX configuration file + +Upon installing the UCX package (RPM/DEB), a `/etc/ucx/ucx.conf` file is created. +It allows customization of the various parameters. An environment variable +has precedence over the value defined in `ucx.conf`. +
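The environment variables and `ucx.conf` feed the same configuration table, which applications can also read and override programmatically. A small sketch follows (illustrative, not part of this patch; `dump_ucx_config` is an invented name) using the public config API:

```c
/* Load the effective UCX configuration (environment + ucx.conf defaults),
 * override one knob in-process, and print the result like `ucx_info -c`. */
#include <ucp/api/ucp.h>
#include <stdio.h>

static void dump_ucx_config(void)
{
    ucp_config_t *config;

    if (ucp_config_read(NULL, NULL, &config) != UCS_OK) {
        return;
    }

    /* Same effect for this process as `export UCX_TLS=tcp` */
    ucp_config_modify(config, "TLS", "tcp");

    ucp_config_print(config, stdout, "UCX configuration",
                     UCS_CONFIG_PRINT_CONFIG);
    ucp_config_release(config);
}
```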
diff --git a/docs/source/index.rst b/docs/source/index.rst index 8712fdb5f29..58c8070e373 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,4 +1,4 @@ -.. +.. .. Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. .. .. See file LICENSE for terms. @@ -9,12 +9,12 @@ OpenUCX ******* Unified Communication X (UCX) is an `award winning `_, -optimized production proven communication framework for modern, high-bandwidth +optimized production-proven communication framework for modern, high-bandwidth and low-latency networks. -UCX exposes a set of abstract communication primitives which utilize the best of +UCX exposes a set of abstract communication primitives that utilize the best of available hardware resources and offloads. These include RDMA (InfiniBand and RoCE), -TCP, GPUs, shared Memory, and network atomic operations. +TCP, GPUs, shared memory, and network atomic operations. UCX facilitates rapid development by providing a high-level API, masking the low-level details, while maintaining high-performance and scalability. @@ -73,7 +73,7 @@ Documentation ************* * API doc: `HTML `_ `PDF `_ -* `Examples `_ +* `Examples `_ Projects using UCX diff --git a/docs/source/running.md b/docs/source/running.md index 76e773ed900..006b9facd05 100644 --- a/docs/source/running.md +++ b/docs/source/running.md @@ -75,7 +75,7 @@ improvements. ``` $ mkdir build-ucx $ cd build-ucx - $ ../configure --prefix= --with-ucx= + $ ../configure --prefix= --with-ucx= ``` > **NOTE**: With OpenMPI 4.0 and above, there could be compilation errors from "btl_uct" component. > This component is not critical for using UCX; so it could be disabled this way: diff --git a/docs/source/ucx_features.rst b/docs/source/ucx_features.rst index a7bfdbd52a4..681e06adb29 100644 --- a/docs/source/ucx_features.rst +++ b/docs/source/ucx_features.rst @@ -57,6 +57,6 @@ Protocols, Optimizations and Advanced Features - Pipeline protocols for GPU memory - QoS and traffic isolation for RDMA transports - Platform (micro-architecture) specific optimizations (such as memcpy, memory barriers, etc.) 
-- Multi-rail support +- Multi-rail and RoCE link aggregation group support - Bare-metal, containers and cloud environments support - Advanced protocols for transfer messages of different sizes diff --git a/examples/Makefile.am b/examples/Makefile.am index 05cde2765dd..e9ed2d49652 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -13,18 +13,20 @@ dist_examples_DATA = \ ucp_client_server.c if HAVE_CUDA -EXAMPLE_CUDA_LDFLAGS = $(CUDA_LDFLAGS) +EXAMPLE_CUDA_LD_FLAGS = $(CUDA_LDFLAGS) +EXAMPLE_CUDA_LIBS = $(CUDA_LIBS) # cuda.h couldn't be compiled with -pedantic flag EXAMPLE_CUDA_CFLAGS = EXAMPLE_CUDA_CPPFLAGS = $(CUDA_CPPFLAGS) -DHAVE_CUDA else -EXAMPLE_CUDA_LDFLAGS = +EXAMPLE_CUDA_LD_FLAGS = +EXAMPLE_CUDA_LIBS = EXAMPLE_CUDA_CFLAGS = $(CFLAGS_PEDANTIC) EXAMPLE_CUDA_CPPFLAGS = endif EXAMPLE_CCLD_FLAGS = -lucs -I$(includedir) -L$(libdir) -Wall -Werror -Wl,-rpath,$(libdir) \ - $(EXAMPLE_CUDA_LDFLAGS) $(EXAMPLE_CUDA_CPPFLAGS) + $(EXAMPLE_CUDA_LD_FLAGS) $(EXAMPLE_CUDA_LIBS) $(EXAMPLE_CUDA_CPPFLAGS) installcheck-local: @echo "INSTALLCHECK: Compiling examples with installed library" @@ -35,7 +37,7 @@ installcheck-local: if HAVE_EXAMPLES -bin_PROGRAMS = \ +noinst_PROGRAMS = \ ucp_hello_world \ uct_hello_world \ ucp_client_server @@ -43,21 +45,25 @@ bin_PROGRAMS = \ ucp_hello_world_SOURCES = ucp_hello_world.c ucp_hello_world_CFLAGS = $(BASE_CFLAGS) $(EXAMPLE_CUDA_CFLAGS) ucp_hello_world_CPPFLAGS = $(BASE_CPPFLAGS) $(EXAMPLE_CUDA_CPPFLAGS) +ucp_hello_world_LDFLAGS = $(EXAMPLE_CUDA_LD_FLAGS) ucp_hello_world_LDADD = $(top_builddir)/src/ucs/libucs.la \ $(top_builddir)/src/ucp/libucp.la \ - $(EXAMPLE_CUDA_LDFLAGS) + $(EXAMPLE_CUDA_LIBS) uct_hello_world_SOURCES = uct_hello_world.c uct_hello_world_CFLAGS = $(BASE_CFLAGS) $(EXAMPLE_CUDA_CFLAGS) uct_hello_world_CPPFLAGS = $(BASE_CPPFLAGS) $(EXAMPLE_CUDA_CPPFLAGS) +uct_hello_world_LDFLAGS = $(EXAMPLE_CUDA_LD_FLAGS) uct_hello_world_LDADD = $(top_builddir)/src/ucs/libucs.la \ $(top_builddir)/src/uct/libuct.la \ - $(EXAMPLE_CUDA_LDFLAGS) + $(EXAMPLE_CUDA_LIBS) ucp_client_server_SOURCES = ucp_client_server.c -ucp_client_server_CFLAGS = $(BASE_CFLAGS) $(CFLAGS_PEDANTIC) -ucp_client_server_CPPFLAGS = $(BASE_CPPFLAGS) +ucp_client_server_CFLAGS = $(BASE_CFLAGS) $(EXAMPLE_CUDA_CFLAGS) +ucp_client_server_CPPFLAGS = $(BASE_CPPFLAGS) $(EXAMPLE_CUDA_CPPFLAGS) +ucp_client_server_LDFLAGS = $(EXAMPLE_CUDA_LD_FLAGS) ucp_client_server_LDADD = $(top_builddir)/src/ucs/libucs.la \ - $(top_builddir)/src/ucp/libucp.la + $(top_builddir)/src/ucp/libucp.la \ + $(EXAMPLE_CUDA_LIBS) endif diff --git a/examples/hello_world_util.h b/examples/hello_world_util.h index c51134e1683..51cff49c830 100644 --- a/examples/hello_world_util.h +++ b/examples/hello_world_util.h @@ -181,18 +181,15 @@ ucs_memory_type_t parse_mem_type(const char *opt_arg) void print_common_help() { - fprintf(stderr, " -n name Set node name or IP address " - "of the server (required for client and should be ignored " - "for server)\n"); - fprintf(stderr, " -p port Set alternative server port (default:13337)\n"); - fprintf(stderr, " -s size Set test string length (default:16)\n"); - fprintf(stderr, " -m memory type of messages\n"); - fprintf(stderr, " host - system memory (default)\n"); + fprintf(stderr, " -p Set alternative server port (default:13337)\n"); + fprintf(stderr, " -s Set test string length (default:16)\n"); + fprintf(stderr, " -m Memory type of messages\n"); + fprintf(stderr, " host - system memory (default)\n"); if (check_mem_type_support(UCS_MEMORY_TYPE_CUDA)) { - fprintf(stderr, " cuda - NVIDIA 
GPU memory\n"); + fprintf(stderr, " cuda - NVIDIA GPU memory\n"); } if (check_mem_type_support(UCS_MEMORY_TYPE_CUDA_MANAGED)) { - fprintf(stderr, " cuda-managed - NVIDIA GPU managed/unified memory\n"); + fprintf(stderr, " cuda-managed - NVIDIA GPU managed/unified memory\n"); } } @@ -268,7 +265,7 @@ int client_connect(const char *server, uint16_t server_port) return -1; } -static int barrier(int oob_sock) +static inline int barrier(int oob_sock) { int dummy = 0; ssize_t res; @@ -284,7 +281,7 @@ static int barrier(int oob_sock) return !(res == sizeof(dummy)); } -static int generate_test_string(char *str, int size) +static inline int generate_test_string(char *str, int size) { char *tmp_str; int i; diff --git a/examples/ucp_client_server.c b/examples/ucp_client_server.c index 1ed6f46a1b2..670d5ac389c 100644 --- a/examples/ucp_client_server.c +++ b/examples/ucp_client_server.c @@ -27,6 +27,8 @@ * 13337. */ +#include "hello_world_util.h" + #include #include /* memset */ @@ -34,7 +36,6 @@ #include /* getopt */ #include /* atoi */ -#define TEST_STRING_LEN sizeof(test_message) #define DEFAULT_PORT 13337 #define IP_STRING_LEN 50 #define PORT_STRING_LEN 8 @@ -44,9 +45,10 @@ #define DEFAULT_NUM_ITERATIONS 1 #define TEST_AM_ID 0 -const char test_message[] = "UCX Client-Server Hello World"; -static uint16_t server_port = DEFAULT_PORT; -static int num_iterations = DEFAULT_NUM_ITERATIONS; + +static long test_string_length = 16; +static uint16_t server_port = DEFAULT_PORT; +static int num_iterations = DEFAULT_NUM_ITERATIONS; typedef enum { @@ -219,19 +221,20 @@ static ucs_status_t start_client(ucp_worker_h ucp_worker, const char *ip, * Print the received message on the server side or the sent data on the client * side. */ -static void print_result(int is_server, char *recv_message, int current_iter) +static void print_result(int is_server, char *msg_str, int current_iter) { if (is_server) { printf("Server: iteration #%d\n", (current_iter + 1)); printf("UCX data message was received\n"); printf("\n\n----- UCP TEST SUCCESS -------\n\n"); - printf("%s", recv_message); + printf("%s", msg_str); printf("\n\n------------------------------\n\n"); } else { printf("Client: iteration #%d\n", (current_iter + 1)); printf("\n\n-----------------------------------------\n\n"); printf("Client sent message: \n%s.\nlength: %ld\n", - test_message, TEST_STRING_LEN); + (test_string_length != 0) ? 
msg_str : "", + test_string_length); printf("\n-----------------------------------------\n\n"); } } @@ -264,11 +267,11 @@ static ucs_status_t request_wait(ucp_worker_h ucp_worker, void *request, } static int request_finalize(ucp_worker_h ucp_worker, test_req_t *request, - test_req_t *ctx, int is_server, - char *recv_message, int current_iter) + test_req_t *ctx, int is_server, void *msg, + int current_iter) { ucs_status_t status; - int ret = 0; + char *msg_str; status = request_wait(ucp_worker, request, ctx); if (status != UCS_OK) { @@ -280,10 +283,18 @@ static int request_finalize(ucp_worker_h ucp_worker, test_req_t *request, /* Print the output of the first, last and every PRINT_INTERVAL iteration */ if ((current_iter == 0) || (current_iter == (num_iterations - 1)) || !((current_iter + 1) % (PRINT_INTERVAL))) { - print_result(is_server, recv_message, current_iter); + msg_str = calloc(1, test_string_length + 1); + if (msg_str == NULL) { + fprintf(stderr, "memory allocation failed\n"); + return -1; + } + + mem_type_memcpy(msg_str, msg, test_string_length); + print_result(is_server, msg_str, current_iter); + free(msg_str); } - return ret; + return 0; } /** @@ -294,33 +305,41 @@ static int request_finalize(ucp_worker_h ucp_worker, test_req_t *request, static int send_recv_stream(ucp_worker_h ucp_worker, ucp_ep_h ep, int is_server, int current_iter) { - char recv_message[TEST_STRING_LEN]= ""; ucp_request_param_t param; test_req_t *request; - size_t length; + size_t msg_length; + void *msg; test_req_t ctx; + int ret; + + msg_length = test_string_length; + msg = mem_type_malloc(msg_length); + CHKERR_ACTION(msg == NULL, "allocate memory\n", return -1;); + mem_type_memset(msg, 0, msg_length); - ctx.complete = 0; + ctx.complete = 0; param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | UCP_OP_ATTR_FIELD_USER_DATA; param.user_data = &ctx; + if (!is_server) { + ret = generate_test_string(msg, msg_length); + CHKERR_ACTION(ret < 0, "generate test string", return -1;); + /* Client sends a message to the server using the stream API */ param.cb.send = send_cb; - request = ucp_stream_send_nbx(ep, test_message, TEST_STRING_LEN, - ¶m); + request = ucp_stream_send_nbx(ep, msg, msg_length, ¶m); } else { /* Server receives a message from the client using the stream API */ param.op_attr_mask |= UCP_OP_ATTR_FIELD_FLAGS; param.flags = UCP_STREAM_RECV_FLAG_WAITALL; param.cb.recv_stream = stream_recv_cb; - request = ucp_stream_recv_nbx(ep, &recv_message, - TEST_STRING_LEN, - &length, ¶m); + request = ucp_stream_recv_nbx(ep, msg, msg_length, + &msg_length, ¶m); } - return request_finalize(ucp_worker, request, &ctx, is_server, - recv_message, current_iter); + return request_finalize(ucp_worker, request, &ctx, is_server, msg, + current_iter); } /** @@ -331,28 +350,37 @@ static int send_recv_stream(ucp_worker_h ucp_worker, ucp_ep_h ep, int is_server, static int send_recv_tag(ucp_worker_h ucp_worker, ucp_ep_h ep, int is_server, int current_iter) { - char recv_message[TEST_STRING_LEN]= ""; ucp_request_param_t param; void *request; + size_t msg_length; + void *msg; test_req_t ctx; + int ret; - ctx.complete = 0; + msg_length = test_string_length; + msg = mem_type_malloc(msg_length); + CHKERR_ACTION(msg == NULL, "allocate memory\n", return -1;); + mem_type_memset(msg, 0, msg_length); + + ctx.complete = 0; param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | UCP_OP_ATTR_FIELD_USER_DATA; param.user_data = &ctx; if (!is_server) { + ret = generate_test_string(msg, msg_length); + CHKERR_ACTION(ret < 0, "generate test string", return 
-1;); + /* Client sends a message to the server using the Tag-Matching API */ param.cb.send = send_cb; - request = ucp_tag_send_nbx(ep, test_message, TEST_STRING_LEN, - TAG, ¶m); + request = ucp_tag_send_nbx(ep, msg, msg_length, TAG, ¶m); } else { /* Server receives a message from the client using the Tag-Matching API */ param.cb.recv = tag_recv_cb; - request = ucp_tag_recv_nbx(ucp_worker, &recv_message, - TEST_STRING_LEN, TAG, 0, ¶m); + request = ucp_tag_recv_nbx(ucp_worker, msg, msg_length, TAG, 0, + ¶m); } - return request_finalize(ucp_worker, request, &ctx, is_server, recv_message, + return request_finalize(ucp_worker, request, &ctx, is_server, msg, current_iter); } @@ -360,16 +388,18 @@ ucs_status_t ucp_am_data_cb(void *arg, const void *header, size_t header_length, void *data, size_t length, const ucp_am_recv_param_t *param) { - if (length != TEST_STRING_LEN) { + if (length != test_string_length) { fprintf(stderr, "received wrong data length %ld (expected %ld)", - length, TEST_STRING_LEN); - goto out; + length, test_string_length); + return UCS_OK; } if ((header != NULL) || (header_length != 0)) { fprintf(stderr, "received unexpected header, length %ld", header_length); } + am_data_desc.complete = 1; + if (param->recv_attr & UCP_AM_RECV_ATTR_FLAG_RNDV) { /* Rendezvous request arrived, data contains an internal UCX descriptor, * which has to be passed to ucp_am_recv_data_nbx function to confirm @@ -384,10 +414,8 @@ ucs_status_t ucp_am_data_cb(void *arg, const void *header, size_t header_length, * immediately */ am_data_desc.is_rndv = 0; - memcpy(am_data_desc.recv_buf, data, length); + mem_type_memcpy(am_data_desc.recv_buf, data, length); -out: - am_data_desc.complete = 1; return UCS_OK; } @@ -400,21 +428,31 @@ ucs_status_t ucp_am_data_cb(void *arg, const void *header, size_t header_length, static int send_recv_am(ucp_worker_h ucp_worker, ucp_ep_h ep, int is_server, int current_iter) { - char recv_message[TEST_STRING_LEN] = ""; test_req_t *request; ucp_request_param_t params; + size_t msg_length; + void *msg; test_req_t ctx; + int ret; + + msg_length = test_string_length; + msg = mem_type_malloc(msg_length); + CHKERR_ACTION(msg == NULL, "allocate memory\n", return -1;); + mem_type_memset(msg, 0, msg_length); - am_data_desc.recv_buf = recv_message; - ctx.complete = 0; - params.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | - UCP_OP_ATTR_FIELD_USER_DATA; - params.user_data = &ctx; + ctx.complete = 0; + params.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_USER_DATA; + params.user_data = &ctx; if (is_server) { + am_data_desc.recv_buf = msg; + + /* waiting for AM callback has called */ while (!am_data_desc.complete) { ucp_worker_progress(ucp_worker); } + am_data_desc.complete = 0; if (am_data_desc.is_rndv) { @@ -425,8 +463,7 @@ static int send_recv_am(ucp_worker_h ucp_worker, ucp_ep_h ep, int is_server, params.cb.recv_am = am_recv_cb, request = ucp_am_recv_data_nbx(ucp_worker, am_data_desc.desc, - &recv_message, - TEST_STRING_LEN, + msg, msg_length, ¶ms); } else { /* Data has arrived eagerly and is ready for use, no need to @@ -434,14 +471,16 @@ static int send_recv_am(ucp_worker_h ucp_worker, ucp_ep_h ep, int is_server, request = NULL; } } else { + ret = generate_test_string(msg, msg_length); + CHKERR_ACTION(ret < 0, "generate test string", return -1;); + /* Client sends a message to the server using the AM API */ params.cb.send = (ucp_send_nbx_callback_t)send_cb, - request = ucp_am_send_nbx(ep, TEST_AM_ID, NULL, 0ul, - test_message, TEST_STRING_LEN, - ¶ms); + request = 
ucp_am_send_nbx(ep, TEST_AM_ID, NULL, 0ul, msg, + msg_length, ¶ms); } - return request_finalize(ucp_worker, request, &ctx, is_server, recv_message, + return request_finalize(ucp_worker, request, &ctx, is_server, msg, current_iter); } @@ -480,24 +519,25 @@ static void usage() fprintf(stderr, "Usage: ucp_client_server [parameters]\n"); fprintf(stderr, "UCP client-server example utility\n"); fprintf(stderr, "\nParameters are:\n"); - fprintf(stderr, " -a Set IP address of the server " + fprintf(stderr, " -a Set IP address of the server " "(required for client and should not be specified " "for the server)\n"); - fprintf(stderr, " -l Set IP address where server listens " + fprintf(stderr, " -l Set IP address where server listens " "(If not specified, server uses INADDR_ANY; " "Irrelevant at client)\n"); - fprintf(stderr, " -p Port number to listen/connect to (default = %d). " + fprintf(stderr, " -p Port number to listen/connect to (default = %d). " "0 on the server side means select a random port and print it\n", DEFAULT_PORT); - fprintf(stderr, " -c Communication type for the client and server. " - " Valid values are:\n" - " 'stream' : Stream API\n" - " 'tag' : Tag API\n" - " 'am' : AM API\n" - " If not specified, %s API will be used.\n", COMM_TYPE_DEFAULT); - fprintf(stderr, " -i Number of iterations to run. Client and server must " + fprintf(stderr, " -c Communication type for the client and server. " + " Valid values are:\n" + " 'stream' : Stream API\n" + " 'tag' : Tag API\n" + " 'am' : AM API\n" + " If not specified, %s API will be used.\n", COMM_TYPE_DEFAULT); + fprintf(stderr, " -i Number of iterations to run. Client and server must " "have the same value. (default = %d).\n", num_iterations); + print_common_help(); fprintf(stderr, "\n"); } @@ -510,9 +550,7 @@ static int parse_cmd(int argc, char *const argv[], char **server_addr, int c = 0; int port; - opterr = 0; - - while ((c = getopt(argc, argv, "a:l:p:c:i:")) != -1) { + while ((c = getopt(argc, argv, "a:l:p:c:i:s:m:h")) != -1) { switch (c) { case 'a': *server_addr = optarg; @@ -523,10 +561,7 @@ static int parse_cmd(int argc, char *const argv[], char **server_addr, } else if (!strcasecmp(optarg, "tag")) { *send_recv_type = CLIENT_SERVER_SEND_RECV_TAG; } else if (!strcasecmp(optarg, "am")) { - /* TODO: uncomment below when AM API is fully supported. - * *send_recv_type = CLIENT_SERVER_SEND_RECV_AM; */ - fprintf(stderr, "AM API is not fully supported yet\n"); - return -1; + *send_recv_type = CLIENT_SERVER_SEND_RECV_AM; } else { fprintf(stderr, "Wrong communication type %s. 
" "Using %s as default\n", optarg, COMM_TYPE_DEFAULT); @@ -547,6 +582,20 @@ static int parse_cmd(int argc, char *const argv[], char **server_addr, case 'i': num_iterations = atoi(optarg); break; + case 's': + test_string_length = atol(optarg); + if (test_string_length < 0) { + fprintf(stderr, "Wrong string size %ld\n", test_string_length); + return UCS_ERR_UNSUPPORTED; + } + break; + case 'm': + test_mem_type = parse_mem_type(optarg); + if (test_mem_type == UCS_MEMORY_TYPE_LAST) { + return UCS_ERR_UNSUPPORTED; + } + break; + case 'h': default: usage(); return -1; diff --git a/examples/ucp_hello_world.c b/examples/ucp_hello_world.c index 897de0c90a3..39b308f125a 100644 --- a/examples/ucp_hello_world.c +++ b/examples/ucp_hello_world.c @@ -47,7 +47,6 @@ #include #include #include /* getopt */ -#include /* isprint */ #include /* pthread_self */ #include /* errno */ #include @@ -67,18 +66,25 @@ enum ucp_test_mode_t { TEST_MODE_EVENTFD } ucp_test_mode = TEST_MODE_PROBE; +typedef enum { + FAILURE_MODE_NONE, + FAILURE_MODE_SEND, /* fail send operation on server */ + FAILURE_MODE_RECV, /* fail receive operation on client */ + FAILURE_MODE_KEEPALIVE /* fail without communication on client */ +} failure_mode_t; + static struct err_handling { ucp_err_handling_mode_t ucp_err_mode; - int failure; + failure_mode_t failure_mode; } err_handling_opt; -static ucs_status_t client_status = UCS_OK; -static uint16_t server_port = 13337; -static long test_string_length = 16; -static const ucp_tag_t tag = 0x1337a880u; -static const ucp_tag_t tag_mask = UINT64_MAX; -static const char *addr_msg_str = "UCX address message"; -static const char *data_msg_str = "UCX data message"; +static ucs_status_t ep_status = UCS_OK; +static uint16_t server_port = 13337; +static long test_string_length = 16; +static const ucp_tag_t tag = 0x1337a880u; +static const ucp_tag_t tag_mask = UINT64_MAX; +static const char *addr_msg_str = "UCX address message"; +static const char *data_msg_str = "UCX data message"; static ucp_address_t *local_addr; static ucp_address_t *peer_addr; @@ -220,9 +226,14 @@ static int run_ucx_client(ucp_worker_h ucp_worker) /* Send client UCX address to server */ ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS | - UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; + UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE | + UCP_EP_PARAM_FIELD_ERR_HANDLER | + UCP_EP_PARAM_FIELD_USER_DATA; ep_params.address = peer_addr; ep_params.err_mode = err_handling_opt.ucp_err_mode; + ep_params.err_handler.cb = failure_handler; + ep_params.err_handler.arg = NULL; + ep_params.user_data = &ep_status; status = ucp_ep_create(ucp_worker, &ep_params, &server_ep); CHKERR_JUMP(status != UCS_OK, "ucp_ep_create\n", err); @@ -250,14 +261,14 @@ static int run_ucx_client(ucp_worker_h ucp_worker) free(msg); - if (err_handling_opt.failure) { - fprintf(stderr, "Emulating unexpected failure on client side\n"); + if (err_handling_opt.failure_mode == FAILURE_MODE_RECV) { + fprintf(stderr, "Emulating failure before receive operation on client side\n"); raise(SIGKILL); } /* Receive test string from server */ for (;;) { - + CHKERR_JUMP(ep_status != UCS_OK, "receive data: EP disconnected\n", err_ep); /* Probing incoming events in non-block mode */ msg_tag = ucp_tag_probe_nb(ucp_worker, tag, tag_mask, 1, &info_tag); if (msg_tag != NULL) { @@ -281,6 +292,13 @@ static int run_ucx_client(ucp_worker_h ucp_worker) CHKERR_JUMP(status != UCS_OK, "test_poll_wait\n", err_ep); } } + + if (err_handling_opt.failure_mode == FAILURE_MODE_KEEPALIVE) { + fprintf(stderr, "Emulating 
unexpected failure after receive completion " + "on client side, server should detect error by " + "keepalive mechanism\n"); + raise(SIGKILL); + } msg = mem_type_malloc(info_tag.length); CHKERR_JUMP(msg == NULL, "allocate memory\n", err_ep); @@ -295,25 +313,23 @@ static int run_ucx_client(ucp_worker_h ucp_worker) } str = calloc(1, test_string_length); - if (str != NULL) { - mem_type_memcpy(str, msg + 1, test_string_length); - printf("\n\n----- UCP TEST SUCCESS ----\n\n"); - printf("%s", str); - printf("\n\n---------------------------\n\n"); - free(str); - } else { + if (str == NULL) { fprintf(stderr, "Memory allocation failed\n"); - mem_type_free(msg); - goto err_ep; + ret = -1; + goto err_msg; } - mem_type_free(msg); - + mem_type_memcpy(str, msg + 1, test_string_length); + printf("\n\n----- UCP TEST SUCCESS ----\n\n"); + printf("%s", str); + printf("\n\n---------------------------\n\n"); + free(str); ret = 0; +err_msg: + mem_type_free(msg); err_ep: ucp_ep_destroy(server_ep); - err: return ret; } @@ -378,6 +394,14 @@ static int run_ucx_server(ucp_worker_h ucp_worker) goto err; } + if (err_handling_opt.failure_mode == FAILURE_MODE_SEND) { + fprintf(stderr, "Emulating unexpected failure on server side, client " + "should detect error by keepalive mechanism\n"); + free(msg); + raise(SIGKILL); + exit(1); + } + peer_addr_len = msg->data_len; peer_addr = malloc(peer_addr_len); if (peer_addr == NULL) { @@ -400,13 +424,13 @@ static int run_ucx_server(ucp_worker_h ucp_worker) ep_params.err_mode = err_handling_opt.ucp_err_mode; ep_params.err_handler.cb = failure_handler; ep_params.err_handler.arg = NULL; - ep_params.user_data = &client_status; + ep_params.user_data = &ep_status; status = ucp_ep_create(ucp_worker, &ep_params, &client_ep); /* If peer failure testing was requested, it could be possible that UCP EP * couldn't be created; in this case set `ret = 0` to report success */ - CHKERR_ACTION(status != UCS_OK, "ucp_ep_create\n", - ret = (err_handling_opt.failure) ? 0 : -1; goto err); + ret = (err_handling_opt.failure_mode != FAILURE_MODE_NONE) ? 
0 : -1; + CHKERR_ACTION(status != UCS_OK, "ucp_ep_create\n", goto err); msg_len = sizeof(*msg) + test_string_length; msg = mem_type_malloc(msg_len); @@ -417,12 +441,14 @@ static int run_ucx_server(ucp_worker_h ucp_worker) ret = generate_test_string((char *)(msg + 1), test_string_length); CHKERR_JUMP(ret < 0, "generate test string", err_free_mem_type_msg); - if (err_handling_opt.failure) { - /* Sleep for small amount of time to ensure that server was killed + if (err_handling_opt.failure_mode == FAILURE_MODE_RECV) { + /* Sleep for a small amount of time to ensure that the client was killed * and peer failure handling is covered */ sleep(5); } + ucp_worker_progress(ucp_worker); + send_param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | UCP_OP_ATTR_FIELD_USER_DATA | UCP_OP_ATTR_FIELD_MEMORY_TYPE; @@ -434,7 +460,7 @@ static int run_ucx_server(ucp_worker_h ucp_worker) status = ucx_wait(ucp_worker, request, "send", data_msg_str); if (status != UCS_OK) { - if (!err_handling_opt.failure) { + if (err_handling_opt.failure_mode == FAILURE_MODE_NONE) { ret = -1; } else { /* If peer failure testing was requested, set `ret = 0` to report @@ -442,13 +468,20 @@ static int run_ucx_server(ucp_worker_h ucp_worker) ret = 0; /* Make sure that failure_handler was called */ - while (client_status == UCS_OK) { + while (ep_status == UCS_OK) { ucp_worker_progress(ucp_worker); } } goto err_free_mem_type_msg; } + if (err_handling_opt.failure_mode == FAILURE_MODE_KEEPALIVE) { + fprintf(stderr, "Waiting for the client to terminate\n"); + while (ep_status == UCS_OK) { + ucp_worker_progress(ucp_worker); + } + } + status = flush_ep(ucp_worker, client_ep); printf("flush_ep completed with status %d (%s)\n", status, ucs_status_string(status)); @@ -564,7 +597,7 @@ int main(int argc, char **argv) ret = run_test(client_target_name, ucp_worker); - if (!ret && !err_handling_opt.failure) { + if (!ret && (err_handling_opt.failure_mode == FAILURE_MODE_NONE)) { /* Make sure remote is disconnected before destroying local worker */ ret = barrier(oob_sock); } @@ -586,15 +619,40 @@ int main(int argc, char **argv) return ret; } +static void print_usage() +{ + fprintf(stderr, "Usage: ucp_hello_world [parameters]\n"); + fprintf(stderr, "UCP hello world client/server example utility\n"); + fprintf(stderr, "\nParameters are:\n"); + fprintf(stderr, " -w Select test mode \"wait\" to test " + "ucp_worker_wait function\n"); + fprintf(stderr, " -f Select test mode \"event fd\" to test " + "ucp_worker_get_efd function with later poll\n"); + fprintf(stderr, " -b Select test mode \"busy polling\" to test " + "ucp_tag_probe_nb and ucp_worker_progress (default)\n"); + fprintf(stderr, " -n Set node name or IP address " + "of the server (required for client and should be ignored " + "for server)\n"); + fprintf(stderr, " -e Emulate unexpected failure and handle an " + "error with enabled UCP_ERR_HANDLING_MODE_PEER\n"); + fprintf(stderr, " send - send failure on server side " + "before send initiated\n"); + fprintf(stderr, " recv - receive failure on client side " + "before receive completed\n"); + fprintf(stderr, " keepalive - keepalive failure on client side " + "after communication completed\n"); + print_common_help(); + fprintf(stderr, "\n"); +} + ucs_status_t parse_cmd(int argc, char * const argv[], char **server_name) { int c = 0, idx = 0; - opterr = 0; - err_handling_opt.ucp_err_mode = UCP_ERR_HANDLING_MODE_NONE; - err_handling_opt.failure = 0; + err_handling_opt.ucp_err_mode = UCP_ERR_HANDLING_MODE_NONE; + err_handling_opt.failure_mode = FAILURE_MODE_NONE; - 
while ((c = getopt(argc, argv, "wfben:p:s:m:h")) != -1) { + while ((c = getopt(argc, argv, "wfbe:n:p:s:m:h")) != -1) { switch (c) { case 'w': ucp_test_mode = TEST_MODE_WAIT; @@ -606,8 +664,17 @@ ucs_status_t parse_cmd(int argc, char * const argv[], char **server_name) ucp_test_mode = TEST_MODE_PROBE; break; case 'e': - err_handling_opt.ucp_err_mode = UCP_ERR_HANDLING_MODE_PEER; - err_handling_opt.failure = 1; + err_handling_opt.ucp_err_mode = UCP_ERR_HANDLING_MODE_PEER; + if (!strcmp(optarg, "recv")) { + err_handling_opt.failure_mode = FAILURE_MODE_RECV; + } else if (!strcmp(optarg, "send")) { + err_handling_opt.failure_mode = FAILURE_MODE_SEND; + } else if (!strcmp(optarg, "keepalive")) { + err_handling_opt.failure_mode = FAILURE_MODE_KEEPALIVE; + } else { + print_usage(); + return UCS_ERR_UNSUPPORTED; + } break; case 'n': *server_name = optarg; @@ -621,7 +688,7 @@ ucs_status_t parse_cmd(int argc, char * const argv[], char **server_name) break; case 's': test_string_length = atol(optarg); - if (test_string_length <= 0) { + if (test_string_length < 0) { fprintf(stderr, "Wrong string size %ld\n", test_string_length); return UCS_ERR_UNSUPPORTED; } @@ -632,36 +699,14 @@ ucs_status_t parse_cmd(int argc, char * const argv[], char **server_name) return UCS_ERR_UNSUPPORTED; } break; - case '?': - if (optopt == 's') { - fprintf(stderr, "Option -%c requires an argument.\n", optopt); - } else if (isprint (optopt)) { - fprintf(stderr, "Unknown option `-%c'.\n", optopt); - } else { - fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt); - } - /* Fall through */ case 'h': default: - fprintf(stderr, "Usage: ucp_hello_world [parameters]\n"); - fprintf(stderr, "UCP hello world client/server example utility\n"); - fprintf(stderr, "\nParameters are:\n"); - fprintf(stderr, " -w Select test mode \"wait\" to test " - "ucp_worker_wait function\n"); - fprintf(stderr, " -f Select test mode \"event fd\" to test " - "ucp_worker_get_efd function with later poll\n"); - fprintf(stderr, " -b Select test mode \"busy polling\" to test " - "ucp_tag_probe_nb and ucp_worker_progress (default)\n"); - fprintf(stderr, " -e Emulate unexpected failure on server side" - "and handle an error on client side with enabled " - "UCP_ERR_HANDLING_MODE_PEER\n"); - print_common_help(); - fprintf(stderr, "\n"); + print_usage(); return UCS_ERR_UNSUPPORTED; } } - fprintf(stderr, "INFO: UCP_HELLO_WORLD mode = %d server = %s port = %d\n", - ucp_test_mode, *server_name, server_port); + fprintf(stderr, "INFO: UCP_HELLO_WORLD mode = %d server = %s port = %d, pid = %d\n", + ucp_test_mode, *server_name, server_port, getpid()); for (idx = optind; idx < argc; idx++) { fprintf(stderr, "WARNING: Non-option argument %s\n", argv[idx]); diff --git a/examples/uct_hello_world.c b/examples/uct_hello_world.c index ceaff6380b3..66a51d81f39 100644 --- a/examples/uct_hello_world.c +++ b/examples/uct_hello_world.c @@ -10,7 +10,8 @@ #include #include -#include +#include + typedef enum { FUNC_AM_SHORT, @@ -206,7 +207,8 @@ static void print_strings(const char *label, const char *local_str, const char *remote_str, size_t length) { fprintf(stdout, "\n\n----- UCT TEST SUCCESS ----\n\n"); - fprintf(stdout, "[%s] %s sent %s", label, local_str, remote_str); + fprintf(stdout, "[%s] %s sent %s (%" PRIu64 " bytes)", label, local_str, + (length != 0) ? 
remote_str : "", length); fprintf(stdout, "\n\n---------------------------\n"); fflush(stdout); } @@ -424,8 +426,11 @@ int print_err_usage() fprintf(stderr, func_template, 'i', func_am_t_str(FUNC_AM_SHORT), " (default)"); fprintf(stderr, func_template, 'b', func_am_t_str(FUNC_AM_BCOPY), ""); fprintf(stderr, func_template, 'z', func_am_t_str(FUNC_AM_ZCOPY), ""); - fprintf(stderr, " -d Select device name\n"); - fprintf(stderr, " -t Select transport layer\n"); + fprintf(stderr, " -d Select device name\n"); + fprintf(stderr, " -t Select transport layer\n"); + fprintf(stderr, " -n Set node name or IP address " + "of the server (required for client and should be ignored " + "for server)\n"); print_common_help(); fprintf(stderr, "\nExample:\n"); fprintf(stderr, " Server: uct_hello_world -d eth0 -t tcp\n"); @@ -446,7 +451,6 @@ int parse_cmd(int argc, char * const argv[], cmd_args_t *args) args->func_am_type = FUNC_AM_SHORT; args->test_strlen = 16; - opterr = 0; while ((c = getopt(argc, argv, "ibzd:t:n:p:s:m:h")) != -1) { switch (c) { case 'i': @@ -477,7 +481,7 @@ int parse_cmd(int argc, char * const argv[], cmd_args_t *args) break; case 's': args->test_strlen = atol(optarg); - if (args->test_strlen <= 0) { + if (args->test_strlen < 0) { fprintf(stderr, "Wrong string size %ld\n", args->test_strlen); return UCS_ERR_UNSUPPORTED; } @@ -488,14 +492,6 @@ int parse_cmd(int argc, char * const argv[], cmd_args_t *args) return UCS_ERR_UNSUPPORTED; } break; - case '?': - if (optopt == 's') { - fprintf(stderr, "Option -%c requires an argument.\n", optopt); - } else if (isprint (optopt)) { - fprintf(stderr, "Unknown option `-%c'.\n", optopt); - } else { - fprintf(stderr, "Unknown option character `\\x%x'.\n", optopt); - } case 'h': default: return print_err_usage(); diff --git a/src/tools/info/proto_info.c b/src/tools/info/proto_info.c index 4c74cd0dc8c..c58577e029b 100644 --- a/src/tools/info/proto_info.c +++ b/src/tools/info/proto_info.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -93,27 +94,206 @@ static void print_resource_usage(const resource_usage_t *usage_before, printf("#\n"); } -void print_ucp_info(int print_opts, ucs_config_print_flags_t print_flags, - uint64_t ctx_features, const ucp_ep_params_t *base_ep_params, - size_t estimated_num_eps, size_t estimated_num_ppn, - unsigned dev_type_bitmap, const char *mem_size) +static void listener_accept_callback(ucp_ep_h ep, void *arg) { - ucp_config_t *config; + *(ucp_ep_h*)arg = ep; +} + +static void +set_saddr(const char *addr_str, uint16_t port, struct sockaddr_in *saddr) +{ + memset(saddr, 0, sizeof(*saddr)); + saddr->sin_family = AF_INET; + saddr->sin_addr.s_addr = inet_addr(addr_str); + saddr->sin_port = htons(port); +} + +static ucs_status_t +wait_completion(ucp_worker_h worker, ucs_status_ptr_t status_ptr) +{ + ucs_status_t status; + + if (status_ptr == NULL) { + status = UCS_OK; + } else if (UCS_PTR_IS_PTR(status_ptr)) { + do { + ucp_worker_progress(worker); + status = ucp_request_test(status_ptr, NULL); + } while (status == UCS_INPROGRESS); + ucp_request_release(status_ptr); + } else { + status = UCS_PTR_STATUS(status_ptr); + } + + return status; +} + +static void +ep_close(ucp_worker_h worker, ucp_ep_h ep, ucp_ep_close_flags_t flags, + const char *ep_type) +{ + ucp_request_param_t request_param; + ucs_status_ptr_t status_ptr; + + request_param.op_attr_mask = UCP_OP_ATTR_FIELD_FLAGS; + request_param.flags = flags; + + status_ptr = ucp_ep_close_nbx(ep, &request_param); + wait_completion(worker, status_ptr); +} 
+ +static ucs_status_t +create_listener(ucp_worker_h worker, ucp_listener_h *listener_p, + uint16_t *listen_port_p, void *accept_cb_arg) +{ + ucp_listener_h listener; + struct sockaddr_in listen_saddr; + ucp_listener_params_t listen_params; + ucp_listener_attr_t listen_attr; + ucs_status_t status; + + set_saddr("0.0.0.0", 0, &listen_saddr); + + listen_params.field_mask = UCP_LISTENER_PARAM_FIELD_SOCK_ADDR | + UCP_LISTENER_PARAM_FIELD_ACCEPT_HANDLER; + listen_params.sockaddr.addr = (const struct sockaddr*)&listen_saddr; + listen_params.sockaddr.addrlen = sizeof(listen_saddr); + listen_params.accept_handler.cb = listener_accept_callback; + listen_params.accept_handler.arg = accept_cb_arg; + + status = ucp_listener_create(worker, &listen_params, &listener); + if (status != UCS_OK) { + printf("\n"); + goto out; + } + + listen_attr.field_mask = UCP_LISTENER_ATTR_FIELD_SOCKADDR; + + status = ucp_listener_query(listener, &listen_attr); + if (status != UCS_OK) { + printf("\n"); + goto out_destroy_listener; + } + + status = ucs_sockaddr_get_port((struct sockaddr*)&listen_attr.sockaddr, + listen_port_p); + if (status != UCS_OK) { + printf("\n"); + goto out_destroy_listener; + } + + *listener_p = listener; +out: + return status; + +out_destroy_listener: + ucp_listener_destroy(listener); + goto out; +} + +ucs_status_t +print_ucp_ep_info(ucp_worker_h worker, const ucp_ep_params_t *base_ep_params, + const char *ip_addr) +{ + ucp_listener_h listener = NULL; + ucp_ep_h server_ep = NULL; + ucp_address_t *worker_addr = NULL; + ucp_ep_params_t ep_params = *base_ep_params; ucs_status_t status; ucs_status_ptr_t status_ptr; + size_t worker_addr_length; + struct sockaddr_in connect_saddr; + uint16_t listen_port; + ucp_ep_h ep; + char ep_name[64]; + ucp_request_param_t request_param; + + if (ip_addr != NULL) { + status = create_listener(worker, &listener, &listen_port, &server_ep); + if (status != UCS_OK) { + return status; + } + + ucs_strncpy_zero(ep_name, "client", sizeof(ep_name)); + + set_saddr(ip_addr, listen_port, &connect_saddr); + + ep_params.field_mask |= UCP_EP_PARAM_FIELD_FLAGS | + UCP_EP_PARAM_FIELD_SOCK_ADDR; + ep_params.flags = UCP_EP_PARAMS_FLAGS_CLIENT_SERVER; + ep_params.sockaddr.addr = (struct sockaddr*)&connect_saddr; + ep_params.sockaddr.addrlen = sizeof(connect_saddr); + } else { + status = ucp_worker_get_address(worker, &worker_addr, + &worker_addr_length); + if (status != UCS_OK) { + printf("\n"); + return status; + } + + ucs_strncpy_zero(ep_name, "connected to UCP worker", sizeof(ep_name)); + + ep_params.field_mask |= UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; + ep_params.address = worker_addr; + } + + status = ucp_ep_create(worker, &ep_params, &ep); + if (status != UCS_OK) { + printf("\n"); + goto out; + } + + request_param.op_attr_mask = 0; + /* flush the EP to make sure that wireup is fully completed to the peer and + * the final configuration is applied */ + status_ptr = ucp_ep_flush_nbx(ep, &request_param); + status = wait_completion(worker, status_ptr); + if (status != UCS_OK) { + printf("\n"); + goto out_close_eps; + } + + ucp_ep_print_info(ep, stdout); + +out_close_eps: + ep_close(worker, ep, 0, ep_name); + + if (server_ep != NULL) { + ucs_assert(ip_addr != NULL); /* server EP is created only for sockaddr + * connection flow */ + ep_close(worker, server_ep, UCP_EP_CLOSE_FLAG_FORCE, "server"); + } + +out: + if (listener != NULL) { + ucp_listener_destroy(listener); + } + + if (worker_addr != NULL) { + ucp_worker_release_address(worker, worker_addr); + } + + return status; +} + +ucs_status_t 
+print_ucp_info(int print_opts, ucs_config_print_flags_t print_flags, + uint64_t ctx_features, const ucp_ep_params_t *base_ep_params, + size_t estimated_num_eps, size_t estimated_num_ppn, + unsigned dev_type_bitmap, const char *mem_size, + const char *ip_addr) +{ + ucp_config_t *config; + ucs_status_t status; ucp_context_h context; ucp_worker_h worker; ucp_params_t params; ucp_worker_params_t worker_params; - ucp_ep_params_t ep_params; - ucp_address_t *address; - size_t address_length; resource_usage_t usage; - ucp_ep_h ep; status = ucp_config_read(NULL, NULL, &config); if (status != UCS_OK) { - return; + goto out; } memset(¶ms, 0, sizeof(params)); @@ -172,40 +352,15 @@ void print_ucp_info(int print_opts, ucs_config_print_flags_t print_flags, } if (print_opts & PRINT_UCP_EP) { - status = ucp_worker_get_address(worker, &address, &address_length); - if (status != UCS_OK) { - printf("\n"); - goto out_destroy_worker; - } - - ep_params = *base_ep_params; - - ep_params.field_mask |= UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; - ep_params.address = address; - - status = ucp_ep_create(worker, &ep_params, &ep); - ucp_worker_release_address(worker, address); - if (status != UCS_OK) { - printf("\n"); - goto out_destroy_worker; - } - - ucp_ep_print_info(ep, stdout); - - status_ptr = ucp_disconnect_nb(ep); - if (UCS_PTR_IS_PTR(status_ptr)) { - do { - ucp_worker_progress(worker); - status = ucp_request_test(status_ptr, NULL); - } while (status == UCS_INPROGRESS); - ucp_request_release(status_ptr); - } + status = print_ucp_ep_info(worker, base_ep_params, ip_addr); } -out_destroy_worker: ucp_worker_destroy(worker); -out_cleanup_context: + + out_cleanup_context: ucp_cleanup(context); out_release_config: ucp_config_release(config); +out: + return status; } diff --git a/src/tools/info/sys_info.c b/src/tools/info/sys_info.c index 88d31767046..7c355a264c2 100644 --- a/src/tools/info/sys_info.c +++ b/src/tools/info/sys_info.c @@ -1,5 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. +* Copyright (C) Shanghai Zhaoxin Semiconductor Co., Ltd. 2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -11,6 +12,7 @@ #include "ucx_info.h" #include +#include #include #include #include @@ -19,17 +21,20 @@ static const char* cpu_model_names[] = { - [UCS_CPU_MODEL_UNKNOWN] = "unknown", - [UCS_CPU_MODEL_INTEL_IVYBRIDGE] = "IvyBridge", - [UCS_CPU_MODEL_INTEL_SANDYBRIDGE] = "SandyBridge", - [UCS_CPU_MODEL_INTEL_NEHALEM] = "Nehalem", - [UCS_CPU_MODEL_INTEL_WESTMERE] = "Westmere", - [UCS_CPU_MODEL_INTEL_HASWELL] = "Haswell", - [UCS_CPU_MODEL_INTEL_BROADWELL] = "Broadwell", - [UCS_CPU_MODEL_INTEL_SKYLAKE] = "Skylake", - [UCS_CPU_MODEL_ARM_AARCH64] = "ARM 64-bit", - [UCS_CPU_MODEL_AMD_NAPLES] = "Naples", - [UCS_CPU_MODEL_AMD_ROME] = "Rome" + [UCS_CPU_MODEL_UNKNOWN] = "unknown", + [UCS_CPU_MODEL_INTEL_IVYBRIDGE] = "IvyBridge", + [UCS_CPU_MODEL_INTEL_SANDYBRIDGE] = "SandyBridge", + [UCS_CPU_MODEL_INTEL_NEHALEM] = "Nehalem", + [UCS_CPU_MODEL_INTEL_WESTMERE] = "Westmere", + [UCS_CPU_MODEL_INTEL_HASWELL] = "Haswell", + [UCS_CPU_MODEL_INTEL_BROADWELL] = "Broadwell", + [UCS_CPU_MODEL_INTEL_SKYLAKE] = "Skylake", + [UCS_CPU_MODEL_ARM_AARCH64] = "ARM 64-bit", + [UCS_CPU_MODEL_AMD_NAPLES] = "Naples", + [UCS_CPU_MODEL_AMD_ROME] = "Rome", + [UCS_CPU_MODEL_ZHAOXIN_ZHANGJIANG] = "Zhangjiang", + [UCS_CPU_MODEL_ZHAOXIN_WUDAOKOU] = "Wudaokou", + [UCS_CPU_MODEL_ZHAOXIN_LUJIAZUI] = "Lujiazui" }; static const char* cpu_vendor_names[] = { @@ -38,7 +43,8 @@ static const char* cpu_vendor_names[] = { [UCS_CPU_VENDOR_AMD] = "AMD", [UCS_CPU_VENDOR_GENERIC_ARM] = "Generic ARM", [UCS_CPU_VENDOR_GENERIC_PPC] = "Generic PPC", - [UCS_CPU_VENDOR_FUJITSU_ARM] = "Fujitsu ARM" + [UCS_CPU_VENDOR_FUJITSU_ARM] = "Fujitsu ARM", + [UCS_CPU_VENDOR_ZHAOXIN] = "Zhaoxin" }; static double measure_memcpy_bandwidth(size_t size) diff --git a/src/tools/info/tl_info.c b/src/tools/info/tl_info.c index f0211abce8d..112ee2de7e4 100644 --- a/src/tools/info/tl_info.c +++ b/src/tools/info/tl_info.c @@ -118,7 +118,7 @@ static const char *size_limit_to_str(size_t min_size, size_t max_size) static void print_iface_info(uct_worker_h worker, uct_md_h md, uct_tl_resource_desc_t *resource) { - char buf[200] = {0}; + char buf[256] = {0}; uct_iface_params_t iface_params = { .field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | UCT_IFACE_PARAM_FIELD_DEVICE | @@ -291,12 +291,14 @@ static void print_iface_info(uct_worker_h worker, uct_md_h md, } buf[0] = '\0'; - if (iface_attr.cap.flags & (UCT_IFACE_FLAG_ERRHANDLE_SHORT_BUF | - UCT_IFACE_FLAG_ERRHANDLE_BCOPY_BUF | - UCT_IFACE_FLAG_ERRHANDLE_ZCOPY_BUF | - UCT_IFACE_FLAG_ERRHANDLE_AM_ID | - UCT_IFACE_FLAG_ERRHANDLE_REMOTE_MEM | - UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) { + if (iface_attr.cap.flags & (UCT_IFACE_FLAG_ERRHANDLE_SHORT_BUF | + UCT_IFACE_FLAG_ERRHANDLE_BCOPY_BUF | + UCT_IFACE_FLAG_ERRHANDLE_ZCOPY_BUF | + UCT_IFACE_FLAG_ERRHANDLE_AM_ID | + UCT_IFACE_FLAG_ERRHANDLE_REMOTE_MEM | + UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE | + UCT_IFACE_FLAG_EP_CHECK | + UCT_IFACE_FLAG_EP_KEEPALIVE)) { if (iface_attr.cap.flags & (UCT_IFACE_FLAG_ERRHANDLE_SHORT_BUF | UCT_IFACE_FLAG_ERRHANDLE_BCOPY_BUF | @@ -323,6 +325,12 @@ static void print_iface_info(uct_worker_h worker, uct_md_h md, if (iface_attr.cap.flags & UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE) { strncat(buf, " peer failure,", sizeof(buf) - strlen(buf) - 1); } + if (iface_attr.cap.flags & UCT_IFACE_FLAG_EP_CHECK) { + strncat(buf, " ep_check,", sizeof(buf) - strlen(buf) - 1); + } + if (iface_attr.cap.flags & UCT_IFACE_FLAG_EP_KEEPALIVE) { + strncat(buf, " keepalive,", sizeof(buf) - strlen(buf) - 1); + } buf[strlen(buf) - 1] = '\0'; } else { strncat(buf, " none", sizeof(buf) - 
strlen(buf) - 1); @@ -444,9 +452,6 @@ static void print_md_info(uct_component_h component, if (md_attr.cap.flags & UCT_MD_FLAG_RKEY_PTR) { printf("# rkey_ptr is supported\n"); } - if (md_attr.cap.flags & UCT_MD_FLAG_SOCKADDR) { - printf("# supports client-server connection establishment via sockaddr\n"); - } } if (num_resources == 0) { diff --git a/src/tools/info/type_info.c b/src/tools/info/type_info.c index f2b8abd75c1..b24154d21b5 100644 --- a/src/tools/info/type_info.c +++ b/src/tools/info/type_info.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -126,6 +127,7 @@ void print_type_info(const char * tl_name) PRINT_SIZE(ucs_rcache_t); PRINT_SIZE(ucs_rcache_region_t); PRINT_SIZE(ucs_conn_match_elem_t); + PRINT_SIZE(ucs_memory_info_t); printf("\nUCT:\n"); PRINT_SIZE(uct_am_handler_t); diff --git a/src/tools/info/ucx_info.c b/src/tools/info/ucx_info.c index 27be23d238d..0a231a23c5d 100644 --- a/src/tools/info/ucx_info.c +++ b/src/tools/info/ucx_info.c @@ -28,6 +28,7 @@ static void usage() { printf(" -y Show type and structures information\n"); printf(" -s Show system information\n"); printf(" -c Show UCX configuration\n"); + printf(" -C Comment-out default configuration values\n"); printf(" -a Show also hidden configuration\n"); printf(" -f Display fully decorated output\n"); printf("\nUCP information (-u is required):\n"); @@ -39,6 +40,7 @@ static void usage() { printf(" 'a' : atomic operations\n"); printf(" 'r' : remote memory access\n"); printf(" 't' : tag matching \n"); + printf(" 'm' : active messages \n"); printf(" 'w' : wakeup\n"); printf(" Modifiers to use in combination with above features:\n"); printf(" 'e' : error handling\n"); @@ -51,12 +53,16 @@ static void usage() { printf(" 'shm' : shared memory devices only\n"); printf(" 'net' : network devices only\n"); printf(" 'self' : self transport only\n"); + /* TODO: add IPv6 support */ + printf(" -A Local IPv4 device address to use for creating\n" + " endpoint in client/server mode"); printf(" -h Show this help message\n"); printf("\n"); } int main(int argc, char **argv) { + char *ip_addr = NULL; ucs_config_print_flags_t print_flags; ucp_ep_params_t ucp_ep_params; unsigned dev_type_bitmap; @@ -77,7 +83,8 @@ int main(int argc, char **argv) mem_size = NULL; dev_type_bitmap = UINT_MAX; ucp_ep_params.field_mask = 0; - while ((c = getopt(argc, argv, "fahvcydbswpet:n:u:D:m:N:")) != -1) { + + while ((c = getopt(argc, argv, "fahvcydbswpeCt:n:u:D:m:N:A:")) != -1) { switch (c) { case 'f': print_flags |= UCS_CONFIG_PRINT_CONFIG | UCS_CONFIG_PRINT_HEADER | UCS_CONFIG_PRINT_DOC; @@ -88,6 +95,9 @@ int main(int argc, char **argv) case 'c': print_flags |= UCS_CONFIG_PRINT_CONFIG; break; + case 'C': + print_flags |= UCS_CONFIG_PRINT_COMMENT_DEFAULT; + break; case 'v': print_opts |= PRINT_VERSION; break; @@ -140,6 +150,9 @@ int main(int argc, char **argv) case 'w': ucp_features |= UCP_FEATURE_WAKEUP; break; + case 'm': + ucp_features |= UCP_FEATURE_AM; + break; case 'e': ucp_ep_params.field_mask |= UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; ucp_ep_params.err_mode = UCP_ERR_HANDLING_MODE_PEER; @@ -164,6 +177,9 @@ int main(int argc, char **argv) return -1; } break; + case 'A': + ip_addr = optarg; + break; case 'h': usage(); return 0; @@ -208,12 +224,14 @@ int main(int argc, char **argv) if (print_opts & (PRINT_UCP_CONTEXT|PRINT_UCP_WORKER|PRINT_UCP_EP|PRINT_MEM_MAP)) { if (ucp_features == 0) { - printf("Please select UCP features using -u switch: a|r|t|w\n"); + printf("Please select UCP features using -u switch: 
a|r|t|m|w\n"); usage(); return -1; } - print_ucp_info(print_opts, print_flags, ucp_features, &ucp_ep_params, - ucp_num_eps, ucp_num_ppn, dev_type_bitmap, mem_size); + + return print_ucp_info(print_opts, print_flags, ucp_features, + &ucp_ep_params, ucp_num_eps, ucp_num_ppn, + dev_type_bitmap, mem_size, ip_addr); } return 0; diff --git a/src/tools/info/ucx_info.h b/src/tools/info/ucx_info.h index 037de535c6f..4b9f96dc1c0 100644 --- a/src/tools/info/ucx_info.h +++ b/src/tools/info/ucx_info.h @@ -7,9 +7,12 @@ #ifndef UCX_INFO_H #define UCX_INFO_H +#include #include #include +#include + enum { PRINT_VERSION = UCS_BIT(0), @@ -35,9 +38,11 @@ void print_uct_info(int print_opts, ucs_config_print_flags_t print_flags, void print_type_info(const char * tl_name); -void print_ucp_info(int print_opts, ucs_config_print_flags_t print_flags, - uint64_t ctx_features, const ucp_ep_params_t *base_ep_params, - size_t estimated_num_eps, size_t estimated_num_ppn, - unsigned dev_type_bitmap, const char *mem_size); +ucs_status_t +print_ucp_info(int print_opts, ucs_config_print_flags_t print_flags, + uint64_t ctx_features, const ucp_ep_params_t *base_ep_params, + size_t estimated_num_eps, size_t estimated_num_ppn, + unsigned dev_type_bitmap, const char *mem_size, + const char *ip_addr); #endif diff --git a/src/tools/perf/api/libperf.h b/src/tools/perf/api/libperf.h index 4e2bb9842f1..230e6416add 100644 --- a/src/tools/perf/api/libperf.h +++ b/src/tools/perf/api/libperf.h @@ -1,8 +1,9 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. -* Copyright (C) The University of Tennessee and The University +* Copyright (C) The University of Tennessee and The University * of Tennessee Research Foundation. 2015. ALL RIGHTS RESERVED. +* Copyright (C) ARM Ltd. 2020. ALL RIGHTS RESERVED. * See file LICENSE for terms. 
*/ @@ -15,12 +16,8 @@ BEGIN_C_DECLS /** @file libperf.h */ -#include #include #include -#include -#include -#include typedef enum { @@ -47,6 +44,8 @@ typedef enum { typedef enum { UCX_PERF_TEST_TYPE_PINGPONG, /* Ping-pong mode */ + UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM,/* Ping-pong mode with + ucp_worker_wait_mem() */ UCX_PERF_TEST_TYPE_STREAM_UNI, /* Unidirectional stream */ UCX_PERF_TEST_TYPE_STREAM_BI, /* Bidirectional stream */ UCX_PERF_TEST_TYPE_LAST @@ -61,6 +60,7 @@ typedef enum { typedef enum { UCT_PERF_DATA_LAYOUT_SHORT, + UCT_PERF_DATA_LAYOUT_SHORT_IOV, UCT_PERF_DATA_LAYOUT_BCOPY, UCT_PERF_DATA_LAYOUT_ZCOPY, UCT_PERF_DATA_LAYOUT_LAST @@ -68,7 +68,7 @@ typedef enum { typedef enum { - UCX_PERF_WAIT_MODE_PROGRESS, /* Repeatedly call progress */ + UCX_PERF_WAIT_MODE_POLL, /* Repeatedly call progress */ UCX_PERF_WAIT_MODE_SLEEP, /* Go to sleep */ UCX_PERF_WAIT_MODE_SPIN, /* Spin without calling progress */ UCX_PERF_WAIT_MODE_LAST @@ -85,7 +85,8 @@ enum ucx_perf_test_flags { UCX_PERF_TEST_FLAG_VERBOSE = UCS_BIT(7), /* Print error messages */ UCX_PERF_TEST_FLAG_STREAM_RECV_DATA = UCS_BIT(8), /* For stream tests, use recv data API */ UCX_PERF_TEST_FLAG_FLUSH_EP = UCS_BIT(9), /* Issue flush on endpoint instead of worker */ - UCX_PERF_TEST_FLAG_WAKEUP = UCS_BIT(10) /* Create context with wakeup feature enabled */ + UCX_PERF_TEST_FLAG_WAKEUP = UCS_BIT(10), /* Create context with wakeup feature enabled */ + UCX_PERF_TEST_FLAG_ERR_HANDLING = UCS_BIT(11) /* Create UCP eps with error handling support */ }; @@ -188,7 +189,6 @@ typedef struct ucx_perf_params { size_t iov_stride; /* Distance between starting address of consecutive IOV entries. It is similar to UCT uct_iov_t type stride */ - size_t am_hdr_size; /* Active message header size (included in message size) */ size_t alignment; /* Message buffer alignment */ unsigned max_outstanding; /* Maximal number of outstanding sends */ ucx_perf_counter_t warmup_iter; /* Number of warm-up iterations */ @@ -206,12 +206,16 @@ typedef struct ucx_perf_params { char md_name[UCT_MD_NAME_MAX]; /* Memory domain name to use */ uct_perf_data_layout_t data_layout; /* Data layout to use */ unsigned fc_window; /* Window size for flow control <= UCX_PERF_TEST_MAX_FC_WINDOW */ + size_t am_hdr_size; /* UCT Active Message header size + (included in message size) */ } uct; struct { unsigned nonblocking_mode; /* TBD */ ucp_perf_datatype_t send_datatype; ucp_perf_datatype_t recv_datatype; + size_t am_hdr_size; /* UCP Active Message header size + (not included in message size) */ } ucp; } ucx_perf_params_t; diff --git a/src/tools/perf/cuda/Makefile.am b/src/tools/perf/cuda/Makefile.am index aa6cb37065a..ecb7a33c0f7 100644 --- a/src/tools/perf/cuda/Makefile.am +++ b/src/tools/perf/cuda/Makefile.am @@ -10,6 +10,7 @@ module_LTLIBRARIES = libucx_perftest_cuda.la libucx_perftest_cuda_la_CPPFLAGS = $(BASE_CPPFLAGS) $(CUDA_CPPFLAGS) libucx_perftest_cuda_la_CFLAGS = $(BASE_CFLAGS) $(CUDA_CFLAGS) libucx_perftest_cuda_la_LDFLAGS = $(CUDA_LDFLAGS) -version-info $(SOVERSION) +libucx_perftest_cuda_la_LIBADD = $(CUDA_LIBS) libucx_perftest_cuda_la_SOURCES = cuda_alloc.c include $(top_srcdir)/config/module.am diff --git a/src/tools/perf/lib/libperf.c b/src/tools/perf/lib/libperf.c index 911a29b2e3f..134301f20e2 100644 --- a/src/tools/perf/lib/libperf.c +++ b/src/tools/perf/lib/libperf.c @@ -3,7 +3,8 @@ * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. * Copyright (C) The University of Tennessee and The University * of Tennessee Research Foundation. 2015-2016. 
ALL RIGHTS RESERVED. -* Copyright (C) ARM Ltd. 2017. ALL RIGHTS RESERVED. +* Copyright (C) ARM Ltd. 2017-2020. ALL RIGHTS RESERVED. +* Copyright (C) Huawei Technologies Co., Ltd. 2021. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -81,7 +82,9 @@ static const char *perf_iface_ops[] = { [ucs_ilog2(UCT_IFACE_FLAG_TAG_EAGER_SHORT)] = "tag eager short", [ucs_ilog2(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)] = "tag eager bcopy", [ucs_ilog2(UCT_IFACE_FLAG_TAG_EAGER_ZCOPY)] = "tag eager zcopy", - [ucs_ilog2(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)] = "tag rndv zcopy" + [ucs_ilog2(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)] = "tag rndv zcopy", + [ucs_ilog2(UCT_IFACE_FLAG_EP_CHECK)] = "ep check", + [ucs_ilog2(UCT_IFACE_FLAG_EP_KEEPALIVE)] = "ep keepalive" }; static const char *perf_atomic_op[] = { @@ -167,7 +170,6 @@ uct_perf_test_alloc_host(const ucx_perf_context_t *perf, size_t length, status = uct_iface_mem_alloc(perf->uct.iface, length, flags, "perftest", alloc_mem); if (status != UCS_OK) { - ucs_free(alloc_mem); ucs_error("failed to allocate memory: %s", ucs_status_string(status)); return status; } @@ -323,7 +325,8 @@ void ucx_perf_calc_result(ucx_perf_context_t *perf, ucx_perf_result_t *result) ucs_time_t median; double factor; - if (perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG) { + if ((perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG) || + (perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM)) { factor = 2.0; } else { factor = 1.0; @@ -479,7 +482,8 @@ void uct_perf_iface_flush_b(ucx_perf_context_t *perf) static inline uint64_t __get_flag(uct_perf_data_layout_t layout, uint64_t short_f, uint64_t bcopy_f, uint64_t zcopy_f) { - return (layout == UCT_PERF_DATA_LAYOUT_SHORT) ? short_f : + return ((layout == UCT_PERF_DATA_LAYOUT_SHORT) || + (layout == UCT_PERF_DATA_LAYOUT_SHORT_IOV)) ? short_f : (layout == UCT_PERF_DATA_LAYOUT_BCOPY) ? bcopy_f : (layout == UCT_PERF_DATA_LAYOUT_ZCOPY) ? zcopy_f : 0; @@ -501,7 +505,8 @@ static inline ucs_status_t __get_atomic_flag(size_t size, uint64_t *op32, static inline size_t __get_max_size(uct_perf_data_layout_t layout, size_t short_m, size_t bcopy_m, uint64_t zcopy_m) { - return (layout == UCT_PERF_DATA_LAYOUT_SHORT) ? short_m : + return ((layout == UCT_PERF_DATA_LAYOUT_SHORT) || + (layout == UCT_PERF_DATA_LAYOUT_SHORT_IOV)) ? short_m : (layout == UCT_PERF_DATA_LAYOUT_BCOPY) ? bcopy_m : (layout == UCT_PERF_DATA_LAYOUT_ZCOPY) ? 
zcopy_m : 0; @@ -650,8 +655,7 @@ static ucs_status_t uct_perf_test_check_capabilities(ucx_perf_params_t *params, if (params->command == UCX_PERF_CMD_AM) { if ((params->uct.data_layout == UCT_PERF_DATA_LAYOUT_SHORT) && - (params->am_hdr_size != sizeof(uint64_t))) - { + (params->uct.am_hdr_size != sizeof(uint64_t))) { if (params->flags & UCX_PERF_TEST_FLAG_VERBOSE) { ucs_error("Short AM header size must be 8 bytes"); } @@ -659,19 +663,20 @@ static ucs_status_t uct_perf_test_check_capabilities(ucx_perf_params_t *params, } if ((params->uct.data_layout == UCT_PERF_DATA_LAYOUT_ZCOPY) && - (params->am_hdr_size > attr.cap.am.max_hdr)) - { + (params->uct.am_hdr_size > attr.cap.am.max_hdr)) { if (params->flags & UCX_PERF_TEST_FLAG_VERBOSE) { - ucs_error("AM header size (%zu) is larger than max supported (%zu)", - params->am_hdr_size, attr.cap.am.max_hdr); + ucs_error("AM header size (%zu) is larger than max supported " + "(%zu)", + params->uct.am_hdr_size, attr.cap.am.max_hdr); } return UCS_ERR_UNSUPPORTED; } - if (params->am_hdr_size > message_size) { + if (params->uct.am_hdr_size > message_size) { if (params->flags & UCX_PERF_TEST_FLAG_VERBOSE) { - ucs_error("AM header size (%zu) is larger than message size (%zu)", - params->am_hdr_size, message_size); + ucs_error("AM header size (%zu) is larger than message size " + "(%zu)", + params->uct.am_hdr_size, message_size); } return UCS_ERR_INVALID_PARAM; } @@ -691,7 +696,8 @@ static ucs_status_t uct_perf_test_check_capabilities(ucx_perf_params_t *params, } } - if (UCT_PERF_DATA_LAYOUT_ZCOPY == params->uct.data_layout) { + if ((UCT_PERF_DATA_LAYOUT_ZCOPY == params->uct.data_layout) || + (UCT_PERF_DATA_LAYOUT_SHORT_IOV == params->uct.data_layout)) { if (params->msg_size_cnt > max_iov) { if ((params->flags & UCX_PERF_TEST_FLAG_VERBOSE) || !params->msg_size_cnt) { @@ -702,11 +708,13 @@ static ucs_status_t uct_perf_test_check_capabilities(ucx_perf_params_t *params, return UCS_ERR_UNSUPPORTED; } /* if msg_size_cnt == 1 the message size checked above */ - if ((UCX_PERF_CMD_AM == params->command) && (params->msg_size_cnt > 1)) { - if (params->am_hdr_size > params->msg_size_list[0]) { + if ((UCT_PERF_DATA_LAYOUT_ZCOPY == params->uct.data_layout) && + (UCX_PERF_CMD_AM == params->command) && (params->msg_size_cnt > 1)) { + if (params->uct.am_hdr_size > params->msg_size_list[0]) { if (params->flags & UCX_PERF_TEST_FLAG_VERBOSE) { ucs_error("AM header size (%lu) larger than the first IOV " - "message size (%lu)", params->am_hdr_size, + "message size (%lu)", + params->uct.am_hdr_size, params->msg_size_list[0]); } return UCS_ERR_INVALID_PARAM; @@ -972,6 +980,9 @@ static ucs_status_t ucp_perf_test_fill_params(ucx_perf_params_t *params, case UCX_PERF_CMD_STREAM: ucp_params->features |= UCP_FEATURE_STREAM; break; + case UCX_PERF_CMD_AM: + ucp_params->features |= UCP_FEATURE_AM; + break; default: if (params->flags & UCX_PERF_TEST_FLAG_VERBOSE) { ucs_error("Invalid test command"); @@ -979,7 +990,8 @@ static ucs_status_t ucp_perf_test_fill_params(ucx_perf_params_t *params, return UCS_ERR_INVALID_PARAM; } - if (params->flags & UCX_PERF_TEST_FLAG_WAKEUP) { + if ((params->flags & UCX_PERF_TEST_FLAG_WAKEUP) || + (params->wait_mode == UCX_PERF_WAIT_MODE_SLEEP)) { ucp_params->features |= UCP_FEATURE_WAKEUP; } @@ -1086,6 +1098,16 @@ static ucs_status_t ucp_perf_test_alloc_mem(ucx_perf_context_t *perf) goto err_free_send_buffer; } + /* Allocate AM header */ + if (params->ucp.am_hdr_size != 0) { + perf->ucp.am_hdr = malloc(params->ucp.am_hdr_size); + if (perf->ucp.am_hdr == NULL) { 
+ goto err_free_buffers; + } + } else { + perf->ucp.am_hdr = NULL; + } + /* Allocate IOV datatype memory */ perf->ucp.send_iov = NULL; status = ucp_perf_test_alloc_iov_mem(params->ucp.send_datatype, @@ -1093,7 +1115,7 @@ static ucs_status_t ucp_perf_test_alloc_mem(ucx_perf_context_t *perf) params->thread_count, &perf->ucp.send_iov); if (UCS_OK != status) { - goto err_free_buffers; + goto err_free_am_hdr; } perf->ucp.recv_iov = NULL; @@ -1109,6 +1131,8 @@ static ucs_status_t ucp_perf_test_alloc_mem(ucx_perf_context_t *perf) err_free_send_iov_buffers: free(perf->ucp.send_iov); +err_free_am_hdr: + free(perf->ucp.am_hdr); err_free_buffers: perf->allocator->ucp_free(perf, perf->recv_buffer, perf->ucp.recv_memh); err_free_send_buffer: @@ -1121,6 +1145,7 @@ static void ucp_perf_test_free_mem(ucx_perf_context_t *perf) { free(perf->ucp.recv_iov); free(perf->ucp.send_iov); + free(perf->ucp.am_hdr); perf->allocator->ucp_free(perf, perf->recv_buffer, perf->ucp.recv_memh); perf->allocator->ucp_free(perf, perf->send_buffer, perf->ucp.send_memh); } @@ -1179,6 +1204,13 @@ static ucs_status_t ucp_perf_test_exchange_status(ucx_perf_context_t *perf, return collective_status; } +static void ucp_perf_test_err_handler(void *arg, ucp_ep_h ep, + ucs_status_t status) +{ + ucs_error("error handler called with status %d (%s)\n", status, + ucs_status_string(status)); +} + static ucs_status_t ucp_perf_test_receive_remote_data(ucx_perf_context_t *perf) { unsigned thread_count = perf->params.thread_count; @@ -1230,6 +1262,14 @@ static ucs_status_t ucp_perf_test_receive_remote_data(ucx_perf_context_t *perf) ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; ep_params.address = address; + if (perf->params.flags & UCX_PERF_TEST_FLAG_ERR_HANDLING) { + ep_params.field_mask |= UCP_EP_PARAM_FIELD_ERR_HANDLER | + UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; + ep_params.err_handler.cb = ucp_perf_test_err_handler; + ep_params.err_handler.arg = NULL; + ep_params.err_mode = UCP_ERR_HANDLING_MODE_PEER; + } + status = ucp_ep_create(perf->ucp.tctx[i].perf.ucp.worker, &ep_params, &perf->ucp.tctx[i].perf.ucp.ep); if (status != UCS_OK) { @@ -1396,7 +1436,7 @@ static ucs_status_t ucp_perf_test_setup_endpoints(ucx_perf_context_t *perf, for (i = 0; i < perf->params.thread_count; i++) { status = ucp_worker_flush(perf->ucp.tctx[i].perf.ucp.worker); if (status != UCS_OK) { - ucs_warn("ucp_worker_flush() failed on theread %d: %s", + ucs_warn("ucp_worker_flush() failed on thread %d: %s", i, ucs_status_string(status)); } } @@ -1599,7 +1639,7 @@ static ucs_status_t uct_perf_setup(ucx_perf_context_t *perf) } /* Enable progress before `uct_iface_flush` and `uct_worker_progress` called - * to give a chance to finish connection for some tranports (ib/ud, tcp). + * to give a chance to finish connection for some transports (ib/ud, tcp). * They may return UCS_INPROGRESS from `uct_iface_flush` when connections are * in progress */ uct_iface_progress_enable(perf->uct.iface, @@ -1648,6 +1688,7 @@ static ucs_status_t ucp_perf_setup(ucx_perf_context_t *perf) { ucp_params_t ucp_params; ucp_worker_params_t worker_params; + ucp_worker_attr_t worker_attr; ucp_config_t *config; ucs_status_t status; unsigned i, thread_count; @@ -1663,7 +1704,7 @@ static ucs_status_t ucp_perf_setup(ucx_perf_context_t *perf) if (perf->params.thread_count > 1) { /* when there is more than one thread, a ucp_worker would be created for * each. 
all of them will share the same ucp_context */ - ucp_params.features |= UCP_PARAM_FIELD_MT_WORKERS_SHARED; + ucp_params.field_mask |= UCP_PARAM_FIELD_MT_WORKERS_SHARED; ucp_params.mt_workers_shared = 1; } @@ -1717,6 +1758,23 @@ static ucs_status_t ucp_perf_setup(ucx_perf_context_t *perf) } } + if (perf->params.command == UCX_PERF_CMD_AM) { + /* Check that requested AM header size is not larger than max supported. */ + worker_attr.field_mask = UCP_WORKER_ATTR_FIELD_MAX_AM_HEADER; + status = ucp_worker_query(perf->ucp.tctx[0].perf.ucp.worker, + &worker_attr); + if (status != UCS_OK) { + goto err_free_tctx_destroy_workers; + } + + if (worker_attr.max_am_header < perf->params.ucp.am_hdr_size) { + ucs_error("AM header size (%zu) is larger than max supported (%zu)", + perf->params.ucp.am_hdr_size, worker_attr.max_am_header); + status = UCS_ERR_INVALID_PARAM; + goto err_free_tctx_destroy_workers; + } + } + status = ucp_perf_test_setup_endpoints(perf, ucp_params.features); if (status != UCS_OK) { if (perf->params.flags & UCX_PERF_TEST_FLAG_VERBOSE) { @@ -1865,6 +1923,12 @@ static ucs_status_t ucx_perf_thread_run_test(void* arg) ucx_perf_params_t* params = &perf->params; ucs_status_t status; + /* new threads need explicit device association */ + status = perf->allocator->init(perf); + if (status != UCS_OK) { + goto out; + } + if (params->warmup_iter > 0) { ucx_perf_set_warmup(perf, params); status = ucx_perf_funcs[params->api].run(perf); diff --git a/src/tools/perf/lib/libperf_int.h b/src/tools/perf/lib/libperf_int.h index 74592000db0..498f9742778 100644 --- a/src/tools/perf/lib/libperf_int.h +++ b/src/tools/perf/lib/libperf_int.h @@ -15,8 +15,10 @@ BEGIN_C_DECLS /** @file libperf_int.h */ -#include #include +#include +#include + #if _OPENMP #include <omp.h> @@ -102,6 +104,7 @@ struct ucx_perf_context { ucp_mem_h recv_memh; ucp_dt_iov_t *send_iov; ucp_dt_iov_t *recv_iov; + void *am_hdr; } ucp; }; }; diff --git a/src/tools/perf/lib/ucp_tests.cc b/src/tools/perf/lib/ucp_tests.cc index 46e13e8ee15..1e27857c2b7 100644 --- a/src/tools/perf/lib/ucp_tests.cc +++ b/src/tools/perf/lib/ucp_tests.cc @@ -2,6 +2,7 @@ * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. * Copyright (C) The University of Tennessee and The University * of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED. +* Copyright (C) ARM Ltd. 2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -10,15 +11,9 @@ # include "config.h" #endif -#include +#include "libperf_int.h" -extern "C" { -#include -#include -#include -} #include - #include @@ -28,19 +23,53 @@ extern "C" { template <ucx_perf_cmd_t CMD, ucx_perf_test_type_t TYPE, unsigned FLAGS> class ucp_perf_test_runner { public: + static const unsigned AM_ID = 1; static const ucp_tag_t TAG = 0x1337a880u; static const ucp_tag_t TAG_MASK = (FLAGS & UCX_PERF_TEST_FLAG_TAG_WILDCARD) ?
0 : (ucp_tag_t)-1; typedef uint8_t psn_t; - ucp_perf_test_runner(ucx_perf_context_t &perf) : - m_perf(perf), - m_outstanding(0), - m_max_outstanding(m_perf.params.max_outstanding) + ucp_perf_test_runner(ucx_perf_context_t &perf) + : m_perf(perf), + m_outstanding(0), + m_max_outstanding(m_perf.params.max_outstanding), + m_am_rx_buffer(NULL), + m_am_rx_length(0ul) { + memset(&m_am_rx_params, 0, sizeof(m_am_rx_params)); + ucs_assert_always(m_max_outstanding > 0); + + set_am_handler(am_data_handler, this, UCP_AM_FLAG_WHOLE_MSG); + } + + ~ucp_perf_test_runner() + { + set_am_handler(NULL, this, 0); + } + + void set_am_handler(ucp_am_recv_callback_t cb, void *arg, unsigned flags) + { + if (CMD == UCX_PERF_CMD_AM) { + ucp_am_handler_param_t param; + param.field_mask = UCP_AM_HANDLER_PARAM_FIELD_ID | + UCP_AM_HANDLER_PARAM_FIELD_CB | + UCP_AM_HANDLER_PARAM_FIELD_ARG; + param.id = AM_ID; + param.cb = cb; + param.arg = arg; + + if (flags != 0) { + param.field_mask |= UCP_AM_HANDLER_PARAM_FIELD_FLAGS; + param.flags = flags; + } + + ucs_status_t status = ucp_worker_set_am_recv_handler( + m_perf.ucp.worker, &param); + ucs_assert_always(status == UCS_OK); + } + } void create_iov_buffer(ucp_dt_iov_t *iov, void *buffer) @@ -89,16 +118,65 @@ class ucp_perf_test_runner { } } + void ucp_perf_init_common_params(size_t *total_length, size_t *send_length, + ucp_datatype_t *send_dt, + void **send_buffer, size_t *recv_length, + ucp_datatype_t *recv_dt, + void **recv_buffer) + { + *total_length = ucx_perf_get_message_size(&m_perf.params); + + if (CMD == UCX_PERF_CMD_PUT) { + ucs_assert(*total_length >= sizeof(psn_t)); + } + + ucp_perf_test_prepare_iov_buffers(); + + *send_length = *recv_length = *total_length; + + *send_dt = ucp_perf_test_get_datatype(m_perf.params.ucp.send_datatype, + m_perf.ucp.send_iov, send_length, + send_buffer); + *recv_dt = ucp_perf_test_get_datatype(m_perf.params.ucp.recv_datatype, + m_perf.ucp.recv_iov, recv_length, + recv_buffer); + if (CMD == UCX_PERF_CMD_AM) { + m_am_rx_params.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_USER_DATA | + UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FLAG_NO_IMM_CMPL; + m_am_rx_params.datatype = *recv_dt; + m_am_rx_params.cb.recv_am = am_data_recv_cb; + m_am_rx_params.user_data = this; + m_am_rx_buffer = *recv_buffer; + m_am_rx_length = *recv_length; + } + } + + void UCS_F_ALWAYS_INLINE blocking_progress() { + if (ucp_worker_progress(m_perf.ucp.worker) == 0) { + ucp_worker_wait(m_perf.ucp.worker); + } + } + + void UCS_F_ALWAYS_INLINE progress() { + if (ucs_unlikely(UCX_PERF_WAIT_MODE_SLEEP == m_perf.params.wait_mode)) { + blocking_progress(); + } else { + ucp_worker_progress(m_perf.ucp.worker); + } + } + void UCS_F_ALWAYS_INLINE progress_responder() { if (!(FLAGS & UCX_PERF_TEST_FLAG_ONE_SIDED) && !(m_perf.params.flags & UCX_PERF_TEST_FLAG_ONE_SIDED)) { - ucp_worker_progress(m_perf.ucp.worker); + progress(); } } void UCS_F_ALWAYS_INLINE progress_requestor() { - ucp_worker_progress(m_perf.ucp.worker); + progress(); } ssize_t UCS_F_ALWAYS_INLINE wait_stream_recv(void *request) @@ -117,6 +195,25 @@ class ucp_perf_test_runner { return ucs_likely(status == UCS_OK) ?
length : status; } + ucs_status_t am_rndv_recv(void *data, size_t length, + const ucp_am_recv_param_t *rx_params) + { + ucs_assert(!(rx_params->recv_attr & + (UCP_AM_RECV_ATTR_FLAG_DATA | UCP_AM_RECV_ATTR_FLAG_FIRST | + UCP_AM_RECV_ATTR_FLAG_ONLY))); + ucs_assert(length == ucx_perf_get_message_size(&m_perf.params)); + + ucs_status_ptr_t sp = ucp_am_recv_data_nbx(m_perf.ucp.worker, data, + m_am_rx_buffer, + m_am_rx_length, + &m_am_rx_params); + ucs_assert(UCS_PTR_IS_PTR(sp)); + ucp_request_release(sp); + + return UCS_INPROGRESS; + } + + static void send_cb(void *request, ucs_status_t status) { ucp_perf_request_t *r = reinterpret_cast<ucp_perf_request_t*>( @@ -128,6 +225,11 @@ class ucp_perf_test_runner { ucp_request_free(request); } + static void send_nbx_cb(void *request, ucs_status_t status, void *user_data) + { + send_cb(request, status); + } + static void tag_recv_cb(void *request, ucs_status_t status, ucp_tag_recv_info_t *info) { @@ -146,6 +248,28 @@ class ucp_perf_test_runner { ucp_request_free(request); } + static void am_data_recv_cb(void *request, ucs_status_t status, + size_t length, void *user_data) + { + ucp_perf_test_runner *test = (ucp_perf_test_runner*)user_data; + test->op_completed(); + } + + static ucs_status_t + am_data_handler(void *arg, const void *header, size_t header_length, + void *data, size_t length, const ucp_am_recv_param_t *param) + { + ucp_perf_test_runner *test = (ucp_perf_test_runner*)arg; + + if (param->recv_attr & UCP_AM_RECV_ATTR_FLAG_RNDV) { + return test->am_rndv_recv(data, length, param); + } + + /* TODO: Add option to do memcopy here */ + test->op_completed(); + return UCS_OK; + } + void UCS_F_ALWAYS_INLINE wait_window(unsigned n, bool is_requestor) { while (m_outstanding >= (m_max_outstanding - n + 1)) { @@ -162,12 +286,14 @@ class ucp_perf_test_runner { uint8_t sn, uint64_t remote_addr, ucp_rkey_h rkey) { void *request; + ucp_request_param_t param; /* coverity[switch_selector_expr_is_constant] */ switch (CMD) { case UCX_PERF_CMD_TAG: case UCX_PERF_CMD_TAG_SYNC: case UCX_PERF_CMD_STREAM: + case UCX_PERF_CMD_AM: wait_window(1, true); /* coverity[switch_selector_expr_is_constant] */ switch (CMD) { @@ -183,6 +309,16 @@ class ucp_perf_test_runner { request = ucp_stream_send_nb(ep, buffer, length, datatype, send_cb, 0); break; + case UCX_PERF_CMD_AM: + param.op_attr_mask = UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FIELD_CALLBACK; + param.cb.send = send_nbx_cb; + param.datatype = datatype; + request = ucp_am_send_nbx(ep, AM_ID, + m_perf.ucp.am_hdr, + m_perf.params.ucp.am_hdr_size, buffer, + length, &param); + break; default: request = UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); break; @@ -197,6 +333,7 @@ class ucp_perf_test_runner { /* coverity[switch_selector_expr_is_constant] */ switch (TYPE) { case UCX_PERF_TEST_TYPE_PINGPONG: + case UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM: *((uint8_t*)buffer + length - 1) = sn; break; case UCX_PERF_TEST_TYPE_STREAM_UNI: @@ -276,6 +413,9 @@ class ucp_perf_test_runner { reinterpret_cast<ucp_perf_request_t*>(request)->context = this; op_started(); return UCS_OK; + case UCX_PERF_CMD_AM: + op_started(); + return UCS_OK; case UCX_PERF_CMD_PUT: /* coverity[switch_selector_expr_is_constant] */ switch (TYPE) { @@ -285,6 +425,13 @@ class ucp_perf_test_runner { progress_responder(); } return UCS_OK; + case UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM: + ptr = (volatile uint8_t*)buffer + length - 1; + while (*ptr != sn) { + ucp_worker_wait_mem(worker, (void *)ptr); + progress_responder(); + } + return UCS_OK; case UCX_PERF_TEST_TYPE_STREAM_UNI: return UCS_OK; default: @@ -372,10 +519,17 @@
class ucp_perf_test_runner { ucp_rkey_h rkey; size_t length, send_length, recv_length; - length = ucx_perf_get_message_size(&m_perf.params); - ucs_assert(length >= sizeof(psn_t)); + send_buffer = m_perf.send_buffer; + recv_buffer = m_perf.recv_buffer; + worker = m_perf.ucp.worker; + ep = m_perf.ucp.ep; + remote_addr = m_perf.ucp.remote_addr; + rkey = m_perf.ucp.rkey; + sn = 0; - ucp_perf_test_prepare_iov_buffers(); + ucp_perf_init_common_params(&length, &send_length, &send_datatype, + &send_buffer, &recv_length, &recv_datatype, + &recv_buffer); if (CMD == UCX_PERF_CMD_PUT) { m_perf.allocator->memcpy((psn_t*)m_perf.recv_buffer + length - 1, @@ -386,28 +540,12 @@ class ucp_perf_test_runner { ucp_perf_barrier(&m_perf); - my_index = rte_call(&m_perf, group_index); + my_index = rte_call(&m_perf, group_index); ucx_perf_test_start_clock(&m_perf); ucx_perf_omp_barrier(&m_perf); - send_buffer = m_perf.send_buffer; - recv_buffer = m_perf.recv_buffer; - worker = m_perf.ucp.worker; - ep = m_perf.ucp.ep; - remote_addr = m_perf.ucp.remote_addr; - rkey = m_perf.ucp.rkey; - sn = 0; - send_length = length; - recv_length = length; - send_datatype = ucp_perf_test_get_datatype(m_perf.params.ucp.send_datatype, - m_perf.ucp.send_iov, &send_length, - &send_buffer); - recv_datatype = ucp_perf_test_get_datatype(m_perf.params.ucp.recv_datatype, - m_perf.ucp.recv_iov, &recv_length, - &recv_buffer); - if (my_index == 0) { UCX_PERF_TEST_FOREACH(&m_perf) { send(ep, send_buffer, send_length, send_datatype, sn, remote_addr, rkey); @@ -446,35 +584,26 @@ class ucp_perf_test_runner { size_t length, send_length, recv_length; uint8_t sn; - length = ucx_perf_get_message_size(&m_perf.params); - ucs_assert(length >= sizeof(psn_t)); + send_buffer = m_perf.send_buffer; + recv_buffer = m_perf.recv_buffer; + worker = m_perf.ucp.worker; + ep = m_perf.ucp.ep; + remote_addr = m_perf.ucp.remote_addr; + rkey = m_perf.ucp.rkey; + sn = 0; - ucp_perf_test_prepare_iov_buffers(); + ucp_perf_init_common_params(&length, &send_length, &send_datatype, + &send_buffer, &recv_length, &recv_datatype, + &recv_buffer); ucp_perf_barrier(&m_perf); - my_index = rte_call(&m_perf, group_index); + my_index = rte_call(&m_perf, group_index); ucx_perf_test_start_clock(&m_perf); ucx_perf_omp_barrier(&m_perf); - send_buffer = m_perf.send_buffer; - recv_buffer = m_perf.recv_buffer; - worker = m_perf.ucp.worker; - ep = m_perf.ucp.ep; - remote_addr = m_perf.ucp.remote_addr; - rkey = m_perf.ucp.rkey; - sn = 0; - send_length = length; - recv_length = length; - send_datatype = ucp_perf_test_get_datatype(m_perf.params.ucp.send_datatype, - m_perf.ucp.send_iov, &send_length, - &send_buffer); - recv_datatype = ucp_perf_test_get_datatype(m_perf.params.ucp.recv_datatype, - m_perf.ucp.recv_iov, &recv_length, - &recv_buffer); - if (my_index == 0) { UCX_PERF_TEST_FOREACH(&m_perf) { recv(worker, ep, recv_buffer, recv_length, recv_datatype, sn); @@ -510,6 +639,7 @@ class ucp_perf_test_runner { /* coverity[switch_selector_expr_is_constant] */ switch (TYPE) { case UCX_PERF_TEST_TYPE_PINGPONG: + case UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM: return run_pingpong(); case UCX_PERF_TEST_TYPE_STREAM_UNI: return run_stream_uni(); @@ -578,9 +708,16 @@ class ucp_perf_test_runner { --m_outstanding; } - ucx_perf_context_t &m_perf; - unsigned m_outstanding; - const unsigned m_max_outstanding; + ucx_perf_context_t &m_perf; + unsigned m_outstanding; + const unsigned m_max_outstanding; + /* + * These fields are used by UCP AM flow only, because receive operation is + * initiated from the data receive 
callback. + */ + void *m_am_rx_buffer; + size_t m_am_rx_length; + ucp_request_param_t m_am_rx_params; }; @@ -621,10 +758,14 @@ class ucp_perf_test_runner { TEST_CASE(_perf, UCS_PP_TUPLE_0 _case, UCS_PP_TUPLE_1 _case, \ UCX_PERF_TEST_FLAG_ONE_SIDED, UCX_PERF_TEST_FLAG_ONE_SIDED) +#define TEST_CASE_ALL_AM(_perf, _case) \ + TEST_CASE(_perf, UCS_PP_TUPLE_0 _case, UCS_PP_TUPLE_1 _case, 0, 0) + ucs_status_t ucp_perf_test_dispatch(ucx_perf_context_t *perf) { UCS_PP_FOREACH(TEST_CASE_ALL_OSD, perf, (UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG), + (UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM), (UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI), (UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI), (UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_STREAM_UNI), @@ -645,6 +786,11 @@ ucs_status_t ucp_perf_test_dispatch(ucx_perf_context_t *perf) (UCX_PERF_CMD_STREAM, UCX_PERF_TEST_TYPE_PINGPONG) ); + UCS_PP_FOREACH(TEST_CASE_ALL_AM, perf, + (UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG), + (UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI) + ); + ucs_error("Invalid test case: %d/%d/0x%x", perf->params.command, perf->params.test_type, perf->params.flags); diff --git a/src/tools/perf/lib/uct_tests.cc b/src/tools/perf/lib/uct_tests.cc index 81d7d227e03..591c4fe0eb7 100644 --- a/src/tools/perf/lib/uct_tests.cc +++ b/src/tools/perf/lib/uct_tests.cc @@ -2,6 +2,7 @@ * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. * Copyright (C) The University of Tennessee and The University * of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED. +* Copyright (C) ARM Ltd. 2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -10,19 +11,13 @@ # include "config.h" #endif -#define __STDC_FORMAT_MACROS 1 -#include -#include +#define __STDC_FORMAT_MACROS /* For PRIu64 */ -extern "C" { -#include -#include -#include -#include -} +#include "libperf_int.h" #include + template class uct_perf_test_runner { public: @@ -72,7 +67,8 @@ class uct_perf_test_runner { const size_t iovcnt = perf->params.msg_size_cnt; size_t iov_length_it, iov_it; - ucs_assert(UCT_PERF_DATA_LAYOUT_ZCOPY == DATA); + ucs_assert((UCT_PERF_DATA_LAYOUT_ZCOPY == DATA) || + (UCT_PERF_DATA_LAYOUT_SHORT_IOV == DATA)); ucs_assert(NULL != perf->params.msg_size_list); ucs_assert(iovcnt > 0); ucs_assert(perf->params.msg_size_list[0] >= header_size); @@ -99,10 +95,11 @@ class uct_perf_test_runner { } void uct_perf_test_prepare_iov_buffer() { - if (UCT_PERF_DATA_LAYOUT_ZCOPY == DATA) { + if ((UCT_PERF_DATA_LAYOUT_ZCOPY == DATA) || + (UCT_PERF_DATA_LAYOUT_SHORT_IOV == DATA)) { size_t start_iov_buffer_size = 0; - if (UCX_PERF_CMD_AM == CMD) { - start_iov_buffer_size = m_perf.params.am_hdr_size; + if ((UCX_PERF_CMD_AM == CMD) && (UCT_PERF_DATA_LAYOUT_ZCOPY == DATA)) { + start_iov_buffer_size = m_perf.params.uct.am_hdr_size; } uct_perf_get_buffer_iov(m_perf.uct.iov, m_perf.send_buffer, start_iov_buffer_size, @@ -248,6 +245,10 @@ class uct_perf_test_runner { return uct_ep_am_short(ep, UCT_PERF_TEST_AM_ID, am_short_hdr, (char*)buffer + sizeof(am_short_hdr), length - sizeof(am_short_hdr)); + case UCT_PERF_DATA_LAYOUT_SHORT_IOV: + set_sn(buffer, m_perf.uct.send_mem.mem_type, &sn); + return uct_ep_am_short_iov(ep, UCT_PERF_TEST_AM_ID, m_perf.uct.iov, + m_perf.params.msg_size_cnt); case UCT_PERF_DATA_LAYOUT_BCOPY: set_sn(buffer, m_perf.uct.send_mem.mem_type, &sn); packed_len = uct_ep_am_bcopy(ep, UCT_PERF_TEST_AM_ID, pack_cb, @@ -255,7 +256,7 @@ class uct_perf_test_runner { return (packed_len >= 0) ? 
UCS_OK : (ucs_status_t)packed_len; case UCT_PERF_DATA_LAYOUT_ZCOPY: set_sn(buffer, m_perf.uct.send_mem.mem_type, &sn); - header_size = m_perf.params.am_hdr_size; + header_size = m_perf.params.uct.am_hdr_size; return uct_ep_am_zcopy(ep, UCT_PERF_TEST_AM_ID, buffer, header_size, m_perf.uct.iov, m_perf.params.msg_size_cnt, 0, comp); @@ -263,7 +264,8 @@ class uct_perf_test_runner { return UCS_ERR_INVALID_PARAM; } case UCX_PERF_CMD_PUT: - if (TYPE == UCX_PERF_TEST_TYPE_PINGPONG) { + if ((TYPE == UCX_PERF_TEST_TYPE_PINGPONG) || + (TYPE == UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM)) { /* Put the control word at the latest byte of the IOV message */ set_sn(UCS_PTR_BYTE_OFFSET(buffer, uct_perf_get_buffer_extent(&m_perf.params) - 1), @@ -621,6 +623,7 @@ class uct_perf_test_runner { /* coverity[switch_selector_expr_is_constant] */ switch (TYPE) { case UCX_PERF_TEST_TYPE_PINGPONG: + case UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM: return run_pingpong(); case UCX_PERF_TEST_TYPE_STREAM_UNI: /* coverity[switch_selector_expr_is_constant] */ @@ -687,6 +690,7 @@ class uct_perf_test_runner { TEST_CASE(_perf, UCS_PP_TUPLE_0 _case, UCS_PP_TUPLE_1 _case, _data, false) #define TEST_CASE_ALL_DATA(_perf, _case) \ TEST_CASE_ALL_OSD(_perf, _case, UCT_PERF_DATA_LAYOUT_SHORT) \ + TEST_CASE_ALL_OSD(_perf, _case, UCT_PERF_DATA_LAYOUT_SHORT_IOV) \ TEST_CASE_ALL_OSD(_perf, _case, UCT_PERF_DATA_LAYOUT_BCOPY) \ TEST_CASE_ALL_OSD(_perf, _case, UCT_PERF_DATA_LAYOUT_ZCOPY) diff --git a/src/tools/perf/perftest.c b/src/tools/perf/perftest.c index 9973e29b6dd..dad3f260cc7 100644 --- a/src/tools/perf/perftest.c +++ b/src/tools/perf/perftest.c @@ -40,7 +40,7 @@ #define MAX_BATCH_FILES 32 #define MAX_CPUS 1024 #define TL_RESOURCE_NAME_NONE "" -#define TEST_PARAMS_ARGS "t:n:s:W:O:w:D:i:H:oSCIqM:r:T:d:x:A:BUm:" +#define TEST_PARAMS_ARGS "t:n:s:W:O:w:D:i:H:oSCIqM:r:E:T:d:x:A:BUem:" #define TEST_ID_UNDEFINED -1 enum { @@ -162,7 +162,13 @@ test_type_t tests[] = { {"stream_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_STREAM, UCX_PERF_TEST_TYPE_PINGPONG, "stream latency", "latency", 1}, - {NULL} + {"ucp_am_lat", UCX_PERF_API_UCP, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG, + "am latency", "latency", 1}, + + {"ucp_am_bw", UCX_PERF_API_UCP, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, + "am bandwidth / message rate", "overhead", 32}, + + {NULL} }; static int sock_io(int sock, ssize_t (*sock_call)(int, void *, size_t, int), @@ -296,6 +302,9 @@ static void print_header(struct perftest_context *ctx) case UCT_PERF_DATA_LAYOUT_SHORT: test_data_str = "short"; break; + case UCT_PERF_DATA_LAYOUT_SHORT_IOV: + test_data_str = "short iov"; + break; case UCT_PERF_DATA_LAYOUT_BCOPY: test_data_str = "bcopy"; break; @@ -320,6 +329,11 @@ static void print_header(struct perftest_context *ctx) printf("| Send memory: %-60s |\n", ucs_memory_type_names[ctx->params.super.send_mem_type]); printf("| Recv memory: %-60s |\n", ucs_memory_type_names[ctx->params.super.recv_mem_type]); printf("| Message size: %-60zu |\n", ucx_perf_get_message_size(&ctx->params.super)); + if ((test->api == UCX_PERF_API_UCP) && + (test->command == UCX_PERF_CMD_AM)) { + printf("| AM header size: %-60zu |\n", + ctx->params.super.ucp.am_hdr_size); + } } if (ctx->flags & TEST_FLAG_PRINT_CSV) { @@ -350,7 +364,7 @@ static void print_test_name(struct perftest_context *ctx) unsigned i, pos; if (!(ctx->flags & TEST_FLAG_PRINT_CSV) && (ctx->num_batch_files > 0)) { - strcpy(buf, "+--------------+---------+---------+---------+----------+----------+-----------+-----------+"); + strcpy(buf, 
"+--------------+--------------+---------+---------+---------+----------+----------+-----------+-----------+"); pos = 1; for (i = 0; i < ctx->num_batch_files; ++i) { @@ -447,14 +461,15 @@ static void usage(const struct perftest_context *ctx, const char *program) printf(" -d device to use for testing\n"); printf(" -x transport to use for testing\n"); printf(" -D data layout for sender side:\n"); - printf(" short - short messages (default, cannot be used for get)\n"); - printf(" bcopy - copy-out (cannot be used for atomics)\n"); - printf(" zcopy - zero-copy (cannot be used for atomics)\n"); - printf(" iov - scatter-gather list (iovec)\n"); + printf(" short - short messages (default, cannot be used for get)\n"); + printf(" shortiov - short io-vector messages (only for active messages)\n"); + printf(" bcopy - copy-out (cannot be used for atomics)\n"); + printf(" zcopy - zero-copy (cannot be used for atomics)\n"); + printf(" iov - scatter-gather list (iovec)\n"); printf(" -W flow control window size, for active messages (%u)\n", ctx->params.super.uct.fc_window); - printf(" -H active message header size (%zu)\n", - ctx->params.super.am_hdr_size); + printf(" -H active message header size (%zu), included in message size\n", + ctx->params.super.uct.am_hdr_size); printf(" -A asynchronous progress mode (thread_spinlock)\n"); printf(" thread_spinlock - separate progress thread with spin locking\n"); printf(" thread_mutex - separate progress thread with mutex locking\n"); @@ -475,6 +490,12 @@ static void usage(const struct perftest_context *ctx, const char *program) printf(" recv : Use ucp_stream_recv_nb\n"); printf(" recv_data : Use ucp_stream_recv_data_nb\n"); printf(" -I create context with wakeup feature enabled\n"); + printf(" -e create endpoints with error handling support\n"); + printf(" -E wait mode for tests\n"); + printf(" poll : repeatedly call worker_progress\n"); + printf(" sleep : go to sleep after posting requests\n"); + printf(" -H active message header size (%zu), not included in message size\n", + ctx->params.super.ucp.am_hdr_size); printf("\n"); printf(" NOTE: When running UCP tests, transport and device should be specified by\n"); printf(" environment variables: UCX_TLS and UCX_[SELF|SHM|NET]_DEVICES.\n"); @@ -590,7 +611,6 @@ static ucs_status_t init_test_params(perftest_params_t *params) params->super.wait_mode = UCX_PERF_WAIT_MODE_LAST; params->super.max_outstanding = 0; params->super.warmup_iter = 10000; - params->super.am_hdr_size = 8; params->super.alignment = ucs_get_page_size(); params->super.max_iter = 1000000l; params->super.max_time = 0.0; @@ -598,12 +618,14 @@ static ucs_status_t init_test_params(perftest_params_t *params) params->super.flags = UCX_PERF_TEST_FLAG_VERBOSE; params->super.uct.fc_window = UCT_PERF_TEST_MAX_FC_WINDOW; params->super.uct.data_layout = UCT_PERF_DATA_LAYOUT_SHORT; + params->super.uct.am_hdr_size = 8; params->super.send_mem_type = UCS_MEMORY_TYPE_HOST; params->super.recv_mem_type = UCS_MEMORY_TYPE_HOST; params->super.msg_size_cnt = 1; params->super.iov_stride = 0; params->super.ucp.send_datatype = UCP_PERF_DATATYPE_CONTIG; params->super.ucp.recv_datatype = UCP_PERF_DATATYPE_CONTIG; + params->super.ucp.am_hdr_size = 0; strcpy(params->super.uct.dev_name, TL_RESOURCE_NAME_NONE); strcpy(params->super.uct.tl_name, TL_RESOURCE_NAME_NONE); @@ -654,6 +676,8 @@ static ucs_status_t parse_test_params(perftest_params_t *params, char opt, case 'D': if (!strcmp(opt_arg, "short")) { params->super.uct.data_layout = UCT_PERF_DATA_LAYOUT_SHORT; + } else if 
(!strcmp(opt_arg, "shortiov")) { + params->super.uct.data_layout = UCT_PERF_DATA_LAYOUT_SHORT_IOV; } else if (!strcmp(opt_arg, "bcopy")) { params->super.uct.data_layout = UCT_PERF_DATA_LAYOUT_BCOPY; } else if (!strcmp(opt_arg, "zcopy")) { @@ -672,6 +696,18 @@ static ucs_status_t parse_test_params(perftest_params_t *params, char opt, return UCS_ERR_INVALID_PARAM; } return UCS_OK; + case 'E': + if (!strcmp(opt_arg, "poll")) { + params->super.wait_mode = UCX_PERF_WAIT_MODE_POLL; + return UCS_OK; + } else if (!strcmp(opt_arg, "sleep")) { + params->super.wait_mode = UCX_PERF_WAIT_MODE_SLEEP; + return UCS_OK; + } else { + ucs_error("Invalid option argument for -E"); + return UCS_ERR_INVALID_PARAM; + } + return UCS_OK; case 'i': params->super.iov_stride = atol(opt_arg); return UCS_OK; @@ -681,7 +717,8 @@ static ucs_status_t parse_test_params(perftest_params_t *params, char opt, case 's': return parse_message_sizes_params(opt_arg, ¶ms->super); case 'H': - params->super.am_hdr_size = atol(opt_arg); + params->super.uct.am_hdr_size = atol(opt_arg); + params->super.ucp.am_hdr_size = atol(opt_arg); return UCS_OK; case 'W': params->super.uct.fc_window = atoi(opt_arg); @@ -710,6 +747,9 @@ static ucs_status_t parse_test_params(perftest_params_t *params, char opt, case 'I': params->super.flags |= UCX_PERF_TEST_FLAG_WAKEUP; return UCS_OK; + case 'e': + params->super.flags |= UCX_PERF_TEST_FLAG_ERR_HANDLING; + return UCS_OK; case 'M': if (!strcmp(opt_arg, "single")) { params->super.thread_mode = UCS_THREAD_MODE_SINGLE; @@ -1433,8 +1473,6 @@ static ucx_perf_rte_t ext_rte = { static ucs_status_t setup_mpi_rte(struct perftest_context *ctx) { - ucs_trace_func(""); - #if defined (HAVE_MPI) static ucx_perf_rte_t mpi_rte = { .group_size = mpi_rte_group_size, @@ -1448,6 +1486,8 @@ static ucs_status_t setup_mpi_rte(struct perftest_context *ctx) int size, rank; + ucs_trace_func(""); + MPI_Comm_size(MPI_COMM_WORLD, &size); if (size != 2) { ucs_error("This test should run with exactly 2 processes (actual: %d)", size); @@ -1463,6 +1503,8 @@ static ucs_status_t setup_mpi_rte(struct perftest_context *ctx) ctx->params.super.rte = &mpi_rte; ctx->params.super.report_arg = ctx; #elif defined (HAVE_RTE) + ucs_trace_func(""); + ctx->params.rte_group = NULL; ctx->params.rte = &mpi_rte; ctx->params.report_arg = ctx; diff --git a/src/tools/vfs/Makefile.am b/src/tools/vfs/Makefile.am new file mode 100644 index 00000000000..62408f8aea7 --- /dev/null +++ b/src/tools/vfs/Makefile.am @@ -0,0 +1,17 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. +# +# See file LICENSE for terms. +# + +if HAVE_FUSE3 + +bin_PROGRAMS = ucx_vfs +ucx_vfs_CPPFLAGS = $(BASE_CPPFLAGS) $(FUSE3_CPPFLAGS) +ucx_vfs_CFLAGS = $(BASE_CFLAGS) +ucx_vfs_SOURCES = vfs_main.c vfs_server.c +noinst_HEADERS = vfs_daemon.h +ucx_vfs_LDADD = $(FUSE3_LIBS) \ + $(top_builddir)/src/ucs/vfs/sock/libucs_vfs_sock.la + +endif diff --git a/src/tools/vfs/vfs_daemon.h b/src/tools/vfs/vfs_daemon.h new file mode 100644 index 00000000000..094a1e182c1 --- /dev/null +++ b/src/tools/vfs/vfs_daemon.h @@ -0,0 +1,61 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef VFS_DAEMON_H_ +#define VFS_DAEMON_H_ + +#include +#include +#include +#include +#include +#include +#include + + +#define VFS_DEFAULT_MOUNTPOINT_DIR "/tmp/ucx" +#define VFS_FUSE_MOUNT_PROG "fusermount3" + + +enum { + VFS_DAEMON_ACTION_START = UCS_VFS_SOCK_ACTION_NOP +}; + + +#define vfs_error(_fmt, ...) 
\ + { \ + fprintf(stderr, "Error: " _fmt "\n", ##__VA_ARGS__); \ + } + + +#define vfs_log(_fmt, ...) \ + { \ + if (g_opts.verbose) { \ + fprintf(stderr, "Debug: " _fmt "\n", ##__VA_ARGS__); \ + } \ + } + + +typedef struct { + int action; + int foreground; + int verbose; + const char *mountpoint_dir; + const char *mount_opts; + const char *sock_path; +} vfs_opts_t; + + +extern vfs_opts_t g_opts; +extern const char *vfs_action_names[]; + +int vfs_mount(int pid); + +int vfs_unmount(int pid); + +int vfs_server_loop(int listen_fd); + +#endif diff --git a/src/tools/vfs/vfs_main.c b/src/tools/vfs/vfs_main.c new file mode 100644 index 00000000000..b6732c95454 --- /dev/null +++ b/src/tools/vfs/vfs_main.c @@ -0,0 +1,504 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "vfs_daemon.h" + +#include +#include +#include +#include +#include + + +vfs_opts_t g_opts = { + .action = VFS_DAEMON_ACTION_START, + .foreground = 0, + .verbose = 0, + .mountpoint_dir = VFS_DEFAULT_MOUNTPOINT_DIR, + .mount_opts = "", + .sock_path = NULL +}; + +const char *vfs_action_names[] = { + [UCS_VFS_SOCK_ACTION_STOP] = "stop", + [UCS_VFS_SOCK_ACTION_MOUNT] = "mount", + [VFS_DAEMON_ACTION_START] = "start" +}; + +static struct sockaddr_un g_sockaddr; + + +static int vfs_run_fusermount(char **extra_argv) +{ + char command[128]; + pid_t child_pid; + int ret, status; + int devnull_fd; + char *p, *endp; + char *argv[16]; + int i, argc; + + argc = 0; + argv[argc++] = VFS_FUSE_MOUNT_PROG; + if (!g_opts.verbose) { + argv[argc++] = "-q"; + } + while (*extra_argv != NULL) { + argv[argc++] = *(extra_argv++); + } + argv[argc++] = NULL; + assert(argc <= ucs_static_array_size(argv)); + + /* save the whole command to log */ + p = command; + endp = command + sizeof(command); + for (i = 0; argv[i] != NULL; ++i) { + snprintf(p, endp - p, "%s ", argv[i]); + p += strlen(p); + } + *(p - 1) = '\0'; + + vfs_log("exec '%s'", command); + + child_pid = fork(); + if (child_pid == -1) { + vfs_error("fork() failed: %m"); + return -1; + } + + if (child_pid == 0) { + if (!g_opts.verbose) { + devnull_fd = open("/dev/null", O_WRONLY); + if (devnull_fd < 0) { + vfs_error("failed to open /dev/null: %m"); + exit(1); + } + + dup2(devnull_fd, 1); + dup2(devnull_fd, 2); + close(devnull_fd); + } + execvp(argv[0], argv); + vfs_error("failed to execute '%s': %m", command); + exit(1); + } + + ret = waitpid(child_pid, &status, 0); + if (ret < 0) { + vfs_error("waitpid(%d) failed: %m", child_pid); + return -errno; + } else if (WIFEXITED(status) && (WEXITSTATUS(status) != 0)) { + vfs_error("'%s' exited with status %d", command, WEXITSTATUS(status)); + return -1; + } else if (!WIFEXITED(status)) { + vfs_error("'%s' did not exit properly (%d)", command, status); + return -1; + } + + return 0; +} + +static void vfs_get_mountpoint(pid_t pid, char *mountpoint, size_t max_length) +{ + snprintf(mountpoint, max_length, "%s/%d", g_opts.mountpoint_dir, pid); +} + +static const char *vfs_get_process_name(int pid, char *buf, size_t max_length) +{ + char procfs_comm[NAME_MAX]; + size_t length; + FILE *file; + char *p; + + /* open /proc/<pid>/comm to read command name */ + snprintf(procfs_comm, sizeof(procfs_comm), "/proc/%d/comm", pid); + file = fopen(procfs_comm, "r"); + if (file == NULL) { + goto err; + } + + /* read command to buffer */ + if (fgets(buf, max_length, file) == NULL) { + goto err_close; + } + + /* remove trailing space/newline */ + length =
strlen(buf); + for (p = &buf[length - 1]; (p >= buf) && isspace(*p); --p) { + *p = '\0'; + --length; + } + + /* append process id */ + snprintf(buf + length, max_length - length, "@pid:%d", pid); + fclose(file); + goto out; + +err_close: + fclose(file); +err: + snprintf(buf, max_length, "pid:%d", pid); +out: + return buf; +} + +int vfs_mount(int pid) +{ + char mountpoint[PATH_MAX]; + char mountopts[1024]; + char name[NAME_MAX]; + int fuse_fd, ret; + + /* Add common mount options: + * - File system name (source) : process name and pid + * - File system type : ucx_vfs + * - Enable permissions check : yes + * - Direct IO (no caching) : yes + */ + ret = snprintf( + mountopts, sizeof(mountopts), + "fsname=%s,subtype=ucx_vfs,default_permissions,direct_io%s%s", + vfs_get_process_name(pid, name, sizeof(name)), + (strlen(g_opts.mount_opts) > 0) ? "," : "", g_opts.mount_opts); + if (ret >= sizeof(mountopts)) { + return -ENOMEM; + } + + /* Create the mount point directory, and ignore "already exists" error */ + vfs_get_mountpoint(pid, mountpoint, sizeof(mountpoint)); + ret = mkdir(mountpoint, S_IRWXU); + if ((ret < 0) && (errno != EEXIST)) { + ret = -errno; + vfs_error("failed to create directory '%s': %m", mountpoint); + return ret; + } + + /* Mount a new FUSE filesystem in the mount point directory */ + vfs_log("mounting directory '%s' with options '%s'", mountpoint, mountopts); + fuse_fd = fuse_open_channel(mountpoint, mountopts); + if (fuse_fd < 0) { + vfs_error("fuse_open_channel(%s,opts=%s) failed: %m", mountpoint, + mountopts); + return fuse_fd; + } + + vfs_log("mounted directory '%s' with fd %d", mountpoint, fuse_fd); + return fuse_fd; +} + +int vfs_unmount(int pid) +{ + char mountpoint[PATH_MAX]; + char *argv[5]; + int ret; + + /* Unmount FUSE file system */ + vfs_get_mountpoint(pid, mountpoint, sizeof(mountpoint)); + argv[0] = "-u"; + argv[1] = "-z"; + argv[2] = "--"; + argv[3] = mountpoint; + argv[4] = NULL; + ret = vfs_run_fusermount(argv); + if (ret < 0) { + return ret; + } + + /* Remove mount point directory */ + vfs_log("removing directory '%s'", mountpoint); + ret = rmdir(mountpoint); + if (ret < 0) { + vfs_error("failed to remove directory '%s': %m", mountpoint); + return ret; + } + + return 0; +} + +static int vfs_unlink_socket(int silent_notexist) +{ + int ret; + + vfs_log("removing existing socket '%s'", g_sockaddr.sun_path); + + ret = unlink(g_sockaddr.sun_path); + if (ret < 0) { + ret = -errno; + if (silent_notexist && (errno == ENOENT)) { + vfs_log("could not unlink '%s': %m", g_sockaddr.sun_path); + } else { + vfs_error("could not unlink '%s': %m", g_sockaddr.sun_path); + } + return ret; + } + + return 0; +} + +/* return 0 or the (negative) value of errno in case of error */ +static int vfs_listen(int silent_addinuse_err) +{ + int listen_fd, ret; + + ret = umask(~S_IRWXU); + if (ret < 0) { + ret = -errno; + vfs_error("failed to set umask permissions: %m"); + goto out; + } + + listen_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (listen_fd < 0) { + ret = -errno; + vfs_error("failed to create listening socket: %m"); + goto out; + } + + ret = bind(listen_fd, (const struct sockaddr*)&g_sockaddr, + sizeof(g_sockaddr)); + if (ret < 0) { + ret = -errno; + if ((errno != EADDRINUSE) || !silent_addinuse_err) { + vfs_error("bind(%s) failed: %m", g_sockaddr.sun_path); + } + goto out_close; + } + + ret = listen(listen_fd, 128); + if (ret < 0) { + ret = -errno; + vfs_error("listen() failed: %m"); + goto out_unlink; + } + + vfs_log("listening for connections on '%s'", g_sockaddr.sun_path); + 
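+ /* serve client requests here; the loop below returns only on a 'stop' request, a termination signal, or a fatal poll error */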
ret = vfs_server_loop(listen_fd); + +out_unlink: + vfs_unlink_socket(0); +out_close: + close(listen_fd); +out: + return ret; +} + +/* return 0 or the (negative) value of errno in case of error */ +static int vfs_connect_and_act() +{ + ucs_vfs_sock_message_t vfs_msg_out; + int connfd; + int ret; + + vfs_log("connecting to '%s'", g_sockaddr.sun_path); + + connfd = socket(AF_UNIX, SOCK_STREAM, 0); + if (connfd < 0) { + ret = -errno; + vfs_error("failed to create connection socket: %m"); + goto out; + } + + ret = connect(connfd, (const struct sockaddr*)&g_sockaddr, + sizeof(g_sockaddr)); + if (ret < 0) { + ret = -errno; + if (errno == ECONNREFUSED) { + vfs_log("connect(%s) failed: %m", g_sockaddr.sun_path); + } else { + vfs_error("connect(%s) failed: %m", g_sockaddr.sun_path); + } + goto out_close; + } + + if (g_opts.action < UCS_VFS_SOCK_ACTION_LAST) { + vfs_log("sending action '%s'", vfs_action_names[g_opts.action]); + + /* send action */ + vfs_msg_out.action = g_opts.action; + ret = ucs_vfs_sock_send(connfd, &vfs_msg_out); + if (ret < 0) { + vfs_error("failed to send: %d", ret); + goto out_close; + } + + ret = 0; + } + +out_close: + close(connfd); +out: + return ret; +} + +/* return 0 or negative value in case of error */ +int vfs_start() +{ + int ret; + + ret = vfs_listen(1); + if (ret != -EADDRINUSE) { + return ret; + } + + /* Failed to listen because 'socket_name' path already exists - try to + * connect */ + ret = vfs_connect_and_act(); + if (ret != -ECONNREFUSED) { + return ret; + } + + /* Could not connect to the socket because no one is listening - remove the + * socket file and try listening again */ + ret = vfs_unlink_socket(0); + if (ret < 0) { + return ret; + } + + return vfs_listen(0); +} + +static void vfs_usage() +{ + struct sockaddr_un sock_addr = {}; + + ucs_vfs_sock_get_address(&sock_addr); + printf("Usage: ucx_vfs [options] [action]\n"); + printf("\n"); + printf("Options:\n"); + printf(" -d Set parent directory for mount points (default: %s)\n", + g_opts.mountpoint_dir); + printf(" -o Pass these mount options to mount.fuse\n"); + printf(" -f Do not daemonize; run in foreground\n"); + printf(" -v Enable verbose logging (requires -f)\n"); + printf(" -l Set listening unix socket path (default: %s)\n", + sock_addr.sun_path); + printf("\n"); + printf("Actions:\n"); + printf(" start Run the daemon and listen for connection from UCX\n"); + printf(" If a daemon is already running, do nothing\n"); + printf(" This is the default action.\n"); + printf(" stop Stop the running daemon\n"); + printf("\n"); +} + +static int vfs_parse_args(int argc, char **argv) +{ + const char *action_str; + int c, i; + + while ((c = getopt(argc, argv, "d:o:vfl:h")) != -1) { + switch (c) { + case 'd': + g_opts.mountpoint_dir = optarg; + break; + case 'o': + g_opts.mount_opts = optarg; + break; + case 'v': + ++g_opts.verbose; + break; + case 'f': + g_opts.foreground = 1; + break; + case 'l': + g_opts.sock_path = optarg; + break; + case 'h': + default: + vfs_usage(); + return -127; + } + } + + if (g_opts.verbose && !g_opts.foreground) { + vfs_error("Option -v requires -f"); + vfs_usage(); + return -1; + } + + if (optind < argc) { + action_str = argv[optind]; + g_opts.action = UCS_VFS_SOCK_ACTION_LAST; + for (i = 0; i < ucs_static_array_size(vfs_action_names); ++i) { + if ((vfs_action_names[i] != NULL) && + !strcmp(action_str, vfs_action_names[i])) { + g_opts.action = i; + } + } + if (g_opts.action == UCS_VFS_SOCK_ACTION_LAST) { + vfs_error("invalid action '%s'", action_str); + vfs_usage(); + return -1; + }
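+ /* consume the recognized action argument; any extra positional argument is rejected below */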
++optind; + } + + if (optind < argc) { + vfs_error("only one action can be specified"); + vfs_usage(); + return -1; + } + + return 0; +} + +static int vfs_test_fuse() +{ + char *argv[] = {"-V", NULL}; + return vfs_run_fusermount(argv); +} + +int main(int argc, char **argv) +{ + int ret; + + ret = vfs_parse_args(argc, argv); + if (ret < 0) { + return -1; + } + + ret = vfs_test_fuse(); + if (ret < 0) { + return -1; + } + + ret = mkdir(g_opts.mountpoint_dir, S_IRWXU); + if ((ret < 0) && (errno != EEXIST)) { + vfs_error("could not create directory '%s': %m", g_opts.mountpoint_dir); + return -1; + } + + if (!g_opts.foreground) { + fuse_daemonize(0); + } + + if (g_opts.sock_path == NULL) { + ret = ucs_vfs_sock_get_address(&g_sockaddr); + if (ret < 0) { + vfs_error("failed to initialize socket address: %d", ret); + return -1; + } + } else { + g_sockaddr.sun_family = AF_UNIX; + memset(g_sockaddr.sun_path, 0, sizeof(g_sockaddr.sun_path)); + strncpy(g_sockaddr.sun_path, g_opts.sock_path, + sizeof(g_sockaddr.sun_path) - 1); + } + + switch (g_opts.action) { + case VFS_DAEMON_ACTION_START: + return vfs_start(); + case UCS_VFS_SOCK_ACTION_STOP: + return vfs_connect_and_act(); + default: + vfs_error("unexpected action %d", g_opts.action); + return -1; + } +} diff --git a/src/tools/vfs/vfs_server.c b/src/tools/vfs/vfs_server.c new file mode 100644 index 00000000000..e6a62541fbf --- /dev/null +++ b/src/tools/vfs/vfs_server.c @@ -0,0 +1,378 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "vfs_daemon.h" + +#include +#include +#include + + +#define VFS_MAX_FDS 1024 + +typedef enum { + VFS_FD_STATE_LISTENING, + VFS_FD_STATE_ACCEPTED, + VFS_FD_STATE_MOUNTED, + VFS_FD_STATE_FD_SENT, + VFS_FD_STATE_CLOSED +} vfs_socket_state_t; + +typedef struct { + vfs_socket_state_t state; + pid_t pid; + int fuse_fd; +} vfs_serever_fd_state_t; + +typedef struct { + vfs_serever_fd_state_t fd_state[VFS_MAX_FDS]; + struct pollfd poll_fds[VFS_MAX_FDS]; + int nfds; + int stop; +} vfs_server_context_t; + +static vfs_server_context_t vfs_server_context; + +static const char *vfs_server_fd_state_names[] = { + [VFS_FD_STATE_LISTENING] = "LISTENING", + [VFS_FD_STATE_ACCEPTED] = "ACCEPTED", + [VFS_FD_STATE_MOUNTED] = "MOUNTED", + [VFS_FD_STATE_FD_SENT] = "FD_SENT", + [VFS_FD_STATE_CLOSED] = "CLOSED" +}; + +static void vfs_server_log_context(int events) +{ + vfs_serever_fd_state_t *fd_state; + char log_message[1024]; + struct pollfd *pfd; + char *p, *endp; + int idx; + + if (g_opts.verbose < 2) { + return; + } + + p = log_message; + endp = log_message + sizeof(log_message); + + for (idx = 0; idx < vfs_server_context.nfds; ++idx) { + pfd = &vfs_server_context.poll_fds[idx]; + fd_state = &vfs_server_context.fd_state[idx]; + snprintf(p, endp - p, "[%d]{%c %d%s%s %d} ", idx, + vfs_server_fd_state_names[fd_state->state][0], + vfs_server_context.poll_fds[idx].fd, + (events && (pfd->revents & POLLIN)) ? "i" : "", + (events && (pfd->revents & POLLOUT)) ? 
"o" : "", + fd_state->pid); + p += strlen(p); + } + + if (p == log_message) { + vfs_log(""); + } else { + *(p - 1) = '\0'; + vfs_log("%s", log_message); + } +} + +static int vfs_server_poll_events() +{ + int ret; + + vfs_server_log_context(0); + + ret = poll(vfs_server_context.poll_fds, vfs_server_context.nfds, -1); + if (ret < 0) { + ret = -errno; + if (errno != EINTR) { + vfs_error("poll(nfds=%d) failed: %m", vfs_server_context.nfds) + } + return ret; + } + + vfs_server_log_context(1); + return 0; +} + +static void vfs_server_close_fd(int fd) +{ + int ret = close(fd); + if (ret < 0) { + vfs_error("failed to close fd %d: %m", fd); + } +} + +static void vfs_server_log_fd(int idx, const char *message) +{ + vfs_serever_fd_state_t *fd_state = &vfs_server_context.fd_state[idx]; + struct pollfd *pfd = &vfs_server_context.poll_fds[idx]; + + vfs_log("%s fd[%d]=%d %s, pid: %d fuse_fd: %d", message, idx, pfd->fd, + vfs_server_fd_state_names[fd_state->state], fd_state->fuse_fd, + fd_state->pid); +} + +static int vfs_server_add_fd(int fd, vfs_socket_state_t state) +{ + int idx, ret; + + ret = fcntl(fd, F_GETFL); + if (ret < 0) { + vfs_error("fcntl(%d, F_GETFL) failed: %m", fd); + return -errno; + } + + ret = fcntl(fd, F_SETFL, ret | O_NONBLOCK); + if (ret < 0) { + vfs_error("fcntl(%d, F_SETFL) failed: %m", fd); + return -errno; + } + + idx = vfs_server_context.nfds++; + vfs_server_context.fd_state[idx].state = state; + vfs_server_context.fd_state[idx].pid = -1; + vfs_server_context.fd_state[idx].fuse_fd = -1; + vfs_server_context.poll_fds[idx].events = POLLIN; + vfs_server_context.poll_fds[idx].fd = fd; + vfs_server_context.poll_fds[idx].revents = 0; + + vfs_server_log_fd(idx, "added"); + return 0; +} + +static void vfs_server_remove_fd(int idx) +{ + vfs_server_log_fd(idx, "removing"); + + switch (vfs_server_context.fd_state[idx].state) { + case VFS_FD_STATE_FD_SENT: + case VFS_FD_STATE_MOUNTED: + vfs_server_close_fd(vfs_server_context.fd_state[idx].fuse_fd); + vfs_unmount(vfs_server_context.fd_state[idx].pid); + /* Fall through */ + case VFS_FD_STATE_ACCEPTED: + vfs_server_close_fd(vfs_server_context.poll_fds[idx].fd); + /* Fall through */ + default: + break; + } + + vfs_server_context.fd_state[idx].state = VFS_FD_STATE_CLOSED; + vfs_server_context.fd_state[idx].pid = -1; + vfs_server_context.fd_state[idx].fuse_fd = -1; + vfs_server_context.poll_fds[idx].events = 0; + vfs_server_context.poll_fds[idx].fd = -1; + vfs_server_context.poll_fds[idx].revents = 0; +} + +static void vfs_server_remove_all_fds() +{ + while (vfs_server_context.nfds > 0) { + vfs_server_remove_fd(--vfs_server_context.nfds); + } +} + +static void vfs_server_accept(int listen_fd) +{ + int ret, connfd; + + connfd = accept(listen_fd, NULL, NULL); + if (connfd < 0) { + vfs_error("accept(listen_fd=%d) failed: %m", listen_fd); + return; + } + + ret = ucs_vfs_sock_setopt_passcred(connfd); + if (ret < 0) { + close(connfd); + return; + } + + vfs_server_add_fd(connfd, VFS_FD_STATE_ACCEPTED); +} + +static void vfs_server_mount(int idx, pid_t pid) +{ + int fuse_fd; + + if (pid < 0) { + vfs_error("received invalid pid: %d", pid); + vfs_server_remove_fd(idx); + return; + } + + fuse_fd = vfs_mount(pid); + if (fuse_fd < 0) { + vfs_server_remove_fd(idx); + return; + } + + vfs_server_context.fd_state[idx].state = VFS_FD_STATE_MOUNTED; + vfs_server_context.fd_state[idx].pid = pid; + vfs_server_context.fd_state[idx].fuse_fd = fuse_fd; + vfs_server_context.poll_fds[idx].events |= POLLOUT; +} + +static void vfs_server_recv(int idx) +{ + 
ucs_vfs_sock_message_t vfs_msg_in; + char message[64]; + int ret; + + ret = ucs_vfs_sock_recv(vfs_server_context.poll_fds[idx].fd, &vfs_msg_in); + if (ret < 0) { + vfs_error("failed to receive a message: %d (%s)", ret, strerror(-ret)); + vfs_server_remove_fd(idx); + return; + } + + snprintf(message, sizeof(message), "got action '%s' on", + vfs_action_names[vfs_msg_in.action]); + vfs_server_log_fd(idx, message); + + switch (vfs_msg_in.action) { + case UCS_VFS_SOCK_ACTION_STOP: + vfs_server_context.stop = 1; + break; + case UCS_VFS_SOCK_ACTION_MOUNT: + vfs_server_mount(idx, vfs_msg_in.pid); + break; + case UCS_VFS_SOCK_ACTION_NOP: + vfs_server_remove_fd(idx); + break; + default: + vfs_error("ignoring invalid action %d", vfs_msg_in.action); + vfs_server_remove_fd(idx); + break; + } +} + +static void vfs_server_handle_pollin(int idx) +{ + switch (vfs_server_context.fd_state[idx].state) { + case VFS_FD_STATE_LISTENING: + vfs_server_accept(vfs_server_context.poll_fds[idx].fd); + break; + case VFS_FD_STATE_ACCEPTED: + vfs_server_recv(idx); + break; + case VFS_FD_STATE_FD_SENT: + vfs_server_remove_fd(idx); + break; + default: + vfs_server_log_fd(idx, "unexpected POLLIN event on"); + vfs_server_remove_fd(idx); + break; + } +} + +static void vfs_server_handle_pollout(int idx) +{ + ucs_vfs_sock_message_t vfs_msg_out; + int ret; + + if (vfs_server_context.fd_state[idx].state != VFS_FD_STATE_MOUNTED) { + vfs_server_log_fd(idx, "unexpected POLLOUT event on"); + vfs_server_remove_fd(idx); + return; + } + + /* Send reply with file descriptor from fuse mount */ + vfs_msg_out.action = UCS_VFS_SOCK_ACTION_MOUNT_REPLY; + vfs_msg_out.fd = vfs_server_context.fd_state[idx].fuse_fd; + ret = ucs_vfs_sock_send(vfs_server_context.poll_fds[idx].fd, &vfs_msg_out); + if (ret < 0) { + vfs_error("failed to send a message: %d", ret); + vfs_server_remove_fd(idx); + return; + } + + vfs_server_log_fd(idx, "sent fuse_fd on"); + vfs_server_context.fd_state[idx].state = VFS_FD_STATE_FD_SENT; + vfs_server_context.poll_fds[idx].events &= ~POLLOUT; +} + +static void vfs_server_copy_fd_state(int dest_idx, int src_idx) +{ + if (dest_idx != src_idx) { + vfs_server_context.fd_state[dest_idx] = + vfs_server_context.fd_state[src_idx]; + vfs_server_context.poll_fds[dest_idx] = + vfs_server_context.poll_fds[src_idx]; + } +} + +static void vfs_server_sighandler(int signo) +{ + vfs_server_context.stop = 1; +} + +static void vfs_server_set_sighandler() +{ + struct sigaction sigact; + + sigact.sa_handler = vfs_server_sighandler; + sigact.sa_flags = 0; + sigemptyset(&sigact.sa_mask); + + sigaction(SIGINT, &sigact, NULL); + sigaction(SIGHUP, &sigact, NULL); + sigaction(SIGTERM, &sigact, NULL); +} + +int vfs_server_loop(int listen_fd) +{ + int idx, valid_idx; + int ret; + + vfs_server_context.nfds = 0; + vfs_server_context.stop = 0; + + vfs_server_set_sighandler(); + + vfs_server_add_fd(listen_fd, VFS_FD_STATE_LISTENING); + + while (!vfs_server_context.stop) { + ret = vfs_server_poll_events(); + if (ret < 0) { + if (ret == -EINTR) { + continue; + } else { + return ret; + } + } + + valid_idx = 0; + for (idx = 0; idx < vfs_server_context.nfds; ++idx) { + if (vfs_server_context.poll_fds[idx].revents == 0) { + vfs_server_copy_fd_state(valid_idx++, idx); + continue; + } + + if (vfs_server_context.poll_fds[idx].revents & POLLIN) { + vfs_server_handle_pollin(idx); + } + if (vfs_server_context.poll_fds[idx].revents & POLLOUT) { + vfs_server_handle_pollout(idx); + } + + if (vfs_server_context.fd_state[idx].state != VFS_FD_STATE_CLOSED) {
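+ /* entry is still in use - compact it toward the front of the poll arrays; closed entries are skipped and thereby dropped */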
vfs_server_copy_fd_state(valid_idx++, idx); + } + } + + vfs_server_context.nfds = valid_idx; + } + + vfs_server_remove_all_fds(); + + return 0; +} diff --git a/src/ucm/api/ucm.h b/src/ucm/api/ucm.h index 6308c369e89..65ff4b29a3b 100644 --- a/src/ucm/api/ucm.h +++ b/src/ucm/api/ucm.h @@ -37,6 +37,7 @@ typedef enum ucm_event_type { UCM_EVENT_SHMDT = UCS_BIT(4), UCM_EVENT_SBRK = UCS_BIT(5), UCM_EVENT_MADVISE = UCS_BIT(6), + UCM_EVENT_BRK = UCS_BIT(7), /* Aggregate events */ UCM_EVENT_VM_MAPPED = UCS_BIT(16), @@ -161,6 +162,15 @@ typedef union ucm_event { int advice; } madvise; + /* + * UCM_EVENT_BRK + * brk() is called. + */ + struct { + int result; + void *addr; + } brk; + /* * UCM_EVENT_VM_MAPPED, UCM_EVENT_VM_UNMAPPED * @@ -203,7 +213,7 @@ typedef struct ucm_global_config { ucm_mmap_hook_mode_t mmap_hook_mode; /* MMAP hook mode */ int enable_malloc_hooks; /* Enable installing malloc hooks */ int enable_malloc_reloc; /* Enable installing malloc relocations */ - int enable_cuda_reloc; /* Enable installing CUDA relocations */ + ucm_mmap_hook_mode_t cuda_hook_mode; /* Cuda hooks mode */ int enable_dynamic_mmap_thresh; /* Enable adaptive mmap threshold */ size_t alloc_alignment; /* Alignment for memory allocations */ int dlopen_process_rpath; /* Process RPATH section in dlopen hook */ @@ -211,7 +221,10 @@ typedef struct ucm_global_config { } ucm_global_config_t; -/* Global UCM configuration */ +/* + * Global UCM configuration to be set externally. + * @deprecated replaced by @ref ucm_library_init. + */ extern ucm_global_config_t ucm_global_opts; @@ -253,6 +266,17 @@ typedef void (*ucm_event_callback_t)(ucm_event_type_t event_type, ucm_event_t *event, void *arg); +/** + * Initialize UCM library and set its configuration. + * + * @param [in] ucm_opts UCM library global configuration. If NULL, default + * configuration is applied. + * + * @note Calling this function more than once in the same process has no effect. + */ +void ucm_library_init(const ucm_global_config_t *ucm_opts); + + /** * @brief Install a handler for memory events. * diff --git a/src/ucm/bistro/bistro.c b/src/ucm/bistro/bistro.c index 51a807e91d5..c31741aa8d5 100644 --- a/src/ucm/bistro/bistro.c +++ b/src/ucm/bistro/bistro.c @@ -10,13 +10,16 @@ #include #include +#include #include #include +#include + ucs_status_t ucm_bistro_remove_restore_point(ucm_bistro_restore_point_t *rp) { - ucs_assert(rp != NULL); + ucm_assert(rp != NULL); free(rp); return UCS_OK; } @@ -103,8 +106,52 @@ ucs_status_t ucm_bistro_restore(ucm_bistro_restore_point_t *rp) void *ucm_bistro_restore_addr(ucm_bistro_restore_point_t *rp) { - ucs_assert(rp != NULL); + ucm_assert(rp != NULL); return rp->addr; } +void *ucm_bistro_allocate_code(size_t size) +{ + static const size_t mmap_size = 16 * UCS_KBYTE; + static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + static void *mem_area = MAP_FAILED; + static size_t alloc_offset = 0; + size_t alloc_size; + void *result; + + pthread_mutex_lock(&mutex); + + if (mem_area == MAP_FAILED) { + /* Allocate executable memory block once, and reuse it for + * subsequent allocations. We assume bistro would not really need + * more than 'mmap_size' in total, since it's used for limited number + * of library functions. Also, the memory is never really released, so + * our allocator is very simple. 
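+ * Later requests are carved out of the same mapping by advancing + * 'alloc_offset', and NULL is returned once the remaining space runs out.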
+ */ + mem_area = mmap(NULL, ucs_align_up_pow2(mmap_size, ucm_get_page_size()), + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mem_area == MAP_FAILED) { + ucm_error("failed to allocate executable memory of %zu bytes: %m", + mmap_size); + result = NULL; + goto out; + } + } + + alloc_size = ucs_align_up_pow2(size, UCS_SYS_PARAGRAPH_SIZE); + if ((alloc_size + alloc_offset) > mmap_size) { + result = NULL; + goto out; + } + + /* Allocate next memory block in the mmap-ed area */ + result = UCS_PTR_BYTE_OFFSET(mem_area, alloc_offset); + alloc_offset += alloc_size; + +out: + pthread_mutex_unlock(&mutex); + return result; +} + #endif diff --git a/src/ucm/bistro/bistro.h b/src/ucm/bistro/bistro.h index 16e988700c3..101000455e6 100644 --- a/src/ucm/bistro/bistro.h +++ b/src/ucm/bistro/bistro.h @@ -36,6 +36,7 @@ typedef struct ucm_bistro_restore_point ucm_bistro_restore_point_t; */ ucs_status_t ucm_bistro_restore(ucm_bistro_restore_point_t *rp); + /** * Remove restore point created by @ref ucm_bistro_patch without * restoring the original function body @@ -46,6 +47,7 @@ ucs_status_t ucm_bistro_restore(ucm_bistro_restore_point_t *rp); */ ucs_status_t ucm_bistro_remove_restore_point(ucm_bistro_restore_point_t *rp); + /** * Get patch address for restore point * @@ -55,4 +57,15 @@ ucs_status_t ucm_bistro_remove_restore_point(ucm_bistro_restore_point_t *rp); void *ucm_bistro_restore_addr(ucm_bistro_restore_point_t *rp); + +/** + * Allocate executable memory which can be used to create trampolines or + * temporary functions. + * + * @param size Memory size to allocate + * + * @return Pointer to allocated memory, or NULL if failed. + */ +void *ucm_bistro_allocate_code(size_t size); + #endif diff --git a/src/ucm/bistro/bistro_aarch64.c b/src/ucm/bistro/bistro_aarch64.c index e161292f9ae..8659c0ee6ab 100644 --- a/src/ucm/bistro/bistro_aarch64.c +++ b/src/ucm/bistro/bistro_aarch64.c @@ -59,12 +59,10 @@ */ #define BR(_reg) ((0xd61f << 16) + ((_reg) << 5)) -ucs_status_t ucm_bistro_patch(const char *symbol, void *hook, +ucs_status_t ucm_bistro_patch(void *func_ptr, void *hook, const char *symbol, + void **orig_func_p, ucm_bistro_restore_point_t **rp) { - void *func; - ucs_status_t status; - ucm_bistro_patch_t patch = { .reg3 = MOVZ(R15, 3, (uintptr_t)hook >> 48), .reg2 = MOVK(R15, 2, (uintptr_t)hook >> 32), @@ -72,15 +70,18 @@ ucs_status_t ucm_bistro_patch(const char *symbol, void *hook, .reg0 = MOVK(R15, 0, (uintptr_t)hook), .br = BR(R15) }; + ucs_status_t status; - UCM_LOOKUP_SYMBOL(func, symbol); + if (orig_func_p != NULL) { + return UCS_ERR_UNSUPPORTED; + } - status = ucm_bistro_create_restore_point(func, sizeof(patch), rp); + status = ucm_bistro_create_restore_point(func_ptr, sizeof(patch), rp); if (UCS_STATUS_IS_ERR(status)) { return status; } - return ucm_bistro_apply_patch(func, &patch, sizeof(patch)); + return ucm_bistro_apply_patch(func_ptr, &patch, sizeof(patch)); } #endif diff --git a/src/ucm/bistro/bistro_aarch64.h b/src/ucm/bistro/bistro_aarch64.h index 487aa923d08..b0770b30fe7 100644 --- a/src/ucm/bistro/bistro_aarch64.h +++ b/src/ucm/bistro/bistro_aarch64.h @@ -28,14 +28,19 @@ typedef struct ucm_bistro_patch { * Set library function call hook using Binary Instrumentation * method (BISTRO): replace function body by user defined call * - * @param symbol function name to replace - * @param hook user-defined function-replacer - * @param rp restore point used to restore original function, - * optional, may be NULL + * @param func_ptr Pointer to function to patch.
+ * @param hook User-defined function-replacer. + * @param symbol Function name to replace. + * @param orig_func_p Unsupported on this architecture and must be NULL. + * If set to a non-NULL value, this function returns + * @ref UCS_ERR_UNSUPPORTED. + * @param rp Restore point used to restore original function. + * Optional, may be NULL. * * @return Error code as defined by @ref ucs_status_t */ -ucs_status_t ucm_bistro_patch(const char *symbol, void *hook, +ucs_status_t ucm_bistro_patch(void *func_ptr, void *hook, const char *symbol, + void **orig_func_p, ucm_bistro_restore_point_t **rp); #endif diff --git a/src/ucm/bistro/bistro_int.h b/src/ucm/bistro/bistro_int.h index e6c08a4994a..80b423b8cd7 100644 --- a/src/ucm/bistro/bistro_int.h +++ b/src/ucm/bistro/bistro_int.h @@ -33,17 +33,4 @@ ucs_status_t ucm_bistro_apply_patch(void *dst, void *patch, size_t len); ucs_status_t ucm_bistro_create_restore_point(void *addr, size_t len, ucm_bistro_restore_point_t **rp); -static inline void *ucm_bistro_lookup(const char *symbol) -{ - void *addr; - - ucs_assert(symbol != NULL); - - addr = dlsym(RTLD_NEXT, symbol); - if (!addr) { - addr = dlsym(RTLD_DEFAULT, symbol); - } - return addr; -} - #endif diff --git a/src/ucm/bistro/bistro_ppc64.c b/src/ucm/bistro/bistro_ppc64.c index 4b14250cd97..942346c8c07 100644 --- a/src/ucm/bistro/bistro_ppc64.c +++ b/src/ucm/bistro/bistro_ppc64.c @@ -76,7 +76,7 @@ struct ucm_bistro_restore_point { static void ucm_bistro_fill_base_patch(ucm_bistro_base_patch_t *patch, uint32_t reg, uintptr_t value) { - ucs_assert(patch != NULL); + ucm_assert(patch != NULL); patch->addis = ADDIS ( reg, 0, (value >> 48)); patch->ori1 = ORI ( reg, reg, (value >> 32)); @@ -88,7 +88,7 @@ static void ucm_bistro_fill_base_patch(ucm_bistro_base_patch_t *patch, static void ucm_bistro_fill_patch(ucm_bistro_patch_t *patch, uint32_t reg, uintptr_t value) { - ucs_assert(patch != NULL); + ucm_assert(patch != NULL); ucm_bistro_fill_base_patch(&patch->super, reg, value); @@ -134,7 +134,7 @@ static void *ucm_bistro_get_text_addr(void *addr) #endif } -ucs_status_t ucm_bistro_patch_toc(const char *symbol, void *hook, +ucs_status_t ucm_bistro_patch_toc(void *func_ptr, void *hook, ucm_bistro_restore_point_t **rp, uint64_t toc) { @@ -143,11 +143,9 @@ ucs_status_t ucm_bistro_patch_toc(const char *symbol, void *hook, ucm_bistro_restore_point_t restore; ucm_bistro_patch_t patch; - UCM_LOOKUP_SYMBOL(func, symbol); + restore.entry = func_ptr; - restore.entry = func; - - func = ucm_bistro_get_text_addr(func); + func = ucm_bistro_get_text_addr(func_ptr); hook = ucm_bistro_get_text_addr(hook); status = ucm_bistro_patch_hook(hook, &restore, toc); @@ -185,7 +183,7 @@ ucs_status_t ucm_bistro_restore(ucm_bistro_restore_point_t *rp) { ucs_status_t status; - ucs_assert(rp != NULL); + ucm_assert(rp != NULL); status = ucm_bistro_apply_patch(rp->func, &rp->func_patch, sizeof(rp->func_patch)); if (UCS_STATUS_IS_ERR(status)) { @@ -202,7 +200,7 @@ ucs_status_t ucm_bistro_restore(ucm_bistro_restore_point_t *rp) void *ucm_bistro_restore_addr(ucm_bistro_restore_point_t *rp) { - ucs_assert(rp != NULL); + ucm_assert(rp != NULL); return rp->entry; } diff --git a/src/ucm/bistro/bistro_ppc64.h b/src/ucm/bistro/bistro_ppc64.h index 7b5c3b46e7e..e0d4635ca5a 100644 --- a/src/ucm/bistro/bistro_ppc64.h +++ b/src/ucm/bistro/bistro_ppc64.h @@ -4,13 +4,15 @@ * See file LICENSE for terms. 
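Stepping back to the aarch64 implementation above: the patch is five fixed instructions, one movz plus three movk that materialize the 64-bit hook address in x15, followed by br x15. A hedged sketch of the raw encodings, using standard ARMv8 instruction formats (MOVZ_X/MOVK_X/BR_X are local illustration macros, not UCM's):

    #include <stdint.h>

    /* movz/movk x<rd>, #imm16, lsl #(16 * hw), and br x<rn> */
    #define MOVZ_X(_hw, _imm16, _rd) \
        (0xd2800000u | ((_hw) << 21) | (((uint32_t)(_imm16) & 0xffff) << 5) | (_rd))
    #define MOVK_X(_hw, _imm16, _rd) \
        (0xf2800000u | ((_hw) << 21) | (((uint32_t)(_imm16) & 0xffff) << 5) | (_rd))
    #define BR_X(_rn) (0xd61f0000u | ((_rn) << 5))

    /* Same ordering as the patch above: movz sets the top 16 bits and
     * clears the rest, then each movk fills in one 16-bit chunk */
    static void build_jump_patch(uint32_t insn[5], uintptr_t hook)
    {
        insn[0] = MOVZ_X(3, hook >> 48, 15);
        insn[1] = MOVK_X(2, hook >> 32, 15);
        insn[2] = MOVK_X(1, hook >> 16, 15);
        insn[3] = MOVK_X(0, hook, 15);
        insn[4] = BR_X(15);   /* jump to the address now held in x15 */
    }

Since x15 is a caller-saved scratch register that plays no role in argument passing, the hook still receives the original arguments untouched. The aarch64 port does not implement a prologue-copying trampoline, hence the UCS_ERR_UNSUPPORTED returned above when orig_func_p is requested.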
*/ - #ifndef UCM_BISTRO_BISTRO_PPC64_H_ #define UCM_BISTRO_BISTRO_PPC64_H_ -#include +#include "bistro.h" #include +#include +#include + /* special processing for ppc64 to save and restore TOC (r2) * Reference: "64-bit PowerPC ELF Application Binary Interface Supplement 1.9" */ @@ -26,26 +28,31 @@ * Set library function call hook using Binary Instrumentation * method (BISTRO): replace function body by user defined call * - * @param symbol function name to replace - * @param hook user-defined function-replacer - * @param rp restore point used to restore original function, - * optional, may be NULL + * @param func_ptr Function to patch. + * @param hook User-defined function-replacer. + * @param rp Restore point used to restore original function. + Optional, may be NULL. * * @return Error code as defined by @ref ucs_status_t */ /* we have to use inline proxy call to save TOC register * value - PPC is very sensible to this register value */ -ucs_status_t ucm_bistro_patch_toc(const char *symbol, void *hook, +ucs_status_t ucm_bistro_patch_toc(void *func_ptr, void *hook, ucm_bistro_restore_point_t **rp, uint64_t toc); -static inline -ucs_status_t ucm_bistro_patch(const char *symbol, void *hook, - ucm_bistro_restore_point_t **rp) +static inline ucs_status_t +ucm_bistro_patch(void *func_ptr, void *hook, const char *symbol, + void **orig_func_p, ucm_bistro_restore_point_t **rp) { uint64_t toc; + + if (orig_func_p != NULL) { + return UCS_ERR_UNSUPPORTED; + } + asm volatile ("std 2, %0" : "=m" (toc)); - return ucm_bistro_patch_toc(symbol, hook, rp, toc); + return ucm_bistro_patch_toc(func_ptr, hook, rp, toc); } #endif diff --git a/src/ucm/bistro/bistro_x86_64.c b/src/ucm/bistro/bistro_x86_64.c index da67dbc517f..5b0f7ace0f6 100644 --- a/src/ucm/bistro/bistro_x86_64.c +++ b/src/ucm/bistro/bistro_x86_64.c @@ -5,16 +5,16 @@ */ #ifdef HAVE_CONFIG_H -# include "config.h" +#include "config.h" #endif /* ******************************************************* * x86 processors family * * ***************************************************** */ + #if defined(__x86_64__) #include -#include #include #include @@ -22,47 +22,218 @@ #include #include #include -#include -#include -ucs_status_t ucm_bistro_patch(const char *symbol, void *hook, +typedef struct { + void *jmp_addr; + char code[]; +} ucm_bistro_orig_func_t; + +typedef struct { + uint8_t opcode; /* 0xff */ + uint8_t modrm; /* 0x25 */ + int32_t displ; +} UCS_S_PACKED ucm_bistro_jmp_indirect_t; + + +/* REX prefix */ +#define UCM_BISTRO_X86_REX_MASK 0xF0 /* Mask */ +#define UCM_BISTRO_X86_REX 0x40 /* Value */ + +#define UCM_BISTRO_X86_REX_W 0x48 /* REX.W value */ +#define UCM_BISTRO_X86_REX_B 0x41 /* REX.B value */ + +/* PUSH general register + * "push $reg" + */ +#define UCM_BISTRO_X86_PUSH_R_MASK 0xF0 /* Mask */ +#define UCM_BISTRO_X86_PUSH_R 0x50 /* Value */ + +/* Immediate Grp 1(1A), Ev, Iz */ +#define UCM_BISTRO_X86_IMM_GRP1_EV_IZ 0x81 + +/* MOV Ev,Gv */ +#define UCM_BISTRO_X86_MOV_EV_GV 0x89 + +/* MOV immediate word or double into word, double, or quad register + * "mov $imm32, %reg" + */ +#define UCM_BISTRO_X86_MOV_IR_MASK 0xF8 /* Mask */ +#define UCM_BISTRO_X86_MOV_IR 0xB8 /* Value */ + +/* ModR/M encoding: + * [ mod | reg | r/m ] + * [ 7 6 | 5 4 3 | 2 1 0 ] + */ +#define UCM_BISTRO_X86_MODRM_MOD_SHIFT 6 /* mod */ +#define UCM_BISTRO_X86_MODRM_REG_SHIFT 3 /* reg */ +#define UCM_BISTRO_X86_MODRM_RM_BITS 3 /* r/m */ + +/* Table 2-2 */ +#define UCM_BISTRO_X86_MODRM_MOD_DISP8 1 /* 0b01 */ +#define UCM_BISTRO_X86_MODRM_MOD_DISP32 2 /* 0b10 */ +#define 
UCM_BISTRO_X86_MODRM_MOD_REG 3 /* 0b11 */ +#define UCM_BISTRO_X86_MODRM_RM_SIB 4 /* 0b100 */ + +/* ModR/M encoding for SUB RSP + * mod=0b11, reg=0b101 (SUB as opcode extension), r/m=0b100 + */ +#define UCM_BISTRO_X86_MODRM_SUB_SP 0xEC /* 11 101 100 */ + +/* ModR/M encoding for EBP/BP/CH/MM5/XMM5, AH/SP/ESP/MM4/XMM4 */ +#define UCM_BISTRO_X86_MODRM_BP_SP 0xE5 /* 11 100 101 */ + + +/* + * Find the minimal length of initial instructions in the function which can be + * safely executed from any memory location. + * Uses a very simplified disassembler which supports only the typical + * instructions found in function prologue. + */ +static size_t ucm_bistro_detect_pic_prefix(const void *func, size_t min_length) +{ + uint8_t rex, opcode, modrm, mod; + size_t offset, prev_offset; + + offset = 0; + while (offset < min_length) { + prev_offset = offset; + opcode = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++); + + /* check for REX prefix */ + if ((opcode & UCM_BISTRO_X86_REX_MASK) == UCM_BISTRO_X86_REX) { + rex = opcode; + opcode = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++); + } else { + rex = 0; + } + + /* check the opcode */ + if (((rex == 0) || rex == UCM_BISTRO_X86_REX_B) && + ((opcode & UCM_BISTRO_X86_PUSH_R_MASK) == UCM_BISTRO_X86_PUSH_R)) { + continue; + } else if ((rex == UCM_BISTRO_X86_REX_W) && + (opcode == UCM_BISTRO_X86_IMM_GRP1_EV_IZ)) { + modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++); + if (modrm == UCM_BISTRO_X86_MODRM_SUB_SP) { + /* sub $imm32, %rsp */ + offset += sizeof(uint32_t); + continue; + } + } else if ((rex == UCM_BISTRO_X86_REX_W) && + (opcode == UCM_BISTRO_X86_MOV_EV_GV)) { + modrm = *(uint8_t*)UCS_PTR_BYTE_OFFSET(func, offset++); + if (modrm == UCM_BISTRO_X86_MODRM_BP_SP) { + /* mov %rsp, %rbp */ + continue; + } + mod = modrm >> UCM_BISTRO_X86_MODRM_MOD_SHIFT; + if ((mod != UCM_BISTRO_X86_MODRM_MOD_REG) && + ((modrm & UCS_MASK(UCM_BISTRO_X86_MODRM_RM_BITS)) == + UCM_BISTRO_X86_MODRM_RM_SIB)) { + /* r/m = 0b100, mod = 0b00/0b01/0b10 */ + ++offset; /* skip SIB */ + if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP8) { + offset += sizeof(uint8_t); /* skip disp8 */ + } else if (mod == UCM_BISTRO_X86_MODRM_MOD_DISP32) { + offset += sizeof(uint32_t); /* skip disp32 */ + } + continue; + } + } else if ((rex == 0) && + ((opcode & UCM_BISTRO_X86_MOV_IR_MASK) == UCM_BISTRO_X86_MOV_IR)) { + offset += sizeof(uint32_t); + continue; + } + + /* unsupported instruction - bail */ + return prev_offset; + } + + return offset; +} + +static ucs_status_t +ucm_bistro_construct_orig_func(const void *func_ptr, size_t patch_len, + const char *symbol, void **orig_func_p) +{ + ucm_bistro_jmp_indirect_t *jmp_back; + ucm_bistro_orig_func_t *orig_func; + size_t prefix_len, code_size; + + prefix_len = ucm_bistro_detect_pic_prefix(func_ptr, patch_len); + ucm_debug("'%s' at %p prefix length %zu/%zu", symbol, func_ptr, prefix_len, + patch_len); + if (prefix_len < patch_len) { + return UCS_ERR_UNSUPPORTED; + } + + /* Allocate executable page */ + code_size = sizeof(*orig_func) + patch_len + sizeof(*jmp_back); + orig_func = ucm_bistro_allocate_code(code_size); + if (orig_func == NULL) { + return UCS_ERR_NO_MEMORY; + } + + /* Copy code fragment from original function */ + memcpy(orig_func->code, func_ptr, prefix_len); + + /* Indirect jump to *orig_func->jmp_address */ + orig_func->jmp_addr = UCS_PTR_BYTE_OFFSET(func_ptr, prefix_len); + jmp_back = UCS_PTR_BYTE_OFFSET(orig_func->code, prefix_len); + jmp_back->opcode = 0xff; + jmp_back->modrm = 0x25; + jmp_back->displ = UCS_PTR_BYTE_DIFF(jmp_back + 1, 
&orig_func->jmp_addr); + *orig_func_p = orig_func->code; + + return UCS_OK; +} + +ucs_status_t ucm_bistro_patch(void *func_ptr, void *hook, const char *symbol, + void **orig_func_p, ucm_bistro_restore_point_t **rp) { - ucm_bistro_jmp_r11_patch_t patch_jmp_r11 = { + ucm_bistro_jmp_r11_patch_t jmp_r11 = { .mov_r11 = {0x49, 0xbb}, .jmp_r11 = {0x41, 0xff, 0xe3} }; - ucm_bistro_jmp_near_patch_t patch_jmp_near = { + ucm_bistro_jmp_near_patch_t jmp_near = { .jmp_rel = 0xe9 }; - void *func, *patch, *jmp_base; + void *patch, *jmp_base; ucs_status_t status; ptrdiff_t jmp_disp; size_t patch_len; - UCM_LOOKUP_SYMBOL(func, symbol); - - jmp_base = UCS_PTR_BYTE_OFFSET(func, sizeof(patch_jmp_near)); + jmp_base = UCS_PTR_BYTE_OFFSET(func_ptr, sizeof(jmp_near)); jmp_disp = UCS_PTR_BYTE_DIFF(jmp_base, hook); if (labs(jmp_disp) < INT32_MAX) { /* if 32-bit near jump is possible, use it, since it's a short 5-byte * instruction which reduces the chances of racing with other thread */ - patch_jmp_near.disp = jmp_disp; - patch = &patch_jmp_near; - patch_len = sizeof(patch_jmp_near); + jmp_near.disp = jmp_disp; + patch = &jmp_near; + patch_len = sizeof(jmp_near); } else { - patch_jmp_r11.ptr = hook; - patch = &patch_jmp_r11; - patch_len = sizeof(patch_jmp_r11); + jmp_r11.ptr = hook; + patch = &jmp_r11; + patch_len = sizeof(jmp_r11); } - status = ucm_bistro_create_restore_point(func, patch_len, rp); + if (orig_func_p != NULL) { + status = ucm_bistro_construct_orig_func(func_ptr, patch_len, symbol, + orig_func_p); + if (status != UCS_OK) { + return status; + } + } + + status = ucm_bistro_create_restore_point(func_ptr, patch_len, rp); if (UCS_STATUS_IS_ERR(status)) { return status; } - return ucm_bistro_apply_patch(func, patch, patch_len); + return ucm_bistro_apply_patch(func_ptr, patch, patch_len); } + #endif diff --git a/src/ucm/bistro/bistro_x86_64.h b/src/ucm/bistro/bistro_x86_64.h index 04f09b87415..87ef636cc8a 100644 --- a/src/ucm/bistro/bistro_x86_64.h +++ b/src/ucm/bistro/bistro_x86_64.h @@ -35,14 +35,20 @@ typedef struct ucm_bistro_jmp_near_patch { * Set library function call hook using Binary Instrumentation * method (BISTRO): replace function body by user defined call * - * @param symbol function name to replace - * @param hook user-defined function-replacer - * @param rp restore point used to restore original function, - * optional, may be NULL + * @param func_ptr Pointer to function to patch. + * @param hook User-defined function-replacer. + * @param symbol Function name to replace. + * @param orig_func_p If non-NULL, set to pointer to a trampoline which calls + * the original function (before patching). If it's not + * possible to create such trampoline, the function returns + * @ref UCS_ERR_UNSUPPORTED. + * @param rp Restore point used to restore original function. + * Optional, may be NULL. 
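To make the trampoline contract concrete: ucm_bistro_construct_orig_func() above copies the function's relocatable prologue into memory obtained from ucm_bistro_allocate_code(), then appends a 6-byte RIP-relative indirect jump (ff 25 disp32) that resumes inside the original function just past the copied bytes. A sketch of that layout with hypothetical type names, mirroring ucm_bistro_orig_func_t:

    #include <stdint.h>
    #include <string.h>

    typedef struct {
        void    *jmp_addr;  /* resume address, loaded by the indirect jump */
        uint8_t  code[];    /* copied prologue + "jmp *jmp_addr"           */
    } trampoline_t;

    static void emit_jmp_back(trampoline_t *t, size_t prefix_len)
    {
        uint8_t *jmp = t->code + prefix_len;
        int32_t  disp;

        /* disp32 is relative to the end of the 6-byte instruction; since
         * jmp_addr sits right before code[], it comes out small and negative */
        disp   = (int32_t)((intptr_t)&t->jmp_addr - (intptr_t)(jmp + 6));
        jmp[0] = 0xff;                        /* opcode: JMP r/m64           */
        jmp[1] = 0x25;                        /* ModR/M: RIP-relative disp32 */
        memcpy(jmp + 2, &disp, sizeof(disp));
    }

Calling through the returned code pointer therefore executes the saved prologue and then the untouched remainder of the original function, which is exactly what *orig_func_p hands back to the hook.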
* * @return Error code as defined by @ref ucs_status_t */ -ucs_status_t ucm_bistro_patch(const char *symbol, void *hook, +ucs_status_t ucm_bistro_patch(void *func_ptr, void *hook, const char *symbol, + void **orig_func_p, ucm_bistro_restore_point_t **rp); #endif diff --git a/src/ucm/cuda/Makefile.am b/src/ucm/cuda/Makefile.am index 438960e2c28..03fd621f97a 100644 --- a/src/ucm/cuda/Makefile.am +++ b/src/ucm/cuda/Makefile.am @@ -9,7 +9,7 @@ if HAVE_CUDA module_LTLIBRARIES = libucm_cuda.la libucm_cuda_la_CPPFLAGS = $(BASE_CPPFLAGS) $(CUDA_CPPFLAGS) libucm_cuda_la_CFLAGS = $(BASE_CFLAGS) $(CUDA_CFLAGS) -libucm_cuda_la_LIBADD = ../libucm.la +libucm_cuda_la_LIBADD = ../libucm.la $(CUDA_LIBS) libucm_cuda_la_LDFLAGS = $(UCM_MODULE_LDFLAGS) \ $(patsubst %, -Xlinker %, $(CUDA_LDFLAGS)) \ -version-info $(SOVERSION) diff --git a/src/ucm/cuda/cudamem.c b/src/ucm/cuda/cudamem.c index 02fd8c32f03..dfab90b886f 100644 --- a/src/ucm/cuda/cudamem.c +++ b/src/ucm/cuda/cudamem.c @@ -6,392 +6,276 @@ */ #ifdef HAVE_CONFIG_H -# include "config.h" +#include "config.h" #endif -#include +#include "cudamem.h" #include +#include #include #include #include #include -#include #include #include #include -#include #include -#include - - -UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemFree, CUresult, -1, CUdeviceptr) -UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemFreeHost, CUresult, -1, void *) -UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemAlloc, CUresult, -1, CUdeviceptr *, size_t) -UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemAllocManaged, CUresult, -1, CUdeviceptr *, - size_t, unsigned int) -UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemAllocPitch, CUresult, -1, CUdeviceptr *, size_t *, - size_t, size_t, unsigned int) -UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemHostGetDevicePointer, CUresult, -1, CUdeviceptr *, - void *, unsigned int) -UCM_DEFINE_REPLACE_DLSYM_FUNC(cuMemHostUnregister, CUresult, -1, void *) -UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaFree, cudaError_t, -1, void*) -UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaFreeHost, cudaError_t, -1, void*) -UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaMalloc, cudaError_t, -1, void**, size_t) -UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaMallocManaged, cudaError_t, -1, void**, size_t, unsigned int) -UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaMallocPitch, cudaError_t, -1, void**, size_t *, - size_t, size_t) -UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaHostGetDevicePointer, cudaError_t, -1, void**, - void *, unsigned int) -UCM_DEFINE_REPLACE_DLSYM_FUNC(cudaHostUnregister, cudaError_t, -1, void*) - -#if ENABLE_SYMBOL_OVERRIDE -UCM_OVERRIDE_FUNC(cuMemFree, CUresult) -UCM_OVERRIDE_FUNC(cuMemFreeHost, CUresult) -UCM_OVERRIDE_FUNC(cuMemAlloc, CUresult) -UCM_OVERRIDE_FUNC(cuMemAllocManaged, CUresult) -UCM_OVERRIDE_FUNC(cuMemAllocPitch, CUresult) -UCM_OVERRIDE_FUNC(cuMemHostGetDevicePointer, CUresult) -UCM_OVERRIDE_FUNC(cuMemHostUnregister, CUresult) -UCM_OVERRIDE_FUNC(cudaFree, cudaError_t) -UCM_OVERRIDE_FUNC(cudaFreeHost, cudaError_t) -UCM_OVERRIDE_FUNC(cudaMalloc, cudaError_t) -UCM_OVERRIDE_FUNC(cudaMallocManaged, cudaError_t) -UCM_OVERRIDE_FUNC(cudaMallocPitch, cudaError_t) -UCM_OVERRIDE_FUNC(cudaHostGetDevicePointer, cudaError_t) -UCM_OVERRIDE_FUNC(cudaHostUnregister, cudaError_t) -#endif -static void ucm_cuda_set_ptr_attr(CUdeviceptr dptr) -{ - if ((void*)dptr == NULL) { - ucm_trace("skipping cuPointerSetAttribute for null pointer"); - return; +/* Create a body of CUDA memory allocation replacement function */ +#define UCM_CUDA_ALLOC_FUNC(_name, _mem_type, _retval, _success, _size, \ + _ptr_type, _args_fmt, ...) 
\ + _retval ucm_##_name(_ptr_type *ptr_p, UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \ + { \ + _ptr_type ptr; \ + _retval ret; \ + \ + ucm_event_enter(); \ + ret = ucm_orig_##_name(ptr_p, UCM_FUNC_PASS_ARGS(__VA_ARGS__)); \ + if (ret == (_success)) { \ + ptr = *ptr_p; \ + ucm_trace("%s(" _args_fmt ") allocated %p", __FUNCTION__, \ + UCM_FUNC_PASS_ARGS(__VA_ARGS__), (void*)ptr); \ + ucm_cuda_dispatch_mem_alloc((CUdeviceptr)ptr, (_size), \ + (_mem_type)); \ + } \ + ucm_event_leave(); \ + return ret; \ } - unsigned int value = 1; - CUresult ret; - const char *cu_err_str; +/* Create a body of CUDA memory release replacement function */ +#define UCM_CUDA_FREE_FUNC(_name, _retval, _ptr_type, _mem_type) \ + _retval ucm_##_name(_ptr_type ptr) \ + { \ + _retval ret; \ + \ + ucm_event_enter(); \ + ucm_trace("%s(ptr=%p)", __FUNCTION__, (void*)ptr); \ + ucm_cuda_dispatch_mem_free((CUdeviceptr)ptr, _mem_type, #_name); \ + ret = ucm_orig_##_name(ptr); \ + ucm_event_leave(); \ + return ret; \ + } - ret = cuPointerSetAttribute(&value, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dptr); - if (ret != CUDA_SUCCESS) { - cuGetErrorString(ret, &cu_err_str); - ucm_warn("cuPointerSetAttribute(%p) failed: %s", (void *) dptr, cu_err_str); +#define UCM_CUDA_FUNC_ENTRY(_func) \ + { \ + {#_func, ucm_override_##_func}, (void**)&ucm_orig_##_func \ } -} -static UCS_F_ALWAYS_INLINE void -ucm_dispatch_mem_type_alloc(void *addr, size_t length, ucs_memory_type_t mem_type) +typedef struct { + ucm_reloc_patch_t patch; + void **orig_func_ptr; +} ucm_cuda_func_t; + + +/* Driver API */ +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAlloc, CUresult, -1, CUdeviceptr*, + size_t) +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAlloc_v2, CUresult, -1, CUdeviceptr*, + size_t) +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocManaged, CUresult, -1, CUdeviceptr*, + size_t, unsigned int) +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocPitch, CUresult, -1, CUdeviceptr*, + size_t*, size_t, size_t, unsigned int) +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemAllocPitch_v2, CUresult, -1, + CUdeviceptr*, size_t*, size_t, size_t, + unsigned int) +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFree, CUresult, -1, CUdeviceptr) +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFree_v2, CUresult, -1, CUdeviceptr) +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost, CUresult, -1, void*) +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost_v2, CUresult, -1, void*) + +/* Runtime API */ +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFree, cudaError_t, -1, void*) +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFreeHost, cudaError_t, -1, void*) +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMalloc, cudaError_t, -1, void**, size_t) +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocManaged, cudaError_t, -1, void**, + size_t, unsigned int) +UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocPitch, cudaError_t, -1, void**, + size_t*, size_t, size_t) + +static void ucm_cuda_dispatch_mem_alloc(CUdeviceptr ptr, size_t length, + ucs_memory_type_t mem_type) { + unsigned sync_atr_value = 1; + const char *cu_err_str; ucm_event_t event; + CUresult ret; - event.mem_type.address = addr; + if ((ptr != 0) && (mem_type == UCS_MEMORY_TYPE_CUDA)) { + /* Synchronous operation for GPU direct */ + ret = cuPointerSetAttribute(&sync_atr_value, + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr); + if (ret != CUDA_SUCCESS) { + cuGetErrorString(ret, &cu_err_str); + ucm_warn("cuPointerSetAttribute(%p) failed: %s", (void*)ptr, + cu_err_str); + } + } + + event.mem_type.address = (void*)ptr; event.mem_type.size = length; event.mem_type.mem_type = mem_type; ucm_event_dispatch(UCM_EVENT_MEM_TYPE_ALLOC, 
&event); } -static UCS_F_ALWAYS_INLINE void -ucm_dispatch_mem_type_free(void *addr, size_t length, ucs_memory_type_t mem_type) +static void ucm_cuda_dispatch_mem_free(CUdeviceptr ptr, + ucs_memory_type_t mem_type, + const char *func_name) { ucm_event_t event; - - event.mem_type.address = addr; - event.mem_type.size = length; - event.mem_type.mem_type = mem_type; - ucm_event_dispatch(UCM_EVENT_MEM_TYPE_FREE, &event); -} - -static void ucm_cudafree_dispatch_events(CUdeviceptr dptr, const char *func_name) -{ - CUresult ret; CUdeviceptr pbase; - size_t psize; + size_t length; + CUresult ret; - if (dptr == 0) { + if (ptr == 0) { return; } - ret = cuMemGetAddressRange(&pbase, &psize, dptr); + ret = cuMemGetAddressRange(&pbase, &length, ptr); if (ret == CUDA_SUCCESS) { - if (dptr != pbase) { + if (ptr != pbase) { ucm_warn("%s(%p) called with unexpected pointer (expected: %p)", - func_name, (void*)dptr, (void*)pbase); + func_name, (void*)ptr, (void*)pbase); } } else { - ucm_debug("cuMemGetAddressRange(devPtr=%p) failed", (void*)dptr); - psize = 1; /* set minimum length */ + ucm_debug("cuMemGetAddressRange(devPtr=%p) failed", (void*)ptr); + length = 1; /* set minimum length */ } - ucm_dispatch_mem_type_free((void *)dptr, psize, UCS_MEMORY_TYPE_CUDA); -} - -CUresult ucm_cuMemFree(CUdeviceptr dptr) -{ - CUresult ret; - - ucm_event_enter(); - - ucm_trace("ucm_cuMemFree(dptr=%p)",(void*)dptr); - - ucm_cudafree_dispatch_events(dptr, "cuMemFree"); - - ret = ucm_orig_cuMemFree(dptr); - - ucm_event_leave(); - return ret; -} - -CUresult ucm_cuMemFreeHost(void *p) -{ - CUresult ret; - - ucm_event_enter(); - - ucm_trace("ucm_cuMemFreeHost(ptr=%p)", p); - - ucm_dispatch_vm_munmap(p, 0); - - ret = ucm_orig_cuMemFreeHost(p); - - ucm_event_leave(); - return ret; -} - -CUresult ucm_cuMemAlloc(CUdeviceptr *dptr, size_t size) -{ - CUresult ret; - - ucm_event_enter(); - - ret = ucm_orig_cuMemAlloc(dptr, size); - if (ret == CUDA_SUCCESS) { - ucm_trace("ucm_cuMemAlloc(dptr=%p size:%lu)",(void *)*dptr, size); - ucm_dispatch_mem_type_alloc((void *)*dptr, size, UCS_MEMORY_TYPE_CUDA); - ucm_cuda_set_ptr_attr(*dptr); - } - - ucm_event_leave(); - return ret; -} - -CUresult ucm_cuMemAllocManaged(CUdeviceptr *dptr, size_t size, unsigned int flags) -{ - CUresult ret; - - ucm_event_enter(); - - ret = ucm_orig_cuMemAllocManaged(dptr, size, flags); - if (ret == CUDA_SUCCESS) { - ucm_trace("ucm_cuMemAllocManaged(dptr=%p size:%lu, flags:%d)", - (void *)*dptr, size, flags); - ucm_dispatch_mem_type_alloc((void *)*dptr, size, - UCS_MEMORY_TYPE_CUDA_MANAGED); - } - - ucm_event_leave(); - return ret; -} - -CUresult ucm_cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, - size_t WidthInBytes, size_t Height, - unsigned int ElementSizeBytes) -{ - CUresult ret; - - ucm_event_enter(); - - ret = ucm_orig_cuMemAllocPitch(dptr, pPitch, WidthInBytes, Height, ElementSizeBytes); - if (ret == CUDA_SUCCESS) { - ucm_trace("ucm_cuMemAllocPitch(dptr=%p size:%lu)",(void *)*dptr, - (WidthInBytes * Height)); - ucm_dispatch_mem_type_alloc((void *)*dptr, WidthInBytes * Height, - UCS_MEMORY_TYPE_CUDA); - ucm_cuda_set_ptr_attr(*dptr); - } - - ucm_event_leave(); - return ret; -} - -CUresult ucm_cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags) -{ - CUresult ret; - - ucm_event_enter(); - - ret = ucm_orig_cuMemHostGetDevicePointer(pdptr, p, Flags); - if (ret == CUDA_SUCCESS) { - ucm_trace("ucm_cuMemHostGetDevicePointer(pdptr=%p p=%p)",(void *)*pdptr, p); - } - - ucm_event_leave(); - return ret; -} - -CUresult ucm_cuMemHostUnregister(void 
*p) -{ - CUresult ret; - - ucm_event_enter(); - - ucm_trace("ucm_cuMemHostUnregister(ptr=%p)", p); - - ret = ucm_orig_cuMemHostUnregister(p); - - ucm_event_leave(); - return ret; -} - -cudaError_t ucm_cudaFree(void *devPtr) -{ - cudaError_t ret; - - ucm_event_enter(); - - ucm_trace("ucm_cudaFree(devPtr=%p)", devPtr); - - ucm_cudafree_dispatch_events((CUdeviceptr)devPtr, "cudaFree"); - - ret = ucm_orig_cudaFree(devPtr); - - ucm_event_leave(); - - return ret; -} - -cudaError_t ucm_cudaFreeHost(void *ptr) -{ - cudaError_t ret; - - ucm_event_enter(); - - ucm_trace("ucm_cudaFreeHost(ptr=%p)", ptr); - - ucm_dispatch_vm_munmap(ptr, 0); - - ret = ucm_orig_cudaFreeHost(ptr); - - ucm_event_leave(); - return ret; + event.mem_type.address = (void*)ptr; + event.mem_type.size = length; + event.mem_type.mem_type = mem_type; + ucm_event_dispatch(UCM_EVENT_MEM_TYPE_FREE, &event); } -cudaError_t ucm_cudaMalloc(void **devPtr, size_t size) -{ - cudaError_t ret; - - ucm_event_enter(); - - ret = ucm_orig_cudaMalloc(devPtr, size); - if (ret == cudaSuccess) { - ucm_trace("ucm_cudaMalloc(devPtr=%p size:%lu)", *devPtr, size); - ucm_dispatch_mem_type_alloc(*devPtr, size, UCS_MEMORY_TYPE_CUDA); - ucm_cuda_set_ptr_attr((CUdeviceptr) *devPtr); - } - - ucm_event_leave(); +/* Driver API replacements */ +UCM_CUDA_ALLOC_FUNC(cuMemAlloc, UCS_MEMORY_TYPE_CUDA, CUresult, CUDA_SUCCESS, + arg0, CUdeviceptr, "size=%zu", size_t) +UCM_CUDA_ALLOC_FUNC(cuMemAlloc_v2, UCS_MEMORY_TYPE_CUDA, CUresult, CUDA_SUCCESS, + arg0, CUdeviceptr, "size=%zu", size_t) +UCM_CUDA_ALLOC_FUNC(cuMemAllocManaged, UCS_MEMORY_TYPE_CUDA_MANAGED, CUresult, + CUDA_SUCCESS, arg0, CUdeviceptr, "size=%zu flags=0x%x", + size_t, unsigned) +UCM_CUDA_ALLOC_FUNC(cuMemAllocPitch, UCS_MEMORY_TYPE_CUDA, CUresult, + CUDA_SUCCESS, (size_t)arg1 * arg2, CUdeviceptr, + "pitch=%p width=%zu height=%zu elem=%u", size_t*, size_t, + size_t, unsigned) +UCM_CUDA_ALLOC_FUNC(cuMemAllocPitch_v2, UCS_MEMORY_TYPE_CUDA, CUresult, + CUDA_SUCCESS, (size_t)arg1 * arg2, CUdeviceptr, + "pitch=%p width=%zu height=%zu elem=%u", size_t*, size_t, + size_t, unsigned) +UCM_CUDA_FREE_FUNC(cuMemFree, CUresult, CUdeviceptr, UCS_MEMORY_TYPE_CUDA) +UCM_CUDA_FREE_FUNC(cuMemFree_v2, CUresult, CUdeviceptr, UCS_MEMORY_TYPE_CUDA) +UCM_CUDA_FREE_FUNC(cuMemFreeHost, CUresult, void*, UCS_MEMORY_TYPE_HOST) +UCM_CUDA_FREE_FUNC(cuMemFreeHost_v2, CUresult, void*, UCS_MEMORY_TYPE_HOST) + +static ucm_cuda_func_t ucm_cuda_driver_funcs[] = { + UCM_CUDA_FUNC_ENTRY(cuMemAlloc), + UCM_CUDA_FUNC_ENTRY(cuMemAlloc_v2), + UCM_CUDA_FUNC_ENTRY(cuMemAllocManaged), + UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch), + UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch_v2), + UCM_CUDA_FUNC_ENTRY(cuMemFree), + UCM_CUDA_FUNC_ENTRY(cuMemFree_v2), + UCM_CUDA_FUNC_ENTRY(cuMemFreeHost), + UCM_CUDA_FUNC_ENTRY(cuMemFreeHost_v2), + {{NULL}, NULL} +}; - return ret; -} +/* Runtime API replacements */ +UCM_CUDA_ALLOC_FUNC(cudaMalloc, UCS_MEMORY_TYPE_CUDA, cudaError_t, cudaSuccess, + arg0, void*, "size=%zu", size_t) +UCM_CUDA_ALLOC_FUNC(cudaMallocManaged, UCS_MEMORY_TYPE_CUDA_MANAGED, + cudaError_t, cudaSuccess, arg0, void*, + "size=%zu flags=0x%x", size_t, unsigned) +UCM_CUDA_ALLOC_FUNC(cudaMallocPitch, UCS_MEMORY_TYPE_CUDA, cudaError_t, + cudaSuccess, (size_t)arg1 * arg2, void*, + "pitch=%p width=%zu height=%zu", size_t*, size_t, size_t) +UCM_CUDA_FREE_FUNC(cudaFree, cudaError_t, void*, UCS_MEMORY_TYPE_CUDA) +UCM_CUDA_FREE_FUNC(cudaFreeHost, cudaError_t, void*, UCS_MEMORY_TYPE_HOST) + +static ucm_cuda_func_t ucm_cuda_runtime_funcs[] = { + 
UCM_CUDA_FUNC_ENTRY(cudaFree), + UCM_CUDA_FUNC_ENTRY(cudaFreeHost), + UCM_CUDA_FUNC_ENTRY(cudaMalloc), + UCM_CUDA_FUNC_ENTRY(cudaMallocManaged), + UCM_CUDA_FUNC_ENTRY(cudaMallocPitch), + {{NULL}, NULL} +}; -cudaError_t ucm_cudaMallocManaged(void **devPtr, size_t size, unsigned int flags) +static ucm_mmap_hook_mode_t ucm_cuda_hook_mode() { - cudaError_t ret; - - ucm_event_enter(); - - ret = ucm_orig_cudaMallocManaged(devPtr, size, flags); - if (ret == cudaSuccess) { - ucm_trace("ucm_cudaMallocManaged(devPtr=%p size:%lu flags:%d)", - *devPtr, size, flags); - ucm_dispatch_mem_type_alloc(*devPtr, size, UCS_MEMORY_TYPE_CUDA_MANAGED); - } - - ucm_event_leave(); - - return ret; + return ucm_get_hook_mode(ucm_global_opts.cuda_hook_mode); } -cudaError_t ucm_cudaMallocPitch(void **devPtr, size_t *pitch, - size_t width, size_t height) +static ucs_status_t +ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc, + const char *name) { - cudaError_t ret; - - ucm_event_enter(); - - ret = ucm_orig_cudaMallocPitch(devPtr, pitch, width, height); - if (ret == cudaSuccess) { - ucm_trace("ucm_cudaMallocPitch(devPtr=%p size:%lu)",*devPtr, (width * height)); - ucm_dispatch_mem_type_alloc(*devPtr, (width * height), UCS_MEMORY_TYPE_CUDA); - ucm_cuda_set_ptr_attr((CUdeviceptr) *devPtr); - } + const char UCS_V_UNUSED *hook_mode; + unsigned num_bistro, num_reloc; + ucm_cuda_func_t *func; + ucs_status_t status; + void *func_ptr; + + num_bistro = 0; + num_reloc = 0; + for (func = funcs; func->patch.symbol != NULL; ++func) { + func_ptr = ucm_reloc_get_orig(func->patch.symbol, func->patch.value); + if (func_ptr == NULL) { + continue; + } - ucm_event_leave(); - return ret; -} + status = UCS_ERR_UNSUPPORTED; -cudaError_t ucm_cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) -{ - cudaError_t ret; + if (ucm_cuda_hook_mode() == UCM_MMAP_HOOK_BISTRO) { + status = ucm_bistro_patch(func_ptr, func->patch.value, + func->patch.symbol, func->orig_func_ptr, + NULL); + if (status == UCS_OK) { + ucm_trace("installed bistro hook for '%s': %s", + func->patch.symbol, ucs_status_string(status)); + ++num_bistro; + continue; + } + + ucm_debug("failed to install bistro hook for '%s', trying reloc", + func->patch.symbol); + } - ucm_event_enter(); + status = ucm_reloc_modify(&func->patch); + if (status != UCS_OK) { + ucm_diag("failed to install relocation table entry for '%s'", + func->patch.symbol); + return status; + } - ret = ucm_orig_cudaHostGetDevicePointer(pDevice, pHost, flags); - if (ret == cudaSuccess) { - ucm_trace("ucm_cuMemHostGetDevicePointer(pDevice=%p pHost=%p)", pDevice, pHost); + ++num_reloc; + ucm_trace("installed reloc hook on '%s'", func->patch.symbol); } - ucm_event_leave(); - return ret; + *used_reloc = num_reloc > 0; + ucm_info("cuda memory hooks on %s API: installed %u bistro and %u reloc", + name, num_bistro, num_reloc); + return UCS_OK; } -cudaError_t ucm_cudaHostUnregister(void *ptr) -{ - cudaError_t ret; - - ucm_event_enter(); - - ucm_trace("ucm_cudaHostUnregister(ptr=%p)", ptr); - - ret = ucm_orig_cudaHostUnregister(ptr); - - ucm_event_leave(); - return ret; -} - -static ucm_reloc_patch_t patches[] = { - {UCS_PP_MAKE_STRING(cuMemFree), ucm_override_cuMemFree}, - {UCS_PP_MAKE_STRING(cuMemFreeHost), ucm_override_cuMemFreeHost}, - {UCS_PP_MAKE_STRING(cuMemAlloc), ucm_override_cuMemAlloc}, - {UCS_PP_MAKE_STRING(cuMemAllocManaged), ucm_override_cuMemAllocManaged}, - {UCS_PP_MAKE_STRING(cuMemAllocPitch), ucm_override_cuMemAllocPitch}, - {UCS_PP_MAKE_STRING(cuMemHostGetDevicePointer), 
ucm_override_cuMemHostGetDevicePointer},
-    {UCS_PP_MAKE_STRING(cuMemHostUnregister), ucm_override_cuMemHostUnregister},
-    {UCS_PP_MAKE_STRING(cudaFree), ucm_override_cudaFree},
-    {UCS_PP_MAKE_STRING(cudaFreeHost), ucm_override_cudaFreeHost},
-    {UCS_PP_MAKE_STRING(cudaMalloc), ucm_override_cudaMalloc},
-    {UCS_PP_MAKE_STRING(cudaMallocManaged), ucm_override_cudaMallocManaged},
-    {UCS_PP_MAKE_STRING(cudaMallocPitch), ucm_override_cudaMallocPitch},
-    {UCS_PP_MAKE_STRING(cudaHostGetDevicePointer), ucm_override_cudaHostGetDevicePointer},
-    {UCS_PP_MAKE_STRING(cudaHostUnregister), ucm_override_cudaHostUnregister},
-    {NULL, NULL}
-};
-
 static ucs_status_t ucm_cudamem_install(int events)
 {
-    static int ucm_cudamem_installed = 0;
+    static int ucm_cudamem_installed       = 0;
     static pthread_mutex_t install_mutex   = PTHREAD_MUTEX_INITIALIZER;
-    ucm_reloc_patch_t *patch;
-    ucs_status_t status = UCS_OK;
+    ucs_status_t status                    = UCS_OK;
+    int used_reloc;
 
     if (!(events & (UCM_EVENT_MEM_TYPE_ALLOC | UCM_EVENT_MEM_TYPE_FREE))) {
         goto out;
     }
 
-    if (!ucm_global_opts.enable_cuda_reloc) {
-        ucm_debug("installing cudamem relocations is disabled by configuration");
+    if (ucm_cuda_hook_mode() == UCM_MMAP_HOOK_NONE) {
+        ucm_info("cuda memory hooks are disabled by configuration");
         status = UCS_ERR_UNSUPPORTED;
         goto out;
     }
@@ -402,17 +286,24 @@ static ucs_status_t ucm_cudamem_install(int events)
         goto out_unlock;
     }
 
-    for (patch = patches; patch->symbol != NULL; ++patch) {
-        status = ucm_reloc_modify(patch);
-        if (status != UCS_OK) {
-            ucm_warn("failed to install relocation table entry for '%s'", patch->symbol);
-            goto out_unlock;
+    status = ucm_cuda_install_hooks(ucm_cuda_driver_funcs, &used_reloc,
+                                    "driver");
+    if (status != UCS_OK) {
+        ucm_warn("failed to install cuda memory hooks on driver API");
+    } else if (!used_reloc) {
+        ucm_cudamem_installed = 1;
+    } else {
+        /* Bistro hooks were not installed on all driver APIs, so install
+           hooks on the runtime APIs as well. */
+        status = ucm_cuda_install_hooks(ucm_cuda_runtime_funcs, &used_reloc,
+                                        "runtime");
+        if (status == UCS_OK) {
+            ucm_cudamem_installed = 1;
+        } else {
+            ucm_warn("failed to install cuda memory hooks on runtime API");
         }
     }
 
-    ucm_debug("cudaFree hooks are ready");
-    ucm_cudamem_installed = 1;
-
 out_unlock:
     pthread_mutex_unlock(&install_mutex);
 out:
@@ -429,13 +320,13 @@ static int ucm_cudamem_scan_regions_cb(void *arg, void *addr, size_t length,
     /* we are interested in blocks which don't have any access permissions, or
      * mapped to nvidia device.
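For readability, this is roughly what a single instantiation of the alloc-wrapper macro above expands to, assuming UCM_FUNC_DEFINE_ARGS/UCM_FUNC_PASS_ARGS name the trailing arguments arg0, arg1, and so on:

    /* UCM_CUDA_ALLOC_FUNC(cuMemAlloc, UCS_MEMORY_TYPE_CUDA, CUresult,
     *                     CUDA_SUCCESS, arg0, CUdeviceptr, "size=%zu", size_t) */
    CUresult ucm_cuMemAlloc(CUdeviceptr *ptr_p, size_t arg0)
    {
        CUdeviceptr ptr;
        CUresult ret;

        ucm_event_enter();
        ret = ucm_orig_cuMemAlloc(ptr_p, arg0);
        if (ret == CUDA_SUCCESS) {
            ptr = *ptr_p;
            ucm_trace("%s(size=%zu) allocated %p", __FUNCTION__, arg0,
                      (void*)ptr);
            ucm_cuda_dispatch_mem_alloc((CUdeviceptr)ptr, arg0,
                                        UCS_MEMORY_TYPE_CUDA);
        }
        ucm_event_leave();
        return ret;
    }

The free-side macro is symmetric, except that it dispatches the event before invoking the original function, while cuMemGetAddressRange() can still resolve the full length of the range being released.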
*/ - if ((prot & (PROT_READ|PROT_WRITE|PROT_EXEC)) && + if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) && strncmp(path, cuda_path_pattern, strlen(cuda_path_pattern))) { return 0; } - ucm_debug("dispatching initial memtype allocation for %p..%p %s", - addr, UCS_PTR_BYTE_OFFSET(addr, length), path); + ucm_trace("dispatching initial memtype allocation for %p..%p %s", addr, + UCS_PTR_BYTE_OFFSET(addr, length), path); event.mem_type.address = addr; event.mem_type.size = length; @@ -460,10 +351,12 @@ static ucm_event_installer_t ucm_cuda_initializer = { .get_existing_alloc = ucm_cudamem_get_existing_alloc }; -UCS_STATIC_INIT { +UCS_STATIC_INIT +{ ucs_list_add_tail(&ucm_event_installer_list, &ucm_cuda_initializer.list); } -UCS_STATIC_CLEANUP { +UCS_STATIC_CLEANUP +{ ucs_list_del(&ucm_cuda_initializer.list); } diff --git a/src/ucm/cuda/cudamem.h b/src/ucm/cuda/cudamem.h index 03268231067..cd5a4087e27 100644 --- a/src/ucm/cuda/cudamem.h +++ b/src/ucm/cuda/cudamem.h @@ -7,95 +7,29 @@ #ifndef UCM_CUDAMEM_H_ #define UCM_CUDAMEM_H_ -#include #include #include -/*cuMemFree */ -CUresult ucm_override_cuMemFree(CUdeviceptr dptr); -CUresult ucm_orig_cuMemFree(CUdeviceptr dptr); -CUresult ucm_cuMemFree(CUdeviceptr dptr); - -/*cuMemFreeHost */ -CUresult ucm_override_cuMemFreeHost(void *p); -CUresult ucm_orig_cuMemFreeHost(void *p); -CUresult ucm_cuMemFreeHost(void *p); - -/*cuMemAlloc*/ -CUresult ucm_override_cuMemAlloc(CUdeviceptr *dptr, size_t size); -CUresult ucm_orig_cuMemAlloc(CUdeviceptr *dptr, size_t size); CUresult ucm_cuMemAlloc(CUdeviceptr *dptr, size_t size); - -/*cuMemAllocManaged*/ -CUresult ucm_override_cuMemAllocManaged(CUdeviceptr *dptr, size_t size, - unsigned int flags); -CUresult ucm_orig_cuMemAllocManaged(CUdeviceptr *dptr, size_t size, unsigned int flags); +CUresult ucm_cuMemAlloc_v2(CUdeviceptr *dptr, size_t size); CUresult ucm_cuMemAllocManaged(CUdeviceptr *dptr, size_t size, unsigned int flags); - -/*cuMemAllocPitch*/ -CUresult ucm_override_cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, - size_t WidthInBytes, size_t Height, - unsigned int ElementSizeBytes); -CUresult ucm_orig_cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, - size_t WidthInBytes, size_t Height, - unsigned int ElementSizeBytes); CUresult ucm_cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); +CUresult ucm_cuMemAllocPitch_v2(CUdeviceptr *dptr, size_t *pPitch, + size_t WidthInBytes, size_t Height, + unsigned int ElementSizeBytes); +CUresult ucm_cuMemFree(CUdeviceptr dptr); +CUresult ucm_cuMemFree_v2(CUdeviceptr dptr); +CUresult ucm_cuMemFreeHost(void *p); +CUresult ucm_cuMemFreeHost_v2(void *p); -/*cuMemHostGetDevicePointer*/ -CUresult ucm_override_cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, - unsigned int Flags); -CUresult ucm_orig_cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, - unsigned int Flags); -CUresult ucm_cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); - -/*cuMemHostUnregister */ -CUresult ucm_override_cuMemHostUnregister(void *p); -CUresult ucm_orig_cuMemHostUnregister(void *p); -CUresult ucm_cuMemHostUnregister(void *p); - -/*cudaFree*/ -cudaError_t ucm_override_cudaFree(void *devPtr); -cudaError_t ucm_orig_cudaFree(void *devPtr); cudaError_t ucm_cudaFree(void *devPtr); - -/*cudaFreeHost*/ -cudaError_t ucm_override_cudaFreeHost(void *ptr); -cudaError_t ucm_orig_cudaFreeHost(void *ptr); cudaError_t ucm_cudaFreeHost(void *ptr); - -/*cudaMalloc*/ -cudaError_t ucm_override_cudaMalloc(void 
**devPtr, size_t size); -cudaError_t ucm_orig_cudaMalloc(void **devPtr, size_t size); cudaError_t ucm_cudaMalloc(void **devPtr, size_t size); - -/*cudaMallocManaged*/ -cudaError_t ucm_override_cudaMallocManaged(void **devPtr, size_t size, - unsigned int flags); -cudaError_t ucm_orig_cudaMallocManaged(void **devPtr, size_t size, unsigned int flags); cudaError_t ucm_cudaMallocManaged(void **devPtr, size_t size, unsigned int flags); - -/*cudaMallocPitch*/ -cudaError_t ucm_override_cudaMallocPitch(void **devPtr, size_t *pitch, - size_t width, size_t height); -cudaError_t ucm_orig_cudaMallocPitch(void **devPtr, size_t *pitch, - size_t width, size_t height); cudaError_t ucm_cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height); -/*cudaHostGetDevicePointer*/ -cudaError_t ucm_override_cudaHostGetDevicePointer(void **pDevice, void *pHost, - unsigned int flags); -cudaError_t ucm_orig_cudaHostGetDevicePointer(void **pDevice, void *pHost, - unsigned int flags); -cudaError_t ucm_cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags); - - -/*cudaHostUnregister*/ -cudaError_t ucm_override_cudaHostUnregister(void *ptr); -cudaError_t ucm_orig_cudaHostUnregister(void *ptr); -cudaError_t ucm_cudaHostUnregister(void *ptr); - #endif diff --git a/src/ucm/event/event.c b/src/ucm/event/event.c index 1985b9b254f..d98dad64210 100644 --- a/src/ucm/event/event.c +++ b/src/ucm/event/event.c @@ -94,6 +94,11 @@ static void ucm_event_call_orig(ucm_event_type_t event_type, ucm_event_t *event, event->shmdt.result = ucm_orig_shmdt(event->shmdt.shmaddr); } break; + case UCM_EVENT_BRK: + if (event->brk.result == -1) { + event->brk.result = ucm_orig_brk(event->brk.addr); + } + break; case UCM_EVENT_SBRK: if (event->sbrk.result == MAP_FAILED) { event->sbrk.result = ucm_orig_sbrk(event->sbrk.increment); @@ -120,8 +125,8 @@ static ucm_event_handler_t ucm_event_orig_handler = { .list = UCS_LIST_INITIALIZER(&ucm_event_handlers, &ucm_event_handlers), .events = UCM_EVENT_MMAP | UCM_EVENT_MUNMAP | UCM_EVENT_MREMAP | UCM_EVENT_SHMAT | UCM_EVENT_SHMDT | UCM_EVENT_SBRK | - UCM_EVENT_MADVISE, /* All events */ - .priority = 0, /* Between negative and positive handlers */ + UCM_EVENT_MADVISE | UCM_EVENT_BRK, /* All events */ + .priority = 0, /* Between negative and positive handlers */ .cb = ucm_event_call_orig }; static ucs_list_link_t ucm_event_handlers = @@ -166,6 +171,7 @@ void ucm_event_leave() pthread_rwlock_unlock(&ucm_event_lock); } +UCS_F_NOINLINE void *ucm_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) { ucm_event_t event; @@ -198,6 +204,7 @@ void *ucm_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t off return event.mmap.result; } +UCS_F_NOINLINE int ucm_munmap(void *addr, size_t length) { ucm_event_t event; @@ -238,6 +245,7 @@ void ucm_vm_munmap(void *addr, size_t length) ucm_event_leave(); } +UCS_F_NOINLINE void *ucm_mremap(void *old_address, size_t old_size, size_t new_size, int flags) { ucm_event_t event; @@ -285,6 +293,7 @@ static int ucm_shm_del_entry_from_khash(const void *addr, size_t *size) return 0; } +UCS_F_NOINLINE void *ucm_shmat(int shmid, const void *shmaddr, int shmflg) { #ifdef SHM_REMAP @@ -334,6 +343,7 @@ void *ucm_shmat(int shmid, const void *shmaddr, int shmflg) return event.shmat.result; } +UCS_F_NOINLINE int ucm_shmdt(const void *shmaddr) { ucm_event_t event; @@ -341,7 +351,7 @@ int ucm_shmdt(const void *shmaddr) ucm_event_enter(); - ucm_debug("ucm_shmdt(shmaddr=%p)", shmaddr); + ucm_trace("ucm_shmdt(shmaddr=%p)", 
shmaddr); if (!ucm_shm_del_entry_from_khash(shmaddr, &size)) { size = ucm_get_shm_seg_size(shmaddr); @@ -358,6 +368,7 @@ int ucm_shmdt(const void *shmaddr) return event.shmdt.result; } +UCS_F_NOINLINE void *ucm_sbrk(intptr_t increment) { ucm_event_t event; @@ -367,7 +378,8 @@ void *ucm_sbrk(intptr_t increment) ucm_trace("ucm_sbrk(increment=%+ld)", increment); if (increment < 0) { - ucm_dispatch_vm_munmap(UCS_PTR_BYTE_OFFSET(ucm_orig_sbrk(0), increment), + ucm_dispatch_vm_munmap(UCS_PTR_BYTE_OFFSET(ucm_get_current_brk(), + increment), -increment); } @@ -376,7 +388,8 @@ void *ucm_sbrk(intptr_t increment) ucm_event_dispatch(UCM_EVENT_SBRK, &event); if ((increment > 0) && (event.sbrk.result != MAP_FAILED)) { - ucm_dispatch_vm_mmap(UCS_PTR_BYTE_OFFSET(ucm_orig_sbrk(0), -increment), + ucm_dispatch_vm_mmap(UCS_PTR_BYTE_OFFSET(ucm_get_current_brk(), + -increment), increment); } @@ -385,42 +398,42 @@ void *ucm_sbrk(intptr_t increment) return event.sbrk.result; } +UCS_F_NOINLINE int ucm_brk(void *addr) { -#if UCM_BISTRO_HOOKS - void *old_addr; - intptr_t increment; + ptrdiff_t increment; + void *current_brk; ucm_event_t event; - old_addr = ucm_brk_syscall(0); - /* in case if addr == NULL - it just returns current pointer */ - increment = addr ? ((intptr_t)addr - (intptr_t)old_addr) : 0; - ucm_event_enter(); ucm_trace("ucm_brk(addr=%p)", addr); + if (addr == NULL) { + increment = 0; + } else { + current_brk = ucm_get_current_brk(); + increment = UCS_PTR_BYTE_DIFF(current_brk, addr); + } + if (increment < 0) { - ucm_dispatch_vm_munmap(UCS_PTR_BYTE_OFFSET(old_addr, increment), - -increment); + ucm_dispatch_vm_munmap(addr, -increment); } - event.sbrk.result = (void*)-1; - event.sbrk.increment = increment; - ucm_event_dispatch(UCM_EVENT_SBRK, &event); + event.brk.result = -1; + event.brk.addr = addr; + ucm_event_dispatch(UCM_EVENT_BRK, &event); - if ((increment > 0) && (event.sbrk.result != MAP_FAILED)) { - ucm_dispatch_vm_mmap(old_addr, increment); + if ((increment > 0) && (event.brk.result != -1)) { + ucm_dispatch_vm_mmap(current_brk, increment); } ucm_event_leave(); - return event.sbrk.result == MAP_FAILED ? 
-1 : 0; -#else - return -1; -#endif + return event.brk.result; } +UCS_F_NOINLINE int ucm_madvise(void *addr, size_t length, int advice) { ucm_event_t event; @@ -455,6 +468,18 @@ int ucm_madvise(void *addr, size_t length, int advice) return event.madvise.result; } +void ucm_library_init(const ucm_global_config_t *ucm_opts) +{ + static ucs_init_once_t init_once = UCS_INIT_ONCE_INITIALIZER; + + UCS_INIT_ONCE(&init_once) { + if (ucm_opts != NULL) { + ucm_global_opts = *ucm_opts; + } + ucm_mmap_init(); + } +} + void ucm_event_handler_add(ucm_event_handler_t *handler) { ucm_event_handler_t *elem; @@ -481,20 +506,17 @@ void ucm_event_handler_remove(ucm_event_handler_t *handler) static ucs_status_t ucm_event_install(int events) { - static ucs_init_once_t init_once = UCS_INIT_ONCE_INITIALIZER; UCS_MODULE_FRAMEWORK_DECLARE(ucm); ucm_event_installer_t *event_installer; int malloc_events; ucs_status_t status; - UCS_INIT_ONCE(&init_once) { - ucm_prevent_dl_unload(); - } + ucm_prevent_dl_unload(); /* TODO lock */ - status = ucm_mmap_install(events); + status = ucm_mmap_install(events, 0); if (status != UCS_OK) { - ucm_debug("failed to install mmap events"); + ucm_diag("failed to install mmap events"); goto out_unlock; } @@ -523,7 +545,6 @@ static ucs_status_t ucm_event_install(int events) out_unlock: return status; - } ucs_status_t ucm_set_event_handler(int events, int priority, @@ -536,7 +557,7 @@ ucs_status_t ucm_set_event_handler(int events, int priority, if (events & ~(UCM_EVENT_MMAP|UCM_EVENT_MUNMAP|UCM_EVENT_MREMAP| UCM_EVENT_SHMAT|UCM_EVENT_SHMDT| - UCM_EVENT_SBRK| + UCM_EVENT_BRK|UCM_EVENT_SBRK| UCM_EVENT_MADVISE| UCM_EVENT_VM_MAPPED|UCM_EVENT_VM_UNMAPPED| UCM_EVENT_MEM_TYPE_ALLOC|UCM_EVENT_MEM_TYPE_FREE| @@ -549,6 +570,8 @@ ucs_status_t ucm_set_event_handler(int events, int priority, return UCS_ERR_UNSUPPORTED; } + ucm_library_init(NULL); + /* separate event flags from real events */ flags = events & (UCM_EVENT_FLAG_NO_INSTALL | UCM_EVENT_FLAG_EXISTING_ALLOC); @@ -587,6 +610,7 @@ ucs_status_t ucm_set_event_handler(int events, int priority, void ucm_set_external_event(int events) { ucm_event_enter_exclusive(); + ucm_debug("set external events: 0x%x", events); ucm_external_events |= events; ucm_event_leave(); } @@ -594,6 +618,7 @@ void ucm_set_external_event(int events) void ucm_unset_external_event(int events) { ucm_event_enter_exclusive(); + ucm_debug("unset external events: 0x%x", events); ucm_external_events &= ~events; ucm_event_leave(); } @@ -623,11 +648,13 @@ void ucm_unset_event_handler(int events, ucm_event_callback_t cb, void *arg) ucs_status_t ucm_test_events(int events) { + ucm_library_init(NULL); return ucm_mmap_test_installed_events(events); } ucs_status_t ucm_test_external_events(int events) { + ucm_library_init(NULL); return ucm_mmap_test_events(events & ucm_external_events, "external"); } diff --git a/src/ucm/event/event.h b/src/ucm/event/event.h index 763ac3b2098..e7ae14ec6ad 100644 --- a/src/ucm/event/event.h +++ b/src/ucm/event/event.h @@ -13,12 +13,13 @@ #include #define UCM_NATIVE_EVENT_VM_MAPPED (UCM_EVENT_MMAP | UCM_EVENT_MREMAP | \ - UCM_EVENT_SHMAT | UCM_EVENT_SBRK) + UCM_EVENT_SHMAT | UCM_EVENT_SBRK | \ + UCM_EVENT_BRK) #define UCM_NATIVE_EVENT_VM_UNMAPPED (UCM_EVENT_MMAP | UCM_EVENT_MUNMAP | \ UCM_EVENT_MREMAP | UCM_EVENT_SHMDT | \ UCM_EVENT_SHMAT | UCM_EVENT_SBRK | \ - UCM_EVENT_MADVISE) + UCM_EVENT_MADVISE | UCM_EVENT_BRK) typedef struct ucm_event_handler { diff --git a/src/ucm/malloc/malloc_hook.c b/src/ucm/malloc/malloc_hook.c index 50f7b974a64..9bfbdb3bba7 100644 
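Taken together with ucm_library_init() from ucm.h, applications now have an explicit initialization path instead of relying on the deprecated ucm_global_opts variable. A hypothetical usage sketch, not part of this patch (field names per the ucm_event_t union in ucm.h):

    #include <ucm/api/ucm.h>
    #include <stdio.h>

    static void vm_unmap_cb(ucm_event_type_t type, ucm_event_t *event,
                            void *arg)
    {
        printf("unmapped %p length %zu\n", event->vm_unmapped.address,
               event->vm_unmapped.size);
    }

    int main(void)
    {
        ucm_library_init(NULL); /* NULL applies the default configuration */
        ucm_set_event_handler(UCM_EVENT_VM_UNMAPPED, 0, vm_unmap_cb, NULL);
        /* munmap()/shmdt()/brk()/sbrk() calls that release memory in this
         * process now invoke vm_unmap_cb; a second init call is a no-op */
        return 0;
    }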
--- a/src/ucm/malloc/malloc_hook.c +++ b/src/ucm/malloc/malloc_hook.c @@ -141,8 +141,8 @@ static void ucm_malloc_mmaped_ptr_add(void *ptr) hash_it = kh_put(mmap_ptrs, &ucm_malloc_hook_state.ptrs, ptr, &hash_extra_status); - ucs_assert_always(hash_extra_status >= 0); - ucs_assert_always(hash_it != kh_end(&ucm_malloc_hook_state.ptrs)); + ucm_assert_always(hash_extra_status >= 0); + ucm_assert_always(hash_it != kh_end(&ucm_malloc_hook_state.ptrs)); ucs_recursive_spin_unlock(&ucm_malloc_hook_state.lock); } @@ -550,7 +550,7 @@ static void ucm_malloc_sbrk(ucm_event_type_t event_type, if (ucm_malloc_hook_state.heap_start == (void*)-1) { ucm_malloc_hook_state.heap_start = event->sbrk.result; /* sbrk() returns the previous break */ } - ucm_malloc_hook_state.heap_end = ucm_orig_sbrk(0); + ucm_malloc_hook_state.heap_end = ucm_get_current_brk(); ucm_trace("sbrk(%+ld)=%p - adjusting heap to [%p..%p]", event->sbrk.increment, event->sbrk.result, diff --git a/src/ucm/mmap/install.c b/src/ucm/mmap/install.c index 83d4b7d37e5..e9573ac7bce 100644 --- a/src/ucm/mmap/install.c +++ b/src/ucm/mmap/install.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -30,8 +31,6 @@ #include #include -#define UCM_IS_HOOK_ENABLED(_entry) \ - ((_entry)->hook_type & UCS_BIT(ucm_mmap_hook_mode())) #define UCM_HOOK_STR \ ((ucm_mmap_hook_mode() == UCM_MMAP_HOOK_RELOC) ? "reloc" : "bistro") @@ -43,7 +42,7 @@ _call; \ ucm_trace("after %s: got 0x%x/0x%x", UCS_PP_MAKE_STRING(_call), \ (_data)->fired_events, exp_events); \ - /* in case if any event is missed - set correcponding bit to 0 */ \ + /* in case if any event is missed - set corresponding bit to 0 */ \ /* same as equation: */ \ /* (_data)->out_events &= ~(exp_events ^ */ \ /* ((_data)->fired_events & exp_events)); */ \ @@ -61,17 +60,10 @@ extern const char *ucm_mmap_hook_modes[]; -typedef enum ucm_mmap_hook_type { - UCM_HOOK_RELOC = UCS_BIT(UCM_MMAP_HOOK_RELOC), - UCM_HOOK_BISTRO = UCS_BIT(UCM_MMAP_HOOK_BISTRO), - UCM_HOOK_BOTH = UCM_HOOK_RELOC | UCM_HOOK_BISTRO -} ucm_mmap_hook_type_t; - typedef struct ucm_mmap_func { ucm_reloc_patch_t patch; ucm_event_type_t event_type; ucm_event_type_t deps; - ucm_mmap_hook_type_t hook_type; } ucm_mmap_func_t; typedef struct ucm_mmap_test_events_data { @@ -81,18 +73,16 @@ typedef struct ucm_mmap_test_events_data { } ucm_mmap_test_events_data_t; static ucm_mmap_func_t ucm_mmap_funcs[] = { - { {"mmap", ucm_override_mmap}, UCM_EVENT_MMAP, UCM_EVENT_NONE, UCM_HOOK_BOTH}, - { {"munmap", ucm_override_munmap}, UCM_EVENT_MUNMAP, UCM_EVENT_NONE, UCM_HOOK_BOTH}, + { {"mmap", ucm_override_mmap}, UCM_EVENT_MMAP, UCM_EVENT_NONE}, + { {"munmap", ucm_override_munmap}, UCM_EVENT_MUNMAP, UCM_EVENT_NONE}, #if HAVE_MREMAP - { {"mremap", ucm_override_mremap}, UCM_EVENT_MREMAP, UCM_EVENT_NONE, UCM_HOOK_BOTH}, -#endif - { {"shmat", ucm_override_shmat}, UCM_EVENT_SHMAT, UCM_EVENT_NONE, UCM_HOOK_BOTH}, - { {"shmdt", ucm_override_shmdt}, UCM_EVENT_SHMDT, UCM_EVENT_SHMAT, UCM_HOOK_BOTH}, - { {"sbrk", ucm_override_sbrk}, UCM_EVENT_SBRK, UCM_EVENT_NONE, UCM_HOOK_RELOC}, -#if UCM_BISTRO_HOOKS - { {"brk", ucm_override_brk}, UCM_EVENT_SBRK, UCM_EVENT_NONE, UCM_HOOK_BISTRO}, + { {"mremap", ucm_override_mremap}, UCM_EVENT_MREMAP, UCM_EVENT_NONE}, #endif - { {"madvise", ucm_override_madvise}, UCM_EVENT_MADVISE, UCM_EVENT_NONE, UCM_HOOK_BOTH}, + { {"shmat", ucm_override_shmat}, UCM_EVENT_SHMAT, UCM_EVENT_NONE}, + { {"shmdt", ucm_override_shmdt}, UCM_EVENT_SHMDT, UCM_EVENT_SHMAT}, + { {"sbrk", ucm_override_sbrk}, UCM_EVENT_SBRK, 
UCM_EVENT_NONE},
+    { {"brk",     ucm_override_brk},     UCM_EVENT_BRK,     UCM_EVENT_NONE},
+    { {"madvise", ucm_override_madvise}, UCM_EVENT_MADVISE, UCM_EVENT_NONE},
     { {NULL, NULL}, UCM_EVENT_NONE}
 };
 
@@ -108,6 +98,7 @@ static const char *ucm_mmap_event_name[] = {
     UCM_MMAP_EVENT_NAME_ENTRY(SHMDT),
     UCM_MMAP_EVENT_NAME_ENTRY(SBRK),
     UCM_MMAP_EVENT_NAME_ENTRY(MADVISE),
+    UCM_MMAP_EVENT_NAME_ENTRY(BRK),
 
     /* Aggregate events */
     UCM_MMAP_EVENT_NAME_ENTRY(VM_MAPPED),
@@ -124,11 +115,20 @@ static void ucm_mmap_event_test_callback(ucm_event_type_t event_type,
      * So ignore calls from other threads to ensure that only the requested
      * events are processed.
      */
-    if (data->tid == ucs_get_tid()) {
+    if (data->tid == ucm_get_tid()) {
         data->fired_events |= event_type;
     }
 }
 
+/* Call brk() and check the return value, to avoid a compile error on an
+ * unused result */
+static void ucm_brk_checked(void *addr)
+{
+    int ret = brk(addr);
+    if ((ret != 0) && (addr != NULL)) {
+        ucm_diag("brk(addr=%p) failed: %m", addr);
+    }
+}
+
 /* Fire events with pre/post action. The problem is in call sequence: we
  * can't just fire single event - most of the system calls require set of
  * calls to eliminate resource leaks or data corruption, such sequence
@@ -136,10 +136,10 @@ static void ucm_mmap_event_test_callback(ucm_event_type_t event_type,
  * exclude additional events from processing used pre/post actions where
  * set of handled events is cleared and evaluated for every system call */
 static void
-ucm_fire_mmap_events_internal(int events, ucm_mmap_test_events_data_t *data)
+ucm_fire_mmap_events_internal(int events, ucm_mmap_test_events_data_t *data,
+                              int exclusive)
 {
     size_t sbrk_size;
-    int sbrk_mask;
     int shmid;
     void *p;
 
@@ -183,19 +183,29 @@ ucm_fire_mmap_events_internal(int events, ucm_mmap_test_events_data_t *data)
                        data, shmdt(p));
     }
 
-    if (events & (UCM_EVENT_SBRK|UCM_EVENT_VM_MAPPED|UCM_EVENT_VM_UNMAPPED)) {
-        if (RUNNING_ON_VALGRIND) {
-            /* on valgrind, doing a non-trivial sbrk() causes heap corruption */
-            sbrk_size = 0;
-            sbrk_mask = UCM_EVENT_SBRK;
-        } else {
-            sbrk_size = ucm_get_page_size();
-            sbrk_mask = UCM_EVENT_SBRK|UCM_EVENT_VM_MAPPED|UCM_EVENT_VM_UNMAPPED;
+    if (exclusive && !RUNNING_ON_VALGRIND) {
+        sbrk_size = ucm_get_page_size();
+        if (events & (UCM_EVENT_BRK|UCM_EVENT_VM_MAPPED|UCM_EVENT_VM_UNMAPPED)) {
+            p = ucm_get_current_brk();
+            UCM_FIRE_EVENT(events, UCM_EVENT_BRK|UCM_EVENT_VM_MAPPED, data,
+                           ucm_brk_checked(UCS_PTR_BYTE_OFFSET(p, sbrk_size)));
+            UCM_FIRE_EVENT(events, UCM_EVENT_BRK|UCM_EVENT_VM_UNMAPPED, data,
+                           ucm_brk_checked(p));
+        }
+        if (events & (UCM_EVENT_SBRK|UCM_EVENT_VM_MAPPED|UCM_EVENT_VM_UNMAPPED)) {
+            UCM_FIRE_EVENT(events, UCM_EVENT_SBRK|UCM_EVENT_VM_MAPPED,
+                           data, (void)sbrk(sbrk_size));
+            UCM_FIRE_EVENT(events, UCM_EVENT_SBRK|UCM_EVENT_VM_UNMAPPED,
+                           data, (void)sbrk(-sbrk_size));
+        }
+    } else {
+        /* To avoid side effects on other threads and valgrind heap corruption,
+         * pass invalid parameters. We assume that if the native events are
+         * delivered, it means VM_MAPPED/UNMAPPED would be delivered as well.
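Each UCM_FIRE_EVENT() above clears the per-test fired_events mask, runs one system call, and then drops from out_events every expected bit that no hook reported. Restated as a plain helper (hypothetical name; algebraically equivalent to the macro's out_events update shown earlier):

    /* A bit survives in *out_events only if every call that was expected to
     * produce that event actually fired the corresponding callback */
    static void account_fired(int *out_events, int fired_events, int exp_events)
    {
        int missed_events = exp_events & ~fired_events;

        *out_events &= ~missed_events;
    }

After the whole sequence has run, out_events equals the requested event set exactly when every tested hook is live.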
+ */ + if (events & UCM_EVENT_BRK) { + UCM_FIRE_EVENT(events, UCM_EVENT_BRK, data, ucm_brk_checked(NULL)); } - UCM_FIRE_EVENT(events, (UCM_EVENT_SBRK|UCM_EVENT_VM_MAPPED) & sbrk_mask, - data, (void)sbrk(sbrk_size)); - UCM_FIRE_EVENT(events, (UCM_EVENT_SBRK|UCM_EVENT_VM_UNMAPPED) & sbrk_mask, - data, (void)sbrk(-sbrk_size)); } if (events & (UCM_EVENT_MADVISE|UCM_EVENT_VM_UNMAPPED)) { @@ -217,7 +227,7 @@ void ucm_fire_mmap_events(int events) { ucm_mmap_test_events_data_t data; - ucm_fire_mmap_events_internal(events, &data); + ucm_fire_mmap_events_internal(events, &data, 0); } static void ucm_mmap_event_report_missing(int expected, int actual, @@ -252,7 +262,7 @@ static void ucm_mmap_event_report_missing(int expected, int actual, /* Called with lock held */ static ucs_status_t -ucm_mmap_test_events_nolock(int events, const char *event_type) +ucm_mmap_test_events_nolock(int events, int exclusive, const char *event_type) { ucm_event_handler_t handler; ucm_mmap_test_events_data_t data; @@ -262,13 +272,16 @@ ucm_mmap_test_events_nolock(int events, const char *event_type) handler.cb = ucm_mmap_event_test_callback; handler.arg = &data; data.out_events = events; - data.tid = ucs_get_tid(); + data.tid = ucm_get_tid(); + + ucm_debug("testing mmap %s events 0x%x", event_type, events); ucm_event_handler_add(&handler); - ucm_fire_mmap_events_internal(events, &data); + ucm_fire_mmap_events_internal(events, &data, exclusive); ucm_event_handler_remove(&handler); - ucm_debug("mmap test: got 0x%x out of 0x%x", data.out_events, events); + ucm_debug("mmap %s events test: got 0x%x out of 0x%x", event_type, + data.out_events, events); /* Return success if we caught all wanted events */ if (!ucs_test_all_flags(data.out_events, events)) { @@ -279,6 +292,23 @@ ucm_mmap_test_events_nolock(int events, const char *event_type) return UCS_OK; } +static int ucm_mmap_events_to_native_events(int events) +{ + int native_events; + + native_events = events & ~(UCM_EVENT_MEM_TYPE_ALLOC | + UCM_EVENT_MEM_TYPE_FREE); + + if (events & UCM_EVENT_VM_MAPPED) { + native_events |= UCM_NATIVE_EVENT_VM_MAPPED; + } + if (events & UCM_EVENT_VM_UNMAPPED) { + native_events |= UCM_NATIVE_EVENT_VM_UNMAPPED; + } + + return native_events; +} + ucs_status_t ucm_mmap_test_events(int events, const char *event_type) { ucs_status_t status; @@ -287,7 +317,7 @@ ucs_status_t ucm_mmap_test_events(int events, const char *event_type) * return UCS_OK iff all events are actually working */ pthread_mutex_lock(&ucm_mmap_install_mutex); - status = ucm_mmap_test_events_nolock(events, event_type); + status = ucm_mmap_test_events_nolock(events, 0, event_type); pthread_mutex_unlock(&ucm_mmap_install_mutex); return status; @@ -296,8 +326,11 @@ ucs_status_t ucm_mmap_test_events(int events, const char *event_type) ucs_status_t ucm_mmap_test_installed_events(int events) { /* - * return UCS_OK iff all installed events are actually working - * we don't check the status of events which were not successfully installed + * Return UCS_OK iff all installed events are actually working. + * - We should not expand 'events' to native events, and test only the exact + * set of events the user asked to test. + * - We don't check the status of events which were not reported as + * successfully installed. 
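A worked example of the expansion above: asking to test the aggregate VM_MAPPED event implies testing every native call that can map memory, per the UCM_NATIVE_EVENT_VM_MAPPED definition in event.h:

    int native_events = ucm_mmap_events_to_native_events(UCM_EVENT_VM_MAPPED);

    /* native_events == UCM_EVENT_VM_MAPPED | UCM_EVENT_MMAP |
     *                  UCM_EVENT_MREMAP    | UCM_EVENT_SHMAT |
     *                  UCM_EVENT_SBRK      | UCM_EVENT_BRK */

The memory-type bits are stripped first because they are produced by the CUDA/ROCm hooks rather than by the mmap-family hooks being tested here.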
*/ return ucm_mmap_test_events(events & ucm_mmap_installed_events, "internal"); } @@ -308,6 +341,7 @@ static ucs_status_t ucs_mmap_install_reloc(int events) static int installed_events = 0; ucm_mmap_func_t *entry; ucs_status_t status; + void *func_ptr; if (ucm_mmap_hook_mode() == UCM_MMAP_HOOK_NONE) { ucm_debug("installing mmap hooks is disabled by configuration"); @@ -325,47 +359,36 @@ static ucs_status_t ucs_mmap_install_reloc(int events) continue; } - if (UCM_IS_HOOK_ENABLED(entry)) { - ucm_debug("mmap: installing %s hook for %s = %p for event 0x%x", UCM_HOOK_STR, - entry->patch.symbol, entry->patch.value, entry->event_type); + ucm_debug("mmap: installing %s hook for %s = %p for event 0x%x", + UCM_HOOK_STR, entry->patch.symbol, entry->patch.value, + entry->event_type); - if (ucm_mmap_hook_mode() == UCM_MMAP_HOOK_RELOC) { - status = ucm_reloc_modify(&entry->patch); + if (ucm_mmap_hook_mode() == UCM_MMAP_HOOK_RELOC) { + status = ucm_reloc_modify(&entry->patch); + } else { + ucm_assert(ucm_mmap_hook_mode() == UCM_MMAP_HOOK_BISTRO); + func_ptr = ucm_reloc_get_orig(entry->patch.symbol, + entry->patch.value); + if (func_ptr == NULL) { + status = UCS_ERR_NO_ELEM; } else { - ucs_assert(ucm_mmap_hook_mode() == UCM_MMAP_HOOK_BISTRO); - status = ucm_bistro_patch(entry->patch.symbol, entry->patch.value, NULL); - } - if (status != UCS_OK) { - ucm_warn("failed to install %s hook for '%s'", - UCM_HOOK_STR, entry->patch.symbol); - return status; + status = ucm_bistro_patch(func_ptr, entry->patch.value, + entry->patch.symbol, NULL, NULL); } - - installed_events |= entry->event_type; } - } - - return UCS_OK; -} - -static int ucm_mmap_events_to_native_events(int events) -{ - int native_events; - - native_events = events & ~(UCM_EVENT_MEM_TYPE_ALLOC | - UCM_EVENT_MEM_TYPE_FREE); + if (status != UCS_OK) { + ucm_warn("failed to install %s hook for '%s'", UCM_HOOK_STR, + entry->patch.symbol); + return status; + } - if (events & UCM_EVENT_VM_MAPPED) { - native_events |= UCM_NATIVE_EVENT_VM_MAPPED; - } - if (events & UCM_EVENT_VM_UNMAPPED) { - native_events |= UCM_NATIVE_EVENT_VM_UNMAPPED; + installed_events |= entry->event_type; } - return native_events; + return UCS_OK; } -ucs_status_t ucm_mmap_install(int events) +ucs_status_t ucm_mmap_install(int events, int exclusive) { ucs_status_t status; int native_events; @@ -378,7 +401,8 @@ ucs_status_t ucm_mmap_install(int events) /* if we already installed these events, check that they are still * working, and if not - reinstall them. 
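In bistro mode the loop above must first resolve the address the program would actually call, since the patch is applied to the function body itself rather than to a relocation entry. A condensed sketch of that path for one hypothetical symbol, mirroring the ucs_mmap_install_reloc() body:

    void *func_ptr;
    ucs_status_t status;

    func_ptr = ucm_reloc_get_orig("munmap", ucm_override_munmap);
    if (func_ptr == NULL) {
        status = UCS_ERR_NO_ELEM;        /* symbol could not be resolved */
    } else {
        status = ucm_bistro_patch(func_ptr, ucm_override_munmap, "munmap",
                                  NULL,  /* no trampoline requested      */
                                  NULL); /* no restore point kept        */
    }

Here "munmap" stands in for each entry of ucm_mmap_funcs[].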
*/ - status = ucm_mmap_test_events_nolock(native_events, 0); + status = ucm_mmap_test_events_nolock(native_events, exclusive, + "existing"); if (status == UCS_OK) { goto out_unlock; } @@ -390,7 +414,7 @@ ucs_status_t ucm_mmap_install(int events) goto out_unlock; } - status = ucm_mmap_test_events_nolock(native_events, 0); + status = ucm_mmap_test_events_nolock(native_events, exclusive, "installed"); if (status != UCS_OK) { ucm_debug("failed to install mmap events"); goto out_unlock; @@ -398,9 +422,38 @@ ucs_status_t ucm_mmap_install(int events) /* status == UCS_OK */ ucm_mmap_installed_events |= native_events; - ucm_debug("mmap installed events = 0x%x", ucm_mmap_installed_events); + ucm_info("mmap installed events = 0x%x", ucm_mmap_installed_events); out_unlock: pthread_mutex_unlock(&ucm_mmap_install_mutex); return status; } + +void ucm_mmap_init() +{ + ucm_event_type_t events; + ucm_mmap_func_t *entry; + + if (!ucm_global_opts.enable_events || + (ucm_mmap_hook_mode() != UCM_MMAP_HOOK_BISTRO)) { + return; + } + + /* We must initialize bistro hooks during startup and not later, before + * other threads could execute the modified functions and fail on invalid + * instructions + */ + events = 0; + for (entry = ucm_mmap_funcs; entry->patch.symbol != NULL; ++entry) { + events |= entry->event_type; + } + if (events & UCM_NATIVE_EVENT_VM_MAPPED) { + events |= UCM_EVENT_VM_MAPPED; + } + if (events & UCM_NATIVE_EVENT_VM_UNMAPPED) { + events |= UCM_EVENT_VM_UNMAPPED; + } + + ucm_prevent_dl_unload(); + ucm_mmap_install(events, 1); +} diff --git a/src/ucm/mmap/mmap.h b/src/ucm/mmap/mmap.h index ed90a801238..c7b4e4e37b2 100644 --- a/src/ucm/mmap/mmap.h +++ b/src/ucm/mmap/mmap.h @@ -8,6 +8,7 @@ #define UCM_MMAP_H_ #include +#include #include #define UCM_MMAP_HOOK_RELOC_STR "reloc" @@ -21,7 +22,7 @@ # define UCM_DEFAULT_HOOK_MODE_STR UCM_MMAP_HOOK_RELOC_STR #endif -ucs_status_t ucm_mmap_install(int events); +ucs_status_t ucm_mmap_install(int events, int exclusive); void *ucm_override_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset); int ucm_override_munmap(void *addr, size_t length); @@ -31,23 +32,16 @@ int ucm_override_shmdt(const void *shmaddr); void *ucm_override_sbrk(intptr_t increment); void *ucm_sbrk_select(intptr_t increment); int ucm_override_brk(void *addr); -void *ucm_brk_syscall(void *addr); int ucm_override_madvise(void *addr, size_t length, int advice); +void *ucm_get_current_brk(); void ucm_fire_mmap_events(int events); ucs_status_t ucm_mmap_test_installed_events(int events); ucs_status_t ucm_mmap_test_events(int events, const char *event_type); +void ucm_mmap_init(); static UCS_F_ALWAYS_INLINE ucm_mmap_hook_mode_t ucm_mmap_hook_mode(void) { -#ifdef __SANITIZE_ADDRESS__ - return UCM_MMAP_HOOK_NONE; -#else - if (RUNNING_ON_VALGRIND && (ucm_global_opts.mmap_hook_mode == UCM_MMAP_HOOK_BISTRO)) { - return UCM_MMAP_HOOK_RELOC; - } - - return ucm_global_opts.mmap_hook_mode; -#endif + return ucm_get_hook_mode(ucm_global_opts.mmap_hook_mode); } #endif diff --git a/src/ucm/rocm/rocmmem.c b/src/ucm/rocm/rocmmem.c index bf441d6057d..386f07fdcea 100644 --- a/src/ucm/rocm/rocmmem.c +++ b/src/ucm/rocm/rocmmem.c @@ -28,11 +28,6 @@ UCM_DEFINE_REPLACE_DLSYM_FUNC(hsa_amd_memory_pool_allocate, hsa_status_t, UCM_DEFINE_REPLACE_DLSYM_FUNC(hsa_amd_memory_pool_free, hsa_status_t, HSA_STATUS_ERROR, void*) -#if ENABLE_SYMBOL_OVERRIDE -UCM_OVERRIDE_FUNC(hsa_amd_memory_pool_allocate, hsa_status_t) -UCM_OVERRIDE_FUNC(hsa_amd_memory_pool_free, hsa_status_t) -#endif - static UCS_F_ALWAYS_INLINE 
void ucm_dispatch_mem_type_alloc(void *addr, size_t length, ucs_memory_type_t mem_type) { @@ -172,7 +167,7 @@ static ucs_status_t ucm_rocmmem_install(int events) } } - ucm_debug("rocm hooks are ready"); + ucm_info("rocm hooks are ready"); ucm_rocmmem_installed = 1; out_unlock: diff --git a/src/ucm/util/log.c b/src/ucm/util/log.c index ec41746a477..7325a341952 100644 --- a/src/ucm/util/log.c +++ b/src/ucm/util/log.c @@ -23,6 +23,7 @@ #include #include #include +#include #define UCM_LOG_BUG_SIZE 512 @@ -65,7 +66,7 @@ static char *ucm_log_ltoa(char *p, char *end, long n, int base, int flags, int pad) { static const char digits[] = "0123456789abcdef"; - long divider; + long divider, top_divider; if (((n < 0) || (flags & UCM_LOG_LTOA_FLAG_SIGN)) && (p < end)) { *(p++) = (n < 0 ) ? '-' : '+'; @@ -80,9 +81,11 @@ static char *ucm_log_ltoa(char *p, char *end, long n, int base, int flags, n = labs(n); - divider = 1; - while ((n / divider) != 0) { - divider *= base; + divider = 1; + top_divider = 0; + while ((divider > 0) && ((n / divider) != 0)) { + top_divider = divider; + divider *= base; --pad; } @@ -91,7 +94,7 @@ static char *ucm_log_ltoa(char *p, char *end, long n, int base, int flags, (flags & UCM_LOG_LTOA_FLAG_PAD0) ? '0' : ' '); } - divider /= base; + divider = top_divider; while ((p < end) && (divider > 0)) { *(p++) = digits[(n / divider + base) % base]; divider /= base; @@ -260,11 +263,15 @@ void __ucm_log(const char *file, unsigned line, const char *function, va_list ap; struct timeval tv; ssize_t nwrite; + pid_t pid; gettimeofday(&tv, NULL); - ucm_log_snprintf(buf, UCM_LOG_BUG_SIZE - 1, "[%lu.%06lu] [%s:%d] %18s:%-4d UCX %s ", - tv.tv_sec, tv.tv_usec, ucm_log_hostname, getpid(), - ucs_basename(file), line, ucm_log_level_names[level]); + pid = getpid(); + ucm_log_snprintf(buf, UCM_LOG_BUG_SIZE - 1, + "[%lu.%06lu] [%s:%d:%d] %18s:%-4d UCX %s ", + tv.tv_sec, tv.tv_usec, ucm_log_hostname, pid, + ucm_get_tid() - pid, ucs_basename(file), line, + ucm_log_level_names[level]); buf[UCM_LOG_BUG_SIZE - 1] = '\0'; length = strlen(buf); diff --git a/src/ucm/util/log.h b/src/ucm/util/log.h index 9dcfd317406..ac0e32ab418 100644 --- a/src/ucm/util/log.h +++ b/src/ucm/util/log.h @@ -22,6 +22,7 @@ ## __VA_ARGS__); \ } + #define ucm_fatal(_message, ...) ucm_log(UCS_LOG_LEVEL_FATAL, _message, ## __VA_ARGS__) #define ucm_error(_message, ...) ucm_log(UCS_LOG_LEVEL_ERROR, _message, ## __VA_ARGS__) #define ucm_warn(_message, ...) ucm_log(UCS_LOG_LEVEL_WARN, _message, ## __VA_ARGS__) @@ -30,8 +31,25 @@ #define ucm_debug(_message, ...) ucm_log(UCS_LOG_LEVEL_DEBUG, _message, ## __VA_ARGS__) #define ucm_trace(_message, ...) ucm_log(UCS_LOG_LEVEL_TRACE, _message, ## __VA_ARGS__) + +#define ucm_assert_always(_expression) \ + do { \ + if (!(_expression)) { \ + ucm_fatal("Assertion `%s' failed", #_expression); \ + } \ + } while (0) + + +#if ENABLE_ASSERT +# define ucm_assert(...) ucm_assert_always(__VA_ARGS__) +#else +# define ucm_assert(...) {} +#endif + + extern const char *ucm_log_level_names[]; + void __ucm_log(const char *file, unsigned line, const char *function, ucs_log_level_t level, const char *message, ...) 
UCS_F_PRINTF(5, 6); diff --git a/src/ucm/util/reloc.c b/src/ucm/util/reloc.c index 4443a73b5c7..143c146148a 100644 --- a/src/ucm/util/reloc.c +++ b/src/ucm/util/reloc.c @@ -8,12 +8,6 @@ # include "config.h" #endif -#ifndef NVALGRIND -# include -#else -# define RUNNING_ON_VALGRIND 0 -#endif - #include "reloc.h" #include @@ -35,6 +29,14 @@ #include #include +/* Ensure this macro is defined (from ) - otherwise, cppcheck might + fail with an "unknown macro" warning */ +#ifndef ElfW +#define ElfW(type) _ElfW (Elf, __ELF_NATIVE_CLASS, type) +#define _ElfW(e,w,t) _ElfW_1 (e, w, _##t) +#define _ElfW_1(e,w,t) e##w##t +#endif + typedef void * (*ucm_reloc_dlopen_func_t)(const char *, int); typedef int (*ucm_reloc_dlclose_func_t)(void *); @@ -226,7 +228,7 @@ ucm_reloc_dl_apply_patch(const ucm_dl_info_t *dl_info, const char *dl_basename, /* modify the relocation to the new value */ *entry = patch->value; - ucm_debug("symbol '%s' in %s at [%p] modified from %p to %p", + ucm_trace("symbol '%s' in %s at [%p] modified from %p to %p", patch->symbol, dl_basename, entry, prev_value, patch->value); /* store default entry to prev_value to guarantee valid pointers @@ -238,7 +240,7 @@ ucm_reloc_dl_apply_patch(const ucm_dl_info_t *dl_info, const char *dl_basename, !((prev_value >= (void*)dl_info->start) && (prev_value < (void*)dl_info->end))) { patch->prev_value = prev_value; - ucm_debug("'%s' prev_value is %p", patch->symbol, prev_value); + ucm_trace("'%s' prev_value is %p", patch->symbol, prev_value); } return UCS_OK; @@ -387,7 +389,7 @@ static void ucm_reloc_dl_info_cleanup(ElfW(Addr) dlpi_addr, const char *dl_name) khiter = kh_get(ucm_dl_info_hash, &ucm_dl_info_hash, dlpi_addr); if (khiter == kh_end(&ucm_dl_info_hash)) { - ucm_debug("no dl_info entry for address 0x%lx", dlpi_addr); + ucm_trace("no dl_info entry for address 0x%lx", dlpi_addr); return; } diff --git a/src/ucm/util/replace.c b/src/ucm/util/replace.c index 6d8abae9405..58a06325778 100644 --- a/src/ucm/util/replace.c +++ b/src/ucm/util/replace.c @@ -17,14 +17,20 @@ #include #include #include +#include #include #include #include + #ifndef MAP_FAILED #define MAP_FAILED ((void*)-1) #endif +#if HAVE___CURBRK +extern void *__curbrk; +#endif + #ifdef PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP pthread_mutex_t ucm_reloc_get_orig_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; #else @@ -113,17 +119,8 @@ int ucm_orig_shmdt(const void *shmaddr) #endif -#if HAVE___CURBRK -extern void *__curbrk; -#endif - _UCM_DEFINE_DLSYM_FUNC(brk, ucm_orig_dlsym_brk, ucm_override_brk, int, -1, void*) -void *ucm_brk_syscall(void *addr) -{ - return (void*)syscall(SYS_brk, addr); -} - int ucm_orig_brk(void *addr) { void *new_addr; @@ -133,7 +130,7 @@ int ucm_orig_brk(void *addr) #endif new_addr = ucm_brk_syscall(addr); - if (new_addr < addr) { + if (new_addr != addr) { errno = ENOMEM; return -1; } else { @@ -151,15 +148,26 @@ void *ucm_orig_sbrk(intptr_t increment) if (ucm_mmap_hook_mode() == UCM_MMAP_HOOK_RELOC) { return ucm_orig_dlsym_sbrk(increment); } else { - prev = ucm_brk_syscall(0); - return ucm_orig_brk(UCS_PTR_BYTE_OFFSET(prev, increment)) ? (void*)-1 : prev; + prev = ucm_get_current_brk(); + return ucm_orig_brk(UCS_PTR_BYTE_OFFSET(prev, increment)) ? 
+ (void*)-1 : prev; } } #else /* UCM_BISTRO_HOOKS */ +UCM_DEFINE_DLSYM_FUNC(brk, int, -1, void*) UCM_DEFINE_DLSYM_FUNC(sbrk, void*, MAP_FAILED, intptr_t) UCM_DEFINE_DLSYM_FUNC(shmat, void*, MAP_FAILED, int, const void*, int) UCM_DEFINE_DLSYM_FUNC(shmdt, int, -1, const void*) #endif /* UCM_BISTRO_HOOKS */ + +void *ucm_get_current_brk() +{ +#if HAVE___CURBRK + return __curbrk; +#else + return ucm_brk_syscall(0); +#endif +} diff --git a/src/ucm/util/replace.h b/src/ucm/util/replace.h index 4b91b037d99..de060d1e048 100644 --- a/src/ucm/util/replace.h +++ b/src/ucm/util/replace.h @@ -79,6 +79,24 @@ extern pthread_t volatile ucm_reloc_get_orig_thread; _UCM_DEFINE_REPLACE_FUNC(ucm_override_##_name, ucm_##_name, \ _rettype, _fail_val, __VA_ARGS__) +/** + * Defines the following: + * - ucm_orig_##_name##_dlsym - calls original function by symbol lookup + * - ucm_orig_##_name - function pointer, initialized by default to + * ucm_orig_##_name##_dlsym + * - ucm_override_##_name - calls ucm_##_name + */ +#define UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(_name, _rettype, _fail_val, ...) \ + _UCM_DEFINE_DLSYM_FUNC(_name, ucm_orig_##_name##_dlsym, \ + ucm_override_##_name, _rettype, _fail_val, \ + __VA_ARGS__) \ + \ + _rettype (*ucm_orig_##_name)(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) = \ + ucm_orig_##_name##_dlsym; \ + \ + _UCM_DEFINE_REPLACE_FUNC(ucm_override_##_name, ucm_##_name, \ + _rettype, _fail_val, __VA_ARGS__) + #define UCM_DEFINE_SYSCALL_FUNC(_name, _rettype, _syscall_id, ...) \ /* Call syscall */ \ _rettype ucm_orig_##_name(UCM_FUNC_DEFINE_ARGS(__VA_ARGS__)) \ diff --git a/src/ucm/util/sys.c b/src/ucm/util/sys.c index 92bf834da5c..9f0bcacdffb 100644 --- a/src/ucm/util/sys.c +++ b/src/ucm/util/sys.c @@ -17,10 +17,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -36,7 +38,7 @@ ucm_global_config_t ucm_global_opts = { .mmap_hook_mode = UCM_DEFAULT_HOOK_MODE, .enable_malloc_hooks = 1, .enable_malloc_reloc = 0, - .enable_cuda_reloc = 1, + .cuda_hook_mode = UCM_DEFAULT_HOOK_MODE, .enable_dynamic_mmap_thresh = 1, .alloc_alignment = 16, .dlopen_process_rpath = 1 @@ -282,7 +284,8 @@ void ucm_strerror(int eno, char *buf, size_t max) void ucm_prevent_dl_unload() { - int flags = RTLD_LOCAL | RTLD_NODELETE; + static ucs_init_once_t init_once = UCS_INIT_ONCE_INITIALIZER; + int flags = RTLD_LOCAL | RTLD_NODELETE; Dl_info info; void *dl; int ret; @@ -292,32 +295,34 @@ void ucm_prevent_dl_unload() return; } - flags |= (ucm_global_opts.module_unload_prevent_mode == - UCM_UNLOAD_PREVENT_MODE_NOW) ? RTLD_NOW : RTLD_LAZY; + UCS_INIT_ONCE(&init_once) { + flags |= (ucm_global_opts.module_unload_prevent_mode == + UCM_UNLOAD_PREVENT_MODE_NOW) ? RTLD_NOW : RTLD_LAZY; - /* Get the path to current library by current function pointer */ - (void)dlerror(); - ret = dladdr(ucm_prevent_dl_unload, &info); - if (ret == 0) { - ucm_warn("could not find address of current library: %s", dlerror()); - return; - } + /* Get the path to current library by current function pointer */ + (void)dlerror(); + ret = dladdr(ucm_prevent_dl_unload, &info); + if (ret == 0) { + ucm_warn("could not find address of current library: %s", dlerror()); + return; + } - /* Load the current library with NODELETE flag, to prevent it from being - * unloaded. This will create extra reference to the library, but also add - * NODELETE flag to the dynamic link map. 
- */ - (void)dlerror(); - dl = dlopen(info.dli_fname, flags); - if (dl == NULL) { - ucm_warn("failed to load '%s': %s", info.dli_fname, dlerror()); - return; - } + /* Load the current library with NODELETE flag, to prevent it from being + * unloaded. This will create extra reference to the library, but also add + * NODELETE flag to the dynamic link map. + */ + (void)dlerror(); + dl = dlopen(info.dli_fname, flags); + if (dl == NULL) { + ucm_warn("failed to load '%s': %s", info.dli_fname, dlerror()); + return; + } - ucm_debug("reloaded '%s' at %p with NODELETE flag", info.dli_fname, dl); + ucm_debug("loaded '%s' at %p with NODELETE flag", info.dli_fname, dl); - /* Now we drop our reference to the lib, and it won't be unloaded anymore */ - dlclose(dl); + /* coverity[overwrite_var] */ + dl = NULL; + } } char *ucm_concat_path(char *buffer, size_t max, const char *dir, const char *file) @@ -349,3 +354,25 @@ char *ucm_concat_path(char *buffer, size_t max, const char *dir, const char *fil return buffer; } + +void *ucm_brk_syscall(void *addr) +{ + void *result; + +#ifdef __x86_64__ + asm volatile("mov %1, %%rdi\n\t" + "mov $0xc, %%eax\n\t" + "syscall\n\t" + : "=a"(result) + : "m"(addr)); +#else + /* TODO implement 64-bit syscall for aarch64, ppc64le */ + result = (void*)syscall(SYS_brk, addr); +#endif + return result; +} + +pid_t ucm_get_tid() +{ + return syscall(SYS_gettid); +} diff --git a/src/ucm/util/sys.h b/src/ucm/util/sys.h index 37a1d927ef3..838fb61a1c5 100644 --- a/src/ucm/util/sys.h +++ b/src/ucm/util/sys.h @@ -8,6 +8,9 @@ #ifndef UCM_UTIL_SYS_H_ #define UCM_UTIL_SYS_H_ +#include +#include +#include #include @@ -88,4 +91,44 @@ void ucm_prevent_dl_unload(); char *ucm_concat_path(char *buffer, size_t max, const char *dir, const char *file); +/** + * Perform brk() syscall + * + * @param addr Address to set as new program break. + * + * @return New program break. + * + * @note If the break could not be changed (for example, parameter was invalid + * or exceeds limits) the break remains unchanged. + */ +void *ucm_brk_syscall(void *addr); + + +/** + * @return System thread id of the current thread. + */ +pid_t ucm_get_tid(); + + +/** + * Get memory hooks mode to use, based on the configured mode and runtime. + * + * @param config_mode Configured memory hook mode. + * + * @return Memory hook mode to use. + */ +static UCS_F_ALWAYS_INLINE ucm_mmap_hook_mode_t +ucm_get_hook_mode(ucm_mmap_hook_mode_t config_mode) +{ +#ifdef __SANITIZE_ADDRESS__ + return UCM_MMAP_HOOK_NONE; +#else + if (RUNNING_ON_VALGRIND && (config_mode == UCM_MMAP_HOOK_BISTRO)) { + return UCM_MMAP_HOOK_RELOC; + } + + return config_mode; +#endif +} + #endif diff --git a/src/ucp/Makefile.am b/src/ucp/Makefile.am index 09396bfb4d9..a6e2c9bc75e 100644 --- a/src/ucp/Makefile.am +++ b/src/ucp/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. +# Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. # Copyright (c) UT-Battelle, LLC. 2017. ALL RIGHTS RESERVED. # Copyright (C) Los Alamos National Security, LLC. 2019. ALL RIGHTS RESERVED. # See file LICENSE for terms. 
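In bistro mode, the sbrk() replacement above is built from the two helpers declared in mmap.h and sys.h: ucm_get_current_brk() reads the program break and ucm_brk_syscall() moves it with a raw syscall, so the (possibly patched) libc entry points are never re-entered. A minimal stand-alone sketch of how they compose (demo_sbrk is a hypothetical name used only for illustration; failure handling mirrors ucm_orig_brk()):

#include <errno.h>
#include <stdint.h>

static void *demo_sbrk(intptr_t increment)
{
    /* read the current program break without moving it */
    void *prev   = ucm_get_current_brk();
    void *wanted = (char*)prev + increment;

    /* the raw brk syscall returns the new break; any result other than
     * 'wanted' means the kernel refused to move it */
    if (ucm_brk_syscall(wanted) != wanted) {
        errno = ENOMEM;
        return (void*)-1;
    }

    return prev; /* like sbrk(), return the previous break on success */
}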
@@ -57,6 +57,8 @@ noinst_HEADERS = \ proto/proto.h \ rma/rma.h \ rma/rma.inl \ + rndv/proto_rndv.h \ + rndv/proto_rndv.inl \ rndv/rndv.h \ tag/eager.h \ tag/tag_rndv.h \ @@ -103,6 +105,7 @@ libucp_la_SOURCES = \ proto/lane_type.c \ proto/proto_am.c \ proto/proto_common.c \ + proto/proto_reconfig.c \ proto/proto_multi.c \ proto/proto_select.c \ proto/proto_single.c \ @@ -118,6 +121,10 @@ libucp_la_SOURCES = \ rma/rma_send.c \ rma/rma_sw.c \ rma/flush.c \ + rndv/proto_rndv.c \ + rndv/rndv_am.c \ + rndv/rndv_get.c \ + rndv/rndv_rtr.c \ rndv/rndv.c \ tag/eager_multi.c \ tag/eager_rcv.c \ @@ -129,6 +136,7 @@ libucp_la_SOURCES = \ tag/tag_recv.c \ tag/tag_send.c \ tag/offload.c \ + tag/offload/eager.c \ wireup/address.c \ wireup/ep_match.c \ wireup/select.c \ diff --git a/src/ucp/api/ucp.h b/src/ucp/api/ucp.h index a6fe1da61c7..c95b2230365 100644 --- a/src/ucp/api/ucp.h +++ b/src/ucp/api/ucp.h @@ -1,5 +1,5 @@ /* -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2014-2017. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016-2017. ALL RIGHTS RESERVED. * Copyright (C) Los Alamos National Security, LLC. 2018 ALL RIGHTS RESERVED. @@ -124,7 +124,8 @@ enum ucp_params_field { UCP_PARAM_FIELD_TAG_SENDER_MASK = UCS_BIT(4), /**< tag_sender_mask */ UCP_PARAM_FIELD_MT_WORKERS_SHARED = UCS_BIT(5), /**< mt_workers_shared */ UCP_PARAM_FIELD_ESTIMATED_NUM_EPS = UCS_BIT(6), /**< estimated_num_eps */ - UCP_PARAM_FIELD_ESTIMATED_NUM_PPN = UCS_BIT(7) /**< estimated_num_ppn */ + UCP_PARAM_FIELD_ESTIMATED_NUM_PPN = UCS_BIT(7), /**< estimated_num_ppn */ + UCP_PARAM_FIELD_NAME = UCS_BIT(8) /**< name */ }; @@ -165,11 +166,26 @@ enum ucp_worker_params_field { UCP_WORKER_PARAM_FIELD_CPU_MASK = UCS_BIT(1), /**< Worker's CPU bitmap */ UCP_WORKER_PARAM_FIELD_EVENTS = UCS_BIT(2), /**< Worker's events bitmap */ UCP_WORKER_PARAM_FIELD_USER_DATA = UCS_BIT(3), /**< User data */ - UCP_WORKER_PARAM_FIELD_EVENT_FD = UCS_BIT(4) /**< External event file + UCP_WORKER_PARAM_FIELD_EVENT_FD = UCS_BIT(4), /**< External event file descriptor */ + UCP_WORKER_PARAM_FIELD_FLAGS = UCS_BIT(5), /**< Worker flags */ + UCP_WORKER_PARAM_FIELD_NAME = UCS_BIT(6) /**< Worker name */ }; +/** + * @ingroup UCP_WORKER + * @brief UCP worker flags + * + * This enumeration allows specifying flags for @ref ucp_worker_params_t.flags, + * which is used as parameter for @ref ucp_worker_create. + */ +typedef enum { + UCP_WORKER_FLAG_IGNORE_REQUEST_LEAK = UCS_BIT(0) /**< Do not print warnings + about request leaks */ +} ucp_worker_flags_t; + + /** * @ingroup UCP_WORKER * @brief UCP listener parameters field mask. @@ -283,7 +299,11 @@ typedef enum { flag is not set then @ref ucp_ep_close_nbx schedules flushes on all outstanding - operations. */ + operations. + @note this flag is incompatible + with @ref UCP_OP_ATTR_FLAG_NO_IMM_CMPL, + since it forces immediate completion. + */ } ucp_ep_close_flags_t; @@ -313,6 +333,34 @@ enum ucp_ep_close_mode { }; +/** + * @ingroup UCP_ENDPOINT + * @brief UCP performance fields and flags + * + * The enumeration allows specifying which fields in @ref ucp_ep_evaluate_perf_param_t are + * present and operation flags are used. It is used to enable backward + * compatibility support. 
+ */ +typedef enum ucp_ep_perf_param_field { + /** Enables @ref ucp_ep_evaluate_perf_param_t::message_size */ + UCP_EP_PERF_PARAM_FIELD_MESSAGE_SIZE = UCS_BIT(0) +} ucp_ep_perf_param_field_t; + + +/** + * @ingroup UCP_ENDPOINT + * @brief UCP performance fields and flags + * + * The enumeration allows specifying which fields in @ref ucp_ep_evaluate_perf_attr_t are + * present and operation flags are used. It is used to enable backward + * compatibility support. + */ +typedef enum ucp_ep_perf_attr_field { + /** Enables @ref ucp_ep_evaluate_perf_attr_t::estimated_time */ + UCP_EP_PERF_ATTR_FIELD_ESTIMATED_TIME = UCS_BIT(0) +} ucp_ep_perf_attr_field_t; + + /** * @ingroup UCP_MEM * @brief UCP memory mapping parameters field mask. @@ -347,6 +395,19 @@ enum ucp_mem_advise_params_field { }; +/** + * @ingroup UCP_CONTEXT + * @brief UCP library attributes field mask. + * + * The enumeration allows specifying which fields in @ref ucp_lib_attr_t are + * present. It is used to enable backward compatibility support. + */ +enum ucp_lib_attr_field { + /**< UCP library maximum supported thread level flag */ + UCP_LIB_ATTR_FIELD_MAX_THREAD_LEVEL = UCS_BIT(0) +}; + + /** * @ingroup UCP_CONTEXT * @brief UCP context attributes field mask. @@ -357,7 +418,8 @@ enum ucp_mem_advise_params_field { enum ucp_context_attr_field { UCP_ATTR_FIELD_REQUEST_SIZE = UCS_BIT(0), /**< UCP request size */ UCP_ATTR_FIELD_THREAD_MODE = UCS_BIT(1), /**< UCP context thread flag */ - UCP_ATTR_FIELD_MEMORY_TYPES = UCS_BIT(2) /**< UCP supported memory types */ + UCP_ATTR_FIELD_MEMORY_TYPES = UCS_BIT(2), /**< UCP supported memory types */ + UCP_ATTR_FIELD_NAME = UCS_BIT(3) /**< UCP context name */ }; @@ -372,8 +434,9 @@ enum ucp_worker_attr_field { UCP_WORKER_ATTR_FIELD_THREAD_MODE = UCS_BIT(0), /**< UCP thread mode */ UCP_WORKER_ATTR_FIELD_ADDRESS = UCS_BIT(1), /**< UCP address */ UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS = UCS_BIT(2), /**< UCP address flags */ - UCP_WORKER_ATTR_FIELD_MAX_AM_HEADER = UCS_BIT(3) /**< Maximal header size + UCP_WORKER_ATTR_FIELD_MAX_AM_HEADER = UCS_BIT(3), /**< Maximum header size used by UCP AM API */ + UCP_WORKER_ATTR_FIELD_NAME = UCS_BIT(4) /**< UCP worker name */ }; @@ -463,13 +526,30 @@ enum { * @ingroup UCP_WORKER * @brief Flags for a UCP Active Message callback. * - * Flags that indicate how to handle UCP Active Messages - * Currently only UCP_AM_FLAG_WHOLE_MSG is supported, - * which indicates the entire message is handled in one - * callback. + * Flags that indicate how to handle UCP Active Messages. */ enum ucp_am_cb_flags { - UCP_AM_FLAG_WHOLE_MSG = UCS_BIT(0) + /** + * Indicates that the entire message will be handled in one callback. With this + * option, message ordering is not guaranteed (i.e. receive callbacks may be + * invoked in a different order than messages were sent). + * If this flag is not set, the data callback may be invoked several times for + * the same message (if, for example, it was split into several fragments by + * the transport layer). It is guaranteed that the first data callback for a + * particular message is invoked for the first fragment. The ordering of first + * message fragments is guaranteed (i.e. receive callbacks will be called + * in the order the messages were sent). The order of other fragments is not + * guaranteed. User header is passed with the first fragment only. 
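+ *
+ * A minimal reassembly sketch for a handler registered without this flag
+ * ('msg_buf' handling is illustrative only and assumes a single
+ * multi-fragment message in flight; see @a UCP_AM_RECV_ATTR_FLAG_FIRST and
+ * @ref ucp_am_recv_param_t):
+ * @code {.c}
+ * static char *msg_buf;
+ *
+ * ucs_status_t am_frag_cb(void *arg, const void *header,
+ *                         size_t header_length, void *data, size_t length,
+ *                         const ucp_am_recv_param_t *param)
+ * {
+ *     if (param->recv_attr & UCP_AM_RECV_ATTR_FLAG_FIRST) {
+ *         msg_buf = malloc(param->total_length); // whole message size
+ *     }
+ *     memcpy(msg_buf + param->frag_offset, data, length);
+ *     return UCS_OK; // data need not persist after the callback returns
+ * }
+ * @endcode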
+ */ + UCP_AM_FLAG_WHOLE_MSG = UCS_BIT(0), + + /** + * Guarantees that the specified @ref ucp_am_recv_callback_t callback + * will always be called with @ref UCP_AM_RECV_ATTR_FLAG_DATA flag set, + * so the data will be accessible outside the callback, until + * @ref ucp_am_data_release is called. + */ + UCP_AM_FLAG_PERSISTENT_DATA = UCS_BIT(1) }; @@ -595,10 +675,11 @@ typedef enum { UCP_OP_ATTR_FIELD_FLAGS = UCS_BIT(4), /**< operation-specific flags */ UCP_OP_ATTR_FIELD_REPLY_BUFFER = UCS_BIT(5), /**< reply_buffer field */ UCP_OP_ATTR_FIELD_MEMORY_TYPE = UCS_BIT(6), /**< memory type field */ + UCP_OP_ATTR_FIELD_RECV_INFO = UCS_BIT(7), /**< recv_info field */ UCP_OP_ATTR_FLAG_NO_IMM_CMPL = UCS_BIT(16), /**< deny immediate completion */ UCP_OP_ATTR_FLAG_FAST_CMPL = UCS_BIT(17), /**< expedite local completion, - even if it delays remote + even if it delays remote data delivery. Note for implementer: this option can disable zero copy @@ -623,7 +704,10 @@ typedef enum { * backward compatibility support. */ typedef enum { - UCP_AM_RECV_ATTR_FIELD_REPLY_EP = UCS_BIT(0), /**< reply_ep field */ + UCP_AM_RECV_ATTR_FIELD_REPLY_EP = UCS_BIT(0), /**< reply_ep field */ + UCP_AM_RECV_ATTR_FIELD_TOTAL_LENGTH = UCS_BIT(1), /**< total_length field */ + UCP_AM_RECV_ATTR_FIELD_FRAG_OFFSET = UCS_BIT(2), /**< frag_offset field */ + UCP_AM_RECV_ATTR_FIELD_MSG_CONTEXT = UCS_BIT(3), /**< msg_context field */ /** * Indicates that the data provided in @ref ucp_am_recv_callback_t callback @@ -632,16 +716,34 @@ typedef enum { * @ref ucp_am_data_release when data is no longer needed. This flag is * mutually exclusive with @a UCP_AM_RECV_ATTR_FLAG_RNDV. */ - UCP_AM_RECV_ATTR_FLAG_DATA = UCS_BIT(16), + UCP_AM_RECV_ATTR_FLAG_DATA = UCS_BIT(16), /** * Indicates that the arriving data was sent using rendezvous protocol. * In this case @a data parameter of the @ref ucp_am_recv_callback_t points * to the internal UCP descriptor, which can be used for obtaining the actual * data by calling @ref ucp_am_recv_data_nbx routine. This flag is mutually - * exclusive with @a UCP_AM_RECV_ATTR_FLAG_DATA. + * exclusive with @a UCP_AM_RECV_ATTR_FLAG_DATA, @a UCP_AM_RECV_ATTR_FLAG_FIRST + * and @a UCP_AM_RECV_ATTR_FLAG_ONLY flags. + */ + UCP_AM_RECV_ATTR_FLAG_RNDV = UCS_BIT(17), + + /** + * Indicates that the incoming data is the first fragment of a multi-fragment + * eager message. This flag can only be passed to data handlers registered + * without @a UCP_AM_FLAG_WHOLE_MSG flag. This flag is mutually exclusive + * with @a UCP_AM_RECV_ATTR_FLAG_RNDV and @a UCP_AM_RECV_ATTR_FLAG_ONLY flags. */ - UCP_AM_RECV_ATTR_FLAG_RNDV = UCS_BIT(17) + UCP_AM_RECV_ATTR_FLAG_FIRST = UCS_BIT(18), + + /** + * Indicates that the incoming data carries the whole message. This flag is + * mutually exclusive with @a UCP_AM_RECV_ATTR_FLAG_RNDV and + * @a UCP_AM_RECV_ATTR_FLAG_FIRST flags. Also this flag is always passed to + * the data handlers that are registered with @a UCP_AM_FLAG_WHOLE_MSG + * flag. + */ + UCP_AM_RECV_ATTR_FLAG_ONLY = UCS_BIT(19) } ucp_am_recv_attr_t; @@ -794,8 +896,8 @@ typedef struct ucp_generic_dt_ops { * @ref ucp_generic_dt_ops::start_pack * "start_pack()" routine. * @param [in] offset Virtual offset in the output stream. - * @param [in] dest Destination to pack the data to. - * @param [in] max_length Maximal length to pack. + * @param [in] dest Destination buffer to pack the data. + * @param [in] max_length Maximum length to pack. * * @return The size of the data that was written to the destination buffer.
* Must be less than or equal to @e max_length. @@ -942,14 +1044,50 @@ typedef struct ucp_params { * will override the number of endpoints set by @e estimated_num_ppn */ size_t estimated_num_ppn; + + /** + * Tracing and analysis tools can identify the context using this name. + * To retrieve the context's name, use @ref ucp_context_query, as the name + * you supply may be changed by UCX under some circumstances, e.g. a name + * conflict. This field is only assigned if you set + * @ref UCP_PARAM_FIELD_NAME in the field mask. If not, then a default + * unique name will be created for you. + */ + const char *name; } ucp_params_t; +/** + * @ingroup UCP_CONTEXT + * @brief Lib attributes. + * + * The structure defines the attributes that characterize the Library. + */ +typedef struct ucp_lib_attr { + /** + * Mask of valid fields in this structure, using bits from + * @ref ucp_lib_attr_field. + * Fields not specified in this mask will be ignored. + * Provides ABI compatibility with respect to adding new fields. + */ + uint64_t field_mask; + + /** + * Maximum level of thread support of the library, which is permanent + * throughout the lifetime of the library. Accordingly, the user can call + * @ref ucp_worker_create with appropriate + * @ref ucp_worker_params_t.thread_mode. + * For supported thread levels please see @ref ucs_thread_mode_t. + */ + ucs_thread_mode_t max_thread_level; +} ucp_lib_attr_t; + + /** * @ingroup UCP_CONTEXT * @brief Context attributes. * - * The structure defines the attributes which characterize + * The structure defines the attributes that characterize * the particular context. */ typedef struct ucp_context_attr { @@ -979,6 +1117,11 @@ typedef struct ucp_context_attr { * please see @ref ucs_memory_type_t. */ uint64_t memory_types; + + /** + * Tracing and analysis tools can use name to identify this UCX context. + */ + char name[UCP_ENTITY_NAME_MAX]; } ucp_context_attr_t; @@ -1026,9 +1169,14 @@ typedef struct ucp_worker_attr { size_t address_length; /** - * Maximal allowed header size for @ref ucp_am_send_nbx routine + * Maximum allowed header size for @ref ucp_am_send_nbx routine. */ size_t max_am_header; + + /** + * Tracing and analysis tools can identify the worker using this name. + */ + char name[UCP_ENTITY_NAME_MAX]; } ucp_worker_attr_t; @@ -1104,9 +1252,75 @@ typedef struct ucp_worker_params { */ int event_fd; + /** + * Worker flags. + * This value is optional. + * If @ref UCP_WORKER_PARAM_FIELD_FLAGS is not set in the field_mask, the + * value of this field will default to 0. + */ + uint64_t flags; + + /** + * Tracing and analysis tools can identify the worker using this name. To + * retrieve the worker's name, use @ref ucp_worker_query, as the name you + * supply may be changed by UCX under some circumstances, e.g. a name + * conflict. This field is only assigned if you set + * @ref UCP_WORKER_PARAM_FIELD_NAME in the field mask. If not, then a + * default unique name will be created for you. + */ + const char *name; + } ucp_worker_params_t; +/** + * @ingroup UCP_ENDPOINT + * @brief UCP endpoint performance evaluation request attributes. + * + * The structure defines the attributes which characterize + * the request for performance estimation of a particular endpoint. + */ +typedef struct { + /** + * Mask of valid fields in this structure, using bits from + * @ref ucp_ep_perf_param_field_t. + * Fields not specified in this mask will be ignored. + * Provides ABI compatibility with respect to adding new fields. 
+ */ + uint64_t field_mask; + + /** + * Message size to use for determining performance. + * This field must be initialized by the caller. + */ + size_t message_size; +} ucp_ep_evaluate_perf_param_t; + + +/** + * @ingroup UCP_ENDPOINT + * @brief UCP endpoint performance evaluation result attributes. + * + * The structure defines the attributes which characterize + * the result of performance estimation of a particular endpoint. + */ +typedef struct { + /** + * Mask of valid fields in this structure, using bits from + * @ref ucp_ep_perf_attr_field_t. + * Fields not specified in this mask will be ignored. + * Provides ABI compatibility with respect to adding new fields. + */ + uint64_t field_mask; + + /** + * Estimated time (in seconds) required to send a message of a given size + * on this endpoint. + * This field is set by the @ref ucp_ep_evaluate_perf function. + */ + double estimated_time; +} ucp_ep_evaluate_perf_attr_t; + /** * @ingroup UCP_WORKER * @brief UCP listener attributes. @@ -1193,6 +1407,11 @@ typedef struct ucp_listener_params { * flow. In order for the callback inside this handler to be invoked, the * @ref UCP_LISTENER_PARAM_FIELD_CONN_HANDLER needs to be set in the * field_mask. + * @note User is expected to call ucp_ep_create with set + * @ref UCP_EP_PARAM_FIELD_CONN_REQUEST flag to + * @ref ucp_ep_params_t::field_mask and + * @ref ucp_ep_params_t::conn_request in order to be able to receive + * communications. */ ucp_listener_conn_handler_t conn_handler; } ucp_listener_params_t; @@ -1296,7 +1515,7 @@ typedef struct ucp_mem_map_params { * - Memory registration: This field specifies the type of memory which is * pointed by @ref ucp_mem_map_params.address. If it's not set (along with its * corresponding bit in the field_mask - @ref UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE), - * or set to @ref UCS_MEMORY_TYPE_UNKNOWN, the memory type will be dectected + * or set to @ref UCS_MEMORY_TYPE_UNKNOWN, the memory type will be detected * internally. */ ucs_memory_type_t memory_type; @@ -1343,7 +1562,7 @@ struct ucp_tag_recv_info { * .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | * UCP_OP_ATTR_FIELD_REQUEST, * .request = request, - * .cb.ucp_send_nbx_callback_t = custom_send_callback_f, + * .cb.send = custom_send_callback_f, * .user_data = pointer_to_user_context_passed_to_cb * }; * @@ -1416,6 +1635,21 @@ typedef struct { * which means the memory type will be detected internally. */ ucs_memory_type_t memory_type; + + /** + * Pointer to the information where received data details are stored + * in case of an immediate completion of receive operation. The user has to + * provide a pointer to valid memory/variable which will be updated on function + * return. + */ + union { + size_t *length; /* Length of received message in bytes. + Relevant for non-tagged receive + operations. */ + ucp_tag_recv_info_t *tag_info; /* Information about received message. + Relevant for @a ucp_tag_recv_nbx + function. */ + } recv_info; } ucp_request_param_t; @@ -1469,12 +1703,53 @@ struct ucp_am_recv_param { uint64_t recv_attr; /** - * Endpoint, which can be used for reply to this message. + * Endpoint, which can be used for the reply to this message. */ ucp_ep_h reply_ep; + + /** + * Length of the whole message in bytes. Relevant for multi-fragment eager + * messages handled by data handlers registered without UCP_AM_FLAG_WHOLE_MSG + * flag. + */ + size_t total_length; + + /** + * Offset of the message fragment in bytes relative to the beginning of + * overall message. 
Layout of the multi-fragment message is depicted below: + * Multi-fragment message + * +--------+--------+--------+--------+ + * |frag 1 |frag 2 | ... |frag N | + * +--------+--------+--------+--------+ + * | v + * | offset of the N-th fragment + * v + * offset of the 2nd fragment + */ + size_t frag_offset; + + /** + * Storage for a per-message user-defined context. User initializes it + * when the first fragment arrives and then it is provided with each + * subsequent fragment of this message. + */ + void **msg_context; }; +/** + * @ingroup UCP_CONTEXT + * @brief Get attributes of the UCP library. + * + * This routine fetches information about the UCP library attributes. + * + * @param [out] attr Filled with attributes of the UCP library. + * + * @return Error code as defined by @ref ucs_status_t + */ +ucs_status_t ucp_lib_query(ucp_lib_attr_t *attr); + + /** * @ingroup UCP_CONFIG * @brief Read UCP configuration descriptor @@ -1837,7 +2112,7 @@ unsigned ucp_worker_progress(ucp_worker_h worker); * @param [in] worker Worker to poll. * @param [out] poll_eps Pointer to array of endpoints, should be * allocated by user. - * @param [in] max_eps Maximal number of endpoints which should be filled + * @param [in] max_eps Maximum number of endpoints that should be filled * in @a poll_eps. * @param [in] flags Reserved for future use. * @@ -1965,13 +2240,13 @@ void ucp_worker_wait_mem(ucp_worker_h worker, void *address); * @code {.c} * void application_initialization() { * // should be called once in application init flow and before - * // process_comminucation() is used + * // process_communication() is used * ... * status = ucp_worker_get_efd(worker, &fd); * ... * } * - * void process_comminucation() { + * void process_communication() { * // should be called every time there is a need to wait for some condition * // such as ucp request completion in sleep mode. * @@ -2036,22 +2311,28 @@ ucs_status_t ucp_worker_signal(ucp_worker_h worker); /** * @ingroup UCP_WORKER - * @brief Accept connections on a local address of the worker object. - * - * This routine binds the worker object to a @ref ucs_sock_addr_t sockaddr - * which is set by the user. - * The worker will listen to incoming connection requests and upon receiving such - * a request from the remote peer, an endpoint to it will be created. - * The user's call-back will be invoked once the endpoint is created. - * - * @param [in] worker Worker object that is associated with the - * params object. + * @brief Create a listener to accept connections on. Connection requests on + * the listener will arrive at a local address specified by the user. + * + * This routine creates a new listener object that is bound to a specific + * local address. + * The listener will listen to incoming connection requests. + * After receiving a request from the remote peer, an endpoint to this peer + * will be created - either right away or by calling @ref ucp_ep_create, + * as specified by the callback type in @ref ucp_listener_params_t. + * The user's callback will be invoked once the endpoint is created. + * + * @param [in] worker Worker object to create the listener on. + * @param [in] params User defined @ref ucp_listener_params_t * configurations for the @ref ucp_listener_h.
* @param [out] listener_p A handle to the created listener, can be released * by calling @ref ucp_listener_destroy * * @return Error code as defined by @ref ucs_status_t + * + * @note @ref ucp_listener_params_t::conn_handler or + * @ref ucp_listener_params_t::accept_handler must be provided to be + * able to handle incoming connections. */ ucs_status_t ucp_listener_create(ucp_worker_h worker, const ucp_listener_params_t *params, @@ -2305,6 +2586,24 @@ ucs_status_ptr_t ucp_ep_flush_nb(ucp_ep_h ep, unsigned flags, ucs_status_ptr_t ucp_ep_flush_nbx(ucp_ep_h ep, const ucp_request_param_t *param); +/** + * @ingroup UCP_ENDPOINT + * @brief Estimate performance characteristics of a specific endpoint. + * + * This routine fetches information about the endpoint. + * + * @param [in] ep Endpoint to query. + * @param [in] param Filled by the user with request params. + * @param [out] attr Filled with performance estimation of the given operation + * on the endpoint. + * + * @return Error code as defined by @ref ucs_status_t + */ +ucs_status_t ucp_ep_evaluate_perf(ucp_ep_h ep, + const ucp_ep_evaluate_perf_param_t *param, + ucp_ep_evaluate_perf_attr_t *attr); + + /** * @ingroup UCP_MEM * @brief Map or allocate memory for zero-copy operations. @@ -2532,7 +2831,7 @@ ucs_status_t ucp_mem_advise(ucp_context_h context, ucp_mem_h memh, * * @note * @li RKEYs for InfiniBand and Cray Aries networks typically includes - * InifiniBand and Aries key. + * InfiniBand and Aries key. * @li In order to enable remote direct memory access to the memory associated * with the memory handle the application is responsible for sharing the RKEY with * the peers that will initiate the access. @@ -2646,11 +2945,11 @@ void ucp_rkey_destroy(ucp_rkey_h rkey); * @brief Add user defined callback for Active Message. * * This routine installs a user defined callback to handle incoming Active - * Messages with a specific id. This callback is called whenever an Active - * Message that was sent from the remote peer by @ref ucp_am_send_nb is + * Messages with a specific id. This callback is called whenever an Active + * Message that was sent from the remote peer by @ref ucp_am_send_nb is * received on this worker. * - * @param [in] worker UCP worker on which to set the Active Message + * @param [in] worker UCP worker on which to set the Active Message * handler. * @param [in] id Active Message id. * @param [in] cb Active Message callback. NULL to clear. @@ -2739,8 +3038,6 @@ ucs_status_ptr_t ucp_am_send_nb(ucp_ep_h ep, uint16_t id, * @note If UCP_OP_ATTR_FLAG_NO_IMM_CMPL flag is set in the op_attr_mask field * of @a param, then the operation will return a request handle, even if * it completes immediately. - * @note Currently Active Message API supports communication operations with - * host memory only. * @note This operation supports specific flags, which can be passed * in @a param by @ref ucp_request_param_t.flags. The exact set of flags * is defined by @ref ucp_send_am_flags. @@ -2758,9 +3055,9 @@ ucs_status_ptr_t ucp_am_send_nb(ucp_ep_h ep, uint16_t id, * @param [in] param Operation parameters, see @ref ucp_request_param_t. * * @note Sending only header without actual data is allowed and is recommended - * for transfering latency-critical amount of data. + * for transferring a latency-critical amount of data. * @note The maximum allowed header size can be obtained by querying worker - * attributes by @ref ucp_worker_query routine. + * attributes by the @ref ucp_worker_query routine. 
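+ *
+ * A minimal call sketch ('ep', 'id', 'hdr'/'hdr_len' and 'buf'/'len' are
+ * assumed to exist; error handling is omitted):
+ * @code {.c}
+ * ucp_request_param_t param = {
+ *     .op_attr_mask = 0 // defaults; completion is detected by return value
+ * };
+ * ucs_status_ptr_t sreq = ucp_am_send_nbx(ep, id, hdr, hdr_len, buf, len,
+ *                                         &param);
+ * if (UCS_PTR_IS_PTR(sreq)) {
+ *     // progress the worker until completion, then ucp_request_free(sreq)
+ * }
+ * @endcode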
* * * @return NULL - Active Message was sent immediately. @@ -2781,7 +3078,7 @@ ucs_status_ptr_t ucp_am_send_nbx(ucp_ep_h ep, unsigned id, /** * @ingroup UCP_COMM - * @brief Receive Active Message sent with rendezvous protocol. + * @brief Receive Active Message as defined by provided data descriptor. * * This routine receives a message that is described by the data descriptor * @a data_desc, local address @a buffer, size @a count and @a param @@ -2790,13 +3087,19 @@ ucs_status_ptr_t ucp_am_send_nbx(ucp_ep_h ep, unsigned id, * message is delivered to the @a buffer. If the receive operation cannot be * started the routine returns an error. * + * @note This routine can be performed on any valid data descriptor delivered in + * @ref ucp_am_recv_callback_t. + * Data descriptor is considered to be valid if: + * - It is a rendezvous request (@a UCP_AM_RECV_ATTR_FLAG_RNDV is set in + * @ref ucp_am_recv_param_t.recv_attr) or + * - It is a persistent data pointer (@a UCP_AM_RECV_ATTR_FLAG_DATA is set + * in @ref ucp_am_recv_param_t.recv_attr). In this case receive + * operation may be needed to unpack data to device memory (for example + * GPU device) or some specific datatype. * @note After this call UCP takes ownership of @a data_desc descriptor, so * there is no need to release it even if the operation fails. - * The routine returns a request handle instead, which can further be used - * for tracking operation progress. - * - * @note Currently Active Message API supports communication operations with - * host memory only. + * The routine returns a request handle instead, which can be used for + * tracking operation progress. * * @param [in] worker Worker that is used for the receive operation. * @param [in] data_desc Data descriptor, provided in @@ -2805,6 +3108,11 @@ ucs_status_ptr_t ucp_am_send_nbx(ucp_ep_h ep, unsigned id, * @param [in] count Number of elements to receive into @a buffer. * @param [in] param Operation parameters, see @ref ucp_request_param_t. * + * @return NULL - The receive operation was completed + * immediately. In this case, if + * @a param->recv_info.length is specified in the + * @a param, the value to which it points is updated + * with the size of the received message. * @return UCS_PTR_IS_ERR(_ptr) - The receive operation failed. * @return otherwise - Receive operation was scheduled and can be * completed at any point in time. The request @@ -3160,7 +3468,7 @@ ucs_status_ptr_t ucp_tag_send_sync_nbx(ucp_ep_h ep, const void *buffer, * returns an error. * * @param [in] ep UCP endpoint that is used for the receive operation. - * @param [in] buffer Pointer to the buffer to receive the data to. + * @param [in] buffer Pointer to the buffer to receive the data. * @param [in] count Number of elements to receive into @a buffer. * @param [in] datatype Datatype descriptor for the elements in the buffer. * @param [in] cb Callback function that is invoked whenever the @@ -3215,7 +3523,8 @@ ucs_status_ptr_t ucp_stream_recv_nb(ucp_ep_h ep, void *buffer, size_t count, * @return NULL - The receive operation was completed * immediately. In this case the value pointed by * @a length is updated by the size of received - * data. + * data. Note @a param->recv_info is not relevant + * for this function. * @return UCS_PTR_IS_ERR(_ptr) - The receive operation failed. * @return otherwise - Operation was scheduled for receive. 
A request * handle is returned to the application in order * to track progress of the operation. The * application is responsible for releasing the * handle using @ref ucp_request_free * "ucp_request_free()" routine. @@ -3283,7 +3592,7 @@ ucs_status_ptr_t ucp_stream_recv_data_nb(ucp_ep_h ep, size_t *length); * handle or an error. * * @param [in] worker UCP worker that is used for the receive operation. - * @param [in] buffer Pointer to the buffer to receive the data to. + * @param [in] buffer Pointer to the buffer to receive the data. * @param [in] count Number of elements to receive * @param [in] datatype Datatype descriptor for the elements in the buffer. * @param [in] tag Message tag to expect. @@ -3322,7 +3631,7 @@ ucs_status_ptr_t ucp_tag_recv_nb(ucp_worker_h worker, void *buffer, size_t count * used. * * @param [in] worker UCP worker that is used for the receive operation. - * @param [in] buffer Pointer to the buffer to receive the data to. + * @param [in] buffer Pointer to the buffer to receive the data. * @param [in] count Number of elements to receive * @param [in] datatype Datatype descriptor for the elements in the buffer. * @param [in] tag Message tag to expect. @@ -3356,11 +3665,8 @@ ucs_status_t ucp_tag_recv_nbr(ucp_worker_h worker, void *buffer, size_t count, * message is in the receive buffer and ready for application access. If the * receive operation cannot be started the routine returns an error. * - * @note This routine cannot return UCS_OK. It always returns a request - * handle or an error. - * * @param [in] worker UCP worker that is used for the receive operation. - * @param [in] buffer Pointer to the buffer to receive the data to. + * @param [in] buffer Pointer to the buffer to receive the data. * @param [in] count Number of elements to receive * @param [in] tag Message tag to expect. * @param [in] tag_mask Bit mask that indicates the bits that are used for * matching of the receive tag * against the expected tag. * @param [in] param Operation parameters, see @ref ucp_request_param_t * + * @return NULL - The receive operation was completed + * immediately. In this case, if + * @a param->recv_info.tag_info is specified in the + * @a param, the value to which it points is updated + * with the information about the received message. * @return UCS_PTR_IS_ERR(_ptr) - The receive operation failed. - * @return otherwise - Operation was scheduled for receive. The request - * handle is returned to the application in order - * to track progress of the operation. The - * application is responsible for releasing the - * handle using @ref ucp_request_free - * "ucp_request_free()" routine. + * @return otherwise - Operation was scheduled for receive. The request + * handle is returned to the application in order + * to track progress of the operation. The + * application is responsible for releasing the + * handle using @ref ucp_request_free + * "ucp_request_free()" routine. */ ucs_status_ptr_t ucp_tag_recv_nbx(ucp_worker_h worker, void *buffer, size_t count, ucp_tag_t tag, ucp_tag_t tag_mask, @@ -3433,7 +3744,7 @@ ucp_tag_message_h ucp_tag_probe_nb(ucp_worker_h worker, ucp_tag_t tag, * This routine receives a message that is described by the local address @a * buffer, size @a count, @a message handle, and @a datatype object on the @a * worker. The @a message handle can be obtained by calling the @ref - * ucp_tag_probe_nb "ucp_tag_probe_nb()" routine. @ref ucp_tag_msg_recv_nb + * ucp_tag_probe_nb "ucp_tag_probe_nb()" routine. The @ref ucp_tag_msg_recv_nb * "ucp_tag_msg_recv_nb()" routine is non-blocking and therefore returns * immediately.
The receive operation is considered completed when the message * is delivered to the @a buffer. In order to notify the application about @@ -3443,7 +3754,7 @@ ucp_tag_message_h ucp_tag_probe_nb(ucp_worker_h worker, ucp_tag_t tag, * routine returns an error. * * @param [in] worker UCP worker that is used for the receive operation. - * @param [in] buffer Pointer to the buffer to receive the data to. + * @param [in] buffer Pointer to the buffer that will receive the data. * @param [in] count Number of elements to receive * @param [in] datatype Datatype descriptor for the elements in the buffer. * @param [in] message Message handle. @@ -3465,6 +3776,40 @@ ucs_status_ptr_t ucp_tag_msg_recv_nb(ucp_worker_h worker, void *buffer, ucp_tag_recv_callback_t cb); +/** + * @ingroup UCP_COMM + * @brief Non-blocking receive operation for a probed message. + * + * This routine receives a message that is described by the local address @a + * buffer, size @a count, and @a message handle on the @a worker. + * The @a message handle can be obtained by calling the @ref + * ucp_tag_probe_nb "ucp_tag_probe_nb()" routine. The @ref ucp_tag_msg_recv_nbx + * "ucp_tag_msg_recv_nbx()" routine is non-blocking and therefore returns + * immediately. The receive operation is considered completed when the message + * is delivered to the @a buffer. In order to notify the application about + * completion of the receive operation the UCP library will invoke the + * call-back @a cb when the received message is in the receive buffer and ready + * for application access. If the receive operation cannot be started the + * routine returns an error. + * + * @param [in] worker UCP worker that is used for the receive operation. + * @param [in] buffer Pointer to the buffer that will receive the data. + * @param [in] count Number of elements to receive + * @param [in] message Message handle. + * @param [in] param Operation parameters, see @ref ucp_request_param_t + * + * @return UCS_PTR_IS_ERR(_ptr) - The receive operation failed. + * @return otherwise - Operation was scheduled for receive. The request + * handle is returned to the application in order + * to track progress of the operation. The + * application is responsible for releasing the + * handle using @ref ucp_request_free + * "ucp_request_free()" routine. + */ +ucs_status_ptr_t ucp_tag_msg_recv_nbx(ucp_worker_h worker, void *buffer, + size_t count, ucp_tag_message_h message, + const ucp_request_param_t *param); + /** * @ingroup UCP_COMM * @brief Non-blocking implicit remote memory put operation. @@ -3588,7 +3933,7 @@ ucs_status_ptr_t ucp_put_nb(ucp_ep_h ep, const void *buffer, size_t length, * progress of the operation. The application is * responsible for releasing the handle using * @ref ucp_request_free "ucp_request_free()" routine. - * + * * @note Only the datatype ucp_dt_make_contig(1) is supported * for @a param->datatype, see @ref ucp_dt_make_contig. */ @@ -3714,7 +4059,7 @@ ucs_status_ptr_t ucp_get_nb(ucp_ep_h ep, void *buffer, size_t length, * progress of the operation. The application is * responsible for releasing the handle using * @ref ucp_request_free "ucp_request_free()" routine. - * + * * @note Only the datatype ucp_dt_make_contig(1) is supported * for @a param->datatype, see @ref ucp_dt_make_contig. 
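 *
 * A minimal call sketch of the nbx get flavor described here ('ep', 'rkey',
 * 'remote_addr' and a local 'buffer'/'length' are assumed to exist; error
 * handling is omitted):
 * @code {.c}
 * ucp_request_param_t param = { .op_attr_mask = 0 };
 * ucs_status_ptr_t greq = ucp_get_nbx(ep, buffer, length, remote_addr,
 *                                     rkey, &param);
 * // NULL means the data was delivered immediately; otherwise progress
 * // the worker and release the request with ucp_request_free()
 * @endcode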
*/ diff --git a/src/ucp/api/ucp_def.h b/src/ucp/api/ucp_def.h index 7f9bd92927f..b33aa1783c6 100644 --- a/src/ucp/api/ucp_def.h +++ b/src/ucp/api/ucp_def.h @@ -10,6 +10,7 @@ #ifndef UCP_DEF_H_ #define UCP_DEF_H_ +#include #include #include #include @@ -197,6 +198,11 @@ typedef struct ucp_mem_attr { * Size of the memory segment. */ size_t length; + + /** + * Type of allocated or registered memory + */ + ucs_memory_type_t mem_type; } ucp_mem_attr_t; @@ -208,8 +214,9 @@ typedef struct ucp_mem_attr { * present. It is used to enable backward compatibility support. */ enum ucp_mem_attr_field { - UCP_MEM_ATTR_FIELD_ADDRESS = UCS_BIT(0), /**< Virtual address */ - UCP_MEM_ATTR_FIELD_LENGTH = UCS_BIT(1) /**< The size of memory region */ + UCP_MEM_ATTR_FIELD_ADDRESS = UCS_BIT(0), /**< Virtual address */ + UCP_MEM_ATTR_FIELD_LENGTH = UCS_BIT(1), /**< The size of memory region */ + UCP_MEM_ATTR_FIELD_MEM_TYPE = UCS_BIT(2) /**< Type of allocated or registered memory */ }; @@ -629,29 +636,34 @@ typedef ucs_status_t (*ucp_am_callback_t)(void *arg, void *data, size_t length, * @param [in] length Length of data. If @a UCP_AM_RECV_ATTR_FLAG_RNDV * flag is set in @ref ucp_am_recv_param_t.recv_attr, * it indicates the required receive buffer size for - * initiating rendezvous protocol. + * initiating rendezvous protocol. If this receive + * handler was registered without UCP_AM_FLAG_WHOLE_MSG + * flag set, it represents length of received fragment. + * In this case the whole message length is available in + * @ref ucp_am_recv_param_t.total_length. * @param [in] param Data receive parameters. * - * @return UCS_OK @a data will not persist after the callback returns. - * If UCP_AM_RECV_ATTR_FLAG_RNDV flag is set in - * @a param->recv_attr, the data descriptor will be - * dropped and the corresponding @ref ucp_am_send_nbx - * call should complete with UCS_OK status. - * - * @return UCS_INPROGRESS Can only be returned if @a param->recv_attr flags - * contains UCP_AM_RECV_ATTR_FLAG_DATA or - * UCP_AM_RECV_ATTR_FLAG_RNDV. The @a data will persist - * after the callback has returned. To free the memory, - * a pointer to the data must be passed into - * @ref ucp_am_data_release or, in the case of rendezvous - * descriptor, data receive is initiated by - * @ref ucp_am_recv_data_nbx. - * - * @return otherwise Can only be returned if @a param->recv_attr contains - * UCP_AM_RECV_ATTR_FLAG_RNDV. In this case data - * descriptor @a data will be dropped and the - * corresponding @ref ucp_am_send_nbx call should - * complete with the status returned from the callback. + * @return UCS_OK @a data will not persist after the callback returns. + * If UCP_AM_RECV_ATTR_FLAG_RNDV flag is set in + * @a param->recv_attr and @ref ucp_am_recv_data_nbx was + * not called for this data, the data descriptor will be + * dropped and the corresponding @ref ucp_am_send_nbx + * call will complete with UCS_OK status. + * + * @return UCS_INPROGRESS Can only be returned if @a param->recv_attr flags + * contains UCP_AM_RECV_ATTR_FLAG_DATA or + * UCP_AM_RECV_ATTR_FLAG_RNDV. The @a data will persist + * after the callback has returned. To free the memory, + * a pointer to the data must be passed into + * @ref ucp_am_data_release or data receive is initiated by + * @ref ucp_am_recv_data_nbx. + * + * @return otherwise Can only be returned if @a param->recv_attr contains + * UCP_AM_RECV_ATTR_FLAG_RNDV. 
In this case data + * descriptor @a data will be dropped and the + * corresponding @ref ucp_am_send_nbx call on the + * sender side will complete with the status returned + * from the callback. * * @note This callback should be set and released * by @ref ucp_worker_set_am_recv_handler function. @@ -735,4 +747,12 @@ typedef struct ucp_ep_params { } ucp_ep_params_t; +/** + * @ingroup UCP_CONTEXT + * @brief Maximum size of the UCP entity name in structure of entity attributes + * provided by a query method. + */ +#define UCP_ENTITY_NAME_MAX 32 + + #endif diff --git a/src/ucp/core/ucp_am.c b/src/ucp/core/ucp_am.c index ade528a986a..83a39d17a81 100644 --- a/src/ucp/core/ucp_am.c +++ b/src/ucp/core/ucp_am.c @@ -23,10 +23,6 @@ #include -#define UCP_AM_SHORT_REPLY_MAX_SIZE (UCS_ALLOCA_MAX_SIZE - \ - sizeof(ucs_ptr_map_key_t)) - - UCS_ARRAY_IMPL(ucp_am_cbs, unsigned, ucp_am_entry_t, static) ucs_status_t ucp_am_init(ucp_worker_h worker) @@ -35,8 +31,7 @@ ucs_status_t ucp_am_init(ucp_worker_h worker) return UCS_OK; } - ucs_array_init_dynamic(ucp_am_cbs, &worker->am); - + ucs_array_init_dynamic(&worker->am); return UCS_OK; } @@ -46,7 +41,7 @@ void ucp_am_cleanup(ucp_worker_h worker) return; } - ucs_array_cleanup_dynamic(ucp_am_cbs, &worker->am); + ucs_array_cleanup_dynamic(&worker->am); } void ucp_am_ep_init(ucp_ep_h ep) @@ -62,34 +57,56 @@ void ucp_am_ep_init(ucp_ep_h ep) void ucp_am_ep_cleanup(ucp_ep_h ep) { ucp_ep_ext_proto_t *ep_ext = ucp_ep_ext_proto(ep); + ucp_recv_desc_t *rdesc, *tmp_rdesc; + ucs_queue_iter_t iter; + size_t UCS_V_UNUSED count; - if (ep->worker->context->config.features & UCP_FEATURE_AM) { - if (ucs_unlikely(!ucs_list_is_empty(&ep_ext->am.started_ams))) { - ucs_warn("worker %p: not all UCP active messages have been" - " run to completion on ep %p", ep->worker, ep); - } + if (!(ep->worker->context->config.features & UCP_FEATURE_AM)) { + return; + } - if (ucs_unlikely(!ucs_queue_is_empty(&ep_ext->am.mid_rdesc_q))) { - ucs_warn("worker %p: unhandled middle fragments left on ep %p", - ep->worker, ep); - } + count = 0; + ucs_list_for_each_safe(rdesc, tmp_rdesc, &ep_ext->am.started_ams, + am_first.list) { + ucs_list_del(&rdesc->am_first.list); + ucs_free(rdesc); + ++count; } + ucs_trace_data("worker %p: %zu unhandled first AM fragments have been" + " dropped on ep %p", ep->worker, count, ep); + + count = 0; + ucs_queue_for_each_safe(rdesc, iter, &ep_ext->am.mid_rdesc_q, + am_mid_queue) { + ucs_queue_del_iter(&ep_ext->am.mid_rdesc_q, iter); + ucp_recv_desc_release(rdesc); + ++count; + } + ucs_trace_data("worker %p: %zu unhandled middle AM fragments have been" + " dropped on ep %p", ep->worker, count, ep); } size_t ucp_am_max_header_size(ucp_worker_h worker) { + ucp_context_h context = worker->context; uct_iface_attr_t *if_attr; ucp_rsc_index_t iface_id; size_t max_am_header, max_uct_fragment; + size_t max_rts_size, max_ucp_header; - if (!(worker->context->config.features & UCP_FEATURE_AM)) { + if (!(context->config.features & UCP_FEATURE_AM)) { return 0ul; } - max_am_header = SIZE_MAX; + max_am_header = SIZE_MAX; + max_rts_size = sizeof(ucp_rndv_rts_hdr_t) + + ucp_rkey_packed_size(context, UCS_MASK(context->num_mds), + UCS_SYS_DEVICE_ID_UNKNOWN, 0); + max_ucp_header = ucs_max(max_rts_size, sizeof(ucp_am_first_hdr_t)); - /* TODO: Make sure maximal AM header can fit into one bcopy fragment - * together with RTS */ + /* Make sure maximal AM header can fit into one bcopy fragment + * together with RTS or first eager header (whatever is bigger) + */ for (iface_id = 0; iface_id < 
worker->num_ifaces; ++iface_id) { if_attr = &worker->ifaces[iface_id]->attr; @@ -104,9 +121,8 @@ size_t ucp_am_max_header_size(ucp_worker_h worker) */ if (if_attr->cap.flags & UCT_IFACE_FLAG_AM_BCOPY) { max_uct_fragment = ucs_max(if_attr->cap.am.max_bcopy, - sizeof(ucp_am_first_hdr_t) - 1) - - sizeof(ucp_am_first_hdr_t) - 1; - max_am_header = ucs_min(max_am_header, max_uct_fragment); + max_ucp_header - 1) - max_ucp_header - 1; + max_am_header = ucs_min(max_am_header, max_uct_fragment); } } @@ -115,22 +131,55 @@ size_t ucp_am_max_header_size(ucp_worker_h worker) return ucs_min(max_am_header, UINT32_MAX); } -static void ucp_am_rndv_send_ats(ucp_worker_h worker, - ucp_am_rndv_rts_hdr_t *rts, +static void ucp_am_rndv_send_ats(ucp_worker_h worker, ucp_rndv_rts_hdr_t *rts, ucs_status_t status) { ucp_request_t *req; + ucp_ep_h ep; + UCP_WORKER_GET_EP_BY_ID(&ep, worker, rts->sreq.ep_id, return, + "AM RNDV ATS"); req = ucp_request_get(worker); if (ucs_unlikely(req == NULL)) { ucs_error("failed to allocate request for AM RNDV ATS"); return; } - req->send.ep = ucp_worker_get_ep_by_id(worker, rts->super.sreq.ep_id); + req->send.ep = ep; req->flags = 0; - ucp_rndv_req_send_ats(req, NULL, rts->super.sreq.req_id, status); + ucp_rndv_req_send_ack(req, NULL, rts->sreq.req_id, status, + UCP_AM_ID_RNDV_ATS, "send_ats"); +} + +static UCS_F_ALWAYS_INLINE void ucp_am_release_long_desc(ucp_recv_desc_t *desc) +{ + /* Don't use UCS_PTR_BYTE_OFFSET here due to coverity false positive report. + * Need to step back by am_malloc_offset, where originally allocated pointer + * resides. */ + ucs_free((char*)desc - desc->am_malloc_offset); +} + +static UCS_F_ALWAYS_INLINE int +ucp_am_rdesc_in_progress(ucp_recv_desc_t *desc, ucs_status_t am_cb_status) +{ + if (!(desc->flags & UCP_RECV_DESC_FLAG_AM_CB_INPROGRESS)) { + /* Inprogress flag is cleared - it means ucp_am_recv_data_nbx operation + * was initiated and already completed. Thus, no need to save this data + * descriptor. + */ + ucs_assert(desc->flags & UCP_RECV_DESC_FLAG_RECV_STARTED); + return 0; + } else if ((am_cb_status != UCS_INPROGRESS) && + (!(desc->flags & UCP_RECV_DESC_FLAG_RECV_STARTED))) { + /* User returned UCS_OK or error (which is allowed in RNDV flow), and + * did not initiate receive operation. Thus, according to API, this data + * descriptor is not needed. + */ + return 0; + } + + return 1; } UCS_PROFILE_FUNC_VOID(ucp_am_data_release, (worker, data), @@ -139,16 +188,14 @@ UCS_PROFILE_FUNC_VOID(ucp_am_data_release, (worker, data), ucp_recv_desc_t *rdesc = (ucp_recv_desc_t *)data - 1; if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_MALLOC)) { - /* Don't use UCS_PTR_BYTE_OFFSET here due to coverity false - * positive report. Need to step back by first_header size, where - * originally allocated pointer resides. */ - ucs_free((char*)rdesc - sizeof(ucp_am_first_hdr_t)); + ucp_am_release_long_desc(rdesc); return; } if (rdesc->flags & UCP_RECV_DESC_FLAG_RNDV) { - if (rdesc->flags & UCP_RECV_DESC_FLAG_RNDV_STARTED) { - ucs_error("rndv receive is initiated on desc %p and cannot be released ", + if (rdesc->flags & UCP_RECV_DESC_FLAG_RECV_STARTED) { + ucs_error("rndv receive is initiated on desc %p and cannot be " + "released ", data); return; } @@ -259,7 +306,7 @@ static UCS_F_ALWAYS_INLINE ssize_t ucp_am_get_short_max(const ucp_request_t *req, ssize_t max_short) { return (UCP_DT_IS_CONTIG(req->send.datatype) && - UCP_MEM_IS_ACCESSIBLE_FROM_CPU(req->send.mem_type)) ? + UCP_MEM_IS_HOST(req->send.mem_type)) ? 
max_short : -1; } @@ -285,7 +332,7 @@ ucp_am_fill_first_header(ucp_am_first_hdr_t *hdr, ucp_request_t *req) ucp_am_fill_header(&hdr->super.super, req); hdr->super.ep_id = ucp_send_request_get_ep_remote_id(req); hdr->msg_id = req->send.msg_proto.message_id; - hdr->total_size = ucp_am_send_req_total_size(req); + hdr->total_size = req->send.length; } static UCS_F_ALWAYS_INLINE void @@ -370,20 +417,24 @@ static UCS_F_ALWAYS_INLINE ssize_t ucp_am_bcopy_pack_data(void *buffer, ucp_request_t *req, size_t length) { unsigned user_header_length = req->send.msg_proto.am.header_length; + size_t payload_length = length - user_header_length; + void *user_hdr; + + ucs_assertv((req->send.length == 0) || (length > user_header_length), + "length %zu, user_header length %u", length, + user_header_length); + if (user_header_length != 0) { - ucs_assert((req->send.length == 0) || (length > user_header_length)); - ucp_am_pack_user_header(buffer, req); - } - - return user_header_length + ucp_dt_pack(req->send.ep->worker, - req->send.datatype, - UCS_MEMORY_TYPE_HOST, - UCS_PTR_BYTE_OFFSET(buffer, - user_header_length), - req->send.buffer, - &req->send.state.dt, - length - user_header_length); + /* Pack user header to the end of message/fragment */ + user_hdr = UCS_PTR_BYTE_OFFSET(buffer, payload_length); + ucp_am_pack_user_header(user_hdr, req); + } + + return user_header_length + + ucp_dt_pack(req->send.ep->worker, req->send.datatype, + req->send.mem_type, buffer, req->send.buffer, + &req->send.state.dt, payload_length); } static size_t @@ -458,81 +509,49 @@ ucp_am_bcopy_pack_args_mid(void *dest, void *arg) ucs_assert(req->send.state.dt.offset > 0); return sizeof(*hdr) + ucp_dt_pack(req->send.ep->worker, req->send.datatype, - UCS_MEMORY_TYPE_HOST, hdr + 1, + req->send.mem_type, hdr + 1, req->send.buffer, &req->send.state.dt, length); } static UCS_F_ALWAYS_INLINE ucs_status_t ucp_am_send_short(ucp_ep_h ep, uint16_t id, uint16_t flags, const void *header, - size_t header_length, const void *payload, size_t length) -{ - uct_ep_h am_ep = ucp_ep_get_am_uct_ep(ep); - ucp_am_hdr_t hdr; - void *sbuf; - - /* - * short can't be used if both header and payload are provided - * (to avoid packing on fast path) - * TODO: enable short protocol for such cases when uct_am_short_iov is - * defined in UCT - */ - ucs_assert((length == 0ul) || (header_length == 0ul)); - ucs_assert(!(flags & UCP_AM_SEND_REPLY)); - ucp_am_fill_short_header(&hdr, id, flags, header_length); - - sbuf = (header_length != 0) ? 
(void*)header : (void*)payload; - - return uct_ep_am_short(am_ep, UCP_AM_ID_SINGLE, hdr.u64, sbuf, - length + header_length); -} - -static UCS_F_ALWAYS_INLINE ucs_status_t -ucp_am_send_short_reply(ucp_ep_h ep, uint16_t id, uint16_t flags, - const void *header, size_t header_length, - const void *payload, size_t length) + size_t header_length, const void *payload, size_t length, + int is_reply) { - uct_ep_h am_ep = ucp_ep_get_am_uct_ep(ep); - size_t tx_length; - ucp_am_hdr_t hdr; - const void *data; - void *tx_buffer; + size_t iov_cnt = 0ul; + size_t am_hdr_size; + uct_iov_t iov[3]; + uint8_t am_id; + ucp_am_reply_hdr_t am_hdr; ucs_status_t status; - ucs_assert(flags & UCP_AM_SEND_REPLY); - ucs_assert((length == 0ul) || (header_length == 0ul)); - - status = ucp_ep_resolve_remote_id(ep, ep->am_lane); - if (ucs_unlikely(status != UCS_OK)) { - return status; - } + ucp_am_fill_short_header(&am_hdr.super, id, flags, header_length); + if (is_reply) { + status = ucp_ep_resolve_remote_id(ep, ep->am_lane); + if (ucs_unlikely(status != UCS_OK)) { + return status; + } - if (header_length != 0) { - tx_length = header_length; - data = header; + am_hdr.ep_id = ucp_ep_remote_id(ep); + am_hdr_size = sizeof(ucp_am_reply_hdr_t); + am_id = UCP_AM_ID_SINGLE_REPLY; } else { - tx_length = length; - data = payload; + am_hdr_size = sizeof(ucp_am_hdr_t); + am_id = UCP_AM_ID_SINGLE; } - /* Reply protocol carries ep_id in its header in addition to AM short - * header. UCT AM short protocol accepts only 8 bytes header, so add ep_id - * right before the data. - * TODO: Use uct_ep_am_short_iov instead, when it is defined in UCT - */ - UCS_STATIC_ASSERT(ucs_offsetof(ucp_am_reply_hdr_t, ep_id) == sizeof(hdr)); + ucp_add_uct_iov_elem(iov, &am_hdr, am_hdr_size, UCT_MEM_HANDLE_NULL, + &iov_cnt); + ucp_add_uct_iov_elem(iov, (void*)payload, length, UCT_MEM_HANDLE_NULL, + &iov_cnt); - tx_buffer = ucs_alloca(tx_length + sizeof(ucs_ptr_map_key_t)); - - *((ucs_ptr_map_key_t*)tx_buffer) = ucp_ep_remote_id(ep); - - ucp_am_fill_short_header(&hdr, id, flags, header_length); - - memcpy(UCS_PTR_BYTE_OFFSET(tx_buffer, sizeof(ucs_ptr_map_key_t)), - data, tx_length); + if (header_length != 0) { + ucp_add_uct_iov_elem(iov, (void*)header, header_length, + UCT_MEM_HANDLE_NULL, &iov_cnt); + } - return uct_ep_am_short(am_ep, UCP_AM_ID_SINGLE_REPLY, hdr.u64, tx_buffer, - tx_length + sizeof(ucs_ptr_map_key_t)); + return uct_ep_am_short_iov(ucp_ep_get_am_uct_ep(ep), am_id, iov, iov_cnt); } static ucs_status_t ucp_am_contig_short(uct_pending_req_t *self) @@ -546,12 +565,8 @@ static ucs_status_t ucp_am_contig_short(uct_pending_req_t *self) req->send.msg_proto.am.flags, req->send.msg_proto.am.header, req->send.msg_proto.am.header_length, - req->send.buffer, req->send.length); - if (ucs_likely(status == UCS_OK)) { - ucp_request_complete_send(req, UCS_OK); - } - - return status; + req->send.buffer, req->send.length, 0); + return ucp_am_short_handle_status_from_pending(req, status); } static ucs_status_t ucp_am_contig_short_reply(uct_pending_req_t *self) @@ -561,16 +576,12 @@ static ucs_status_t ucp_am_contig_short_reply(uct_pending_req_t *self) ucs_status_t status; req->send.lane = ucp_ep_get_am_lane(ep); - status = ucp_am_send_short_reply(ep, req->send.msg_proto.am.am_id, - req->send.msg_proto.am.flags, - req->send.msg_proto.am.header, - req->send.msg_proto.am.header_length, - req->send.buffer, req->send.length); - if (ucs_likely(status == UCS_OK)) { - ucp_request_complete_send(req, UCS_OK); - } - - return status; + status = ucp_am_send_short(ep, 
req->send.msg_proto.am.am_id, + req->send.msg_proto.am.flags, + req->send.msg_proto.am.header, + req->send.msg_proto.am.header_length, + req->send.buffer, req->send.length, 1); + return ucp_am_short_handle_status_from_pending(req, status); } static ucs_status_t ucp_am_bcopy_single(uct_pending_req_t *self) @@ -594,7 +605,7 @@ static ucs_status_t ucp_am_bcopy_multi(uct_pending_req_t *self) ucs_status_t status = ucp_do_am_bcopy_multi(self, UCP_AM_ID_FIRST, UCP_AM_ID_MIDDLE, ucp_am_bcopy_pack_args_first, - ucp_am_bcopy_pack_args_mid, 0); + ucp_am_bcopy_pack_args_mid, 1); return ucp_am_bcopy_handle_status_from_pending(self, 1, 0, status); } @@ -686,8 +697,7 @@ static ucs_status_t ucp_am_zcopy_multi(uct_pending_req_t *self) ucp_am_fill_first_header(&first_hdr, req); return ucp_do_am_zcopy_multi(self, UCP_AM_ID_FIRST, UCP_AM_ID_MIDDLE, - &first_hdr, sizeof(first_hdr), - NULL, sizeof(mid_hdr), + &first_hdr, sizeof(first_hdr), NULL, 0ul, req->send.msg_proto.am.reg_desc, user_hdr_length, ucp_am_zcopy_req_complete, 1); @@ -695,15 +705,14 @@ static ucs_status_t ucp_am_zcopy_multi(uct_pending_req_t *self) size_t ucp_am_rndv_rts_pack(void *dest, void *arg) { - ucp_request_t *sreq = arg; - ucp_am_rndv_rts_hdr_t *am_rts_hdr = dest; - size_t max_bcopy = ucp_ep_get_max_bcopy(sreq->send.ep, - sreq->send.lane); + ucp_request_t *sreq = arg; + ucp_rndv_rts_hdr_t *rts_hdr = dest; + size_t max_bcopy = ucp_ep_get_max_bcopy(sreq->send.ep, + sreq->send.lane); size_t rts_size, total_size; - ucp_am_fill_header(&am_rts_hdr->am, sreq); - rts_size = ucp_rndv_rts_pack(sreq, &am_rts_hdr->super, - sizeof(*am_rts_hdr), UCP_RNDV_RTS_FLAG_AM); + ucp_am_fill_header(ucp_am_hdr_from_rts(rts_hdr), sreq); + rts_size = ucp_rndv_rts_pack(sreq, rts_hdr, UCP_RNDV_RTS_AM); if (sreq->send.msg_proto.am.header_length == 0) { return rts_size; @@ -715,7 +724,7 @@ size_t ucp_am_rndv_rts_pack(void *dest, void *arg) ucs_fatal("RTS is too big %lu, max %lu", total_size, max_bcopy); } - ucp_am_pack_user_header(UCS_PTR_BYTE_OFFSET(am_rts_hdr, rts_size), sreq); + ucp_am_pack_user_header(UCS_PTR_BYTE_OFFSET(rts_hdr, rts_size), sreq); return total_size; } @@ -724,15 +733,11 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_proto_progress_am_rndv_rts, (self), uct_pending_req_t *self) { ucp_request_t *sreq = ucs_container_of(self, ucp_request_t, send.uct); - size_t max_rts_size; /* RTS consists of: AM RTS header, packed rkeys and user header */ - max_rts_size = sizeof(ucp_am_rndv_rts_hdr_t) + - ucp_ep_config(sreq->send.ep)->rndv.rkey_size + - sreq->send.msg_proto.am.header_length; - - return ucp_do_am_single(self, UCP_AM_ID_RNDV_RTS, ucp_am_rndv_rts_pack, - max_rts_size); + return ucp_rndv_send_rts(sreq, ucp_am_rndv_rts_pack, + sizeof(ucp_rndv_rts_hdr_t) + + sreq->send.msg_proto.am.header_length); } static ucs_status_t ucp_am_send_start_rndv(ucp_request_t *sreq) @@ -742,6 +747,8 @@ static ucs_status_t ucp_am_send_start_rndv(ucp_request_t *sreq) sreq->send.length); UCS_PROFILE_REQUEST_EVENT(sreq, "start_rndv", sreq->send.length); + ucp_send_request_id_alloc(sreq); + /* Note: no need to call ucp_ep_resolve_remote_id() here, because it * was done in ucp_am_send_nbx */ @@ -752,7 +759,8 @@ static ucs_status_t ucp_am_send_start_rndv(ucp_request_t *sreq) static void ucp_am_send_req_init(ucp_request_t *req, ucp_ep_h ep, const void *header, size_t header_length, const void *buffer, ucp_datatype_t datatype, - size_t count, uint16_t flags, uint16_t am_id) + size_t count, uint16_t flags, uint16_t am_id, + const ucp_request_param_t *param) { req->flags = 
UCP_REQUEST_FLAG_SEND_AM; req->send.ep = ep; @@ -762,25 +770,48 @@ static void ucp_am_send_req_init(ucp_request_t *req, ucp_ep_h ep, req->send.msg_proto.am.header_length = header_length; req->send.buffer = (void*)buffer; req->send.datatype = datatype; - req->send.mem_type = UCS_MEMORY_TYPE_HOST; req->send.lane = ep->am_lane; req->send.pending_lane = UCP_NULL_LANE; ucp_request_send_state_init(req, datatype, count); - req->send.length = ucp_dt_length(req->send.datatype, count, - req->send.buffer, &req->send.state.dt); + req->send.length = ucp_dt_length(req->send.datatype, count, + req->send.buffer, &req->send.state.dt); + req->send.mem_type = ucp_request_get_memory_type(ep->worker->context, + req->send.buffer, + req->send.length, param); +} + +static UCS_F_ALWAYS_INLINE size_t +ucp_am_rndv_thresh(ucp_request_t *req, const ucp_request_param_t *param, + ucp_ep_config_t *ep_config, uint32_t flags, + ssize_t *max_short) +{ + size_t rndv_rma_thresh, rndv_am_thresh; + + if (flags & UCP_AM_SEND_FLAG_EAGER) { + return SIZE_MAX; + } else if (flags & UCP_AM_SEND_FLAG_RNDV) { + *max_short = -1; /* disable short, rndv is explicitly requested */ + return 0; + } else { + ucp_request_param_rndv_thresh(req, param, &ep_config->rndv.rma_thresh, + &ep_config->rndv.am_thresh, + &rndv_rma_thresh, &rndv_am_thresh); + return ucs_min(rndv_rma_thresh, rndv_am_thresh); + } } static UCS_F_ALWAYS_INLINE ucs_status_ptr_t ucp_am_send_req(ucp_request_t *req, size_t count, const ucp_ep_msg_config_t *msg_config, const ucp_request_param_t *param, - const ucp_request_send_proto_t *proto, ssize_t max_short) + const ucp_request_send_proto_t *proto, ssize_t max_short, + uint32_t flags) { unsigned user_header_length = req->send.msg_proto.am.header_length; ucp_context_t *context = req->send.ep->worker->context; ucp_ep_config_t *ep_config = ucp_ep_config(req->send.ep); - size_t rndv_rma_thresh, rndv_am_thresh, rndv_thresh; + size_t rndv_thresh; size_t zcopy_thresh; ucs_status_t status; @@ -793,11 +824,7 @@ ucp_am_send_req(ucp_request_t *req, size_t count, max_short = ucp_am_get_short_max(req, max_short); } - /* TODO: Add support for UCP_AM_SEND_EAGER/RNDV flags */ - ucp_request_param_rndv_thresh(req, param, &ep_config->rndv.rma_thresh, - &ep_config->rndv.am_thresh, &rndv_rma_thresh, - &rndv_am_thresh); - rndv_thresh = ucs_min(rndv_rma_thresh, rndv_am_thresh); + rndv_thresh = ucp_am_rndv_thresh(req, param, ep_config, flags, &max_short); if ((user_header_length != 0) && (((user_header_length + sizeof(ucp_am_first_hdr_t) + 1) > @@ -812,12 +839,19 @@ ucp_am_send_req(ucp_request_t *req, size_t count, * TODO: Consider other ways to send user header, like packing together * with UCT AM header, direct registration of user header buffer, etc. 
*/ - zcopy_thresh = SIZE_MAX; + zcopy_thresh = rndv_thresh; } else { zcopy_thresh = ucp_proto_get_zcopy_threshold(req, msg_config, count, rndv_thresh); } + ucs_trace_req("select am request(%p) progress algorithm datatype=0x%"PRIx64 + " buffer=%p length=%zu header_length=%u max_short=%zd" + " rndv_thresh=%zu zcopy_thresh=%zu", + req, req->send.datatype, req->send.buffer, req->send.length, + req->send.msg_proto.am.header_length, max_short, rndv_thresh, + zcopy_thresh); + status = ucp_request_send_start(req, max_short, zcopy_thresh, rndv_thresh, count, !!user_header_length, ucp_am_send_req_total_size(req), @@ -827,7 +861,7 @@ ucp_am_send_req(ucp_request_t *req, size_t count, return UCS_STATUS_PTR(status); } - ucs_assert(req->send.length >= rndv_thresh); + ucs_assert(ucp_am_send_req_total_size(req) >= rndv_thresh); status = ucp_am_send_start_rndv(req); if (status != UCS_OK) { @@ -861,23 +895,18 @@ ucp_am_send_req(ucp_request_t *req, size_t count, static UCS_F_ALWAYS_INLINE ucs_status_t ucp_am_try_send_short(ucp_ep_h ep, uint16_t id, uint32_t flags, const void *header, size_t header_length, - const void *buffer, size_t length) + const void *buffer, size_t length, + const ucp_memtype_thresh_t *max_eager_short) { - if (ucs_unlikely(((length != 0) && (header_length != 0)) || - ((ssize_t)(length + header_length) > - ucp_ep_config(ep)->am.max_short))) { - goto out; + if (ucs_unlikely(flags & UCP_AM_SEND_FLAG_RNDV)) { + return UCS_ERR_NO_RESOURCE; } - if (!(flags & UCP_AM_SEND_REPLY)) { - return ucp_am_send_short(ep, id, flags, header, header_length, - buffer, length); - } else if ((length + header_length) < UCP_AM_SHORT_REPLY_MAX_SIZE) { - return ucp_am_send_short_reply(ep, id, flags, header, header_length, - buffer, length); + if (ucp_proto_is_inline(ep, max_eager_short, header_length + length)) { + return ucp_am_send_short(ep, id, flags, header, header_length, buffer, + length, flags & UCP_AM_SEND_REPLY); } -out: return UCS_ERR_NO_RESOURCE; } @@ -893,9 +922,12 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_am_send_nbx, ucp_request_t *req; uint32_t attr_mask; uint32_t flags; + ucp_memtype_thresh_t *max_short; + const ucp_request_send_proto_t *proto; UCP_CONTEXT_CHECK_FEATURE_FLAGS(ep->worker->context, UCP_FEATURE_AM, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); + UCP_REQUEST_CHECK_PARAM(param); UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); @@ -903,9 +935,17 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_am_send_nbx, attr_mask = param->op_attr_mask & (UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FLAG_NO_IMM_CMPL); + if (flags & UCP_AM_SEND_REPLY) { + max_short = &ucp_ep_config(ep)->am_u.max_reply_eager_short; + proto = ucp_ep_config(ep)->am_u.reply_proto; + } else { + max_short = &ucp_ep_config(ep)->am_u.max_eager_short; + proto = ucp_ep_config(ep)->am_u.proto; + } + if (ucs_likely(attr_mask == 0)) { status = ucp_am_try_send_short(ep, id, flags, header, header_length, - buffer, count); + buffer, count, max_short); ucp_request_send_check_status(status, ret, goto out); datatype = ucp_dt_make_contig(1); } else if (attr_mask == UCP_OP_ATTR_FIELD_DATATYPE) { @@ -914,7 +954,8 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_am_send_nbx, status = ucp_am_try_send_short(ep, id, flags, header, header_length, buffer, ucp_contig_dt_length(datatype, - count)); + count), + max_short); ucp_request_send_check_status(status, ret, goto out); } } else { @@ -937,18 +978,13 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_am_send_nbx, goto out;}); ucp_am_send_req_init(req, ep, header, header_length, buffer, datatype, - count, flags, id); + 
count, flags, id, param); - if (flags & UCP_AM_SEND_REPLY) { - ret = ucp_am_send_req(req, count, &ucp_ep_config(ep)->am, param, - ucp_ep_config(ep)->am_u.reply_proto, - ucs_min(ucp_ep_config(ep)->am.max_short, - UCP_AM_SHORT_REPLY_MAX_SIZE)); - } else { - ret = ucp_am_send_req(req, count, &ucp_ep_config(ep)->am, param, - ucp_ep_config(ep)->am_u.proto, - ucp_ep_config(ep)->am.max_short); - } + /* Note that max_eager_short.memtype_on is always initialized to real + * max_short value + */ + ret = ucp_am_send_req(req, count, &ucp_ep_config(ep)->am, param, proto, + max_short->memtype_on, flags); out: UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); @@ -963,7 +999,7 @@ ucs_status_ptr_t ucp_am_send_nb(ucp_ep_h ep, uint16_t id, const void *payload, .op_attr_mask = UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FIELD_CALLBACK | UCP_OP_ATTR_FIELD_FLAGS, - .flags = flags, + .flags = flags | UCP_AM_SEND_FLAG_EAGER, .cb.send = (ucp_send_nbx_callback_t)cb, .datatype = datatype }; @@ -976,54 +1012,110 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_am_recv_data_nbx, ucp_worker_h worker, void *data_desc, void *buffer, size_t count, const ucp_request_param_t *param) { - ucp_am_rndv_rts_hdr_t *rts = data_desc; - ucp_recv_desc_t *desc = (ucp_recv_desc_t*)data_desc - 1; + ucp_recv_desc_t *desc = (ucp_recv_desc_t*)data_desc - 1; + ucp_context_h context = worker->context; ucs_status_ptr_t ret; ucp_request_t *req; ucp_datatype_t datatype; + ucs_memory_type_t mem_type; + ucp_rndv_rts_hdr_t *rts; + ucs_status_t status; + size_t recv_length; - UCP_CONTEXT_CHECK_FEATURE_FLAGS(worker->context, UCP_FEATURE_AM, + /* Sanity check if the descriptor has been released */ + if (ENABLE_PARAMS_CHECK && + ucs_unlikely(desc->flags & UCP_RECV_DESC_FLAG_RELEASED)) { + ucs_error("attempt to receive AM data with invalid descriptor"); + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); + } + + UCP_CONTEXT_CHECK_FEATURE_FLAGS(context, UCP_FEATURE_AM, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); - ucs_assert(rts->super.flags & UCP_RNDV_RTS_FLAG_AM); - ucs_assert(desc->flags & UCP_RECV_DESC_FLAG_RNDV); - if (ucs_unlikely(desc->flags & UCP_RECV_DESC_FLAG_RNDV_STARTED)) { - ucs_error("ucp_am_recv_data_nbx was already called for desc %p", - data_desc); + if (ucs_unlikely(desc->flags & UCP_RECV_DESC_FLAG_RECV_STARTED)) { + ucs_error("ucp_am_recv_data_nbx was already called for desc %p, " + "desc flags 0x%x", + data_desc, desc->flags); ret = UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); goto out; } - req = ucp_request_get_param(worker, param, - {ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); - goto out;}); - - /* Mark that rendezvous is started on this data descriptor */ - desc->flags |= UCP_RECV_DESC_FLAG_RNDV_STARTED; - - /* Initialize receive request */ - datatype = ucp_request_param_datatype(param); - req->status = UCS_OK; - req->recv.worker = worker; - req->recv.buffer = buffer; - req->flags = UCP_REQUEST_FLAG_RECV_AM; - req->recv.datatype = datatype; - ucp_dt_recv_state_init(&req->recv.state, buffer, datatype, count); - req->recv.length = ucp_dt_length(datatype, count, buffer, - &req->recv.state); - req->recv.mem_type = UCS_MEMORY_TYPE_HOST; - req->recv.am.desc = (ucp_recv_desc_t*)rts - 1; + desc->flags |= UCP_RECV_DESC_FLAG_RECV_STARTED; + datatype = ucp_request_param_datatype(param); + mem_type = ucp_request_get_memory_type(context, buffer, desc->length, + param); + + ucs_trace("AM recv %s buffer %p dt 0x%lx count %zu memtype %s", + (desc->flags & UCP_RECV_DESC_FLAG_RNDV) ? 
"rndv" : "eager", + buffer, datatype, count, ucs_memory_type_names[mem_type]); + + if (ucs_unlikely((desc->flags & UCP_RECV_DESC_FLAG_RNDV) && + (count > 0ul))) { + req = ucp_request_get_param(worker, param, + {ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + goto out;}); + + /* Initialize receive request */ + req->status = UCS_OK; + req->recv.worker = worker; + req->recv.buffer = buffer; + req->flags = UCP_REQUEST_FLAG_RECV_AM; + req->recv.datatype = datatype; + ucp_dt_recv_state_init(&req->recv.state, buffer, datatype, count); + req->recv.length = ucp_dt_length(datatype, count, buffer, + &req->recv.state); + req->recv.mem_type = mem_type; + req->recv.am.desc = desc; + rts = data_desc; + + ucp_request_set_callback_param(param, recv_am, req, recv.am); + + ucs_assert(rts->opcode == UCP_RNDV_RTS_AM); + ucs_assertv(req->recv.length >= rts->size, + "rx buffer too small %zu, need %zu", req->recv.length, + rts->size); + + ucp_rndv_receive(worker, req, rts, rts + 1); + ret = req + 1; + goto out; + } - ucp_request_set_callback_param(param, recv_am, req, recv.am); + if (desc->flags & UCP_RECV_DESC_FLAG_RNDV) { + /* Nothing to receive, send ack to sender to complete its request */ + ucp_am_rndv_send_ats(worker, data_desc, UCS_OK); + recv_length = 0ul; + status = UCS_OK; + } else { + /* data_desc represents eager message and can be received in place + * without initializing request */ + status = ucp_dt_unpack_only(worker, buffer, count, datatype, + mem_type, data_desc, desc->length, 1); + recv_length = desc->length; + } - ucs_assertv(req->recv.length >= rts->super.size, - "rx buffer too small %zu, need %zu", - req->recv.length, rts->super.size); + if (param->op_attr_mask & UCP_OP_ATTR_FLAG_NO_IMM_CMPL) { + req = ucp_request_get_param(worker, param, + {ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + goto out;}); + ret = req +1; + req->status = status; + req->flags = UCP_REQUEST_FLAG_COMPLETED; + ucp_request_cb_param(param, req, recv_am, recv_length); + } else { + if (param->op_attr_mask & UCP_OP_ATTR_FIELD_RECV_INFO) { + *param->recv_info.length = recv_length; + } + ret = UCS_STATUS_PTR(status); + } - ucp_rndv_receive(worker, req, &rts->super, rts + 1); - ret = req + 1; + /* Clear this flag, because receive operation is already completed and desc + * is not needed anymore. If receive operation was invoked from UCP AM + * callback, UCT AM handler would release this desc (by returning UCS_OK) + * back to UCT. + */ + desc->flags &= ~UCP_RECV_DESC_FLAG_AM_CB_INPROGRESS; out: UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); @@ -1031,14 +1123,11 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_am_recv_data_nbx, } static UCS_F_ALWAYS_INLINE ucs_status_t -ucp_am_invoke_cb(ucp_worker_h worker, ucp_am_hdr_t *am_hdr, - size_t am_hdr_length, size_t data_length, +ucp_am_invoke_cb(ucp_worker_h worker, uint16_t am_id, void *user_hdr, + uint32_t user_hdr_length, void *data, size_t data_length, ucp_ep_h reply_ep, uint64_t recv_flags) { - uint16_t am_id = am_hdr->am_id; - uint32_t user_hdr_length = am_hdr->header_length; - void *am_data = UCS_PTR_BYTE_OFFSET(am_hdr, am_hdr_length); - ucp_am_entry_t *am_cb = &ucs_array_elem(&worker->am, am_id); + ucp_am_entry_t *am_cb = &ucs_array_elem(&worker->am, am_id); ucp_am_recv_param_t param; unsigned flags; @@ -1050,10 +1139,8 @@ ucp_am_invoke_cb(ucp_worker_h worker, ucp_am_hdr_t *am_hdr, param.recv_attr = recv_flags; param.reply_ep = reply_ep; - return am_cb->cb(am_cb->context, user_hdr_length ? 
am_data : NULL, - user_hdr_length, - UCS_PTR_BYTE_OFFSET(am_data, user_hdr_length), - data_length - user_hdr_length, &param); + return am_cb->cb(am_cb->context, user_hdr, user_hdr_length, data, + data_length, &param); } if (ucs_unlikely(user_hdr_length != 0)) { @@ -1066,8 +1153,7 @@ ucp_am_invoke_cb(ucp_worker_h worker, ucp_am_hdr_t *am_hdr, flags = (recv_flags & UCP_AM_RECV_ATTR_FLAG_DATA) ? UCP_CB_PARAM_FLAG_DATA : 0; - return am_cb->cb_old(am_cb->context, am_data, data_length, reply_ep, - flags); + return am_cb->cb_old(am_cb->context, data, data_length, reply_ep, flags); } static UCS_F_ALWAYS_INLINE ucs_status_t @@ -1075,40 +1161,66 @@ ucp_am_handler_common(ucp_worker_h worker, ucp_am_hdr_t *am_hdr, size_t hdr_size size_t total_length, ucp_ep_h reply_ep, unsigned am_flags, uint64_t recv_flags) { - ucp_recv_desc_t *desc = NULL; - void *data; + ucp_recv_desc_t *desc = NULL; + uint16_t am_id = am_hdr->am_id; + uint32_t user_hdr_size = am_hdr->header_length; + ucp_am_entry_t *am_cb = &ucs_array_elem(&worker->am, am_id); + void *data = UCS_PTR_BYTE_OFFSET(am_hdr, hdr_size); + size_t data_length = total_length - + (hdr_size + am_hdr->header_length); + void *user_hdr = UCS_PTR_BYTE_OFFSET(data, data_length); + ucs_status_t desc_status = UCS_OK; ucs_status_t status; - recv_flags |= (am_flags & UCT_CB_PARAM_FLAG_DESC) ? - UCP_AM_RECV_ATTR_FLAG_DATA : 0; + ucs_assert(total_length >= am_hdr->header_length + hdr_size); - status = ucp_am_invoke_cb(worker, am_hdr, hdr_size, - total_length - hdr_size, reply_ep, - recv_flags); - if (status != UCS_INPROGRESS) { - return UCS_OK; /* we do not need UCT desc, just return UCS_OK */ + /* Initialize desc in advance, so that the user can invoke ucp_am_recv_data_nbx + * from the AM callback directly. The only exception is inline data when + * the AM callback is registered without the UCP_AM_FLAG_PERSISTENT_DATA flag. + */ + if ((am_flags & UCT_CB_PARAM_FLAG_DESC) || + (am_cb->flags & UCP_AM_FLAG_PERSISTENT_DATA)) { + /* The user header cannot be accessed outside the user callback, so do not + * include it in the total descriptor length. This helps to avoid an extra + * memory copy of the user header if the message is short/inlined + * (i.e. received without UCT_CB_PARAM_FLAG_DESC flag).
+ */ + recv_flags |= UCP_AM_RECV_ATTR_FLAG_DATA; + desc_status = ucp_recv_desc_init(worker, data, data_length, 0, am_flags, + 0, UCP_RECV_DESC_FLAG_AM_CB_INPROGRESS, + -hdr_size, &desc); + if (ucs_unlikely(UCS_STATUS_IS_ERR(desc_status))) { + ucs_error("worker %p could not allocate descriptor for active" + " message on callback : %u", + worker, am_id); + return UCS_OK; + } + data = desc + 1; + recv_flags |= UCP_AM_RECV_ATTR_FLAG_DATA; } - if (ucs_unlikely(!(am_flags & UCT_CB_PARAM_FLAG_DESC))) { - ucs_error("can't hold data, FLAG_DATA flag is not set"); + status = ucp_am_invoke_cb(worker, am_id, user_hdr, user_hdr_size, data, + data_length, reply_ep, recv_flags); + if (desc == NULL) { + if (ucs_unlikely(status == UCS_INPROGRESS)) { + ucs_error("can't hold data, FLAG_DATA flag is not set"); + return UCS_OK; + } + ucs_assert(status == UCS_OK); + return UCS_OK; } - ucs_assert(total_length >= am_hdr->header_length + hdr_size); - data = UCS_PTR_BYTE_OFFSET(am_hdr, hdr_size + am_hdr->header_length); - status = ucp_recv_desc_init(worker, data, - total_length - hdr_size - am_hdr->header_length, - 0, - UCT_CB_PARAM_FLAG_DESC, /* pass as a const */ - 0, 0, -hdr_size, &desc); - if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) { - ucs_error("worker %p could not allocate descriptor for active" - " message on callback : %u", worker, am_hdr->am_id); - return UCS_OK; + ucs_assert(!UCS_STATUS_IS_ERR(status)); + + if (ucp_am_rdesc_in_progress(desc, status)) { + desc->flags &= ~UCP_RECV_DESC_FLAG_AM_CB_INPROGRESS; + return desc_status; + } else if (!(am_flags & UCT_CB_PARAM_FLAG_DESC)) { + ucp_recv_desc_release(desc); } - ucs_assert(desc != NULL); - return UCS_INPROGRESS; + return UCS_OK; } UCS_PROFILE_FUNC(ucs_status_t, ucp_am_handler_reply, @@ -1120,8 +1232,8 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_am_handler_reply, ucp_worker_h worker = (ucp_worker_h)am_arg; ucp_ep_h reply_ep; - reply_ep = ucp_worker_get_ep_by_id(worker, hdr->ep_id); - + UCP_WORKER_GET_VALID_EP_BY_ID(&reply_ep, worker, hdr->ep_id, return UCS_OK, + "AM (reply proto)"); return ucp_am_handler_common(worker, &hdr->super, sizeof(*hdr), am_length, reply_ep, am_flags, UCP_AM_RECV_ATTR_FIELD_REPLY_EP); @@ -1167,11 +1279,11 @@ ucp_am_copy_data_fragment(ucp_recv_desc_t *first_rdesc, void *data, } static UCS_F_ALWAYS_INLINE uint64_t -ucp_am_hdr_reply_ep(ucp_worker_h worker, uint16_t flags, uint64_t ep_id, +ucp_am_hdr_reply_ep(ucp_worker_h worker, uint16_t flags, ucp_ep_h ep, ucp_ep_h *reply_ep_p) { if (flags & UCP_AM_SEND_REPLY) { - *reply_ep_p = ucp_worker_get_ep_by_id(worker, ep_id); + *reply_ep_p = ep; return UCP_AM_RECV_ATTR_FIELD_REPLY_EP; } @@ -1182,13 +1294,15 @@ ucp_am_hdr_reply_ep(ucp_worker_h worker, uint16_t flags, uint64_t ep_id, static UCS_F_ALWAYS_INLINE void ucp_am_handle_unfinished(ucp_worker_h worker, ucp_recv_desc_t *first_rdesc, - void *data, size_t length, size_t offset) + void *data, size_t length, size_t offset, + ucp_ep_h reply_ep) { ucp_am_first_hdr_t *first_hdr; ucs_status_t status; - ucp_ep_h reply_ep; - void *msg; + void *msg, *user_hdr; uint64_t recv_flags; + size_t desc_offset, user_hdr_length, total_size; + uint16_t am_id; ucp_am_copy_data_fragment(first_rdesc, data, length, offset); @@ -1201,34 +1315,44 @@ ucp_am_handle_unfinished(ucp_worker_h worker, ucp_recv_desc_t *first_rdesc, * ep AM extension */ ucs_list_del(&first_rdesc->am_first.list); - first_hdr = (ucp_am_first_hdr_t*)(first_rdesc + 1); - recv_flags = ucp_am_hdr_reply_ep(worker, first_hdr->super.super.flags, - first_hdr->super.ep_id, &reply_ep); - - status = 
ucp_am_invoke_cb(worker, &first_hdr->super.super, - sizeof(*first_hdr), first_hdr->total_size, - reply_ep, - recv_flags | UCP_AM_RECV_ATTR_FLAG_DATA); - if (status != UCS_INPROGRESS) { - ucs_free(first_rdesc); /* user does not need to hold this data */ - return; - } + first_hdr = (ucp_am_first_hdr_t*)(first_rdesc + 1); + recv_flags = ucp_am_hdr_reply_ep(worker, first_hdr->super.super.flags, + reply_ep, &reply_ep) | + UCP_AM_RECV_ATTR_FLAG_DATA; + msg = first_hdr + 1; + am_id = first_hdr->super.super.am_id; + total_size = first_hdr->total_size; + user_hdr_length = first_hdr->super.super.header_length; + user_hdr = UCS_PTR_BYTE_OFFSET(msg, total_size); /* Need to reinit descriptor, because we passed data shifted by - * ucp_am_first_hdr_t size + user header size to the cb. - * In ucp_am_data_release function, we calculate desc as - * "data_pointer - sizeof(desc)", which would not point to the beginning - * of the original desc. - * original desc layout: |desc|first_hdr|user_hdr|data| - * new desc layout: |desc|data| (AM first and user - * headers are not - * needed anymore, - * can overwrite) + * ucp_am_first_hdr_t size to the cb. In ucp_am_data_release and + * ucp_am_recv_data_nbx functions, we calculate desc as + * "data_pointer - sizeof(desc)", which would not point to the beginning of + * the original desc. + * + * original desc layout: |desc|first_hdr|data|user_hdr| + * + * new desc layout: |desc|data| (AM first header is not + * needed anymore, can + * overwrite) */ - msg = UCS_PTR_BYTE_OFFSET(first_rdesc + 1, - first_rdesc->payload_offset); - first_rdesc = (ucp_recv_desc_t*)msg - 1; - first_rdesc->flags = UCP_RECV_DESC_FLAG_MALLOC; + desc_offset = first_rdesc->payload_offset; + first_rdesc = (ucp_recv_desc_t*)msg - 1; + first_rdesc->flags = UCP_RECV_DESC_FLAG_MALLOC | + UCP_RECV_DESC_FLAG_AM_CB_INPROGRESS; + first_rdesc->am_malloc_offset = desc_offset; + first_rdesc->length = total_size; + status = ucp_am_invoke_cb(worker, am_id, user_hdr, + user_hdr_length, msg, + total_size, reply_ep, + recv_flags); + if (!ucp_am_rdesc_in_progress(first_rdesc, status)) { + /* user does not need to hold this data */ + ucp_am_release_long_desc(first_rdesc); + } else { + first_rdesc->flags &= ~UCP_RECV_DESC_FLAG_AM_CB_INPROGRESS; + } return; } @@ -1240,27 +1364,31 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_am_long_first_handler, { ucp_worker_h worker = am_arg; ucp_am_first_hdr_t *first_hdr = am_data; + size_t user_hdr_length = first_hdr->super.super.header_length; ucp_recv_desc_t *mid_rdesc, *first_rdesc; - ucp_ep_h ep; ucp_ep_ext_proto_t *ep_ext; ucp_am_mid_hdr_t *mid_hdr; ucs_queue_iter_t iter; - size_t remaining; + ucp_ep_h ep; + size_t total_length; uint64_t recv_flags; + void *user_hdr; - remaining = first_hdr->total_size - (am_length - sizeof(*first_hdr)); + UCP_WORKER_GET_VALID_EP_BY_ID(&ep, worker, first_hdr->super.ep_id, + return UCS_OK, "AM first fragment"); - if (ucs_unlikely(remaining == 0)) { + total_length = first_hdr->total_size + sizeof(*first_hdr) + user_hdr_length; + + if (ucs_unlikely(am_length == total_length)) { /* Can be a single fragment if send was issued on stub ep */ recv_flags = ucp_am_hdr_reply_ep(worker, first_hdr->super.super.flags, - first_hdr->super.ep_id, &ep); + ep, &ep); return ucp_am_handler_common(worker, &first_hdr->super.super, sizeof(*first_hdr), am_length, ep, am_flags, recv_flags); } - ep = ucp_worker_get_ep_by_id(worker, first_hdr->super.ep_id); ep_ext = ucp_ep_ext_proto(ep); /* This is the first fragment, other fragments (if arrived) should be on @@ -1271,8 +1399,7 
@@ UCS_PROFILE_FUNC(ucs_status_t, ucp_am_long_first_handler, /* Alloc buffer for the data and its desc, as we know total_size. * Need to allocate a separate rdesc which would be in one contigious chunk * with data buffer. */ - first_rdesc = ucs_malloc(first_hdr->total_size + sizeof(ucp_recv_desc_t) + - sizeof(*first_hdr), + first_rdesc = ucs_malloc(total_length + sizeof(ucp_recv_desc_t), "ucp recv desc for long AM"); if (ucs_unlikely(first_rdesc == NULL)) { ucs_error("failed to allocate buffer for assembling UCP AM (id %u)", @@ -1280,9 +1407,16 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_am_long_first_handler, return UCS_OK; /* release UCT desc */ } - first_rdesc->am_first.remaining = first_hdr->total_size + sizeof(*first_hdr); - first_rdesc->payload_offset = sizeof(*first_hdr) + - first_hdr->super.super.header_length; + first_rdesc->am_first.remaining = first_hdr->total_size + + sizeof(*first_hdr); + first_rdesc->payload_offset = sizeof(*first_hdr); + + /* Copy user header to the end of message */ + user_hdr = UCS_PTR_BYTE_OFFSET(first_hdr, am_length - user_hdr_length); + UCS_PROFILE_NAMED_CALL("am_memcpy_recv", ucs_memcpy_relaxed, + UCS_PTR_BYTE_OFFSET(first_rdesc + 1, + first_rdesc->am_first.remaining), + user_hdr, user_hdr_length); /* Copy all already arrived middle fragments to the data buffer */ ucs_queue_for_each_safe(mid_rdesc, iter, &ep_ext->am.mid_rdesc_q, @@ -1300,9 +1434,10 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_am_long_first_handler, ucs_list_add_tail(&ep_ext->am.started_ams, &first_rdesc->am_first.list); - /* Note: copy first chunk of data together with header, which contains + /* Note: copy first chunk of data together with AM header, which contains * data needed to process other fragments. */ - ucp_am_handle_unfinished(worker, first_rdesc, first_hdr, am_length, 0); + ucp_am_handle_unfinished(worker, first_rdesc, first_hdr, + am_length - user_hdr_length, 0, ep); return UCS_OK; /* release UCT desc */ } @@ -1314,19 +1449,22 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_am_long_middle_handler, { ucp_worker_h worker = am_arg; ucp_am_mid_hdr_t *mid_hdr = am_data; - ucp_ep_h ep = ucp_worker_get_ep_by_id(worker, - mid_hdr->ep_id); - ucp_ep_ext_proto_t *ep_ext = ucp_ep_ext_proto(ep); uint64_t msg_id = mid_hdr->msg_id; ucp_recv_desc_t *mid_rdesc = NULL, *first_rdesc = NULL; + ucp_ep_ext_proto_t *ep_ext; + ucp_ep_h ep; ucs_status_t status; + UCP_WORKER_GET_VALID_EP_BY_ID(&ep, worker, mid_hdr->ep_id, return UCS_OK, + "AM middle fragment"); + ep_ext = ucp_ep_ext_proto(ep); first_rdesc = ucp_am_find_first_rdesc(worker, ep_ext, msg_id); if (first_rdesc != NULL) { /* First fragment already arrived, just copy the data */ ucp_am_handle_unfinished(worker, first_rdesc, mid_hdr + 1, am_length - sizeof(*mid_hdr), - mid_hdr->offset + first_rdesc->payload_offset); + mid_hdr->offset + first_rdesc->payload_offset, + ep); return UCS_OK; /* data is copied, release UCT desc */ } @@ -1350,29 +1488,46 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_am_long_middle_handler, ucs_status_t ucp_am_rndv_process_rts(void *arg, void *data, size_t length, unsigned tl_flags) { - ucp_am_rndv_rts_hdr_t *rts = data; - ucp_worker_h worker = arg; - uint16_t am_id = rts->am.am_id; - ucp_recv_desc_t *desc = NULL; - ucp_am_entry_t *am_cb; + ucp_rndv_rts_hdr_t *rts = data; + ucp_worker_h worker = arg; + ucp_am_hdr_t *am = ucp_am_hdr_from_rts(rts); + uint16_t am_id = am->am_id; + ucp_recv_desc_t *desc = NULL; + ucp_am_entry_t *am_cb = &ucs_array_elem(&worker->am, am_id); + ucp_ep_h ep; ucp_am_recv_param_t param; ucs_status_t status, desc_status; 
void *hdr; + if (ENABLE_PARAMS_CHECK && !(am_cb->flags & UCP_AM_CB_PRIV_FLAG_NBX)) { + ucs_error("active message callback registered with " + "ucp_worker_set_am_handler() API does not support rendezvous " + "protocol, the sender side should use ucp_am_send_nbx() API"); + status = UCS_ERR_INVALID_PARAM; + goto out_send_ats; + } + + UCP_WORKER_GET_VALID_EP_BY_ID(&ep, worker, rts->sreq.ep_id, + { status = UCS_ERR_CANCELED; + goto out_send_ats; }, + "AM RTS"); + if (ucs_unlikely(!ucp_am_recv_check_id(worker, am_id))) { status = UCS_ERR_INVALID_PARAM; goto out_send_ats; } - if (rts->am.header_length != 0) { - ucs_assert(length >= rts->am.header_length + sizeof(*rts)); - hdr = UCS_PTR_BYTE_OFFSET(rts, length - rts->am.header_length); + if (am->header_length != 0) { + ucs_assert(length >= am->header_length + sizeof(*rts)); + hdr = UCS_PTR_BYTE_OFFSET(rts, length - am->header_length); } else { hdr = NULL; } desc_status = ucp_recv_desc_init(worker, data, length, 0, tl_flags, 0, - UCP_RECV_DESC_FLAG_RNDV, 0, &desc); + UCP_RECV_DESC_FLAG_RNDV | + UCP_RECV_DESC_FLAG_AM_CB_INPROGRESS, 0, + &desc); if (ucs_unlikely(UCS_STATUS_IS_ERR(desc_status))) { ucs_error("worker %p could not allocate descriptor for active" " message RTS on callback %u", worker, am_id); @@ -1380,21 +1535,29 @@ ucs_status_t ucp_am_rndv_process_rts(void *arg, void *data, size_t length, goto out_send_ats; } - am_cb = &ucs_array_elem(&worker->am, am_id); param.recv_attr = UCP_AM_RECV_ATTR_FLAG_RNDV | - ucp_am_hdr_reply_ep(worker, rts->am.flags, - rts->super.sreq.ep_id, + ucp_am_hdr_reply_ep(worker, am->flags, ep, &param.reply_ep); - status = am_cb->cb(am_cb->context, hdr, rts->am.header_length, - desc + 1, rts->super.size, &param); + status = am_cb->cb(am_cb->context, hdr, am->header_length, + desc + 1, rts->size, &param); - if ((status == UCS_INPROGRESS) || - (desc->flags & UCP_RECV_DESC_FLAG_RNDV_STARTED)) { + if (ucp_am_rdesc_in_progress(desc, status)) { /* User either wants to save descriptor for later use or initiated * rendezvous receive (by ucp_am_recv_data_nbx) in the callback. */ - ucs_assert(!UCS_STATUS_IS_ERR(status)); + ucs_assertv(!UCS_STATUS_IS_ERR(status), "%s", + ucs_status_string(status)); + + desc->flags &= ~UCP_RECV_DESC_FLAG_AM_CB_INPROGRESS; return desc_status; + } else if (desc->flags & UCP_RECV_DESC_FLAG_RECV_STARTED) { + /* User initiated rendezvous receive in the callback and it is + * already completed. No need to save the descriptor for further use. + */ + goto out; } + ucs_trace_data("worker %p, RTS is dropped, length %zu, status %s", + worker, length, ucs_status_string(status)); + /* User does not want to receive the data, fall through to send ATS. */ out_send_ats: @@ -1402,10 +1565,20 @@ ucs_status_t ucp_am_rndv_process_rts(void *arg, void *data, size_t length, * sender to complete its send request. */ ucp_am_rndv_send_ats(worker, rts, status); - if ((desc != NULL) && !(desc->flags & UCP_RECV_DESC_FLAG_UCT_DESC)) { - /* Release descriptor if it was allocated on UCP mpool, otherwise it - * will be freed by UCT, when UCS_OK is returned from this func. */ - ucp_recv_desc_release(desc); +out: + if (desc != NULL) { + if (ENABLE_PARAMS_CHECK) { + /* Mark the descriptor as released. This detects the use of + * an invalid descriptor when the user returns UCS_OK + * from the AM callback and then wrongly tries to receive data with + * ucp_am_recv_data_nbx().
*/ + desc->flags |= UCP_RECV_DESC_FLAG_RELEASED; + } + if (!(desc->flags & UCP_RECV_DESC_FLAG_UCT_DESC)) { + /* Release descriptor if it was allocated on UCP mpool, otherwise it + * will be freed by UCT, when UCS_OK is returned from this func. */ + ucp_recv_desc_release(desc); + } } return UCS_OK; diff --git a/src/ucp/core/ucp_am.h b/src/ucp/core/ucp_am.h index f2778f5e238..b3258e88904 100644 --- a/src/ucp/core/ucp_am.h +++ b/src/ucp/core/ucp_am.h @@ -13,6 +13,13 @@ #include +#define ucp_am_hdr_from_rts(_rts) \ + ({ \ + UCS_STATIC_ASSERT(sizeof((_rts)->hdr) == sizeof(ucp_am_hdr_t)); \ + ((ucp_am_hdr_t*)&(_rts)->hdr); \ + }) + + enum { UCP_AM_CB_PRIV_FIRST_FLAG = UCS_BIT(15), @@ -73,16 +80,6 @@ typedef struct { } ucp_am_first_desc_t; -typedef struct { - ucp_rndv_rts_hdr_t super; - ucp_am_hdr_t am; - /* - * 1. packed rkeys follow - * 2. user header follows, if am->header_length is not 0 - */ -} UCS_S_PACKED ucp_am_rndv_rts_hdr_t; - - ucs_status_t ucp_am_init(ucp_worker_h worker); void ucp_am_cleanup(ucp_worker_h worker); diff --git a/src/ucp/core/ucp_context.c b/src/ucp/core/ucp_context.c index 9a4318870ba..fbe3f5ff7c3 100644 --- a/src/ucp/core/ucp_context.c +++ b/src/ucp/core/ucp_context.c @@ -19,9 +19,10 @@ #include #include #include -#include +#include #include #include +#include #include @@ -43,18 +44,21 @@ static const char * ucp_device_type_names[] = { [UCT_DEVICE_TYPE_SELF] = "loopback", }; -static const char * ucp_rndv_modes[] = { +static const char *ucp_rndv_modes[] = { + [UCP_RNDV_MODE_AUTO] = "auto", [UCP_RNDV_MODE_GET_ZCOPY] = "get_zcopy", [UCP_RNDV_MODE_PUT_ZCOPY] = "put_zcopy", - [UCP_RNDV_MODE_AUTO] = "auto", + [UCP_RNDV_MODE_AM] = "am", [UCP_RNDV_MODE_LAST] = NULL, }; -const char* ucp_operation_names[] = { +const char *ucp_operation_names[] = { [UCP_OP_ID_TAG_SEND] = "tag_send", [UCP_OP_ID_TAG_SEND_SYNC] = "tag_send_sync", [UCP_OP_ID_PUT] = "put", [UCP_OP_ID_GET] = "get", + [UCP_OP_ID_RNDV_SEND] = "rndv_send", + [UCP_OP_ID_RNDV_RECV] = "rndv_recv", [UCP_OP_ID_LAST] = NULL }; @@ -98,7 +102,7 @@ static ucs_config_field_t ucp_config_table[] = { " - rocm : ROCm (AMD GPU) memory support.\n" " Using a \\ prefix before a transport name treats it as an explicit transport name\n" " and disables aliasing.\n", - ucs_offsetof(ucp_config_t, tls), UCS_CONFIG_TYPE_STRING_ARRAY}, + ucs_offsetof(ucp_config_t, tls), UCS_CONFIG_TYPE_ALLOW_LIST}, {"ALLOC_PRIO", "md:sysv,md:posix,huge,thp,md:*,mmap,heap", "Priority of memory allocation methods. Each item in the list can be either\n" @@ -117,6 +121,10 @@ static ucs_config_field_t ucp_config_table[] = { "establishing client/server connection. ", ucs_offsetof(ucp_config_t, sockaddr_aux_tls), UCS_CONFIG_TYPE_STRING_ARRAY}, + {"SELECT_DISTANCE_MD", "cuda_cpy", + "MD whose distance is queried when evaluating transport selection score", + ucs_offsetof(ucp_config_t, selection_cmp), UCS_CONFIG_TYPE_STRING}, + {"WARN_INVALID_CONFIG", "y", "Issue a warning in case of invalid device and/or transport configuration.", ucs_offsetof(ucp_config_t, warn_invalid_config), UCS_CONFIG_TYPE_BOOL}, @@ -144,7 +152,7 @@ static ucs_config_field_t ucp_config_table[] = { "the eager_zcopy protocol", ucs_offsetof(ucp_config_t, ctx.rndv_perf_diff), UCS_CONFIG_TYPE_DOUBLE}, - {"MULTI_LANE_MAX_RATIO", "10", + {"MULTI_LANE_MAX_RATIO", "4", "Maximal allowed ratio between slowest and fastest lane in a multi-lane " "protocol. 
Lanes slower than the specified ratio will not be used.", ucs_offsetof(ucp_config_t, ctx.multi_lane_max_ratio), UCS_CONFIG_TYPE_DOUBLE}, @@ -202,10 +210,15 @@ static ucs_config_field_t ucp_config_table[] = { "Add debugging information to worker address.", ucs_offsetof(ucp_config_t, ctx.address_debug_info), UCS_CONFIG_TYPE_BOOL}, - {"MAX_WORKER_NAME", UCS_PP_MAKE_STRING(UCP_WORKER_NAME_MAX), - "Maximal length of worker name. Sent to remote peer as part of worker address\n" - "if UCX_ADDRESS_DEBUG_INFO is set to 'yes'", - ucs_offsetof(ucp_config_t, ctx.max_worker_name), UCS_CONFIG_TYPE_UINT}, + {"MAX_WORKER_NAME", NULL, "", + ucs_offsetof(ucp_config_t, ctx.max_worker_address_name), + UCS_CONFIG_TYPE_UINT}, + + {"MAX_WORKER_ADDRESS_NAME", UCS_PP_MAKE_STRING(UCP_WORKER_ADDRESS_NAME_MAX), + "Maximal length of worker address name. Sent to remote peer as part of\n" + "worker address if UCX_ADDRESS_DEBUG_INFO is set to 'yes'", + ucs_offsetof(ucp_config_t, ctx.max_worker_address_name), + UCS_CONFIG_TYPE_UINT}, {"USE_MT_MUTEX", "n", "Use mutex for multithreading support in UCP.\n" "n - Not use mutex for multithreading support in UCP (use spinlock by default).\n" @@ -284,13 +297,6 @@ static ucs_config_field_t ucp_config_table[] = { "of all entities which connect to each other are the same.", ucs_offsetof(ucp_config_t, ctx.unified_mode), UCS_CONFIG_TYPE_BOOL}, - {"SOCKADDR_CM_ENABLE", "n" /* TODO: set try by default */, - "Enable alternative wireup protocol for sockaddr connected endpoints.\n" - "Enabling this mode changes underlying UCT mechanism for connection\n" - "establishment and enables synchronized close protocol which does not\n" - "require out of band synchronization before destroying UCP resources.", - ucs_offsetof(ucp_config_t, ctx.sockaddr_cm_enable), UCS_CONFIG_TYPE_TERNARY}, - {"CM_USE_ALL_DEVICES", "y", "When creating client/server endpoints, use all available devices.\n" "If disabled, use only the one device on which the connection\n" @@ -307,13 +313,14 @@ static ucs_config_field_t ucp_config_table[] = { "Experimental: enable new protocol selection logic", ucs_offsetof(ucp_config_t, ctx.proto_enable), UCS_CONFIG_TYPE_BOOL}, - {"KEEPALIVE_TIMEOUT", "0us", - "Time period between keepalive rounds (0 - disabled).", - ucs_offsetof(ucp_config_t, ctx.keepalive_timeout), UCS_CONFIG_TYPE_TIME_UNITS}, + /* TODO: set for keepalive more reasonable values */ + {"KEEPALIVE_INTERVAL", "60s", + "Time interval between keepalive rounds (0 - disabled).", + ucs_offsetof(ucp_config_t, ctx.keepalive_interval), UCS_CONFIG_TYPE_TIME}, - {"KEEPALIVE_NUM_EPS", "0", + {"KEEPALIVE_NUM_EPS", "128", "Maximal number of endpoints to check on every keepalive round\n" - "(0 - disabled, inf - check all endpoints on every round)", + "(inf - check all endpoints on every round, must be greater than 0)", ucs_offsetof(ucp_config_t, ctx.keepalive_num_eps), UCS_CONFIG_TYPE_UINT}, {"PROTO_INDIRECT_ID", "auto", @@ -329,7 +336,7 @@ UCS_CONFIG_REGISTER_TABLE(ucp_config_table, "UCP context", NULL, ucp_config_t, static ucp_tl_alias_t ucp_tl_aliases[] = { - { "mm", { "posix", "sysv", "xpmem" } }, /* for backward compatibility */ + { "mm", { "posix", "sysv", "xpmem", NULL } }, /* for backward compatibility */ { "sm", { "posix", "sysv", "xpmem", "knem", "cma", "rdmacm", "sockcm", NULL } }, { "shm", { "posix", "sysv", "xpmem", "knem", "cma", "rdmacm", "sockcm", NULL } }, { "ib", { "rc_verbs", "ud_verbs", "rc_mlx5", "ud_mlx5", "dc_mlx5", "rdmacm", NULL } }, @@ -360,6 +367,10 @@ const char *ucp_feature_str[] = { }; +const ucp_tl_bitmap_t 
ucp_tl_bitmap_max = {{UINT64_MAX, UINT64_MAX}}; +const ucp_tl_bitmap_t ucp_tl_bitmap_min = UCS_BITMAP_ZERO; + + ucs_status_t ucp_config_read(const char *env_prefix, const char *filename, ucp_config_t **config_p) { @@ -488,22 +499,24 @@ static int ucp_tls_array_is_present(const char **tls, unsigned count, } } -static int ucp_config_is_tl_enabled(const char **names, unsigned count, - const char *tl_name, int is_alias, - uint8_t *rsc_flags, uint64_t *tl_cfg_mask) +static int +ucp_config_is_tl_name_present(const ucs_config_allow_list_t *allow_list, + const char *tl_name, int is_alias, + uint8_t *rsc_flags, uint64_t *tl_cfg_mask) { char strict_name[UCT_TL_NAME_MAX + 1]; snprintf(strict_name, sizeof(strict_name), "\\%s", tl_name); + return /* strict name, with leading \\ */ - (!is_alias && ucp_tls_array_is_present(names, count, strict_name, "", - rsc_flags, tl_cfg_mask)) || - /* plain transport name */ - ucp_tls_array_is_present(names, count, tl_name, "", rsc_flags, - tl_cfg_mask) || - /* all available transports */ - ucp_tls_array_is_present(names, count, UCP_RSC_CONFIG_ALL, "", rsc_flags, - tl_cfg_mask); + (!is_alias && + (ucp_tls_array_is_present((const char**)allow_list->array.names, + allow_list->array.count, strict_name, "", + rsc_flags, tl_cfg_mask))) || + /* plain transport name */ + (ucp_tls_array_is_present((const char**)allow_list->array.names, + allow_list->array.count, tl_name, "", + rsc_flags, tl_cfg_mask)); } static int ucp_is_resource_in_device_list(const uct_tl_resource_desc_t *resource, @@ -537,47 +550,56 @@ static int ucp_is_resource_in_device_list(const uct_tl_resource_desc_t *resource return !!mask; } -static int ucp_is_resource_in_transports_list(const char *tl_name, - const char **names, unsigned count, - uint8_t *rsc_flags, uint64_t *tl_cfg_mask) +static int +ucp_is_resource_in_transports_list(const char *tl_name, + const ucs_config_allow_list_t *allow_list, + uint8_t *rsc_flags, uint64_t *tl_cfg_mask) { uint64_t dummy_mask, tmp_tl_cfg_mask; uint8_t tmp_rsc_flags; ucp_tl_alias_t *alias; - int tl_enabled; char info[32]; unsigned alias_arr_count; - ucs_assert(count > 0); - if (ucp_config_is_tl_enabled(names, count, tl_name, 0, - rsc_flags, tl_cfg_mask)) { - tl_enabled = 1; - } else { - tl_enabled = 0; + if (allow_list->mode == UCS_CONFIG_ALLOW_LIST_ALLOW_ALL) { + return 1; + } - /* check aliases */ - for (alias = ucp_tl_aliases; alias->alias != NULL; ++alias) { - /* If an alias is enabled, and the transport is part of this alias, - * enable the transport. 
- */ - alias_arr_count = ucp_tl_alias_count(alias); - snprintf(info, sizeof(info), "for alias '%s'", alias->alias); - dummy_mask = 0; - tmp_rsc_flags = 0; - tmp_tl_cfg_mask = 0; - if (ucp_config_is_tl_enabled(names, count, alias->alias, 1, - &tmp_rsc_flags, &tmp_tl_cfg_mask) && - ucp_tls_array_is_present(alias->tls, alias_arr_count, tl_name, - info, &tmp_rsc_flags, &dummy_mask)) { - *rsc_flags |= tmp_rsc_flags; - *tl_cfg_mask |= tmp_tl_cfg_mask; - tl_enabled = 1; - break; + ucs_assert(allow_list->array.count > 0); + if (ucp_config_is_tl_name_present(allow_list, tl_name, 0, rsc_flags, + tl_cfg_mask)) { + /* If the TL was found by its strict name - the result is known, + otherwise checking aliases is required */ + return (allow_list->mode == UCS_CONFIG_ALLOW_LIST_ALLOW); + } + + /* check aliases */ + for (alias = ucp_tl_aliases; alias->alias != NULL; ++alias) { + /* If an alias is in the list and the transport belongs to this alias, + * enable/disable the transport (according to the list mode) + */ + alias_arr_count = ucp_tl_alias_count(alias); + snprintf(info, sizeof(info), " for alias '%s'", alias->alias); + dummy_mask = 0; + tmp_rsc_flags = 0; + tmp_tl_cfg_mask = 0; + if (ucp_tls_array_is_present(alias->tls, alias_arr_count, tl_name, info, + &tmp_rsc_flags, &dummy_mask)) { + if (ucp_config_is_tl_name_present(allow_list, alias->alias, 1, + &tmp_rsc_flags, + &tmp_tl_cfg_mask)) { + if (allow_list->mode == UCS_CONFIG_ALLOW_LIST_ALLOW) { + *rsc_flags |= tmp_rsc_flags; + *tl_cfg_mask |= tmp_tl_cfg_mask; + return 1; + } else { + return 0; + } } } } - return tl_enabled; + return allow_list->mode == UCS_CONFIG_ALLOW_LIST_NEGATE; } static int ucp_is_resource_enabled(const uct_tl_resource_desc_t *resource, @@ -595,8 +617,7 @@ static int ucp_is_resource_enabled(const uct_tl_resource_desc_t *resource, /* Find the enabled UCTs */ tl_enabled = ucp_is_resource_in_transports_list(resource->tl_name, - (const char**)config->tls.names, - config->tls.count, rsc_flags, + &config->tls, rsc_flags, tl_cfg_mask); ucs_trace(UCT_TL_RESOURCE_DESC_FMT " is %sabled", @@ -650,10 +671,8 @@ static ucs_status_t ucp_add_tl_resources(ucp_context_h context, { ucp_tl_md_t *md = &context->tl_mds[md_index]; uct_tl_resource_desc_t *tl_resources; - uct_tl_resource_desc_t sa_rsc; ucp_tl_resource_desc_t *tmp; unsigned num_tl_resources; - unsigned num_sa_resources; ucs_status_t status; ucp_rsc_index_t i; @@ -666,18 +685,14 @@ static ucs_status_t ucp_add_tl_resources(ucp_context_h context, goto err; } - /* If the md supports client-server connection establishment via sockaddr, add a new tl resource here for the client side iface.
*/ - num_sa_resources = !!(md->attr.cap.flags & UCT_MD_FLAG_SOCKADDR); - - if ((num_tl_resources == 0) && (!num_sa_resources)) { + if (num_tl_resources == 0) { ucs_debug("No tl resources found for md %s", md->rsc.md_name); goto out_free_resources; } tmp = ucs_realloc(context->tl_rscs, sizeof(*context->tl_rscs) * - (context->num_tls + num_tl_resources + num_sa_resources), + (context->num_tls + num_tl_resources), "ucp resources"); if (tmp == NULL) { ucs_error("Failed to allocate resources"); @@ -686,34 +701,22 @@ static ucs_status_t ucp_add_tl_resources(ucp_context_h context, } /* print configuration */ - for (i = 0; i < config->tls.count; ++i) { - ucs_trace("allowed transport %d : '%s'", i, config->tls.names[i]); + for (i = 0; i < config->tls.array.count; ++i) { + ucs_trace("allowed transport %d : '%s'", i, config->tls.array.names[i]); } /* copy only the resources enabled by user configuration */ context->tl_rscs = tmp; for (i = 0; i < num_tl_resources; ++i) { - if (!(md->attr.cap.flags & UCT_MD_FLAG_SOCKADDR)) { - ucs_string_set_addf(&avail_devices[tl_resources[i].dev_type], - "'%s'(%s)", tl_resources[i].dev_name, - context->tl_cmpts[md->cmpt_index].attr.name); - ucs_string_set_add(avail_tls, tl_resources[i].tl_name); - } + ucs_string_set_addf(&avail_devices[tl_resources[i].dev_type], + "'%s'(%s)", tl_resources[i].dev_name, + context->tl_cmpts[md->cmpt_index].attr.name); + ucs_string_set_add(avail_tls, tl_resources[i].tl_name); ucp_add_tl_resource_if_enabled(context, md, md_index, config, &tl_resources[i], 0, num_resources_p, dev_cfg_masks, tl_cfg_mask); } - /* add sockaddr dummy resource, if md supports it */ - if (md->attr.cap.flags & UCT_MD_FLAG_SOCKADDR) { - sa_rsc.dev_type = UCT_DEVICE_TYPE_NET; - ucs_snprintf_zero(sa_rsc.tl_name, UCT_TL_NAME_MAX, "%s", md->rsc.md_name); - ucs_snprintf_zero(sa_rsc.dev_name, UCT_DEVICE_NAME_MAX, "sockaddr"); - ucp_add_tl_resource_if_enabled(context, md, md_index, config, &sa_rsc, - UCP_TL_RSC_FLAG_SOCKADDR, num_resources_p, - dev_cfg_masks, tl_cfg_mask); - } - out_free_resources: uct_release_tl_resource_list(tl_resources); return UCS_OK; @@ -744,12 +747,11 @@ static void ucp_report_unavailable(const ucs_config_names_array_t* cfg, const char *title2, const ucs_string_set_t *avail_names) { - ucs_string_buffer_t avail_strb, unavail_strb; + UCS_STRING_BUFFER_ONSTACK(avail_strb, 256); + UCS_STRING_BUFFER_ONSTACK(unavail_strb, 256); unsigned i; int found; - ucs_string_buffer_init(&unavail_strb); - found = 0; for (i = 0; i < cfg->count; i++) { if (!(mask & UCS_BIT(i)) && strcmp(cfg->names[i], UCP_RSC_CONFIG_ALL) && @@ -761,7 +763,6 @@ static void ucp_report_unavailable(const ucs_config_names_array_t* cfg, } if (found) { - ucs_string_buffer_init(&avail_strb); ucs_string_set_print_sorted(avail_names, &avail_strb, ", "); ucs_warn("%s%s%s %s %s not available, please use one or more of: %s", title1, title2, @@ -769,10 +770,7 @@ static void ucp_report_unavailable(const ucs_config_names_array_t* cfg, ucs_string_buffer_cstr(&unavail_strb), (found > 1) ? 
"are" : "is", ucs_string_buffer_cstr(&avail_strb)); - ucs_string_buffer_cleanup(&avail_strb); } - - ucs_string_buffer_cleanup(&unavail_strb); } const char * ucp_find_tl_name_by_csum(ucp_context_t *context, uint16_t tl_name_csum) @@ -805,8 +803,9 @@ static ucs_status_t ucp_check_tl_names(ucp_context_t *context) return UCS_OK; } -const char* ucp_tl_bitmap_str(ucp_context_h context, uint64_t tl_bitmap, - char *str, size_t max_str_len) +const char *ucp_tl_bitmap_str(ucp_context_h context, + const ucp_tl_bitmap_t *tl_bitmap, char *str, + size_t max_str_len) { ucp_rsc_index_t i; char *p, *endp; @@ -814,7 +813,7 @@ const char* ucp_tl_bitmap_str(ucp_context_h context, uint64_t tl_bitmap, p = str; endp = str + max_str_len; - ucs_for_each_bit(i, tl_bitmap) { + UCS_BITMAP_FOR_EACH_BIT(*tl_bitmap, i) { ucs_snprintf_zero(p, endp - p, "%s ", context->tl_rscs[i].tl_rsc.tl_name); p += strlen(p); @@ -855,8 +854,10 @@ static ucs_status_t ucp_check_resource_config(const ucp_config_t *config) /* if we got here then num_resources > 0. * if the user's tls list is empty, there is no match */ - if (0 == config->tls.count) { - ucs_error("The TLs list is empty. Please specify the transports you would like to use " + if ((0 == config->tls.array.count) && + (config->tls.mode != UCS_CONFIG_ALLOW_LIST_ALLOW_ALL)) { + ucs_error("The TLs list is empty. Please specify the transports you " + "would like to allow/forbid " "or omit the UCX_TLS so that the default will be used."); return UCS_ERR_NO_ELEM; } @@ -937,7 +938,7 @@ static void ucp_resource_config_str(const ucp_config_t *config, char *buf, p = buf; endp = buf + max; - ucp_resource_config_array_str(&config->tls, "", p, endp - p); + ucp_resource_config_array_str(&config->tls.array, "", p, endp - p); if (strlen(p)) { p += strlen(p); @@ -958,98 +959,30 @@ static void ucp_resource_config_str(const ucp_config_t *config, char *buf, } } -static void ucp_fill_sockaddr_aux_tls_config(ucp_context_h context, - const ucp_config_t *config) -{ - const char **tl_names = (const char**)config->sockaddr_aux_tls.aux_tls; - unsigned count = config->sockaddr_aux_tls.count; - uint8_t dummy_flags = 0; - uint64_t dummy_mask = 0; - ucp_rsc_index_t tl_id; - - context->config.sockaddr_aux_rscs_bitmap = 0; - - /* Check if any of the context's resources are present in the sockaddr - * auxiliary transports for the client-server flow */ - ucs_for_each_bit(tl_id, context->tl_bitmap) { - if (ucp_is_resource_in_transports_list(context->tl_rscs[tl_id].tl_rsc.tl_name, - tl_names, count, &dummy_flags, - &dummy_mask)) { - context->config.sockaddr_aux_rscs_bitmap |= UCS_BIT(tl_id); - } - } -} - -static void ucp_fill_sockaddr_tls_prio_list(ucp_context_h context, - const char **sockaddr_tl_names, - ucp_rsc_index_t num_sockaddr_tls) -{ - uint64_t sa_tls_bitmap = 0; - ucp_rsc_index_t idx = 0; - ucp_tl_resource_desc_t *resource; - ucp_rsc_index_t tl_id; - ucp_tl_md_t *tl_md; - ucp_rsc_index_t j; - - /* Set a bitmap of sockaddr transports */ - for (j = 0; j < context->num_tls; ++j) { - resource = &context->tl_rscs[j]; - tl_md = &context->tl_mds[resource->md_index]; - if (tl_md->attr.cap.flags & UCT_MD_FLAG_SOCKADDR) { - sa_tls_bitmap |= UCS_BIT(j); - } - } - - /* Parse the sockaddr transports priority list */ - for (j = 0; j < num_sockaddr_tls; j++) { - /* go over the priority list and find the transport's tl_id in the - * sockaddr tls bitmap. 
save the tl_id's for the client/server usage - * later */ - ucs_for_each_bit(tl_id, sa_tls_bitmap) { - resource = &context->tl_rscs[tl_id]; - - if (!strcmp(sockaddr_tl_names[j], "*") || - !strncmp(sockaddr_tl_names[j], resource->tl_rsc.tl_name, - UCT_TL_NAME_MAX)) { - context->config.sockaddr_tl_ids[idx] = tl_id; - idx++; - sa_tls_bitmap &= ~UCS_BIT(tl_id); - } - } - } - - context->config.num_sockaddr_tls = idx; -} - static void ucp_fill_sockaddr_cms_prio_list(ucp_context_h context, const char **sockaddr_cm_names, - ucp_rsc_index_t num_sockaddr_cms, - int sockaddr_cm_enable) + ucp_rsc_index_t num_sockaddr_cms) { - uint64_t cm_cmpts_bitmap = context->config.cm_cmpts_bitmap; - uint64_t cm_cmpts_bitmap_safe; + ucp_tl_bitmap_t cm_cmpts_bitmap = context->config.cm_cmpts_bitmap; + ucp_tl_bitmap_t cm_cmpts_bitmap_safe; ucp_rsc_index_t cmpt_idx, cm_idx; memset(&context->config.cm_cmpt_idxs, UCP_NULL_RESOURCE, UCP_MAX_RESOURCES); context->config.num_cm_cmpts = 0; - if (!sockaddr_cm_enable) { - return; - } - /* Parse the sockaddr CMs priority list */ for (cm_idx = 0; cm_idx < num_sockaddr_cms; ++cm_idx) { /* go over the priority list and find the CM's cm_idx in the * sockaddr CMs bitmap. Save the cmpt_idx for the client/server usage * later */ cm_cmpts_bitmap_safe = cm_cmpts_bitmap; - ucs_for_each_bit(cmpt_idx, cm_cmpts_bitmap_safe) { + UCS_BITMAP_FOR_EACH_BIT(cm_cmpts_bitmap_safe, cmpt_idx) { if (!strcmp(sockaddr_cm_names[cm_idx], "*") || !strncmp(sockaddr_cm_names[cm_idx], context->tl_cmpts[cmpt_idx].attr.name, UCT_COMPONENT_NAME_MAX)) { context->config.cm_cmpt_idxs[context->config.num_cm_cmpts++] = cmpt_idx; - cm_cmpts_bitmap &= ~UCS_BIT(cmpt_idx); + UCS_BITMAP_UNSET(cm_cmpts_bitmap, cmpt_idx); } } } @@ -1060,8 +993,6 @@ static ucs_status_t ucp_fill_sockaddr_prio_list(ucp_context_h context, { const char **sockaddr_tl_names = (const char**)config->sockaddr_cm_tls.cm_tls; unsigned num_sockaddr_tls = config->sockaddr_cm_tls.count; - int sockaddr_cm_enable = context->config.ext.sockaddr_cm_enable != - UCS_NO; /* Check if a list of sockaddr transports/CMs has valid length */ if (num_sockaddr_tls > UCP_MAX_RESOURCES) { @@ -1070,13 +1001,10 @@ static ucs_status_t ucp_fill_sockaddr_prio_list(ucp_context_h context, num_sockaddr_tls = UCP_MAX_RESOURCES; } - ucp_fill_sockaddr_tls_prio_list(context, sockaddr_tl_names, - num_sockaddr_tls); ucp_fill_sockaddr_cms_prio_list(context, sockaddr_tl_names, - num_sockaddr_tls, sockaddr_cm_enable); - if ((context->config.ext.sockaddr_cm_enable == UCS_YES) && - (context->config.num_cm_cmpts == 0)) { - ucs_error("UCX_SOCKADDR_CM_ENABLE is set to yes but none of the available components supports SOCKADDR_CM"); + num_sockaddr_tls); + if (context->config.num_cm_cmpts == 0) { + ucs_diag("none of the available components supports sockaddr connection management"); return UCS_ERR_UNSUPPORTED; } @@ -1216,7 +1144,7 @@ static ucs_status_t ucp_fill_resources(ucp_context_h context, context->num_mem_type_detect_mds = 0; for (i = 0; i < UCS_MEMORY_TYPE_LAST; ++i) { - context->mem_type_access_tls[i] = 0; + UCS_BITMAP_CLEAR(&context->mem_type_access_tls[i]); } ucs_string_set_init(&avail_tls); @@ -1250,7 +1178,7 @@ static ucs_status_t ucp_fill_resources(ucp_context_h context, goto out_release_components; } - context->config.cm_cmpts_bitmap = 0; + UCS_BITMAP_CLEAR(&context->config.cm_cmpts_bitmap); max_mds = 0; for (i = 0; i < context->num_cmpts; ++i) { @@ -1267,15 +1195,15 @@ static ucs_status_t ucp_fill_resources(ucp_context_h context, } if (context->tl_cmpts[i].attr.flags & 
UCT_COMPONENT_FLAG_CM) { - context->config.cm_cmpts_bitmap |= UCS_BIT(i); + UCS_BITMAP_SET(context->config.cm_cmpts_bitmap, i); } max_mds += context->tl_cmpts[i].attr.md_resource_count; } - if ((context->config.ext.sockaddr_cm_enable == UCS_YES) && - (context->config.cm_cmpts_bitmap == 0)) { - ucs_error("there are no UCT components with CM capability"); + if (UCS_BITMAP_IS_ZERO(context->config.cm_cmpts_bitmap, + UCP_MAX_RESOURCES)) { + ucs_debug("there are no UCT components with CM capability"); status = UCS_ERR_UNSUPPORTED; goto err_free_resources; } @@ -1313,7 +1241,8 @@ static ucs_status_t ucp_fill_resources(ucp_context_h context, * Then the worker will open all available transport resources and will * select only the best ones for each particular device. */ - context->tl_bitmap = config->ctx.unified_mode ? 0 : UCS_MASK(context->num_tls); + UCS_BITMAP_MASK(&context->tl_bitmap, + config->ctx.unified_mode ? 0 : context->num_tls); /* Warn about devices and transports which were specified explicitly in the * configuration, but are not available @@ -1328,7 +1257,7 @@ static ucs_status_t ucp_fill_resources(ucp_context_h context, } ucp_get_aliases_set(&avail_tls); - ucp_report_unavailable(&config->tls, tl_cfg_mask, "", "transport", + ucp_report_unavailable(&config->tls.array, tl_cfg_mask, "", "transport", &avail_tls); } @@ -1338,7 +1267,6 @@ static ucs_status_t ucp_fill_resources(ucp_context_h context, goto err_free_resources; } - ucp_fill_sockaddr_aux_tls_config(context, config); status = ucp_fill_sockaddr_prio_list(context, config); if (status != UCS_OK) { goto err_free_resources; @@ -1362,50 +1290,33 @@ static ucs_status_t ucp_fill_resources(ucp_context_h context, static void ucp_apply_params(ucp_context_h context, const ucp_params_t *params, ucp_mt_type_t mt_type) { - if (params->field_mask & UCP_PARAM_FIELD_FEATURES) { - context->config.features = params->features; - } else { - context->config.features = 0; - } + context->config.features = UCP_PARAM_FIELD_VALUE(params, features, FEATURES, + 0); if (!context->config.features) { ucs_warn("empty features set passed to ucp context create"); } - if (params->field_mask & UCP_PARAM_FIELD_TAG_SENDER_MASK) { - context->config.tag_sender_mask = params->tag_sender_mask; - } else { - context->config.tag_sender_mask = 0; - } + context->config.tag_sender_mask = UCP_PARAM_FIELD_VALUE(params, + tag_sender_mask, + TAG_SENDER_MASK, 0); - if (params->field_mask & UCP_PARAM_FIELD_REQUEST_SIZE) { - context->config.request.size = params->request_size; - } else { - context->config.request.size = 0; - } + context->config.request.size = UCP_PARAM_FIELD_VALUE(params, request_size, + REQUEST_SIZE, 0); - if (params->field_mask & UCP_PARAM_FIELD_REQUEST_INIT) { - context->config.request.init = params->request_init; - } else { - context->config.request.init = NULL; - } + context->config.request.init = UCP_PARAM_FIELD_VALUE(params, request_init, + REQUEST_INIT, NULL); - if (params->field_mask & UCP_PARAM_FIELD_REQUEST_CLEANUP) { - context->config.request.cleanup = params->request_cleanup; - } else { - context->config.request.cleanup = NULL; - } + context->config.request.cleanup = UCP_PARAM_FIELD_VALUE(params, + request_cleanup, + REQUEST_CLEANUP, NULL); - if (params->field_mask & UCP_PARAM_FIELD_ESTIMATED_NUM_EPS) { - context->config.est_num_eps = params->estimated_num_eps; - } else { - context->config.est_num_eps = 1; - } + context->config.est_num_eps = UCP_PARAM_FIELD_VALUE(params, + estimated_num_eps, + ESTIMATED_NUM_EPS, 1); - if (params->field_mask & 
UCP_PARAM_FIELD_ESTIMATED_NUM_PPN) { - context->config.est_num_ppn = params->estimated_num_ppn; - } else { - context->config.est_num_ppn = 1; - } + context->config.est_num_ppn = UCP_PARAM_FIELD_VALUE(params, + estimated_num_ppn, + ESTIMATED_NUM_PPN, 1); if ((params->field_mask & UCP_PARAM_FIELD_MT_WORKERS_SHARED) && params->mt_workers_shared) { @@ -1413,6 +1324,13 @@ static void ucp_apply_params(ucp_context_h context, const ucp_params_t *params, } else { context->mt_lock.mt_type = UCP_MT_TYPE_NONE; } + + if ((params->field_mask & UCP_PARAM_FIELD_NAME) && (params->name != NULL)) { + ucs_snprintf_zero(context->name, UCP_ENTITY_NAME_MAX, "%s", + params->name); + } else { + ucs_snprintf_zero(context->name, UCP_ENTITY_NAME_MAX, "%p", context); + } } static ucs_status_t ucp_fill_config(ucp_context_h context, @@ -1455,11 +1373,19 @@ static ucs_status_t ucp_fill_config(ucp_context_h context, * routines */ UCP_THREAD_LOCK_INIT(&context->mt_lock); + /* save comparison MD for iface_attr adjustment */ + context->config.selection_cmp = ucs_strdup(config->selection_cmp, + "selection cmp"); + if (context->config.selection_cmp == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err; + } + /* save environment prefix to later notify user for unused variables */ context->config.env_prefix = ucs_strdup(config->env_prefix, "ucp config"); if (context->config.env_prefix == NULL) { status = UCS_ERR_NO_MEMORY; - goto err; + goto err_free_selection_cmp; } /* Get allocation alignment from configuration, make sure it's valid */ @@ -1534,12 +1460,21 @@ static ucs_status_t ucp_fill_config(ucp_context_h context, } } + if (context->config.ext.keepalive_num_eps == 0) { + ucs_error("UCX_KEEPALIVE_NUM_EPS value must be greater than 0"); + status = UCS_ERR_INVALID_PARAM; + goto err_free_alloc_methods; + } + + context->config.keepalive_interval = ucs_time_from_sec(context->config.ext.keepalive_interval); return UCS_OK; err_free_alloc_methods: ucs_free(context->config.alloc_methods); err_free_env_prefix: ucs_free(context->config.env_prefix); +err_free_selection_cmp: + ucs_free(context->config.selection_cmp); err: UCP_THREAD_LOCK_FINALIZE(&context->mt_lock); return status; @@ -1549,6 +1484,14 @@ static void ucp_free_config(ucp_context_h context) { ucs_free(context->config.alloc_methods); ucs_free(context->config.env_prefix); + ucs_free(context->config.selection_cmp); +} + +static void ucp_context_create_vfs(ucp_context_h context) +{ + ucs_vfs_obj_add_dir(NULL, context, "ucp/context/%s", context->name); + ucs_vfs_obj_add_ro_file(context, ucs_vfs_show_memory_address, NULL, 0, + "memory_address"); } ucs_status_t ucp_init_version(unsigned api_major_version, unsigned api_minor_version, @@ -1602,9 +1545,12 @@ ucs_status_t ucp_init_version(unsigned api_major_version, unsigned api_minor_ver ucp_config_release(dfl_config); } - ucs_debug("created ucp context %p [%d mds %d tls] features 0x%"PRIx64 - " tl bitmap 0x%"PRIx64, context, context->num_mds, - context->num_tls, context->config.features, context->tl_bitmap); + ucp_context_create_vfs(context); + + ucs_debug("created ucp context %s %p [%d mds %d tls] features 0x%" PRIx64 + " tl bitmap " UCT_TL_BITMAP_FMT, + context->name, context, context->num_mds, context->num_tls, + context->config.features, UCT_TL_BITMAP_ARG(&context->tl_bitmap)); *context_p = context; return UCS_OK; @@ -1623,6 +1569,7 @@ ucs_status_t ucp_init_version(unsigned api_major_version, unsigned api_minor_ver void ucp_cleanup(ucp_context_h context) { + ucs_vfs_obj_remove(context); ucp_free_resources(context); 
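/* The UCP_PARAM_FIELD_VALUE() conversions in ucp_apply_params() above all
 * follow the same mask-or-default pattern. A rough sketch of the idea (not
 * the exact macro body from this diff, which delegates to UCS_PARAM_VALUE):
 *
 *     #define FIELD_VALUE(_params, _name, _flag, _default) \
 *         (((_params)->field_mask & (_flag)) ? (_params)->_name : (_default))
 *
 *     // replaces the removed if/else ladder, e.g.:
 *     config->est_num_eps = FIELD_VALUE(params, estimated_num_eps,
 *                                       UCP_PARAM_FIELD_ESTIMATED_NUM_EPS, 1);
 */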
ucp_free_config(context); UCP_THREAD_LOCK_FINALIZE(&context->mt_lock); @@ -1674,6 +1621,19 @@ void ucp_context_uct_atomic_iface_flags(ucp_context_h context, } } +ucs_status_t ucp_lib_query(ucp_lib_attr_t *attr) +{ + if (attr->field_mask & UCP_LIB_ATTR_FIELD_MAX_THREAD_LEVEL) { +#if ENABLE_MT + attr->max_thread_level = UCS_THREAD_MODE_MULTI; +#else + attr->max_thread_level = UCS_THREAD_MODE_SERIALIZED; +#endif + } + + return UCS_OK; +} + ucs_status_t ucp_context_query(ucp_context_h context, ucp_context_attr_t *attr) { if (attr->field_mask & UCP_ATTR_FIELD_REQUEST_SIZE) { @@ -1692,6 +1652,10 @@ ucs_status_t ucp_context_query(ucp_context_h context, ucp_context_attr_t *attr) attr->memory_types = context->mem_type_mask; } + if (attr->field_mask & UCP_ATTR_FIELD_NAME) { + ucs_strncpy_safe(attr->name, context->name, UCP_ENTITY_NAME_MAX); + } + return UCS_OK; } @@ -1743,63 +1707,71 @@ uct_md_h ucp_context_find_tl_md(ucp_context_h context, const char *md_name) return NULL; } -ucs_memory_type_t -ucp_memory_type_detect_mds(ucp_context_h context, const void *address, size_t size) +void ucp_memory_detect_slowpath(ucp_context_h context, const void *address, + size_t length, ucs_memory_info_t *mem_info) { - ucs_memory_type_t mem_type; - unsigned i, md_index; + uct_md_mem_attr_t mem_attr; ucs_status_t status; + ucp_md_index_t i; + uct_md_h md; + + mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE | + UCT_MD_MEM_ATTR_FIELD_SYS_DEV; for (i = 0; i < context->num_mem_type_detect_mds; ++i) { - md_index = context->mem_type_detect_mds[i]; - status = uct_md_detect_memory_type(context->tl_mds[md_index].md, - address, size, &mem_type); + md = context->tl_mds[context->mem_type_detect_mds[i]].md; + status = uct_md_mem_query(md, address, length, &mem_attr); if (status == UCS_OK) { + mem_info->type = mem_attr.mem_type; + mem_info->sys_dev = mem_attr.sys_dev; if (context->memtype_cache != NULL) { - ucs_memtype_cache_update(context->memtype_cache, address, size, - mem_type); + ucs_memtype_cache_update(context->memtype_cache, address, + length, mem_info); } - return mem_type; + return; } } /* Memory type not detected by any memtype MD - assume it is host memory */ - return UCS_MEMORY_TYPE_HOST; + ucp_memory_info_set_host(mem_info); } -uint64_t ucp_context_dev_tl_bitmap(ucp_context_h context, const char *dev_name) +void +ucp_context_dev_tl_bitmap(ucp_context_h context, const char *dev_name, + ucp_tl_bitmap_t *tl_bitmap) { - uint64_t tl_bitmap; ucp_rsc_index_t tl_idx; - tl_bitmap = 0; - - ucs_for_each_bit(tl_idx, context->tl_bitmap) { + UCS_BITMAP_CLEAR(tl_bitmap); + UCS_BITMAP_FOR_EACH_BIT(context->tl_bitmap, tl_idx) { if (strcmp(context->tl_rscs[tl_idx].tl_rsc.dev_name, dev_name)) { continue; } - tl_bitmap |= UCS_BIT(tl_idx); + UCS_BITMAP_SET(*tl_bitmap, tl_idx); } - - return tl_bitmap; } -uint64_t ucp_context_dev_idx_tl_bitmap(ucp_context_h context, - ucp_rsc_index_t dev_idx) +void +ucp_context_dev_idx_tl_bitmap(ucp_context_h context, ucp_rsc_index_t dev_idx, + ucp_tl_bitmap_t *tl_bitmap) { - uint64_t tl_bitmap; ucp_rsc_index_t tl_idx; - tl_bitmap = 0; - - ucs_for_each_bit(tl_idx, context->tl_bitmap) { + UCS_BITMAP_CLEAR(tl_bitmap); + UCS_BITMAP_FOR_EACH_BIT(context->tl_bitmap, tl_idx) { if (context->tl_rscs[tl_idx].dev_index == dev_idx) { - tl_bitmap |= UCS_BIT(tl_idx); + UCS_BITMAP_SET(*tl_bitmap, tl_idx); } } +} - return tl_bitmap; +void ucp_tl_bitmap_validate(const ucp_tl_bitmap_t *tl_bitmap, + const ucp_tl_bitmap_t *tl_bitmap_super) +{ + ucs_assert_always(UCS_BITMAP_IS_ZERO(UCP_TL_BITMAP_AND_NOT(*tl_bitmap, + 
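/* The uint64_t -> ucp_tl_bitmap_t conversions in this file all trade raw bit
 * arithmetic for the UCS_BITMAP_* accessors. A side-by-side sketch with
 * hypothetical variables:
 *
 *     // before                          // after
 *     uint64_t map = 0;                  ucp_tl_bitmap_t map;
 *     map |= UCS_BIT(i);                 UCS_BITMAP_CLEAR(&map);
 *     ucs_for_each_bit(i, map) {...}     UCS_BITMAP_SET(map, i);
 *                                        UCS_BITMAP_FOR_EACH_BIT(map, i) {...}
 */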
*tl_bitmap_super), + UCP_MAX_RESOURCES)); } const char* ucp_context_cm_name(ucp_context_h context, ucp_rsc_index_t cm_idx) diff --git a/src/ucp/core/ucp_context.h b/src/ucp/core/ucp_context.h index 2212442983b..4c92bec99b4 100644 --- a/src/ucp/core/ucp_context.h +++ b/src/ucp/core/ucp_context.h @@ -18,9 +18,11 @@ #include #include #include +#include #include #include #include +#include enum { @@ -76,8 +78,8 @@ typedef struct ucp_context_config { int tm_sw_rndv; /** Pack debug information in worker address */ int address_debug_info; - /** Maximal size of worker name for debugging */ - unsigned max_worker_name; + /** Maximal size of worker address name for debugging */ + unsigned max_worker_address_name; /** Atomic mode */ ucp_atomic_mode_t atomic_mode; /** If use mutex for MT support or not */ @@ -98,8 +100,6 @@ typedef struct ucp_context_config { int flush_worker_eps; /** Enable optimizations suitable for homogeneous systems */ int unified_mode; - /** Enable cm wireup-and-close protocol for client-server connections */ - ucs_ternary_value_t sockaddr_cm_enable; /** Enable cm wireup message exchange to select the best transports * for all lanes after cm phase is done */ int cm_use_all_devices; @@ -108,7 +108,7 @@ typedef struct ucp_context_config { /** Enable new protocol selection logic */ int proto_enable; /** Time period between keepalive rounds (0 - disabled) */ - ucs_time_t keepalive_timeout; + double keepalive_interval; /** Maximal number of endpoints to check on every keepalive round * (0 - disabled, inf - check all endpoints on every round) */ unsigned keepalive_num_eps; @@ -123,7 +123,7 @@ struct ucp_config { * and acceleration devices */ ucs_config_names_array_t devices[UCT_DEVICE_TYPE_LAST]; /** Array of transport names to use */ - ucs_config_names_array_t tls; + ucs_config_allow_list_t tls; /** Array of memory allocation methods */ UCS_CONFIG_STRING_ARRAY_FIELD(methods) alloc_prio; /** Array of transports for partial worker address to pack */ @@ -134,6 +134,8 @@ struct ucp_config { int warn_invalid_config; /** This config environment prefix */ char *env_prefix; + /** MD to compare for transport selection scores */ + char *selection_cmp; /** Configuration saved directly in the context */ ucp_context_config_t ctx; }; @@ -199,13 +201,13 @@ typedef struct ucp_context { ucs_memtype_cache_t *memtype_cache; /* mem type allocation cache */ ucp_tl_resource_desc_t *tl_rscs; /* Array of communication resources */ - uint64_t tl_bitmap; /* Cached map of tl resources used by workers. + ucp_tl_bitmap_t tl_bitmap; /* Cached map of tl resources used by workers. * Not all resources may be used if unified * mode is enabled. */ ucp_rsc_index_t num_tls; /* Number of resources in the array */ /* Mask of memory type communication resources */ - uint64_t mem_type_access_tls[UCS_MEMORY_TYPE_LAST]; + ucp_tl_bitmap_t mem_type_access_tls[UCS_MEMORY_TYPE_LAST]; struct { @@ -236,15 +238,8 @@ typedef struct ucp_context { unsigned num_alloc_methods; /* Cached map of components which support CM capability */ - uint64_t cm_cmpts_bitmap; + ucp_tl_bitmap_t cm_cmpts_bitmap; - /* Bitmap of sockaddr auxiliary transports to pack for client/server flow */ - uint64_t sockaddr_aux_rscs_bitmap; - - /* Array of sockaddr transports indexes. - * The indexes appear in the configured priority order */ - ucp_rsc_index_t sockaddr_tl_ids[UCP_MAX_RESOURCES]; - ucp_rsc_index_t num_sockaddr_tls; /* Array of CMs indexes. The indexes appear in the configured priority * order. 
 */
    ucp_rsc_index_t         cm_cmpt_idxs[UCP_MAX_RESOURCES];
@@ -252,15 +247,22 @@ typedef struct ucp_context {
         /* Configuration supplied by the user */
         ucp_context_config_t ext;
-
+
         /* Config environment prefix used to create the context */
         char *env_prefix;
+
+        /* Time period between keepalive rounds */
+        ucs_time_t keepalive_interval;
+
+        /* MD to compare for transport selection scores */
+        char *selection_cmp;
     } config;

-    /* All configurations about multithreading support */
+    /* Configuration of multi-threading support */
     ucp_mt_lock_t mt_lock;

+    char name[UCP_ENTITY_NAME_MAX];
+
 } ucp_context_t;

@@ -351,8 +353,12 @@ typedef struct ucp_tl_iface_atomic_flags {

 #define UCP_PARAM_VALUE(_obj, _params, _name, _flag, _default) \
-    (((_params)->field_mask & (UCP_##_obj##_PARAM_FIELD_##_flag)) ? \
-     (_params)->_name : (_default))
+    UCS_PARAM_VALUE(UCS_PP_TOKENPASTE3(UCP_, _obj, _PARAM_FIELD), _params, \
+                    _name, _flag, _default)
+
+
+#define UCP_PARAM_FIELD_VALUE(_params, _name, _flag, _default) \
+    UCS_PARAM_VALUE(UCP_PARAM_FIELD, _params, _name, _flag, _default)

 #define ucp_assert_memtype(_context, _buffer, _length, _mem_type) \
@@ -374,14 +380,15 @@ void ucp_context_uct_atomic_iface_flags(ucp_context_h context,

 const char * ucp_find_tl_name_by_csum(ucp_context_t *context,
                                       uint16_t tl_name_csum);

-const char* ucp_tl_bitmap_str(ucp_context_h context, uint64_t tl_bitmap,
-                              char *str, size_t max_str_len);
+const char *ucp_tl_bitmap_str(ucp_context_h context,
+                              const ucp_tl_bitmap_t *tl_bitmap, char *str,
+                              size_t max_str_len);

 const char* ucp_feature_flags_str(unsigned feature_flags, char *str,
                                   size_t max_str_len);

-ucs_memory_type_t
-ucp_memory_type_detect_mds(ucp_context_h context, const void *address, size_t length);
+void ucp_memory_detect_slowpath(ucp_context_h context, const void *address,
+                                size_t length, ucs_memory_info_t *mem_info);

 /**
  * Calculate a small value to overcome float imprecision
@@ -443,51 +450,68 @@ static UCS_F_ALWAYS_INLINE int ucp_memory_type_cache_is_empty(ucp_context_h cont
             !context->memtype_cache->pgtable.num_regions);
 }

-static UCS_F_ALWAYS_INLINE ucs_memory_type_t
-ucp_memory_type_detect(ucp_context_h context, const void *address, size_t length)
+static UCS_F_ALWAYS_INLINE void
+ucp_memory_info_set_host(ucs_memory_info_t *mem_info)
+{
+    mem_info->type    = UCS_MEMORY_TYPE_HOST;
+    mem_info->sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN;
+}
+
+static UCS_F_ALWAYS_INLINE void
+ucp_memory_detect(ucp_context_h context, const void *address, size_t length,
+                  ucs_memory_info_t *mem_info)
 {
-    ucs_memory_type_t mem_type;
     ucs_status_t status;

     if (ucs_likely(context->num_mem_type_detect_mds == 0)) {
-        return UCS_MEMORY_TYPE_HOST;
+        goto out_host_mem;
     }

     if (ucs_likely(context->memtype_cache != NULL)) {
         if (!context->memtype_cache->pgtable.num_regions) {
-            return UCS_MEMORY_TYPE_HOST;
+            goto out_host_mem;
         }

         status = ucs_memtype_cache_lookup(context->memtype_cache, address,
-                                          length, &mem_type);
-        if (status != UCS_OK) {
+                                          length, mem_info);
+        if (ucs_likely(status != UCS_OK)) {
             ucs_assert(status == UCS_ERR_NO_ELEM);
-            return UCS_MEMORY_TYPE_HOST;
+            goto out_host_mem;
         }

-        if (mem_type != UCS_MEMORY_TYPE_LAST) {
-            return mem_type;
+        if ((mem_info->type != UCS_MEMORY_TYPE_UNKNOWN) &&
+            ((mem_info->sys_dev != UCS_SYS_DEVICE_ID_UNKNOWN))) {
+            return;
         }

-        /* mem_type is UCS_MEMORY_TYPE_LAST: fall thru to memory detection by
-         * UCT memory domains */
+        /* Fall thru to slow-path memory type and system device detection by UCT
+         * memory domains.
In any case, the memory type cache is not expected to + * return HOST memory type. + */ + ucs_assert(mem_info->type != UCS_MEMORY_TYPE_HOST); } - return ucp_memory_type_detect_mds(context, address, length); -} + ucp_memory_detect_slowpath(context, address, length, mem_info); + return; -static UCS_F_ALWAYS_INLINE ucs_memory_type_t -ucp_get_memory_type(ucp_context_h context, const void *address, - size_t length, ucs_memory_type_t memory_type) -{ - return (memory_type == UCS_MEMORY_TYPE_UNKNOWN) ? - ucp_memory_type_detect(context, address, length) : memory_type; +out_host_mem: + ucp_memory_info_set_host(mem_info); } -uint64_t ucp_context_dev_tl_bitmap(ucp_context_h context, const char *dev_name); -uint64_t ucp_context_dev_idx_tl_bitmap(ucp_context_h context, - ucp_rsc_index_t dev_idx); +void +ucp_context_dev_tl_bitmap(ucp_context_h context, const char *dev_name, + ucp_tl_bitmap_t *tl_bitmap); + + +void +ucp_context_dev_idx_tl_bitmap(ucp_context_h context, ucp_rsc_index_t dev_idx, + ucp_tl_bitmap_t *tl_bitmap); + + +void ucp_tl_bitmap_validate(const ucp_tl_bitmap_t *tl_bitmap, + const ucp_tl_bitmap_t *tl_bitmap_super); + const char* ucp_context_cm_name(ucp_context_h context, ucp_rsc_index_t cm_idx); diff --git a/src/ucp/core/ucp_ep.c b/src/ucp/core/ucp_ep.c index 54dd8cb0891..c52fc5d2903 100644 --- a/src/ucp/core/ucp_ep.c +++ b/src/ucp/core/ucp_ep.c @@ -25,11 +25,16 @@ #include #include #include +#include +#include + #include #include #include +#include #include #include +#include #include @@ -57,6 +62,56 @@ static ucs_stats_class_t ucp_ep_stats_class = { }; #endif +static uct_iface_t ucp_failed_tl_iface = { + .ops = { + .ep_put_short = (uct_ep_put_short_func_t)ucs_empty_function_return_ep_timeout, + .ep_put_bcopy = (uct_ep_put_bcopy_func_t)ucs_empty_function_return_bc_ep_timeout, + .ep_put_zcopy = (uct_ep_put_zcopy_func_t)ucs_empty_function_return_ep_timeout, + .ep_get_short = (uct_ep_get_short_func_t)ucs_empty_function_return_ep_timeout, + .ep_get_bcopy = (uct_ep_get_bcopy_func_t)ucs_empty_function_return_ep_timeout, + .ep_get_zcopy = (uct_ep_get_zcopy_func_t)ucs_empty_function_return_ep_timeout, + .ep_am_short = (uct_ep_am_short_func_t)ucs_empty_function_return_ep_timeout, + .ep_am_short_iov = (uct_ep_am_short_iov_func_t)ucs_empty_function_return_ep_timeout, + .ep_am_bcopy = (uct_ep_am_bcopy_func_t)ucs_empty_function_return_bc_ep_timeout, + .ep_am_zcopy = (uct_ep_am_zcopy_func_t)ucs_empty_function_return_ep_timeout, + .ep_atomic_cswap64 = (uct_ep_atomic_cswap64_func_t)ucs_empty_function_return_ep_timeout, + .ep_atomic_cswap32 = (uct_ep_atomic_cswap32_func_t)ucs_empty_function_return_ep_timeout, + .ep_atomic64_post = (uct_ep_atomic64_post_func_t)ucs_empty_function_return_ep_timeout, + .ep_atomic32_post = (uct_ep_atomic32_post_func_t)ucs_empty_function_return_ep_timeout, + .ep_atomic64_fetch = (uct_ep_atomic64_fetch_func_t)ucs_empty_function_return_ep_timeout, + .ep_atomic32_fetch = (uct_ep_atomic32_fetch_func_t)ucs_empty_function_return_ep_timeout, + .ep_tag_eager_short = (uct_ep_tag_eager_short_func_t)ucs_empty_function_return_ep_timeout, + .ep_tag_eager_bcopy = (uct_ep_tag_eager_bcopy_func_t)ucs_empty_function_return_ep_timeout, + .ep_tag_eager_zcopy = (uct_ep_tag_eager_zcopy_func_t)ucs_empty_function_return_ep_timeout, + .ep_tag_rndv_zcopy = (uct_ep_tag_rndv_zcopy_func_t)ucs_empty_function_return_ep_timeout, + .ep_tag_rndv_cancel = (uct_ep_tag_rndv_cancel_func_t)ucs_empty_function_return_ep_timeout, + .ep_tag_rndv_request = 
(uct_ep_tag_rndv_request_func_t)ucs_empty_function_return_ep_timeout, + .ep_pending_add = (uct_ep_pending_add_func_t)ucs_empty_function_return_busy, + .ep_pending_purge = (uct_ep_pending_purge_func_t)ucs_empty_function_return_success, + .ep_flush = (uct_ep_flush_func_t)ucs_empty_function_return_ep_timeout, + .ep_fence = (uct_ep_fence_func_t)ucs_empty_function_return_ep_timeout, + .ep_check = (uct_ep_check_func_t)ucs_empty_function_return_success, + .ep_connect_to_ep = (uct_ep_connect_to_ep_func_t)ucs_empty_function_return_ep_timeout, + .ep_destroy = (uct_ep_destroy_func_t)ucs_empty_function, + .ep_get_address = (uct_ep_get_address_func_t)ucs_empty_function_return_ep_timeout + } +}; + +static uct_ep_t ucp_failed_tl_ep = { + .iface = &ucp_failed_tl_iface +}; + +static const char *ucp_err_handling_mode_names[] = { + [UCP_ERR_HANDLING_MODE_NONE] = "none", + [UCP_ERR_HANDLING_MODE_PEER] = "peer" +}; + + +int ucp_is_uct_ep_failed(uct_ep_h uct_ep) +{ + return uct_ep == &ucp_failed_tl_ep; +} + void ucp_ep_config_key_reset(ucp_ep_config_key_t *key) { ucp_lane_index_t i; @@ -65,8 +120,10 @@ void ucp_ep_config_key_reset(ucp_ep_config_key_t *key) key->num_lanes = 0; for (i = 0; i < UCP_MAX_LANES; ++i) { key->lanes[i].rsc_index = UCP_NULL_RESOURCE; - key->lanes[i].lane_types = 0; key->lanes[i].dst_md_index = UCP_NULL_RESOURCE; + key->lanes[i].dst_sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; + key->lanes[i].path_index = 0; + key->lanes[i].lane_types = 0; } key->am_lane = UCP_NULL_LANE; key->wireup_msg_lane = UCP_NULL_LANE; @@ -78,7 +135,6 @@ void ucp_ep_config_key_reset(ucp_ep_config_key_t *key) key->dst_md_cmpts = NULL; key->ep_check_map = 0; key->err_mode = UCP_ERR_HANDLING_MODE_NONE; - key->status = UCS_OK; memset(key->am_bw_lanes, UCP_NULL_LANE, sizeof(key->am_bw_lanes)); memset(key->rma_lanes, UCP_NULL_LANE, sizeof(key->rma_lanes)); memset(key->rma_bw_lanes, UCP_NULL_LANE, sizeof(key->rma_bw_lanes)); @@ -108,21 +164,28 @@ ucs_status_t ucp_ep_create_base(ucp_worker_h worker, const char *peer_name, goto err_free_ep; } + ep->refcount = 1; ep->cfg_index = UCP_WORKER_CFG_INDEX_NULL; ep->worker = worker; ep->am_lane = UCP_NULL_LANE; ep->flags = 0; ep->conn_sn = UCP_EP_MATCH_CONN_SN_MAX; +#if UCS_ENABLE_ASSERT + ep->flush_iter_refcount = 0; + ep->discard_refcount = 0; +#endif ucp_ep_ext_gen(ep)->user_data = NULL; + ucp_ep_ext_control(ep)->cm_idx = UCP_NULL_RESOURCE; ucp_ep_ext_control(ep)->err_cb = NULL; - ucp_ep_ext_control(ep)->local_ep_id = - ucp_ep_ext_control(ep)->remote_ep_id = UCP_EP_ID_INVALID; + ucp_ep_ext_control(ep)->local_ep_id = UCS_PTR_MAP_KEY_INVALID; + ucp_ep_ext_control(ep)->remote_ep_id = UCS_PTR_MAP_KEY_INVALID; UCS_STATIC_ASSERT(sizeof(ucp_ep_ext_gen(ep)->ep_match) >= sizeof(ucp_ep_ext_gen(ep)->flush_state)); memset(&ucp_ep_ext_gen(ep)->ep_match, 0, sizeof(ucp_ep_ext_gen(ep)->ep_match)); + ucs_hlist_head_init(&ucp_ep_ext_gen(ep)->proto_reqs); ucp_stream_ep_init(ep); ucp_am_ep_init(ep); @@ -131,7 +194,8 @@ ucs_status_t ucp_ep_create_base(ucp_worker_h worker, const char *peer_name, } #if ENABLE_DEBUG_DATA - ucs_snprintf_zero(ep->peer_name, UCP_WORKER_NAME_MAX, "%s", peer_name); + ucs_snprintf_zero(ep->peer_name, UCP_WORKER_ADDRESS_NAME_MAX, "%s", + peer_name); #endif /* Create statistics */ @@ -141,7 +205,8 @@ ucs_status_t ucp_ep_create_base(ucp_worker_h worker, const char *peer_name, goto err_free_ep_control_ext; } - ucs_list_head_init(&ucp_ep_ext_gen(ep)->ep_list); + /* Create endpoint VFS node on demand to avoid memory bloat */ + ucs_vfs_obj_set_dirty(worker, ucp_worker_vfs_refresh); *ep_p = 
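/* The ucp_failed_tl_ep stub above lets a lane that has already been
 * discarded keep absorbing UCT calls (each operation returns a timeout-style
 * status instead of dereferencing a destroyed endpoint). A sketch of the
 * check that completion paths can make, using ucp_is_uct_ep_failed() from
 * this diff:
 *
 *     if (ucp_is_uct_ep_failed(uct_ep)) {
 *         return;   // lane was replaced by the failed stub; drop the work
 *     }
 */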
ep;

    ucs_debug("created ep %p to %s %s", ep, ucp_ep_peer_name(ep), message);

@@ -155,17 +220,91 @@
     return status;
 }

-void ucp_ep_destroy_base(ucp_ep_h ep)
+static int
+ucp_ep_local_disconnect_progress_remove_filter(const ucs_callbackq_elem_t *elem,
+                                               void *arg)
 {
+    ucp_ep_h ep = (ucp_ep_h)arg;
+    ucp_request_t *req;
+
+    if (elem->cb != ucp_ep_local_disconnect_progress) {
+        return 0;
+    }
+
+    req = (ucp_request_t*)elem->arg;
+    if (ep != req->send.ep) {
+        return 0;
+    }
+
+    /* Only an EP flush request is expected to remain in the callback queue,
+     * because the reply UCP EP created for sending a WIREUP_MSG/EP_REMOVED
+     * message is not exposed to the user */
+    ucs_assert(req->flags & UCP_REQUEST_FLAG_RELEASED);
+    ucs_assert(req->send.uct.func == ucp_ep_flush_progress_pending);
+
+    ucp_request_complete_send(req, UCS_OK);
+    return 1;
+}
+
+static UCS_F_NOINLINE void ucp_ep_remove_progress_callbacks(ucp_ep_h ep)
+{
+    ucp_worker_h worker = ep->worker;
+
+    /* Remove pending slow-path functions after all UCT EP lanes are destroyed,
+     * because lane cleanup purges all outstanding operations, and a purged
+     * operation could add callbacks to the progress queue */
+    ucs_assert(ep->refcount == 0);
+
+    ucs_callbackq_remove_if(&ep->worker->uct->progress_q,
+                            ucp_wireup_msg_ack_cb_pred, ep);
+
+    ucs_callbackq_remove_if(&worker->uct->progress_q,
+                            ucp_worker_err_handle_remove_filter, ep);
+
+    ucs_callbackq_remove_if(&worker->uct->progress_q,
+                            ucp_listener_accept_cb_remove_filter, ep);
+
+    ucs_callbackq_remove_if(&worker->uct->progress_q,
+                            ucp_ep_local_disconnect_progress_remove_filter, ep);
+}
+
+static void ucp_ep_destroy_base(ucp_ep_h ep)
+{
+    ucs_assert(ep->refcount == 0);
+    ucs_assert(ep->flush_iter_refcount == 0);
+    ucs_assert(ep->discard_refcount == 0);
+    ucs_assert(ucs_hlist_is_empty(&ucp_ep_ext_gen(ep)->proto_reqs));
+
+    ucs_vfs_obj_remove(ep);
+    ucp_ep_remove_progress_callbacks(ep);
     UCS_STATS_NODE_FREE(ep->stats);
     ucs_free(ucp_ep_ext_control(ep));
     ucs_strided_alloc_put(&ep->worker->ep_alloc, ep);
 }

+void ucp_ep_add_ref(ucp_ep_h ep)
+{
+    ucs_assert(ep->refcount < UINT8_MAX);
+    ++ep->refcount;
+}
+
+/* Return 1 if the endpoint was destroyed, 0 if not */
+int ucp_ep_remove_ref(ucp_ep_h ep)
+{
+    ucs_assert(ep->refcount > 0);
+    if (--ep->refcount == 0) {
+        ucp_ep_destroy_base(ep);
+        return 1;
+    }
+
+    return 0;
+}
+
 ucs_status_t ucp_worker_create_ep(ucp_worker_h worker, unsigned ep_init_flags,
                                   const char *peer_name, const char *message,
                                   ucp_ep_h *ep_p)
 {
+    ucp_context_h context = worker->context;
     ucs_status_t status;
     ucp_ep_h ep;

@@ -174,86 +313,70 @@ ucs_status_t ucp_worker_create_ep(ucp_worker_h worker, unsigned ep_init_flags,
         goto err;
     }

-    if ((worker->context->config.ext.proto_indirect_id == UCS_CONFIG_ON) ||
-        ((worker->context->config.ext.proto_indirect_id == UCS_CONFIG_AUTO) &&
-         (ep_init_flags & UCP_EP_INIT_ERR_MODE_PEER_FAILURE) &&
-         !(ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE))) {
-        ep->flags |= UCP_EP_FLAG_INDIRECT_ID;
+    if (!(ep_init_flags & UCP_EP_INIT_FLAG_INTERNAL) &&
+        ((context->config.ext.proto_indirect_id == UCS_CONFIG_ON) ||
+         ((context->config.ext.proto_indirect_id == UCS_CONFIG_AUTO) &&
+          (ep_init_flags & UCP_EP_INIT_ERR_MODE_PEER_FAILURE)))) {
+        ucp_ep_update_flags(ep, UCP_EP_FLAG_INDIRECT_ID, 0);
     }

     status = ucs_ptr_map_put(&worker->ptr_map, ep,
                              !!(ep->flags & UCP_EP_FLAG_INDIRECT_ID),
                              &ucp_ep_ext_control(ep)->local_ep_id);
-    if (status != UCS_OK) {
+    if ((status != UCS_OK) && (status != UCS_ERR_NO_PROGRESS)) {
+
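/* Lifecycle sketch for the EP reference counting introduced above
 * (illustrative only; names as in this diff):
 *
 *     ucp_ep_create_base(worker, ...);   // ep->refcount == 1
 *     ucp_ep_add_ref(ep);                // e.g. a discard in flight -> 2
 *     ...
 *     if (ucp_ep_remove_ref(ep)) {       // returns 1 once refcount hits 0
 *         // ucp_ep_destroy_base() already ran; ep must not be touched
 *     }
 */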
ucs_error("ep %p: failed to allocate ID: %s", ep, + ucs_status_string(status)); goto err_destroy_ep_base; } - if (!(ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE)) { + if (ep_init_flags & UCP_EP_INIT_FLAG_INTERNAL) { + ucp_ep_update_flags(ep, UCP_EP_FLAG_INTERNAL, 0); + ucs_list_add_tail(&worker->internal_eps, &ucp_ep_ext_gen(ep)->ep_list); + } else { ucs_list_add_tail(&worker->all_eps, &ucp_ep_ext_gen(ep)->ep_list); + ucs_assert(ep->worker->num_all_eps < UINT_MAX); + ++ep->worker->num_all_eps; } *ep_p = ep; - return UCS_OK; err_destroy_ep_base: - ucp_ep_destroy_base(ep); + ucp_ep_remove_ref(ep); err: return status; } void ucp_ep_delete(ucp_ep_h ep) { - ucs_status_t status; - - ucs_callbackq_remove_if(&ep->worker->uct->progress_q, - ucp_wireup_msg_ack_cb_pred, ep); - ucp_worker_keepalive_remove_ep(ep); - ucs_list_del(&ucp_ep_ext_gen(ep)->ep_list); - status = ucs_ptr_map_del(&ep->worker->ptr_map, ucp_ep_local_id(ep)); - if (status != UCS_OK) { - ucs_warn("ep %p local id 0x%"PRIxPTR": ucs_ptr_map_del failed with status %s", - ep, ucp_ep_local_id(ep), ucs_status_string(status)); + if (!(ep->flags & UCP_EP_FLAG_INTERNAL)) { + ucs_assert(ep->worker->num_all_eps > 0); + --ep->worker->num_all_eps; + ucp_worker_keepalive_remove_ep(ep); } - ucp_ep_destroy_base(ep); + ucp_ep_release_id(ep); + ucs_list_del(&ucp_ep_ext_gen(ep)->ep_list); + ucp_ep_remove_ref(ep); } -ucs_status_t -ucp_ep_create_sockaddr_aux(ucp_worker_h worker, unsigned ep_init_flags, - const ucp_unpacked_address_t *remote_address, - ucp_ep_h *ep_p) +/* Since release function resets EP ID to @ref UCS_PTR_MAP_KEY_INVALID and PTR + * MAP considers @ref UCS_PTR_MAP_KEY_INVALID as direct key, release EP ID is + * re-entrant function */ +void ucp_ep_release_id(ucp_ep_h ep) { - ucp_wireup_ep_t *wireup_ep; ucs_status_t status; - ucp_ep_h ep; - - /* allocate endpoint */ - status = ucp_worker_create_ep(worker, ep_init_flags, remote_address->name, - "listener", &ep); - if (status != UCS_OK) { - goto err; - } - - status = ucp_ep_init_create_wireup(ep, ep_init_flags, &wireup_ep); - if (status != UCS_OK) { - goto err_delete; - } - status = ucp_wireup_ep_connect_aux(wireup_ep, ep_init_flags, remote_address); - if (status != UCS_OK) { - goto err_destroy_wireup_ep; + /* Don't use ucp_ep_local_id() function here to avoid assertion failure, + * because local_ep_id can be set to @ref UCS_PTR_MAP_KEY_INVALID */ + status = ucs_ptr_map_del(&ep->worker->ptr_map, + ucp_ep_ext_control(ep)->local_ep_id); + if ((status != UCS_OK) && (status != UCS_ERR_NO_PROGRESS)) { + ucs_warn("ep %p local id 0x%" PRIxPTR ": ucs_ptr_map_del failed: %s", + ep, ucp_ep_local_id(ep), ucs_status_string(status)); } - *ep_p = ep; - return status; - -err_destroy_wireup_ep: - uct_ep_destroy(ep->uct_eps[0]); -err_delete: - ucp_ep_delete(ep); -err: - return status; + ucp_ep_ext_control(ep)->local_ep_id = UCS_PTR_MAP_KEY_INVALID; } void ucp_ep_config_key_set_err_mode(ucp_ep_config_key_t *key, @@ -263,13 +386,6 @@ void ucp_ep_config_key_set_err_mode(ucp_ep_config_key_t *key, UCP_ERR_HANDLING_MODE_PEER : UCP_ERR_HANDLING_MODE_NONE; } -int ucp_ep_is_sockaddr_stub(ucp_ep_h ep) -{ - /* Only a sockaddr client-side endpoint may be created as a "stub" */ - return (ucp_ep_get_rsc_index(ep, 0) == UCP_NULL_RESOURCE) && - !ucp_ep_has_cm_lane(ep); -} - static ucs_status_t ucp_ep_adjust_params(ucp_ep_h ep, const ucp_ep_params_t *params) { @@ -296,45 +412,105 @@ ucp_ep_adjust_params(ucp_ep_h ep, const ucp_ep_params_t *params) return UCS_OK; } -ucs_status_t ucp_worker_create_mem_type_endpoints(ucp_worker_h 
worker)
+ucs_status_t ucp_ep_evaluate_perf(ucp_ep_h ep,
+                                  const ucp_ep_evaluate_perf_param_t *param,
+                                  ucp_ep_evaluate_perf_attr_t *attr)
+{
+    const ucp_worker_h worker      = ep->worker;
+    const ucp_context_h context    = worker->context;
+    const ucp_ep_config_key_t *key = &ucp_ep_config(ep)->key;
+    double max_bandwidth           = 0;
+    ucp_rsc_index_t max_bandwidth_rsc_index = 0;
+    ucp_rsc_index_t rsc_index;
+    double bandwidth;
+    ucp_lane_index_t lane;
+    ucp_worker_iface_t *wiface;
+    uct_iface_attr_t *iface_attr;
+    ucs_linear_func_t estimated_time;
+
+    if (!(attr->field_mask & UCP_EP_PERF_ATTR_FIELD_ESTIMATED_TIME) ||
+        !(param->field_mask & UCP_EP_PERF_PARAM_FIELD_MESSAGE_SIZE)) {
+        return UCS_ERR_INVALID_PARAM;
+    }
+
+    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
+        if (lane == key->cm_lane) {
+            /* Skip CM lanes for bandwidth calculation */
+            continue;
+        }
+
+        rsc_index = key->lanes[lane].rsc_index;
+        wiface    = worker->ifaces[rsc_index];
+        bandwidth = ucp_tl_iface_bandwidth(context,
+                                           &wiface->attr.bandwidth);
+        if (bandwidth > max_bandwidth) {
+            max_bandwidth           = bandwidth;
+            max_bandwidth_rsc_index = rsc_index;
+        }
+    }
+
+    iface_attr = ucp_worker_iface_get_attr(worker,
+                                           max_bandwidth_rsc_index);
+    estimated_time.c     = ucp_tl_iface_latency(context, &iface_attr->latency);
+    estimated_time.m     = param->message_size / max_bandwidth;
+    attr->estimated_time = estimated_time.c + estimated_time.m;
+
+    return UCS_OK;
+}
+
+ucs_status_t ucp_worker_mem_type_eps_create(ucp_worker_h worker)
 {
     ucp_context_h context = worker->context;
+    unsigned pack_flags   = ucp_worker_default_address_pack_flags(worker);
     ucp_unpacked_address_t local_address;
     ucs_memory_type_t mem_type;
     ucs_status_t status;
     void *address_buffer;
     size_t address_length;
+    char ep_name[UCP_WORKER_ADDRESS_NAME_MAX];

     ucs_memory_type_for_each(mem_type) {
-        if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(mem_type) ||
-            !context->mem_type_access_tls[mem_type]) {
+        if (UCP_MEM_IS_HOST(mem_type) ||
+            UCS_BITMAP_IS_ZERO_INPLACE(
+                    &context->mem_type_access_tls[mem_type])) {
             continue;
         }

         status = ucp_address_pack(worker, NULL,
-                                  context->mem_type_access_tls[mem_type],
-                                  UCP_ADDRESS_PACK_FLAGS_WORKER_DEFAULT, NULL,
-                                  &address_length, &address_buffer);
+                                  &context->mem_type_access_tls[mem_type],
+                                  pack_flags, NULL, &address_length,
+                                  &address_buffer);
         if (status != UCS_OK) {
             goto err_cleanup_eps;
         }

-        status = ucp_address_unpack(worker, address_buffer,
-                                    UCP_ADDRESS_PACK_FLAGS_WORKER_DEFAULT,
+        status = ucp_address_unpack(worker, address_buffer, pack_flags,
                                     &local_address);
         if (status != UCS_OK) {
             goto err_free_address_buffer;
         }

-        status = ucp_ep_create_to_worker_addr(worker, UINT64_MAX,
+        ucs_snprintf_zero(ep_name, UCP_WORKER_ADDRESS_NAME_MAX,
+                          "mem_type_ep:%s", ucs_memory_type_names[mem_type]);
+
+        /* create memtype UCP EPs after blocking async context, because they set
+         * INTERNAL flag (setting EP flags is expected to be guarded) */
+        UCS_ASYNC_BLOCK(&worker->async);
+        status = ucp_ep_create_to_worker_addr(worker, &ucp_tl_bitmap_max,
                                               &local_address,
-                                              UCP_EP_INIT_FLAG_MEM_TYPE,
-                                              "mem type",
+                                              UCP_EP_INIT_FLAG_MEM_TYPE |
+                                              UCP_EP_INIT_FLAG_INTERNAL,
+                                              ep_name,
                                               &worker->mem_type_ep[mem_type]);
         if (status != UCS_OK) {
+            UCS_ASYNC_UNBLOCK(&worker->async);
             goto err_free_address_list;
         }

+        ucp_ep_flush_state_reset(worker->mem_type_ep[mem_type]);
+        UCS_ASYNC_UNBLOCK(&worker->async);
+
         ucs_free(local_address.address_list);
         ucs_free(address_buffer);
     }

@@ -346,20 +522,33 @@ ucs_status_t ucp_worker_create_mem_type_endpoints(ucp_worker_h worker)

 err_free_address_buffer:
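/* Caller-side sketch for the new ucp_ep_evaluate_perf() API above (field
 * names as introduced in this diff; error handling elided):
 *
 *     ucp_ep_evaluate_perf_param_t param = {
 *         .field_mask   = UCP_EP_PERF_PARAM_FIELD_MESSAGE_SIZE,
 *         .message_size = 65536
 *     };
 *     ucp_ep_evaluate_perf_attr_t attr = {
 *         .field_mask = UCP_EP_PERF_ATTR_FIELD_ESTIMATED_TIME
 *     };
 *     if (ucp_ep_evaluate_perf(ep, &param, &attr) == UCS_OK) {
 *         printf("estimated time: %e sec\n", attr.estimated_time);
 *     }
 */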
ucs_free(address_buffer); err_cleanup_eps: - ucp_worker_destroy_mem_type_endpoints(worker); + ucp_worker_mem_type_eps_destroy(worker); return status; } -void ucp_worker_destroy_mem_type_endpoints(ucp_worker_h worker) +void ucp_worker_mem_type_eps_destroy(ucp_worker_h worker) { ucs_memory_type_t mem_type; + ucp_ep_h ep; + + /* Destroy memtype UCP EPs after blocking async context, because cleanup + * lanes set FAILED flag (setting EP flags is expected to be guarded) */ + UCS_ASYNC_BLOCK(&worker->async); ucs_memory_type_for_each(mem_type) { - if (worker->mem_type_ep[mem_type] != NULL) { - ucp_ep_destroy_internal(worker->mem_type_ep[mem_type]); - worker->mem_type_ep[mem_type] = NULL; + ep = worker->mem_type_ep[mem_type]; + if (ep == NULL) { + continue; } + + ucs_debug("memtype ep %p: destroy", ep); + ucs_assert(ep->flags & UCP_EP_FLAG_INTERNAL); + + ucp_ep_destroy_internal(ep); + worker->mem_type_ep[mem_type] = NULL; } + + UCS_ASYNC_UNBLOCK(&worker->async); } ucs_status_t ucp_ep_init_create_wireup(ucp_ep_h ep, unsigned ep_init_flags, @@ -368,6 +557,9 @@ ucs_status_t ucp_ep_init_create_wireup(ucp_ep_h ep, unsigned ep_init_flags, ucp_ep_config_key_t key; ucs_status_t status; + ucs_assert(ep_init_flags & UCP_EP_INIT_CM_WIREUP_CLIENT); + ucs_assert(ucp_worker_num_cm_cmpts(ep->worker) != 0); + ucp_ep_config_key_reset(&key); ucp_ep_config_key_set_err_mode(&key, ep_init_flags); @@ -375,8 +567,12 @@ ucs_status_t ucp_ep_init_create_wireup(ucp_ep_h ep, unsigned ep_init_flags, /* all operations will use the first lane, which is a stub endpoint before * reconfiguration */ key.am_lane = 0; - if (ucp_worker_sockaddr_is_cm_proto(ep->worker)) { + if (ucp_ep_init_flags_has_cm(ep_init_flags)) { key.cm_lane = 0; + /* Send keepalive on wireup_ep (which will send on aux_ep) */ + if (ep_init_flags & UCP_EP_INIT_ERR_MODE_PEER_FAILURE) { + key.ep_check_map |= UCS_BIT(key.cm_lane); + } } else { key.wireup_msg_lane = 0; } @@ -388,7 +584,7 @@ ucs_status_t ucp_ep_init_create_wireup(ucp_ep_h ep, unsigned ep_init_flags, ep->am_lane = key.am_lane; if (!ucp_ep_has_cm_lane(ep)) { - ep->flags |= UCP_EP_FLAG_CONNECT_REQ_QUEUED; + ucp_ep_update_flags(ep, UCP_EP_FLAG_CONNECT_REQ_QUEUED, 0); } status = ucp_wireup_ep_create(ep, &ep->uct_eps[0]); @@ -400,13 +596,15 @@ ucs_status_t ucp_ep_init_create_wireup(ucp_ep_h ep, unsigned ep_init_flags, return UCS_OK; } -ucs_status_t ucp_ep_create_to_worker_addr(ucp_worker_h worker, - uint64_t local_tl_bitmap, - const ucp_unpacked_address_t *remote_address, - unsigned ep_init_flags, - const char *message, ucp_ep_h *ep_p) +ucs_status_t +ucp_ep_create_to_worker_addr(ucp_worker_h worker, + const ucp_tl_bitmap_t *local_tl_bitmap, + const ucp_unpacked_address_t *remote_address, + unsigned ep_init_flags, const char *message, + ucp_ep_h *ep_p) { unsigned addr_indices[UCP_MAX_LANES]; + ucp_tl_bitmap_t ep_tl_bitmap; ucs_status_t status; ucp_ep_h ep; @@ -424,7 +622,8 @@ ucs_status_t ucp_ep_create_to_worker_addr(ucp_worker_h worker, goto err_delete; } - ucs_assert(!(ucp_ep_get_tl_bitmap(ep) & ~local_tl_bitmap)); + ucp_ep_get_tl_bitmap(ep, &ep_tl_bitmap); + ucp_tl_bitmap_validate(&ep_tl_bitmap, local_tl_bitmap); *ep_p = ep; return UCS_OK; @@ -455,7 +654,8 @@ static ucs_status_t ucp_ep_create_to_sock_addr(ucp_worker_h worker, /* allocate endpoint */ ucs_sockaddr_str(params->sockaddr.addr, peer_name, sizeof(peer_name)); - ep_init_flags = ucp_ep_init_flags(worker, params); + ep_init_flags = ucp_ep_init_flags(worker, params) | + ucp_cm_ep_init_flags(params); status = ucp_worker_create_ep(worker, 
ep_init_flags, peer_name, "from api call", &ep); @@ -473,13 +673,13 @@ static ucs_status_t ucp_ep_create_to_sock_addr(ucp_worker_h worker, goto err_cleanup_lanes; } - status = ucp_worker_sockaddr_is_cm_proto(ep->worker) ? - ucp_ep_client_cm_connect_start(ep, params) : - ucp_wireup_ep_connect_to_sockaddr(ep->uct_eps[0], params); + status = ucp_ep_client_cm_connect_start(ep, params); if (status != UCS_OK) { goto err_cleanup_lanes; } + ucp_ep_flush_state_reset(ep); + *ep_p = ep; return UCS_OK; @@ -509,12 +709,14 @@ ucs_status_t ucp_ep_create_server_accept(ucp_worker_h worker, ep_init_flags |= UCP_EP_INIT_ERR_MODE_PEER_FAILURE; } - if (sa_data->addr_mode == UCP_WIREUP_SA_DATA_CM_ADDR) { - addr_flags = UCP_ADDRESS_PACK_FLAGS_CM_DEFAULT; - } else { - addr_flags = UCP_ADDRESS_PACK_FLAGS_ALL; + if (sa_data->addr_mode != UCP_WIREUP_SA_DATA_CM_ADDR) { + ucs_fatal("client sockaddr data contains invalid address mode %d", + sa_data->addr_mode); } + addr_flags = ucp_worker_common_address_pack_flags(worker) | + UCP_ADDRESS_PACK_FLAGS_CM_DEFAULT; + /* coverity[overrun-local] */ status = ucp_address_unpack(worker, sa_data + 1, addr_flags, &remote_addr); if (status != UCS_OK) { @@ -522,77 +724,15 @@ ucs_status_t ucp_ep_create_server_accept(ucp_worker_h worker, return status; } - switch (sa_data->addr_mode) { - case UCP_WIREUP_SA_DATA_FULL_ADDR: - /* create endpoint to the worker address we got in the private data */ - status = ucp_ep_create_to_worker_addr(worker, UINT64_MAX, &remote_addr, - ep_init_flags | - UCP_EP_INIT_CREATE_AM_LANE, - "listener", ep_p); - if (status != UCS_OK) { - goto non_cm_err_reject; - } - - ucs_assert(ucp_ep_config(*ep_p)->key.err_mode == sa_data->err_mode); - ucp_ep_flush_state_reset(*ep_p); - ucp_ep_update_remote_id(*ep_p, sa_data->ep_id); - /* send wireup request message, to connect the client to the server's - new endpoint */ - ucs_assert(!((*ep_p)->flags & UCP_EP_FLAG_CONNECT_REQ_QUEUED)); - status = ucp_wireup_send_request(*ep_p); - if (status != UCS_OK) { - goto non_cm_err_destroy_ep; - } - break; - case UCP_WIREUP_SA_DATA_PARTIAL_ADDR: - status = ucp_ep_create_sockaddr_aux(worker, ep_init_flags, - &remote_addr, ep_p); - if (status != UCS_OK) { - goto non_cm_err_reject; - } - - ucp_ep_update_remote_id(*ep_p, sa_data->ep_id); - /* the server's ep should be aware of the sent address from the client */ - (*ep_p)->flags |= UCP_EP_FLAG_LISTENER; - /* NOTE: protect union */ - ucs_assert(!((*ep_p)->flags & (UCP_EP_FLAG_ON_MATCH_CTX | - UCP_EP_FLAG_FLUSH_STATE_VALID))); - status = ucp_wireup_send_pre_request(*ep_p); - if (status != UCS_OK) { - goto non_cm_err_destroy_ep; - } - break; - case UCP_WIREUP_SA_DATA_CM_ADDR: - ucs_assert(ucp_worker_sockaddr_is_cm_proto(worker)); - for (i = 0; i < remote_addr.address_count; ++i) { - remote_addr.address_list[i].dev_addr = conn_request->remote_dev_addr; - remote_addr.address_list[i].dev_index = conn_request->sa_data.dev_index; - } - status = ucp_ep_cm_server_create_connected(worker, ep_init_flags, - &remote_addr, conn_request, - ep_p); - ucs_free(remote_addr.address_list); - return status; - default: - ucs_fatal("client sockaddr data contains invalid address mode %d", - sa_data->addr_mode); + for (i = 0; i < remote_addr.address_count; ++i) { + remote_addr.address_list[i].dev_addr = conn_request->remote_dev_addr; + remote_addr.address_list[i].dev_index = conn_request->sa_data.dev_index; } - /* common non-CM flow */ - status = uct_iface_accept(conn_request->uct.iface, - conn_request->uct_req); - goto non_cm_out; - -non_cm_err_destroy_ep: - 
ucp_ep_destroy_internal(*ep_p); -non_cm_err_reject: - ucs_error("connection request failed on listener %p with status %s", - conn_request->listener, ucs_status_string(status)); - uct_iface_reject(conn_request->uct.iface, conn_request->uct_req); -non_cm_out: - ucs_free(conn_request); + status = ucp_ep_cm_server_create_connected(worker, ep_init_flags, + &remote_addr, conn_request, + ep_p); ucs_free(remote_addr.address_list); - ucs_assert(!ucp_worker_sockaddr_is_cm_proto(worker)); return status; } @@ -638,7 +778,7 @@ ucp_ep_create_api_to_worker_addr(ucp_worker_h worker, UCP_CHECK_PARAM_NON_NULL(params->address, status, goto out); status = ucp_address_unpack(worker, params->address, - UCP_ADDRESS_PACK_FLAGS_WORKER_DEFAULT, + ucp_worker_default_address_pack_flags(worker), &remote_address); if (status != UCS_OK) { goto out; @@ -671,7 +811,8 @@ ucp_ep_create_api_to_worker_addr(ucp_worker_h worker, goto out_free_address; } - status = ucp_ep_create_to_worker_addr(worker, UINT64_MAX, &remote_address, + status = ucp_ep_create_to_worker_addr(worker, &ucp_tl_bitmap_max, + &remote_address, ucp_ep_init_flags(worker, params), "from api call", &ep); if (status != UCS_OK) { @@ -743,8 +884,8 @@ ucs_status_t ucp_ep_create(ucp_worker_h worker, const ucp_ep_params_t *params, } if (status == UCS_OK) { - ep->flags |= UCP_EP_FLAG_USED; - *ep_p = ep; + ucp_ep_update_flags(ep, UCP_EP_FLAG_USED, 0); + *ep_p = ep; } UCS_ASYNC_UNBLOCK(&worker->async); @@ -779,71 +920,124 @@ void ucp_ep_err_pending_purge(uct_pending_req_t *self, void *arg) ucp_request_send_state_ff(req, status); } -static void ucp_destroyed_ep_pending_purge(uct_pending_req_t *self, void *arg) +void ucp_destroyed_ep_pending_purge(uct_pending_req_t *self, void *arg) { - ucs_bug("pending request %p on ep %p should have been flushed", self, arg); + ucs_bug("pending request %p (%s) on ep %p should have been flushed", + self, ucs_debug_get_symbol_name(self->func), arg); +} + +void +ucp_ep_purge_lanes(ucp_ep_h ep, uct_pending_purge_callback_t purge_cb, + void *purge_arg) +{ + ucp_lane_index_t lane; + uct_ep_h uct_ep; + + for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) { + uct_ep = ep->uct_eps[lane]; + if ((lane == ucp_ep_get_cm_lane(ep)) || (uct_ep == NULL)) { + continue; + } + + ucs_debug("ep %p: purge uct_ep[%d]=%p", ep, lane, uct_ep); + uct_ep_pending_purge(uct_ep, purge_cb, purge_arg); + } } void ucp_ep_destroy_internal(ucp_ep_h ep) { ucs_debug("ep %p: destroy", ep); ucp_ep_cleanup_lanes(ep); - if (ep->flags & UCP_EP_FLAG_TEMPORARY) { - /* it's failed tmp ep of main ep */ - ucs_assert(ucp_ep_ext_control(ep)->local_ep_id == UCP_EP_ID_INVALID); - ucp_ep_destroy_base(ep); + ucp_ep_delete(ep); +} + +static void ucp_ep_check_lanes(ucp_ep_h ep) +{ +#if UCS_ENABLE_ASSERT + uint8_t num_inprog = ep->discard_refcount + ep->flush_iter_refcount; + uint8_t num_failed_tl_ep = 0; + ucp_lane_index_t lane; + + + for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) { + num_failed_tl_ep += ucp_is_uct_ep_failed(ep->uct_eps[lane]); + } + + ucs_assert((num_failed_tl_ep == 0) || + (ucp_ep_num_lanes(ep) == num_failed_tl_ep)); + + if (num_failed_tl_ep != 0) { + /* EP reference count is the number of outstanding flush operations and + * discards, plus 1 if not destroyed yet */ + ucs_assert((ep->refcount == num_inprog) || + (ep->refcount == (num_inprog + 1))); } else { - ucp_ep_delete(ep); + /* EP is used, so reference count is the number of outstanding flush + * operations and discards plus 1 */ + ucs_assert(ep->refcount == (num_inprog + 1)); } +#endif } -void 
ucp_ep_cleanup_lanes(ucp_ep_h ep)
+static void ucp_ep_set_lanes_failed(ucp_ep_h ep, uct_ep_h *uct_eps)
 {
     ucp_lane_index_t lane;
     uct_ep_h uct_ep;

-    ucs_debug("ep %p: cleanup lanes", ep);
+    ucp_ep_check_lanes(ep);
+    ucp_ep_release_id(ep);
+    ucp_ep_update_flags(ep, UCP_EP_FLAG_FAILED, UCP_EP_FLAG_LOCAL_CONNECTED);

     for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
-        uct_ep = ep->uct_eps[lane];
-        if (uct_ep != NULL) {
-            ucs_debug("ep %p: purge uct_ep[%d]=%p", ep, lane, uct_ep);
-            uct_ep_pending_purge(uct_ep, ucp_destroyed_ep_pending_purge, ep);
-        }
+        uct_ep        = ep->uct_eps[lane];
+        uct_eps[lane] = uct_ep;
+
+        /* Point the lane at the failed stub EP: the UCP EP may not be
+         * destroyed yet while UCT EP discarding procedures are still in
+         * progress, and any operation completions it receives in the
+         * meantime must not dereference the discarded lanes */
+        ep->uct_eps[lane] = &ucp_failed_tl_ep;
     }
+}
+
+void ucp_ep_cleanup_lanes(ucp_ep_h ep)
+{
+    uct_ep_h uct_eps[UCP_MAX_LANES] = { NULL };
+    ucp_lane_index_t lane;
+    uct_ep_h uct_ep;
+
+    ucs_debug("ep %p: cleanup lanes", ep);
+
+    ucp_ep_set_lanes_failed(ep, uct_eps);

     for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
-        uct_ep = ep->uct_eps[lane];
+        uct_ep = uct_eps[lane];
         if (uct_ep == NULL) {
             continue;
         }

-        ucs_debug("ep %p: destroy uct_ep[%d]=%p", ep, lane, uct_ep);
+        ucs_debug("ep %p: pending & destroy uct_ep[%d]=%p", ep, lane, uct_ep);
+        uct_ep_pending_purge(uct_ep, ucp_destroyed_ep_pending_purge, ep);
+        /* coverity wrongly resolves ucp_failed_tl_ep's no-op EP destroy
+         * function to 'ucp_proxy_ep_destroy' */
+        /* coverity[incorrect_free] */
         uct_ep_destroy(uct_ep);
     }
-
-    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
-        ep->uct_eps[lane] = NULL;
-    }
 }

-/* Must be called with async lock held */
 void ucp_ep_disconnected(ucp_ep_h ep, int force)
 {
-    /* remove pending slow-path progress in case it wasn't removed yet */
-    ucs_callbackq_remove_if(&ep->worker->uct->progress_q,
-                            ucp_worker_err_handle_remove_filter, ep);
+    ucp_worker_h worker = ep->worker;

-    /* remove pending slow-path function if it wasn't removed yet */
-    ucs_callbackq_remove_if(&ep->worker->uct->progress_q,
-                            ucp_listener_accept_cb_remove_filter, ep);
+    UCP_WORKER_THREAD_CS_CHECK_IS_BLOCKED(worker);

     ucp_ep_cm_slow_cbq_cleanup(ep);

     ucp_stream_ep_cleanup(ep);
     ucp_am_ep_cleanup(ep);
+    ucp_ep_reqs_purge(ep, UCS_ERR_CANCELED);

-    ep->flags &= ~UCP_EP_FLAG_USED;
+    ucp_ep_update_flags(ep, 0, UCP_EP_FLAG_USED);

     if ((ep->flags & (UCP_EP_FLAG_CONNECT_REQ_QUEUED |
                       UCP_EP_FLAG_REMOTE_CONNECTED)) && !force) {
@@ -855,7 +1049,7 @@ void ucp_ep_disconnected(ucp_ep_h ep, int force)
         return;
     }

-    ucp_ep_match_remove_ep(ep->worker, ep);
+    ucp_ep_match_remove_ep(worker, ep);
     ucp_ep_destroy_internal(ep);
 }

@@ -887,7 +1081,22 @@ static void ucp_ep_set_close_request(ucp_ep_h ep, ucp_request_t *request,

     ucp_ep_flush_state_invalidate(ep);
     ucp_ep_ext_control(ep)->close_req.req = request;
-    ep->flags |= UCP_EP_FLAG_CLOSE_REQ_VALID;
+    ucp_ep_update_flags(ep, UCP_EP_FLAG_CLOSE_REQ_VALID, 0);
+}
+
+void ucp_ep_register_disconnect_progress(ucp_request_t *req)
+{
+    ucp_ep_h ep = req->send.ep;
+    uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL;
+
+    /* If a flush is completed from a pending/completion callback, we need to
+     * schedule slow-path callback to release the endpoint later, since a UCT
+     * endpoint cannot be released from pending/completion callback context.
+ */ + ucs_trace("adding slow-path callback to destroy ep %p", ep); + uct_worker_progress_register_safe(ep->worker->uct, + ucp_ep_local_disconnect_progress, req, + UCS_CALLBACKQ_FLAG_ONESHOT, &prog_id); } static void ucp_ep_close_flushed_callback(ucp_request_t *req) @@ -921,16 +1130,7 @@ static void ucp_ep_close_flushed_callback(ucp_request_t *req) UCS_ASYNC_UNBLOCK(async); out: - /* If a flush is completed from a pending/completion callback, we need to - * schedule slow-path callback to release the endpoint later, since a UCT - * endpoint cannot be released from pending/completion callback context. - */ - ucs_trace("adding slow-path callback to destroy ep %p", ep); - req->send.disconnect.prog_id = UCS_CALLBACKQ_ID_NULL; - uct_worker_progress_register_safe(ep->worker->uct, - ucp_ep_local_disconnect_progress, - req, UCS_CALLBACKQ_FLAG_ONESHOT, - &req->send.disconnect.prog_id); + ucp_ep_register_disconnect_progress(req); } ucs_status_ptr_t ucp_ep_close_nb(ucp_ep_h ep, unsigned mode) @@ -944,44 +1144,89 @@ ucs_status_ptr_t ucp_ep_close_nb(ucp_ep_h ep, unsigned mode) return ucp_ep_close_nbx(ep, ¶m); } +void ucp_ep_discard_lanes(ucp_ep_h ep, ucs_status_t status) +{ + uct_ep_h uct_eps[UCP_MAX_LANES] = { NULL }; + ucp_lane_index_t lane; + uct_ep_h uct_ep; + + ucs_debug("ep %p: discarding lanes", ep); + + /* flush CANCEL mustn't be called for EPs without error handling support */ + ucs_assert(ucp_ep_config(ep)->key.err_mode == UCP_ERR_HANDLING_MODE_PEER); + ucp_ep_set_lanes_failed(ep, uct_eps); + + for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) { + uct_ep = uct_eps[lane]; + if (uct_ep == NULL) { + continue; + } + + ucs_debug("ep %p: discard uct_ep[%d]=%p", ep, lane, uct_ep); + ucp_worker_discard_uct_ep(ep, uct_ep, UCT_FLUSH_FLAG_CANCEL, + ucp_ep_err_pending_purge, + UCS_STATUS_PTR(status), + (ucp_send_nbx_callback_t)ucs_empty_function, + NULL); + } +} + ucs_status_ptr_t ucp_ep_close_nbx(ucp_ep_h ep, const ucp_request_param_t *param) { ucp_worker_h worker = ep->worker; - int force; - void *request; + void *request = NULL; ucp_request_t *close_req; - unsigned uct_flags; - force = ucp_request_param_flags(param) & UCP_EP_CLOSE_FLAG_FORCE; - if (force && !ucp_ep_has_cm_lane(ep) && + if ((ucp_request_param_flags(param) & UCP_EP_CLOSE_FLAG_FORCE) && (ucp_ep_config(ep)->key.err_mode != UCP_ERR_HANDLING_MODE_PEER)) { return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); } UCS_ASYNC_BLOCK(&worker->async); - ep->flags |= UCP_EP_FLAG_CLOSED; - uct_flags = force ? 
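/* A user-level sketch of driving the reworked close flow (assumes the
 * standard UCP request helpers; not part of this diff):
 *
 *     ucp_request_param_t prm = {
 *         .op_attr_mask = UCP_OP_ATTR_FIELD_FLAGS,
 *         .flags        = UCP_EP_CLOSE_FLAG_FORCE    // or 0 for graceful
 *     };
 *     void *req = ucp_ep_close_nbx(ep, &prm);
 *     if (UCS_PTR_IS_PTR(req)) {
 *         while (ucp_request_check_status(req) == UCS_INPROGRESS) {
 *             ucp_worker_progress(worker);
 *         }
 *         ucp_request_free(req);
 *     }
 */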
UCT_FLUSH_FLAG_CANCEL : UCT_FLUSH_FLAG_LOCAL; - request = ucp_ep_flush_internal(ep, uct_flags, 0, - &ucp_request_null_param, NULL, - ucp_ep_close_flushed_callback, - "close"); - if (!UCS_PTR_IS_PTR(request)) { - if (ucp_ep_is_cm_local_connected(ep) && !force) { - /* lanes already flushed, start disconnect on CM lane */ - ucp_ep_cm_disconnect_cm_lane(ep); - close_req = ucp_ep_cm_close_request_get(ep); - if (close_req != NULL) { - request = close_req + 1; - ucp_ep_set_close_request(ep, close_req, "close"); + ucs_debug("ep %p flags 0x%x cfg_index %d: close_nbx(flags=0x%x)", ep, + ep->flags, ep->cfg_index, ucp_request_param_flags(param)); + + if (ep->flags & UCP_EP_FLAG_CLOSED) { + ucs_error("ep %p has already been closed", ep); + request = UCS_STATUS_PTR(UCS_ERR_NOT_CONNECTED); + goto out; + } + + ucp_ep_update_flags(ep, UCP_EP_FLAG_CLOSED, 0); + + if (ucp_request_param_flags(param) & UCP_EP_CLOSE_FLAG_FORCE) { + /* FIXME: there is a potential issue with flush completion after an EP + * was forcibly closed from a user's error handling callback after + * disconnect event was received, but some EP flush operation still + * is in-progress, so, the destroyed EP will be touched upon flush + * completion on some transport */ + if (!(ep->flags & UCP_EP_FLAG_FAILED)) { + ucp_ep_discard_lanes(ep, UCS_ERR_CANCELED); + } + + ucp_ep_disconnected(ep, 1); + } else { + request = ucp_ep_flush_internal(ep, 0, param, NULL, + ucp_ep_close_flushed_callback, "close"); + if (!UCS_PTR_IS_PTR(request)) { + if (ucp_ep_is_cm_local_connected(ep)) { + /* lanes already flushed, start disconnect on CM lane */ + ucp_ep_cm_disconnect_cm_lane(ep); + close_req = ucp_ep_cm_close_request_get(ep, param); + if (close_req != NULL) { + request = close_req + 1; + ucp_ep_set_close_request(ep, close_req, "close"); + } else { + request = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + } } else { - request = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + ucp_ep_disconnected(ep, 0); } - } else { - ucp_ep_disconnected(ep, force); } } +out: UCS_ASYNC_UNBLOCK(&worker->async); return request; } @@ -1019,39 +1264,54 @@ void ucp_ep_destroy(ucp_ep_h ep) return; } -static int -ucp_ep_config_lane_is_dst_rsc_index_equal(const ucp_ep_config_key_t *key1, - ucp_lane_index_t lane1, - const ucp_ep_config_key_t *key2, - ucp_lane_index_t lane2) +ucp_lane_index_t ucp_ep_lookup_lane(ucp_ep_h ucp_ep, uct_ep_h uct_ep) +{ + ucp_lane_index_t lane; + + for (lane = 0; lane < ucp_ep_num_lanes(ucp_ep); ++lane) { + if ((uct_ep == ucp_ep->uct_eps[lane]) || + ucp_wireup_ep_is_owner(ucp_ep->uct_eps[lane], uct_ep)) { + return lane; + } + } + + return UCP_NULL_LANE; +} + +static int ucp_ep_lane_is_dst_index_match(ucp_rsc_index_t dst_index1, + ucp_rsc_index_t dst_index2) { - return /* at least one of destination resource index is not specified */ - (key1->lanes[lane1].dst_rsc_index == UCP_NULL_RESOURCE) || - (key2->lanes[lane2].dst_rsc_index == UCP_NULL_RESOURCE) || - /* both destination resource index are the same */ - (key1->lanes[lane1].dst_rsc_index == key2->lanes[lane2].dst_rsc_index); + return (dst_index1 == UCP_NULL_RESOURCE) || + (dst_index2 == UCP_NULL_RESOURCE) || (dst_index1 == dst_index2); } -int ucp_ep_config_lane_is_peer_equal(const ucp_ep_config_key_t *key1, +int ucp_ep_config_lane_is_peer_match(const ucp_ep_config_key_t *key1, ucp_lane_index_t lane1, const ucp_ep_config_key_t *key2, ucp_lane_index_t lane2) { - return (key1->lanes[lane1].rsc_index == key2->lanes[lane2].rsc_index) && - ucp_ep_config_lane_is_dst_rsc_index_equal(key1, lane1, key2, lane2) && - 
(key1->lanes[lane1].path_index == key2->lanes[lane2].path_index) && - (key1->lanes[lane1].dst_md_index == key2->lanes[lane2].dst_md_index); + const ucp_ep_config_key_lane_t *config_lane1 = &key1->lanes[lane1]; + const ucp_ep_config_key_lane_t *config_lane2 = &key2->lanes[lane2]; + + return (config_lane1->rsc_index == config_lane2->rsc_index) && + (config_lane1->path_index == config_lane2->path_index) && + ucp_ep_lane_is_dst_index_match(config_lane1->dst_md_index, + config_lane2->dst_md_index); } static ucp_lane_index_t ucp_ep_config_find_match_lane(const ucp_ep_config_key_t *key1, + const ucp_rsc_index_t *dst_rsc_indices1, ucp_lane_index_t lane1, - const ucp_ep_config_key_t *key2) + const ucp_ep_config_key_t *key2, + const ucp_rsc_index_t *dst_rsc_indices2) { ucp_lane_index_t lane_idx; for (lane_idx = 0; lane_idx < key2->num_lanes; ++lane_idx) { - if (ucp_ep_config_lane_is_peer_equal(key1, lane1, key2, lane_idx)) { + if (ucp_ep_config_lane_is_peer_match(key1, lane1, key2, lane_idx) && + ucp_ep_lane_is_dst_index_match(dst_rsc_indices1[lane1], + dst_rsc_indices2[lane_idx])) { return lane_idx; } } @@ -1062,15 +1322,18 @@ ucp_ep_config_find_match_lane(const ucp_ep_config_key_t *key1, /* Go through the first configuration and check if the lanes selected * for this configuration could be used for the second configuration */ void ucp_ep_config_lanes_intersect(const ucp_ep_config_key_t *key1, + const ucp_rsc_index_t *dst_rsc_indices1, const ucp_ep_config_key_t *key2, + const ucp_rsc_index_t *dst_rsc_indices2, ucp_lane_index_t *lane_map) { ucp_lane_index_t lane1_idx; for (lane1_idx = 0; lane1_idx < key1->num_lanes; ++lane1_idx) { lane_map[lane1_idx] = ucp_ep_config_find_match_lane(key1, - lane1_idx, - key2); + dst_rsc_indices1, + lane1_idx, key2, + dst_rsc_indices2); } } @@ -1078,9 +1341,14 @@ static int ucp_ep_config_lane_is_equal(const ucp_ep_config_key_t *key1, const ucp_ep_config_key_t *key2, ucp_lane_index_t lane) { - return ucp_ep_config_lane_is_peer_equal(key1, lane, key2, lane) && - (key1->lanes[lane].dst_md_index == key2->lanes[lane].dst_md_index) && - (key1->lanes[lane].lane_types == key2->lanes[lane].lane_types); + const ucp_ep_config_key_lane_t *config_lane1 = &key1->lanes[lane]; + const ucp_ep_config_key_lane_t *config_lane2 = &key2->lanes[lane]; + + return (config_lane1->rsc_index == config_lane2->rsc_index) && + (config_lane1->path_index == config_lane2->path_index) && + (config_lane1->dst_md_index == config_lane2->dst_md_index) && + (config_lane1->dst_sys_dev == config_lane2->dst_sys_dev) && + (config_lane1->lane_types == config_lane2->lane_types); } int ucp_ep_config_is_equal(const ucp_ep_config_key_t *key1, @@ -1089,28 +1357,27 @@ int ucp_ep_config_is_equal(const ucp_ep_config_key_t *key1, ucp_lane_index_t lane; int i; - if ((key1->num_lanes != key2->num_lanes) || - memcmp(key1->rma_lanes, key2->rma_lanes, sizeof(key1->rma_lanes)) || - memcmp(key1->am_bw_lanes, key2->am_bw_lanes, sizeof(key1->am_bw_lanes)) || - memcmp(key1->rma_bw_lanes, key2->rma_bw_lanes, sizeof(key1->rma_bw_lanes)) || - memcmp(key1->amo_lanes, key2->amo_lanes, sizeof(key1->amo_lanes)) || - (key1->rma_bw_md_map != key2->rma_bw_md_map) || - (key1->reachable_md_map != key2->reachable_md_map) || - (key1->am_lane != key2->am_lane) || - (key1->tag_lane != key2->tag_lane) || - (key1->wireup_msg_lane != key2->wireup_msg_lane) || - (key1->cm_lane != key2->cm_lane) || - (key1->rkey_ptr_lane != key2->rkey_ptr_lane) || - (key1->ep_check_map != key2->ep_check_map) || - (key1->err_mode != key2->err_mode) || - (key1->status 
!= key2->status)) - { + if ((key1->num_lanes != key2->num_lanes) || + memcmp(key1->rma_lanes, key2->rma_lanes, sizeof(key1->rma_lanes)) || + memcmp(key1->am_bw_lanes, key2->am_bw_lanes, + sizeof(key1->am_bw_lanes)) || + memcmp(key1->rma_bw_lanes, key2->rma_bw_lanes, + sizeof(key1->rma_bw_lanes)) || + memcmp(key1->amo_lanes, key2->amo_lanes, sizeof(key1->amo_lanes)) || + (key1->rma_bw_md_map != key2->rma_bw_md_map) || + (key1->reachable_md_map != key2->reachable_md_map) || + (key1->am_lane != key2->am_lane) || + (key1->tag_lane != key2->tag_lane) || + (key1->wireup_msg_lane != key2->wireup_msg_lane) || + (key1->cm_lane != key2->cm_lane) || + (key1->rkey_ptr_lane != key2->rkey_ptr_lane) || + (key1->ep_check_map != key2->ep_check_map) || + (key1->err_mode != key2->err_mode)) { return 0; } for (lane = 0; lane < key1->num_lanes; ++lane) { - if (!ucp_ep_config_lane_is_equal(key1, key2, lane)) - { + if (!ucp_ep_config_lane_is_equal(key1, key2, lane)) { return 0; } } @@ -1281,13 +1548,16 @@ size_t ucp_ep_tag_offload_min_rndv_thresh(ucp_ep_config_t *config) return sizeof(ucp_rndv_rts_hdr_t) + config->rndv.rkey_size; } -static void ucp_ep_config_set_am_rndv_thresh(ucp_worker_h worker, - uct_iface_attr_t *iface_attr, - uct_md_attr_t *md_attr, - ucp_ep_config_t *config, - size_t min_rndv_thresh, - size_t max_rndv_thresh, - ucp_rndv_thresh_t *thresh) +static void ucp_ep_config_init_short_thresh(ucp_memtype_thresh_t *thresh) +{ + thresh->memtype_on = -1; + thresh->memtype_off = -1; +} + +static void ucp_ep_config_set_am_rndv_thresh( + ucp_worker_h worker, uct_iface_attr_t *iface_attr, + uct_md_attr_t *md_attr, ucp_ep_config_t *config, size_t min_rndv_thresh, + size_t max_rndv_thresh, ucp_rndv_thresh_t *thresh) { ucp_context_h context = worker->context; size_t rndv_thresh, rndv_local_thresh, min_thresh; @@ -1312,10 +1582,6 @@ static void ucp_ep_config_set_am_rndv_thresh(ucp_worker_h worker, } else { rndv_thresh = context->config.ext.rndv_thresh; rndv_local_thresh = context->config.ext.rndv_thresh; - - /* adjust max_short if rndv_thresh is set externally */ - ucp_ep_config_adjust_max_short(&config->tag.eager.max_short, - rndv_thresh); } min_thresh = ucs_max(iface_attr->cap.am.min_zcopy, min_rndv_thresh); @@ -1326,12 +1592,10 @@ static void ucp_ep_config_set_am_rndv_thresh(ucp_worker_h worker, thresh->remote, thresh->local); } -static void ucp_ep_config_set_rndv_thresh(ucp_worker_t *worker, - ucp_ep_config_t *config, - ucp_lane_index_t *lanes, - size_t min_rndv_thresh, - size_t max_rndv_thresh, - ucp_rndv_thresh_t *thresh) +static void +ucp_ep_config_set_rndv_thresh(ucp_worker_t *worker, ucp_ep_config_t *config, + ucp_lane_index_t *lanes, size_t min_rndv_thresh, + size_t max_rndv_thresh, ucp_rndv_thresh_t *thresh) { ucp_context_t *context = worker->context; ucp_lane_index_t lane = lanes[0]; @@ -1362,10 +1626,6 @@ static void ucp_ep_config_set_rndv_thresh(ucp_worker_t *worker, } else { rndv_thresh = context->config.ext.rndv_thresh; rndv_local_thresh = context->config.ext.rndv_thresh; - - /* adjust max_short if rndv_thresh is set externally */ - ucp_ep_config_adjust_max_short(&config->tag.eager.max_short, - rndv_thresh); } min_thresh = ucs_max(iface_attr->cap.get.min_zcopy, min_rndv_thresh); @@ -1393,29 +1653,136 @@ static void ucp_ep_config_set_memtype_thresh(ucp_memtype_thresh_t *max_eager_sho max_eager_short->memtype_on = max_short; } -static void ucp_ep_config_init_attrs(ucp_worker_t *worker, ucp_rsc_index_t rsc_index, - ucp_ep_msg_config_t *config, size_t max_short, - size_t max_bcopy, size_t max_zcopy, 
- size_t max_iov, size_t max_hdr, - uint64_t short_flag, uint64_t bcopy_flag, - uint64_t zcopy_flag, unsigned hdr_len, - size_t adjust_min_val) +/* Coverity assumes that mem_type_index could have value >= UCS_MEMORY_TYPE_LAST, + * a caller of this function should suppress this false-positive warning */ +static void +ucp_ep_config_rndv_zcopy_max_bw_update(ucp_context_t *context, + const uct_md_attr_t *md_attr, + const uct_iface_attr_t *iface_attr, + uint64_t cap_flag, + double max_bw[UCS_MEMORY_TYPE_LAST]) +{ + uint8_t mem_type_index; + double bw; + + if (!(iface_attr->cap.flags & cap_flag)) { + return; + } + + bw = ucp_tl_iface_bandwidth(context, &iface_attr->bandwidth); + ucs_for_each_bit(mem_type_index, md_attr->cap.reg_mem_types) { + ucs_assert(mem_type_index < UCS_MEMORY_TYPE_LAST); + max_bw[mem_type_index] = ucs_max(max_bw[mem_type_index], bw); + } +} + +static void +ucp_ep_config_rndv_zcopy_set(ucp_context_t *context, uint64_t cap_flag, + ucp_lane_index_t lane, + const uct_md_attr_t *md_attr, + const uct_iface_attr_t *iface_attr, + double max_bw[UCS_MEMORY_TYPE_LAST], + ucp_ep_rndv_zcopy_config_t *rndv_zcopy, + ucp_lane_index_t *lanes_count_p) +{ + const double min_scale = 1. / context->config.ext.multi_lane_max_ratio; + uint8_t mem_type_index; + double scale; + size_t min, max; + + if (!(iface_attr->cap.flags & cap_flag)) { + return; + } + + if (cap_flag == UCT_IFACE_FLAG_GET_ZCOPY) { + min = iface_attr->cap.get.min_zcopy; + max = iface_attr->cap.get.max_zcopy; + } else { + ucs_assert(cap_flag == UCT_IFACE_FLAG_PUT_ZCOPY); + min = iface_attr->cap.put.min_zcopy; + max = iface_attr->cap.put.max_zcopy; + } + + ucs_for_each_bit(mem_type_index, md_attr->cap.reg_mem_types) { + ucs_assert(mem_type_index < UCS_MEMORY_TYPE_LAST); + scale = ucp_tl_iface_bandwidth(context, &iface_attr->bandwidth) / + max_bw[mem_type_index]; + if ((scale - min_scale) < -ucp_calc_epsilon(scale, min_scale)) { + continue; + } + + rndv_zcopy->min = ucs_max(rndv_zcopy->min, min); + rndv_zcopy->max = ucs_min(rndv_zcopy->max, max); + ucs_assert(*lanes_count_p < UCP_MAX_LANES); + rndv_zcopy->lanes[(*lanes_count_p)++] = lane; + rndv_zcopy->scale[lane] = scale; + break; + } +} + +static void +ucp_ep_config_rndv_zcopy_commit(ucp_lane_index_t lanes_count, + ucp_ep_rndv_zcopy_config_t *rndv_zcopy) +{ + if (lanes_count == 0) { + /* if there are no RNDV RMA BW lanes that support Zcopy operation, reset + * min/max values to show that the scheme is unsupported */ + rndv_zcopy->min = SIZE_MAX; + rndv_zcopy->max = 0; + rndv_zcopy->split = 0; + } else { + rndv_zcopy->split = rndv_zcopy->min <= (rndv_zcopy->max / 2); + } +} + +static ssize_t +ucp_ep_config_max_short(ucp_context_t *context, uct_iface_attr_t *iface_attr, + uint64_t short_flag, size_t max_short, unsigned hdr_len, + size_t zcopy_thresh, + const ucp_rndv_thresh_t *rndv_thresh) +{ + ssize_t cfg_max_short; + + if (!(iface_attr->cap.flags & short_flag)) { + return -1; + } + + cfg_max_short = max_short - hdr_len; + + if ((context->config.ext.zcopy_thresh != UCS_MEMUNITS_AUTO)) { + /* Adjust max_short if zcopy_thresh is set externally */ + ucp_ep_config_adjust_max_short(&cfg_max_short, zcopy_thresh); + } + + if ((rndv_thresh != NULL) && + (context->config.ext.rndv_thresh != UCS_MEMUNITS_AUTO)) { + /* Adjust max_short if rndv_thresh is set externally. Note local and + * remote threshold values are the same if set externally, so can + * compare with just one of them. 
*/ + ucs_assert(rndv_thresh->remote == rndv_thresh->local); + ucp_ep_config_adjust_max_short(&cfg_max_short, rndv_thresh->remote); + } + + return cfg_max_short; +} + +static void +ucp_ep_config_init_attrs(ucp_worker_t *worker, ucp_rsc_index_t rsc_index, + ucp_ep_msg_config_t *config, size_t max_bcopy, + size_t max_zcopy, size_t max_iov, size_t max_hdr, + uint64_t bcopy_flag, uint64_t zcopy_flag, + size_t adjust_min_val) { ucp_context_t *context = worker->context; const uct_md_attr_t *md_attr; uct_iface_attr_t *iface_attr; size_t it; size_t zcopy_thresh; + size_t mem_type_zcopy_thresh; int mem_type; iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); - if ((iface_attr->cap.flags & short_flag)) { - config->max_short = max_short - hdr_len; - } else { - config->max_short = -1; - } - if (iface_attr->cap.flags & bcopy_flag) { config->max_bcopy = max_bcopy; } else { @@ -1435,6 +1802,7 @@ static void ucp_ep_config_init_attrs(ucp_worker_t *worker, ucp_rsc_index_t rsc_i if (context->config.ext.zcopy_thresh == UCS_MEMUNITS_AUTO) { config->zcopy_auto_thresh = 1; + mem_type_zcopy_thresh = 1; for (it = 0; it < UCP_MAX_IOV; ++it) { zcopy_thresh = ucp_ep_config_get_zcopy_auto_thresh( it + 1, &md_attr->reg_cost, context, @@ -1448,17 +1816,14 @@ static void ucp_ep_config_init_attrs(ucp_worker_t *worker, ucp_rsc_index_t rsc_i config->zcopy_auto_thresh = 0; config->sync_zcopy_thresh[0] = config->zcopy_thresh[0] = ucs_min(context->config.ext.zcopy_thresh, adjust_min_val); - - /* adjust max_short if zcopy_thresh is set externally */ - ucp_ep_config_adjust_max_short(&config->max_short, - config->zcopy_thresh[0]); + mem_type_zcopy_thresh = config->zcopy_thresh[0]; } ucs_memory_type_for_each(mem_type) { - if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(mem_type)) { + if (UCP_MEM_IS_HOST(mem_type)) { config->mem_type_zcopy_thresh[mem_type] = config->zcopy_thresh[0]; } else if (md_attr->cap.reg_mem_types & UCS_BIT(mem_type)) { - config->mem_type_zcopy_thresh[mem_type] = 1; + config->mem_type_zcopy_thresh[mem_type] = mem_type_zcopy_thresh; } } } @@ -1497,10 +1862,11 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, size_t max_rndv_thresh, max_am_rndv_thresh; size_t min_rndv_thresh, min_am_rndv_thresh; size_t rma_zcopy_thresh; - double rndv_max_bw[UCS_MEMORY_TYPE_LAST], scale, bw; + size_t am_max_eager_short; + double get_zcopy_max_bw[UCS_MEMORY_TYPE_LAST]; + double put_zcopy_max_bw[UCS_MEMORY_TYPE_LAST]; ucs_status_t status; size_t it; - uint8_t mem_type_index; memset(config, 0, sizeof(*config)); @@ -1534,15 +1900,18 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, config->tag.rndv.am_thresh = config->tag.rndv.rma_thresh; config->rndv.rma_thresh = config->tag.rndv.rma_thresh; config->rndv.am_thresh = config->tag.rndv.am_thresh; - config->rndv.min_get_zcopy = 0; - config->rndv.max_get_zcopy = SIZE_MAX; - config->rndv.min_put_zcopy = 0; - config->rndv.max_put_zcopy = SIZE_MAX; + /* use 1 instead of 0, since messages passed to RNDV PUT/GET Zcopy are always > 0 + * and make sure that multi-rail chunks are adjusted to not be 0-length */ + config->rndv.get_zcopy.min = 1; + config->rndv.get_zcopy.max = SIZE_MAX; + config->rndv.put_zcopy.min = 1; + config->rndv.put_zcopy.max = SIZE_MAX; config->rndv.rkey_size = ucp_rkey_packed_size(context, - config->key.rma_bw_md_map); + config->key.rma_bw_md_map, + UCS_SYS_DEVICE_ID_UNKNOWN, 0); for (lane = 0; lane < UCP_MAX_LANES; ++lane) { - config->rndv.get_zcopy_lanes[lane] = UCP_NULL_LANE; - config->rndv.put_zcopy_lanes[lane] = 
UCP_NULL_LANE; + config->rndv.get_zcopy.lanes[lane] = + config->rndv.put_zcopy.lanes[lane] = UCP_NULL_LANE; } config->rndv.rkey_ptr_dst_mds = 0; @@ -1550,10 +1919,10 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, config->am_u.proto = &ucp_am_proto; config->am_u.reply_proto = &ucp_am_reply_proto; - config->tag.offload.max_eager_short.memtype_on = -1; - config->tag.offload.max_eager_short.memtype_off = -1; - config->tag.max_eager_short.memtype_on = -1; - config->tag.max_eager_short.memtype_off = -1; + ucp_ep_config_init_short_thresh(&config->tag.offload.max_eager_short); + ucp_ep_config_init_short_thresh(&config->tag.max_eager_short); + ucp_ep_config_init_short_thresh(&config->am_u.max_eager_short); + ucp_ep_config_init_short_thresh(&config->am_u.max_reply_eager_short); for (lane = 0; lane < config->key.num_lanes; ++lane) { rsc_index = config->key.lanes[lane].rsc_index; @@ -1568,11 +1937,10 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, } /* configuration for rndv */ - get_zcopy_lane_count = 0; - put_zcopy_lane_count = 0; + get_zcopy_lane_count = put_zcopy_lane_count = 0; ucs_memory_type_for_each(i) { - rndv_max_bw[i] = 0; + get_zcopy_max_bw[i] = put_zcopy_max_bw[i] = 0; } for (i = 0; (i < config->key.num_lanes) && @@ -1585,14 +1953,18 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, md_attr = &context->tl_mds[context->tl_rscs[rsc_index].md_index].attr; iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); - if (iface_attr->cap.flags & UCT_IFACE_FLAG_GET_ZCOPY) { - /* only GET Zcopy RNDV scheme supports multi-rail */ - bw = ucp_tl_iface_bandwidth(context, &iface_attr->bandwidth); - ucs_for_each_bit(mem_type_index, md_attr->cap.reg_mem_types) { - ucs_assert(mem_type_index < UCS_MEMORY_TYPE_LAST); - rndv_max_bw[mem_type_index] = ucs_max(rndv_max_bw[mem_type_index], bw); - } - } + + /* GET Zcopy */ + /* coverity[overrun-buffer-val] */ + ucp_ep_config_rndv_zcopy_max_bw_update(context, md_attr, iface_attr, + UCT_IFACE_FLAG_GET_ZCOPY, + get_zcopy_max_bw); + + /* PUT Zcopy */ + /* coverity[overrun-buffer-val] */ + ucp_ep_config_rndv_zcopy_max_bw_update(context, md_attr, iface_attr, + UCT_IFACE_FLAG_PUT_ZCOPY, + put_zcopy_max_bw); } for (i = 0; (i < config->key.num_lanes) && @@ -1605,61 +1977,28 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, md_attr = &context->tl_mds[context->tl_rscs[rsc_index].md_index].attr; /* GET Zcopy */ - if (iface_attr->cap.flags & UCT_IFACE_FLAG_GET_ZCOPY) { - ucs_for_each_bit(mem_type_index, md_attr->cap.reg_mem_types) { - ucs_assert(mem_type_index < UCS_MEMORY_TYPE_LAST); - scale = ucp_tl_iface_bandwidth(context, &iface_attr->bandwidth) / - rndv_max_bw[mem_type_index]; - if (scale < (1. 
/ context->config.ext.multi_lane_max_ratio)) { - continue; - } - - config->rndv.min_get_zcopy = ucs_max(config->rndv.min_get_zcopy, - iface_attr->cap.get.min_zcopy); - - config->rndv.max_get_zcopy = ucs_min(config->rndv.max_get_zcopy, - iface_attr->cap.get.max_zcopy); - ucs_assert(get_zcopy_lane_count < UCP_MAX_LANES); - config->rndv.get_zcopy_lanes[get_zcopy_lane_count++] = lane; - config->rndv.scale[lane] = scale; - break; - } - } + ucp_ep_config_rndv_zcopy_set(context, UCT_IFACE_FLAG_GET_ZCOPY, + lane, md_attr, iface_attr, + get_zcopy_max_bw, + &config->rndv.get_zcopy, + &get_zcopy_lane_count); /* PUT Zcopy */ - if (iface_attr->cap.flags & UCT_IFACE_FLAG_PUT_ZCOPY) { - config->rndv.min_put_zcopy = ucs_max(config->rndv.min_put_zcopy, - iface_attr->cap.put.min_zcopy); - - config->rndv.max_put_zcopy = ucs_min(config->rndv.max_put_zcopy, - iface_attr->cap.put.max_zcopy); - ucs_assert(put_zcopy_lane_count < UCP_MAX_LANES); - config->rndv.put_zcopy_lanes[put_zcopy_lane_count++] = lane; - } + ucp_ep_config_rndv_zcopy_set(context, UCT_IFACE_FLAG_PUT_ZCOPY, + lane, md_attr, iface_attr, + put_zcopy_max_bw, + &config->rndv.put_zcopy, + &put_zcopy_lane_count); } } - if (get_zcopy_lane_count == 0) { - /* if there are no RNDV RMA BW lanes that support GET Zcopy, reset - * min/max values to show that the scheme is unsupported */ - config->rndv.min_get_zcopy = SIZE_MAX; - config->rndv.max_get_zcopy = 0; - config->rndv.get_zcopy_split = 0; - } else { - config->rndv.get_zcopy_split = config->rndv.min_get_zcopy <= - (config->rndv.max_get_zcopy / 2); - } + /* GET Zcopy */ + ucp_ep_config_rndv_zcopy_commit(get_zcopy_lane_count, + &config->rndv.get_zcopy); - if (put_zcopy_lane_count == 0) { - /* if there are no RNDV RMA BW lanes that support PUT Zcopy, reset - * min/max values to show that the scheme is unsupported */ - config->rndv.min_put_zcopy = SIZE_MAX; - config->rndv.max_put_zcopy = 0; - config->rndv.put_zcopy_split = 0; - } else { - config->rndv.put_zcopy_split = config->rndv.min_put_zcopy <= - (config->rndv.max_put_zcopy / 2); - } + /* PUT Zcopy */ + ucp_ep_config_rndv_zcopy_commit(put_zcopy_lane_count, + &config->rndv.put_zcopy); /* Rkey ptr */ if (key->rkey_ptr_lane != UCP_NULL_LANE) { @@ -1679,13 +2018,11 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, if (rsc_index != UCP_NULL_RESOURCE) { iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); ucp_ep_config_init_attrs(worker, rsc_index, &config->tag.eager, - iface_attr->cap.tag.eager.max_short, iface_attr->cap.tag.eager.max_bcopy, iface_attr->cap.tag.eager.max_zcopy, iface_attr->cap.tag.eager.max_iov, 0, - UCT_IFACE_FLAG_TAG_EAGER_SHORT, UCT_IFACE_FLAG_TAG_EAGER_BCOPY, - UCT_IFACE_FLAG_TAG_EAGER_ZCOPY, 0, + UCT_IFACE_FLAG_TAG_EAGER_ZCOPY, iface_attr->cap.tag.eager.max_bcopy); config->tag.offload.max_rndv_iov = iface_attr->cap.tag.rndv.max_iov; @@ -1715,6 +2052,12 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, &config->tag.rndv.am_thresh); } + config->tag.eager.max_short = ucp_ep_config_max_short( + worker->context, iface_attr, UCT_IFACE_FLAG_TAG_EAGER_SHORT, + iface_attr->cap.tag.eager.max_short, 0, + config->tag.eager.zcopy_thresh[0], + &config->tag.rndv.am_thresh); + /* Max Eager short has to be set after Zcopy and RNDV thresholds */ ucp_ep_config_set_memtype_thresh(&config->tag.offload.max_eager_short, config->tag.eager.max_short, @@ -1730,15 +2073,20 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, iface_attr = 
ucp_worker_iface_get_attr(worker, rsc_index); md_attr = &context->tl_mds[context->tl_rscs[rsc_index].md_index].attr; ucp_ep_config_init_attrs(worker, rsc_index, &config->am, - iface_attr->cap.am.max_short, iface_attr->cap.am.max_bcopy, iface_attr->cap.am.max_zcopy, iface_attr->cap.am.max_iov, iface_attr->cap.am.max_hdr, - UCT_IFACE_FLAG_AM_SHORT, UCT_IFACE_FLAG_AM_BCOPY, - UCT_IFACE_FLAG_AM_ZCOPY, - sizeof(ucp_eager_hdr_t), SIZE_MAX); + UCT_IFACE_FLAG_AM_ZCOPY, SIZE_MAX); + + /* Configuration stored in config->am is used by TAG, UCP AM and + * STREAM protocol implementations, do not adjust max_short value by + * zcopy and rndv thresholds. */ + config->am.max_short = ucp_ep_config_max_short( + worker->context, iface_attr, UCT_IFACE_FLAG_AM_SHORT, + iface_attr->cap.am.max_short, sizeof(ucp_eager_hdr_t), + SIZE_MAX, NULL); /* Calculate rendezvous thresholds which may be used by UCP AM * protocol. */ @@ -1746,46 +2094,61 @@ ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, rkey_ptr_lanes[0] = config->key.rkey_ptr_lane; ucp_ep_config_set_rndv_thresh(worker, config, rkey_ptr_lanes, iface_attr->cap.get.min_zcopy, - SIZE_MAX, &config->rndv.rma_thresh); + SIZE_MAX, + &config->rndv.rma_thresh); } else { ucp_ep_config_set_rndv_thresh(worker, config, config->key.rma_bw_lanes, iface_attr->cap.get.min_zcopy, - SIZE_MAX, &config->rndv.rma_thresh); + SIZE_MAX, + &config->rndv.rma_thresh); } - ucp_ep_config_set_am_rndv_thresh(worker, iface_attr, md_attr, config, + ucp_ep_config_set_am_rndv_thresh(worker, iface_attr, md_attr, + config, iface_attr->cap.am.min_zcopy, SIZE_MAX, &config->rndv.am_thresh); + am_max_eager_short = ucp_ep_config_max_short( + worker->context, iface_attr, UCT_IFACE_FLAG_AM_SHORT, + iface_attr->cap.am.max_short, sizeof(ucp_am_hdr_t), + config->am.zcopy_thresh[0], &config->rndv.am_thresh); + + ucp_ep_config_set_memtype_thresh(&config->am_u.max_eager_short, + am_max_eager_short, + context->num_mem_type_detect_mds); + /* All keys must fit in RNDV packet. 
* TODO remove some MDs if they don't */ ucs_assert_always(config->rndv.rkey_size <= config->am.max_bcopy); - if (!ucp_ep_is_tag_offload_enabled(config)) { + if (!ucp_ep_config_key_has_tag_lane(&config->key)) { /* Tag offload is disabled, AM will be used for all * tag-matching protocols */ /* TODO: set threshold level based on all available lanes */ config->tag.eager = config->am; + config->tag.eager.max_short = am_max_eager_short; config->tag.lane = lane; config->tag.rndv.am_thresh = config->rndv.am_thresh; config->tag.rndv.rma_thresh = config->rndv.rma_thresh; - if (context->config.ext.rndv_thresh != UCS_MEMUNITS_AUTO) { - /* adjust max_short if rndv_thresh is set externally */ - min_rndv_thresh = ucs_min(config->tag.rndv.rma_thresh.remote, - config->tag.rndv.am_thresh.remote); - ucp_ep_config_adjust_max_short(&config->tag.eager.max_short, - min_rndv_thresh); - } - /* Max Eager short has to be set after Zcopy and RNDV thresholds */ ucp_ep_config_set_memtype_thresh(&config->tag.max_eager_short, config->tag.eager.max_short, context->num_mem_type_detect_mds); } + + /* Calculate max short threshold for UCP AM short reply protocol */ + am_max_eager_short = ucp_ep_config_max_short( + worker->context, iface_attr, UCT_IFACE_FLAG_AM_SHORT, + iface_attr->cap.am.max_short, sizeof(ucp_am_reply_hdr_t), + config->am.zcopy_thresh[0], &config->rndv.am_thresh); + + ucp_ep_config_set_memtype_thresh(&config->am_u.max_reply_eager_short, + am_max_eager_short, + context->num_mem_type_detect_mds); } else { /* Stub endpoint */ config->am.max_bcopy = UCP_MIN_BCOPY; @@ -1890,11 +2253,10 @@ static int ucp_ep_is_short_lower_thresh(ssize_t max_short, (((size_t)max_short + 1) < thresh)); } -static void ucp_ep_config_print_tag_proto(FILE *stream, const char *name, - ssize_t max_eager_short, - size_t zcopy_thresh, - size_t rndv_rma_thresh, - size_t rndv_am_thresh) +static void +ucp_ep_config_print_proto(FILE *stream, const char *name, + ssize_t max_eager_short, size_t zcopy_thresh, + size_t rndv_rma_thresh, size_t rndv_am_thresh) { size_t max_bcopy, min_rndv, max_short; @@ -1976,12 +2338,12 @@ void ucp_ep_config_cm_lane_info_str(ucp_worker_h worker, const ucp_ep_config_key_t *key, ucp_lane_index_t lane, ucp_rsc_index_t cm_index, - char *buf, size_t max) + ucs_string_buffer_t *strbuf) { - ucs_snprintf_zero(buf, max, "lane[%d]: cm %s", lane, - (cm_index != UCP_NULL_RESOURCE) ? - ucp_context_cm_name(worker->context, cm_index) : - ""); + ucs_string_buffer_appendf(strbuf, "lane[%d]: cm %s", lane, + (cm_index != UCP_NULL_RESOURCE) ? 
+ ucp_context_cm_name(worker->context, cm_index) : + ""); } void ucp_ep_config_lane_info_str(ucp_worker_h worker, @@ -1989,7 +2351,7 @@ void ucp_ep_config_lane_info_str(ucp_worker_h worker, const unsigned *addr_indices, ucp_lane_index_t lane, ucp_rsc_index_t aux_rsc_index, - char *buf, size_t max) + ucs_string_buffer_t *strbuf) { ucp_context_h context = worker->context; uct_tl_resource_desc_t *rsc; @@ -1997,163 +2359,165 @@ void ucp_ep_config_lane_info_str(ucp_worker_h worker, ucp_md_index_t dst_md_index; ucp_rsc_index_t cmpt_index; unsigned path_index; - char *p, *endp; int prio; - p = buf; - endp = buf + max; rsc_index = key->lanes[lane].rsc_index; rsc = &context->tl_rscs[rsc_index].tl_rsc; path_index = key->lanes[lane].path_index; - snprintf(p, endp - p, "lane[%d]: %2d:" UCT_TL_RESOURCE_DESC_FMT ".%u md[%d] %-*c-> ", - lane, rsc_index, UCT_TL_RESOURCE_DESC_ARG(rsc), path_index, - context->tl_rscs[rsc_index].md_index, - 20 - (int)(strlen(rsc->dev_name) + strlen(rsc->tl_name)), - ' '); - p += strlen(p); + ucs_string_buffer_appendf(strbuf, + "lane[%d]: %2d:" UCT_TL_RESOURCE_DESC_FMT ".%u md[%d] %-*c-> ", + lane, rsc_index, UCT_TL_RESOURCE_DESC_ARG(rsc), path_index, + context->tl_rscs[rsc_index].md_index, + 20 - (int)(strlen(rsc->dev_name) + strlen(rsc->tl_name)), + ' '); if (addr_indices != NULL) { - snprintf(p, endp - p, "addr[%d].", addr_indices[lane]); - p += strlen(p); + ucs_string_buffer_appendf(strbuf, "addr[%d].", addr_indices[lane]); } dst_md_index = key->lanes[lane].dst_md_index; cmpt_index = ucp_ep_config_get_dst_md_cmpt(key, dst_md_index); - snprintf(p, endp - p, "md[%d]/%-8s", dst_md_index, - context->tl_cmpts[cmpt_index].attr.name); - p += strlen(p); - - prio = ucp_ep_config_get_multi_lane_prio(key->rma_lanes, lane); - if (prio != -1) { - snprintf(p, endp - p, " rma#%d", prio); - p += strlen(p); - } + ucs_string_buffer_appendf(strbuf, "md[%d]/%s/sysdev[%d]", dst_md_index, + context->tl_cmpts[cmpt_index].attr.name, + key->lanes[lane].dst_sys_dev); prio = ucp_ep_config_get_multi_lane_prio(key->rma_bw_lanes, lane); if (prio != -1) { - snprintf(p, endp - p, " rma_bw#%d", prio); - p += strlen(p); + ucs_string_buffer_appendf(strbuf, " rma_bw#%d", prio); } prio = ucp_ep_config_get_multi_lane_prio(key->amo_lanes, lane); if (prio != -1) { - snprintf(p, endp - p, " amo#%d", prio); - p += strlen(p); + ucs_string_buffer_appendf(strbuf, " amo#%d", prio); } if (key->am_lane == lane) { - snprintf(p, endp - p, " am"); - p += strlen(p); + ucs_string_buffer_appendf(strbuf, " am"); } if (key->rkey_ptr_lane == lane) { - snprintf(p, endp - p, " rkey_ptr"); - p += strlen(p); + ucs_string_buffer_appendf(strbuf, " rkey_ptr"); } prio = ucp_ep_config_get_multi_lane_prio(key->am_bw_lanes, lane); if (prio != -1) { - snprintf(p, endp - p, " am_bw#%d", prio); - p += strlen(p); + ucs_string_buffer_appendf(strbuf, " am_bw#%d", prio); } if (lane == key->tag_lane) { - snprintf(p, endp - p, " tag_offload"); - p += strlen(p); + ucs_string_buffer_appendf(strbuf, " tag_offload"); } if (key->wireup_msg_lane == lane) { - snprintf(p, endp - p, " wireup"); - p += strlen(p); + ucs_string_buffer_appendf(strbuf, " wireup"); if (aux_rsc_index != UCP_NULL_RESOURCE) { - snprintf(p, endp - p, "{" UCT_TL_RESOURCE_DESC_FMT "}", + ucs_string_buffer_appendf(strbuf, "{" UCT_TL_RESOURCE_DESC_FMT "}", UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[aux_rsc_index].tl_rsc)); } } } static void ucp_ep_config_print(FILE *stream, ucp_worker_h worker, - const ucp_ep_config_t *config, - const unsigned *addr_indices, + const ucp_ep_h ep, const 
unsigned *addr_indices, ucp_rsc_index_t aux_rsc_index) { - ucp_context_h context = worker->context; - char lane_info[128] = {0}; + ucp_context_h context = worker->context; + ucp_ep_config_t *config = ucp_ep_config(ep); ucp_md_index_t md_index; ucp_lane_index_t lane; + ucp_rsc_index_t cm_idx; for (lane = 0; lane < config->key.num_lanes; ++lane) { - ucp_ep_config_lane_info_str(worker, &config->key, addr_indices, lane, - aux_rsc_index, lane_info, sizeof(lane_info)); - fprintf(stream, "# %s\n", lane_info); + UCS_STRING_BUFFER_ONSTACK(strb, 128); + if (lane == config->key.cm_lane) { + cm_idx = ucp_ep_ext_control(ep)->cm_idx; + ucp_ep_config_cm_lane_info_str(worker, &config->key, lane, cm_idx, + &strb); + } else { + ucp_ep_config_lane_info_str(worker, &config->key, addr_indices, + lane, aux_rsc_index, &strb); + } + fprintf(stream, "# %s\n", ucs_string_buffer_cstr(&strb)); } fprintf(stream, "#\n"); if (context->config.features & UCP_FEATURE_TAG) { - ucp_ep_config_print_tag_proto(stream, "tag_send", - config->tag.eager.max_short, - config->tag.eager.zcopy_thresh[0], - config->tag.rndv.rma_thresh.remote, - config->tag.rndv.am_thresh.remote); - ucp_ep_config_print_tag_proto(stream, "tag_send_nbr", - config->tag.eager.max_short, - /* disable zcopy */ - ucs_min(config->tag.rndv.rma_thresh.local, - config->tag.rndv.am_thresh.local), - config->tag.rndv.rma_thresh.local, - config->tag.rndv.am_thresh.local); - ucp_ep_config_print_tag_proto(stream, "tag_send_sync", - config->tag.eager.max_short, - config->tag.eager.sync_zcopy_thresh[0], - config->tag.rndv.rma_thresh.remote, - config->tag.rndv.am_thresh.remote); - } - - if (context->config.features & UCP_FEATURE_RMA) { - for (lane = 0; lane < config->key.num_lanes; ++lane) { - if (ucp_ep_config_get_multi_lane_prio(config->key.rma_lanes, lane) == -1) { - continue; - } - ucp_ep_config_print_rma_proto(stream, "put", lane, - config->rma[lane].max_put_short + 1, - config->rma[lane].put_zcopy_thresh); - ucp_ep_config_print_rma_proto(stream, "get", lane, 0, - config->rma[lane].get_zcopy_thresh); - } - } - - if (context->config.features & (UCP_FEATURE_TAG|UCP_FEATURE_RMA)) { - fprintf(stream, "#\n"); - fprintf(stream, "# %23s: mds ", "rma_bw"); - ucs_for_each_bit(md_index, config->key.rma_bw_md_map) { - fprintf(stream, "[%d] ", md_index); - } - } - - if (context->config.features & UCP_FEATURE_TAG) { - fprintf(stream, "rndv_rkey_size %zu\n", config->rndv.rkey_size); - } + ucp_ep_config_print_proto(stream, "tag_send", + config->tag.eager.max_short, + config->tag.eager.zcopy_thresh[0], + config->tag.rndv.rma_thresh.remote, + config->tag.rndv.am_thresh.remote); + ucp_ep_config_print_proto(stream, "tag_send_nbr", + config->tag.eager.max_short, + /* disable zcopy */ + ucs_min(config->tag.rndv.rma_thresh.local, + config->tag.rndv.am_thresh.local), + config->tag.rndv.rma_thresh.local, + config->tag.rndv.am_thresh.local); + ucp_ep_config_print_proto(stream, "tag_send_sync", + config->tag.eager.max_short, + config->tag.eager.sync_zcopy_thresh[0], + config->tag.rndv.rma_thresh.remote, + config->tag.rndv.am_thresh.remote); + } + + if (context->config.features & UCP_FEATURE_AM) { + ucp_ep_config_print_proto(stream, "am_send", + config->am_u.max_eager_short.memtype_on, + config->am.zcopy_thresh[0], + config->rndv.rma_thresh.remote, + config->rndv.am_thresh.remote); + } + + if (context->config.features & UCP_FEATURE_RMA) { + for (lane = 0; lane < config->key.num_lanes; ++lane) { + if (ucp_ep_config_get_multi_lane_prio(config->key.rma_lanes, + lane) == -1) { + continue; + } + 
ucp_ep_config_print_rma_proto(stream, "put", lane, + config->rma[lane].max_put_short + 1, + config->rma[lane].put_zcopy_thresh); + ucp_ep_config_print_rma_proto(stream, "get", lane, 0, + config->rma[lane].get_zcopy_thresh); + } + } + + if (context->config.features & + (UCP_FEATURE_TAG|UCP_FEATURE_RMA|UCP_FEATURE_AM)) { + fprintf(stream, "#\n"); + fprintf(stream, "# %23s: mds ", "rma_bw"); + ucs_for_each_bit(md_index, config->key.rma_bw_md_map) { + fprintf(stream, "[%d] ", md_index); + } + } + + if (context->config.features & (UCP_FEATURE_TAG | UCP_FEATURE_AM)) { + fprintf(stream, "rndv_rkey_size %zu\n", config->rndv.rkey_size); + } } -void ucp_ep_print_info(ucp_ep_h ep, FILE *stream) +static void ucp_ep_print_info_internal(ucp_ep_h ep, const char *name, + FILE *stream) { - ucp_worker_h worker = ep->worker; + ucp_worker_h worker = ep->worker; ucp_ep_config_t *config = ucp_ep_config(ep); ucp_rsc_index_t aux_rsc_index; ucp_lane_index_t wireup_msg_lane; + ucs_string_buffer_t strb; uct_ep_h wireup_ep; UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); fprintf(stream, "#\n"); - fprintf(stream, "# UCP endpoint\n"); + fprintf(stream, "# UCP endpoint %s\n", name); fprintf(stream, "#\n"); fprintf(stream, "# peer: %s\n", ucp_ep_peer_name(ep)); /* if there is a wireup lane, set aux_rsc_index to the stub ep resource */ aux_rsc_index = UCP_NULL_RESOURCE; - wireup_msg_lane = ucp_ep_config(ep)->key.wireup_msg_lane; + wireup_msg_lane = config->key.wireup_msg_lane; if (wireup_msg_lane != UCP_NULL_LANE) { wireup_ep = ep->uct_eps[wireup_msg_lane]; if (ucp_wireup_ep_test(wireup_ep)) { @@ -2161,18 +2525,44 @@ void ucp_ep_print_info(ucp_ep_h ep, FILE *stream) } } - ucp_ep_config_print(stream, worker, config, NULL, aux_rsc_index); + ucp_ep_config_print(stream, worker, ep, NULL, aux_rsc_index); fprintf(stream, "#\n"); if (worker->context->config.ext.proto_enable) { + ucs_string_buffer_init(&strb); ucp_proto_select_dump(worker, ep->cfg_index, UCP_WORKER_CFG_INDEX_NULL, - &config->proto_select, stream); - fprintf(stream, "#\n"); + &config->proto_select, &strb); + ucs_string_buffer_dump(&strb, "# ", stream); + ucs_string_buffer_cleanup(&strb); } UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); } +void ucp_ep_print_info(ucp_ep_h ep, FILE *stream) +{ + return ucp_ep_print_info_internal(ep, "", stream); +} + +void ucp_worker_mem_type_eps_print_info(ucp_worker_h worker, FILE *stream) +{ + ucs_memory_type_t mem_type; + ucp_ep_h ep; + + ucs_memory_type_for_each(mem_type) { + UCS_STRING_BUFFER_ONSTACK(strb, 128); + + ep = worker->mem_type_ep[mem_type]; + if (ep == NULL) { + continue; + } + + ucs_string_buffer_appendf(&strb, "for %s", + ucs_memory_type_descs[mem_type]); + ucp_ep_print_info_internal(ep, ucs_string_buffer_cstr(&strb), stream); + } +} + size_t ucp_ep_config_get_zcopy_auto_thresh(size_t iovcnt, const ucs_linear_func_t *reg_cost, const ucp_context_h context, @@ -2194,6 +2584,7 @@ size_t ucp_ep_config_get_zcopy_auto_thresh(size_t iovcnt, ucp_wireup_ep_t* ucp_ep_get_cm_wireup_ep(ucp_ep_h ep) { ucp_lane_index_t lane; + uct_ep_h uct_ep; if (ep->cfg_index == UCP_WORKER_CFG_INDEX_NULL) { return NULL; @@ -2204,8 +2595,8 @@ ucp_wireup_ep_t* ucp_ep_get_cm_wireup_ep(ucp_ep_h ep) return NULL; } - return ucp_wireup_ep_test(ep->uct_eps[lane]) ? - ucs_derived_of(ep->uct_eps[lane], ucp_wireup_ep_t) : NULL; + uct_ep = ep->uct_eps[lane]; + return (uct_ep != NULL) ? 
ucp_wireup_ep(uct_ep) : NULL; } uct_ep_h ucp_ep_get_cm_uct_ep(ucp_ep_h ep) @@ -2228,15 +2619,16 @@ uct_ep_h ucp_ep_get_cm_uct_ep(ucp_ep_h ep) int ucp_ep_is_cm_local_connected(ucp_ep_h ep) { - return ucp_ep_has_cm_lane(ep) && (ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED); + return (ucp_ep_get_cm_uct_ep(ep) != NULL) && + (ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED); } -uint64_t ucp_ep_get_tl_bitmap(ucp_ep_h ep) +void ucp_ep_get_tl_bitmap(ucp_ep_h ep, ucp_tl_bitmap_t *tl_bitmap) { - uint64_t tl_bitmap = 0; ucp_lane_index_t lane; ucp_rsc_index_t rsc_idx; + UCS_BITMAP_CLEAR(tl_bitmap); for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) { if (lane == ucp_ep_get_cm_lane(ep)) { continue; @@ -2247,10 +2639,8 @@ uint64_t ucp_ep_get_tl_bitmap(ucp_ep_h ep) continue; } - tl_bitmap |= UCS_BIT(rsc_idx); + UCS_BITMAP_SET(*tl_bitmap, rsc_idx); } - - return tl_bitmap; } void ucp_ep_invoke_err_cb(ucp_ep_h ep, ucs_status_t status) @@ -2268,7 +2658,7 @@ void ucp_ep_invoke_err_cb(ucp_ep_h ep, ucs_status_t status) ucs_debug("ep %p: calling user error callback %p with arg %p and status %s", ep, ucp_ep_ext_control(ep)->err_cb, ucp_ep_ext_gen(ep)->user_data, ucs_status_string(status)); - ep->flags |= UCP_EP_FLAG_ERR_HANDLER_INVOKED; + ucp_ep_update_flags(ep, UCP_EP_FLAG_ERR_HANDLER_INVOKED, 0); ucp_ep_ext_control(ep)->err_cb(ucp_ep_ext_gen(ep)->user_data, ep, status); } @@ -2278,21 +2668,214 @@ int ucp_ep_config_test_rndv_support(const ucp_ep_config_t *config) (config->key.cm_lane != UCP_NULL_LANE); } +/* if we have ep2iface transport we need to send an active-message based + * keepalive message to check the remote endpoint still exists */ +static UCS_F_ALWAYS_INLINE int +ucp_ep_is_am_keepalive(ucp_ep_h ucp_ep, ucp_rsc_index_t rsc_idx) +{ + ucp_worker_iface_t *wiface; + + if (!(ucp_ep->flags & UCP_EP_FLAG_REMOTE_ID) || + (rsc_idx == UCP_NULL_RESOURCE)) { + /* if remote ID isn't defined or rsc index is NULL (i.e. it is CM lane), + * don't do AM keepalive */ + return 0; + } + + wiface = ucp_worker_iface(ucp_ep->worker, rsc_idx); + return ucs_test_all_flags(wiface->attr.cap.flags, + UCT_IFACE_FLAG_CONNECT_TO_IFACE | + UCT_IFACE_FLAG_AM_BCOPY); +} + +ucs_status_t ucp_ep_do_uct_ep_keepalive(ucp_ep_h ucp_ep, uct_ep_h uct_ep, + ucp_rsc_index_t rsc_idx, unsigned flags, + uct_completion_t *comp) +{ + ucp_tl_bitmap_t tl_bitmap = UCS_BITMAP_ZERO; + ucs_status_t status; + ssize_t packed_len; + struct iovec wireup_msg_iov[2]; + ucp_wireup_msg_t wireup_msg; + + ucs_assert(!(ucp_ep->flags & UCP_EP_FLAG_FAILED)); + ucs_assert((rsc_idx == UCP_NULL_RESOURCE) || + (ucp_worker_iface(ucp_ep->worker, rsc_idx)->attr.cap.flags & + UCT_IFACE_FLAG_EP_CHECK)); + + if (!ucp_ep_is_am_keepalive(ucp_ep, rsc_idx)) { + return uct_ep_check(uct_ep, flags, comp); + } + + ucs_assert(ucp_worker_iface(ucp_ep->worker, rsc_idx)->attr.cap.flags & + UCT_IFACE_FLAG_AM_BCOPY); + + UCS_BITMAP_SET(tl_bitmap, rsc_idx); + + status = ucp_wireup_msg_prepare(ucp_ep, UCP_WIREUP_MSG_EP_CHECK, + &tl_bitmap, NULL, &wireup_msg, + &wireup_msg_iov[1].iov_base, + &wireup_msg_iov[1].iov_len); + if (status != UCS_OK) { + return status; + } + + wireup_msg_iov[0].iov_base = &wireup_msg; + wireup_msg_iov[0].iov_len = sizeof(wireup_msg); + + packed_len = uct_ep_am_bcopy(uct_ep, UCP_AM_ID_WIREUP, + ucp_wireup_msg_pack, wireup_msg_iov, 0); + ucs_free(wireup_msg_iov[1].iov_base); + return (packed_len > 0) ? 
UCS_OK : (ucs_status_t)packed_len;
+}
+
 void ucp_ep_do_keepalive(ucp_ep_h ep, ucp_lane_map_t *lane_map)
 {
-    ucp_lane_map_t check_lanes = *lane_map;
     ucp_lane_index_t lane;
     ucs_status_t status;
+    ucp_rsc_index_t rsc_index;
+
+    if (ep->flags & UCP_EP_FLAG_FAILED) {
+        *lane_map = 0;
+        return;
+    }
+
+    /* Take the updated ep_check_map, in case the ep configuration has changed */
+    *lane_map &= ucp_ep_config(ep)->key.ep_check_map;
 
-    ucs_for_each_bit(lane, check_lanes) {
+    ucs_for_each_bit(lane, *lane_map) {
         ucs_assert(lane < UCP_MAX_LANES);
-        /* coverity[overrun-local] */
-        status = uct_ep_check(ep->uct_eps[lane], 0, NULL);
-        if (status == UCS_OK) {
-            *lane_map &= ~UCS_BIT(lane);
-        } else if (status != UCS_ERR_NO_RESOURCE) {
-            ucs_warn("unexpected return status from uct_ep_check(ep=%p): %s",
-                     ep, ucs_status_string(status));
+        rsc_index = ucp_ep_get_rsc_index(ep, lane);
+        ucs_assert((rsc_index != UCP_NULL_RESOURCE) ||
+                   (lane == ucp_ep_get_cm_lane(ep)));
+
+        status = ucp_ep_do_uct_ep_keepalive(ep, ep->uct_eps[lane], rsc_index, 0,
+                                            NULL);
+        if (status == UCS_ERR_NO_RESOURCE) {
+            continue;
+        } else if (status != UCS_OK) {
+            ucs_diag("unexpected return status from doing keepalive(ep=%p, "
+                     "lane[%d]=%p): %s",
+                     ep, lane, ep->uct_eps[lane],
+                     ucs_status_string(status));
         }
+
+        *lane_map &= ~UCS_BIT(lane);
     }
 }
+
+static void ucp_ep_req_purge(ucp_ep_h ucp_ep, ucp_request_t *req,
+                             ucs_status_t status, int recursive)
+{
+    ucp_trace_req(req, "purged with status %s (%d) on ep %p",
+                  ucs_status_string(status), status, ucp_ep);
+
+    /* Only send operations could have a request ID allocated */
+    if (!(req->flags &
+          (UCP_REQUEST_FLAG_RECV_AM | UCP_REQUEST_FLAG_RECV_TAG))) {
+        ucp_send_request_id_release(req);
+    }
+
+    if (req->flags & (UCP_REQUEST_FLAG_SEND_AM | UCP_REQUEST_FLAG_SEND_TAG)) {
+        ucs_assert(!(req->flags & UCP_REQUEST_FLAG_SUPER_VALID));
+        ucs_assert(req->send.ep == ucp_ep);
+        ucp_request_complete_and_dereg_send(req, status);
+    } else if (req->flags & UCP_REQUEST_FLAG_RECV_AM) {
+        ucs_assert(!(req->flags & UCP_REQUEST_FLAG_SUPER_VALID));
+        ucs_assert(recursive); /* Mustn't be directly contained in an EP list
+                                * of tracking requests */
+        ucp_request_complete_am_recv(req, status);
+    } else if (req->flags & UCP_REQUEST_FLAG_RECV_TAG) {
+        ucs_assert(!(req->flags & UCP_REQUEST_FLAG_SUPER_VALID));
+        ucs_assert(recursive); /* Mustn't be directly contained in an EP list
+                                * of tracking requests */
+        ucp_request_complete_tag_recv(req, status);
+    } else if (req->flags & UCP_REQUEST_FLAG_RNDV_FRAG) {
+        ucs_assert(req->flags & UCP_REQUEST_FLAG_SUPER_VALID);
+        ucs_assert(req->send.ep == ucp_ep);
+        ucs_assert(recursive); /* Mustn't be directly contained in an EP list
+                                * of tracking requests */
+
+        /* Purging started from the request responsible for sending RTR, so
+         * this request copies data from the staging buffer and uses the
+         * receive part of the request */
+        req->super_req->recv.remaining -= req->recv.length;
+        if (req->super_req->recv.remaining == 0) {
+            ucp_ep_req_purge(ucp_ep, ucp_request_get_super(req), status, 1);
+        }
+
+        ucp_request_put(req);
+    } else if ((req->send.uct.func == ucp_rma_sw_proto.progress_get) ||
+               (req->send.uct.func == ucp_amo_sw_proto.progress_fetch)) {
+        /* Currently we don't support UCP EP request purging for proto mode */
+        ucs_assert(!ucp_ep->worker->context->config.ext.proto_enable);
+        ucs_assert(req->send.ep == ucp_ep);
+
+        ucp_request_send_buffer_dereg(req);
+        ucp_request_complete_send(req, status);
+        ucp_ep_rma_remote_request_completed(ucp_ep);
+    } else {
+        /* SW RMA/PUT and AMO/Post operations don't allocate a local request ID
+         * and don't need to be tracked, since they complete the UCP request
+         * upon sending all data to a peer. Receiving RMA/CMPL and AMO/REP
+         * packets completes flush requests */
+        ucs_assert((req->send.uct.func != ucp_rma_sw_proto.progress_put) &&
+                   (req->send.uct.func != ucp_amo_sw_proto.progress_post));
+        ucs_assert(req->send.ep == ucp_ep);
+
+        ucp_ep_req_purge(ucp_ep, ucp_request_get_super(req), status, 1);
+        ucp_request_put(req);
+    }
+}
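The purge dispatcher above unlinks each request from the endpoint's tracking list as a side effect of completing it, which is what lets ucp_ep_reqs_purge() below drain the list by repeatedly taking its head. A hedged standalone sketch of that drain-by-head loop (the list and node types are simplified stand-ins for ucs_hlist, not UCX code):

#include <stddef.h>
#include <stdio.h>

typedef struct node {
    struct node *next;
    int          id;
} node_t;

typedef struct {
    node_t *head;
} list_t;

static void purge_one(list_t *list, node_t *node)
{
    list->head = node->next; /* unlink, as completing a request does */
    printf("purged request %d\n", node->id);
}

int main(void)
{
    node_t n2   = { NULL, 2 };
    node_t n1   = { &n2, 1 };
    list_t list = { &n1 };

    /* Taking the head on every iteration stays correct even if a purge
     * cascades and removes additional elements (e.g. super-requests). */
    while (list.head != NULL) {
        purge_one(&list, list.head);
    }
    return 0;
}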
+
+void ucp_ep_reqs_purge(ucp_ep_h ucp_ep, ucs_status_t status)
+{
+    ucs_hlist_head_t *proto_reqs = &ucp_ep_ext_gen(ucp_ep)->proto_reqs;
+    ucp_ep_flush_state_t *flush_state;
+    ucp_request_t *req;
+
+    while (!ucs_hlist_is_empty(proto_reqs)) {
+        req = ucs_hlist_head_elem(proto_reqs, ucp_request_t, send.list);
+        ucp_ep_req_purge(ucp_ep, req, status, 0);
+    }
+
+    if (/* Flush state is already valid (i.e. EP doesn't exist on matching
+         * context) and not invalidated yet, also remote EP ID is already set */
+        !(ucp_ep->flags &
+          (UCP_EP_FLAG_ON_MATCH_CTX | UCP_EP_FLAG_CLOSE_REQ_VALID))) {
+        flush_state = ucp_ep_flush_state(ucp_ep);
+
+        /* Adjust 'cmpl_sn' to the value stored in 'send_sn' by emulating
+         * remote completion of RMA operations, because those uncompleted
+         * operations won't be completed anymore. These can only be SW RMA/PUT
+         * or AMO/Post operations, because SW RMA/GET or AMO/Fetch operations
+         * should already complete flush operations which are waiting for
+         * completion packets */
+        while (UCS_CIRCULAR_COMPARE32(flush_state->cmpl_sn, <,
+                                      flush_state->send_sn)) {
+            ucp_ep_rma_remote_request_completed(ucp_ep);
+        }
+    }
+}
+
+static void ucp_ep_vfs_show_peer_name(void *obj, ucs_string_buffer_t *strb,
+                                      void *arg_ptr, uint64_t arg_u64)
+{
+    ucp_ep_h ep = obj;
+
+    ucs_string_buffer_appendf(strb, "%s\n", ucp_ep_peer_name(ep));
+}
+
+void ucp_ep_vfs_init(ucp_ep_h ep)
+{
+    ucp_err_handling_mode_t err_mode;
+
+    ucs_vfs_obj_add_dir(ep->worker, ep, "ep/%p", ep);
+    ucs_vfs_obj_add_ro_file(ep, ucp_ep_vfs_show_peer_name, NULL, 0,
+                            "peer_name");
+
+    err_mode = ucp_ep_config(ep)->key.err_mode;
+    ucs_vfs_obj_add_ro_file(ep, ucs_vfs_show_primitive,
+                            (void*)ucp_err_handling_mode_names[err_mode],
+                            UCS_VFS_TYPE_STRING, "error_mode");
+}
diff --git a/src/ucp/core/ucp_ep.h b/src/ucp/core/ucp_ep.h
index 5fffacf887b..1c389f0a048 100644
--- a/src/ucp/core/ucp_ep.h
+++ b/src/ucp/core/ucp_ep.h
@@ -1,5 +1,5 @@
 /**
- * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED.
 * Copyright (C) Los Alamos National Security, LLC. 2019 ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
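The catch-up loop at the end of ucp_ep_reqs_purge() relies on a wraparound-safe comparison of 32-bit sequence numbers. A standalone sketch of one common way to implement such a comparison (illustrative only; the actual UCS_CIRCULAR_COMPARE32 macro is defined in ucs and may differ):

#include <assert.h>
#include <stdint.h>

/* 'a < b' in circular 32-bit arithmetic: compare the signed difference */
static int circular_lt32(uint32_t a, uint32_t b)
{
    return (int32_t)(a - b) < 0;
}

int main(void)
{
    uint32_t send_sn     = 5;
    uint32_t cmpl_sn     = UINT32_MAX - 1;
    unsigned completions = 0;

    assert(circular_lt32(1, 2));          /* trivial case */
    assert(circular_lt32(UINT32_MAX, 0)); /* correct across wraparound */
    assert(!circular_lt32(0, UINT32_MAX));

    /* Emulate remote completions: advance cmpl_sn until it catches up with
     * send_sn, as the purge path does for uncompleted RMA operations. */
    while (circular_lt32(cmpl_sn, send_sn)) {
        cmpl_sn++;
        completions++;
    }
    assert(completions == 7); /* UINT32_MAX-1 wraps around to 5 in 7 steps */
    return 0;
}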
@@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -24,10 +25,6 @@ #define UCP_MAX_IOV 16UL -/* Used as invalidated value */ -#define UCP_EP_ID_INVALID UINTPTR_MAX - - /* Endpoint flags type */ #if ENABLE_DEBUG_DATA || UCS_ENABLE_ASSERT typedef uint32_t ucp_ep_flags_t; @@ -35,6 +32,23 @@ typedef uint32_t ucp_ep_flags_t; typedef uint16_t ucp_ep_flags_t; #endif +#if UCS_ENABLE_ASSERT +#define UCP_EP_ASSERT_COUNTER_INC(_counter) \ + do { \ + ucs_assert(*(_counter) < UINT_MAX); \ + ++(*(_counter)); \ + } while (0) + +#define UCP_EP_ASSERT_COUNTER_DEC(_counter) \ + do { \ + ucs_assert(*(_counter) > 0); \ + --(*(_counter)); \ + } while (0) +#else +#define UCP_EP_ASSERT_COUNTER_INC(_counter) +#define UCP_EP_ASSERT_COUNTER_DEC(_counter) +#endif + /** * Endpoint flags @@ -52,16 +66,14 @@ enum { UCP_EP_FLAG_STREAM_HAS_DATA = UCS_BIT(5), /* EP has data in the ext.stream.match_q */ UCP_EP_FLAG_ON_MATCH_CTX = UCS_BIT(6), /* EP is on match queue */ UCP_EP_FLAG_REMOTE_ID = UCS_BIT(7), /* remote ID is valid */ - UCP_EP_FLAG_LISTENER = UCS_BIT(8), /* EP holds pointer to a listener - (on server side due to receiving partial - worker address from the client) */ UCP_EP_FLAG_CONNECT_PRE_REQ_QUEUED = UCS_BIT(9), /* Pre-Connection request was queued */ UCP_EP_FLAG_CLOSED = UCS_BIT(10),/* EP was closed */ UCP_EP_FLAG_CLOSE_REQ_VALID = UCS_BIT(11),/* close protocol is started and close_req is valid */ UCP_EP_FLAG_ERR_HANDLER_INVOKED = UCS_BIT(12),/* error handler was called */ - UCP_EP_FLAG_TEMPORARY = UCS_BIT(13),/* the temporary EP which holds - temporary wireup configuration */ + UCP_EP_FLAG_INTERNAL = UCS_BIT(13),/* the internal EP which holds + temporary wireup configuration or + mem-type EP */ UCP_EP_FLAG_INDIRECT_ID = UCS_BIT(14),/* protocols on this endpoint will send indirect endpoint id instead of pointer, can be replaced with looking at local ID */ @@ -72,12 +84,12 @@ enum { UCP_EP_FLAG_CONNECT_ACK_SENT = UCS_BIT(18),/* DEBUG: Connection ACK was sent */ UCP_EP_FLAG_CONNECT_REQ_IGNORED = UCS_BIT(19),/* DEBUG: Connection request was ignored */ UCP_EP_FLAG_CONNECT_PRE_REQ_SENT = UCS_BIT(20),/* DEBUG: Connection pre-request was sent */ - UCP_EP_FLAG_SOCKADDR_PARTIAL_ADDR = UCS_BIT(21),/* DEBUG: Partial worker address was sent - to the remote peer when starting - connection establishment on this EP */ - UCP_EP_FLAG_FLUSH_STATE_VALID = UCS_BIT(22),/* DEBUG: flush_state is valid */ - UCP_EP_FLAG_DISCONNECTED_CM_LANE = UCS_BIT(23) /* DEBUG: CM lane was disconnected, i.e. + UCP_EP_FLAG_FLUSH_STATE_VALID = UCS_BIT(21),/* DEBUG: flush_state is valid */ + UCP_EP_FLAG_DISCONNECTED_CM_LANE = UCS_BIT(22),/* DEBUG: CM lane was disconnected, i.e. @uct_ep_disconnect was called for CM EP */ + UCP_EP_FLAG_CLIENT_CONNECT_CB = UCS_BIT(23),/* DEBUG: Client connect callback invoked */ + UCP_EP_FLAG_SERVER_NOTIFY_CB = UCS_BIT(24),/* DEBUG: Server notify callback invoked */ + UCP_EP_FLAG_DISCONNECT_CB_CALLED = UCS_BIT(25) /* DEBUG: Got disconnect notification */ }; @@ -104,8 +116,13 @@ enum { server side */ UCP_EP_INIT_ERR_MODE_PEER_FAILURE = UCS_BIT(4), /**< Endpoint requires an @ref UCP_ERR_HANDLING_MODE_PEER */ - UCP_EP_INIT_CM_PHASE = UCS_BIT(5) /**< Endpoint connection to a peer is on + UCP_EP_INIT_CM_PHASE = UCS_BIT(5), /**< Endpoint connection to a peer is on CM phase */ + UCP_EP_INIT_FLAG_INTERNAL = UCS_BIT(6), /**< Endpoint for internal usage + (e.g. 
memtype, reply on keepalive) */
+    UCP_EP_INIT_CONNECT_TO_IFACE_ONLY = UCS_BIT(7)  /**< Select transports which
+                                                         support CONNECT_TO_IFACE
+                                                         mode only */
 };
 
 
@@ -113,6 +130,16 @@ enum {
     UCS_STATS_UPDATE_COUNTER((_ep)->stats, UCP_EP_STAT_TAG_TX_##_op, 1);
 
 
+typedef struct ucp_ep_config_key_lane {
+    ucp_rsc_index_t      rsc_index;    /* Resource index */
+    ucp_md_index_t       dst_md_index; /* Destination memory domain index */
+    ucs_sys_device_t     dst_sys_dev;  /* Destination system device */
+    uint8_t              path_index;   /* Device path index */
+    ucp_lane_type_mask_t lane_types;   /* Which types of operations this lane
+                                          was selected for */
+} ucp_ep_config_key_lane_t;
+
+
 /*
  * Endpoint configuration key.
  * This is filled by to the transport selection logic, according to the local
@@ -121,15 +148,7 @@ enum {
 struct ucp_ep_config_key {
 
     ucp_lane_index_t       num_lanes;    /* Number of active lanes */
-
-    struct {
-        ucp_rsc_index_t      rsc_index;    /* Resource index */
-        ucp_rsc_index_t      dst_rsc_index; /* Destination resource index */
-        ucp_md_index_t       dst_md_index; /* Destination memory domain index */
-        uint8_t              path_index;  /* Device path index */
-        ucp_lane_type_mask_t lane_types;  /* Which types of operations this lane
-                                             was selected for */
-    } lanes[UCP_MAX_LANES];
+    ucp_ep_config_key_lane_t lanes[UCP_MAX_LANES]; /* Active lanes */
 
     ucp_lane_index_t       am_lane;      /* Lane for AM (can be NULL) */
     ucp_lane_index_t       tag_lane;     /* Lane for tag matching offload (can be NULL) */
@@ -171,7 +190,6 @@ struct ucp_ep_config_key {
 
     /* Error handling mode */
     ucp_err_handling_mode_t err_mode;
-    ucs_status_t            status;
 };
 
@@ -234,6 +252,24 @@ typedef struct ucp_rndv_thresh {
 } ucp_rndv_thresh_t;
 
 
+/*
+ * Rendezvous Zcopy configuration
+ */
+typedef struct ucp_rndv_zcopy {
+    /* Maximal total size of Zcopy operation */
+    size_t           max;
+    /* Minimal size of Zcopy operation */
+    size_t           min;
+    /* Can messages which are larger than the maximal size be split into
+     * segments which are >= the minimal size */
+    int              split;
+    /* Lanes for Zcopy operation */
+    ucp_lane_index_t lanes[UCP_MAX_LANES];
+    /* BW based scale factor for zcopy lanes */
+    double           scale[UCP_MAX_LANES];
+} ucp_ep_rndv_zcopy_config_t;
+
+
 struct ucp_ep_config {
 
     /* A key which uniquely defines the configuration, and all other fields of
@@ -259,34 +295,18 @@ struct ucp_ep_config {
     ucp_md_index_t         md_index[UCP_MAX_LANES];
 
     struct {
-        /* Maximal total size of rndv_get_zcopy */
-        size_t            max_get_zcopy;
-        /* Minimal size of rndv_get_zcopy */
-        size_t            min_get_zcopy;
-        /* Can the message > `max_get_zcopy` be split to
-         * the segments that are >= `min_get_zcopy` */
-        int               get_zcopy_split;
-        /* Maximal total size of rndv_put_zcopy */
-        size_t            max_put_zcopy;
-        /* Minimal size of rndv_put_zcopy */
-        size_t            min_put_zcopy;
-        /* Can the message > `max_put_zcopy` be split to
-         * the segments that are >= `min_put_zcopy` */
-        int               put_zcopy_split;
+        /* RNDV GET Zcopy configuration */
+        ucp_ep_rndv_zcopy_config_t get_zcopy;
+        /* RNDV PUT Zcopy configuration */
+        ucp_ep_rndv_zcopy_config_t put_zcopy;
 
         /* Threshold for switching from eager to RMA based rendezvous */
-        ucp_rndv_thresh_t rma_thresh;
+        ucp_rndv_thresh_t          rma_thresh;
 
         /* Threshold for switching from eager to AM based rendezvous */
-        ucp_rndv_thresh_t am_thresh;
+        ucp_rndv_thresh_t          am_thresh;
 
         /* Total size of packed rkey, according to high-bw md_map */
-        size_t            rkey_size;
-        /* remote memory domains which support rkey_ptr */
-        ucp_md_map_t      rkey_ptr_dst_mds;
-        /* Lanes for GET zcopy */
-        ucp_lane_index_t  get_zcopy_lanes[UCP_MAX_LANES];
-        /* Lanes for PUT zcopy */
-        ucp_lane_index_t 
put_zcopy_lanes[UCP_MAX_LANES]; - /* BW based scale factor */ - double scale[UCP_MAX_LANES]; + size_t rkey_size; + /* Remote memory domains which support rkey_ptr */ + ucp_md_map_t rkey_ptr_dst_mds; } rndv; struct { @@ -334,6 +354,12 @@ struct ucp_ep_config { /* Protocols used for am operations */ const ucp_request_send_proto_t *proto; const ucp_request_send_proto_t *reply_proto; + + /* Maximal size for eager short */ + ucp_memtype_thresh_t max_eager_short; + + /* Maximal size for eager short with reply protocol */ + ucp_memtype_thresh_t max_reply_eager_short; } am_u; /* Protocol selection data */ @@ -347,6 +373,8 @@ struct ucp_ep_config { typedef struct ucp_ep { ucp_worker_h worker; /* Worker this endpoint belongs to */ + uint8_t refcount; /* Reference counter: 0 - it is + allowed to destroy EP */ ucp_worker_cfg_index_t cfg_index; /* Configuration index */ ucp_ep_match_conn_sn_t conn_sn; /* Sequence number for remote connection */ ucp_lane_index_t am_lane; /* Cached value */ @@ -356,7 +384,16 @@ typedef struct ucp_ep { uct_ep_h uct_eps[UCP_MAX_LANES]; /* Transports for every lane */ #if ENABLE_DEBUG_DATA - char peer_name[UCP_WORKER_NAME_MAX]; + char peer_name[UCP_WORKER_ADDRESS_NAME_MAX]; +#endif + +#if UCS_ENABLE_ASSERT + /* How many Worker flush operations are in-progress where the EP is the next + * EP for flushing */ + unsigned flush_iter_refcount; + /* How many UCT EP discarding operations are in-progress scheduled for the + * EP */ + unsigned discard_refcount; #endif UCS_STATS_NODE_DECLARE(stats) @@ -368,10 +405,10 @@ typedef struct ucp_ep { * Status of protocol-level remote completions */ typedef struct { - ucs_queue_head_t reqs; /* Queue of flush requests which - are waiting for remote completion */ - uint32_t send_sn; /* Sequence number of sent operations */ - uint32_t cmpl_sn; /* Sequence number of completions */ + ucs_hlist_head_t reqs; /* Queue of flush requests which + are waiting for remote completion */ + uint32_t send_sn; /* Sequence number of sent operations */ + uint32_t cmpl_sn; /* Sequence number of completions */ } ucp_ep_flush_state_t; @@ -388,13 +425,11 @@ typedef struct { * Endpoint extension for control data path */ typedef struct { - ucs_ptr_map_key_t local_ep_id; /* Local EP ID */ - ucs_ptr_map_key_t remote_ep_id; /* Remote EP ID */ - ucp_err_handler_cb_t err_cb; /* Error handler */ - union { - ucp_listener_h listener; /* Listener that may be associated with ep */ - ucp_ep_close_proto_req_t close_req; /* Close protocol request */ - }; + ucp_rsc_index_t cm_idx; /* CM index */ + ucs_ptr_map_key_t local_ep_id; /* Local EP ID */ + ucs_ptr_map_key_t remote_ep_id; /* Remote EP ID */ + ucp_err_handler_cb_t err_cb; /* Error handler */ + ucp_ep_close_proto_req_t close_req; /* Close protocol request */ } ucp_ep_ext_control_t; @@ -413,6 +448,8 @@ typedef struct { ucp_ep_flush_state_t flush_state; /* Remote completion status */ }; ucp_ep_ext_control_t *control_ext; /* Control data path extension */ + /* List of requests which are waiting for remote completion */ + ucs_hlist_head_t proto_reqs; } ucp_ep_ext_gen_t; @@ -435,12 +472,7 @@ typedef struct { enum { - UCP_WIREUP_SA_DATA_FULL_ADDR = 0, /* Sockaddr client data contains full - address. */ - UCP_WIREUP_SA_DATA_PARTIAL_ADDR, /* Sockaddr client data contains partial - address, wireup protocol requires - extra MSGs. 
*/ - UCP_WIREUP_SA_DATA_CM_ADDR /* Sockaddr client data contains address + UCP_WIREUP_SA_DATA_CM_ADDR = 2 /* Sockaddr client data contains address for CM based wireup: there is only iface and ep address of transport lanes, remote device address is @@ -465,39 +497,41 @@ struct ucp_wireup_sockaddr_data { typedef struct ucp_conn_request { ucp_listener_h listener; - union { - uct_listener_h listener; - uct_iface_h iface; - } uct; + uct_listener_h uct_listener; uct_conn_request_h uct_req; ucp_rsc_index_t cm_idx; char dev_name[UCT_DEVICE_NAME_MAX]; uct_device_addr_t *remote_dev_addr; struct sockaddr_storage client_address; + ucp_ep_h ep; /* valid only if request is handled internally */ ucp_wireup_sockaddr_data_t sa_data; /* packed worker address follows */ } ucp_conn_request_t; +int ucp_is_uct_ep_failed(uct_ep_h uct_ep); + void ucp_ep_config_key_reset(ucp_ep_config_key_t *key); void ucp_ep_config_cm_lane_info_str(ucp_worker_h worker, const ucp_ep_config_key_t *key, ucp_lane_index_t lane, ucp_rsc_index_t cm_index, - char *buf, size_t max); + ucs_string_buffer_t *buf); void ucp_ep_config_lane_info_str(ucp_worker_h worker, const ucp_ep_config_key_t *key, const unsigned *addr_indices, ucp_lane_index_t lane, ucp_rsc_index_t aux_rsc_index, - char *buf, size_t max); + ucs_string_buffer_t *buf); ucs_status_t ucp_ep_create_base(ucp_worker_h worker, const char *peer_name, const char *message, ucp_ep_h *ep_p); -void ucp_ep_destroy_base(ucp_ep_h ep); +void ucp_ep_add_ref(ucp_ep_h ep); + +int ucp_ep_remove_ref(ucp_ep_h ep); ucs_status_t ucp_worker_create_ep(ucp_worker_h worker, unsigned ep_init_flags, const char *peer_name, const char *message, @@ -505,21 +539,23 @@ ucs_status_t ucp_worker_create_ep(ucp_worker_h worker, unsigned ep_init_flags, void ucp_ep_delete(ucp_ep_h ep); +void ucp_ep_release_id(ucp_ep_h ep); + ucs_status_t ucp_ep_init_create_wireup(ucp_ep_h ep, unsigned ep_init_flags, ucp_wireup_ep_t **wireup_ep); -ucs_status_t ucp_ep_create_to_worker_addr(ucp_worker_h worker, - uint64_t local_tl_bitmap, - const ucp_unpacked_address_t *remote_address, - unsigned ep_init_flags, - const char *message, ucp_ep_h *ep_p); +ucs_status_t +ucp_ep_create_to_worker_addr(ucp_worker_h worker, + const ucp_tl_bitmap_t *local_tl_bitmap, + const ucp_unpacked_address_t *remote_address, + unsigned ep_init_flags, const char *message, + ucp_ep_h *ep_p); ucs_status_t ucp_ep_create_server_accept(ucp_worker_h worker, const ucp_conn_request_h conn_request, ucp_ep_h *ep_p); -ucs_status_ptr_t ucp_ep_flush_internal(ucp_ep_h ep, unsigned uct_flags, - unsigned req_flags, +ucs_status_ptr_t ucp_ep_flush_internal(ucp_ep_h ep, unsigned req_flags, const ucp_request_param_t *param, ucp_request_t *worker_req, ucp_request_callback_t flushed_cb, @@ -535,26 +571,28 @@ void ucp_ep_config_key_set_err_mode(ucp_ep_config_key_t *key, void ucp_ep_err_pending_purge(uct_pending_req_t *self, void *arg); +void ucp_destroyed_ep_pending_purge(uct_pending_req_t *self, void *arg); + void ucp_ep_disconnected(ucp_ep_h ep, int force); void ucp_ep_destroy_internal(ucp_ep_h ep); void ucp_ep_cleanup_lanes(ucp_ep_h ep); -int ucp_ep_is_sockaddr_stub(ucp_ep_h ep); - ucs_status_t ucp_ep_config_init(ucp_worker_h worker, ucp_ep_config_t *config, const ucp_ep_config_key_t *key); void ucp_ep_config_cleanup(ucp_worker_h worker, ucp_ep_config_t *config); -int ucp_ep_config_lane_is_peer_equal(const ucp_ep_config_key_t *key1, +int ucp_ep_config_lane_is_peer_match(const ucp_ep_config_key_t *key1, ucp_lane_index_t lane1, const ucp_ep_config_key_t *key2, ucp_lane_index_t 
lane2); void ucp_ep_config_lanes_intersect(const ucp_ep_config_key_t *key1, + const ucp_rsc_index_t *dst_rsc_indices1, const ucp_ep_config_key_t *key2, + const ucp_rsc_index_t *dst_rsc_indices2, ucp_lane_index_t *lane_map); int ucp_ep_config_is_equal(const ucp_ep_config_key_t *key1, @@ -568,13 +606,16 @@ size_t ucp_ep_config_get_zcopy_auto_thresh(size_t iovcnt, const ucp_context_h context, double bandwidth); -ucs_status_t ucp_worker_create_mem_type_endpoints(ucp_worker_h worker); +ucs_status_t ucp_worker_mem_type_eps_create(ucp_worker_h worker); -void ucp_worker_destroy_mem_type_endpoints(ucp_worker_h worker); +void ucp_worker_mem_type_eps_destroy(ucp_worker_h worker); + +void ucp_worker_mem_type_eps_print_info(ucp_worker_h worker, + FILE *stream); ucp_wireup_ep_t * ucp_ep_get_cm_wireup_ep(ucp_ep_h ep); -uint64_t ucp_ep_get_tl_bitmap(ucp_ep_h ep); +void ucp_ep_get_tl_bitmap(ucp_ep_h ep, ucp_tl_bitmap_t *tl_bitmap); uct_ep_h ucp_ep_get_cm_uct_ep(ucp_ep_h ep); @@ -588,19 +629,63 @@ void ucp_ep_invoke_err_cb(ucp_ep_h ep, ucs_status_t status); int ucp_ep_config_test_rndv_support(const ucp_ep_config_t *config); +ucs_status_t ucp_ep_flush_progress_pending(uct_pending_req_t *self); + void ucp_ep_flush_completion(uct_completion_t *self); void ucp_ep_flush_request_ff(ucp_request_t *req, ucs_status_t status); +void +ucp_ep_purge_lanes(ucp_ep_h ep, uct_pending_purge_callback_t purge_cb, + void *purge_arg); + +void ucp_ep_discard_lanes(ucp_ep_h ucp_ep, ucs_status_t status); + +void ucp_ep_register_disconnect_progress(ucp_request_t *req); + +ucp_lane_index_t ucp_ep_lookup_lane(ucp_ep_h ucp_ep, uct_ep_h uct_ep); + +/** + * @brief Do keepalive operation for a specific UCT EP. + * + * @param [in] ucp_ep UCP endpoint on which to perform keepalive. + * @param [in] uct_ep UCT Endpoint object to do keepalive on. + * @param [in] rsc_idx Resource index to check. + * @param [in] flags Flags for keepalive operation. + * @param [in] comp Pointer to keepalive completion object. + * + * @return Status of keepalive operation. + */ +ucs_status_t ucp_ep_do_uct_ep_keepalive(ucp_ep_h ucp_ep, uct_ep_h uct_ep, + ucp_rsc_index_t rsc_idx, unsigned flags, + uct_completion_t *comp); + /** * @brief Do keepalive operation. * * @param [in] ep Endpoint object to operate keepalive. * @param [in/out] lane_map Map of lanes to process. During processing bit * corresponding to processed lane is set to 0. - * Used for procerssing situation when any UCT lane + * Used to handle the situation when any UCT lane * has no resources. */ void ucp_ep_do_keepalive(ucp_ep_h ep, ucp_lane_map_t *lane_map); +/** + * @brief Purge flush and protocol requests scheduled on a given UCP endpoint. + * + * @param [in] ucp_ep Endpoint object on which requests should be + * purged. + * @param [in] status Completion status. + */ +void ucp_ep_reqs_purge(ucp_ep_h ucp_ep, ucs_status_t status); + + +/** + * @brief Create objects in VFS to represent endpoint and its features. + * + * @param [in] ep Endpoint object to be described. + */ +void ucp_ep_vfs_init(ucp_ep_h ep); + #endif diff --git a/src/ucp/core/ucp_ep.inl b/src/ucp/core/ucp_ep.inl index d645ef64e80..d9be03faacf 100644 --- a/src/ucp/core/ucp_ep.inl +++ b/src/ucp/core/ucp_ep.inl @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2016. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms.
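A hedged usage sketch for the newly declared ucp_ep_do_uct_ep_keepalive(), following its documented signature; the caller below is hypothetical and assumes a valid lane and an initialized completion object:

/* Hypothetical caller of ucp_ep_do_uct_ep_keepalive(); error handling is
 * reduced to the no-resource case. */
static void ep_lane_keepalive_example(ucp_ep_h ep, ucp_lane_index_t lane,
                                      uct_completion_t *comp)
{
    uct_ep_h uct_ep         = ep->uct_eps[lane];
    ucp_rsc_index_t rsc_idx = ucp_ep_config(ep)->key.lanes[lane].rsc_index;

    if (ucp_ep_do_uct_ep_keepalive(ep, uct_ep, rsc_idx, 0, comp) ==
        UCS_ERR_NO_RESOURCE) {
        /* No transport resources right now; per the lane_map comment on
         * ucp_ep_do_keepalive(), the caller keeps this lane's bit set and
         * retries later. */
    }
}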
*/ @@ -42,14 +42,15 @@ static inline ucp_lane_index_t ucp_ep_get_tag_lane(ucp_ep_h ep) return ucp_ep_config(ep)->key.tag_lane; } -static inline int ucp_ep_is_tag_offload_enabled(ucp_ep_config_t *config) +static inline int ucp_ep_config_key_has_tag_lane(const ucp_ep_config_key_t *key) { - ucp_lane_index_t lane = config->key.tag_lane; + ucp_lane_index_t lane = key->tag_lane; if (lane != UCP_NULL_LANE) { - ucs_assert(config->key.lanes[lane].rsc_index != UCP_NULL_RESOURCE); + ucs_assert(key->lanes[lane].rsc_index != UCP_NULL_RESOURCE); return 1; } + return 0; } @@ -145,7 +146,6 @@ static UCS_F_ALWAYS_INLINE ucp_ep_flush_state_t* ucp_ep_flush_state(ucp_ep_h ep) { ucs_assert(ep->flags & UCP_EP_FLAG_FLUSH_STATE_VALID); ucs_assert(!(ep->flags & UCP_EP_FLAG_ON_MATCH_CTX)); - ucs_assert(!(ep->flags & UCP_EP_FLAG_LISTENER)); ucs_assert(!(ep->flags & UCP_EP_FLAG_CLOSE_REQ_VALID)); return &ucp_ep_ext_gen(ep)->flush_state; } @@ -156,12 +156,24 @@ static UCS_F_ALWAYS_INLINE ucp_ep_ext_control_t* ucp_ep_ext_control(ucp_ep_h ep) return ucp_ep_ext_gen(ep)->control_ext; } +static UCS_F_ALWAYS_INLINE void ucp_ep_update_flags( + ucp_ep_h ep, uint32_t flags_add, uint32_t flags_remove) +{ + ucp_ep_flags_t ep_flags_add = (ucp_ep_flags_t)flags_add; + ucp_ep_flags_t ep_flags_remove = (ucp_ep_flags_t)flags_remove; + + UCP_WORKER_THREAD_CS_CHECK_IS_BLOCKED(ep->worker); + ucs_assert((ep_flags_add & ep_flags_remove) == 0); + + ep->flags = (ep->flags | ep_flags_add) & ~ep_flags_remove; +} + static UCS_F_ALWAYS_INLINE ucs_ptr_map_key_t ucp_ep_remote_id(ucp_ep_h ep) { #if UCS_ENABLE_ASSERT if (!(ep->flags & UCP_EP_FLAG_REMOTE_ID)) { /* Let remote side assert if it gets invalid key */ - return UCP_EP_ID_INVALID; + return UCS_PTR_MAP_KEY_INVALID; } #endif return ucp_ep_ext_control(ep)->remote_ep_id; @@ -169,7 +181,7 @@ static UCS_F_ALWAYS_INLINE ucs_ptr_map_key_t ucp_ep_remote_id(ucp_ep_h ep) static UCS_F_ALWAYS_INLINE ucs_ptr_map_key_t ucp_ep_local_id(ucp_ep_h ep) { - ucs_assert(ucp_ep_ext_control(ep)->local_ep_id != UCP_EP_ID_INVALID); + ucs_assert(ucp_ep_ext_control(ep)->local_ep_id != UCS_PTR_MAP_KEY_INVALID); return ucp_ep_ext_control(ep)->local_ep_id; } @@ -196,9 +208,9 @@ static inline void ucp_ep_update_remote_id(ucp_ep_h ep, ep, remote_id, ucp_ep_ext_control(ep)->remote_ep_id); } - ucs_assert(remote_id != UCP_EP_ID_INVALID); + ucs_assert(remote_id != UCS_PTR_MAP_KEY_INVALID); ucs_trace("ep %p: set remote_id to 0x%" PRIxPTR, ep, remote_id); - ep->flags |= UCP_EP_FLAG_REMOTE_ID; + ucp_ep_update_flags(ep, UCP_EP_FLAG_REMOTE_ID, 0); ucp_ep_ext_control(ep)->remote_ep_id = remote_id; } @@ -215,23 +227,22 @@ static inline void ucp_ep_flush_state_reset(ucp_ep_h ep) { ucp_ep_flush_state_t *flush_state = &ucp_ep_ext_gen(ep)->flush_state; - ucs_assert(!(ep->flags & (UCP_EP_FLAG_ON_MATCH_CTX | - UCP_EP_FLAG_LISTENER))); + ucs_assert(!(ep->flags & UCP_EP_FLAG_ON_MATCH_CTX)); ucs_assert(!(ep->flags & UCP_EP_FLAG_FLUSH_STATE_VALID) || ((flush_state->send_sn == 0) && (flush_state->cmpl_sn == 0) && - ucs_queue_is_empty(&flush_state->reqs))); + ucs_hlist_is_empty(&flush_state->reqs))); flush_state->send_sn = 0; flush_state->cmpl_sn = 0; - ucs_queue_head_init(&flush_state->reqs); - ep->flags |= UCP_EP_FLAG_FLUSH_STATE_VALID; + ucs_hlist_head_init(&flush_state->reqs); + ucp_ep_update_flags(ep, UCP_EP_FLAG_FLUSH_STATE_VALID, 0); } static inline void ucp_ep_flush_state_invalidate(ucp_ep_h ep) { - ucs_assert(ucs_queue_is_empty(&ucp_ep_flush_state(ep)->reqs)); - ep->flags &= ~UCP_EP_FLAG_FLUSH_STATE_VALID; + 
ucs_assert(ucs_hlist_is_empty(&ucp_ep_flush_state(ep)->reqs)); + ucp_ep_update_flags(ep, 0, UCP_EP_FLAG_FLUSH_STATE_VALID); } /* get index of the local component which can reach a remote memory domain */ @@ -278,4 +289,5 @@ static UCS_F_ALWAYS_INLINE int ucp_ep_use_indirect_id(ucp_ep_h ep) UCS_STATIC_ASSERT(sizeof(ep->flags) <= sizeof(int)); return ep->flags & UCP_EP_FLAG_INDIRECT_ID; } + #endif diff --git a/src/ucp/core/ucp_listener.c b/src/ucp/core/ucp_listener.c index 69556de1083..e04f28e4b8c 100644 --- a/src/ucp/core/ucp_listener.c +++ b/src/ucp/core/ucp_listener.c @@ -18,132 +18,46 @@ #include #include #include +#include static unsigned ucp_listener_accept_cb_progress(void *arg) { - ucp_ep_h ep = arg; - ucp_listener_h listener = ucp_ep_ext_control(ep)->listener; + ucp_conn_request_h conn_request = arg; + ucp_listener_h listener = conn_request->listener; + ucp_ep_h ep = conn_request->ep; + + ucs_free(conn_request->remote_dev_addr); + ucs_free(conn_request); - /* NOTE: protect union */ - ucs_assert(!(ep->flags & (UCP_EP_FLAG_ON_MATCH_CTX | - UCP_EP_FLAG_FLUSH_STATE_VALID))); - ucs_assert(ep->flags & UCP_EP_FLAG_LISTENER); + UCS_ASYNC_BLOCK(&ep->worker->async); - ep->flags &= ~UCP_EP_FLAG_LISTENER; - ep->flags |= UCP_EP_FLAG_USED; + ucp_ep_update_flags(ep, UCP_EP_FLAG_USED, 0); ucp_stream_ep_activate(ep); - ucp_ep_flush_state_reset(ep); - - /* - * listener is NULL if the EP was created with UCP_EP_PARAM_FIELD_EP_ADDR - * and we are here because long address requires wireup protocol - */ - if (listener && listener->accept_cb) { - listener->accept_cb(ep, listener->arg); - } + UCS_ASYNC_UNBLOCK(&ep->worker->async); + + listener->accept_cb(ep, listener->arg); return 1; } int ucp_listener_accept_cb_remove_filter(const ucs_callbackq_elem_t *elem, - void *arg) + void *arg) { - ucp_ep_h ep = elem->arg; + ucp_conn_request_h conn_request = elem->arg; - return (elem->cb == ucp_listener_accept_cb_progress) && (ep == arg); + return (elem->cb == ucp_listener_accept_cb_progress) && + (conn_request->ep == arg); } -void ucp_listener_schedule_accept_cb(ucp_ep_h ep) +void ucp_listener_schedule_accept_cb(ucp_conn_request_h conn_request) { uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL; - uct_worker_progress_register_safe(ep->worker->uct, + uct_worker_progress_register_safe(conn_request->ep->worker->uct, ucp_listener_accept_cb_progress, - ep, UCS_CALLBACKQ_FLAG_ONESHOT, - &prog_id); -} - -static unsigned ucp_listener_conn_request_progress(void *arg) -{ - ucp_conn_request_h conn_request = arg; - ucp_listener_h listener = conn_request->listener; - ucp_worker_h worker = listener->worker; - ucp_ep_h ep; - ucs_status_t status; - - ucs_trace_func("listener=%p", listener); - - if (listener->conn_cb) { - listener->conn_cb(conn_request, listener->arg); - return 1; - } - - UCS_ASYNC_BLOCK(&worker->async); - status = ucp_ep_create_server_accept(worker, conn_request, &ep); - if (status != UCS_OK) { - goto out; - } - - if (listener->accept_cb != NULL) { - if (ep->flags & UCP_EP_FLAG_LISTENER) { - ucs_assert(!(ep->flags & UCP_EP_FLAG_USED)); - ucp_ep_ext_control(ep)->listener = listener; - } else { - ep->flags |= UCP_EP_FLAG_USED; - listener->accept_cb(ep, listener->arg); - } - } - -out: - UCS_ASYNC_UNBLOCK(&worker->async); - return 1; -} - -static int ucp_listener_remove_filter(const ucs_callbackq_elem_t *elem, - void *arg) -{ - ucp_listener_h *listener = elem->arg; - - return (elem->cb == ucp_listener_conn_request_progress) && (listener == arg); -} - -static void ucp_listener_conn_request_callback(uct_iface_h 
tl_iface, void *arg, - uct_conn_request_h uct_req, - const void *conn_priv_data, - size_t length) -{ - ucp_listener_h listener = arg; - uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL; - ucp_conn_request_h conn_request; - - ucs_trace("listener %p: got connection request", listener); - - /* Defer wireup init and user's callback to be invoked from the main thread */ - conn_request = ucs_malloc(ucs_offsetof(ucp_conn_request_t, sa_data) + - length, "accept connection request"); - if (conn_request == NULL) { - ucs_error("failed to allocate connect request, " - "rejecting connection request %p on TL iface %p, reason %s", - uct_req, tl_iface, ucs_status_string(UCS_ERR_NO_MEMORY)); - uct_iface_reject(tl_iface, uct_req); - return; - } - - conn_request->listener = listener; - conn_request->uct_req = uct_req; - conn_request->uct.iface = tl_iface; - memset(&conn_request->client_address, 0, sizeof(struct sockaddr_storage)); - memcpy(&conn_request->sa_data, conn_priv_data, length); - - uct_worker_progress_register_safe(listener->worker->uct, - ucp_listener_conn_request_progress, - conn_request, UCS_CALLBACKQ_FLAG_ONESHOT, - &prog_id); - - /* If the worker supports the UCP_FEATURE_WAKEUP feature, signal the user so - * that he can wake-up on this event */ - ucp_worker_signal_internal(listener->worker); + conn_request, + UCS_CALLBACKQ_FLAG_ONESHOT, &prog_id); } ucs_status_t ucp_conn_request_query(ucp_conn_request_h conn_request, @@ -182,43 +96,27 @@ ucs_status_t ucp_listener_query(ucp_listener_h listener, return UCS_OK; } -static void ucp_listener_close_uct_listeners(ucp_listener_h listener) +static void ucp_listener_reset_uct_listeners(ucp_listener_h listener) { ucp_rsc_index_t i; - ucs_assert_always(ucp_worker_sockaddr_is_cm_proto(listener->worker)); - for (i = 0; i < listener->num_rscs; ++i) { uct_listener_destroy(listener->listeners[i]); + listener->listeners[i] = NULL; } - ucs_free(listener->listeners); - - listener->listeners = NULL; - listener->num_rscs = 0; + listener->num_rscs = 0; } -static void ucp_listener_close_ifaces(ucp_listener_h listener) +static void ucp_listener_free_uct_listeners(ucp_listener_h listener) { - ucp_worker_h worker; - int i; - - ucs_assert_always(!ucp_worker_sockaddr_is_cm_proto(listener->worker)); - - for (i = 0; i < listener->num_rscs; i++) { - worker = listener->wifaces[i]->worker; - ucs_assert_always(worker == listener->worker); - /* remove pending slow-path progress in case it wasn't removed yet */ - ucs_callbackq_remove_if(&worker->uct->progress_q, - ucp_listener_remove_filter, listener); - ucp_worker_iface_cleanup(listener->wifaces[i]); - } - - ucs_free(listener->wifaces); + ucp_listener_reset_uct_listeners(listener); + ucs_free(listener->listeners); + listener->listeners = NULL; } static ucs_status_t -ucp_listen_on_cm(ucp_listener_h listener, const ucp_listener_params_t *params) +ucp_listen(ucp_listener_h listener, const ucp_listener_params_t *params) { ucp_worker_h worker = listener->worker; const ucp_rsc_index_t num_cms = ucp_worker_num_cm_cmpts(worker); @@ -228,10 +126,12 @@ ucp_listen_on_cm(ucp_listener_h listener, const ucp_listener_params_t *params) uct_listener_params_t uct_params; uct_listener_attr_t uct_attr; uint16_t port, uct_listen_port; - ucp_rsc_index_t i; + ucp_rsc_index_t cm_index; char addr_str[UCS_SOCKADDR_STRING_LEN]; ucp_worker_cm_t *ucp_cm; ucs_status_t status; + int use_any_port; + ucs_log_level_t log_level; addr = (struct sockaddr *)&addr_storage; status = ucs_sockaddr_copy(addr, params->sockaddr.addr); @@ -239,6 +139,13 @@ 
ucp_listen_on_cm(ucp_listener_h listener, const ucp_listener_params_t *params) return status; } + status = ucs_sockaddr_get_port(addr, &port); + if (status != UCS_OK) { + return status; + } + + use_any_port = (port == 0); + ucs_assert_always(num_cms > 0); uct_params.field_mask = UCT_LISTENER_PARAM_FIELD_CONN_REQUEST_CB | @@ -259,209 +166,134 @@ ucp_listen_on_cm(ucp_listener_h listener, const ucp_listener_params_t *params) listener->listeners = uct_listeners; - for (i = 0; i < num_cms; ++i) { - ucp_cm = &worker->cms[i]; + cm_index = 0; + while (cm_index < num_cms) { + ucp_cm = &worker->cms[cm_index++]; + if (ucp_cm->cm == NULL) { + continue; + } + status = uct_listener_create(ucp_cm->cm, addr, params->sockaddr.addrlen, &uct_params, &uct_listeners[listener->num_rscs]); - if (status != UCS_OK) { - ucs_debug("failed to create UCT listener on CM %p (component %s) " - "with address %s status %s", ucp_cm->cm, - worker->context->tl_cmpts[ucp_cm->cmpt_idx].attr.name, - ucs_sockaddr_str(params->sockaddr.addr, addr_str, - UCS_SOCKADDR_STRING_LEN), - ucs_status_string(status)); - - if (status == UCS_ERR_BUSY) { - goto err_destroy_listeners; + if (status == UCS_OK) { + ++listener->num_rscs; + status = ucs_sockaddr_get_port(addr, &port); + if (status != UCS_OK) { + goto err_free_listeners; } - continue; - } - - ++listener->num_rscs; - - status = ucs_sockaddr_get_port(addr, &port); - if (status != UCS_OK) { - goto err_destroy_listeners; - } - - uct_attr.field_mask = UCT_LISTENER_ATTR_FIELD_SOCKADDR; - status = uct_listener_query(uct_listeners[listener->num_rscs - 1], - &uct_attr); - if (status != UCS_OK) { - goto err_destroy_listeners; - } + uct_attr.field_mask = UCT_LISTENER_ATTR_FIELD_SOCKADDR; + status = + uct_listener_query(uct_listeners[listener->num_rscs - 1], + &uct_attr); + if (status != UCS_OK) { + goto err_free_listeners; + } - status = ucs_sockaddr_get_port((struct sockaddr *)&uct_attr.sockaddr, - &uct_listen_port); - if (status != UCS_OK) { - goto err_destroy_listeners; - } + status = ucs_sockaddr_get_port((struct sockaddr *)&uct_attr.sockaddr, + &uct_listen_port); + if (status != UCS_OK) { + goto err_free_listeners; + } - if (port != uct_listen_port) { - ucs_assert(port == 0); - status = ucs_sockaddr_set_port(addr, uct_listen_port); + if (port != uct_listen_port) { + ucs_assert(port == 0); + status = ucs_sockaddr_set_port(addr, uct_listen_port); + if (status != UCS_OK) { + goto err_free_listeners; + } + } + } else if ((status == UCS_ERR_BUSY) && use_any_port) { + /* retry another port */ + status = ucs_sockaddr_set_port(addr, 0); if (status != UCS_OK) { - goto err_destroy_listeners; + goto err_free_listeners; + } + + ucp_listener_reset_uct_listeners(listener); + /* TODO: to reduce probability of "any port busy" need to create + * TCP listener first */ + cm_index = 0; + } else { + log_level = ((status == UCS_ERR_BUSY) || + (status == UCS_ERR_NO_DEVICE)) ? 
UCS_LOG_LEVEL_DIAG : + UCS_LOG_LEVEL_ERROR; + ucs_log(log_level, + "failed to create UCT listener on CM %p (component %s) " + "with address %s status %s", ucp_cm->cm, + worker->context->tl_cmpts[ucp_cm->cmpt_idx].attr.name, + ucs_sockaddr_str(params->sockaddr.addr, addr_str, + UCS_SOCKADDR_STRING_LEN), + ucs_status_string(status)); + if (status != UCS_ERR_NO_DEVICE) { + goto err_free_listeners; } } } if (listener->num_rscs == 0) { ucs_assert(status != UCS_OK); - goto err_destroy_listeners; + goto err_free_listeners; } status = ucs_sockaddr_copy((struct sockaddr *)&listener->sockaddr, addr); if (status != UCS_OK) { - goto err_destroy_listeners; + goto err_free_listeners; } return UCS_OK; -err_destroy_listeners: - ucp_listener_close_uct_listeners(listener); +err_free_listeners: + ucp_listener_free_uct_listeners(listener); /* if no listener was created, return the status of the last call of * uct_listener_create. else, return the error status that invoked this label. */ return status; } -static ucs_status_t -ucp_listen_on_iface(ucp_listener_h listener, - const ucp_listener_params_t *params) +static void ucp_listener_vfs_show_ip(void *obj, ucs_string_buffer_t *strb, + void *arg_ptr, uint64_t arg_u64) { - ucp_worker_h worker = listener->worker; - ucp_context_h context = listener->worker->context; - int sockaddr_tls = 0; - char saddr_str[UCS_SOCKADDR_STRING_LEN]; - ucp_tl_resource_desc_t *resource; - uct_iface_params_t iface_params; - struct sockaddr_storage *listen_sock; - ucp_worker_iface_t **tmp; - ucp_rsc_index_t tl_id; - ucs_status_t status; - ucp_tl_md_t *tl_md; - uint16_t port; - int i; - - status = ucs_sockaddr_get_port(params->sockaddr.addr, &port); - if (status != UCS_OK) { - return status; - } - - /* Go through all the available resources and for each one, check if the given - * sockaddr is accessible from its md. Start listening on all the mds that - * satisfy this. - * If the given port is set to 0, i.e. use a random port, the first transport - * in the sockaddr priority list from the environment configuration will - * dictate the port to listen on for the other sockaddr transports in the list. - * */ - for (i = 0; i < context->config.num_sockaddr_tls; i++) { - tl_id = context->config.sockaddr_tl_ids[i]; - resource = &context->tl_rscs[tl_id]; - tl_md = &context->tl_mds[resource->md_index]; - - if (!uct_md_is_sockaddr_accessible(tl_md->md, ¶ms->sockaddr, - UCT_SOCKADDR_ACC_LOCAL)) { - continue; - } - - tmp = ucs_realloc(listener->wifaces, - sizeof(*tmp) * (sockaddr_tls + 1), - "listener wifaces"); - if (tmp == NULL) { - ucs_error("failed to allocate listener wifaces"); - status = UCS_ERR_NO_MEMORY; - goto err_close_listener_wifaces; - } - - listener->wifaces = tmp; - - iface_params.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | - UCT_IFACE_PARAM_FIELD_SOCKADDR; - iface_params.open_mode = UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER; - iface_params.mode.sockaddr.conn_request_cb = ucp_listener_conn_request_callback; - iface_params.mode.sockaddr.conn_request_arg = listener; - iface_params.mode.sockaddr.listen_sockaddr = params->sockaddr; - iface_params.mode.sockaddr.cb_flags = UCT_CB_FLAG_ASYNC; - - if (port) { - /* Set the port for the next sockaddr iface. 
This port was either - * obtained from the user or generated by the first created sockaddr - * iface if the port from the user was equal to zero */ - status = ucs_sockaddr_set_port( - (struct sockaddr *) - iface_params.mode.sockaddr.listen_sockaddr.addr, port); - if (status != UCS_OK) { - ucs_error("failed to set port parameter (%d) for creating %s iface", - port, resource->tl_rsc.tl_name); - goto err_close_listener_wifaces; - } - } - - status = ucp_worker_iface_open(worker, tl_id, &iface_params, - &listener->wifaces[sockaddr_tls]); - if (status != UCS_OK) { - ucs_error("failed to open listener on %s on md %s", - ucs_sockaddr_str( - iface_params.mode.sockaddr.listen_sockaddr.addr, - saddr_str, sizeof(saddr_str)), - tl_md->rsc.md_name); - goto err_close_listener_wifaces; - } - - status = ucp_worker_iface_init(worker, tl_id, - listener->wifaces[sockaddr_tls]); - if ((status != UCS_OK) || - ((context->config.features & UCP_FEATURE_WAKEUP) && - !(listener->wifaces[sockaddr_tls]->attr.cap.flags & - UCT_IFACE_FLAG_CB_ASYNC))) { - ucp_worker_iface_cleanup(listener->wifaces[sockaddr_tls]); - goto err_close_listener_wifaces; - } - - listen_sock = &listener->wifaces[sockaddr_tls]->attr.listen_sockaddr; - status = ucs_sockaddr_get_port((struct sockaddr *)listen_sock, &port); - if (status != UCS_OK) { - goto err_close_listener_wifaces; - } + ucp_listener_h listener = obj; + struct sockaddr *sockaddr = (struct sockaddr*)&listener->sockaddr; + char ip_str[UCS_SOCKADDR_STRING_LEN]; - sockaddr_tls++; - listener->num_rscs = sockaddr_tls; - ucs_trace("listener %p: accepting connections on %s on %s", - listener, tl_md->rsc.md_name, - ucs_sockaddr_str(iface_params.mode.sockaddr.listen_sockaddr.addr, - saddr_str, sizeof(saddr_str))); + if (ucs_sockaddr_get_ipstr(sockaddr, ip_str, UCS_SOCKADDR_STRING_LEN) == + UCS_OK) { + ucs_string_buffer_appendf(strb, "%s\n", ip_str); + } else { + ucs_string_buffer_appendf(strb, "\n"); } +} - if (!sockaddr_tls) { - ucs_error("none of the available transports can listen for connections on %s", - ucs_sockaddr_str(params->sockaddr.addr, saddr_str, - sizeof(saddr_str))); - listener->num_rscs = 0; - status = UCS_ERR_UNREACHABLE; - goto err_close_listener_wifaces; - } +static void ucp_listener_vfs_show_port(void *obj, ucs_string_buffer_t *strb, + void *arg_ptr, uint64_t arg_u64) +{ + ucp_listener_h listener = obj; + struct sockaddr *sockaddr = (struct sockaddr*)&listener->sockaddr; + uint16_t port; - listen_sock = &listener->wifaces[sockaddr_tls - 1]->attr.listen_sockaddr; - status = ucs_sockaddr_copy((struct sockaddr *)&listener->sockaddr, - (struct sockaddr *)listen_sock); - if (status != UCS_OK) { - goto err_close_listener_wifaces; + if (ucs_sockaddr_get_port(sockaddr, &port) == UCS_OK) { + ucs_string_buffer_appendf(strb, "%u\n", port); + } else { + ucs_string_buffer_appendf(strb, "\n"); } +} - return UCS_OK; - -err_close_listener_wifaces: - ucp_listener_close_ifaces(listener); - return status; +void ucp_listener_vfs_init(ucp_listener_h listener) +{ + ucs_vfs_obj_add_dir(listener->worker, listener, "listener/%p", listener); + ucs_vfs_obj_add_ro_file(listener, ucp_listener_vfs_show_ip, NULL, 0, "ip"); + ucs_vfs_obj_add_ro_file(listener, ucp_listener_vfs_show_port, NULL, 0, + "port"); } ucs_status_t ucp_listener_create(ucp_worker_h worker, const ucp_listener_params_t *params, ucp_listener_h *listener_p) { + const unsigned handlers_mask = UCP_LISTENER_PARAM_FIELD_ACCEPT_HANDLER | + UCP_LISTENER_PARAM_FIELD_CONN_HANDLER; ucp_listener_h listener; ucs_status_t status; @@ -472,13 
+304,17 @@ ucs_status_t ucp_listener_create(ucp_worker_h worker, UCP_CHECK_PARAM_NON_NULL(params->sockaddr.addr, status, return status); - if (ucs_test_all_flags(params->field_mask, - UCP_LISTENER_PARAM_FIELD_ACCEPT_HANDLER | - UCP_LISTENER_PARAM_FIELD_CONN_HANDLER)) { - ucs_error("only one accept handler should be provided"); + if (ucs_test_all_flags(params->field_mask, handlers_mask) || + !(params->field_mask & handlers_mask)) { + ucs_error("one and only one accept handler should be provided"); return UCS_ERR_INVALID_PARAM; } + if (ucp_worker_num_cm_cmpts(worker) == 0) { + ucs_error("cannot create listener: none of the available components supports it"); + return UCS_ERR_UNSUPPORTED; + } + listener = ucs_calloc(1, sizeof(*listener), "ucp_listener"); if (listener == NULL) { ucs_error("cannot allocate memory for UCP listener"); @@ -501,13 +337,9 @@ ucs_status_t ucp_listener_create(ucp_worker_h worker, listener->arg = params->conn_handler.arg; } - if (ucp_worker_sockaddr_is_cm_proto(worker)) { - status = ucp_listen_on_cm(listener, params); - } else { - status = ucp_listen_on_iface(listener, params); - } - + status = ucp_listen(listener, params); if (status == UCS_OK) { + ucp_listener_vfs_init(listener); *listener_p = listener; goto out; } @@ -523,12 +355,14 @@ void ucp_listener_destroy(ucp_listener_h listener) { ucs_trace("listener %p: destroying", listener); - if (ucp_worker_sockaddr_is_cm_proto(listener->worker)) { - ucp_listener_close_uct_listeners(listener); - } else { - ucp_listener_close_ifaces(listener); - } + UCS_ASYNC_BLOCK(&listener->worker->async); + ucs_vfs_obj_remove(listener); + ucs_callbackq_remove_if(&listener->worker->uct->progress_q, + ucp_cm_server_conn_request_progress_cb_pred, + listener); + UCS_ASYNC_UNBLOCK(&listener->worker->async); + ucp_listener_free_uct_listeners(listener); ucs_free(listener); } @@ -537,15 +371,11 @@ ucs_status_t ucp_listener_reject(ucp_listener_h listener, { ucp_worker_h worker = listener->worker; - UCS_ASYNC_BLOCK(&worker->async); - - if (ucp_worker_sockaddr_is_cm_proto(worker)) { - uct_listener_reject(conn_request->uct.listener, conn_request->uct_req); - ucs_free(conn_request->remote_dev_addr); - } else { - uct_iface_reject(conn_request->uct.iface, conn_request->uct_req); - } + ucs_trace("listener %p: free conn_request %p", listener, conn_request); + UCS_ASYNC_BLOCK(&worker->async); + uct_listener_reject(conn_request->uct_listener, conn_request->uct_req); + ucs_free(conn_request->remote_dev_addr); UCS_ASYNC_UNBLOCK(&worker->async); ucs_free(conn_request); diff --git a/src/ucp/core/ucp_listener.h b/src/ucp/core/ucp_listener.h index 5385a2e93dc..c4f9b35469f 100644 --- a/src/ucp/core/ucp_listener.h +++ b/src/ucp/core/ucp_listener.h @@ -40,7 +40,7 @@ typedef struct ucp_listener { } ucp_listener_t; -void ucp_listener_schedule_accept_cb(ucp_ep_h ep); +void ucp_listener_schedule_accept_cb(ucp_conn_request_h conn_request); int ucp_listener_accept_cb_remove_filter(const ucs_callbackq_elem_t *elem, void *arg); diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c index a86f9c9e5d2..b6979388fa3 100644 --- a/src/ucp/core/ucp_mm.c +++ b/src/ucp/core/ucp_mm.c @@ -102,31 +102,31 @@ ucs_status_t ucp_mem_rereg_mds(ucp_context_h context, ucp_md_map_t reg_md_map, ucs_assert(alloc_md_memh_p != NULL); uct_memh[memh_index++] = *alloc_md_memh_p; new_md_map |= UCS_BIT(md_index); - } else if (!length) { + } else if (length == 0) { /* don't register zero-length regions */ continue; } else if (md_attr->cap.flags & UCT_MD_FLAG_REG) { - if 
(!(md_attr->cap.reg_mem_types & UCS_BIT(mem_type))) { - status = UCS_ERR_UNSUPPORTED; - } else { - ucs_assert(address && length); + ucs_assert(address != NULL); - /* MD supports registration, register new memh on it */ - status = uct_md_mem_reg(context->tl_mds[md_index].md, address, - length, uct_flags, &uct_memh[memh_index]); + if (!(md_attr->cap.reg_mem_types & UCS_BIT(mem_type))) { + continue; } + /* MD supports registration, register new memh on it */ + status = uct_md_mem_reg(context->tl_mds[md_index].md, address, + length, uct_flags, &uct_memh[memh_index]); if (status == UCS_OK) { - ucs_trace("registered address %p length %zu on md[%d] memh[%d]=%p", - address, length, md_index, memh_index, - uct_memh[memh_index]); + ucs_trace("registered address %p length %zu on md[%d]" + " memh[%d]=%p", + address, length, md_index, memh_index, + uct_memh[memh_index]); new_md_map |= UCS_BIT(md_index); ++memh_index; continue; } level = (uct_flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? - UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR; + UCS_LOG_LEVEL_DIAG : UCS_LOG_LEVEL_ERROR; ucs_log(level, "failed to register address %p mem_type bit 0x%lx length %zu on " @@ -308,8 +308,7 @@ static ucs_status_t ucp_mem_map_common(ucp_context_h context, void *address, goto err_free_memh; } } else { - memh->mem_type = ucp_get_memory_type(context, address, length, - memory_type); + memh->mem_type = memory_type; memh->alloc_method = UCT_ALLOC_METHOD_LAST; memh->alloc_md = NULL; memh->md_map = 0; @@ -390,10 +389,11 @@ static ucs_status_t ucp_mem_unmap_common(ucp_context_h context, ucp_mem_h memh) ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *params, ucp_mem_h *memh_p) { - ucs_status_t status; - void *address; - unsigned flags; ucs_memory_type_t memory_type; + ucs_memory_info_t mem_info; + ucs_status_t status; + unsigned flags; + void *address; /* always acquire context lock */ UCP_THREAD_CS_ENTER(&context->mt_lock); @@ -405,11 +405,8 @@ ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *para goto out; } - address = UCP_PARAM_VALUE(MEM_MAP, params, address, ADDRESS, NULL); - flags = UCP_PARAM_VALUE(MEM_MAP, params, flags, FLAGS, 0); - memory_type = UCP_PARAM_VALUE(MEM_MAP, params, memory_type, MEMORY_TYPE, - (flags & UCP_MEM_MAP_ALLOCATE) ? 
- UCS_MEMORY_TYPE_HOST : UCS_MEMORY_TYPE_UNKNOWN); + address = UCP_PARAM_VALUE(MEM_MAP, params, address, ADDRESS, NULL); + flags = UCP_PARAM_VALUE(MEM_MAP, params, flags, FLAGS, 0); if ((flags & UCP_MEM_MAP_FIXED) && ((uintptr_t)address % ucs_get_page_size())) { @@ -438,6 +435,22 @@ ucs_status_t ucp_mem_map(ucp_context_h context, const ucp_mem_map_params_t *para goto out; } + if (flags & UCP_MEM_MAP_ALLOCATE) { + memory_type = UCS_MEMORY_TYPE_HOST; + } else if (!(params->field_mask & UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE) || + (params->memory_type == UCS_MEMORY_TYPE_UNKNOWN)) { + ucp_memory_detect(context, address, params->length, &mem_info); + memory_type = mem_info.type; + } else { + if (params->memory_type > UCS_MEMORY_TYPE_LAST) { + ucs_error("invalid memory type %d", params->memory_type); + status = UCS_ERR_INVALID_PARAM; + goto out; + } + + memory_type = params->memory_type; + } + status = ucp_mem_map_common(context, address, params->length, memory_type, ucp_mem_map_params2uct_flags(params), ucp_mem_map_is_allocate(params), @@ -549,6 +562,10 @@ ucs_status_t ucp_mem_query(const ucp_mem_h memh, ucp_mem_attr_t *attr) attr->length = memh->length; } + if (attr->field_mask & UCP_MEM_ATTR_FIELD_MEM_TYPE) { + attr->mem_type = memh->mem_type; + } + return UCS_OK; } diff --git a/src/ucp/core/ucp_mm.h b/src/ucp/core/ucp_mm.h index b130906ebda..77634fb79e5 100644 --- a/src/ucp/core/ucp_mm.h +++ b/src/ucp/core/ucp_mm.h @@ -28,7 +28,7 @@ typedef struct ucp_mem { void *address; /* Region start address */ size_t length; /* Region length */ uct_alloc_method_t alloc_method; /* Method used to allocate the memory */ - ucs_memory_type_t mem_type; /**< type of allocated memory */ + ucs_memory_type_t mem_type; /* Type of allocated or registered memory */ uct_md_h alloc_md; /* MD used to allocated the memory */ ucp_md_map_t md_map; /* Which MDs have valid memory handles */ uct_mem_h uct[0]; /* Valid memory handles, as popcount(md_map) */ @@ -131,5 +131,7 @@ ucp_memh2uct(ucp_mem_h memh, ucp_md_index_t md_idx) #define UCP_MEM_IS_ROCM_MANAGED(_mem_type) ((_mem_type) == UCS_MEMORY_TYPE_ROCM_MANAGED) #define UCP_MEM_IS_ACCESSIBLE_FROM_CPU(_mem_type) \ (UCS_BIT(_mem_type) & UCS_MEMORY_TYPES_CPU_ACCESSIBLE) +#define UCP_MEM_IS_GPU(_mem_type) ((_mem_type) == UCS_MEMORY_TYPE_CUDA || \ + (_mem_type) == UCS_MEMORY_TYPE_ROCM) #endif diff --git a/src/ucp/core/ucp_proxy_ep.c b/src/ucp/core/ucp_proxy_ep.c index 8e384205f37..970b855394e 100644 --- a/src/ucp/core/ucp_proxy_ep.c +++ b/src/ucp/core/ucp_proxy_ep.c @@ -68,6 +68,8 @@ UCP_PROXY_EP_DEFINE_OP(ucs_status_t, get_zcopy, const uct_iov_t*, size_t, uint64_t, uct_rkey_t, uct_completion_t*) UCP_PROXY_EP_DEFINE_OP(ucs_status_t, am_short, uint8_t, uint64_t, const void*, unsigned) +UCP_PROXY_EP_DEFINE_OP(ucs_status_t, am_short_iov, uint8_t, const uct_iov_t*, + size_t) UCP_PROXY_EP_DEFINE_OP(ssize_t, am_bcopy, uint8_t, uct_pack_callback_t, void*, unsigned) UCP_PROXY_EP_DEFINE_OP(ucs_status_t, am_zcopy, uint8_t, const void*, unsigned, @@ -131,6 +133,7 @@ UCS_CLASS_INIT_FUNC(ucp_proxy_ep_t, const uct_iface_ops_t *ops, ucp_ep_h ucp_ep, UCP_PROXY_EP_SET_OP(ep_get_bcopy); UCP_PROXY_EP_SET_OP(ep_get_zcopy); UCP_PROXY_EP_SET_OP(ep_am_short); + UCP_PROXY_EP_SET_OP(ep_am_short_iov); UCP_PROXY_EP_SET_OP(ep_am_bcopy); UCP_PROXY_EP_SET_OP(ep_am_zcopy); UCP_PROXY_EP_SET_OP(ep_atomic_cswap64); diff --git a/src/ucp/core/ucp_request.c b/src/ucp/core/ucp_request.c index 307b4a77aef..b90ce97d7bb 100644 --- a/src/ucp/core/ucp_request.c +++ b/src/ucp/core/ucp_request.c @@ -15,7 +15,7 @@ #include 
#include -#include +#include #include @@ -133,11 +133,14 @@ UCS_PROFILE_FUNC_VOID(ucp_request_cancel, (worker, request), } } -static void ucp_worker_request_init_proxy(ucs_mpool_t *mp, void *obj, void *chunk) +static void +ucp_worker_request_init_proxy(ucs_mpool_t *mp, void *obj, void *chunk) { - ucp_worker_h worker = ucs_container_of(mp, ucp_worker_t, req_mp); + ucp_worker_h worker = ucs_container_of(mp, ucp_worker_t, req_mp); ucp_context_h context = worker->context; - ucp_request_t *req = obj; + ucp_request_t *req = obj; + + ucp_request_id_reset(req); if (context->config.request.init != NULL) { context->config.request.init(req + 1); @@ -146,9 +149,11 @@ static void ucp_worker_request_init_proxy(ucs_mpool_t *mp, void *obj, void *chun static void ucp_worker_request_fini_proxy(ucs_mpool_t *mp, void *obj) { - ucp_worker_h worker = ucs_container_of(mp, ucp_worker_t, req_mp); + ucp_worker_h worker = ucs_container_of(mp, ucp_worker_t, req_mp); ucp_context_h context = worker->context; - ucp_request_t *req = obj; + ucp_request_t *req = obj; + + ucp_request_id_check(req, ==, UCS_PTR_MAP_KEY_INVALID); if (context->config.request.cleanup != NULL) { context->config.request.cleanup(req + 1); @@ -366,6 +371,7 @@ ucp_request_send_start(ucp_request_t *req, ssize_t max_short, ucp_request_init_multi_proto(req, proto->bcopy_multi, "start_bcopy_multi"); } + return UCS_OK; } else if (length < zcopy_max) { /* zcopy */ @@ -396,6 +402,7 @@ ucp_request_send_start(ucp_request_t *req, ssize_t max_short, req->send.uct.func = proto->zcopy_single; UCS_PROFILE_REQUEST_EVENT(req, "start_zcopy_single", req->send.length); } + return UCS_OK; } @@ -404,18 +411,31 @@ ucp_request_send_start(ucp_request_t *req, ssize_t max_short, void ucp_request_send_state_ff(ucp_request_t *req, ucs_status_t status) { - /* - * FIXME should not fast-forward requests owned by UCT - */ - ucp_trace_req(req, "fast-forward with status %s", ucs_status_string(status)); + ucp_trace_req(req, "fast-forward with status %s", + ucs_status_string(status)); - if (req->send.state.uct_comp.func == ucp_ep_flush_completion) { + /* Set REMOTE_COMPLETED flag to make sure that TAG/Sync operations will be + * fully completed here */ + req->flags |= UCP_REQUEST_FLAG_SYNC_REMOTE_COMPLETED; + ucp_send_request_id_release(req); + + if (req->send.uct.func == ucp_proto_progress_am_single) { + req->send.proto.comp_cb(req); + } else if (req->send.state.uct_comp.func == ucp_ep_flush_completion) { ucp_ep_flush_request_ff(req, status); - } else if (req->send.state.uct_comp.func) { - req->send.state.dt.offset = req->send.length; - req->send.state.uct_comp.count = 0; + } else if (req->send.state.uct_comp.func != NULL) { + /* Fast-forward the sending state to complete the operation when last + * network completion callback is called + */ + req->send.state.dt.offset = req->send.length; uct_completion_update_status(&req->send.state.uct_comp, status); - req->send.state.uct_comp.func(&req->send.state.uct_comp); + + if (req->send.state.uct_comp.count == 0) { + /* If nothing is in-flight, call completion callback to ensure + * cleanup of zero-copy resources + */ + req->send.state.uct_comp.func(&req->send.state.uct_comp); + } } else { ucp_request_complete_send(req, status); } diff --git a/src/ucp/core/ucp_request.h b/src/ucp/core/ucp_request.h index 3853c9b3f4e..30dda94e4ac 100644 --- a/src/ucp/core/ucp_request.h +++ b/src/ucp/core/ucp_request.h @@ -26,9 +26,6 @@ #include -#define UCP_REQUEST_ID_INVALID 0 - - #define ucp_trace_req(_sreq, _message, ...) 
\ ucs_trace_req("req %p: " _message, (_sreq), ## __VA_ARGS__) @@ -37,28 +34,30 @@ * Request flags */ enum { - UCP_REQUEST_FLAG_COMPLETED = UCS_BIT(0), - UCP_REQUEST_FLAG_RELEASED = UCS_BIT(1), - UCP_REQUEST_FLAG_EXPECTED = UCS_BIT(3), - UCP_REQUEST_FLAG_LOCAL_COMPLETED = UCS_BIT(4), - UCP_REQUEST_FLAG_REMOTE_COMPLETED = UCS_BIT(5), - UCP_REQUEST_FLAG_CALLBACK = UCS_BIT(6), - UCP_REQUEST_FLAG_PROTO_INITIALIZED = UCS_BIT(7), - UCP_REQUEST_FLAG_SYNC = UCS_BIT(8), - UCP_REQUEST_FLAG_OFFLOADED = UCS_BIT(10), - UCP_REQUEST_FLAG_BLOCK_OFFLOAD = UCS_BIT(11), - UCP_REQUEST_FLAG_STREAM_RECV_WAITALL = UCS_BIT(12), - UCP_REQUEST_FLAG_SEND_AM = UCS_BIT(13), - UCP_REQUEST_FLAG_SEND_TAG = UCS_BIT(14), - UCP_REQUEST_FLAG_RNDV_FRAG = UCS_BIT(15), - UCP_REQUEST_FLAG_RECV_AM = UCS_BIT(16), - UCP_REQUEST_FLAG_RECV_TAG = UCS_BIT(17), + UCP_REQUEST_FLAG_COMPLETED = UCS_BIT(0), + UCP_REQUEST_FLAG_RELEASED = UCS_BIT(1), + UCP_REQUEST_FLAG_EXPECTED = UCS_BIT(3), + UCP_REQUEST_FLAG_SYNC_LOCAL_COMPLETED = UCS_BIT(4), + UCP_REQUEST_FLAG_SYNC_REMOTE_COMPLETED = UCS_BIT(5), + UCP_REQUEST_FLAG_CALLBACK = UCS_BIT(6), + UCP_REQUEST_FLAG_PROTO_INITIALIZED = UCS_BIT(7), + UCP_REQUEST_FLAG_SYNC = UCS_BIT(8), + UCP_REQUEST_FLAG_OFFLOADED = UCS_BIT(10), + UCP_REQUEST_FLAG_BLOCK_OFFLOAD = UCS_BIT(11), + UCP_REQUEST_FLAG_STREAM_RECV_WAITALL = UCS_BIT(12), + UCP_REQUEST_FLAG_SEND_AM = UCS_BIT(13), + UCP_REQUEST_FLAG_SEND_TAG = UCS_BIT(14), + UCP_REQUEST_FLAG_RNDV_FRAG = UCS_BIT(15), + UCP_REQUEST_FLAG_RECV_AM = UCS_BIT(16), + UCP_REQUEST_FLAG_RECV_TAG = UCS_BIT(17), #if UCS_ENABLE_ASSERT - UCP_REQUEST_FLAG_STREAM_RECV = UCS_BIT(18), - UCP_REQUEST_DEBUG_FLAG_EXTERNAL = UCS_BIT(19) + UCP_REQUEST_FLAG_STREAM_RECV = UCS_BIT(18), + UCP_REQUEST_DEBUG_FLAG_EXTERNAL = UCS_BIT(19), + UCP_REQUEST_FLAG_SUPER_VALID = UCS_BIT(20) #else - UCP_REQUEST_FLAG_STREAM_RECV = 0, - UCP_REQUEST_DEBUG_FLAG_EXTERNAL = 0 + UCP_REQUEST_FLAG_STREAM_RECV = 0, + UCP_REQUEST_DEBUG_FLAG_EXTERNAL = 0, + UCP_REQUEST_FLAG_SUPER_VALID = 0 #endif }; @@ -79,19 +78,25 @@ enum { * Receive descriptor flags. */ enum { - UCP_RECV_DESC_FLAG_UCT_DESC = UCS_BIT(0), /* Descriptor allocated by UCT */ - UCP_RECV_DESC_FLAG_EAGER = UCS_BIT(1), /* Eager tag message */ - UCP_RECV_DESC_FLAG_EAGER_ONLY = UCS_BIT(2), /* Eager tag message with single fragment */ - UCP_RECV_DESC_FLAG_EAGER_SYNC = UCS_BIT(3), /* Eager tag message which requires reply */ - UCP_RECV_DESC_FLAG_EAGER_OFFLOAD = UCS_BIT(4), /* Eager tag from offload */ - UCP_RECV_DESC_FLAG_EAGER_LAST = UCS_BIT(5), /* Last fragment of eager tag message. - Used by tag offload protocol. */ - UCP_RECV_DESC_FLAG_RNDV = UCS_BIT(6), /* Rendezvous request */ - UCP_RECV_DESC_FLAG_RNDV_STARTED = UCS_BIT(7), /* Rendezvous receive was initiated - (in AM API) */ - UCP_RECV_DESC_FLAG_MALLOC = UCS_BIT(8) /* Descriptor was allocated with malloc - and must be freed, not returned to the - memory pool or UCT */ + UCP_RECV_DESC_FLAG_UCT_DESC = UCS_BIT(0), /* Descriptor allocated by UCT */ + UCP_RECV_DESC_FLAG_EAGER = UCS_BIT(1), /* Eager tag message */ + UCP_RECV_DESC_FLAG_EAGER_ONLY = UCS_BIT(2), /* Eager tag message with single fragment */ + UCP_RECV_DESC_FLAG_EAGER_SYNC = UCS_BIT(3), /* Eager tag message which requires reply */ + UCP_RECV_DESC_FLAG_EAGER_OFFLOAD = UCS_BIT(4), /* Eager tag from offload */ + UCP_RECV_DESC_FLAG_EAGER_LAST = UCS_BIT(5), /* Last fragment of eager tag message. + Used by tag offload protocol. 
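The LOCAL/REMOTE completion flags are renamed above with a SYNC_ prefix, implying that a synchronous send finishes only once both are set. A minimal illustrative predicate (hypothetical, not part of this patch):

/* Hypothetical predicate built on the renamed flags: a sync send request is
 * fully complete only when both local and remote completions have arrived. */
static int request_sync_send_is_complete(const ucp_request_t *req)
{
    const uint32_t done_flags = UCP_REQUEST_FLAG_SYNC_LOCAL_COMPLETED |
                                UCP_REQUEST_FLAG_SYNC_REMOTE_COMPLETED;

    return !(req->flags & UCP_REQUEST_FLAG_SYNC) ||
           ucs_test_all_flags(req->flags, done_flags);
}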
*/ + UCP_RECV_DESC_FLAG_RNDV = UCS_BIT(6), /* Rendezvous request */ + UCP_RECV_DESC_FLAG_RECV_STARTED = UCS_BIT(7), /* Receive operation on this descriptor + was initiated by ucp_am_recv_data_nbx */ + UCP_RECV_DESC_FLAG_MALLOC = UCS_BIT(8), /* Descriptor was allocated with malloc + and must be freed, not returned to the + memory pool or UCT */ + UCP_RECV_DESC_FLAG_AM_CB_INPROGRESS = UCS_BIT(9), /* Descriptor should not be released, + because UCT AM callback is still in + the call stack and descriptor is not + initialized yet. */ + UCP_RECV_DESC_FLAG_RELEASED = UCS_BIT(10) /* Indicates that the descriptor was + released and cannot be used. */ }; @@ -108,8 +113,13 @@ enum { * Request in progress. */ struct ucp_request { - ucs_status_t status; /* Operation status */ - uint32_t flags; /* Request flags */ + /* Operation status */ + ucs_status_t status; + /* Request flags */ + uint32_t flags; + /* Local request ID taken from PTR MAP */ + ucs_ptr_map_key_t id; + union { void *user_data; /* Completion user data */ ucp_request_t *super_req; /* Super request that is used @@ -122,26 +132,37 @@ struct ucp_request { * operations */ struct { ucp_ep_h ep; - void *buffer; /* Send buffer */ - ucp_datatype_t datatype; /* Send type */ - size_t length; /* Total length, in bytes */ - ucp_send_nbx_callback_t cb; /* Completion callback */ + union { + void *buffer; /* Send buffer */ + ucp_request_callback_t flushed_cb; /* Called when flushed */ + }; + ucp_datatype_t datatype; /* Send type */ + size_t length; /* Total length, in bytes */ + ucp_send_nbx_callback_t cb; /* Completion callback */ + ucs_hlist_link_t list; /* Element in the per-EP list of UCP + flush/proto requests */ const ucp_proto_config_t *proto_config; /* Selected protocol for the request */ - ucp_datatype_iter_t dt_iter; /* Send buffer state */ + + /* This structure holds all mutable fields, and everything else + * except common send/recv fields 'status' and 'flags' is immutable + * TODO: rework RMA case where length is used instead of dt.offset */ + struct { + union { + ucp_datatype_iter_t dt_iter; /* Send buffer state */ + ucp_dt_state_t dt; /* Position in the send buffer */ + }; + uct_completion_t uct_comp; /* UCT completion used by flush */ + } state; union { ucp_wireup_msg_t wireup; struct { - uint64_t message_id; /* used to identify matching parts - of a large message */ - ucs_ptr_map_key_t rreq_id; /* receive request ID on the - recv side (used in AM rndv) */ + /* Used to identify matching parts of a large message */ + uint64_t message_id; union { - struct { - ucp_tag_t tag; - } tag; + ucp_tag_t tag; struct { union { @@ -160,13 +181,13 @@ struct ucp_request { } msg_proto; struct { - uint64_t remote_addr; /* Remote address */ - ucp_rkey_h rkey; /* Remote memory key */ + uint64_t remote_addr; /* Remote address */ + ucp_rkey_h rkey; /* Remote memory key */ } rma; struct { - ucs_ptr_map_key_t remote_req_id; /* send request ID on - receiver side */ + /* Remote request ID received from a peer */ + ucs_ptr_map_key_t remote_req_id; uint8_t am_id; ucs_status_t status; ucp_tag_t sender_tag; /* Sender tag, which is @@ -181,58 +202,65 @@ struct ucp_request { } proxy; struct { - uint64_t remote_address; /* address of the sender's data buffer */ - ucs_ptr_map_key_t remote_req_id; /* the sender's request ID */ - ucp_rkey_h rkey; /* key for remote send buffer */ - ucp_lane_map_t lanes_map_all; /* actual lanes map */ - uint8_t lanes_count; /* actual lanes count */ - uint8_t rkey_index[UCP_MAX_LANES]; - } rndv_get; + uint64_t remote_address; /* address of 
the sender/receiver's data + buffer for the GET/PUT operation */ + /* Remote request ID received from a peer */ + ucs_ptr_map_key_t remote_req_id; + ucp_rkey_h rkey; /* key for remote send/receive buffer for + the GET/PUT operation */ + union { + struct { + ucp_lane_map_t lanes_map_all; /* actual lanes map */ + uint8_t lanes_count; /* actual lanes count */ + uint8_t rkey_index[UCP_MAX_LANES]; + }; + struct { + ucs_ptr_map_key_t rreq_id; /* id of receive request */ + } rtr; + }; + } rndv; struct { - uint64_t remote_address; /* address of the receiver's data buffer */ - ucs_ptr_map_key_t rreq_remote_id; /* receiver's receive request ID */ - ucp_rkey_h rkey; /* key for remote receive buffer */ - uct_rkey_t uct_rkey; /* UCT remote key */ - } rndv_put; + /* Remote request ID received from a peer */ + ucs_ptr_map_key_t remote_req_id; + } rndv_data; struct { - ucs_queue_elem_t queue_elem; - ucs_ptr_map_key_t req_id; /* sender's request ID */ - ucp_rkey_h rkey; /* key for remote send buffer */ + /* Element in queue for segmented RKEY ptr */ + ucs_queue_elem_t queue_elem; + /* Remote request ID received from a peer */ + ucs_ptr_map_key_t remote_req_id; + /* Key for remote send buffer */ + ucp_rkey_h rkey; } rkey_ptr; struct { - ucs_ptr_map_key_t req_id; /* the send request ID on receiver side */ - size_t length; /* the length of the data that should be fetched - * from sender side */ - size_t offset; /* offset in recv buffer */ + /* The length of the data that should be fetched from sender + * side */ + size_t length; + /* Offset in the receiver's buffer */ + size_t offset; } rndv_rtr; struct { - ucp_request_callback_t flushed_cb;/* Called when flushed */ - ucs_queue_elem_t queue; /* Queue element in proto_status */ - unsigned uct_flags; /* Flags to pass to @ref uct_ep_flush */ - uct_worker_cb_id_t prog_id; /* Progress callback ID */ - uint32_t cmpl_sn; /* Sequence number of the remote completion this request is waiting for */ - uint8_t sw_started; - uint8_t sw_done; - uint8_t num_lanes; /* How many lanes are being flushed */ - ucp_lane_map_t started_lanes;/* Which lanes need were flushed */ + unsigned uct_flags; /* Flags to pass to @ref uct_ep_flush */ + uct_worker_cb_id_t prog_id; /* Progress callback ID */ + uint32_t cmpl_sn; /* Sequence number of the remote completion + this request is waiting for */ + uint8_t sw_started; + uint8_t sw_done; + uint8_t num_lanes; /* How many lanes are being flushed */ + ucp_lane_map_t started_lanes; /* Which lanes have started flushing */ } flush; struct { - uct_worker_cb_id_t prog_id;/* Slow-path callback */ - } disconnect; - - struct { - ucp_worker_h ucp_worker; /* UCP worker where a discard UCT EP - * operation submitted on */ - uct_ep_h uct_ep; /* UCT EP that should be flushed and destroyed */ - unsigned ep_flush_flags; /* Flags that should be passed into @ref uct_ep_flush */ + /* UCT EP that should be flushed and destroyed */ + uct_ep_h uct_ep; + /* Flags that should be passed into @ref uct_ep_flush */ + unsigned ep_flush_flags; + /* Progress ID; if it is UCS_CALLBACKQ_ID_NULL, no operations + * are in progress */ + uct_worker_cb_id_t cb_id; } discard_uct_ep; struct { @@ -250,29 +278,24 @@ struct ucp_request { } tag_offload; struct { - ucs_ptr_map_key_t req_id; /* Remote get request ID */ + /* Remote request ID received from a peer */ +
ucs_ptr_map_key_t remote_req_id; + /* Atomic reply data */ + ucp_atomic_reply_t data; } atomic_reply; }; - /* This structure holds all mutable fields, and everything else - * except common send/recv fields 'status' and 'flags' is - * immutable - * TODO: rework RMA case where length is used instead of dt.offset */ - struct { - ucp_dt_state_t dt; /* Position in the send buffer */ - uct_completion_t uct_comp; /* UCT completion */ - } state; - union { ucp_lane_index_t am_bw_index; /* AM BW lane index */ ucp_lane_map_t lanes_map_avail; /* Used lanes map */ }; - uint8_t mem_type; /* Memory type */ + uint8_t mem_type; /* Memory type, values are + * ucs_memory_type_t */ ucp_lane_index_t pending_lane; /* Lane on which request was moved * to pending state */ ucp_lane_index_t lane; /* Lane on which this request is being sent */ @@ -294,6 +317,9 @@ struct ucp_request { ssize_t remaining; /* How much more data * to be received */ + /* Remote request ID received from a peer */ + ucs_ptr_map_key_t remote_req_id; + union { struct { ucp_tag_t tag; /* Expected tag */ @@ -369,8 +395,14 @@ struct ucp_recv_desc { ucs_queue_elem_t am_mid_queue; /* AM middle fragments queue */ }; uint32_t length; /* Received length */ - uint32_t payload_offset; /* Offset from end of the descriptor + union { + uint32_t payload_offset; /* Offset from end of the descriptor * to AM data */ + uint32_t am_malloc_offset; /* Offset from rdesc, holding + assembled multi-fragment active + message, to the originally + malloc'd buffer pointer */ + }; uint16_t flags; /* Flags */ int16_t uct_desc_offset; /* Offset which needs to be substructed from rdesc when diff --git a/src/ucp/core/ucp_request.inl b/src/ucp/core/ucp_request.inl index 0e510295bb5..3eba3737527 100644 --- a/src/ucp/core/ucp_request.inl +++ b/src/ucp/core/ucp_request.inl @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -28,7 +29,7 @@ (((_flags) & UCP_REQUEST_FLAG_COMPLETED) ? 'd' : '-'), \ (((_flags) & UCP_REQUEST_FLAG_RELEASED) ? 'f' : '-'), \ (((_flags) & UCP_REQUEST_FLAG_EXPECTED) ? 'e' : '-'), \ - (((_flags) & UCP_REQUEST_FLAG_LOCAL_COMPLETED) ? 'L' : '-'), \ + (((_flags) & UCP_REQUEST_FLAG_SYNC_LOCAL_COMPLETED) ? 'L' : '-'), \ (((_flags) & UCP_REQUEST_FLAG_CALLBACK) ? 'c' : '-'), \ (((_flags) & (UCP_REQUEST_FLAG_RECV_TAG | \ UCP_REQUEST_FLAG_RECV_AM)) ? 'r' : '-'), \ @@ -54,9 +55,8 @@ ({ \ ucp_request_t *_req = ucs_mpool_get_inline(&(_worker)->req_mp); \ if (_req != NULL) { \ - VALGRIND_MAKE_MEM_DEFINED(_req + 1, \ - (_worker)->context->config.request.size); \ ucs_trace_req("allocated request %p", _req); \ + ucp_request_reset_internal(_req, _worker); \ UCS_PROFILE_REQUEST_NEW(_req, "ucp_request", 0); \ } \ _req; \ @@ -64,12 +64,17 @@ #define ucp_request_complete(_req, _cb, _status, ...) 
\ { \ + /* NOTE: external request can't have RELEASE flag and we */ \ + /* will never put it into mpool */ \ + uint32_t _flags = ((_req)->flags |= UCP_REQUEST_FLAG_COMPLETED); \ (_req)->status = (_status); \ + \ + ucp_request_id_check(_req, ==, UCS_PTR_MAP_KEY_INVALID); \ + \ if (ucs_likely((_req)->flags & UCP_REQUEST_FLAG_CALLBACK)) { \ (_req)->_cb((_req) + 1, (_status), ## __VA_ARGS__); \ } \ - if (ucs_unlikely(((_req)->flags |= UCP_REQUEST_FLAG_COMPLETED) & \ - UCP_REQUEST_FLAG_RELEASED)) { \ + if (ucs_unlikely(_flags & UCP_REQUEST_FLAG_RELEASED)) { \ ucp_request_put(_req); \ } \ } @@ -94,14 +99,23 @@ } \ } else { \ __req = ((ucp_request_t*)(_param)->request) - 1; \ + ucp_request_id_reset(__req); \ } \ __req; \ }) +#define ucp_request_id_check(_req, _cmp, _id) \ + ucs_assertv((_req)->id _cmp (_id), "req=%p req->id=0x%" PRIx64 " id=0x%" \ + PRIx64, \ + (_req), (_req)->id, (_id)) + + #define ucp_request_put_param(_param, _req) \ if (!((_param)->op_attr_mask & UCP_OP_ATTR_FIELD_REQUEST)) { \ ucp_request_put(_req); \ + } else { \ + ucp_request_id_check(_req, ==, UCS_PTR_MAP_KEY_INVALID); \ } @@ -146,10 +160,32 @@ } +#define UCP_REQUEST_CHECK_PARAM(_param) \ + if (((_param)->op_attr_mask & UCP_OP_ATTR_FIELD_MEMORY_TYPE) && \ + ((_param)->memory_type > UCS_MEMORY_TYPE_LAST)) { \ + ucs_error("invalid memory type parameter: %d", (_param)->memory_type); \ + return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM); \ + } + + +static UCS_F_ALWAYS_INLINE void ucp_request_id_reset(ucp_request_t *req) +{ + req->id = UCS_PTR_MAP_KEY_INVALID; +} + +static UCS_F_ALWAYS_INLINE void +ucp_request_reset_internal(ucp_request_t *req, ucp_worker_h worker) +{ + VALGRIND_MAKE_MEM_DEFINED(&req->id, sizeof(req->id)); + VALGRIND_MAKE_MEM_DEFINED(req + 1, worker->context->config.request.size); + ucp_request_id_check(req, ==, UCS_PTR_MAP_KEY_INVALID); +} + static UCS_F_ALWAYS_INLINE void ucp_request_put(ucp_request_t *req) { ucs_trace_req("put request %p", req); + ucp_request_id_check(req, ==, UCS_PTR_MAP_KEY_INVALID); UCS_PROFILE_REQUEST_FREE(req); ucs_mpool_put_inline(req); } @@ -157,7 +193,8 @@ ucp_request_put(ucp_request_t *req) static UCS_F_ALWAYS_INLINE void ucp_request_complete_send(ucp_request_t *req, ucs_status_t status) { - ucs_trace_req("completing send request %p (%p) "UCP_REQUEST_FLAGS_FMT" %s", + ucs_trace_req("completing send request %p (%p) " UCP_REQUEST_FLAGS_FMT + " %s", req, req + 1, UCP_REQUEST_FLAGS_ARG(req->flags), ucs_status_string(status)); UCS_PROFILE_REQUEST_EVENT(req, "complete_send", status); @@ -167,7 +204,7 @@ ucp_request_complete_send(ucp_request_t *req, ucs_status_t status) static UCS_F_ALWAYS_INLINE void ucp_request_complete_tag_recv(ucp_request_t *req, ucs_status_t status) { - ucs_trace_req("completing receive request %p (%p) "UCP_REQUEST_FLAGS_FMT + ucs_trace_req("completing receive request %p (%p) " UCP_REQUEST_FLAGS_FMT " stag 0x%" PRIx64" len %zu, %s", req, req + 1, UCP_REQUEST_FLAGS_ARG(req->flags), req->recv.tag.info.sender_tag, req->recv.tag.info.length, @@ -190,7 +227,7 @@ ucp_request_complete_stream_recv(ucp_request_t *req, ucp_ep_ext_proto_t* ep_ext, req->recv.stream.length = req->recv.stream.offset; ucs_trace_req("completing stream receive request %p (%p) " - UCP_REQUEST_FLAGS_FMT" count %zu, %s", + UCP_REQUEST_FLAGS_FMT " count %zu, %s", req, req + 1, UCP_REQUEST_FLAGS_ARG(req->flags), req->recv.stream.length, ucs_status_string(status)); UCS_PROFILE_REQUEST_EVENT(req, "complete_recv", status); @@ -211,7 +248,8 @@ ucp_request_can_complete_stream_recv(ucp_request_t *req) return 0; } - 
/* 0-length stream recv is meaningless if this was not requested explicitely */ + /* 0-length stream recv is meaningless if this was not requested + * explicitly */ if (req->recv.stream.offset == 0) { return 0; } @@ -225,6 +263,31 @@ ucp_request_can_complete_stream_recv(ucp_request_t *req) return 1; } + +static UCS_F_ALWAYS_INLINE ucp_request_t* +ucp_request_mem_alloc(const char *name) +{ + ucp_request_t *req = (ucp_request_t*)ucs_malloc(sizeof(*req), name); + + if (ucs_unlikely(req == NULL)) { + return NULL; + } + + ucs_trace_req("allocated request %p (%s)", req, name); + ucp_request_id_reset(req); + UCS_PROFILE_REQUEST_NEW(req, "ucp_request", 0); + + return req; +} + +static UCS_F_ALWAYS_INLINE void ucp_request_mem_free(ucp_request_t *req) +{ + UCS_PROFILE_REQUEST_FREE(req); + ucp_request_id_check(req, ==, UCS_PTR_MAP_KEY_INVALID); + ucs_trace_req("freed request %p", req); + ucs_free(req); +} + /* * @return Whether completed. * *req_status if filled with the completion status if completed. @@ -332,12 +395,21 @@ ucp_request_send_state_reset(ucp_request_t *req, case UCP_REQUEST_SEND_PROTO_RNDV_GET: case UCP_REQUEST_SEND_PROTO_RNDV_PUT: case UCP_REQUEST_SEND_PROTO_ZCOPY_AM: - req->send.state.uct_comp.func = comp_cb; req->send.state.uct_comp.count = 0; req->send.state.uct_comp.status = UCS_OK; /* Fall through */ case UCP_REQUEST_SEND_PROTO_BCOPY_AM: - req->send.state.dt.offset = 0; + /* Always set completion function to make sure this value is initialized + * when doing send fast-forwarding in ucp_request_send_state_ff() */ + req->send.state.uct_comp.func = comp_cb; + req->send.state.dt.offset = 0; + + if (proto == UCP_REQUEST_SEND_PROTO_BCOPY_AM) { + ucs_assertv(comp_cb == NULL, + "completion function for AM Bcopy protocol must be NULL" + " instead of %s", + ucs_debug_get_symbol_name((void*)comp_cb)); + } break; default: ucs_fatal("unknown protocol"); @@ -426,7 +498,8 @@ ucp_request_send_buffer_reg(ucp_request_t *req, ucp_md_map_t md_map, return ucp_request_memory_reg(req->send.ep->worker->context, md_map, (void*)req->send.buffer, req->send.length, req->send.datatype, &req->send.state.dt, - req->send.mem_type, req, uct_flags); + (ucs_memory_type_t)req->send.mem_type, req, + uct_flags); } static UCS_F_ALWAYS_INLINE ucs_status_t @@ -538,7 +611,8 @@ ucp_request_recv_data_unpack(ucp_request_t *req, const void *data, ucp_dt_generic_t *dt_gen; ucs_status_t status; - ucs_assert(req->status == UCS_OK); + ucs_assertv(req->status == UCS_OK, "status: %s", + ucs_status_string(req->status)); ucp_trace_req(req, "unpack recv_data req_len %zu data_len %zu offset %zu last: %s", req->recv.length, length, offset, last ? 
"yes" : "no"); @@ -556,13 +630,14 @@ ucp_request_recv_data_unpack(ucp_request_t *req, const void *data, case UCP_DATATYPE_IOV: if (offset != req->recv.state.offset) { - ucp_dt_iov_seek(req->recv.buffer, req->recv.state.dt.iov.iovcnt, + ucp_dt_iov_seek((ucp_dt_iov_t*)req->recv.buffer, + req->recv.state.dt.iov.iovcnt, offset - req->recv.state.offset, &req->recv.state.dt.iov.iov_offset, &req->recv.state.dt.iov.iovcnt_offset); req->recv.state.offset = offset; } - UCS_PROFILE_CALL(ucp_dt_iov_scatter, req->recv.buffer, + UCS_PROFILE_CALL(ucp_dt_iov_scatter, (ucp_dt_iov_t*)req->recv.buffer, req->recv.state.dt.iov.iovcnt, data, length, &req->recv.state.dt.iov.iov_offset, &req->recv.state.dt.iov.iovcnt_offset); @@ -642,12 +717,22 @@ ucp_recv_desc_release(ucp_recv_desc_t *rdesc) static UCS_F_ALWAYS_INLINE void ucp_request_complete_am_recv(ucp_request_t *req, ucs_status_t status) { - ucs_trace_req("completing AM receive request %p (%p) "UCP_REQUEST_FLAGS_FMT + ucs_trace_req("completing AM receive request %p (%p) " UCP_REQUEST_FLAGS_FMT " length %zu, %s", req, req + 1, UCP_REQUEST_FLAGS_ARG(req->flags), req->recv.length, ucs_status_string(status)); UCS_PROFILE_REQUEST_EVENT(req, "complete_recv", status); - ucp_recv_desc_release(req->recv.am.desc); + + if (req->recv.am.desc->flags & UCP_RECV_DESC_FLAG_AM_CB_INPROGRESS) { + /* Descriptor is not initialized by UCT yet, therefore can not call + * ucp_recv_desc_release() for it. Clear the flag to let UCT AM + * callback know that this descriptor is not needed anymore. + */ + req->recv.am.desc->flags &= ~UCP_RECV_DESC_FLAG_AM_CB_INPROGRESS; + } else { + ucp_recv_desc_release(req->recv.am.desc); + } + ucp_request_complete(req, recv.am.cb, status, req->recv.length, req->user_data); } @@ -720,10 +805,10 @@ ucp_send_request_next_am_bw_lane(ucp_request_t *req) static UCS_F_ALWAYS_INLINE ucs_ptr_map_key_t ucp_send_request_get_ep_remote_id(ucp_request_t *req) { - /* This function may return UCP_WORKER_PTR_KEY_INVALID, but in such cases + /* This function may return UCS_PTR_MAP_KEY_INVALID, but in such cases * the message should not be sent at all because the am_lane would point to * a wireup (proxy) endpoint. So only the receiver side has an assertion - * that remote_id != UCP_EP_ID_INVALID. + * that remote_id != UCS_PTR_MAP_KEY_INVALID. 
 */
    return ucp_ep_remote_id(req->send.ep);
}

@@ -743,17 +828,136 @@ ucp_request_param_datatype(const ucp_request_param_t *param)
 }
 
 static UCS_F_ALWAYS_INLINE ucs_memory_type_t
-ucp_request_param_mem_type(const ucp_request_param_t *param)
+ucp_request_get_memory_type(ucp_context_h context, const void *address,
+                            size_t length, const ucp_request_param_t *param)
+{
+    ucs_memory_info_t mem_info;
+
+    if (!(param->op_attr_mask & UCP_OP_ATTR_FIELD_MEMORY_TYPE) ||
+        (param->memory_type == UCS_MEMORY_TYPE_UNKNOWN)) {
+        ucp_memory_detect(context, address, length, &mem_info);
+        ucs_assert(mem_info.type < UCS_MEMORY_TYPE_UNKNOWN);
+        return (ucs_memory_type_t)mem_info.type;
+    }
+
+    ucs_assert(param->memory_type < UCS_MEMORY_TYPE_UNKNOWN);
+    return param->memory_type;
+}
+
+static UCS_F_ALWAYS_INLINE void
+ucp_ep_ptr_map_check_status(ucp_ep_h ep, void *ptr, const char *action_str,
+                            ucs_status_t status)
+{
+    ucs_assertv((status == UCS_OK) || (status == UCS_ERR_NO_PROGRESS),
+                "ep %p: failed to %s id for %p: %s", ep, action_str, ptr,
+                ucs_status_string(status));
+}
+
+static UCS_F_ALWAYS_INLINE ucs_status_t
+ucp_ep_ptr_id_alloc(ucp_ep_h ep, void *ptr, ucs_ptr_map_key_t *ptr_id_p)
+{
+    ucs_status_t status;
+
+    status = ucs_ptr_map_put(&ep->worker->ptr_map, ptr,
+                             ucp_ep_use_indirect_id(ep), ptr_id_p);
+    ucp_ep_ptr_map_check_status(ep, ptr, "allocate", status);
+
+    return status;
+}
+
+static UCS_F_ALWAYS_INLINE void ucp_send_request_id_alloc(ucp_request_t *req)
 {
-    return (param->op_attr_mask & UCP_OP_ATTR_FIELD_MEMORY_TYPE) ?
-           param->memory_type : UCS_MEMORY_TYPE_UNKNOWN;
+    ucp_ep_h ep = req->send.ep;
+    ucs_status_t status;
+
+    ucp_request_id_check(req, ==, UCS_PTR_MAP_KEY_INVALID);
+    status = ucp_ep_ptr_id_alloc(ep, req, &req->id);
+    if (status == UCS_OK) {
+        ucs_hlist_add_tail(&ucp_ep_ext_gen(ep)->proto_reqs,
+                           &req->send.list);
+    }
 }
 
 static UCS_F_ALWAYS_INLINE ucs_ptr_map_key_t
-ucp_send_request_get_id(ucp_request_t *req)
+ucp_send_request_get_id(const ucp_request_t *req)
+{
+    ucp_request_id_check(req, !=, UCS_PTR_MAP_KEY_INVALID);
+    return req->id;
+}
+
+/* Since the release function resets the request ID to @ref
+ * UCS_PTR_MAP_KEY_INVALID, and the PTR map treats @ref UCS_PTR_MAP_KEY_INVALID
+ * as a direct key, releasing a request ID is re-entrant */
+static UCS_F_ALWAYS_INLINE void ucp_send_request_id_release(ucp_request_t *req)
+{
+    ucp_ep_h ep;
+    ucs_status_t UCS_V_UNUSED status;
+
+    ucs_assert(!(req->flags &
+                 (UCP_REQUEST_FLAG_RECV_AM | UCP_REQUEST_FLAG_RECV_TAG)));
+    ep = req->send.ep;
+
+    status = ucs_ptr_map_del(&ep->worker->ptr_map, req->id);
+    if (status == UCS_OK) {
+        ucs_hlist_del(&ucp_ep_ext_gen(ep)->proto_reqs, &req->send.list);
+    }
+
+    ucp_ep_ptr_map_check_status(ep, req, "release", status);
+    ucp_request_id_reset(req);
+}
+
+static UCS_F_ALWAYS_INLINE ucs_status_t
+ucp_send_request_get_by_id(ucp_worker_h worker, ucs_ptr_map_key_t id,
+                           ucp_request_t **req_p, int extract)
+{
+    ucs_status_t status;
+    void *ptr;
+
+    ucs_assert(id != UCS_PTR_MAP_KEY_INVALID);
+
+    status = ucs_ptr_map_get(&worker->ptr_map, id, extract, &ptr);
+    if (ucs_unlikely((status != UCS_OK) && (status != UCS_ERR_NO_PROGRESS))) {
+        return status;
+    }
+
+    *req_p = (ucp_request_t*)ptr;
+    ucp_request_id_check(*req_p, ==, id);
+
+    if (extract) {
+        /* The request was extracted from the PTR map, so reset its ID; error
+         * handling uses this value to check whether the request ID still has
+         * to be returned to the PTR map */
+        ucp_request_id_reset(*req_p);
+
+        if (status == UCS_OK) {
+
ucs_hlist_del(&ucp_ep_ext_gen((*req_p)->send.ep)->proto_reqs, + &(*req_p)->send.list); + } + } + + return UCS_OK; +} + +static UCS_F_ALWAYS_INLINE void ucp_request_set_super(ucp_request_t *req, + ucp_request_t *super_req) +{ + ucs_assertv(!(req->flags & UCP_REQUEST_FLAG_SUPER_VALID), + "req=%p req->super_req=%p", req, req->super_req); + req->super_req = super_req; + req->flags |= UCP_REQUEST_FLAG_SUPER_VALID; +} + +static UCS_F_ALWAYS_INLINE void ucp_request_reset_super(ucp_request_t *req) { - return ucp_worker_get_request_id(req->send.ep->worker, req, - ucp_ep_use_indirect_id(req->send.ep)); + req->flags &= ~UCP_REQUEST_FLAG_SUPER_VALID; +} + +static UCS_F_ALWAYS_INLINE ucp_request_t* +ucp_request_get_super(ucp_request_t *req) +{ + ucs_assertv(req->flags & UCP_REQUEST_FLAG_SUPER_VALID, + "req=%p req->super_req=%p", req, req->super_req); + return req->super_req; } static UCS_F_ALWAYS_INLINE void @@ -764,7 +968,7 @@ ucp_request_param_rndv_thresh(ucp_request_t *req, size_t *rndv_rma_thresh, size_t *rndv_am_thresh) { if ((param->op_attr_mask & UCP_OP_ATTR_FLAG_FAST_CMPL) && - ucs_likely(UCP_MEM_IS_ACCESSIBLE_FROM_CPU(req->send.mem_type))) { + ucs_likely(UCP_MEM_IS_HOST(req->send.mem_type))) { *rndv_rma_thresh = rma_thresh_config->local; *rndv_am_thresh = am_thresh_config->local; } else { @@ -782,10 +986,35 @@ ucp_invoke_uct_completion(uct_completion_t *comp, ucs_status_t status) } } +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_request_invoke_uct_completion_success(ucp_request_t *req) +{ + ucp_invoke_uct_completion(&req->send.state.uct_comp, UCS_OK); + return UCS_OK; +} + +/* The function can be used to complete any UCP send request */ static UCS_F_ALWAYS_INLINE void -ucp_request_invoke_uct_completion(ucp_request_t *req, ucs_status_t status) +ucp_request_complete_and_dereg_send(ucp_request_t *sreq, ucs_status_t status) { - ucp_invoke_uct_completion(&req->send.state.uct_comp, status); + ucs_assert(!sreq->send.ep->worker->context->config.ext.proto_enable); + ucp_request_send_generic_dt_finish(sreq); + ucp_request_send_buffer_dereg(sreq); + ucp_request_complete_send(sreq, status); } + +#define UCP_SEND_REQUEST_GET_BY_ID(_req_p, _worker, _req_id, _extract, \ + _action, _fmt_str, ...) 
\ + { \ + ucs_status_t __status = ucp_send_request_get_by_id(_worker, _req_id, \ + _req_p, _extract); \ + if (ucs_unlikely(__status != UCS_OK)) { \ + ucs_trace_data("worker %p: req id 0x%" PRIx64 " doesn't exist" \ + " drop " _fmt_str, \ + _worker, _req_id, ##__VA_ARGS__); \ + _action; \ + } \ + } + #endif diff --git a/src/ucp/core/ucp_rkey.c b/src/ucp/core/ucp_rkey.c index 288f27bca3e..fe90694e1ac 100644 --- a/src/ucp/core/ucp_rkey.c +++ b/src/ucp/core/ucp_rkey.c @@ -8,115 +8,186 @@ # include "config.h" #endif -#include "ucp_rkey.h" +#include "ucp_rkey.inl" #include "ucp_request.h" #include "ucp_ep.inl" #include #include #include +#include +#include #include +#include #include +typedef struct { + uint8_t sys_dev; + ucs_fp8_t latency; + ucs_fp8_t bandwidth; +} UCS_S_PACKED ucp_rkey_packed_distance_t; + static struct { ucp_md_map_t md_map; uint8_t mem_type; } UCS_S_PACKED ucp_mem_dummy_buffer = {0, UCS_MEMORY_TYPE_HOST}; -size_t ucp_rkey_packed_size(ucp_context_h context, ucp_md_map_t md_map) +size_t ucp_rkey_packed_size(ucp_context_h context, ucp_md_map_t md_map, + ucs_sys_device_t sys_dev, uint64_t sys_dev_map) { - size_t size, md_size; + size_t size, tl_rkey_size; unsigned md_index; - size = sizeof(ucp_md_map_t); - size += sizeof(uint8_t); - ucs_for_each_bit (md_index, md_map) { - md_size = context->tl_mds[md_index].attr.rkey_packed_size; - ucs_assert_always(md_size <= UINT8_MAX); - size += sizeof(uint8_t) + md_size; + size = sizeof(ucp_md_map_t); /* Memory domains map */ + size += sizeof(uint8_t); /* Memory type */ + + ucs_for_each_bit(md_index, md_map) { + tl_rkey_size = context->tl_mds[md_index].attr.rkey_packed_size; + ucs_assert_always(tl_rkey_size <= UINT8_MAX); + size += sizeof(uint8_t) + tl_rkey_size; + } + + if (sys_dev != UCS_SYS_DEVICE_ID_UNKNOWN) { + /* System device id */ + size += sizeof(uint8_t); + + /* Distance of each device */ + size += ucs_popcount(sys_dev_map) * sizeof(ucp_rkey_packed_distance_t); } + return size; } void ucp_rkey_packed_copy(ucp_context_h context, ucp_md_map_t md_map, - ucs_memory_type_t mem_type, void *rkey_buffer, - const void* uct_rkeys[]) + ucs_memory_type_t mem_type, void *buffer, + const void *uct_rkeys[]) { - uint8_t *p = rkey_buffer; + void *p = buffer; + size_t tl_rkey_size; unsigned md_index; - size_t md_size; - - *(ucp_md_map_t*)p = md_map; - p += sizeof(ucp_md_map_t); - *(p++) = mem_type; + *ucs_serialize_next(&p, ucp_md_map_t) = md_map; + *ucs_serialize_next(&p, uint8_t) = mem_type; ucs_for_each_bit(md_index, md_map) { - md_size = context->tl_mds[md_index].attr.rkey_packed_size; - ucs_assert_always(md_size <= UINT8_MAX); - *(p++) = md_size; - memcpy(p, *uct_rkeys, md_size); - p += md_size; - ++uct_rkeys; + tl_rkey_size = context->tl_mds[md_index].attr.rkey_packed_size; + ucs_assert_always(tl_rkey_size <= UINT8_MAX); + *ucs_serialize_next(&p, uint8_t) = tl_rkey_size; + memcpy(ucs_serialize_next_raw(&p, void, tl_rkey_size), *(uct_rkeys++), + tl_rkey_size); } } -ssize_t ucp_rkey_pack_uct(ucp_context_h context, ucp_md_map_t md_map, - const uct_mem_h *memh, ucs_memory_type_t mem_type, - void *rkey_buffer) +/* Pack bandwidth as bytes/second, range: 512 MB/s to 4 TB/s */ +UCS_FP8_DECLARE_TYPE(RKEY_BANDWIDTH, 512 * UCS_MBYTE, 4 * UCS_TBYTE) + +/* Pack latency as nanoseconds, range: 16 nsec to 131 usec */ +UCS_FP8_DECLARE_TYPE(RKEY_LATENCY, UCS_BIT(4), UCS_BIT(17)) + +static void ucp_rkey_pack_distance(ucs_sys_device_t sys_dev, + const ucs_sys_dev_distance_t *distance, + ucp_rkey_packed_distance_t *packed_distance) { - uint8_t *p = rkey_buffer; - 
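UCS_FP8_DECLARE_TYPE above fixes the representable range per packed field: values saturate at the range edges, and inside the range one byte gives coarse logarithmic resolution. A toy codec sketching that trade-off follows; it is not the real ucs_fp8_t format, only an illustration of the idea:

#include <math.h>
#include <stdint.h>

/* clamp v to [lo, hi] and map log2(v) linearly onto 0..255 */
static uint8_t toy_fp8_pack(double v, double lo, double hi)
{
    v = (v < lo) ? lo : (v > hi) ? hi : v;
    return (uint8_t)(255.0 * (log2(v) - log2(lo)) / (log2(hi) - log2(lo)));
}

static double toy_fp8_unpack(uint8_t packed, double lo, double hi)
{
    return exp2(log2(lo) + (packed / 255.0) * (log2(hi) - log2(lo)));
}

/* round-tripping 10 GB/s through the 512 MB/s .. 4 TB/s range loses only a
 * few percent, which is enough for distance-based protocol selection */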
ucs_status_t status = UCS_OK; + packed_distance->sys_dev = sys_dev; + packed_distance->bandwidth = UCS_FP8_PACK(RKEY_BANDWIDTH, + distance->bandwidth); + packed_distance->latency = UCS_FP8_PACK(RKEY_LATENCY, + distance->latency * + UCS_NSEC_PER_SEC); +} + +static void +ucp_rkey_unpack_distance(const ucp_rkey_packed_distance_t *packed_distance, + ucs_sys_device_t *sys_dev_p, + ucs_sys_dev_distance_t *distance) +{ + *sys_dev_p = packed_distance->sys_dev; + distance->bandwidth = UCS_FP8_UNPACK(RKEY_BANDWIDTH, + packed_distance->bandwidth); + distance->latency = UCS_FP8_UNPACK(RKEY_LATENCY, packed_distance->latency) / + UCS_NSEC_PER_SEC; +} + +UCS_PROFILE_FUNC(ssize_t, ucp_rkey_pack_uct, + (context, md_map, memh, mem_info, sys_dev_map, sys_distance, + buffer), + ucp_context_h context, ucp_md_map_t md_map, + const uct_mem_h *memh, const ucs_memory_info_t *mem_info, + uint64_t sys_dev_map, + const ucs_sys_dev_distance_t *sys_distance, void *buffer) +{ + void *p = buffer; unsigned md_index, uct_memh_index; - size_t md_size; char UCS_V_UNUSED buf[128]; + ucs_sys_device_t sys_dev; + size_t tl_rkey_size; + ucs_status_t status; + void *tl_rkey_buf; + ssize_t result; /* Check that md_map is valid */ ucs_assert(ucs_test_all_flags(UCS_MASK(context->num_mds), md_map)); - /* Write the MD map */ - *(ucp_md_map_t*)p = md_map; - p += sizeof(ucp_md_map_t); + ucs_trace("packing rkey type %s md_map 0x%" PRIx64 "dev_map 0x%" PRIx64, + ucs_memory_type_names[mem_info->type], md_map, sys_dev_map); + ucs_log_indent(1); - /* Write memory type */ UCS_STATIC_ASSERT(UCS_MEMORY_TYPE_LAST <= 255); - *(p++) = mem_type; + *ucs_serialize_next(&p, ucp_md_map_t) = md_map; + *ucs_serialize_next(&p, uint8_t) = mem_info->type; /* Write both size and rkey_buffer for each UCT rkey */ uct_memh_index = 0; ucs_for_each_bit (md_index, md_map) { - md_size = context->tl_mds[md_index].attr.rkey_packed_size; - *(p++) = md_size; - status = uct_md_mkey_pack(context->tl_mds[md_index].md, - memh[uct_memh_index], p); + tl_rkey_size = context->tl_mds[md_index].attr.rkey_packed_size; + *ucs_serialize_next(&p, uint8_t) = tl_rkey_size; + + tl_rkey_buf = ucs_serialize_next_raw(&p, void, tl_rkey_size); + status = uct_md_mkey_pack(context->tl_mds[md_index].md, + memh[uct_memh_index], tl_rkey_buf); if (status != UCS_OK) { - return status; + result = status; + goto out; } ucs_trace("rkey[%d]=%s for md[%d]=%s", uct_memh_index, - ucs_str_dump_hex(p, md_size, buf, sizeof(buf), SIZE_MAX), + ucs_str_dump_hex(p, tl_rkey_size, buf, sizeof(buf), SIZE_MAX), md_index, context->tl_mds[md_index].rsc.md_name); - ++uct_memh_index; - p += md_size; } - return UCS_PTR_BYTE_DIFF(rkey_buffer, p); + if (ucs_likely(mem_info->sys_dev == UCS_SYS_DEVICE_ID_UNKNOWN)) { + goto out_packed_size; + } + + /* Pack system device id */ + *ucs_serialize_next(&p, uint8_t) = mem_info->sys_dev; + + /* Pack distance from sys_dev to each device in distance_dev_map */ + ucs_for_each_bit(sys_dev, sys_dev_map) { + ucp_rkey_pack_distance(sys_dev, sys_distance++, + ucs_serialize_next(&p, + ucp_rkey_packed_distance_t)); + } + +out_packed_size: + result = UCS_PTR_BYTE_DIFF(buffer, p); +out: + ucs_log_indent(-1); + return result; } ucs_status_t ucp_rkey_pack(ucp_context_h context, ucp_mem_h memh, void **rkey_buffer_p, size_t *size_p) { - void *rkey_buffer, *p; + ucs_memory_info_t mem_info; ucs_status_t status; ssize_t packed_size; + void *rkey_buffer; size_t size; - UCP_CONTEXT_CHECK_FEATURE_FLAGS(context, UCP_FEATURE_RMA | UCP_FEATURE_AMO, - return UCS_ERR_INVALID_PARAM); - /* always acquire 
context lock */ UCP_THREAD_CS_ENTER(&context->mt_lock); @@ -131,17 +202,19 @@ ucs_status_t ucp_rkey_pack(ucp_context_h context, ucp_mem_h memh, goto out; } - size = ucp_rkey_packed_size(context, memh->md_map); + size = ucp_rkey_packed_size(context, memh->md_map, + UCS_SYS_DEVICE_ID_UNKNOWN, 0); rkey_buffer = ucs_malloc(size, "ucp_rkey_buffer"); if (rkey_buffer == NULL) { status = UCS_ERR_NO_MEMORY; goto out; } - p = rkey_buffer; + mem_info.type = memh->mem_type; + mem_info.sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; - packed_size = ucp_rkey_pack_uct(context, memh->md_map, memh->uct, - memh->mem_type, p); + packed_size = ucp_rkey_pack_uct(context, memh->md_map, memh->uct, &mem_info, + 0, NULL, rkey_buffer); if (packed_size < 0) { status = (ucs_status_t)packed_size; goto err_destroy; @@ -170,74 +243,138 @@ void ucp_rkey_buffer_release(void *rkey_buffer) ucs_free(rkey_buffer); } -UCS_PROFILE_FUNC(ucs_status_t, ucp_ep_rkey_unpack, (ep, rkey_buffer, rkey_p), - ucp_ep_h ep, const void *rkey_buffer, - ucp_rkey_h *rkey_p) +static void UCS_F_NOINLINE +ucp_rkey_unpack_lanes_distance(const ucp_ep_config_key_t *ep_config_key, + ucs_sys_dev_distance_t *lanes_distance, + const void *buffer, const void *buffer_end) { - ucp_worker_h worker = ep->worker; - const ucp_ep_config_t *ep_config; + const void *p = buffer; + ucs_sys_dev_distance_t distance, distance_by_dev[UCS_SYS_DEVICE_ID_MAX]; + ucs_sys_device_t sys_dev; + ucp_lane_index_t lane; + uint64_t sys_dev_map; + char buf[128]; + + /* Unpack lane distances and update distance_by_dev lookup */ + sys_dev_map = 0; + while (p < buffer_end) { + ucp_rkey_unpack_distance( + ucs_serialize_next(&p, const ucp_rkey_packed_distance_t), + &sys_dev, &distance); + distance_by_dev[sys_dev] = distance; + sys_dev_map |= UCS_BIT(sys_dev); + } + + /* Initialize lane distances according to distance_by_dev */ + for (lane = 0; lane < ep_config_key->num_lanes; ++lane) { + sys_dev = ep_config_key->lanes[lane].dst_sys_dev; + lanes_distance[lane] = (sys_dev_map & UCS_BIT(sys_dev)) ? + distance_by_dev[sys_dev] : + ucs_topo_default_distance; + ucs_trace("lane[%d] dev %d distance %s", lane, sys_dev, + ucs_topo_distance_str(&lanes_distance[lane], buf, + sizeof(buf))); + } +} + +UCS_PROFILE_FUNC(ucs_status_t, ucp_rkey_proto_resolve, + (rkey, ep, buffer, buffer_end), ucp_rkey_h rkey, ucp_ep_h ep, + const void *buffer, const void *buffer_end) +{ + ucp_worker_h worker = ep->worker; + const void *p = buffer; + ucs_sys_dev_distance_t *lanes_distance; ucp_rkey_config_key_t rkey_config_key; - unsigned remote_md_index; + khiter_t khiter; + + /* Avoid calling ucp_ep_resolve_remote_id() from rkey_unpack, and let + * the APIs which are not yet using new protocols resolve the remote key + * on-demand. 
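The rkey cache is filled on demand: unpack only stamps the sentinel, and the first operation through the key resolves it against the endpoint (see ucp_rkey_resolve_inner further down). A toy sketch of this sentinel-guarded lazy cache, with hypothetical toy_* names:

#include <stdint.h>

#define TOY_CFG_INDEX_NULL UINT8_MAX /* cf. UCP_WORKER_CFG_INDEX_NULL */

typedef struct {
    uint8_t ep_cfg_index; /* cached value, TOY_CFG_INDEX_NULL when unset */
} toy_rkey_cache_t;

static void toy_cache_reset(toy_rkey_cache_t *cache)
{
    cache->ep_cfg_index = TOY_CFG_INDEX_NULL; /* what unpack does */
}

static uint8_t toy_cache_get(toy_rkey_cache_t *cache, uint8_t (*resolve)(void))
{
    if (cache->ep_cfg_index == TOY_CFG_INDEX_NULL) {
        cache->ep_cfg_index = resolve(); /* first use fills the cache */
    }
    return cache->ep_cfg_index;
}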
+     */
+    rkey->cache.ep_cfg_index = UCP_WORKER_CFG_INDEX_NULL;
+
+    /* Look up the remote key's configuration */
+    rkey_config_key.ep_cfg_index = ep->cfg_index;
+    rkey_config_key.md_map       = rkey->md_map;
+    rkey_config_key.mem_type     = rkey->mem_type;
+
+    if (buffer < buffer_end) {
+        rkey_config_key.sys_dev = *ucs_serialize_next(&p, const uint8_t);
+    } else {
+        rkey_config_key.sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN;
+    }
+
+    khiter = kh_get(ucp_worker_rkey_config, &worker->rkey_config_hash,
+                    rkey_config_key);
+    if (ucs_likely(khiter != kh_end(&worker->rkey_config_hash))) {
+        /* Found existing configuration in hash */
+        rkey->cfg_index = kh_val(&worker->rkey_config_hash, khiter);
+        return UCS_OK;
+    }
+
+    lanes_distance = ucs_alloca(sizeof(*lanes_distance) * UCP_MAX_LANES);
+    ucp_rkey_unpack_lanes_distance(&ucp_ep_config(ep)->key, lanes_distance, p,
+                                   buffer_end);
+    return ucp_worker_add_rkey_config(worker, &rkey_config_key, lanes_distance,
+                                      &rkey->cfg_index);
+}
+
+UCS_PROFILE_FUNC(ucs_status_t, ucp_ep_rkey_unpack_internal,
+                 (ep, buffer, length, rkey_p), ucp_ep_h ep, const void *buffer,
+                 size_t length, ucp_rkey_h *rkey_p)
+{
+    ucp_worker_h worker              = ep->worker;
+    const ucp_ep_config_t *ep_config = ucp_ep_config(ep);
+    const void *p                    = buffer;
     ucp_md_map_t md_map, remote_md_map;
     ucp_rsc_index_t cmpt_index;
+    unsigned remote_md_index;
+    const void *tl_rkey_buf;
     ucp_tl_rkey_t *tl_rkey;
+    size_t tl_rkey_size;
     unsigned rkey_index;
-    unsigned md_count;
     ucs_status_t status;
     ucp_rkey_h rkey;
-    ucs_memory_type_t mem_type;
-    uint8_t md_size;
-    const uint8_t *p;
     uint8_t flags;
 
-    UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker);
-
-    ep_config = ucp_ep_config(ep);
-
-    /* Count the number of remote MDs in the rkey buffer */
-    p = rkey_buffer;
-
-    /* Read remote MD map */
-    remote_md_map = *(ucp_md_map_t*)p;
-    ucs_trace("ep %p: unpacking rkey with md_map 0x%"PRIx64, ep, remote_md_map);
+    ucs_trace("ep %p: unpacking rkey buffer %p length %zu", ep, buffer, length);
+    ucs_log_indent(1);
 
     /* MD map for the unpacked rkey */
-    md_map   = remote_md_map & ucp_ep_config(ep)->key.reachable_md_map;
-    md_count = ucs_popcount(md_map);
-    p       += sizeof(ucp_md_map_t);
+    remote_md_map = *ucs_serialize_next(&p, const ucp_md_map_t);
+    md_map        = remote_md_map & ucp_ep_config(ep)->key.reachable_md_map;
 
     /* Allocate rkey handle which holds UCT rkeys for all remote MDs. Small key
      * allocations are done from a memory pool.
     * We keep all of them to handle a future transport switch.
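The kh_get()/kh_put() sequence above is the usual khash lookup-or-create idiom. Shown here on a plain int-keyed map, assuming the vendored khash.h, with the UCP-specific key type and the hash/equality functions from ucp_rkey.inl left out:

#include "khash.h" /* vendored at src/ucs/datastruct/khash.h */

KHASH_MAP_INIT_INT(toy_cfg, int)

/* return the value cached for key, inserting new_val on first use */
static int toy_lookup_or_create(khash_t(toy_cfg) *h, int key, int new_val)
{
    khiter_t iter = kh_get(toy_cfg, h, key);
    int ret;

    if (iter != kh_end(h)) {
        return kh_val(h, iter); /* fast path: configuration already cached */
    }

    iter            = kh_put(toy_cfg, h, key, &ret);
    kh_val(h, iter) = new_val;  /* slow path: create and remember */
    return new_val;
}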
*/ - flags = 0; - if (md_count <= UCP_RKEY_MPOOL_MAX_MD) { + if (md_map <= UCS_BIT(UCP_RKEY_MPOOL_MAX_MD)) { rkey = ucs_mpool_get_inline(&worker->rkey_mp); flags = UCP_RKEY_DESC_FLAG_POOL; } else { - rkey = ucs_malloc(sizeof(*rkey) + (sizeof(rkey->tl_rkey[0]) * md_count), - "ucp_rkey"); + rkey = ucs_malloc(sizeof(*rkey) + (sizeof(rkey->tl_rkey[0]) * + ucs_popcount(md_map)), + "ucp_rkey"); + flags = 0; } if (rkey == NULL) { + ucs_error("failed to allocate remote key"); status = UCS_ERR_NO_MEMORY; - goto out_unlock; + goto out; } - /* Read memory type */ - mem_type = (ucs_memory_type_t)*(p++); - rkey->md_map = md_map; - rkey->mem_type = mem_type; + rkey->mem_type = *ucs_serialize_next(&p, const uint8_t); rkey->flags = flags; #if ENABLE_PARAMS_CHECK rkey->ep = ep; #endif - /* Unpack rkey of each UCT MD */ + /* Go over remote MD indices and unpack rkey of each UCT MD */ rkey_index = 0; /* Index of the rkey in the array */ - /* Go over remote MD indices */ - ucs_for_each_bit (remote_md_index, remote_md_map) { - md_size = *(p++); + ucs_for_each_bit(remote_md_index, remote_md_map) { + tl_rkey_size = *ucs_serialize_next(&p, const uint8_t); + tl_rkey_buf = ucs_serialize_next_raw(&p, const void, tl_rkey_size); /* Use bit operations to iterate through the indices of the remote MDs * as provided in the md_map. md_map always holds a bitmap of MD indices @@ -248,109 +385,110 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_ep_rkey_unpack, (ep, rkey_buffer, rkey_p), ucs_assert_always(remote_md_index <= UCP_MD_INDEX_BITS); /* Unpack only reachable rkeys */ - if (UCS_BIT(remote_md_index) & rkey->md_map) { - ucs_assert(rkey_index < md_count); - - tl_rkey = &rkey->tl_rkey[rkey_index]; - cmpt_index = ucp_ep_config_get_dst_md_cmpt(&ep_config->key, - remote_md_index); - tl_rkey->cmpt = worker->context->tl_cmpts[cmpt_index].cmpt; - - status = uct_rkey_unpack(tl_rkey->cmpt, p, &tl_rkey->rkey); - if (status == UCS_OK) { - ucs_trace("rkey[%d] for remote md %d is 0x%lx", rkey_index, - remote_md_index, tl_rkey->rkey.rkey); - ++rkey_index; - } else if (status == UCS_ERR_UNREACHABLE) { - rkey->md_map &= ~UCS_BIT(remote_md_index); - ucs_trace("rkey[%d] for remote md %d is 0x%lx not reachable", - rkey_index, remote_md_index, tl_rkey->rkey.rkey); - /* FIXME this can make malloc allocated key be released to mpool */ - } else { - ucs_error("failed to unpack remote key from remote md[%d]: %s", - remote_md_index, ucs_status_string(status)); - goto err_destroy; - } + if (!(UCS_BIT(remote_md_index) & rkey->md_map)) { + continue; } - p += md_size; + ucs_assert(rkey_index < ucs_popcount(md_map)); + tl_rkey = &rkey->tl_rkey[rkey_index]; + cmpt_index = ucp_ep_config_get_dst_md_cmpt(&ep_config->key, + remote_md_index); + tl_rkey->cmpt = worker->context->tl_cmpts[cmpt_index].cmpt; + + status = uct_rkey_unpack(tl_rkey->cmpt, tl_rkey_buf, &tl_rkey->rkey); + if (status == UCS_OK) { + ucs_trace("rkey[%d] for remote md %d is 0x%lx", rkey_index, + remote_md_index, tl_rkey->rkey.rkey); + ++rkey_index; + } else if (status == UCS_ERR_UNREACHABLE) { + rkey->md_map &= ~UCS_BIT(remote_md_index); + ucs_trace("rkey[%d] for remote md %d is 0x%lx not reachable", + rkey_index, remote_md_index, tl_rkey->rkey.rkey); + } else { + ucs_error("failed to unpack remote key from remote md[%d]: %s", + remote_md_index, ucs_status_string(status)); + goto err_destroy; + } } - /* Silence clang checker - assert that if some rkeys are unpacked, then - * rkey->md_map is nozero. 
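Small keys come from the worker mempool and carry UCP_RKEY_DESC_FLAG_POOL so that the destroy path frees through the matching allocator. The same pattern as a stand-alone sketch; toy_* names are hypothetical and error handling is elided:

#include <stdint.h>
#include <stdlib.h>

#define TOY_FLAG_POOL 0x1u

typedef struct {
    uint8_t flags;
    /* per-MD keys follow */
} toy_rkey_t;

/* 8 bytes stands in for the per-MD key size */
static toy_rkey_t *toy_rkey_alloc(unsigned num_mds,
                                  toy_rkey_t *(*pool_get)(void))
{
    toy_rkey_t *rkey;

    if (num_mds <= 2) {             /* cf. UCP_RKEY_MPOOL_MAX_MD */
        rkey        = pool_get();   /* bounded size: pooled, cheap */
        rkey->flags = TOY_FLAG_POOL;
    } else {
        rkey        = malloc(sizeof(*rkey) + (num_mds * 8));
        rkey->flags = 0;
    }
    return rkey;
}

static void toy_rkey_free(toy_rkey_t *rkey, void (*pool_put)(toy_rkey_t*))
{
    if (rkey->flags & TOY_FLAG_POOL) {
        pool_put(rkey); /* must go back to the pool it came from */
    } else {
        free(rkey);
    }
}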
- */ - ucs_assert((rkey_index > 0) || (rkey->md_map == 0)); - if (worker->context->config.ext.proto_enable) { - rkey_config_key.ep_cfg_index = ep->cfg_index; - rkey_config_key.md_map = rkey->md_map; - rkey_config_key.mem_type = rkey->mem_type; - rkey_config_key.sys_dev = 0; - - status = ucp_worker_get_rkey_config(worker, &rkey_config_key, - &rkey->cfg_index); + status = ucp_rkey_proto_resolve(rkey, ep, p, + UCS_PTR_BYTE_OFFSET(buffer, length)); if (status != UCS_OK) { goto err_destroy; } - - /* Avoid calling ucp_ep_resolve_remote_id() from rkey_unpack, and let - * the APIs which are not yet using new protocols resolve the remote key - * on-demand. - */ - rkey->cache.ep_cfg_index = UCP_WORKER_CFG_INDEX_NULL; } else { ucp_rkey_resolve_inner(rkey, ep); } + ucs_trace("ep %p: unpacked rkey %p md_map 0x%" PRIx64 " type %s", ep, rkey, + rkey->md_map, ucs_memory_type_names[rkey->mem_type]); *rkey_p = rkey; status = UCS_OK; - -out_unlock: - UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); - return status; + goto out; err_destroy: ucp_rkey_destroy(rkey); - goto out_unlock; +out: + ucs_log_indent(-1); + return status; +} + +UCS_PROFILE_FUNC(ucs_status_t, ucp_ep_rkey_unpack, (ep, rkey_buffer, rkey_p), + ucp_ep_h ep, const void *rkey_buffer, ucp_rkey_h *rkey_p) +{ + ucs_status_t status; + + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); + status = ucp_ep_rkey_unpack_internal(ep, rkey_buffer, 0, rkey_p); + UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); + + return status; } -void ucp_rkey_dump_packed(const void *rkey_buffer, char *buffer, size_t max) +void ucp_rkey_dump_packed(const void *buffer, size_t length, + ucs_string_buffer_t *strb) { - char *p = buffer; - char *endp = buffer + max; - const uint8_t *rkey_buf = rkey_buffer; + const void *p = buffer; + const void *buffer_end = UCS_PTR_BYTE_OFFSET(buffer, length); + const ucp_rkey_packed_distance_t *packed_distance; + ucs_sys_dev_distance_t distance; + ucs_memory_type_t mem_type; + ucs_sys_device_t sys_dev; + const void *tl_tkey; ucp_md_map_t md_map; unsigned md_index; - uint8_t md_size; - int first; + uint8_t tl_rkey_size; + char buf[128]; - snprintf(p, endp - p, "{"); - p += strlen(p); + md_map = *ucs_serialize_next(&p, const ucp_md_map_t); + mem_type = *ucs_serialize_next(&p, const uint8_t); - md_map = *(ucp_md_map_t*)(rkey_buf); - rkey_buf += sizeof(ucp_md_map_t) + sizeof(uint8_t); + ucs_string_buffer_appendf(strb, "{%s", ucs_memory_type_names[mem_type]); - first = 1; ucs_for_each_bit(md_index, md_map) { - md_size = *rkey_buf; - rkey_buf += sizeof(uint8_t); - - if (!first) { - snprintf(p, endp - p, ","); - p += strlen(p); - } - first = 0; - - snprintf(p, endp - p, "%d:", md_index); - p += strlen(p); + tl_rkey_size = *ucs_serialize_next(&p, const uint8_t); + tl_tkey = ucs_serialize_next_raw(&p, const void, tl_rkey_size); + ucs_string_buffer_appendf(strb, ",%u:", md_index); + ucs_string_buffer_append_hex(strb, tl_tkey, tl_rkey_size, SIZE_MAX); + } - ucs_str_dump_hex(rkey_buf, md_size, p, endp - p, SIZE_MAX); - p += strlen(p); + if (p < buffer_end) { + sys_dev = *ucs_serialize_next(&p, const uint8_t); + ucs_string_buffer_appendf(strb, ",sys:%u", sys_dev); + } - rkey_buf += md_size; + while (p < buffer_end) { + packed_distance = ucs_serialize_next(&p, + const ucp_rkey_packed_distance_t); + distance.bandwidth = packed_distance->bandwidth; + distance.latency = packed_distance->latency; + ucs_string_buffer_appendf(strb, ",dev:%u:%s", packed_distance->sys_dev, + ucs_topo_distance_str(&distance, buf, + sizeof(buf))); } - snprintf(p, endp - p, "}"); 
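Pack, unpack and dump all walk the buffer with ucs_serialize_next(), which returns a typed pointer into the buffer and advances the cursor, so both directions share one layout description. A minimal stand-in showing the shape of that idiom; toy_next is hypothetical, not the real UCS implementation:

#include <stddef.h>
#include <stdint.h>

/* return a typed pointer at the cursor and advance it; the wire format is
 * packed, so this assumes unaligned access is acceptable on the target */
static void *toy_next_raw(void **pp, size_t size)
{
    void *ptr = *pp;

    *pp = (char*)*pp + size;
    return ptr;
}

#define toy_next(pp, type) ((type*)toy_next_raw(pp, sizeof(type)))

static size_t toy_pack(void *buffer, uint64_t md_map, uint8_t mem_type)
{
    void *p = buffer;

    *toy_next(&p, uint64_t) = md_map;   /* cf. ucs_serialize_next(&p, ...) */
    *toy_next(&p, uint8_t)  = mem_type;
    return (size_t)((char*)p - (char*)buffer); /* cf. UCS_PTR_BYTE_DIFF */
}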
+    ucs_string_buffer_appendf(strb, "}");
 }
 
 ucs_status_t ucp_rkey_ptr(ucp_rkey_h rkey, uint64_t raddr, void **addr_p)
@@ -378,6 +516,8 @@ void ucp_rkey_destroy(ucp_rkey_h rkey)
     unsigned remote_md_index, rkey_index;
     ucp_worker_h UCS_V_UNUSED worker;
 
+    ucs_trace("destroying rkey %p", rkey);
+
     rkey_index = 0;
     ucs_for_each_bit(remote_md_index, rkey->md_map) {
         uct_rkey_release(rkey->tl_rkey[rkey_index].cmpt,
@@ -519,3 +659,22 @@ void ucp_rkey_resolve_inner(ucp_rkey_h rkey, ucp_ep_h ep)
               rkey->cache.rma_proto->name, rkey->cache.rma_lane,
               rkey->cache.rma_rkey, rkey->cache.amo_proto->name,
               rkey->cache.amo_lane, rkey->cache.amo_rkey);
 }
+
+void ucp_rkey_config_dump_brief(const ucp_rkey_config_key_t *rkey_config_key,
+                                ucs_string_buffer_t *strb)
+{
+    ucs_string_buffer_appendf(strb, "%s md_map 0x%" PRIx64,
+                              ucs_memory_type_names[rkey_config_key->mem_type],
+                              rkey_config_key->md_map);
+}
+
+void ucp_rkey_proto_select_dump(ucp_worker_h worker,
+                                ucp_worker_cfg_index_t rkey_cfg_index,
+                                ucs_string_buffer_t *strb)
+{
+    const ucp_rkey_config_t *rkey_config = &worker->rkey_config[rkey_cfg_index];
+
+    ucp_proto_select_dump_short(&rkey_config->put_short, "put_short", strb);
+    ucp_proto_select_dump(worker, rkey_config->key.ep_cfg_index, rkey_cfg_index,
+                          &rkey_config->proto_select, strb);
+}
diff --git a/src/ucp/core/ucp_rkey.h b/src/ucp/core/ucp_rkey.h
index 2e5d5ecb22d..053fa78cccc 100644
--- a/src/ucp/core/ucp_rkey.h
+++ b/src/ucp/core/ucp_rkey.h
@@ -9,6 +9,7 @@
 
 #include "ucp_types.h"
 
+#include 
 #include 
 
 
@@ -41,10 +42,17 @@ enum {
  * Rkey configuration key
  */
 struct ucp_rkey_config_key {
-    ucp_md_map_t           md_map;       /* Which *remote* MDs have valid memory handles */
-    ucp_worker_cfg_index_t ep_cfg_index; /* Endpoint configuration */
-    ucs_sys_device_t       sys_dev;      /* Remote device id */
-    ucs_memory_type_t      mem_type;     /* Remote memory type */
+    /* Which *remote* MDs have valid memory handles */
+    ucp_md_map_t           md_map;
+
+    /* Endpoint configuration index */
+    ucp_worker_cfg_index_t ep_cfg_index;
+
+    /* Remote system device id */
+    ucs_sys_device_t       sys_dev;
+
+    /* Remote memory type */
+    ucs_memory_type_t      mem_type;
 };
 
 
@@ -52,8 +60,20 @@ struct ucp_rkey_config_key {
  * Rkey configuration
  */
 typedef struct {
-    ucp_rkey_config_key_t key;          /* Configuration key */
-    ucp_proto_select_t    proto_select; /* Protocol selection data */
+    /* Configuration key */
+    ucp_rkey_config_key_t    key;
+
+    /* Put-short thresholds */
+    ucp_proto_select_short_t put_short;
+
+    /* Remote system topology distance of each lane from the remote memory
+     * buffer. The number of valid entries matches the number of lanes
+     * defined by the configuration at index "key.ep_cfg_index".
+ */ + ucs_sys_dev_distance_t lanes_distance[UCP_MAX_LANES]; + + /* Protocol selection data */ + ucp_proto_select_t proto_select; } ucp_rkey_config_t; @@ -131,19 +151,36 @@ ucp_lane_index_t ucp_rkey_find_rma_lane(ucp_context_h context, uct_rkey_t *uct_rkey_p); -size_t ucp_rkey_packed_size(ucp_context_h context, ucp_md_map_t md_map); +size_t ucp_rkey_packed_size(ucp_context_h context, ucp_md_map_t md_map, + ucs_sys_device_t sys_dev, uint64_t sys_dev_map); void ucp_rkey_packed_copy(ucp_context_h context, ucp_md_map_t md_map, - ucs_memory_type_t mem_type, void *rkey_buffer, - const void* uct_rkeys[]); + ucs_memory_type_t mem_type, void *buffer, + const void *uct_rkeys[]); + + +ssize_t +ucp_rkey_pack_uct(ucp_context_h context, ucp_md_map_t md_map, + const uct_mem_h *memh, const ucs_memory_info_t *mem_info, + uint64_t sys_dev_map, + const ucs_sys_dev_distance_t *sys_distance, void *buffer); + + +ucs_status_t ucp_ep_rkey_unpack_internal(ucp_ep_h ep, const void *buffer, + size_t length, ucp_rkey_h *rkey_p); + + +void ucp_rkey_dump_packed(const void *buffer, size_t length, + ucs_string_buffer_t *strb); -ssize_t ucp_rkey_pack_uct(ucp_context_h context, ucp_md_map_t md_map, - const uct_mem_h *memh, ucs_memory_type_t mem_type, - void *rkey_buffer); +void ucp_rkey_config_dump_brief(const ucp_rkey_config_key_t *rkey_config_key, + ucs_string_buffer_t *strb); -void ucp_rkey_dump_packed(const void *rkey_buffer, char *buffer, size_t max); +void ucp_rkey_proto_select_dump(ucp_worker_h worker, + ucp_worker_cfg_index_t rkey_cfg_index, + ucs_string_buffer_t *strb); #endif diff --git a/src/ucp/core/ucp_rkey.inl b/src/ucp/core/ucp_rkey.inl index 2322ba42e0f..30a0678da84 100644 --- a/src/ucp/core/ucp_rkey.inl +++ b/src/ucp/core/ucp_rkey.inl @@ -11,6 +11,25 @@ #include "ucp_worker.h" +static UCS_F_ALWAYS_INLINE khint_t +ucp_rkey_config_hash_func(ucp_rkey_config_key_t rkey_config_key) +{ + return (khint_t)rkey_config_key.md_map ^ + (rkey_config_key.ep_cfg_index << 8) ^ + (rkey_config_key.sys_dev << 16) ^ + (rkey_config_key.mem_type << 24); +} + +static UCS_F_ALWAYS_INLINE int +ucp_rkey_config_is_equal(ucp_rkey_config_key_t rkey_config_key1, + ucp_rkey_config_key_t rkey_config_key2) +{ + return (rkey_config_key1.md_map == rkey_config_key2.md_map) && + (rkey_config_key1.ep_cfg_index == rkey_config_key2.ep_cfg_index) && + (rkey_config_key1.sys_dev == rkey_config_key2.sys_dev) && + (rkey_config_key1.mem_type == rkey_config_key2.mem_type); +} + static UCS_F_ALWAYS_INLINE ucp_rkey_config_t * ucp_rkey_config(ucp_worker_h worker, ucp_rkey_h rkey) { diff --git a/src/ucp/core/ucp_types.h b/src/ucp/core/ucp_types.h index 45831753017..4a6dcb86b14 100644 --- a/src/ucp/core/ucp_types.h +++ b/src/ucp/core/ucp_types.h @@ -9,16 +9,17 @@ #include #include +#include #include #include -#define UCP_WORKER_NAME_MAX 32 /* Worker name for debugging */ -#define UCP_MIN_BCOPY 64 /* Minimal size for bcopy */ +#define UCP_WORKER_ADDRESS_NAME_MAX 32 /* Worker address name for debugging */ +#define UCP_MIN_BCOPY 64 /* Minimal size for bcopy */ #define UCP_FEATURE_AMO (UCP_FEATURE_AMO32|UCP_FEATURE_AMO64) /* Resources */ -#define UCP_MAX_RESOURCES 64 /* up to 64 only due to tl_bitmap usage */ +#define UCP_MAX_RESOURCES 128 #define UCP_NULL_RESOURCE ((ucp_rsc_index_t)-1) typedef uint8_t ucp_rsc_index_t; @@ -38,7 +39,7 @@ typedef uint8_t ucp_lane_map_t; /* Worker configuration index for endpoint and rkey */ typedef uint8_t ucp_worker_cfg_index_t; -#define UCP_WORKER_MAX_EP_CONFIG 16 +#define UCP_WORKER_MAX_EP_CONFIG 64 #define 
UCP_WORKER_MAX_RKEY_CONFIG 128 #define UCP_WORKER_CFG_INDEX_NULL UINT8_MAX @@ -61,6 +62,43 @@ typedef struct ucp_rkey_config_key ucp_rkey_config_key_t; typedef struct ucp_proto ucp_proto_t; +/** + * UCP TL bitmap + * + * Bitmap type for representing which TL resources are in use. + */ +typedef ucs_bitmap_t(UCP_MAX_RESOURCES) ucp_tl_bitmap_t; + + +/** + * Max possible value of TL bitmap (all bits are 1) + */ +extern const ucp_tl_bitmap_t ucp_tl_bitmap_max; + + +/** + * Min possible value of TL bitmap (all bits are 0) + */ +extern const ucp_tl_bitmap_t ucp_tl_bitmap_min; + + +#define UCT_TL_BITMAP_FMT "0x%lx 0x%lx" +#define UCT_TL_BITMAP_ARG(_bitmap) (_bitmap)->bits[0], (_bitmap)->bits[1] + + +/** + * Perform bitwise AND on a TL bitmap and a negation of a bitmap and return the result + * + * @param _bitmap1 First operand + * @param _bitmap2 Second operand + * + * @return A new bitmap, which is the logical AND NOT of the operands + */ +#define UCP_TL_BITMAP_AND_NOT(_bitmap1, _bitmap2) \ + UCS_BITMAP_AND(_bitmap1, UCS_BITMAP_NOT(_bitmap2, UCP_MAX_RESOURCES), \ + UCP_MAX_RESOURCES) + + /** * Operation for which protocol is selected */ @@ -69,6 +107,10 @@ typedef enum { UCP_OP_ID_TAG_SEND_SYNC, UCP_OP_ID_PUT, UCP_OP_ID_GET, + UCP_OP_ID_API_LAST, + + UCP_OP_ID_RNDV_SEND = UCP_OP_ID_API_LAST, + UCP_OP_ID_RNDV_RECV, UCP_OP_ID_LAST } ucp_operation_id_t; @@ -131,12 +173,14 @@ typedef enum { * Communication scheme in RNDV protocol. */ typedef enum { + UCP_RNDV_MODE_AUTO, /* Runtime automatically chooses optimal scheme to use */ UCP_RNDV_MODE_GET_ZCOPY, /* Use get_zcopy scheme in RNDV protocol */ UCP_RNDV_MODE_PUT_ZCOPY, /* Use put_zcopy scheme in RNDV protocol */ - UCP_RNDV_MODE_AUTO, /* Runtime automatically chooses optimal scheme to use */ + UCP_RNDV_MODE_AM, /* Use active-messages based RNDV protocol */ UCP_RNDV_MODE_LAST } ucp_rndv_mode_t; + /** * Active message tracer. */ diff --git a/src/ucp/core/ucp_worker.c b/src/ucp/core/ucp_worker.c index 1964238fb6d..22a3fb36301 100644 --- a/src/ucp/core/ucp_worker.c +++ b/src/ucp/core/ucp_worker.c @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016-2017. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
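For the ucp_types.h change above: with UCP_MAX_RESOURCES raised to 128, a TL bitmap no longer fits in one uint64_t, which is why tl_bitmap users move to the UCS_BITMAP_* macros and UCT_TL_BITMAP_FMT prints two words. A two-word toy equivalent of the basic operations; __builtin_popcountll assumes GCC/Clang:

#include <stdint.h>

typedef struct {
    uint64_t bits[2]; /* 128 resources across two words */
} toy_tl_bitmap_t;

static void toy_bitmap_set(toy_tl_bitmap_t *bm, unsigned tl_id)
{
    bm->bits[tl_id / 64] |= UINT64_C(1) << (tl_id % 64);
}

static int toy_bitmap_get(const toy_tl_bitmap_t *bm, unsigned tl_id)
{
    return (int)((bm->bits[tl_id / 64] >> (tl_id % 64)) & 1);
}

static unsigned toy_bitmap_popcount(const toy_tl_bitmap_t *bm)
{
    return (unsigned)(__builtin_popcountll(bm->bits[0]) +
                      __builtin_popcountll(bm->bits[1]));
}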
@@ -21,17 +21,21 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include +#define UCP_WORKER_KEEPALIVE_ITER_SKIP 32 + #define UCP_WORKER_HEADROOM_SIZE \ (sizeof(ucp_recv_desc_t) + UCP_WORKER_HEADROOM_PRIV_SIZE) @@ -263,12 +267,10 @@ static ucs_status_t ucp_worker_wakeup_init(ucp_worker_h worker, goto out; } - if (params->field_mask & UCP_WORKER_PARAM_FIELD_EVENTS) { - events = params->events; - } else { - events = UCP_WAKEUP_RMA | UCP_WAKEUP_AMO | UCP_WAKEUP_TAG_SEND | - UCP_WAKEUP_TAG_RECV | UCP_WAKEUP_TX | UCP_WAKEUP_RX; - } + events = UCP_PARAM_VALUE(WORKER, params, events, EVENTS, + UCP_WAKEUP_RMA | UCP_WAKEUP_AMO | + UCP_WAKEUP_TAG_SEND | UCP_WAKEUP_TAG_RECV | + UCP_WAKEUP_TX | UCP_WAKEUP_RX); if (params->field_mask & UCP_WORKER_PARAM_FIELD_EVENT_FD) { worker->flags |= UCP_WORKER_FLAG_EXTERNAL_EVENT_FD; @@ -423,51 +425,12 @@ void ucp_worker_signal_internal(ucp_worker_h worker) } } -static uct_iface_t ucp_failed_tl_iface = { - .ops = { - .ep_put_short = (uct_ep_put_short_func_t)ucs_empty_function_return_ep_timeout, - .ep_put_bcopy = (uct_ep_put_bcopy_func_t)ucs_empty_function_return_bc_ep_timeout, - .ep_put_zcopy = (uct_ep_put_zcopy_func_t)ucs_empty_function_return_ep_timeout, - .ep_get_short = (uct_ep_get_short_func_t)ucs_empty_function_return_ep_timeout, - .ep_get_bcopy = (uct_ep_get_bcopy_func_t)ucs_empty_function_return_ep_timeout, - .ep_get_zcopy = (uct_ep_get_zcopy_func_t)ucs_empty_function_return_ep_timeout, - .ep_am_short = (uct_ep_am_short_func_t)ucs_empty_function_return_ep_timeout, - .ep_am_bcopy = (uct_ep_am_bcopy_func_t)ucs_empty_function_return_bc_ep_timeout, - .ep_am_zcopy = (uct_ep_am_zcopy_func_t)ucs_empty_function_return_ep_timeout, - .ep_atomic_cswap64 = (uct_ep_atomic_cswap64_func_t)ucs_empty_function_return_ep_timeout, - .ep_atomic_cswap32 = (uct_ep_atomic_cswap32_func_t)ucs_empty_function_return_ep_timeout, - .ep_atomic64_post = (uct_ep_atomic64_post_func_t)ucs_empty_function_return_ep_timeout, - .ep_atomic32_post = (uct_ep_atomic32_post_func_t)ucs_empty_function_return_ep_timeout, - .ep_atomic64_fetch = (uct_ep_atomic64_fetch_func_t)ucs_empty_function_return_ep_timeout, - .ep_atomic32_fetch = (uct_ep_atomic32_fetch_func_t)ucs_empty_function_return_ep_timeout, - .ep_tag_eager_short = (uct_ep_tag_eager_short_func_t)ucs_empty_function_return_ep_timeout, - .ep_tag_eager_bcopy = (uct_ep_tag_eager_bcopy_func_t)ucs_empty_function_return_ep_timeout, - .ep_tag_eager_zcopy = (uct_ep_tag_eager_zcopy_func_t)ucs_empty_function_return_ep_timeout, - .ep_tag_rndv_zcopy = (uct_ep_tag_rndv_zcopy_func_t)ucs_empty_function_return_ep_timeout, - .ep_tag_rndv_cancel = (uct_ep_tag_rndv_cancel_func_t)ucs_empty_function_return_ep_timeout, - .ep_tag_rndv_request = (uct_ep_tag_rndv_request_func_t)ucs_empty_function_return_ep_timeout, - .ep_pending_add = (uct_ep_pending_add_func_t)ucs_empty_function_return_busy, - .ep_pending_purge = (uct_ep_pending_purge_func_t)ucs_empty_function_return_success, - .ep_flush = (uct_ep_flush_func_t)ucs_empty_function_return_ep_timeout, - .ep_fence = (uct_ep_fence_func_t)ucs_empty_function_return_ep_timeout, - .ep_check = (uct_ep_check_func_t)ucs_empty_function_return_ep_timeout, - .ep_connect_to_ep = (uct_ep_connect_to_ep_func_t)ucs_empty_function_return_ep_timeout, - .ep_destroy = (uct_ep_destroy_func_t)ucs_empty_function, - .ep_get_address = (uct_ep_get_address_func_t)ucs_empty_function_return_ep_timeout - } -}; - -static uct_ep_t ucp_failed_tl_ep = { - .iface = &ucp_failed_tl_iface 
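The wakeup-events change in this hunk collapses the usual field_mask test into a single UCP_PARAM_VALUE expression. The underlying pattern, as a self-contained stand-in with hypothetical toy_* names:

#include <stdint.h>

#define TOY_FIELD_EVENTS UINT64_C(0x1)

typedef struct {
    uint64_t field_mask;
    unsigned events;
} toy_params_t;

/* take the caller's field only when its mask bit is set */
static unsigned toy_events_value(const toy_params_t *params,
                                 unsigned default_events)
{
    return (params->field_mask & TOY_FIELD_EVENTS) ? params->events :
                                                     default_events;
}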
-};
 
 static unsigned ucp_worker_iface_err_handle_progress(void *arg)
 {
     ucp_worker_err_handle_arg_t *err_handle_arg = arg;
     ucp_ep_h ucp_ep                             = err_handle_arg->ucp_ep;
     ucs_status_t status                         = err_handle_arg->status;
     ucp_worker_h worker                         = ucp_ep->worker;
-    ucp_lane_index_t lane;
     ucp_request_t *close_req;
 
     UCS_ASYNC_BLOCK(&worker->async);
@@ -476,22 +439,11 @@ static unsigned ucp_worker_iface_err_handle_progress(void *arg)
 
     ucs_assert(ucp_ep->flags & UCP_EP_FLAG_FAILED);
 
-    for (lane = 0; lane < ucp_ep_num_lanes(ucp_ep); ++lane) {
-        if (ucp_ep->uct_eps[lane] == NULL) {
-            continue;
-        }
-
-        ucs_trace("ep %p: discard uct_ep[%d]=%p", ucp_ep, lane,
-                  ucp_ep->uct_eps[lane]);
-        ucp_worker_discard_uct_ep(ucp_ep->worker, ucp_ep->uct_eps[lane],
-                                  UCT_FLUSH_FLAG_CANCEL,
-                                  ucp_ep_err_pending_purge,
-                                  UCS_STATUS_PTR(status));
-        ucp_ep->uct_eps[lane] = &ucp_failed_tl_ep;
-    }
-
+    ucp_ep_discard_lanes(ucp_ep, status);
+    ucp_ep_reqs_purge(ucp_ep, status);
     ucp_stream_ep_cleanup(ucp_ep);
-    if (ucp_ep->flags & UCP_EP_FLAG_USED) {
+
+    if (ucp_ep->flags & UCP_EP_FLAG_USED) {
         if (ucp_ep->flags & UCP_EP_FLAG_CLOSE_REQ_VALID) {
             ucs_assert(ucp_ep->flags & UCP_EP_FLAG_CLOSED);
             /* Promote close operation to CANCEL in case of transport error,
@@ -503,9 +455,16 @@ static unsigned ucp_worker_iface_err_handle_progress(void *arg)
         } else {
             ucp_ep_invoke_err_cb(ucp_ep, status);
         }
-    } else {
-        ucs_debug("ep %p: destroy internal endpoint due to peer failure", ucp_ep);
+    } else if (!(ucp_ep->flags & UCP_EP_FLAG_INTERNAL)) {
+        ucs_debug("ep %p: destroy endpoint which is not exposed to a user due"
+                  " to peer failure", ucp_ep);
         ucp_ep_disconnected(ucp_ep, 1);
+    } else {
+        /* No additional actions are required: this is an internal EP created
+         * for sending a WIREUP/EP_REMOVED message to a peer, so the close
+         * operation was already scheduled, and this EP will be deleted after
+         * all lanes are discarded successfully */
+        ucs_debug("ep %p: detected peer failure on internal endpoint", ucp_ep);
     }
 
     ucs_free(err_handle_arg);
@@ -535,12 +494,70 @@ ucs_status_t ucp_worker_set_ep_failed(ucp_worker_h worker, ucp_ep_h ucp_ep,
                                       uct_ep_h uct_ep, ucp_lane_index_t lane,
                                       ucs_status_t status)
 {
-    uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL;
-    ucs_status_t ret_status    = UCS_OK;
-    ucp_rsc_index_t rsc_index;
-    uct_tl_resource_desc_t *tl_rsc;
+    uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL;
+    ucs_status_t ret_status    = UCS_OK;
+    char lane_info_str[64];
+    ucp_rsc_index_t rsc_index;
+    uct_tl_resource_desc_t *tl_rsc;
     ucp_worker_err_handle_arg_t *err_handle_arg;
-    ucs_log_level_t log_level;
+    ucs_log_level_t log_level;
+
+    ucs_debug("ep %p: set_ep_failed status %s on lane[%d]=%p", ucp_ep,
+              ucs_status_string(status), lane, uct_ep);
 
     /* In case if this is a local failure we need to notify remote side */
     if (ucp_ep_is_cm_local_connected(ucp_ep)) {
@@ -552,7 +515,9 @@ ucs_status_t ucp_worker_set_ep_failed(ucp_worker_h worker, ucp_ep_h ucp_ep,
         goto out_ok;
     }
 
-    ucp_ep->flags |= UCP_EP_FLAG_FAILED;
+    /* Release the EP ID here to prevent protocols from sending a reply */
+    ucp_ep_release_id(ucp_ep);
+    ucp_ep_update_flags(ucp_ep, UCP_EP_FLAG_FAILED, 0);
 
     if (ucp_ep_config(ucp_ep)->key.err_mode == UCP_ERR_HANDLING_MODE_NONE) {
         /* NOTE: if user has not requested error handling on the endpoint,
@@ -585,18 +550,26 @@ ucs_status_t ucp_worker_set_ep_failed(ucp_worker_h worker, ucp_ep_h ucp_ep,
                     UCS_LOG_LEVEL_ERROR;
 
         if (lane != UCP_NULL_LANE) {
-            rsc_index = ucp_ep_get_rsc_index(ucp_ep, lane);
-            tl_rsc    = &worker->context->tl_rscs[rsc_index].tl_rsc;
-            ucs_log(log_level, "error '%s' will not be handled for ep %p - "
-                    UCT_TL_RESOURCE_DESC_FMT " since no error callback is installed",
-                    ucs_status_string(status), ucp_ep,
-                    UCT_TL_RESOURCE_DESC_ARG(tl_rsc));
+            if (lane == ucp_ep_get_cm_lane(ucp_ep)) {
+                ucs_strncpy_safe(lane_info_str, "CM lane",
+                                 sizeof(lane_info_str));
+            } else {
+                rsc_index = ucp_ep_get_rsc_index(ucp_ep, lane);
+                tl_rsc    = &worker->context->tl_rscs[rsc_index].tl_rsc;
+
+                ucs_snprintf_safe(lane_info_str, sizeof(lane_info_str),
+                                  UCT_TL_RESOURCE_DESC_FMT,
+                                  UCT_TL_RESOURCE_DESC_ARG(tl_rsc));
+            }
         } else {
             ucs_assert(uct_ep == NULL);
-            ucs_log(log_level, "error '%s' occurred on wireup will not be "
-                    "handled for ep %p since no error callback is installed",
-                    ucs_status_string(status), ucp_ep);
+            ucs_strncpy_safe(lane_info_str, "wireup lane",
+                             sizeof(lane_info_str));
         }
+
+        ucs_log(log_level, "ep %p: error '%s' on %s will not be handled since"
+                " no error callback is installed",
+                ucp_ep, ucs_status_string(status), lane_info_str);
         ret_status = status;
         goto out;
     }
@@ -612,13 +585,70 @@ ucs_status_t ucp_worker_set_ep_failed(ucp_worker_h worker, ucp_ep_h ucp_ep,
     return ret_status;
 }
 
+static ucs_status_t
+ucp_worker_iface_handle_uct_ep_failure(ucp_ep_h ucp_ep, ucp_lane_index_t lane,
+                                       uct_ep_h uct_ep, ucs_status_t status)
+{
+    ucp_worker_h worker = ucp_ep->worker;
+    ucp_wireup_ep_t *wireup_ep;
+
+    if (ucp_ep->flags & UCP_EP_FLAG_FAILED) {
+        return UCS_OK;
+    }
+
+    wireup_ep = ucp_wireup_ep(ucp_ep->uct_eps[lane]);
+    if ((wireup_ep == NULL) ||
+        !ucp_wireup_aux_ep_is_owner(wireup_ep, uct_ep) ||
+        !(ucp_ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED)) {
+        /* Failure on a non-AUX EP, or failure on an AUX EP before it sent its
+         * address, means failure on the UCP EP */
+        return ucp_worker_set_ep_failed(worker, ucp_ep, uct_ep, lane, status);
+    }
+
+    if (wireup_ep->flags & UCP_WIREUP_EP_FLAG_READY) {
+        /* @ref ucp_wireup_ep_progress was scheduled, wireup ep and its
+         * pending requests have to be handled there */
+        return UCS_OK;
+    }
+
+    /**
+     * A failure on the AUX EP after the remote address was received, but
+     * before the ACK was received, implies that the remote EP is already
+     * connected and has destroyed its wireup/AUX EP. If the remote EP is
+     * dead, this will be detected by send operations or keepalive (KA).
+ */ + ucp_wireup_ep_discard_aux_ep(wireup_ep, UCT_FLUSH_FLAG_CANCEL, + ucp_destroyed_ep_pending_purge, ucp_ep); + ucp_wireup_remote_connected(ucp_ep); + return UCS_OK; +} + +static ucp_ep_h ucp_worker_find_lane(ucs_list_link_t *ep_list, uct_ep_h uct_ep, + ucp_lane_index_t *lane_p) +{ + ucp_ep_ext_gen_t *ep_ext; + ucp_ep_h ucp_ep; + ucp_lane_index_t lane; + + /* TODO: need to optimize uct_ep -> ucp_ep lookup */ + ucs_list_for_each(ep_ext, ep_list, ep_list) { + ucp_ep = ucp_ep_from_ext_gen(ep_ext); + lane = ucp_ep_lookup_lane(ucp_ep, uct_ep); + if (lane != UCP_NULL_LANE) { + *lane_p = lane; + return ucp_ep; + } + } + + return NULL; +} + static ucs_status_t ucp_worker_iface_error_handler(void *arg, uct_ep_h uct_ep, ucs_status_t status) { ucp_worker_h worker = (ucp_worker_h)arg; ucp_lane_index_t lane; ucs_status_t ret_status; - ucp_ep_ext_gen_t *ep_ext; ucp_ep_h ucp_ep; UCS_ASYNC_BLOCK(&worker->async); @@ -633,23 +663,20 @@ ucp_worker_iface_error_handler(void *arg, uct_ep_h uct_ep, ucs_status_t status) goto out; } - /* TODO: need to optimize uct_ep -> ucp_ep lookup */ - ucs_list_for_each(ep_ext, &worker->all_eps, ep_list) { - ucp_ep = ucp_ep_from_ext_gen(ep_ext); - for (lane = 0; lane < ucp_ep_num_lanes(ucp_ep); ++lane) { - if ((uct_ep == ucp_ep->uct_eps[lane]) || - ucp_wireup_ep_is_owner(ucp_ep->uct_eps[lane], uct_ep)) { - ret_status = ucp_worker_set_ep_failed(worker, ucp_ep, uct_ep, - lane, status); - goto out; - } - } + ucp_ep = ucp_worker_find_lane(&worker->all_eps, uct_ep, &lane); + if (ucp_ep == NULL) { + ucp_ep = ucp_worker_find_lane(&worker->internal_eps, uct_ep, &lane); } - ucs_error("UCT EP %p isn't associated with UCP EP and was not scheduled " - "to be discarded on UCP Worker %p", - uct_ep, worker); - ret_status = UCS_ERR_NO_ELEM; + if (ucp_ep != NULL) { + ret_status = ucp_worker_iface_handle_uct_ep_failure(ucp_ep, lane, + uct_ep, status); + } else { + ucs_error("worker %p: uct_ep %p isn't associated with any ucp endpoint" + " and was not scheduled to be discarded", + worker, uct_ep); + ret_status = UCS_ERR_NO_ELEM; + } out: UCS_ASYNC_UNBLOCK(&worker->async); @@ -966,11 +993,11 @@ static int ucp_worker_iface_find_better(ucp_worker_h worker, * * @return Error code as defined by @ref ucs_status_t */ -static void ucp_worker_select_best_ifaces(ucp_worker_h worker, - uint64_t *tl_bitmap_p) +static void +ucp_worker_select_best_ifaces(ucp_worker_h worker, ucp_tl_bitmap_t *tl_bitmap_p) { - ucp_context_h context = worker->context; - uint64_t tl_bitmap = 0; + ucp_context_h context = worker->context; + ucp_tl_bitmap_t tl_bitmap = UCS_BITMAP_ZERO; ucp_rsc_index_t repl_ifaces[UCP_MAX_RESOURCES]; ucp_worker_iface_t *wiface; ucp_rsc_index_t tl_id, iface_id; @@ -982,12 +1009,12 @@ static void ucp_worker_select_best_ifaces(ucp_worker_h worker, for (tl_id = 0; tl_id < context->num_tls; ++tl_id) { wiface = worker->ifaces[tl_id]; if (!ucp_worker_iface_find_better(worker, wiface, &repl_ifaces[tl_id])) { - tl_bitmap |= UCS_BIT(tl_id); + UCS_BITMAP_SET(tl_bitmap, tl_id); } } *tl_bitmap_p = tl_bitmap; - worker->num_ifaces = ucs_popcount(tl_bitmap); + worker->num_ifaces = UCS_BITMAP_POPCOUNT(tl_bitmap); ucs_assert(worker->num_ifaces <= context->num_tls); if (worker->num_ifaces == context->num_tls) { @@ -999,12 +1026,13 @@ static void ucp_worker_select_best_ifaces(ucp_worker_h worker, /* Some ifaces need to be closed */ for (tl_id = 0, iface_id = 0; tl_id < context->num_tls; ++tl_id) { wiface = worker->ifaces[tl_id]; - if (tl_bitmap & UCS_BIT(tl_id)) { + if (UCS_BITMAP_GET(tl_bitmap, tl_id)) { if (iface_id 
!= tl_id) { worker->ifaces[iface_id] = wiface; } ++iface_id; } else { + /* coverity[overrun-local] */ ucs_debug("closing resource[%d] "UCT_TL_RESOURCE_DESC_FMT ", since resource[%d] "UCT_TL_RESOURCE_DESC_FMT " is better, worker %p", @@ -1040,19 +1068,19 @@ static ucs_status_t ucp_worker_add_resource_ifaces(ucp_worker_h worker) uct_iface_params_t iface_params; ucp_rsc_index_t tl_id, iface_id; ucp_worker_iface_t *wiface; - uint64_t ctx_tl_bitmap, tl_bitmap; + ucp_tl_bitmap_t ctx_tl_bitmap, tl_bitmap; unsigned num_ifaces; ucs_status_t status; /* If tl_bitmap is already set, just use it. Otherwise open ifaces on all * available resources and then select the best ones. */ ctx_tl_bitmap = context->tl_bitmap; - if (ctx_tl_bitmap) { - num_ifaces = ucs_popcount(ctx_tl_bitmap); + if (!UCS_BITMAP_IS_ZERO_INPLACE(&ctx_tl_bitmap)) { + num_ifaces = UCS_BITMAP_POPCOUNT(ctx_tl_bitmap); tl_bitmap = ctx_tl_bitmap; } else { num_ifaces = context->num_tls; - tl_bitmap = UCS_MASK(context->num_tls); + UCS_BITMAP_MASK(&tl_bitmap, context->num_tls); } worker->ifaces = ucs_calloc(num_ifaces, sizeof(*worker->ifaces), @@ -1066,7 +1094,7 @@ static ucs_status_t ucp_worker_add_resource_ifaces(ucp_worker_h worker) worker->num_ifaces = num_ifaces; iface_id = 0; - ucs_for_each_bit(tl_id, tl_bitmap) { + UCS_BITMAP_FOR_EACH_BIT(tl_bitmap, tl_id) { iface_params.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE; resource = &context->tl_rscs[tl_id]; @@ -1086,36 +1114,37 @@ static ucs_status_t ucp_worker_add_resource_ifaces(ucp_worker_h worker) } } - if (!ctx_tl_bitmap) { + if (UCS_BITMAP_IS_ZERO_INPLACE(&ctx_tl_bitmap)) { /* Context bitmap is not set, need to select the best tl resources */ - tl_bitmap = 0; + UCS_BITMAP_CLEAR(&tl_bitmap); ucp_worker_select_best_ifaces(worker, &tl_bitmap); - ucs_assert(tl_bitmap); + ucs_assert(!UCS_BITMAP_IS_ZERO_INPLACE(&tl_bitmap)); /* Cache tl_bitmap on the context, so the next workers would not need * to select best ifaces. 
*/ context->tl_bitmap = tl_bitmap; - ucs_debug("selected tl bitmap: 0x%"PRIx64" (%d tls)", - tl_bitmap, ucs_popcount(tl_bitmap)); + ucs_debug("selected tl bitmap: " UCT_TL_BITMAP_FMT "(%zu tls)", + UCT_TL_BITMAP_ARG(&tl_bitmap), + UCS_BITMAP_POPCOUNT(tl_bitmap)); } - worker->scalable_tl_bitmap = 0; - ucs_for_each_bit(tl_id, context->tl_bitmap) { + UCS_BITMAP_CLEAR(&worker->scalable_tl_bitmap); + UCS_BITMAP_FOR_EACH_BIT(context->tl_bitmap, tl_id) { ucs_assert(ucp_worker_is_tl_p2p(worker, tl_id) || ucp_worker_is_tl_2iface(worker, tl_id) || ucp_worker_is_tl_2sockaddr(worker, tl_id)); wiface = ucp_worker_iface(worker, tl_id); if (ucp_is_scalable_transport(context, wiface->attr.max_num_eps)) { - worker->scalable_tl_bitmap |= UCS_BIT(tl_id); + UCS_BITMAP_SET(worker->scalable_tl_bitmap, tl_id); } } - ucs_debug("selected scalable tl bitmap: 0x%"PRIx64" (%d tls)", - worker->scalable_tl_bitmap, - ucs_popcount(worker->scalable_tl_bitmap)); + ucs_debug("selected scalable tl bitmap: " UCT_TL_BITMAP_FMT " (%zu tls)", + UCT_TL_BITMAP_ARG(&worker->scalable_tl_bitmap), + UCS_BITMAP_POPCOUNT(worker->scalable_tl_bitmap)); iface_id = 0; - ucs_for_each_bit(tl_id, tl_bitmap) { + UCS_BITMAP_FOR_EACH_BIT(tl_bitmap, tl_id) { status = ucp_worker_iface_init(worker, tl_id, worker->ifaces[iface_id++]); if (status != UCS_OK) { @@ -1161,6 +1190,31 @@ static void ucp_worker_close_ifaces(ucp_worker_h worker) UCS_ASYNC_UNBLOCK(&worker->async); } +static ucs_status_t +ucp_worker_get_sys_device_distance(ucp_context_h context, + ucp_rsc_index_t rsc_index, + ucs_sys_dev_distance_t *distance) +{ + ucs_sys_device_t device = UCS_SYS_DEVICE_ID_UNKNOWN; + ucs_sys_device_t cmp_device = UCS_SYS_DEVICE_ID_UNKNOWN; + ucp_rsc_index_t md_index, i; + + for (i = 0; i < context->num_tls; i++) { + md_index = context->tl_rscs[i].md_index; + if (strcmp(context->tl_mds[md_index].rsc.md_name, + context->config.selection_cmp)) { + continue; + } + + device = context->tl_rscs[rsc_index].tl_rsc.sys_device; + cmp_device = context->tl_rscs[i].tl_rsc.sys_device; + + return ucs_topo_get_distance(device, cmp_device, distance); + } + + return UCS_ERR_NO_RESOURCE; +} + ucs_status_t ucp_worker_iface_open(ucp_worker_h worker, ucp_rsc_index_t tl_id, uct_iface_params_t *iface_params, ucp_worker_iface_t **wiface_p) @@ -1168,6 +1222,7 @@ ucs_status_t ucp_worker_iface_open(ucp_worker_h worker, ucp_rsc_index_t tl_id, ucp_context_h context = worker->context; ucp_tl_resource_desc_t *resource = &context->tl_rscs[tl_id]; uct_md_h md = context->tl_mds[resource->md_index].md; + ucs_sys_dev_distance_t distance = {.latency = 0, .bandwidth = 0}; uct_iface_config_t *iface_config; const char *cfg_tl_name; ucp_worker_iface_t *wiface; @@ -1229,6 +1284,11 @@ ucs_status_t ucp_worker_iface_open(ucp_worker_h worker, ucp_rsc_index_t tl_id, iface_params->field_mask |= UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_ARG | UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_CB; + if (ucp_worker_keepalive_is_enabled(worker)) { + iface_params->field_mask |= UCT_IFACE_PARAM_FIELD_KEEPALIVE_INTERVAL; + iface_params->keepalive_interval = context->config.keepalive_interval; + } + /* Open UCT interface */ status = uct_iface_open(md, worker->uct, iface_params, iface_config, &wiface->iface); @@ -1245,6 +1305,16 @@ ucs_status_t ucp_worker_iface_open(ucp_worker_h worker, ucp_rsc_index_t tl_id, goto err_close_iface; } + status = ucp_worker_get_sys_device_distance(context, wiface->rsc_index, + &distance); + if (status == UCS_OK) { + wiface->attr.latency.c += distance.latency; + wiface->attr.bandwidth.shared = + 
ucs_min(wiface->attr.bandwidth.shared, distance.bandwidth); + wiface->attr.bandwidth.dedicated = + ucs_min(wiface->attr.bandwidth.dedicated, distance.bandwidth); + } + ucs_debug("created interface[%d]=%p using "UCT_TL_RESOURCE_DESC_FMT" on worker %p", tl_id, wiface->iface, UCT_TL_RESOURCE_DESC_ARG(&resource->tl_rsc), worker); @@ -1332,7 +1402,7 @@ ucs_status_t ucp_worker_iface_init(ucp_worker_h worker, ucp_rsc_index_t tl_id, ucs_for_each_bit(mem_type_index, context->tl_mds[resource->md_index].attr.cap.access_mem_types) { ucs_assert(mem_type_index < UCS_MEMORY_TYPE_LAST); - context->mem_type_access_tls[mem_type_index] |= UCS_BIT(tl_id); + UCS_BITMAP_SET(context->mem_type_access_tls[mem_type_index], tl_id); } return UCS_OK; @@ -1374,7 +1444,7 @@ static ucs_status_t ucp_worker_add_resource_cms(ucp_worker_h worker) ucp_rsc_index_t cmpt_index, cm_cmpt_index, i; ucs_status_t status; - if (!ucp_worker_sockaddr_is_cm_proto(worker)) { + if (ucp_worker_num_cm_cmpts(worker) == 0) { worker->cms = NULL; return UCS_OK; } @@ -1402,11 +1472,12 @@ static ucs_status_t ucp_worker_add_resource_cms(ucp_worker_h worker) } status = uct_cm_open(cmpt, worker->uct, cm_config, &worker->cms[i].cm); + uct_config_release(cm_config); if (status != UCS_OK) { - ucs_error("failed to open CM on component %s with status %s", - context->tl_cmpts[cmpt_index].attr.name, - ucs_status_string(status)); - goto err_free_cms; + ucs_diag("failed to open CM on component %s with status %s", + context->tl_cmpts[cmpt_index].attr.name, + ucs_status_string(status)); + continue; } worker->cms[i].attr.field_mask = UCT_CM_ATTR_FIELD_MAX_CONN_PRIV; @@ -1419,7 +1490,6 @@ static ucs_status_t ucp_worker_add_resource_cms(ucp_worker_h worker) goto err_free_cms; } - uct_config_release(cm_config); worker->cms[i++].cmpt_idx = cmpt_index; } @@ -1440,7 +1510,7 @@ static void ucp_worker_enable_atomic_tl(ucp_worker_h worker, const char *mode, ucs_trace("worker %p: using %s atomics on iface[%d]=" UCT_TL_RESOURCE_DESC_FMT, worker, mode, rsc_index, UCT_TL_RESOURCE_DESC_ARG(&worker->context->tl_rscs[rsc_index].tl_rsc)); - worker->atomic_tls |= UCS_BIT(rsc_index); + UCS_BITMAP_SET(worker->atomic_tls, rsc_index); } static void ucp_worker_init_cpu_atomics(ucp_worker_h worker) @@ -1461,7 +1531,8 @@ static void ucp_worker_init_cpu_atomics(ucp_worker_h worker) static void ucp_worker_init_device_atomics(ucp_worker_h worker) { - ucp_context_h context = worker->context; + ucp_context_h context = worker->context; + ucp_tl_bitmap_t supp_tls = UCS_BITMAP_ZERO; ucp_address_iface_attr_t dummy_iface_attr; ucp_tl_resource_desc_t *rsc, *best_rsc; uct_iface_attr_t *iface_attr; @@ -1472,7 +1543,6 @@ static void ucp_worker_init_device_atomics(ucp_worker_h worker) ucp_md_index_t md_index; ucp_worker_iface_t *wiface; uct_md_attr_t *md_attr; - uint64_t supp_tls; uint8_t priority, best_priority; ucp_tl_iface_atomic_flags_t atomic; @@ -1487,7 +1557,6 @@ static void ucp_worker_init_device_atomics(ucp_worker_h worker) dummy_iface_attr.priority = 0; dummy_iface_attr.lat_ovh = 0; - supp_tls = 0; best_score = -1; best_rsc = NULL; best_priority = 0; @@ -1511,7 +1580,7 @@ static void ucp_worker_init_device_atomics(ucp_worker_h worker) continue; } - supp_tls |= UCS_BIT(rsc_index); + UCS_BITMAP_SET(supp_tls, rsc_index); priority = iface_attr->priority; score = ucp_wireup_amo_score_func(context, md_attr, iface_attr, @@ -1535,13 +1604,12 @@ static void ucp_worker_init_device_atomics(ucp_worker_h worker) ucs_debug("worker %p: using device atomics", worker); /* Enable atomics on all resources 
using same device as the "best" resource */ - ucs_for_each_bit(rsc_index, context->tl_bitmap) { + UCS_BITMAP_FOR_EACH_BIT(context->tl_bitmap, rsc_index) { rsc = &context->tl_rscs[rsc_index]; - if ((supp_tls & UCS_BIT(rsc_index)) && + if (UCS_BITMAP_GET(supp_tls, rsc_index) && (rsc->md_index == best_rsc->md_index) && !strncmp(rsc->tl_rsc.dev_name, best_rsc->tl_rsc.dev_name, - UCT_DEVICE_NAME_MAX)) - { + UCT_DEVICE_NAME_MAX)) { ucp_worker_enable_atomic_tl(worker, "device", rsc_index); } } @@ -1570,7 +1638,7 @@ static void ucp_worker_init_atomic_tls(ucp_worker_h worker) { ucp_context_h context = worker->context; - worker->atomic_tls = 0; + UCS_BITMAP_CLEAR(&worker->atomic_tls); if (context->config.features & UCP_FEATURE_AMO) { switch(context->config.ext.atomic_mode) { @@ -1624,33 +1692,41 @@ static char* ucp_worker_add_feature_rsc(ucp_context_h context, return p; } -static void ucp_worker_print_used_tls(const ucp_ep_config_key_t *key, - ucp_context_h context, - ucp_worker_cfg_index_t config_idx) +char *ucp_worker_print_used_tls(const ucp_ep_config_key_t *key, + ucp_context_h context, + ucp_worker_cfg_index_t config_idx, char *info, + size_t max) { - char info[256] = {0}; ucp_lane_map_t tag_lanes_map = 0; ucp_lane_map_t rma_lanes_map = 0; ucp_lane_map_t amo_lanes_map = 0; ucp_lane_map_t stream_lanes_map = 0; + ucp_lane_map_t am_lanes_map = 0; ucp_lane_index_t lane; char *p, *endp; - if (!ucs_log_is_enabled(UCS_LOG_LEVEL_INFO)) { - return; - } - p = info; - endp = p + sizeof(info); + endp = p + max; snprintf(p, endp - p, "ep_cfg[%d]: ", config_idx); p += strlen(p); for (lane = 0; lane < key->num_lanes; ++lane) { - if (((key->am_lane == lane) || (lane == key->tag_lane) || + if ((key->am_lane == lane) || (ucp_ep_config_get_multi_lane_prio(key->am_bw_lanes, lane) >= 0) || - (ucp_ep_config_get_multi_lane_prio(key->rma_bw_lanes, lane) >= 0)) && - (context->config.features & UCP_FEATURE_TAG)) { + (ucp_ep_config_get_multi_lane_prio(key->rma_bw_lanes, lane) >= 0)) { + if (context->config.features & UCP_FEATURE_TAG) { + tag_lanes_map |= UCS_BIT(lane); + } + + if (context->config.features & UCP_FEATURE_AM) { + am_lanes_map |= UCS_BIT(lane); + } + } + + if (key->tag_lane == lane) { + /* tag_lane is initialized if TAG feature is requested */ + ucs_assert(context->config.features & UCP_FEATURE_TAG); tag_lanes_map |= UCS_BIT(lane); } @@ -1674,9 +1750,12 @@ static void ucp_worker_print_used_tls(const ucp_ep_config_key_t *key, p, endp - p); p = ucp_worker_add_feature_rsc(context, key, amo_lanes_map, "amo", p, endp - p); + p = ucp_worker_add_feature_rsc(context, key, am_lanes_map, "am", + p, endp - p); ucp_worker_add_feature_rsc(context, key, stream_lanes_map, "stream", p, endp - p); - ucs_info("%s", info); + + return info; } static ucs_status_t ucp_worker_init_mpools(ucp_worker_h worker) @@ -1763,7 +1842,8 @@ static void ucp_worker_destroy_mpools(ucp_worker_h worker) ucs_mpool_cleanup(&worker->reg_mp, 1); ucs_mpool_cleanup(&worker->am_mp, 1); ucs_mpool_cleanup(&worker->rkey_mp, 1); - ucs_mpool_cleanup(&worker->req_mp, 1); + ucs_mpool_cleanup(&worker->req_mp, + !(worker->flags & UCP_WORKER_FLAG_IGNORE_REQUEST_LEAK)); } /* All the ucp endpoints will share the configurations. 
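
In the hunk above, ucp_worker_print_used_tls no longer checks ucs_log_is_enabled and logs internally; it now fills a caller-supplied buffer and returns it, so the caller chooses the log level and destination (ucs_info at the call site in ucp_worker_get_ep_config). A minimal standalone sketch of this fill-and-return convention, with hypothetical names:

#include <stdio.h>

/* fill a caller-provided buffer and return it, so the call composes in-line */
static char *describe_config(int cfg_index, char *info, size_t max)
{
    snprintf(info, max, "ep_cfg[%d]: rma(rc_x/0) am(rc_x/0)", cfg_index);
    return info;
}

int main(void)
{
    char buf[256];

    /* the caller decides where the text goes, here stdout instead of a logger */
    puts(describe_config(0, buf, sizeof(buf)));
    return 0;
}
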
No need for every ep to @@ -1778,8 +1858,11 @@ ucp_worker_get_ep_config(ucp_worker_h worker, const ucp_ep_config_key_t *key, { ucp_context_h context = worker->context; ucp_worker_cfg_index_t ep_cfg_index; + ucp_proto_select_short_t tag_short; ucp_ep_config_t *ep_config; + ucp_memtype_thresh_t *max_eager_short; ucs_status_t status; + char tl_info[256]; /* Search for the given key in the ep_config array */ for (ep_cfg_index = 0; ep_cfg_index < worker->ep_config_count; @@ -1803,25 +1886,57 @@ ucp_worker_get_ep_config(ucp_worker_h worker, const ucp_ep_config_key_t *key, return status; } - ++worker->ep_config_count; + if (context->config.ext.proto_enable) { + if (context->config.features & UCP_FEATURE_TAG) { + /* Set threshold for short send */ + ucp_proto_select_short_init(worker, &ep_config->proto_select, + ep_cfg_index, UCP_WORKER_CFG_INDEX_NULL, + UCP_OP_ID_TAG_SEND, 0, + ucp_ep_config_key_has_tag_lane(key) ? + UCP_PROTO_FLAG_TAG_SHORT : + UCP_PROTO_FLAG_AM_SHORT, + &tag_short); + /* short protocol should be either disabled, or use key->am_lane */ + ucs_assert((tag_short.max_length_host_mem < 0) || + (tag_short.lane == key->am_lane)); + } else { + ucp_proto_select_short_disable(&tag_short); + } + + /* TODO replace ep_config->tag.max_eager_short by this struct */ + max_eager_short = ucp_ep_config_key_has_tag_lane(key) ? + &ep_config->tag.max_eager_short : + &ep_config->tag.offload.max_eager_short; + + max_eager_short->memtype_off = tag_short.max_length_unknown_mem; + max_eager_short->memtype_on = tag_short.max_length_host_mem; + } if (print_cfg) { - ucp_worker_print_used_tls(key, context, ep_cfg_index); + ucs_info("%s", ucp_worker_print_used_tls(key, context, ep_cfg_index, + tl_info, sizeof(tl_info))); } + ++worker->ep_config_count; + out: *cfg_index_p = ep_cfg_index; return UCS_OK; } ucs_status_t -ucp_worker_add_rkey_config(ucp_worker_h worker, const ucp_rkey_config_key_t *key, +ucp_worker_add_rkey_config(ucp_worker_h worker, + const ucp_rkey_config_key_t *key, + const ucs_sys_dev_distance_t *lanes_distance, ucp_worker_cfg_index_t *cfg_index_p) { + const ucp_ep_config_t *ep_config = &worker->ep_config[key->ep_cfg_index]; ucp_worker_cfg_index_t rkey_cfg_index; ucp_rkey_config_t *rkey_config; + ucp_lane_index_t lane; ucs_status_t status; khiter_t khiter; + char buf[128]; int khret; ucs_assert(worker->context->config.ext.proto_enable); @@ -1833,15 +1948,44 @@ ucp_worker_add_rkey_config(ucp_worker_h worker, const ucp_rkey_config_key_t *key goto err; } - /* initialize rkey configuration */ + ucs_assert((key->sys_dev == UCS_SYS_DEVICE_ID_UNKNOWN) || + (lanes_distance != NULL)); + + /* Initialize rkey configuration */ rkey_cfg_index = worker->rkey_config_count; rkey_config = &worker->rkey_config[rkey_cfg_index]; rkey_config->key = *key; - status = ucp_proto_select_init(&rkey_config->proto_select); + + /* Copy remote-memory distance of each lane to rkey config */ + for (lane = 0; lane < ep_config->key.num_lanes; ++lane) { + if (key->sys_dev == UCS_SYS_DEVICE_ID_UNKNOWN) { + rkey_config->lanes_distance[lane] = ucs_topo_default_distance; + } else { + rkey_config->lanes_distance[lane] = lanes_distance[lane]; + } + ucs_trace("rkey_config[%d] lane [%d] distance %s", rkey_cfg_index, lane, + ucs_topo_distance_str(&rkey_config->lanes_distance[lane], buf, + sizeof(buf))); + } + + /* Initialize protocol selection */ + status = ucp_proto_select_init(&rkey_config->proto_select); if (status != UCS_OK) { goto err; } + /* Set threshold for short put */ + if (worker->context->config.features & UCP_FEATURE_RMA) { + 
ucp_proto_select_short_init(worker, &rkey_config->proto_select, + key->ep_cfg_index, rkey_cfg_index, + UCP_OP_ID_PUT, UCP_OP_ATTR_FLAG_FAST_CMPL, + UCP_PROTO_FLAG_PUT_SHORT, + &rkey_config->put_short); + } else { + ucp_proto_select_short_disable(&rkey_config->put_short); + } + + /* Save key-to-index lookup */ khiter = kh_put(ucp_worker_rkey_config, &worker->rkey_config_hash, *key, &khret); if (khret == UCS_KH_PUT_FAILED) { @@ -1849,9 +1993,8 @@ ucp_worker_add_rkey_config(ucp_worker_h worker, const ucp_rkey_config_key_t *key goto err_proto_cleanup; } - /* we should not get into this function if key already exists */ + /* We should not get into this function if key already exists */ ucs_assert_always(khret != UCS_KH_PUT_KEY_PRESENT); - kh_value(&worker->rkey_config_hash, khiter) = rkey_cfg_index; ++worker->rkey_config_count; @@ -1866,7 +2009,12 @@ ucp_worker_add_rkey_config(ucp_worker_h worker, const ucp_rkey_config_key_t *key static UCS_F_ALWAYS_INLINE void ucp_worker_keepalive_reset(ucp_worker_h worker) { - worker->keepalive.iter = &worker->all_eps; + worker->keepalive.cb_id = UCS_CALLBACKQ_ID_NULL; + worker->keepalive.last_round = 0; + worker->keepalive.lane_map = 0; + worker->keepalive.ep_count = 0; + worker->keepalive.iter_count = 0; + worker->keepalive.iter = &worker->all_eps; } static void ucp_worker_destroy_configs(ucp_worker_h worker) @@ -1884,11 +2032,76 @@ static void ucp_worker_destroy_configs(ucp_worker_h worker) worker->rkey_config_count = 0; } +static void ucp_worker_vfs_show_address_name(void *obj, + ucs_string_buffer_t *strb, + void *arg_ptr, uint64_t arg_u64) +{ + ucp_worker_h worker = obj; + + UCS_ASYNC_BLOCK(&worker->async); + ucs_string_buffer_appendf(strb, "%s\n", + ucp_worker_get_address_name(worker)); + UCS_ASYNC_UNBLOCK(&worker->async); +} + +static void ucp_worker_vfs_show_num_all_eps(void *obj, + ucs_string_buffer_t *strb, + void *arg_ptr, uint64_t arg_u64) +{ + ucp_worker_h worker = obj; + + UCS_ASYNC_BLOCK(&worker->async); + ucs_string_buffer_appendf(strb, "%u\n", worker->num_all_eps); + UCS_ASYNC_UNBLOCK(&worker->async); +} + +static void +ucp_worker_vfs_show_keepalive_ep_count(void *obj, ucs_string_buffer_t *strb, + void *arg_ptr, uint64_t arg_u64) +{ + ucp_worker_h worker = obj; + + UCS_ASYNC_BLOCK(&worker->async); + ucs_string_buffer_appendf(strb, "%u\n", worker->keepalive.ep_count); + UCS_ASYNC_UNBLOCK(&worker->async); +} + +ucs_thread_mode_t ucp_worker_get_thread_mode(uint64_t worker_flags) +{ + if (worker_flags & UCP_WORKER_FLAG_THREAD_MULTI) { + return UCS_THREAD_MODE_MULTI; + } else if (worker_flags & UCP_WORKER_FLAG_THREAD_SERIALIZED) { + return UCS_THREAD_MODE_SERIALIZED; + } + return UCS_THREAD_MODE_SINGLE; +} + +void ucp_worker_create_vfs(ucp_context_h context, ucp_worker_h worker) +{ + ucs_thread_mode_t thread_mode; + + ucs_vfs_obj_add_dir(context, worker, "worker/%s", worker->name); + ucs_vfs_obj_add_ro_file(worker, ucs_vfs_show_memory_address, NULL, 0, + "memory_address"); + ucs_vfs_obj_add_ro_file(worker, ucp_worker_vfs_show_address_name, NULL, 0, + "address_name"); + + thread_mode = ucp_worker_get_thread_mode(worker->flags); + ucs_vfs_obj_add_ro_file(worker, ucs_vfs_show_primitive, + (void*)ucs_thread_mode_names[thread_mode], + UCS_VFS_TYPE_STRING, "thread_mode"); + + ucs_vfs_obj_add_ro_file(worker, ucp_worker_vfs_show_num_all_eps, NULL, 0, + "num_all_eps"); + ucs_vfs_obj_add_ro_file(worker, ucp_worker_vfs_show_keepalive_ep_count, + NULL, 0, "keepalive/ep_count"); +} + ucs_status_t ucp_worker_create(ucp_context_h context, const 
ucp_worker_params_t *params, ucp_worker_h *worker_p) { - ucs_thread_mode_t uct_thread_mode; + ucs_thread_mode_t thread_mode, uct_thread_mode; unsigned name_length; ucp_worker_h worker; ucs_status_t status; @@ -1898,26 +2111,6 @@ ucs_status_t ucp_worker_create(ucp_context_h context, return UCS_ERR_NO_MEMORY; } - uct_thread_mode = UCS_THREAD_MODE_SINGLE; - worker->flags = 0; - - if (params->field_mask & UCP_WORKER_PARAM_FIELD_THREAD_MODE) { -#if ENABLE_MT - if (params->thread_mode != UCS_THREAD_MODE_SINGLE) { - /* UCT is serialized by UCP lock or by UCP user */ - uct_thread_mode = UCS_THREAD_MODE_SERIALIZED; - } - - if (params->thread_mode == UCS_THREAD_MODE_MULTI) { - worker->flags |= UCP_WORKER_FLAG_MT; - } -#else - if (params->thread_mode != UCS_THREAD_MODE_SINGLE) { - ucs_debug("forced single thread mode on worker create"); - } -#endif - } - worker->context = context; worker->uuid = ucs_generate_uuid((uintptr_t)worker); worker->flush_ops_count = 0; @@ -1928,18 +2121,50 @@ ucs_status_t ucp_worker_create(ucp_context_h context, worker->num_ifaces = 0; worker->am_message_id = ucs_generate_uuid(0); worker->rkey_ptr_cb_id = UCS_CALLBACKQ_ID_NULL; - worker->keepalive.cb_id = UCS_CALLBACKQ_ID_NULL; - worker->keepalive.last_round = 0; - worker->keepalive.lane_map = 0; - worker->keepalive.ep_count = 0; + worker->num_all_eps = 0; ucp_worker_keepalive_reset(worker); ucs_queue_head_init(&worker->rkey_ptr_reqs); ucs_list_head_init(&worker->arm_ifaces); ucs_list_head_init(&worker->stream_ready_eps); ucs_list_head_init(&worker->all_eps); + ucs_list_head_init(&worker->internal_eps); kh_init_inplace(ucp_worker_rkey_config, &worker->rkey_config_hash); kh_init_inplace(ucp_worker_discard_uct_ep_hash, &worker->discard_uct_ep_hash); + /* Copy user flags, and mask-out unsupported flags for compatibility */ + worker->flags = UCP_PARAM_VALUE(WORKER, params, flags, FLAGS, 0) & + UCS_MASK(UCP_WORKER_INTERNAL_FLAGS_SHIFT); + UCS_STATIC_ASSERT(UCP_WORKER_FLAG_IGNORE_REQUEST_LEAK < + UCS_BIT(UCP_WORKER_INTERNAL_FLAGS_SHIFT)); + + /* Set multi-thread support mode */ + thread_mode = UCP_PARAM_VALUE(WORKER, params, thread_mode, THREAD_MODE, + UCS_THREAD_MODE_SINGLE); + switch (thread_mode) { + case UCS_THREAD_MODE_SINGLE: + /* UCT is serialized by UCP lock or by UCP user */ + uct_thread_mode = UCS_THREAD_MODE_SINGLE; + break; + case UCS_THREAD_MODE_SERIALIZED: + uct_thread_mode = UCS_THREAD_MODE_SERIALIZED; + worker->flags |= UCP_WORKER_FLAG_THREAD_SERIALIZED; + break; + case UCS_THREAD_MODE_MULTI: + uct_thread_mode = UCS_THREAD_MODE_SERIALIZED; +#if ENABLE_MT + worker->flags |= UCP_WORKER_FLAG_THREAD_MULTI; +#else + ucs_diag("multi-threaded worker is requested, but library is built " + "without multi-thread support"); +#endif + break; + default: + ucs_error("invalid thread mode %d", thread_mode); + status = UCS_ERR_INVALID_PARAM; + goto err_free; + } + + /* Initialize endpoint allocator */ UCS_STATIC_ASSERT(sizeof(ucp_ep_ext_gen_t) <= sizeof(ucp_ep_t)); if (context->config.features & (UCP_FEATURE_STREAM | UCP_FEATURE_AM)) { UCS_STATIC_ASSERT(sizeof(ucp_ep_ext_proto_t) <= sizeof(ucp_ep_t)); @@ -1948,16 +2173,21 @@ ucs_status_t ucp_worker_create(ucp_context_h context, ucs_strided_alloc_init(&worker->ep_alloc, sizeof(ucp_ep_t), 2); } - if (params->field_mask & UCP_WORKER_PARAM_FIELD_USER_DATA) { - worker->user_data = params->user_data; + worker->user_data = UCP_PARAM_VALUE(WORKER, params, user_data, USER_DATA, + NULL); + + if ((params->field_mask & UCP_WORKER_PARAM_FIELD_NAME) && + (params->name != NULL)) { + 
ucs_snprintf_zero(worker->name, UCP_ENTITY_NAME_MAX, "%s", + params->name); } else { - worker->user_data = NULL; + ucs_snprintf_zero(worker->name, UCP_ENTITY_NAME_MAX, "%p", worker); } - name_length = ucs_min(UCP_WORKER_NAME_MAX, - context->config.ext.max_worker_name + 1); - ucs_snprintf_zero(worker->name, name_length, "%s:%d", ucs_get_host_name(), - getpid()); + name_length = ucs_min(UCP_WORKER_ADDRESS_NAME_MAX, + context->config.ext.max_worker_address_name + 1); + ucs_snprintf_zero(worker->address_name, name_length, "%s:%d", + ucs_get_host_name(), getpid()); status = ucs_ptr_map_init(&worker->ptr_map); if (status != UCS_OK) { @@ -2021,7 +2251,7 @@ ucs_status_t ucp_worker_create(ucp_context_h context, } /* Create loopback endpoints to copy across memory types */ - status = ucp_worker_create_mem_type_endpoints(worker); + status = ucp_worker_mem_type_eps_create(worker); if (status != UCS_OK) { goto err_close_cms; } @@ -2048,9 +2278,11 @@ ucs_status_t ucp_worker_create(ucp_context_h context, ucp_worker_init_atomic_tls(worker); /* At this point all UCT memory domains and interfaces are already created - * so warn about unused environment variables. + * so print used environment variables and warn about unused ones. */ - ucs_config_parser_warn_unused_env_vars_once(context->config.env_prefix); + ucs_config_parser_print_env_vars_once(context->config.env_prefix); + + ucp_worker_create_vfs(context, worker); *worker_p = worker; return UCS_OK; @@ -2060,7 +2292,7 @@ ucs_status_t ucp_worker_create(ucp_context_h context, err_destroy_mpools: ucp_worker_destroy_mpools(worker); err_destroy_memtype_eps: - ucp_worker_destroy_mem_type_endpoints(worker); + ucp_worker_mem_type_eps_destroy(worker); err_close_cms: ucp_worker_close_cms(worker); err_close_ifaces: @@ -2088,26 +2320,39 @@ ucs_status_t ucp_worker_create(ucp_context_h context, return status; } +static void ucp_worker_discard_uct_ep_complete(ucp_request_t *req) +{ + ucp_ep_h ucp_ep = req->send.ep; + + UCP_EP_ASSERT_COUNTER_DEC(&ucp_ep->discard_refcount); + ucp_worker_flush_ops_count_dec(ucp_ep->worker); + ucp_request_complete(req, send.cb, UCS_OK, req->user_data); + ucp_ep_remove_ref(ucp_ep); +} + static unsigned ucp_worker_discard_uct_ep_destroy_progress(void *arg) { ucp_request_t *req = (ucp_request_t*)arg; uct_ep_h uct_ep = req->send.discard_uct_ep.uct_ep; - ucp_worker_h worker = req->send.discard_uct_ep.ucp_worker; + ucp_ep_h ucp_ep = req->send.ep; + ucp_worker_h worker = ucp_ep->worker; khiter_t iter; ucp_trace_req(req, "destroy uct_ep=%p", uct_ep); - ucp_request_put(req); + + req->send.discard_uct_ep.cb_id = UCS_CALLBACKQ_ID_NULL; UCS_ASYNC_BLOCK(&worker->async); - ucp_worker_flush_ops_count_dec(worker); - iter = kh_get(ucp_worker_discard_uct_ep_hash, - &worker->discard_uct_ep_hash, uct_ep); + ucp_worker_discard_uct_ep_complete(req); + iter = kh_get(ucp_worker_discard_uct_ep_hash, &worker->discard_uct_ep_hash, + uct_ep); if (iter == kh_end(&worker->discard_uct_ep_hash)) { ucs_fatal("no %p UCT EP in the %p worker hash of discarded UCT EPs", uct_ep, worker); } - kh_del(ucp_worker_discard_uct_ep_hash, - &worker->discard_uct_ep_hash, iter); + + ucs_assert(kh_value(&worker->discard_uct_ep_hash, iter) == req); + kh_del(ucp_worker_discard_uct_ep_hash, &worker->discard_uct_ep_hash, iter); UCS_ASYNC_UNBLOCK(&worker->async); uct_ep_destroy(uct_ep); @@ -2115,40 +2360,52 @@ static unsigned ucp_worker_discard_uct_ep_destroy_progress(void *arg) return 1; } -static void -ucp_worker_discard_uct_ep_flush_comp(uct_completion_t *self) +static void
ucp_worker_discard_uct_ep_progress_register(ucp_request_t *req, + ucs_callback_t func) +{ + ucp_worker_h worker = req->send.ep->worker; + + ucs_assert_always(req->send.discard_uct_ep.cb_id == UCS_CALLBACKQ_ID_NULL); + uct_worker_progress_register_safe(worker->uct, func, req, + UCS_CALLBACKQ_FLAG_ONESHOT, + &req->send.discard_uct_ep.cb_id); +} + +static void ucp_worker_discard_uct_ep_flush_comp(uct_completion_t *self) { - uct_worker_cb_id_t cb_id = UCS_CALLBACKQ_ID_NULL; - ucp_request_t *req = ucs_container_of(self, ucp_request_t, - send.state.uct_comp); - ucp_worker_h worker = req->send.discard_uct_ep.ucp_worker; + ucp_request_t *req = ucs_container_of(self, ucp_request_t, + send.state.uct_comp); ucp_trace_req(req, "discard_uct_ep flush completion status %s", ucs_status_string(self->status)); /* don't destroy UCT EP from the flush completion callback, schedule * a progress callback on the main thread to destroy UCT EP */ - uct_worker_progress_register_safe(worker->uct, - ucp_worker_discard_uct_ep_destroy_progress, - req, UCS_CALLBACKQ_FLAG_ONESHOT, &cb_id); + ucp_worker_discard_uct_ep_progress_register( + req, ucp_worker_discard_uct_ep_destroy_progress); } static ucs_status_t ucp_worker_discard_uct_ep_pending_cb(uct_pending_req_t *self) { - ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); - uct_ep_h uct_ep = req->send.discard_uct_ep.uct_ep; + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + uct_ep_h uct_ep = req->send.discard_uct_ep.uct_ep; ucs_status_t status; status = uct_ep_flush(uct_ep, req->send.discard_uct_ep.ep_flush_flags, &req->send.state.uct_comp); - if (status == UCS_INPROGRESS) { + if (status == UCS_OK) { + /* don't destroy UCT EP from the pending callback, schedule a progress + * callback on the main thread to destroy UCT EP */ + ucp_worker_discard_uct_ep_progress_register( + req, ucp_worker_discard_uct_ep_destroy_progress); + return UCS_OK; + } else if (status == UCS_INPROGRESS) { return UCS_OK; } else if (status == UCS_ERR_NO_RESOURCE) { return UCS_ERR_NO_RESOURCE; } - /* UCS_OK is handled here as well */ uct_completion_update_status(&req->send.state.uct_comp, status); ucp_worker_discard_uct_ep_flush_comp(&req->send.state.uct_comp); return UCS_OK; @@ -2156,12 +2413,12 @@ ucp_worker_discard_uct_ep_pending_cb(uct_pending_req_t *self) static unsigned ucp_worker_discard_uct_ep_progress(void *arg) { - uct_worker_cb_id_t cb_id = UCS_CALLBACKQ_ID_NULL; - ucp_request_t *req = (ucp_request_t*)arg; - uct_ep_h uct_ep = req->send.discard_uct_ep.uct_ep; - ucp_worker_h worker = req->send.discard_uct_ep.ucp_worker; + ucp_request_t *req = (ucp_request_t*)arg; + uct_ep_h uct_ep = req->send.discard_uct_ep.uct_ep; ucs_status_t status; + req->send.discard_uct_ep.cb_id = UCS_CALLBACKQ_ID_NULL; + status = ucp_worker_discard_uct_ep_pending_cb(&req->send.uct); if (status == UCS_ERR_NO_RESOURCE) { status = uct_ep_pending_add(uct_ep, &req->send.uct, 0); @@ -2169,10 +2426,8 @@ static unsigned ucp_worker_discard_uct_ep_progress(void *arg) if (status == UCS_ERR_BUSY) { /* adding to the pending queue failed, schedule the UCT EP discard * operation on UCT worker progress again */ - uct_worker_progress_register_safe(worker->uct, - ucp_worker_discard_uct_ep_progress, - req, UCS_CALLBACKQ_FLAG_ONESHOT, - &cb_id); + ucp_worker_discard_uct_ep_progress_register( + req, ucp_worker_discard_uct_ep_progress); } return 0; @@ -2181,58 +2436,101 @@ static unsigned ucp_worker_discard_uct_ep_progress(void *arg) return 1; } -static int ucp_worker_discard_remove_filter(const 
ucs_callbackq_elem_t *elem, - void *arg) +static int +ucp_worker_discard_remove_filter(const ucs_callbackq_elem_t *elem, void *arg) { - return (elem->cb == ucp_worker_discard_uct_ep_progress) || - (elem->cb == ucp_worker_discard_uct_ep_destroy_progress); + if ((elem->arg == arg) && + ((elem->cb == ucp_worker_discard_uct_ep_destroy_progress) || + (elem->cb == ucp_worker_discard_uct_ep_progress))) { + ucp_worker_discard_uct_ep_complete((ucp_request_t*)elem->arg); + return 1; + } + + return 0; } -static void ucp_worker_discarded_uct_eps_cleanup(ucp_worker_h worker) +/* Fast-forward UCT EP discarding operation */ +static void +ucp_worker_discard_uct_ep_purge(uct_pending_req_t *self, void *arg) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + + /* If there is a pending request during UCT EP discarding, it means + * UCS_ERR_NO_RESOURCE was returned from the flush operation, the operation + * was added to a pending queue, complete the discarding operation */ + ucs_assert_always(req->send.discard_uct_ep.cb_id == UCS_CALLBACKQ_ID_NULL); + ucp_worker_discard_uct_ep_complete(req); +} + +static void ucp_worker_discard_uct_ep_cleanup(ucp_worker_h worker) { - uct_ep_h uct_ep; ucp_request_t *req; + uct_ep_h uct_ep; + + /* Destroying UCP EP in ucp_ep_disconnected() could start UCT EP discarding + * operations. Do cleanup of discarding functionality after trying to + * destroy UCP EPs in order to destroy all remaining UCP EPs here (they are + * destroyed during discarding operation completion for all UCT EPs) */ kh_foreach(&worker->discard_uct_ep_hash, uct_ep, req, { - ucp_worker_flush_ops_count_dec(worker); - ucp_request_put(req); + ucs_assert(uct_ep == req->send.discard_uct_ep.uct_ep); + uct_ep_pending_purge(uct_ep, ucp_worker_discard_uct_ep_purge, NULL); uct_ep_destroy(uct_ep); + + /* Either flush is in-progress (will be completed in UCT EP destroy) + * or some progress callback is scheduled (will be removed after UCT + * EP destroy) */ + ucs_callbackq_remove_if(&worker->uct->progress_q, + ucp_worker_discard_remove_filter, req); }) } -static void ucp_worker_destroy_eps(ucp_worker_h worker) +static void ucp_worker_destroy_eps(ucp_worker_h worker, + ucs_list_link_t *ep_list, + const char *ep_type_name) { ucp_ep_ext_gen_t *ep_ext, *tmp; + ucp_ep_h ep; - ucs_debug("worker %p: destroy all endpoints", worker); - ucs_list_for_each_safe(ep_ext, tmp, &worker->all_eps, ep_list) { - ucp_ep_disconnected(ucp_ep_from_ext_gen(ep_ext), 1); + ucs_debug("worker %p: destroy %s endpoints", worker, ep_type_name); + ucs_list_for_each_safe(ep_ext, tmp, ep_list, ep_list) { + ep = ucp_ep_from_ext_gen(ep_ext); + /* Cleanup pending operations on the UCP EP before destroying it, since + * ucp_ep_destroy_internal() expects the pending queues of the UCT EPs + * will be empty before they are destroyed */ + ucp_ep_purge_lanes(ep, ucp_ep_err_pending_purge, + UCS_STATUS_PTR(UCS_ERR_CANCELED)); + ucp_ep_disconnected(ep, 1); } } void ucp_worker_destroy(ucp_worker_h worker) { - ucs_trace_func("worker=%p", worker); + ucs_debug("destroy worker %p", worker); UCS_ASYNC_BLOCK(&worker->async); uct_worker_progress_unregister_safe(worker->uct, &worker->keepalive.cb_id); - ucs_callbackq_remove_if(&worker->uct->progress_q, - ucp_worker_discard_remove_filter, NULL); - ucp_worker_destroy_eps(worker); + ucp_worker_destroy_eps(worker, &worker->all_eps, "all"); + ucp_worker_destroy_eps(worker, &worker->internal_eps, "internal"); ucp_worker_remove_am_handlers(worker); ucp_am_cleanup(worker); - 
ucp_worker_discarded_uct_eps_cleanup(worker); + ucp_worker_discard_uct_ep_cleanup(worker); if (worker->flush_ops_count != 0) { - ucs_warn("not all pending operations (%u) were flushed on worker %p " - "that is being destroyed", - worker->flush_ops_count, worker); + ucs_warn("worker %p: %u pending operations were not flushed", worker, + worker->flush_ops_count); + } + + if (worker->num_all_eps != 0) { + ucs_warn("worker %p: %u endpoints were not destroyed", worker, + worker->num_all_eps); } + UCS_ASYNC_UNBLOCK(&worker->async); + ucs_vfs_obj_remove(worker); ucp_tag_match_cleanup(&worker->tm); ucp_worker_destroy_mpools(worker); - ucp_worker_destroy_mem_type_endpoints(worker); ucp_worker_close_cms(worker); ucp_worker_close_ifaces(worker); ucs_conn_match_cleanup(&worker->conn_match_ctx); @@ -2255,35 +2553,31 @@ ucs_status_t ucp_worker_query(ucp_worker_h worker, { ucp_context_h context = worker->context; ucs_status_t status = UCS_OK; - uint64_t tl_bitmap; + ucp_tl_bitmap_t tl_bitmap; ucp_rsc_index_t tl_id; if (attr->field_mask & UCP_WORKER_ATTR_FIELD_THREAD_MODE) { - if (worker->flags & UCP_WORKER_FLAG_MT) { - attr->thread_mode = UCS_THREAD_MODE_MULTI; - } else { - attr->thread_mode = UCS_THREAD_MODE_SINGLE; - } + attr->thread_mode = ucp_worker_get_thread_mode(worker->flags); } if (attr->field_mask & UCP_WORKER_ATTR_FIELD_ADDRESS) { /* If UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS is not set, - * pack all tl adresses */ - tl_bitmap = UINT64_MAX; + * pack all tl addresses */ + UCS_BITMAP_SET_ALL(tl_bitmap); if (attr->field_mask & UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS) { if (attr->address_flags & UCP_WORKER_ADDRESS_FLAG_NET_ONLY) { - tl_bitmap = 0; - ucs_for_each_bit(tl_id, context->tl_bitmap) { + UCS_BITMAP_CLEAR(&tl_bitmap); + UCS_BITMAP_FOR_EACH_BIT(context->tl_bitmap, tl_id) { if (context->tl_rscs[tl_id].tl_rsc.dev_type == UCT_DEVICE_TYPE_NET) { - tl_bitmap |= UCS_BIT(tl_id); + UCS_BITMAP_SET(tl_bitmap, tl_id); } } } } - status = ucp_address_pack(worker, NULL, tl_bitmap, - UCP_ADDRESS_PACK_FLAGS_WORKER_DEFAULT, + status = ucp_address_pack(worker, NULL, &tl_bitmap, + ucp_worker_default_address_pack_flags(worker), NULL, &attr->address_length, (void**)&attr->address); } @@ -2292,6 +2586,10 @@ ucs_status_t ucp_worker_query(ucp_worker_h worker, attr->max_am_header = ucp_am_max_header_size(worker); } + if (attr->field_mask & UCP_WORKER_ATTR_FIELD_NAME) { + ucs_strncpy_safe(attr->name, worker->name, UCP_ENTITY_NAME_MAX); + } + return status; } @@ -2379,6 +2677,7 @@ ucs_status_t ucp_worker_arm(ucp_worker_h worker) do { ret = read(worker->eventfd, &dummy, sizeof(dummy)); if (ret == sizeof(dummy)) { + ucs_trace("worker %p: extracted queued event", worker); status = UCS_ERR_BUSY; goto out; } else if (ret == -1) { @@ -2507,9 +2806,9 @@ ucs_status_t ucp_worker_get_address(ucp_worker_h worker, ucp_address_t **address UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); - status = ucp_address_pack(worker, NULL, UINT64_MAX, - UCP_ADDRESS_PACK_FLAGS_WORKER_DEFAULT, NULL, - address_length_p, (void**)address_p); + status = ucp_address_pack(worker, NULL, &ucp_tl_bitmap_max, + ucp_worker_default_address_pack_flags(worker), + NULL, address_length_p, (void**)address_p); UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); @@ -2524,16 +2823,18 @@ void ucp_worker_release_address(ucp_worker_h worker, ucp_address_t *address) void ucp_worker_print_info(ucp_worker_h worker, FILE *stream) { ucp_context_h context = worker->context; + ucp_worker_cfg_index_t rkey_cfg_index; + ucp_rsc_index_t rsc_index; + ucs_string_buffer_t strb; ucp_address_t 
*address; size_t address_length; ucs_status_t status; - ucp_rsc_index_t rsc_index; int first; UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); fprintf(stream, "#\n"); - fprintf(stream, "# UCP worker '%s'\n", ucp_worker_get_name(worker)); + fprintf(stream, "# UCP worker '%s'\n", ucp_worker_get_address_name(worker)); fprintf(stream, "#\n"); status = ucp_worker_get_address(worker, &address, &address_length); @@ -2548,7 +2849,7 @@ void ucp_worker_print_info(ucp_worker_h worker, FILE *stream) fprintf(stream, "# atomics: "); first = 1; for (rsc_index = 0; rsc_index < worker->context->num_tls; ++rsc_index) { - if (worker->atomic_tls & UCS_BIT(rsc_index)) { + if (UCS_BITMAP_GET(worker->atomic_tls, rsc_index)) { if (!first) { fprintf(stream, ", "); } @@ -2562,20 +2863,30 @@ void ucp_worker_print_info(ucp_worker_h worker, FILE *stream) fprintf(stream, "#\n"); - UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); -} + if (context->config.ext.proto_enable) { + ucs_string_buffer_init(&strb); + for (rkey_cfg_index = 0; rkey_cfg_index < worker->rkey_config_count; + ++rkey_cfg_index) { + ucp_rkey_proto_select_dump(worker, rkey_cfg_index, &strb); + ucs_string_buffer_appendf(&strb, "\n"); + } + ucs_string_buffer_dump(&strb, "# ", stream); + ucs_string_buffer_cleanup(&strb); + } -static int ucp_worker_keepalive_is_enabled(ucp_worker_h worker) -{ - return (worker->context->config.ext.keepalive_timeout != 0) && - (worker->context->config.ext.keepalive_num_eps != 0); + ucp_worker_mem_type_eps_print_info(worker, stream); + + UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); } static UCS_F_ALWAYS_INLINE ucp_ep_h ucp_worker_keepalive_current_ep(ucp_worker_h worker) { - ucp_ep_ext_gen_t *ep_ext = ucs_container_of(worker->keepalive.iter, - ucp_ep_ext_gen_t, ep_list); + ucp_ep_ext_gen_t *ep_ext; + + ucs_assert(worker->keepalive.iter != &worker->all_eps); + ep_ext = ucs_container_of(worker->keepalive.iter, ucp_ep_ext_gen_t, + ep_list); return ucp_ep_from_ext_gen(ep_ext); } @@ -2597,70 +2908,101 @@ ucp_worker_keepalive_next_ep(ucp_worker_h worker) ucp_ep_config(ep)->key.ep_check_map : 0; } -static unsigned ucp_worker_keepalive_progress(void *arg) +static UCS_F_NOINLINE unsigned +ucp_worker_do_keepalive_progress(ucp_worker_h worker) { - ucp_worker_h worker = (ucp_worker_h)arg; - ucs_time_t now = ucs_get_time(); - ucs_list_link_t *iter_begin; + unsigned ka_ep_count = 0; + unsigned max_ka_ep_count; + ucs_time_t now; ucp_ep_h ep; + ucs_assert(worker->context->config.ext.keepalive_num_eps != 0); + + now = ucs_get_time(); if (ucs_likely((now - worker->keepalive.last_round) < - worker->context->config.ext.keepalive_timeout)) { - return 0; + worker->context->config.keepalive_interval)) { + goto out; } + ucs_trace_func("worker %p: keepalive round", worker); + if (ucs_unlikely(ucs_list_is_empty(&worker->all_eps))) { ucs_assert(worker->keepalive.iter == &worker->all_eps); + ucs_trace("worker %p: keepalive ep list is empty - disabling", + worker); uct_worker_progress_unregister_safe(worker->uct, &worker->keepalive.cb_id); - return 0; + goto out; } if (ucs_unlikely(worker->keepalive.iter == &worker->all_eps)) { ucp_worker_keepalive_next_ep(worker); } - iter_begin = worker->keepalive.iter; - /* use own loop for elements because standard for_each skips + max_ka_ep_count = ucs_min(worker->context->config.ext.keepalive_num_eps, + worker->num_all_eps) - worker->keepalive.ep_count; + + /* Use own loop for elements because standard for_each skips * head element */ /* TODO: use more optimal algo to enumerate EPs to keepalive * (linked list) */ do { 
ep = ucp_worker_keepalive_current_ep(worker); + ucs_trace_func("worker %p: do keepalive on ep %p lane_map 0x%x", worker, + ep, worker->keepalive.lane_map); ucp_ep_do_keepalive(ep, &worker->keepalive.lane_map); if (worker->keepalive.lane_map != 0) { - /* in case if EP has no resources to send keepalive message + /* In case if EP has no resources to send keepalive message * then just return without update of last_round timestamp, * on next progress iteration we will continue from this point */ - goto out_no_resources; + worker->keepalive.ep_count += ka_ep_count; + goto out; } - worker->keepalive.ep_count++; + ka_ep_count++; ucp_worker_keepalive_next_ep(worker); - } while ((iter_begin != worker->keepalive.iter) && - (worker->keepalive.ep_count < worker->context->config.ext.keepalive_num_eps)); + } while (ka_ep_count < max_ka_ep_count); + ucs_trace("worker %p: sent keepalive on %u endpoints", worker, ka_ep_count); worker->keepalive.last_round = now; worker->keepalive.ep_count = 0; -out_no_resources: - return worker->keepalive.ep_count; +out: + return ka_ep_count; +} + +static unsigned ucp_worker_keepalive_progress(void *arg) +{ + ucp_worker_h worker = (ucp_worker_h)arg; + + if ((worker->keepalive.iter_count++ % UCP_WORKER_KEEPALIVE_ITER_SKIP) != 0) { + return 0; + } + + return ucp_worker_do_keepalive_progress(worker); } void ucp_worker_keepalive_add_ep(ucp_ep_h ep) { ucp_worker_h worker = ep->worker; - if (ucp_ep_config(ep)->key.err_mode == UCP_ERR_HANDLING_MODE_NONE) { - return; - } + ucs_assert(ep->cfg_index != UCP_WORKER_CFG_INDEX_NULL); - if (!ucp_worker_keepalive_is_enabled(worker)) { + if ((ep->flags & UCP_EP_FLAG_INTERNAL) || + (ucp_ep_config(ep)->key.ep_check_map == 0) || + !ucp_worker_keepalive_is_enabled(worker)) { + ucs_trace("ep %p flags 0x%x cfg_index %d: not using keepalive, " + "err_mode %d ep_check_map 0x%x", + ep, ep->flags, ep->cfg_index, ucp_ep_config(ep)->key.err_mode, + ucp_ep_config(ep)->key.ep_check_map); return; } + ucs_trace("ep %p flags 0x%x: adding to keepalive lane_map 0x%x", ep, + ep->flags, ucp_ep_config(ep)->key.ep_check_map); uct_worker_progress_register_safe(worker->uct, - ucp_worker_keepalive_progress, worker, 0, + ucp_worker_keepalive_progress, worker, + UCS_CALLBACKQ_FLAG_FAST, &worker->keepalive.cb_id); } @@ -2669,6 +3011,8 @@ void ucp_worker_keepalive_remove_ep(ucp_ep_h ep) { ucp_worker_h worker = ep->worker; + ucs_assert(!(ep->flags & UCP_EP_FLAG_INTERNAL)); + if (!ucp_worker_keepalive_is_enabled(worker)) { ucs_assert(worker->keepalive.iter == &worker->all_eps); return; @@ -2686,15 +3030,22 @@ void ucp_worker_keepalive_remove_ep(ucp_ep_h ep) } } -static void -ucp_worker_discard_tl_uct_ep(ucp_worker_h worker, uct_ep_h uct_ep, - unsigned ep_flush_flags) +static void ucp_worker_discard_tl_uct_ep(ucp_ep_h ucp_ep, uct_ep_h uct_ep, + unsigned ep_flush_flags, + ucp_send_nbx_callback_t discarded_cb, + void *discarded_cb_arg) { - uct_worker_cb_id_t cb_id = UCS_CALLBACKQ_ID_NULL; + ucp_worker_h worker = ucp_ep->worker; ucp_request_t *req; int ret; khiter_t iter; + if (ucp_is_uct_ep_failed(uct_ep)) { + /* No need to discard failed TL EP, because it may lead to adding the + * same UCT EP to the hash of discarded UCT EPs */ + return; + } + req = ucp_request_get(worker); if (ucs_unlikely(req == NULL)) { ucs_error("unable to allocate request for discarding UCT EP %p " @@ -2702,6 +3053,8 @@ ucp_worker_discard_tl_uct_ep(ucp_worker_h worker, uct_ep_h uct_ep, return; } + ucp_ep_add_ref(ucp_ep); + UCP_EP_ASSERT_COUNTER_INC(&ucp_ep->discard_refcount); 
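
The keepalive changes above amortize the cost of the progress hook: ucp_worker_keepalive_progress only forwards to the NOINLINE ucp_worker_do_keepalive_progress (which reads ucs_get_time) on every UCP_WORKER_KEEPALIVE_ITER_SKIP-th invocation. A standalone sketch of that skip pattern, with a hypothetical skip factor:

#include <stdio.h>

#define KEEPALIVE_ITER_SKIP 32 /* hypothetical value, for illustration only */

static unsigned iter_count;

static int keepalive_round_due(void)
{
    /* only every KEEPALIVE_ITER_SKIP-th progress call pays for a clock read */
    return (iter_count++ % KEEPALIVE_ITER_SKIP) == 0;
}

int main(void)
{
    unsigned i, checks = 0;

    for (i = 0; i < 100; ++i) {
        checks += keepalive_round_due();
    }
    printf("%u timestamp reads for 100 progress calls\n", checks);
    return 0;
}
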
ucp_worker_flush_ops_count_inc(worker); iter = kh_put(ucp_worker_discard_uct_ep_hash, &worker->discard_uct_ep_hash, uct_ep, &ret); @@ -2715,55 +3068,31 @@ ucp_worker_discard_tl_uct_ep(ucp_worker_h worker, uct_ep_h uct_ep, kh_value(&worker->discard_uct_ep_hash, iter) = req; ucs_assert(!ucp_wireup_ep_test(uct_ep)); + req->flags = UCP_REQUEST_FLAG_RELEASED; + req->send.ep = ucp_ep; req->send.uct.func = ucp_worker_discard_uct_ep_pending_cb; req->send.state.uct_comp.func = ucp_worker_discard_uct_ep_flush_comp; req->send.state.uct_comp.count = 1; req->send.state.uct_comp.status = UCS_OK; - req->send.discard_uct_ep.ucp_worker = worker; req->send.discard_uct_ep.uct_ep = uct_ep; req->send.discard_uct_ep.ep_flush_flags = ep_flush_flags; - uct_worker_progress_register_safe(worker->uct, - ucp_worker_discard_uct_ep_progress, - req, UCS_CALLBACKQ_FLAG_ONESHOT, - &cb_id); -} - -static void -ucp_worker_discard_wireup_uct_ep(ucp_worker_h worker, - ucp_wireup_ep_t *wireup_ep, - unsigned ep_flush_flags, - uct_ep_h uct_ep) -{ - if (uct_ep == NULL) { - return; - } + req->send.discard_uct_ep.cb_id = UCS_CALLBACKQ_ID_NULL; + ucp_request_set_callback(req, send.cb, discarded_cb, discarded_cb_arg); - ucp_wireup_ep_disown(&wireup_ep->super.super, uct_ep); - /* discard the WIREUP EP's UCT EP */ - ucp_worker_discard_uct_ep(worker, uct_ep, ep_flush_flags, - /* make sure that there are no WIREUP MSGs - * anymore that are scheduled on the UCT EP, i.e. - * the purge callback hasn't be invoked here */ - (uct_pending_purge_callback_t) - ucs_empty_function_do_assert, NULL); + ucp_worker_discard_uct_ep_progress(req); } -static uct_ep_h -ucp_worker_discard_wireup_ep(ucp_worker_h worker, - ucp_wireup_ep_t *wireup_ep, - unsigned ep_flush_flags, - uct_pending_purge_callback_t purge_cb, - void *purge_arg) +static uct_ep_h ucp_worker_discard_wireup_ep( + ucp_ep_h ucp_ep, ucp_wireup_ep_t *wireup_ep, unsigned ep_flush_flags, + uct_pending_purge_callback_t purge_cb, void *purge_arg, + ucp_send_nbx_callback_t discarded_cb, void *discarded_cb_arg) { uct_ep_h uct_ep; int is_owner; ucs_assert(wireup_ep != NULL); - - ucp_worker_discard_wireup_uct_ep(worker, wireup_ep, ep_flush_flags, - wireup_ep->aux_ep); - ucp_worker_discard_wireup_uct_ep(worker, wireup_ep, ep_flush_flags, - wireup_ep->sockaddr_ep); + ucp_wireup_ep_discard_aux_ep(wireup_ep, ep_flush_flags, + ucp_destroyed_ep_pending_purge, NULL); is_owner = wireup_ep->super.is_owner; uct_ep = ucp_wireup_ep_extract_next_ep(&wireup_ep->super.super); @@ -2777,33 +3106,49 @@ ucp_worker_discard_wireup_ep(ucp_worker_h worker, return is_owner ? 
uct_ep : NULL; } -/* must be called with async lock held */ int ucp_worker_is_uct_ep_discarding(ucp_worker_h worker, uct_ep_h uct_ep) { + UCP_WORKER_THREAD_CS_CHECK_IS_BLOCKED(worker); return kh_get(ucp_worker_discard_uct_ep_hash, &worker->discard_uct_ep_hash, uct_ep) != kh_end(&worker->discard_uct_ep_hash); } -/* must be called with async lock held */ -void ucp_worker_discard_uct_ep(ucp_worker_h worker, uct_ep_h uct_ep, +void ucp_worker_discard_uct_ep(ucp_ep_h ucp_ep, uct_ep_h uct_ep, unsigned ep_flush_flags, uct_pending_purge_callback_t purge_cb, - void *purge_arg) + void *purge_arg, + ucp_send_nbx_callback_t discarded_cb, + void *discarded_cb_arg) { + UCP_WORKER_THREAD_CS_CHECK_IS_BLOCKED(ucp_ep->worker); ucs_assert(uct_ep != NULL); ucs_assert(purge_cb != NULL); uct_ep_pending_purge(uct_ep, purge_cb, purge_arg); if (ucp_wireup_ep_test(uct_ep)) { - uct_ep = ucp_worker_discard_wireup_ep(worker, ucp_wireup_ep(uct_ep), - ep_flush_flags, - purge_cb, purge_arg); + uct_ep = ucp_worker_discard_wireup_ep(ucp_ep, ucp_wireup_ep(uct_ep), + ep_flush_flags, purge_cb, + purge_arg, discarded_cb, + discarded_cb_arg); if (uct_ep == NULL) { return; } } - ucp_worker_discard_tl_uct_ep(worker, uct_ep, ep_flush_flags); + ucp_worker_discard_tl_uct_ep(ucp_ep, uct_ep, ep_flush_flags, discarded_cb, + discarded_cb_arg); +} + +void ucp_worker_vfs_refresh(void *obj) +{ + ucp_worker_h worker = obj; + ucp_ep_ext_gen_t *ep_ext; + + UCS_ASYNC_BLOCK(&worker->async); + ucs_list_for_each(ep_ext, &worker->all_eps, ep_list) { + ucp_ep_vfs_init(ucp_ep_from_ext_gen(ep_ext)); + } + UCS_ASYNC_UNBLOCK(&worker->async); } diff --git a/src/ucp/core/ucp_worker.h b/src/ucp/core/ucp_worker.h index 8232f4d3d0d..a9f207f27d6 100644 --- a/src/ucp/core/ucp_worker.h +++ b/src/ucp/core/ucp_worker.h @@ -29,28 +29,40 @@ #define UCP_WORKER_HEADROOM_PRIV_SIZE 32 +#define UCP_WORKER_THREAD_CS_CHECK_IS_BLOCKED(_worker) \ + ucs_assert(ucs_async_is_blocked(&(_worker)->async)) + + #if ENABLE_MT -#define UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(_worker) \ - do { \ - if ((_worker)->flags & UCP_WORKER_FLAG_MT) { \ - UCS_ASYNC_BLOCK(&(_worker)->async); \ - } \ +#define UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(_worker) \ + do { \ + if ((_worker)->flags & UCP_WORKER_FLAG_THREAD_MULTI) { \ + UCS_ASYNC_BLOCK(&(_worker)->async); \ + } \ } while (0) -#define UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(_worker) \ - do { \ - if ((_worker)->flags & UCP_WORKER_FLAG_MT) { \ - UCS_ASYNC_UNBLOCK(&(_worker)->async); \ - } \ +#define UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(_worker) \ + do { \ + if ((_worker)->flags & UCP_WORKER_FLAG_THREAD_MULTI) { \ + UCS_ASYNC_UNBLOCK(&(_worker)->async); \ + } \ } while (0) +#define UCP_WORKER_THREAD_CS_CHECK_IS_BLOCKED_CONDITIONAL(_worker) \ + do { \ + if ((_worker)->flags & UCP_WORKER_FLAG_THREAD_MULTI) { \ + UCP_WORKER_THREAD_CS_CHECK_IS_BLOCKED(_worker); \ + } \ + } while (0) + #else #define UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(_worker) #define UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(_worker) +#define UCP_WORKER_THREAD_CS_CHECK_IS_BLOCKED_CONDITIONAL(_worker) #endif @@ -59,9 +71,28 @@ * UCP worker flags */ enum { - UCP_WORKER_FLAG_EXTERNAL_EVENT_FD = UCS_BIT(0), /**< worker event fd is external */ - UCP_WORKER_FLAG_EDGE_TRIGGERED = UCS_BIT(1), /**< events are edge-triggered */ - UCP_WORKER_FLAG_MT = UCS_BIT(2) /**< MT locking is required */ + /** Internal worker flags start from this bit index, to co-exist with user + * flags specified when worker is created */ + UCP_WORKER_INTERNAL_FLAGS_SHIFT = 32, + + /** The worker can be accessed from 
multiple threads at the same time, so + locking is required */ + UCP_WORKER_FLAG_THREAD_MULTI = + UCS_BIT(UCP_WORKER_INTERNAL_FLAGS_SHIFT + 0), + + /** The worker can be accessed from multiple threads, but only by one thread + at a time, so locking is not required, but IO flush may be required. + This flag is mutually exclusive with UCP_WORKER_FLAG_THREAD_MULTI. */ + UCP_WORKER_FLAG_THREAD_SERIALIZED = + UCS_BIT(UCP_WORKER_INTERNAL_FLAGS_SHIFT + 1), + + /** Events are edge-triggered */ + UCP_WORKER_FLAG_EDGE_TRIGGERED = + UCS_BIT(UCP_WORKER_INTERNAL_FLAGS_SHIFT + 2), + + /** Worker event fd is external */ + UCP_WORKER_FLAG_EXTERNAL_EVENT_FD = + UCS_BIT(UCP_WORKER_INTERNAL_FLAGS_SHIFT + 3) }; @@ -202,17 +233,20 @@ struct ucp_worker_cm { * UCP worker (thread context). */ typedef struct ucp_worker { - unsigned flags; /* Worker flags */ + uint64_t flags; /* Worker flags */ ucs_async_context_t async; /* Async context for this worker */ ucp_context_h context; /* Back-reference to UCP context */ uint64_t uuid; /* Unique ID for wireup */ uct_worker_h uct; /* UCT worker handle */ ucs_mpool_t req_mp; /* Memory pool for requests */ ucs_mpool_t rkey_mp; /* Pool for small memory keys */ - uint64_t atomic_tls; /* Which resources can be used for atomics */ + ucp_tl_bitmap_t atomic_tls; /* Which resources can be used for atomics */ int inprogress; - char name[UCP_WORKER_NAME_MAX]; /* Worker name */ + /* Worker name for tracing and analysis */ + char name[UCP_ENTITY_NAME_MAX]; + /* Worker address name composed of host name and process id */ + char address_name[UCP_WORKER_ADDRESS_NAME_MAX]; unsigned flush_ops_count; /* Number of pending operations */ @@ -225,13 +259,17 @@ typedef struct ucp_worker { void *user_data; /* User-defined data */ ucs_strided_alloc_t ep_alloc; /* Endpoint allocator */ ucs_list_link_t stream_ready_eps; /* List of EPs with received stream data */ - ucs_list_link_t all_eps; /* List of all endpoints */ + unsigned num_all_eps; /* Number of all endpoints (except internal + * endpoints) */ + ucs_list_link_t all_eps; /* List of all endpoints (except internal + * endpoints) */ + ucs_list_link_t internal_eps; /* List of internal endpoints */ ucs_conn_match_ctx_t conn_match_ctx; /* Endpoint-to-endpoint matching context */ ucp_worker_iface_t **ifaces; /* Array of pointers to interfaces, one for each resource */ unsigned num_ifaces; /* Number of elements in ifaces array */ unsigned num_active_ifaces; /* Number of activated ifaces */ - uint64_t scalable_tl_bitmap; /* Map of scalable tl resources */ + ucp_tl_bitmap_t scalable_tl_bitmap; /* Map of scalable tl resources */ ucp_worker_cm_t *cms; /* Array of CMs, one for each component */ ucs_mpool_t am_mp; /* Memory pool for AM receives */ ucs_mpool_t reg_mp; /* Registered memory pool */ @@ -262,10 +300,12 @@ typedef struct ucp_worker { struct { uct_worker_cb_id_t cb_id; /* Keepalive callback id */ - ucs_time_t last_round; /* Last round timespamp */ + ucs_time_t last_round; /* Last round timestamp */ ucs_list_link_t *iter; /* Last EP processed keepalive */ ucp_lane_map_t lane_map; /* Lane map used to retry after no-resources */ - unsigned ep_count; /* Number if EPs processed in current time slot */ + unsigned ep_count; /* Number of EPs processed in current time slot */ + unsigned iter_count; /* Number of progress iterations to skip, + * used to minimize call of ucs_get_time */ } keepalive; } ucp_worker_t; @@ -284,7 +324,9 @@ ucp_worker_get_ep_config(ucp_worker_h worker, const ucp_ep_config_key_t *key, int print_cfg, ucp_worker_cfg_index_t 
*cfg_index_p); ucs_status_t -ucp_worker_add_rkey_config(ucp_worker_h worker, const ucp_rkey_config_key_t *key, +ucp_worker_add_rkey_config(ucp_worker_h worker, + const ucp_rkey_config_key_t *key, + const ucs_sys_dev_distance_t *lanes_distance, ucp_worker_cfg_index_t *cfg_index_p); ucs_status_t ucp_worker_iface_open(ucp_worker_h worker, ucp_rsc_index_t tl_id, @@ -320,15 +362,22 @@ void ucp_worker_keepalive_remove_ep(ucp_ep_h ep); int ucp_worker_is_uct_ep_discarding(ucp_worker_h worker, uct_ep_h uct_ep); /* must be called with async lock held */ -void ucp_worker_discard_uct_ep(ucp_worker_h worker, uct_ep_h uct_ep, +void ucp_worker_discard_uct_ep(ucp_ep_h ucp_ep, uct_ep_h uct_ep, unsigned ep_flush_flags, uct_pending_purge_callback_t purge_cb, - void *purge_arg); + void *purge_arg, + ucp_send_nbx_callback_t discarded_cb, + void *discarded_cb_arg); + +char *ucp_worker_print_used_tls(const ucp_ep_config_key_t *key, + ucp_context_h context, + ucp_worker_cfg_index_t config_idx, char *info, + size_t max); -/* must be called with async lock held */ static UCS_F_ALWAYS_INLINE void ucp_worker_flush_ops_count_inc(ucp_worker_h worker) { + UCP_WORKER_THREAD_CS_CHECK_IS_BLOCKED_CONDITIONAL(worker); ucs_assert(worker->flush_ops_count < UINT_MAX); ++worker->flush_ops_count; } @@ -337,8 +386,11 @@ ucp_worker_flush_ops_count_inc(ucp_worker_h worker) static UCS_F_ALWAYS_INLINE void ucp_worker_flush_ops_count_dec(ucp_worker_h worker) { + UCP_WORKER_THREAD_CS_CHECK_IS_BLOCKED_CONDITIONAL(worker); ucs_assert(worker->flush_ops_count > 0); --worker->flush_ops_count; } +void ucp_worker_vfs_refresh(void *obj); + #endif diff --git a/src/ucp/core/ucp_worker.inl b/src/ucp/core/ucp_worker.inl index 00d76e0e956..97ec122a8c3 100644 --- a/src/ucp/core/ucp_worker.inl +++ b/src/ucp/core/ucp_worker.inl @@ -8,100 +8,74 @@ #define UCP_WORKER_INL_ #include "ucp_worker.h" +#include "ucp_rkey.inl" #include +#include #include -static UCS_F_ALWAYS_INLINE khint_t -ucp_worker_rkey_config_hash_func(ucp_rkey_config_key_t rkey_config_key) -{ - return (khint_t)rkey_config_key.md_map ^ - rkey_config_key.ep_cfg_index ^ - (rkey_config_key.mem_type << 16) ^ - (rkey_config_key.sys_dev << 24); -} +KHASH_IMPL(ucp_worker_rkey_config, ucp_rkey_config_key_t, + ucp_worker_cfg_index_t, 1, ucp_rkey_config_hash_func, + ucp_rkey_config_is_equal); -static UCS_F_ALWAYS_INLINE int -ucp_worker_rkey_config_is_equal(ucp_rkey_config_key_t rkey_config_key1, - ucp_rkey_config_key_t rkey_config_key2) +/** + * Resolve remote key configuration key to a remote key configuration index. + * + * @param [in] worker UCP worker to resolve configuration on. + * @param [in] key Rkey configuration key. + * @param [out] cfg_index_p Filled with configuration index in the worker. 
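
As the comment above describes, ucp_worker_rkey_config_get resolves a configuration key to an index with a fast hash lookup, falling back to ucp_worker_add_rkey_config only on a miss. A standalone sketch of this lookup-or-create pattern (a fixed array stands in for the khash table; all names are hypothetical):

#include <stdio.h>

#define MAX_CFGS 8

typedef struct {
    int key;
    int used;
} cfg_t;

static cfg_t table[MAX_CFGS];

static int config_get(int key, unsigned *index_p)
{
    unsigned i;

    for (i = 0; i < MAX_CFGS; ++i) {
        if (table[i].used && (table[i].key == key)) {
            *index_p = i; /* fast path: configuration already known */
            return 0;
        }
    }
    for (i = 0; i < MAX_CFGS; ++i) {
        if (!table[i].used) {
            table[i].key  = key; /* slow path: create and remember */
            table[i].used = 1;
            *index_p      = i;
            return 0;
        }
    }
    return -1; /* table exhausted */
}

int main(void)
{
    unsigned idx;

    config_get(42, &idx);
    printf("cfg[%u]\n", idx);
    config_get(42, &idx); /* second call takes the fast path */
    printf("cfg[%u]\n", idx);
    return 0;
}
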
+ */ +static UCS_F_ALWAYS_INLINE ucs_status_t ucp_worker_rkey_config_get( + ucp_worker_h worker, const ucp_rkey_config_key_t *key, + const ucs_sys_dev_distance_t *lanes_distance, + ucp_worker_cfg_index_t *cfg_index_p) { - return (rkey_config_key1.md_map == rkey_config_key2.md_map) && - (rkey_config_key1.ep_cfg_index == rkey_config_key2.ep_cfg_index) && - (rkey_config_key1.sys_dev == rkey_config_key2.sys_dev) && - (rkey_config_key1.mem_type == rkey_config_key2.mem_type); -} + khiter_t khiter = kh_get(ucp_worker_rkey_config, &worker->rkey_config_hash, + *key); + if (ucs_likely(khiter != kh_end(&worker->rkey_config_hash))) { + *cfg_index_p = kh_val(&worker->rkey_config_hash, khiter); + return UCS_OK; + } -KHASH_IMPL(ucp_worker_rkey_config, ucp_rkey_config_key_t, ucp_worker_cfg_index_t, - 1, ucp_worker_rkey_config_hash_func, ucp_worker_rkey_config_is_equal); + return ucp_worker_add_rkey_config(worker, key, lanes_distance, cfg_index_p); +} /** * @return Worker name */ static UCS_F_ALWAYS_INLINE const char* -ucp_worker_get_name(ucp_worker_h worker) +ucp_worker_get_address_name(ucp_worker_h worker) { - return worker->name; + return worker->address_name; } /** - * @return endpoint by a pointer received from remote side + * @return endpoint by a key received from remote side */ -static UCS_F_ALWAYS_INLINE ucp_ep_h -ucp_worker_get_ep_by_id(ucp_worker_h worker, ucs_ptr_map_key_t id) -{ - ucp_ep_h ep; - - ucs_assert(id != UCP_EP_ID_INVALID); - ep = (ucp_ep_h)ucs_ptr_map_get(&worker->ptr_map, id); - ucs_assertv((ep == NULL) || (ep->worker == worker), - "worker=%p ep=%p ep->worker=%p", worker, - ep, ep->worker); - return ep; -} - -static UCS_F_ALWAYS_INLINE ucs_ptr_map_key_t -ucp_worker_get_request_id(ucp_worker_h worker, ucp_request_t *req, int indirect) +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_worker_get_ep_by_id(ucp_worker_h worker, ucs_ptr_map_key_t id, + ucp_ep_h *ep_p) { - ucs_ptr_map_key_t id; ucs_status_t status; + void *ptr; - status = ucs_ptr_map_put(&worker->ptr_map, req, indirect, &id); - if (ucs_unlikely(indirect)) { - return (status == UCS_OK) ? 
id : UCP_REQUEST_ID_INVALID; + ucs_assert(id != UCS_PTR_MAP_KEY_INVALID); + status = ucs_ptr_map_get(&worker->ptr_map, id, 0, &ptr); + if (ucs_unlikely((status != UCS_OK) && (status != UCS_ERR_NO_PROGRESS))) { + return status; } - ucs_assert(status == UCS_OK); - return id; -} - -static UCS_F_ALWAYS_INLINE ucp_request_t* -ucp_worker_get_request_by_id(ucp_worker_h worker, ucs_ptr_map_key_t id) -{ - ucp_request_t* request; - - request = (ucp_request_t*)ucs_ptr_map_get(&worker->ptr_map, id); - ucs_assert(request != NULL); - return request; -} - -static UCS_F_ALWAYS_INLINE void -ucp_worker_del_request_id(ucp_worker_h worker, ucs_ptr_map_key_t id) -{ - ucs_status_t status UCS_V_UNUSED; - - status = ucs_ptr_map_del(&worker->ptr_map, id); - ucs_assert(status == UCS_OK); + *ep_p = (ucp_ep_h)ptr; + ucs_assertv((*ep_p)->worker == worker, "worker=%p ep=%p ep->worker=%p", + worker, (*ep_p), (*ep_p)->worker); + return UCS_OK; } -static UCS_F_ALWAYS_INLINE ucp_request_t* -ucp_worker_extract_request_by_id(ucp_worker_h worker, ucs_ptr_map_key_t id) +static UCS_F_ALWAYS_INLINE int +ucp_worker_keepalive_is_enabled(ucp_worker_h worker) { - ucp_request_t *request; - - request = (ucp_request_t*)ucs_ptr_map_extract(&worker->ptr_map, id); - ucs_assert(request != NULL); - return request; + return worker->context->config.keepalive_interval != 0; } /** @@ -110,15 +84,15 @@ ucp_worker_extract_request_by_id(ucp_worker_h worker, ucs_ptr_map_key_t id) static UCS_F_ALWAYS_INLINE ucp_worker_iface_t* ucp_worker_iface(ucp_worker_h worker, ucp_rsc_index_t rsc_index) { - uint64_t tl_bitmap; + ucp_tl_bitmap_t tl_bitmap; if (rsc_index == UCP_NULL_RESOURCE) { return NULL; } tl_bitmap = worker->context->tl_bitmap; - ucs_assert(UCS_BIT(rsc_index) & tl_bitmap); - return worker->ifaces[ucs_bitmap2idx(tl_bitmap, rsc_index)]; + ucs_assert(UCS_BITMAP_GET(tl_bitmap, rsc_index)); + return worker->ifaces[UCS_BITMAP_POPCOUNT_UPTO_INDEX(tl_bitmap, rsc_index)]; } /** @@ -159,15 +133,6 @@ ucp_worker_num_cm_cmpts(const ucp_worker_h worker) return worker->context->config.num_cm_cmpts; } -/** - * @return whether the worker should be using connection manager mode - */ -static UCS_F_ALWAYS_INLINE int -ucp_worker_sockaddr_is_cm_proto(const ucp_worker_h worker) -{ - return !!ucp_worker_num_cm_cmpts(worker); -} - /** * Check if interface with @a iface_attr supports point-to-point connections. * @@ -226,26 +191,52 @@ ucp_worker_is_tl_2sockaddr(ucp_worker_h worker, ucp_rsc_index_t rsc_index) UCT_IFACE_FLAG_CONNECT_TO_SOCKADDR); } -/** - * Resolve remote key configuration key to a remote key configuration index - * - * @param [in] worker UCP worker to resolve configuration on - * @param [in] key Rkey configuration key - * @param [out] cfg_index_p Filled with configuration index in the worker. 
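
ucp_worker_iface above translates a global resource index into a slot of the dense ifaces array by counting the set bits below that index (UCS_BITMAP_POPCOUNT_UPTO_INDEX, replacing the old single-word ucs_bitmap2idx). A single-word standalone sketch of the mapping (assumes index < 64):

#include <stdint.h>
#include <stdio.h>

/* the ifaces array holds entries only for set bits, so the slot of resource
 * rsc is the number of set bits strictly below rsc */
static unsigned popcount_upto(uint64_t bitmap, unsigned index)
{
    return (unsigned)__builtin_popcountll(bitmap &
                                          ((UINT64_C(1) << index) - 1));
}

int main(void)
{
    uint64_t tl_bitmap = 0x29; /* resources 0, 3 and 5 are enabled */

    /* resource 5 lives in ifaces[2], since bits 0 and 3 precede it */
    printf("rsc 5 -> ifaces[%u]\n", popcount_upto(tl_bitmap, 5));
    return 0;
}
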
- */ -static UCS_F_ALWAYS_INLINE ucs_status_t -ucp_worker_get_rkey_config(ucp_worker_h worker, const ucp_rkey_config_key_t *key, - ucp_worker_cfg_index_t *cfg_index_p) +static UCS_F_ALWAYS_INLINE unsigned +ucp_worker_common_address_pack_flags(ucp_worker_h worker) { - khiter_t khiter = kh_get(ucp_worker_rkey_config, &worker->rkey_config_hash, - *key); - if (ucs_likely(khiter != kh_end(&worker->rkey_config_hash))) { - *cfg_index_p = kh_val(&worker->rkey_config_hash, khiter); - return UCS_OK; + unsigned pack_flags = 0; + + if (worker->context->num_mem_type_detect_mds > 0) { + pack_flags |= UCP_ADDRESS_PACK_FLAG_SYS_DEVICE; } - return ucp_worker_add_rkey_config(worker, key, cfg_index_p); + return pack_flags; } +static UCS_F_ALWAYS_INLINE unsigned +ucp_worker_default_address_pack_flags(ucp_worker_h worker) +{ + return ucp_worker_common_address_pack_flags(worker) | + UCP_ADDRESS_PACK_FLAG_WORKER_UUID | + UCP_ADDRESS_PACK_FLAG_WORKER_NAME | + UCP_ADDRESS_PACK_FLAG_DEVICE_ADDR | + UCP_ADDRESS_PACK_FLAG_IFACE_ADDR | UCP_ADDRESS_PACK_FLAG_EP_ADDR; +} + +#define UCP_WORKER_GET_EP_BY_ID(_ep_p, _worker, _ep_id, _action, _fmt_str, ...) \ + { \ + ucs_status_t __status; \ + \ + __status = ucp_worker_get_ep_by_id(_worker, _ep_id, _ep_p); \ + if (ucs_unlikely(__status != UCS_OK)) { \ + ucs_trace_data("worker %p: ep id 0x%" PRIx64 \ + " was not found, drop" _fmt_str, \ + _worker, _ep_id, ##__VA_ARGS__); \ + _action; \ + } \ + } + +#define UCP_WORKER_GET_VALID_EP_BY_ID(_ep_p, _worker, _ep_id, _action, \ + _fmt_str, ...) \ + { \ + UCP_WORKER_GET_EP_BY_ID(_ep_p, _worker, _ep_id, _action, _fmt_str, \ + ##__VA_ARGS__); \ + if (ucs_unlikely((*(_ep_p))->flags & UCP_EP_FLAG_CLOSED)) { \ + ucs_trace_data("worker %p: ep id 0x%" PRIx64 " was already closed" \ + " ep %p, drop " _fmt_str, \ + _worker, _ep_id, *(_ep_p), ##__VA_ARGS__); \ + _action; \ + } \ + } #endif diff --git a/src/ucp/dt/datatype_iter.h b/src/ucp/dt/datatype_iter.h index 9d25abefdf1..24533096fcd 100644 --- a/src/ucp/dt/datatype_iter.h +++ b/src/ucp/dt/datatype_iter.h @@ -11,6 +11,7 @@ #include "dt_generic.h" #include +#include /* @@ -18,10 +19,11 @@ * into a receive buffer. */ typedef struct { - ucp_dt_class_t dt_class; /* Datatype class (contig/iov/...) */ - ucs_memory_type_t mem_type; /* Memory type, needed to pack/unpack */ - size_t length; /* Total packed flat length */ - size_t offset; /* Current flat offset */ + ucp_dt_class_t dt_class; /* Datatype class (contig/iov/...) 
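
The UCP_WORKER_GET_EP_BY_ID and UCP_WORKER_GET_VALID_EP_BY_ID macros above bundle the lookup, a trace message, and the caller's bail-out action into one statement. A sketch of how a handler might use them; the handler below is hypothetical and depends on the internal headers above, so it is illustrative rather than compilable on its own:

/* illustrative only: a hypothetical handler using the macros above */
static ucs_status_t example_am_handler(ucp_worker_h worker, uint64_t ep_id,
                                       unsigned length)
{
    ucp_ep_h ep;

    /* on a stale id or a closed endpoint, trace and drop the message */
    UCP_WORKER_GET_VALID_EP_BY_ID(&ep, worker, ep_id, return UCS_OK,
                                  "am length %u", length);

    /* here ep is valid and not closed */
    return UCS_OK;
}
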
*/ + ucs_memory_info_t mem_info; /* Memory type and locality, needed to + pack/unpack */ + size_t length; /* Total packed flat length */ + size_t offset; /* Current flat offset */ union { struct { void *buffer; /* Contiguous buffer pointer */ diff --git a/src/ucp/dt/datatype_iter.inl b/src/ucp/dt/datatype_iter.inl index 82295151fcb..b7995018a4d 100644 --- a/src/ucp/dt/datatype_iter.inl +++ b/src/ucp/dt/datatype_iter.inl @@ -32,8 +32,7 @@ static UCS_F_ALWAYS_INLINE void ucp_datatype_contig_iter_init(ucp_context_h context, void *buffer, size_t length, ucp_datatype_t datatype, ucp_datatype_iter_t *dt_iter) { - dt_iter->mem_type = ucp_memory_type_detect(context, buffer, - length); + ucp_memory_detect(context, buffer, length, &dt_iter->mem_info); dt_iter->length = length; dt_iter->type.contig.buffer = buffer; dt_iter->type.contig.reg.md_map = 0; @@ -53,11 +52,10 @@ ucp_datatype_iov_iter_init(ucp_context_h context, void *buffer, size_t count, if (ucs_likely(count > 0)) { *sg_count = ucs_min(count, (size_t)UINT8_MAX); - dt_iter->mem_type = ucp_memory_type_detect(context, iov->buffer, - iov->length); + ucp_memory_detect(context, iov->buffer, iov->length, &dt_iter->mem_info); } else { - *sg_count = 1; - dt_iter->mem_type = UCS_MEMORY_TYPE_HOST; + *sg_count = 1; + ucp_memory_info_set_host(&dt_iter->mem_info); } } @@ -70,10 +68,10 @@ ucp_datatype_generic_iter_init(ucp_context_h context, void *buffer, size_t count state = dt_gen->ops.start_pack(dt_gen->context, buffer, count); - dt_iter->mem_type = UCS_MEMORY_TYPE_LAST; dt_iter->length = dt_gen->ops.packed_size(state); dt_iter->type.generic.dt_gen = dt_gen; dt_iter->type.generic.state = state; + ucp_memory_info_set_host(&dt_iter->mem_info); } /* @@ -128,11 +126,12 @@ ucp_datatype_iter_next_pack(const ucp_datatype_iter_t *dt_iter, switch (dt_iter->dt_class) { case UCP_DATATYPE_CONTIG: - ucs_assert(dt_iter->mem_type < UCS_MEMORY_TYPE_LAST); + ucs_assert(dt_iter->mem_info.type < UCS_MEMORY_TYPE_LAST); length = ucs_min(dt_iter->length - dt_iter->offset, max_length); src = UCS_PTR_BYTE_OFFSET(dt_iter->type.contig.buffer, dt_iter->offset); - ucp_dt_contig_pack(worker, dest, src, length, dt_iter->mem_type); + ucp_dt_contig_pack(worker, dest, src, length, + (ucs_memory_type_t)dt_iter->mem_info.type); break; case UCP_DATATYPE_IOV: length = ucs_min(dt_iter->length - dt_iter->offset, max_length); @@ -178,9 +177,10 @@ ucp_datatype_iter_next_unpack(const ucp_datatype_iter_t *dt_iter, switch (dt_iter->dt_class) { case UCP_DATATYPE_CONTIG: - ucs_assert(dt_iter->mem_type < UCS_MEMORY_TYPE_LAST); + ucs_assert(dt_iter->mem_info.type < UCS_MEMORY_TYPE_LAST); dest = UCS_PTR_BYTE_OFFSET(dt_iter->type.contig.buffer, dt_iter->offset); - ucp_dt_contig_unpack(worker, dest, src, length, dt_iter->mem_type); + ucp_dt_contig_unpack(worker, dest, src, length, + (ucs_memory_type_t)dt_iter->mem_info.type); status = UCS_OK; break; case UCP_DATATYPE_IOV: @@ -290,13 +290,14 @@ ucp_datatype_iter_is_end(const ucp_datatype_iter_t *dt_iter) */ static UCS_F_ALWAYS_INLINE ucs_status_t ucp_datatype_iter_mem_reg(ucp_context_h context, ucp_datatype_iter_t *dt_iter, - ucp_md_map_t md_map) + ucp_md_map_t md_map, unsigned uct_flags) { /* TODO support IOV datatype */ ucs_assert(dt_iter->dt_class == UCP_DATATYPE_CONTIG); + return ucp_mem_rereg_mds(context, md_map, dt_iter->type.contig.buffer, - dt_iter->length, UCT_MD_MEM_ACCESS_RMA, NULL, - dt_iter->mem_type, NULL, + dt_iter->length, uct_flags, NULL, + (ucs_memory_type_t)dt_iter->mem_info.type, NULL, dt_iter->type.contig.reg.memh, 
&dt_iter->type.contig.reg.md_map); } @@ -307,7 +308,8 @@ ucp_datatype_iter_mem_reg(ucp_context_h context, ucp_datatype_iter_t *dt_iter, static UCS_F_ALWAYS_INLINE void ucp_datatype_iter_mem_dereg(ucp_context_h context, ucp_datatype_iter_t *dt_iter) { - ucp_mem_rereg_mds(context, 0, NULL, 0, 0, NULL, dt_iter->mem_type, NULL, + ucp_mem_rereg_mds(context, 0, NULL, 0, 0, NULL, + (ucs_memory_type_t)dt_iter->mem_info.type, NULL, dt_iter->type.contig.reg.memh, &dt_iter->type.contig.reg.md_map); } diff --git a/src/ucp/proto/proto.h b/src/ucp/proto/proto.h index 38f11699466..7462f6737f0 100644 --- a/src/ucp/proto/proto.h +++ b/src/ucp/proto/proto.h @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -34,6 +34,10 @@ #define UCP_PROTO_ID_INVALID ((ucp_proto_id_t)-1) +/** Maximal length of ucp_proto_config_str_func_t output */ +#define UCP_PROTO_CONFIG_STR_MAX 128 + + /* Protocol identifier */ typedef unsigned ucp_proto_id_t; @@ -47,7 +51,10 @@ typedef uint64_t ucp_proto_id_mask_t; */ enum { UCP_PROTO_FLAG_AM_SHORT = UCS_BIT(0), /* The protocol uses only uct_ep_am_short() */ - UCP_PROTO_FLAG_PUT_SHORT = UCS_BIT(1) /* The protocol uses only uct_ep_put_short() */ + UCP_PROTO_FLAG_PUT_SHORT = UCS_BIT(1), /* The protocol uses only uct_ep_put_short() */ + UCP_PROTO_FLAG_TAG_SHORT = UCS_BIT(2), /* The protocol uses only + uct_ep_tag_eager_short() */ + UCP_PROTO_FLAG_INVALID = UCS_BIT(3) /* The protocol is a placeholder */ }; @@ -73,6 +80,11 @@ typedef struct { typedef struct { const ucp_proto_t *proto; /* Protocol definition */ const void *priv; /* Protocol private configuration space */ + ucp_worker_cfg_index_t ep_cfg_index; /* Endpoint configuration index this + protocol was selected on */ + ucp_worker_cfg_index_t rkey_cfg_index; /* Remote key configuration index + this protocol was selected on + (can be UCP_WORKER_CFG_INDEX_NULL) */ ucp_proto_select_param_t select_param; /* Copy of protocol selection parameters, used to re-select protocol for existing in-progress request */ @@ -111,6 +123,7 @@ typedef struct { /* Input parameters */ ucp_worker_h worker; /* Worker to initialize on */ const ucp_proto_select_param_t *select_param; /* Operation parameters */ + ucp_worker_cfg_index_t ep_cfg_index; /* Endpoint configuration index */ const ucp_ep_config_key_t *ep_config_key; /* Endpoint configuration */ const ucp_rkey_config_key_t *rkey_config_key; /* Remote key configuration, may be NULL */ @@ -140,14 +153,18 @@ typedef ucs_status_t /** * Dump protocol-specific configuration. * - * @param [in] priv Protocol private data, which was previously filled by - * @ref ucp_proto_init_func_t. - * @param [out] strb Filled with a string of protocol configuration text. - * The user is responsible to release the string by - * calling @ref ucs_string_buffer_cleanup. + * @param [in] min_length Return information starting from this message length. + * @param [in] max_length Return information up to this message length (inclusive). + * @param [in] priv Protocol private data, which was previously filled by + * @ref ucp_proto_init_func_t. + * @param [out] strb Protocol configuration text should be written to this + * string buffer. This function should only **append** + * data to the buffer, and should not initialize, release + * or erase any data already in the buffer.
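+ * + * As an illustration only (assuming a hypothetical protocol whose private + * struct 'my_proto_priv_t' carries a 'frag_size' field, neither of which is + * part of this patch), a conforming implementation appends and never resets: + * + * @code + * static void my_proto_config_str(size_t min_length, size_t max_length, + * const void *priv, ucs_string_buffer_t *strb) + * { + * const my_proto_priv_t *mp = priv; + * ucs_string_buffer_appendf(strb, "frag:%zu", mp->frag_size); + * } + * @endcode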
*/ -typedef void -(*ucp_proto_config_str_func_t)(const void *priv, ucs_string_buffer_t *strb); +typedef void (*ucp_proto_config_str_func_t)(size_t min_length, + size_t max_length, const void *priv, + ucs_string_buffer_t *strb); /** @@ -158,7 +175,11 @@ struct ucp_proto { unsigned flags; /* Protocol flags for special handling */ ucp_proto_init_func_t init; /* Initialization function */ ucp_proto_config_str_func_t config_str; /* Configuration dump function */ - uct_pending_callback_t progress; /* UCT progress function */ + + /* Initial UCT progress function, can be changed during the protocol + * request lifetime to implement different stages + */ + uct_pending_callback_t progress; }; diff --git a/src/ucp/proto/proto_am.c b/src/ucp/proto/proto_am.c index 3a73ee48a3d..50663f8ceae 100644 --- a/src/ucp/proto/proto_am.c +++ b/src/ucp/proto/proto_am.c @@ -76,13 +76,17 @@ ucp_do_am_single(uct_pending_req_t *self, uint8_t am_id, ucs_status_t ucp_proto_progress_am_single(uct_pending_req_t *self) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); - ucs_status_t status = ucp_do_am_single(self, req->send.proto.am_id, - ucp_proto_pack, - ucp_proto_max_packed_size()); - if (status == UCS_OK) { - req->send.proto.comp_cb(req); + ucs_status_t status; + + status = ucp_do_am_single(self, req->send.proto.am_id, ucp_proto_pack, + ucp_proto_max_packed_size()); + if (ucs_unlikely(status == UCS_ERR_NO_RESOURCE)) { + return UCS_ERR_NO_RESOURCE; } - return status; + + /* TODO: handle failure */ + req->send.proto.comp_cb(req); + return UCS_OK; } void ucp_proto_am_zcopy_req_complete(ucp_request_t *req, ucs_status_t status) @@ -97,17 +101,10 @@ void ucp_proto_am_zcopy_completion(uct_completion_t *self) ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.state.uct_comp); - if (req->send.state.dt.offset == req->send.length) { - ucp_proto_am_zcopy_req_complete(req, self->status); - } else if (self->status != UCS_OK) { - ucs_assert(req->send.state.uct_comp.count == 0); - ucs_assert(self->status != UCS_INPROGRESS); - - /* NOTE: the request is in pending queue if data was not completely sent, - * just dereg the buffer here and complete request on purge - * pending later. - */ - ucp_request_send_buffer_dereg(req); - req->send.state.uct_comp.func = NULL; + if (req->send.state.dt.offset != req->send.length) { + /* Cannot complete since not all fragments were posted yet */ + return; } + + ucp_proto_am_zcopy_req_complete(req, self->status); } diff --git a/src/ucp/proto/proto_am.inl b/src/ucp/proto/proto_am.inl index 6440c774003..72037e9ba1b 100644 --- a/src/ucp/proto/proto_am.inl +++ b/src/ucp/proto/proto_am.inl @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -20,9 +20,22 @@ #define UCP_STATUS_PENDING_SWITCH (UCS_ERR_LAST - 1) + typedef void (*ucp_req_complete_func_t)(ucp_request_t *req, ucs_status_t status); +static UCS_F_ALWAYS_INLINE void +ucp_add_uct_iov_elem(uct_iov_t *iov, void *buffer, size_t length, + uct_mem_h memh, size_t *iov_cnt) +{ + iov[*iov_cnt].buffer = buffer; + iov[*iov_cnt].length = length; + iov[*iov_cnt].count = 1; + iov[*iov_cnt].stride = 0; + iov[*iov_cnt].memh = memh; + ++(*iov_cnt); +} + static UCS_F_ALWAYS_INLINE ucs_status_t ucp_do_am_bcopy_single(uct_pending_req_t *self, uint8_t am_id, uct_pack_callback_t pack_cb) @@ -231,30 +244,24 @@ ucs_status_t ucp_am_zcopy_common(ucp_request_t *req, const void *hdr, ucp_ep_t *ep = req->send.ep; ucp_md_index_t md_idx = ucp_ep_md_index(ep, req->send.lane); size_t iovcnt = 0ul; - unsigned user_hdr_iov_cnt; + + ucp_dt_iov_copy_uct(ep->worker->context, iov, &iovcnt, + max_iov - !!user_hdr_size, state, req->send.buffer, + req->send.datatype, max_length - user_hdr_size, md_idx, NULL); if (user_hdr_size != 0) { ucs_assert((req->send.length == 0) || (max_length > user_hdr_size)); ucs_assert(max_iov > 1); - iov[0].buffer = user_hdr_desc + 1; - iov[0].length = user_hdr_size; - iov[0].memh = ucp_memh2uct(user_hdr_desc->memh, md_idx); - iov[0].stride = 0; - iov[0].count = 1; - user_hdr_iov_cnt = 1; - } else { - user_hdr_iov_cnt = 0; - } + ucs_assert(user_hdr_desc != NULL); - ucp_dt_iov_copy_uct(ep->worker->context, iov + user_hdr_iov_cnt, &iovcnt, - max_iov - user_hdr_iov_cnt, state, req->send.buffer, - req->send.datatype, max_length - user_hdr_size, - md_idx, NULL); + ucp_add_uct_iov_elem(iov, user_hdr_desc + 1, user_hdr_size, + ucp_memh2uct(user_hdr_desc->memh, md_idx), + &iovcnt); + } return uct_ep_am_zcopy(ep->uct_eps[req->send.lane], am_id, (void*)hdr, - hdr_size, iov, iovcnt + user_hdr_iov_cnt, 0, - &req->send.state.uct_comp); + hdr_size, iov, iovcnt, 0, &req->send.state.uct_comp); } static UCS_F_ALWAYS_INLINE ucs_status_t @@ -345,11 +352,14 @@ ucs_status_t ucp_do_am_zcopy_multi(uct_pending_req_t *self, uint8_t am_id_first, if (enable_am_bw && (req->send.state.dt.offset != 0)) { req->send.lane = ucp_send_request_get_am_bw_lane(req); - ucp_send_request_add_reg_lane(req, req->send.lane); } else { req->send.lane = ucp_ep_get_am_lane(ep); } + if (enable_am_bw || (req->send.state.dt.offset == 0)) { + ucp_send_request_add_reg_lane(req, req->send.lane); + } + uct_ep = ep->uct_eps[req->send.lane]; max_middle = ucp_ep_get_max_zcopy(ep, req->send.lane) - hdr_size_middle; max_iov = ucp_ep_get_max_iov(ep, req->send.lane); @@ -371,6 +381,8 @@ ucs_status_t ucp_do_am_zcopy_multi(uct_pending_req_t *self, uint8_t am_id_first, if (offset == 0) { /* First stage */ ucs_assert(req->send.lane == ucp_ep_get_am_lane(ep)); + ucs_assert(am_id_first != UCP_AM_ID_LAST); + ucs_assert(hdr_first != NULL); status = ucp_am_zcopy_common(req, hdr_first, hdr_size_first, user_hdr_desc, user_hdr_size, iov, max_iov, @@ -385,6 +397,9 @@ ucs_status_t ucp_do_am_zcopy_multi(uct_pending_req_t *self, uint8_t am_id_first, iov[0].length, status); } else { /* Middle or last stage */ + ucs_assert(am_id_middle != UCP_AM_ID_LAST); + ucs_assert(hdr_middle != NULL); + mid_len = ucs_min(max_middle, req->send.length - offset); ucs_assert(offset + mid_len <= req->send.length); ucp_dt_iov_copy_uct(ep->worker->context, iov, &iovcnt, max_iov, &state, @@ -448,9 +463,7 @@ ucs_status_t ucp_do_am_zcopy_multi(uct_pending_req_t *self, uint8_t am_id_first, UCP_REQUEST_SEND_PROTO_ZCOPY_AM, status); if (UCS_STATUS_IS_ERR(status)) { - if 
(req->send.state.uct_comp.count == 0) { - complete(req, status); - } + ucp_request_send_state_ff(req, status); return UCS_OK; } else { if (enable_am_bw) { @@ -517,18 +530,26 @@ ucp_proto_get_short_max(const ucp_request_t *req, -1 : msg_config->max_short; } -static UCS_F_ALWAYS_INLINE ucp_request_t* -ucp_proto_ssend_ack_request_alloc(ucp_worker_h worker, ucs_ptr_map_key_t ep_id) +static UCS_F_ALWAYS_INLINE int +ucp_proto_is_inline(ucp_ep_h ep, const ucp_memtype_thresh_t *max_eager_short, + ssize_t length) { - ucp_request_t *req; + return (ucs_likely(length <= max_eager_short->memtype_off) || + (length <= max_eager_short->memtype_on && + ucp_memory_type_cache_is_empty(ep->worker->context))); +} - req = ucp_request_get(worker); +static UCS_F_ALWAYS_INLINE ucp_request_t* +ucp_proto_ssend_ack_request_alloc(ucp_worker_h worker, ucp_ep_h ep) +{ + ucp_request_t *req = ucp_request_get(worker); if (req == NULL) { + ucs_error("failed to allocate UCP request"); return NULL; } req->flags = 0; - req->send.ep = ucp_worker_get_ep_by_id(worker, ep_id); + req->send.ep = ep; req->send.uct.func = ucp_proto_progress_am_single; req->send.proto.comp_cb = ucp_request_put; req->send.proto.status = UCS_OK; @@ -536,6 +557,18 @@ ucp_proto_ssend_ack_request_alloc(ucp_worker_h worker, ucs_ptr_map_key_t ep_id) return req; } +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_am_short_handle_status_from_pending(ucp_request_t *req, ucs_status_t status) +{ + if (ucs_unlikely(status == UCS_ERR_NO_RESOURCE)) { + req->send.lane = ucp_ep_get_am_lane(req->send.ep); + return UCS_ERR_NO_RESOURCE; + } + + ucp_request_complete_send(req, status); + return UCS_OK; +} + static UCS_F_ALWAYS_INLINE ucs_status_t ucp_am_bcopy_handle_status_from_pending(uct_pending_req_t *self, int multi, int tag_sync, ucs_status_t status) @@ -545,9 +578,7 @@ ucp_am_bcopy_handle_status_from_pending(uct_pending_req_t *self, int multi, if (multi) { if (status == UCS_INPROGRESS) { return UCS_INPROGRESS; - } - - if (ucs_unlikely(status == UCP_STATUS_PENDING_SWITCH)) { + } else if (ucs_unlikely(status == UCP_STATUS_PENDING_SWITCH)) { return UCS_OK; } } else { @@ -560,7 +591,8 @@ ucp_am_bcopy_handle_status_from_pending(uct_pending_req_t *self, int multi, ucp_request_send_generic_dt_finish(req); if (tag_sync) { - ucp_tag_eager_sync_completion(req, UCP_REQUEST_FLAG_LOCAL_COMPLETED, + ucp_tag_eager_sync_completion(req, + UCP_REQUEST_FLAG_SYNC_LOCAL_COMPLETED, status); } else { ucp_request_complete_send(req, status); diff --git a/src/ucp/proto/proto_common.c b/src/ucp/proto/proto_common.c index 3fdce4b3920..ab069b40b09 100644 --- a/src/ucp/proto/proto_common.c +++ b/src/ucp/proto/proto_common.c @@ -59,10 +59,10 @@ void ucp_proto_common_lane_priv_str(const ucp_proto_common_lane_priv_t *lpriv, { ucs_string_buffer_appendf(strb, "ln:%d", lpriv->lane); if (lpriv->memh_index != UCP_NULL_RESOURCE) { - ucs_string_buffer_appendf(strb, "/mh:%d", lpriv->memh_index); + ucs_string_buffer_appendf(strb, ",mh%d", lpriv->memh_index); } if (lpriv->rkey_index != UCP_NULL_RESOURCE) { - ucs_string_buffer_appendf(strb, "/rk:%d", lpriv->rkey_index); + ucs_string_buffer_appendf(strb, ",rk%d", lpriv->rkey_index); } } @@ -82,11 +82,24 @@ ucp_proto_common_get_iface_attr(const ucp_proto_init_params_t *params, ucp_proto_common_get_rsc_index(params, lane)); } -size_t ucp_proto_get_iface_attr_field(const uct_iface_attr_t *iface_attr, - ptrdiff_t field_offset, size_t dfl_value) +size_t ucp_proto_common_get_iface_attr_field(const uct_iface_attr_t *iface_attr, + ptrdiff_t field_offset, + size_t dfl_value) 
{ - return (field_offset == UCP_PROTO_COMMON_OFFSET_INVALID) ? dfl_value : - *(const size_t*)UCS_PTR_BYTE_OFFSET(iface_attr, field_offset); + if (field_offset == UCP_PROTO_COMMON_OFFSET_INVALID) { + return dfl_value; + } + + return *(const size_t*)UCS_PTR_BYTE_OFFSET(iface_attr, field_offset); +} + +size_t +ucp_proto_common_get_max_frag(const ucp_proto_common_init_params_t *params, + const uct_iface_attr_t *iface_attr) +{ + return ucp_proto_common_get_iface_attr_field(iface_attr, + params->max_frag_offs, + SIZE_MAX); } double @@ -97,41 +110,49 @@ ucp_proto_common_iface_bandwidth(const ucp_proto_common_init_params_t *params, &iface_attr->bandwidth); } -ucp_lane_index_t -ucp_proto_common_find_lanes(const ucp_proto_common_init_params_t *params, - ucp_lane_type_t lane_type, uint64_t tl_cap_flags, - ucp_lane_index_t max_lanes, ucp_lane_map_t exclude_map, - ucp_lane_index_t *lanes, ucp_md_map_t *reg_md_map_p) +static ucp_lane_index_t +ucp_proto_common_find_lanes_internal(const ucp_proto_init_params_t *params, + unsigned flags, ucp_lane_type_t lane_type, + uint64_t tl_cap_flags, + ucp_lane_index_t max_lanes, + ucp_lane_map_t exclude_map, + ucp_lane_index_t *lanes) { - ucp_context_h context = params->super.worker->context; - const ucp_ep_config_key_t *ep_config_key = params->super.ep_config_key; - const ucp_rkey_config_key_t *rkey_config_key = params->super.rkey_config_key; - const ucp_proto_select_param_t *select_param = params->super.select_param; + UCS_STRING_BUFFER_ONSTACK(sel_param_strb, UCP_PROTO_SELECT_PARAM_STR_MAX); + ucp_context_h context = params->worker->context; + const ucp_ep_config_key_t *ep_config_key = params->ep_config_key; + const ucp_rkey_config_key_t *rkey_config_key = params->rkey_config_key; + const ucp_proto_select_param_t *select_param = params->select_param; const uct_iface_attr_t *iface_attr; ucp_lane_index_t lane, num_lanes; const uct_md_attr_t *md_attr; ucp_rsc_index_t rsc_index; - ucs_string_buffer_t strb; ucp_md_index_t md_index; ucp_lane_map_t lane_map; - size_t frag_size; + char lane_desc[64]; - ucp_proto_select_param_str(select_param, &strb); - ucs_trace("selecting %d out of %d lanes for %s %s", max_lanes, - ep_config_key->num_lanes, params->super.proto_name, - ucs_string_buffer_cstr(&strb)); - ucs_string_buffer_cleanup(&strb); + num_lanes = 0; - if (params->flags & UCP_PROTO_COMMON_INIT_FLAG_SEND_ZCOPY) { + ucp_proto_select_param_str(select_param, &sel_param_strb); + if (rkey_config_key != NULL) { + ucs_string_buffer_appendf(&sel_param_strb, "->"); + ucp_rkey_config_dump_brief(rkey_config_key, &sel_param_strb); + } + ucs_trace("selecting up to %d/%d lanes for %s %s", max_lanes, + ep_config_key->num_lanes, params->proto_name, + ucs_string_buffer_cstr(&sel_param_strb)); + ucs_log_indent(1); + + if (flags & UCP_PROTO_COMMON_INIT_FLAG_SEND_ZCOPY) { if ((select_param->dt_class == UCP_DATATYPE_GENERIC) || (select_param->dt_class == UCP_DATATYPE_IOV)) { /* Generic/IOV datatype cannot be used with zero-copy send */ /* TODO support IOV registration */ ucs_trace("datatype %s cannot be used with zcopy", ucp_datatype_class_names[select_param->dt_class]); - return 0; + goto out; } - } else if (!(params->flags & UCP_PROTO_COMMON_INIT_FLAG_MEM_TYPE) && + } else if (!(flags & UCP_PROTO_COMMON_INIT_FLAG_MEM_TYPE) && (select_param->dt_class != UCP_DATATYPE_GENERIC) && !UCP_MEM_IS_ACCESSIBLE_FROM_CPU(select_param->mem_type)) { /* If zero-copy is off, the memory must be host-accessible for @@ -139,34 +160,37 @@ ucp_proto_common_find_lanes(const ucp_proto_common_init_params_t 
*params, ucs_trace("memory type %s with datatype %s is not supported", ucs_memory_type_names[select_param->mem_type], ucp_datatype_class_names[select_param->dt_class]); - return 0; + goto out; } lane_map = UCS_MASK(ep_config_key->num_lanes) & ~exclude_map; - *reg_md_map_p = 0; - num_lanes = 0; ucs_for_each_bit(lane, lane_map) { if (num_lanes >= max_lanes) { break; } - /* Check if lane type matches */ ucs_assert(lane < UCP_MAX_LANES); - if (!(ep_config_key->lanes[lane].lane_types & UCS_BIT(lane_type))) { - ucs_trace("lane[%d]: no %s", lane, - ucp_lane_type_info[lane_type].short_name); + rsc_index = ep_config_key->lanes[lane].rsc_index; + if (rsc_index == UCP_NULL_RESOURCE) { continue; } - rsc_index = ep_config_key->lanes[lane].rsc_index; - if (rsc_index == UCP_NULL_RESOURCE) { + snprintf(lane_desc, sizeof(lane_desc), + "lane[%d] " UCT_TL_RESOURCE_DESC_FMT, lane, + UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[rsc_index].tl_rsc)); + + /* Check if lane type matches */ + ucs_assert(lane < UCP_MAX_LANES); + if (!(ep_config_key->lanes[lane].lane_types & UCS_BIT(lane_type))) { + ucs_trace("%s: no %s in lane types", lane_desc, + ucp_lane_type_info[lane_type].short_name); continue; } /* Check iface capabilities */ - iface_attr = ucp_proto_common_get_iface_attr(&params->super, lane); + iface_attr = ucp_proto_common_get_iface_attr(params, lane); if (!ucs_test_all_flags(iface_attr->cap.flags, tl_cap_flags)) { - ucs_trace("lane[%d]: no cap 0x%"PRIx64, lane, tl_cap_flags); + ucs_trace("%s: no cap 0x%" PRIx64, lane_desc, tl_cap_flags); continue; } @@ -174,63 +198,149 @@ ucp_proto_common_find_lanes(const ucp_proto_common_init_params_t *params, md_attr = &context->tl_mds[md_index].attr; /* Check memory registration capabilities for zero-copy case */ - if (params->flags & UCP_PROTO_COMMON_INIT_FLAG_SEND_ZCOPY) { + if (flags & UCP_PROTO_COMMON_INIT_FLAG_SEND_ZCOPY) { if (md_attr->cap.flags & UCT_MD_FLAG_NEED_MEMH) { /* Memory domain must support registration on the relevant * memory type */ if (!(md_attr->cap.flags & UCT_MD_FLAG_REG) || !(md_attr->cap.reg_mem_types & UCS_BIT(select_param->mem_type))) { - ucs_trace("lane[%d]: no reg of mem type %s", lane, + ucs_trace("%s: no reg of mem type %s", lane_desc, ucs_memory_type_names[select_param->mem_type]); continue; } - - *reg_md_map_p |= UCS_BIT(md_index); - } else { - /* Memory domain which does not require a registration for zero + } else if (!(md_attr->cap.access_mem_types & + UCS_BIT(select_param->mem_type))) { + /* + * Memory domain which does not require a registration for zero * copy operation must be able to access the relevant memory type - * TODO UCT should expose a bitmap of accessible memory types */ - if (!(md_attr->cap.access_mem_types & UCS_BIT(select_param->mem_type))) { - ucs_trace("lane[%d]: no access to mem type %s", lane, - ucs_memory_type_names[select_param->mem_type]); - continue; - } + ucs_trace("%s: no access to mem type %s", lane_desc, + ucs_memory_type_names[select_param->mem_type]); + continue; } } /* Check remote access capabilities */ - if (params->flags & UCP_PROTO_COMMON_INIT_FLAG_REMOTE_ACCESS) { - ucs_assert(rkey_config_key != NULL); + if (flags & UCP_PROTO_COMMON_INIT_FLAG_REMOTE_ACCESS) { + if (rkey_config_key == NULL) { + ucs_trace("protocol requires remote access but remote key is " + "not present"); + goto out; + } + if (md_attr->cap.flags & UCT_MD_FLAG_NEED_RKEY) { if (!(rkey_config_key->md_map & UCS_BIT(ep_config_key->lanes[lane].dst_md_index))) { - ucs_trace("lane[%d]: no support of dst md map 0x%"PRIx64, - lane,
rkey_config_key->md_map); + ucs_trace("%s: no support of dst md map 0x%" PRIx64, + lane_desc, rkey_config_key->md_map); continue; } } else if (!(md_attr->cap.access_mem_types & UCS_BIT(rkey_config_key->mem_type))) { - ucs_trace("lane[%d]: no access to remote mem type %s", lane, + ucs_trace("%s: no access to remote mem type %s", lane_desc, ucs_memory_type_names[rkey_config_key->mem_type]); continue; } } + lanes[num_lanes++] = lane; + } + +out: + ucs_trace("selected %d lanes", num_lanes); + ucs_log_indent(-1); + return num_lanes; +} + +ucp_md_map_t +ucp_proto_common_reg_md_map(const ucp_proto_common_init_params_t *params, + ucp_lane_map_t lane_map) +{ + ucp_context_h context = params->super.worker->context; + const ucp_proto_select_param_t *select_param = params->super.select_param; + const uct_md_attr_t *md_attr; + ucp_md_index_t md_index; + ucp_md_map_t reg_md_map; + ucp_lane_index_t lane; + + /* Register memory only for zero-copy send operations */ + if (!(params->flags & UCP_PROTO_COMMON_INIT_FLAG_SEND_ZCOPY)) { + return 0; + } + + reg_md_map = 0; + ucs_for_each_bit(lane, lane_map) { + md_index = ucp_proto_common_get_md_index(&params->super, lane); + md_attr = &context->tl_mds[md_index].attr; + + /* Register if the memory domain supports registration for the relevant + memory type, and needs a local memory handle for zero-copy + communication */ + if (ucs_test_all_flags(md_attr->cap.flags, + UCT_MD_FLAG_NEED_MEMH | UCT_MD_FLAG_REG) && + (md_attr->cap.reg_mem_types & UCS_BIT(select_param->mem_type))) { + reg_md_map |= UCS_BIT(md_index); + } + } + + return reg_md_map; +} + +ucp_lane_index_t +ucp_proto_common_find_lanes(const ucp_proto_common_init_params_t *params, + ucp_lane_type_t lane_type, uint64_t tl_cap_flags, + ucp_lane_index_t max_lanes, + ucp_lane_map_t exclude_map, ucp_lane_index_t *lanes) +{ + ucp_lane_index_t lane_index, lane, num_lanes, num_valid_lanes; + const uct_iface_attr_t *iface_attr; + size_t frag_size; + + num_lanes = ucp_proto_common_find_lanes_internal(&params->super, + params->flags, lane_type, + tl_cap_flags, max_lanes, + exclude_map, lanes); + + num_valid_lanes = 0; + for (lane_index = 0; lane_index < num_lanes; ++lane_index) { + lane = lanes[lane_index]; + iface_attr = ucp_proto_common_get_iface_attr(&params->super, lane); + frag_size = ucp_proto_common_get_max_frag(params, iface_attr); /* Max fragment size should be larger than header size */ - frag_size = ucp_proto_get_iface_attr_field(iface_attr, - params->max_frag_offs, SIZE_MAX); if (frag_size <= params->hdr_size) { ucs_trace("lane[%d]: max fragment is too small %zu, need > %zu", lane, frag_size, params->hdr_size); continue; } - lanes[num_lanes++] = lane; + lanes[num_valid_lanes++] = lane; } - ucs_trace("selected %d lanes", num_lanes); - return num_lanes; + if (num_valid_lanes != num_lanes) { + ucs_assert(num_valid_lanes < num_lanes); + ucs_trace("selected %d/%d valid lanes", num_valid_lanes, num_lanes); + } + + return num_valid_lanes; +} + +ucp_lane_index_t +ucp_proto_common_find_am_bcopy_lane(const ucp_proto_init_params_t *params) +{ + ucp_lane_index_t lane = UCP_NULL_LANE; + ucp_lane_index_t num_lanes; + + num_lanes = ucp_proto_common_find_lanes_internal( + params, UCP_PROTO_COMMON_INIT_FLAG_MEM_TYPE, UCP_LANE_TYPE_AM, + UCT_IFACE_FLAG_AM_BCOPY, 1, 0, &lane); + if (num_lanes == 0) { + ucs_debug("no active message lane for %s", params->proto_name); + return UCP_NULL_LANE; + } + + ucs_assert(num_lanes == 1); + + return lane; } static ucs_linear_func_t @@ -371,14 +481,14 @@ void ucp_proto_common_calc_perf(const
ucp_proto_common_init_params_t *params, { ucp_context_h context = params->super.worker->context; ucp_proto_caps_t *caps = params->super.caps; - double bandwidth, overhead, latency; + double overhead, latency, tl_latency; const uct_iface_attr_t *iface_attr; + size_t frag_size, tl_min_length; ucs_linear_func_t extra_time; ucs_linear_func_t pack_time; ucs_linear_func_t uct_time; ucp_lane_index_t lane; uint32_t op_attr_mask; - size_t frag_size; /* Remote access implies zero copy on receiver */ if (params->flags & UCP_PROTO_COMMON_INIT_FLAG_REMOTE_ACCESS) { @@ -389,40 +499,35 @@ void ucp_proto_common_calc_perf(const ucp_proto_common_init_params_t *params, * - consider remote/local system device * - consider memory type for pack/unpack */ + caps->cfg_thresh = params->cfg_thresh; + caps->cfg_priority = params->cfg_priority; + caps->num_ranges = 0; + caps->min_length = 0; - bandwidth = 0; - overhead = 0; - latency = params->latency; - - /* Collect latency, overhead, bandwidth from all lanes */ + /* Collect latency and overhead from all lanes */ + overhead = 0; + latency = params->latency; ucs_for_each_bit(lane, perf_params->lane_map) { - iface_attr = ucp_proto_common_get_iface_attr(&params->super, lane); - overhead += iface_attr->overhead; - latency = ucs_max(ucp_tl_iface_latency(context, &iface_attr->latency), - latency); - bandwidth += ucp_proto_common_iface_bandwidth(params, iface_attr); + iface_attr = ucp_proto_common_get_iface_attr(&params->super, lane); + tl_latency = ucp_tl_iface_latency(context, &iface_attr->latency); + tl_min_length = ucp_proto_common_get_iface_attr_field( + iface_attr, params->min_frag_offs, 0); + + overhead += iface_attr->overhead; + latency = ucs_max(tl_latency, latency); + caps->min_length = ucs_max(caps->min_length, tl_min_length); } /* Take fragment size from first lane */ - iface_attr = ucp_proto_common_get_iface_attr(&params->super, - perf_params->lane0); - frag_size = ucp_proto_get_iface_attr_field(iface_attr, - params->max_frag_offs, SIZE_MAX); + frag_size = perf_params->frag_size; if (!(params->flags & UCP_PROTO_COMMON_INIT_FLAG_RESPONSE)) { /* if the data returns as a response, no need to subtract header size */ frag_size -= params->hdr_size; } - caps->cfg_thresh = params->cfg_thresh; - caps->cfg_priority = params->cfg_priority; - caps->min_length = ucp_proto_get_iface_attr_field(iface_attr, - params->min_frag_offs, - 0); - caps->num_ranges = 0; - op_attr_mask = ucp_proto_select_op_attr_from_flags( params->super.select_param->op_flags); - uct_time = ucs_linear_func_make(latency, 1.0 / bandwidth); + uct_time = ucs_linear_func_make(latency, 1.0 / perf_params->bandwidth); pack_time = ucs_linear_func_make(0, 1.0 / context->config.ext.bcopy_bw); extra_time = ucp_proto_common_get_reg_cost(params, perf_params->reg_md_map); extra_time.c += overhead + params->overhead; @@ -453,7 +558,8 @@ void ucp_proto_request_zcopy_completion(uct_completion_t *self) /* request should NOT be on pending queue because when we decrement the last * refcount the request is not on the pending queue any more */ - ucp_proto_request_zcopy_complete(req, req->send.state.uct_comp.status); + ucp_proto_request_zcopy_cleanup(req); + ucp_request_complete_send(req, req->send.state.uct_comp.status); } void ucp_proto_request_select_error(ucp_request_t *req, @@ -462,15 +568,29 @@ const ucp_proto_select_param_t *sel_param, size_t msg_length) { + UCS_STRING_BUFFER_ONSTACK(sel_param_strb, UCP_PROTO_SELECT_PARAM_STR_MAX); +
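/* Both string buffers are consumed by the ucs_fatal() message below */ +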
UCS_STRING_BUFFER_ONSTACK(proto_select_strb, UCP_PROTO_CONFIG_STR_MAX); ucp_ep_h ep = req->send.ep; - ucs_string_buffer_t strb; - ucp_proto_select_param_str(sel_param, &strb); + ucp_proto_select_param_str(sel_param, &sel_param_strb); ucp_proto_select_dump(ep->worker, ep->cfg_index, rkey_cfg_index, - proto_select, stdout); + proto_select, &proto_select_strb); ucs_fatal("req %p on ep %p to %s: could not find a protocol for %s " - "length %zu", - req, ep, ucp_ep_peer_name(ep), ucs_string_buffer_cstr(&strb), - msg_length); - ucs_string_buffer_cleanup(&strb); + "length %zu\navailable protocols:\n%s\n", + req, ep, ucp_ep_peer_name(ep), + ucs_string_buffer_cstr(&sel_param_strb), msg_length, + ucs_string_buffer_cstr(&proto_select_strb)); +} + +void ucp_proto_request_abort(ucp_request_t *req, ucs_status_t status) +{ + ucs_assert(UCS_STATUS_IS_ERR(status)); + /* + * TODO add a method to ucp_proto_t to abort a request (which is currently + * not scheduled to a pending queue). The method should wait for UCT + * completions and release associated resources, such as memory handles, + * remote keys, request ID, etc. + */ + ucs_fatal("abort request %p proto %s status %s: unimplemented", req, + req->send.proto_config->proto->name, ucs_status_string(status)); } diff --git a/src/ucp/proto/proto_common.h b/src/ucp/proto/proto_common.h index 8afa736901a..adc6a0c8db0 100644 --- a/src/ucp/proto/proto_common.h +++ b/src/ucp/proto/proto_common.h @@ -56,13 +56,17 @@ typedef struct { typedef struct { - ucp_lane_map_t lane_map; /* Which lanes are used for sending - data in the protocol */ - ucp_md_map_t reg_md_map; /* Which memory domains are used for - registration */ - ucp_lane_index_t lane0; /* The lane which is used to send the - first fragment, to detect fragment - size and performance ranges */ + /* Which lanes are used for sending data in the protocol */ + ucp_lane_map_t lane_map; + + /* Which memory domains are used for registration */ + ucp_md_map_t reg_md_map; + + /* Fragment size for performance estimation */ + size_t frag_size; + + /* Total transport bandwidth on all lanes */ + double bandwidth; } ucp_proto_common_perf_params_t; @@ -74,8 +78,24 @@ typedef struct { } ucp_proto_common_lane_priv_t; +/** + * Called the first time the protocol starts sending a request, and only once + * per request. + * + * @param [in] req Request which started to send. + */ typedef void (*ucp_proto_init_cb_t)(ucp_request_t *req); -typedef void (*ucp_proto_complete_cb_t)(ucp_request_t *req, ucs_status_t status); + + +/** + * Called when a protocol finishes sending (or queueing to the transport) all + * its data successfully. + * + * @param [in] req Request which is finished sending. + * + * @return Status code to be returned from the progress function. 
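+ * + * For example, a minimal callback might only complete the request and report + * success to the progress engine (a sketch; real callbacks in this patch, + * such as ucp_proto_request_bcopy_complete_success(), also release datatype + * resources first): + * + * @code + * static ucs_status_t my_proto_complete(ucp_request_t *req) + * { + * ucp_request_complete_send(req, UCS_OK); + * return UCS_OK; + * } + * @endcode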
+ */ +typedef ucs_status_t (*ucp_proto_complete_cb_t)(ucp_request_t *req); void ucp_proto_common_lane_priv_init(const ucp_proto_common_init_params_t *params, @@ -97,8 +117,14 @@ ucp_proto_common_get_iface_attr(const ucp_proto_init_params_t *params, ucp_lane_index_t lane); -size_t ucp_proto_get_iface_attr_field(const uct_iface_attr_t *iface_attr, - ptrdiff_t field_offset, size_t dfl_value); +size_t +ucp_proto_common_get_max_frag(const ucp_proto_common_init_params_t *params, + const uct_iface_attr_t *iface_attr); + + +size_t ucp_proto_common_get_iface_attr_field(const uct_iface_attr_t *iface_attr, + ptrdiff_t field_offset, + size_t dfl_value); double @@ -110,8 +136,18 @@ ucp_proto_common_iface_bandwidth(const ucp_proto_common_init_params_t *params, ucp_lane_index_t ucp_proto_common_find_lanes(const ucp_proto_common_init_params_t *params, ucp_lane_type_t lane_type, uint64_t tl_cap_flags, - ucp_lane_index_t max_lanes, ucp_lane_map_t exclude_map, - ucp_lane_index_t *lanes, ucp_md_map_t *reg_md_map_p); + ucp_lane_index_t max_lanes, + ucp_lane_map_t exclude_map, + ucp_lane_index_t *lanes); + + +ucp_md_map_t +ucp_proto_common_reg_md_map(const ucp_proto_common_init_params_t *params, + ucp_lane_map_t lane_map); + + +ucp_lane_index_t +ucp_proto_common_find_am_bcopy_lane(const ucp_proto_init_params_t *params); void ucp_proto_common_calc_perf(const ucp_proto_common_init_params_t *params, @@ -127,4 +163,6 @@ void ucp_proto_request_select_error(ucp_request_t *req, const ucp_proto_select_param_t *sel_param, size_t msg_length); +void ucp_proto_request_abort(ucp_request_t *req, ucs_status_t status); + #endif diff --git a/src/ucp/proto/proto_common.inl b/src/ucp/proto/proto_common.inl index 2293064227c..75592d810c5 100644 --- a/src/ucp/proto/proto_common.inl +++ b/src/ucp/proto/proto_common.inl @@ -14,26 +14,34 @@ #include +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_proto_request_bcopy_complete_success(ucp_request_t *req) +{ + ucp_datatype_iter_cleanup(&req->send.state.dt_iter, UINT_MAX); + ucp_request_complete_send(req, UCS_OK); + return UCS_OK; +} + static UCS_F_ALWAYS_INLINE void -ucp_proto_request_bcopy_complete(ucp_request_t *req, ucs_status_t status) +ucp_proto_msg_multi_request_init(ucp_request_t *req) { - ucp_datatype_iter_cleanup(&req->send.dt_iter, UINT_MAX); - ucp_request_complete_send(req, status); + req->send.msg_proto.message_id = req->send.ep->worker->am_message_id++; } static UCS_F_ALWAYS_INLINE void -ucp_proto_request_completion_init(ucp_request_t *req, - uct_completion_callback_t comp_func) +ucp_proto_completion_init(uct_completion_t *comp, + uct_completion_callback_t comp_func) { - req->send.state.uct_comp.func = comp_func; - req->send.state.uct_comp.count = 1; - req->send.state.uct_comp.status = UCS_OK; + comp->func = comp_func; + comp->count = 1; + comp->status = UCS_OK; /* extra ref to be decremented when all sent */ } static UCS_F_ALWAYS_INLINE ucs_status_t ucp_proto_request_zcopy_init(ucp_request_t *req, ucp_md_map_t md_map, - uct_completion_callback_t comp_func) + uct_completion_callback_t comp_func, + unsigned uct_reg_flags) { ucp_ep_h ep = req->send.ep; ucs_status_t status; @@ -41,16 +49,17 @@ ucp_proto_request_zcopy_init(ucp_request_t *req, ucp_md_map_t md_map, ucp_trace_req(req, "ucp_proto_zcopy_request_init for %s", req->send.proto_config->proto->name); - ucp_proto_request_completion_init(req, comp_func); + ucp_proto_completion_init(&req->send.state.uct_comp, comp_func); - status = ucp_datatype_iter_mem_reg(ep->worker->context, &req->send.dt_iter, - md_map); + status = 
ucp_datatype_iter_mem_reg(ep->worker->context, + &req->send.state.dt_iter, + md_map, uct_reg_flags); if (status != UCS_OK) { return status; } ucp_trace_req(req, "registered md_map 0x%"PRIx64"/0x%"PRIx64, - req->send.dt_iter.type.contig.reg.md_map, md_map); + req->send.state.dt_iter.type.contig.reg.md_map, md_map); /* We expect the registration to happen on all desired memory domains, since * the protocol initialization code would already disqualify any memory @@ -58,7 +67,10 @@ ucp_proto_request_zcopy_init(ucp_request_t *req, ucp_md_map_t md_map, * memory key for zero-copy operations. This assumption simplifies memory * key lookups during protocol progress. */ - ucs_assert(req->send.dt_iter.type.contig.reg.md_map == md_map); + ucs_assertv((req->send.state.dt_iter.type.contig.reg.md_map == md_map) || + (req->send.state.dt_iter.length == 0), + "md_map=0x%" PRIx64 " reg.md_map=0x%" PRIx64, md_map, + req->send.state.dt_iter.type.contig.reg.md_map); return UCS_OK; } @@ -67,8 +79,9 @@ static UCS_F_ALWAYS_INLINE void ucp_proto_request_zcopy_cleanup(ucp_request_t *req) { ucp_datatype_iter_mem_dereg(req->send.ep->worker->context, - &req->send.dt_iter); - ucp_datatype_iter_cleanup(&req->send.dt_iter, UCS_BIT(UCP_DATATYPE_CONTIG)); + &req->send.state.dt_iter); + ucp_datatype_iter_cleanup(&req->send.state.dt_iter, + UCS_BIT(UCP_DATATYPE_CONTIG)); } static UCS_F_ALWAYS_INLINE void @@ -78,6 +91,13 @@ ucp_proto_request_zcopy_complete(ucp_request_t *req, ucs_status_t status) { ucp_request_complete_send(req, status); } +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_proto_request_zcopy_complete_success(ucp_request_t *req) +{ + ucp_proto_request_zcopy_complete(req, UCS_OK); + return UCS_OK; +} + /* Select protocol for the request and initialize protocol-related fields */ static UCS_F_ALWAYS_INLINE ucs_status_t ucp_proto_request_set_proto(ucp_worker_h worker, ucp_ep_h ep, @@ -92,17 +112,24 @@ ucp_proto_request_set_proto(ucp_worker_h worker, ucp_ep_h ep, thresh_elem = ucp_proto_select_lookup(worker, proto_select, ep->cfg_index, rkey_cfg_index, sel_param, msg_length); - if (ucs_unlikely(thresh_elem == NULL)) { + if (UCS_ENABLE_ASSERT && (thresh_elem == NULL)) { + /* We expect that a protocol will always be found, or we will fall back + to the 'reconfig' placeholder */ ucp_proto_request_select_error(req, proto_select, rkey_cfg_index, sel_param, msg_length); return UCS_ERR_UNREACHABLE; } + /* Set pointer to request's protocol configuration */ + ucs_assert(thresh_elem->proto_config.ep_cfg_index == ep->cfg_index); + ucs_assert(thresh_elem->proto_config.rkey_cfg_index == rkey_cfg_index); + proto = thresh_elem->proto_config.proto; req->send.proto_config = &thresh_elem->proto_config; req->send.uct.func = proto->progress; if (ucs_log_is_enabled(UCS_LOG_LEVEL_TRACE_REQ)) { + ucs_string_buffer_init(&strb); ucp_proto_select_param_str(sel_param, &strb); ucp_trace_req(req, "selected protocol %s for %s length %zu", proto->name, ucs_string_buffer_cstr(&strb), msg_length); @@ -128,15 +155,15 @@ ucp_proto_request_send_op(ucp_ep_h ep, ucp_proto_select_t *proto_select, req->send.ep = ep; ucp_datatype_iter_init(worker->context, (void*)buffer, count, datatype, - contig_length, &req->send.dt_iter, &sg_count); + contig_length, &req->send.state.dt_iter, &sg_count); ucp_proto_select_param_init(&sel_param, op_id, param->op_attr_mask, - req->send.dt_iter.dt_class, - req->send.dt_iter.mem_type, - sg_count); + req->send.state.dt_iter.dt_class, + &req->send.state.dt_iter.mem_info, sg_count); - status = ucp_proto_request_set_proto(worker, ep, req,
proto_select, rkey_cfg_index, - &sel_param, contig_length); + status = ucp_proto_request_set_proto(worker, ep, req, proto_select, + rkey_cfg_index, &sel_param, + contig_length); if (status != UCS_OK) { goto out_put_request; } @@ -162,4 +189,28 @@ out_put_request: return UCS_STATUS_PTR(status); } +static UCS_F_ALWAYS_INLINE size_t +ucp_proto_request_pack_rkey(ucp_request_t *req, void *rkey_buffer) +{ + ssize_t packed_rkey_size; + + /* For contiguous buffer, pack one rkey + * TODO to support IOV datatype write N [address+length] records, + */ + ucs_assert(req->send.state.dt_iter.dt_class == UCP_DATATYPE_CONTIG); + + packed_rkey_size = ucp_rkey_pack_uct( + req->send.ep->worker->context, + req->send.state.dt_iter.type.contig.reg.md_map, + req->send.state.dt_iter.type.contig.reg.memh, + &req->send.state.dt_iter.mem_info, 0, NULL, rkey_buffer); + if (packed_rkey_size < 0) { + ucs_error("failed to pack remote key: %s", + ucs_status_string((ucs_status_t)packed_rkey_size)); + return 0; + } + + return packed_rkey_size; +} + #endif diff --git a/src/ucp/proto/proto_multi.c b/src/ucp/proto/proto_multi.c index 83346acee88..62afb200c91 100644 --- a/src/ucp/proto/proto_multi.c +++ b/src/ucp/proto/proto_multi.c @@ -8,8 +8,8 @@ # include "config.h" #endif -#include "proto_multi.h" -#include "proto_common.h" +#include "proto_common.inl" +#include "proto_multi.inl" #include #include @@ -18,92 +18,139 @@ ucs_status_t ucp_proto_multi_init(const ucp_proto_multi_init_params_t *params) { ucp_proto_multi_priv_t *mpriv = params->super.super.priv; + ucp_context_h context = params->super.super.worker->context; + const double max_bw_ratio = context->config.ext.multi_lane_max_ratio; + double max_bandwidth, max_frag_ratio, total_bandwidth; ucp_lane_index_t lanes[UCP_PROTO_MAX_LANES]; double lanes_bandwidth[UCP_PROTO_MAX_LANES]; + size_t lanes_max_frag[UCP_PROTO_MAX_LANES]; ucp_proto_common_perf_params_t perf_params; + ucp_lane_index_t i, lane, num_lanes; const uct_iface_attr_t *iface_attr; ucp_proto_multi_lane_priv_t *lpriv; - ucp_md_map_t reg_md_map; - double total_bandwidth; - ucp_lane_index_t i; + ucp_lane_map_t lane_map; ucs_assert(params->max_lanes >= 1); ucs_assert(params->max_lanes <= UCP_PROTO_MAX_LANES); /* Find first lane */ - mpriv->num_lanes = ucp_proto_common_find_lanes(&params->super, - params->first.lane_type, - params->first.tl_cap_flags, - 1, 0, lanes, &reg_md_map); - if (mpriv->num_lanes == 0) { + num_lanes = ucp_proto_common_find_lanes(&params->super, + params->first.lane_type, + params->first.tl_cap_flags, 1, 0, + lanes); + if (num_lanes == 0) { ucs_trace("no lanes for %s", params->super.super.proto_name); return UCS_ERR_UNSUPPORTED; } - mpriv->reg_md_map = reg_md_map; - /* Find rest of the lanes */ - mpriv->num_lanes += ucp_proto_common_find_lanes(&params->super, - params->middle.lane_type, - params->middle.tl_cap_flags, - params->max_lanes - 1, - UCS_BIT(lanes[0]), - lanes + 1, &reg_md_map); - mpriv->reg_md_map |= reg_md_map; - - /* Fill the size of private data */ - *params->super.super.priv_size = - sizeof(ucp_proto_multi_priv_t) + - (mpriv->num_lanes * ucs_field_sizeof(ucp_proto_multi_priv_t, lanes[0])); - - /* Initialize parameters for calculating performance */ - perf_params.lane_map = 0; - perf_params.reg_md_map = mpriv->reg_md_map; - perf_params.lane0 = lanes[0]; + num_lanes += ucp_proto_common_find_lanes(&params->super, + params->middle.lane_type, + params->middle.tl_cap_flags, + params->max_lanes - 1, + UCS_BIT(lanes[0]), lanes + 1); + + /* Get bandwidth of all lanes and max_bandwidth */ + max_bandwidth = 0; +
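/* The first pass below records per-lane bandwidth and fragment size; the second pass drops lanes that are too slow relative to the fastest one. For example (hypothetical numbers), with multi_lane_max_ratio == 4, a 10GB/s lane is kept next to a 25GB/s lane (10 * 4 >= 25) while a 2GB/s lane is dropped (2 * 4 < 25). */ +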
for (i = 0; i < num_lanes; ++i) { + lane = lanes[i]; + iface_attr = ucp_proto_common_get_iface_attr(&params->super.super, + lane); + + lanes_bandwidth[lane] = ucp_proto_common_iface_bandwidth(&params->super, + iface_attr); + lanes_max_frag[lane] = ucp_proto_common_get_max_frag(&params->super, + iface_attr); - /* Collect information from all lanes */ - total_bandwidth = 0; - for (i = 0; i < mpriv->num_lanes; ++i) { - lpriv = &mpriv->lanes[i]; + /* Calculate maximal bandwidth of all lanes, to skip slow lanes */ + max_bandwidth = ucs_max(max_bandwidth, lanes_bandwidth[lane]); + } - perf_params.lane_map |= UCS_BIT(lanes[i]); - iface_attr = ucp_proto_common_get_iface_attr(&params->super.super, - lanes[i]); - lanes_bandwidth[i] = ucp_proto_common_iface_bandwidth(&params->super, - iface_attr); - total_bandwidth += lanes_bandwidth[i]; + /* Select the lanes to use, and calculate their total bandwidth */ + total_bandwidth = 0; + lane_map = 0; + max_frag_ratio = 0; + for (i = 0; i < num_lanes; ++i) { + lane = lanes[i]; + if ((lanes_bandwidth[lane] * max_bw_ratio) < max_bandwidth) { + /* Bandwidth on this lane is too low compared to the fastest + available lane, so it's not worth using it */ + continue; + } - lpriv->max_frag = ucp_proto_get_iface_attr_field(iface_attr, - params->super.max_frag_offs, SIZE_MAX); + /* Calculate maximal bandwidth-to-fragment-size ratio, which is used to + adjust fragment sizes so they are proportional to bandwidth ratio and + also do not exceed maximal supported size */ + max_frag_ratio = ucs_max(max_frag_ratio, + lanes_bandwidth[lane] / lanes_max_frag[lane]); - ucp_proto_common_lane_priv_init(&params->super, mpriv->reg_md_map, - lanes[i], &lpriv->super); + total_bandwidth += lanes_bandwidth[lane]; + lane_map |= UCS_BIT(lane); } - /* Set up the relative weights */ - for (i = 0; i < mpriv->num_lanes; ++i) { - mpriv->lanes[i].weight = lanes_bandwidth[i] / total_bandwidth; + /* Initialize multi-lane private data and relative weights */ + mpriv->reg_md_map = ucp_proto_common_reg_md_map(&params->super, lane_map); + mpriv->num_lanes = 0; + ucs_for_each_bit(lane, lane_map) { + ucs_assert(lane < UCP_MAX_LANES); + lpriv = &mpriv->lanes[mpriv->num_lanes++]; + ucp_proto_common_lane_priv_init(&params->super, mpriv->reg_md_map, lane, + &lpriv->super); + lpriv->weight = ucs_proto_multi_calc_weight(lanes_bandwidth[lane], + total_bandwidth); + lpriv->max_frag = ucs_double_to_sizet(lanes_bandwidth[lane] / + max_frag_ratio, + SIZE_MAX); + ucs_assert(lpriv->max_frag <= lanes_max_frag[lane]); + ucs_assert(lpriv->max_frag > 0); } + /* Fill the size of private data according to number of used lanes */ + *params->super.super.priv_size = sizeof(ucp_proto_multi_priv_t) + + (mpriv->num_lanes * sizeof(*lpriv)); + + /* Calculate protocol performance */ + perf_params.reg_md_map = mpriv->reg_md_map; + perf_params.frag_size = mpriv->lanes[0].max_frag; + perf_params.lane_map = lane_map; + perf_params.bandwidth = total_bandwidth; ucp_proto_common_calc_perf(&params->super, &perf_params); return UCS_OK; } -void ucp_proto_multi_config_str(const void *priv, ucs_string_buffer_t *strb) +void ucp_proto_multi_config_str(size_t min_length, size_t max_length, + const void *priv, ucs_string_buffer_t *strb) { const ucp_proto_multi_priv_t *mpriv = priv; const ucp_proto_multi_lane_priv_t *lpriv; + size_t percent, remaining; + char frag_size_buf[64]; ucp_lane_index_t i; - ucs_string_buffer_init(strb); + remaining = 100; for (i = 0; i < mpriv->num_lanes; ++i) { - if (i > 0) { - ucs_string_buffer_appendf(strb, " "); + lpriv = &mpriv->lanes[i]; + percent =
ucs_min(remaining, + ucp_proto_multi_scaled_length(lpriv, 100)); + remaining -= percent; + + if (percent != 100) { + ucs_string_buffer_appendf(strb, "%zu%%*", percent); } - lpriv = &mpriv->lanes[i]; - ucs_string_buffer_appendf(strb, "%.0f%% ", 100.0 * lpriv->weight); ucp_proto_common_lane_priv_str(&lpriv->super, strb); + + /* Print fragment size if it's small enough. For large fragments we can + skip the print because it has little effect on performance */ + if (lpriv->max_frag < UCS_MBYTE) { + ucs_memunits_to_str(lpriv->max_frag, frag_size_buf, + sizeof(frag_size_buf)); + ucs_string_buffer_appendf(strb, "<=%s", frag_size_buf); + } + + if ((i + 1) < mpriv->num_lanes) { + ucs_string_buffer_appendf(strb, "|"); + } } } diff --git a/src/ucp/proto/proto_multi.h b/src/ucp/proto/proto_multi.h index 1a26c7074de..c420fb341a6 100644 --- a/src/ucp/proto/proto_multi.h +++ b/src/ucp/proto/proto_multi.h @@ -13,6 +13,10 @@ #include +/* ucp_proto_multi_lane_priv_t.weight is shifted by this value */ +#define UCP_PROTO_MULTI_WEIGHT_SHIFT 16 + + /** * UCP base protocol definition for multi-fragment protocols */ @@ -27,9 +31,16 @@ typedef struct ucp_proto_send_multi { * One lane configuration for multi-lane protocol */ typedef struct { - ucp_proto_common_lane_priv_t super; - size_t max_frag; /* Max frag size on this lane */ - double weight; /* Relative weight for this lane */ + ucp_proto_common_lane_priv_t super; + + /* Maximal fragment size on this lane */ + size_t max_frag; + + /* Ratio of data to send on this lane. + * This is a fixed-point numeric representation (n * 2^shift), where "n" is + * the real value, and "shift" is defined by UCP_PROTO_MULTI_WEIGHT_SHIFT. + */ + uint32_t weight; } ucp_proto_multi_lane_priv_t; @@ -75,6 +86,7 @@ typedef ucs_status_t (*ucp_proto_send_multi_cb_t)( ucs_status_t ucp_proto_multi_init(const ucp_proto_multi_init_params_t *params); -void ucp_proto_multi_config_str(const void *priv, ucs_string_buffer_t *strb); +void ucp_proto_multi_config_str(size_t min_length, size_t max_length, + const void *priv, ucs_string_buffer_t *strb); #endif diff --git a/src/ucp/proto/proto_multi.inl b/src/ucp/proto/proto_multi.inl index c5c0e84516e..8567d5374a2 100644 --- a/src/ucp/proto/proto_multi.inl +++ b/src/ucp/proto/proto_multi.inl @@ -13,20 +13,52 @@ static UCS_F_ALWAYS_INLINE void -ucp_proto_multi_request_init(ucp_request_t *req) +ucp_proto_multi_set_send_lane(ucp_request_t *req) { - req->send.multi_lane_idx = 0; #if ENABLE_ASSERT - req->send.lane = UCP_NULL_LANE; + req->send.lane = UCP_NULL_LANE; #endif } +static UCS_F_ALWAYS_INLINE void +ucp_proto_multi_request_init(ucp_request_t *req) +{ + req->send.multi_lane_idx = 0; + ucp_proto_multi_set_send_lane(req); +} + +static UCS_F_ALWAYS_INLINE size_t +ucs_proto_multi_calc_weight(double lane_weight, double total_weight) +{ + return (size_t)( + lane_weight * UCS_BIT(UCP_PROTO_MULTI_WEIGHT_SHIFT) / total_weight + + 0.5); +} + +static UCS_F_ALWAYS_INLINE size_t +ucp_proto_multi_scaled_length(const ucp_proto_multi_lane_priv_t *lpriv, + size_t length) +{ + return (lpriv->weight * length + UCS_MASK(UCP_PROTO_MULTI_WEIGHT_SHIFT)) >> + UCP_PROTO_MULTI_WEIGHT_SHIFT; +} + static UCS_F_ALWAYS_INLINE size_t ucp_proto_multi_max_payload(ucp_request_t *req, const ucp_proto_multi_lane_priv_t *lpriv, size_t hdr_size) { - return lpriv->max_frag - hdr_size; + size_t scaled_length = + ucp_proto_multi_scaled_length(lpriv, req->send.state.dt_iter.length); + size_t max_payload = ucs_min(scaled_length, lpriv->max_frag - hdr_size); + + ucs_assertv(max_payload > 0, + 
"length=%zu weight=%.2f scaled_length=%zu max_frag=%zu " + "hdr_size=%zu", + req->send.state.dt_iter.length, + lpriv->weight / (double)UCS_BIT(UCP_PROTO_MULTI_WEIGHT_SHIFT), + scaled_length, lpriv->max_frag, hdr_size); + return max_payload; } static size_t UCS_F_ALWAYS_INLINE @@ -34,22 +66,57 @@ ucp_proto_multi_data_pack(ucp_proto_multi_pack_ctx_t *pack_ctx, void *dest) { ucp_request_t *req = pack_ctx->req; - return ucp_datatype_iter_next_pack(&req->send.dt_iter, req->send.ep->worker, + return ucp_datatype_iter_next_pack(&req->send.state.dt_iter, + req->send.ep->worker, pack_ctx->max_payload, pack_ctx->next_iter, dest); } static UCS_F_ALWAYS_INLINE ucs_status_t -ucp_proto_multi_progress(ucp_request_t *req, ucp_proto_send_multi_cb_t send_func, - ucp_proto_complete_cb_t complete_func, unsigned dt_mask) +ucp_proto_multi_no_resource(ucp_request_t *req, + const ucp_proto_multi_lane_priv_t *lpriv) +{ + ucs_status_t status; + uct_ep_h uct_ep; + + if (lpriv->super.lane == req->send.lane) { + /* if we failed to send on same lane, return error */ + return UCS_ERR_NO_RESOURCE; + } + + /* failed to send on another lane - add to its pending queue */ + uct_ep = req->send.ep->uct_eps[lpriv->super.lane]; + status = uct_ep_pending_add(uct_ep, &req->send.uct, 0); + if (status == UCS_ERR_BUSY) { + /* try sending again */ + return UCS_INPROGRESS; + } + + ucs_assert(status == UCS_OK); + req->send.lane = lpriv->super.lane; + + /* Remove the request from current pending queue because it was added to + * other lane's pending queue. + */ + return UCS_OK; +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_proto_multi_progress(ucp_request_t *req, + const ucp_proto_multi_priv_t *mpriv, + ucp_proto_send_multi_cb_t send_func, + ucp_proto_complete_cb_t complete_func, + unsigned dt_mask) { - const ucp_proto_multi_priv_t *mpriv = req->send.proto_config->priv; const ucp_proto_multi_lane_priv_t *lpriv; - ucs_status_t pending_add_status; ucp_datatype_iter_t next_iter; ucp_lane_index_t lane_idx; ucs_status_t status; - uct_ep_h uct_ep; + + ucs_assertv(req->send.multi_lane_idx < mpriv->num_lanes, + "lane_idx=%d num_lanes=%d", req->send.multi_lane_idx, + mpriv->num_lanes); + ucs_assert(!ucp_datatype_iter_is_end(&req->send.state.dt_iter)); lane_idx = req->send.multi_lane_idx; lpriv = &mpriv->lanes[lane_idx]; @@ -62,40 +129,18 @@ ucp_proto_multi_progress(ucp_request_t *req, ucp_proto_send_multi_cb_t send_func /* operation started and completion will be called later */ ++req->send.state.uct_comp.count; } else if (status == UCS_ERR_NO_RESOURCE) { - if (lpriv->super.lane == req->send.lane) { - /* if we failed to send on same lane, return error */ - return UCS_ERR_NO_RESOURCE; - } - - /* failed to send on another lane - add to its pending queue */ - uct_ep = req->send.ep->uct_eps[lpriv->super.lane]; - pending_add_status = uct_ep_pending_add(uct_ep, &req->send.uct, 0); - if (pending_add_status == UCS_ERR_BUSY) { - /* try sending again */ - return UCS_INPROGRESS; - } - - ucs_assert(pending_add_status == UCS_OK); - req->send.lane = lpriv->super.lane; - - /* remove the request from current pending queue because it was - * added to other lane's pending queue - * TODO return an indication, if the protocol needs to roll-back - */ - return UCS_OK; + return ucp_proto_multi_no_resource(req, lpriv); } else { - /* send failed - complete request with error */ - ucs_debug("send %s completed with status %s", - req->send.proto_config->proto->name, ucs_status_string(status)); - complete_func(req, status); + /* failed to send - call common error 
handler */ + ucp_proto_request_abort(req, status); return UCS_OK; } /* advance position in send buffer */ - ucp_datatype_iter_copy_from_next(&req->send.dt_iter, &next_iter, dt_mask); - if (ucp_datatype_iter_is_end(&req->send.dt_iter)) { - complete_func(req, UCS_OK); - return UCS_OK; + ucp_datatype_iter_copy_from_next(&req->send.state.dt_iter, &next_iter, + dt_mask); + if (ucp_datatype_iter_is_end(&req->send.state.dt_iter)) { + return complete_func(req); } /* move to the next lane, in a round-robin fashion */ @@ -109,20 +154,37 @@ ucp_proto_multi_progress(ucp_request_t *req, ucp_proto_send_multi_cb_t send_func } static UCS_F_ALWAYS_INLINE ucs_status_t -ucp_proto_multi_zcopy_progress(uct_pending_req_t *uct_req, +ucp_proto_multi_bcopy_progress(ucp_request_t *req, + const ucp_proto_multi_priv_t *mpriv, ucp_proto_init_cb_t init_func, ucp_proto_send_multi_cb_t send_func, - uct_completion_callback_t comp_func) + ucp_proto_complete_cb_t comp_func) +{ + if (!(req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED)) { + ucp_proto_multi_request_init(req); + if (init_func != NULL) { + init_func(req); + } + + req->flags |= UCP_REQUEST_FLAG_PROTO_INITIALIZED; + } + + return ucp_proto_multi_progress(req, mpriv, send_func, comp_func, UINT_MAX); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t ucp_proto_multi_zcopy_progress( + ucp_request_t *req, const ucp_proto_multi_priv_t *mpriv, + ucp_proto_init_cb_t init_func, unsigned uct_mem_flags, + ucp_proto_send_multi_cb_t send_func, + uct_completion_callback_t comp_func) { - ucp_request_t *req = ucs_container_of(uct_req, ucp_request_t, - send.uct); - const ucp_proto_multi_priv_t *priv = req->send.proto_config->priv; ucs_status_t status; if (!(req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED)) { - status = ucp_proto_request_zcopy_init(req, priv->reg_md_map, comp_func); + status = ucp_proto_request_zcopy_init(req, mpriv->reg_md_map, comp_func, + uct_mem_flags); if (status != UCS_OK) { - ucp_proto_request_zcopy_complete(req, status); + ucp_proto_request_abort(req, status); return UCS_OK; /* remove from pending after request is completed */ } @@ -134,8 +196,8 @@ ucp_proto_multi_zcopy_progress(uct_pending_req_t *uct_req, req->flags |= UCP_REQUEST_FLAG_PROTO_INITIALIZED; } - return ucp_proto_multi_progress(req, send_func, - ucp_request_invoke_uct_completion, + return ucp_proto_multi_progress(req, mpriv, send_func, + ucp_request_invoke_uct_completion_success, UCS_BIT(UCP_DATATYPE_CONTIG)); } diff --git a/src/ucp/proto/proto_reconfig.c b/src/ucp/proto/proto_reconfig.c new file mode 100644 index 00000000000..f6dc8f05fa4 --- /dev/null +++ b/src/ucp/proto/proto_reconfig.c @@ -0,0 +1,109 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "proto_common.inl" + +#include + + +/* Select a new protocol and start progressing it */ +static ucs_status_t ucp_proto_reconfig_select_progress(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_ep_h ep = req->send.ep; + ucp_worker_h worker = ep->worker; + ucp_worker_cfg_index_t prev_rkey_cfg_index; + ucp_rkey_config_key_t rkey_config_key; + ucp_worker_cfg_index_t rkey_cfg_index; + ucp_proto_select_t *proto_select; + ucs_status_t status; + + /* + * Find the protocol selection hash: could be either on the endpoint or on + * the remote key + */ + prev_rkey_cfg_index = req->send.proto_config->rkey_cfg_index; + if (prev_rkey_cfg_index == UCP_WORKER_CFG_INDEX_NULL) { + proto_select = &worker->ep_config[ep->cfg_index].proto_select; + rkey_cfg_index = UCP_WORKER_CFG_INDEX_NULL; + } else { + rkey_config_key = worker->rkey_config[prev_rkey_cfg_index].key; + rkey_config_key.ep_cfg_index = ep->cfg_index; + + status = ucp_worker_rkey_config_get(worker, &rkey_config_key, NULL, + &rkey_cfg_index); + if (status != UCS_OK) { + ucs_error("failed to switch to new rkey"); + return UCS_OK; + } + + proto_select = &worker->rkey_config[rkey_cfg_index].proto_select; + } + + /* Select from protocol hash according to saved request parameters */ + status = ucp_proto_request_set_proto(worker, ep, req, proto_select, + rkey_cfg_index, + &req->send.proto_config->select_param, + req->send.state.dt_iter.length); + if (status != UCS_OK) { + /* will try again later */ + return UCS_ERR_NO_RESOURCE; + } + + return req->send.uct.func(&req->send.uct); +} + +static ucs_status_t ucp_proto_reconfig_progress(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_ep_h ep = req->send.ep; + + /* This protocol should not be selected for valid and connected endpoint */ + ucs_assert(!(ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED)); + + if (ep->cfg_index != req->send.proto_config->ep_cfg_index) { + ucp_trace_req(req, + "ep configuration changed from %d to %d," + " reconfigure protocol", + req->send.proto_config->ep_cfg_index, ep->cfg_index); + return ucp_proto_reconfig_select_progress(self); + } + + /* TODO select wireup lane when needed */ + req->send.lane = ucp_ep_config(ep)->key.am_lane; + return UCS_ERR_NO_RESOURCE; +} + +static ucs_status_t +ucp_proto_reconfig_init(const ucp_proto_init_params_t *init_params) +{ + /* Default reconfiguration protocol is a fallback for any case protocol + * selection is unsuccessful. The protocol keeps queuing requests until they + * can be executed. 
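ucp_proto_reconfig_select_progress() finishes by tail-calling req->send.uct.func, i.e. the pending callback that protocol selection just installed, so the newly chosen protocol starts progressing immediately. A sketch of that dispatch shape, with illustrative types (int standing in for ucs_status_t):

    /* the pending callback installed by protocol selection (illustrative) */
    typedef int (*pending_cb_t)(void *pending_req);

    static int redispatch(pending_cb_t selected_progress, void *pending_req)
    {
        /* hand the request straight to the newly selected protocol */
        return selected_progress(pending_req);
    }
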
+ */ + *init_params->priv_size = 0; + init_params->caps->cfg_thresh = UCS_MEMUNITS_INF; + init_params->caps->cfg_priority = 0; + init_params->caps->min_length = 0; + init_params->caps->num_ranges = 1; + init_params->caps->ranges[0].max_length = SIZE_MAX; + init_params->caps->ranges[0].perf = ucs_linear_func_make(INFINITY, 0); + + return UCS_OK; +} + +static ucp_proto_t ucp_reconfig_proto = { + .name = "reconfig", + .flags = UCP_PROTO_FLAG_INVALID, + .init = ucp_proto_reconfig_init, + .config_str = (ucp_proto_config_str_func_t)ucs_empty_function, + .progress = ucp_proto_reconfig_progress +}; +UCP_PROTO_REGISTER(&ucp_reconfig_proto); diff --git a/src/ucp/proto/proto_select.c b/src/ucp/proto/proto_select.c index cd3603c8c7b..5c500e92664 100644 --- a/src/ucp/proto/proto_select.c +++ b/src/ucp/proto/proto_select.c @@ -10,6 +10,7 @@ #include "proto_select.h" #include "proto_select.inl" +#include "proto_single.h" #include #include @@ -67,9 +68,7 @@ ucp_proto_thresholds_append(ucs_array_t(ucp_proto_thresh) *thresh_list, /* Consolidate with last protocol if possible */ if (!ucs_array_is_empty(thresh_list)) { thresh_elem = ucs_array_last(thresh_list); - ucs_assertv(max_length > thresh_elem->max_length, - "max_length=%zu last->max_length=%zu", - max_length, thresh_elem->max_length); + ucs_assert(max_length > thresh_elem->max_length); if (thresh_elem->proto_id == proto_id) { thresh_elem->max_length = max_length; return UCS_OK; @@ -116,14 +115,27 @@ ucp_proto_perf_append(ucs_array_t(ucp_proto_perf) *perf_list, size_t max_length, return UCS_OK; } +static void ucp_proto_select_perf_str(const ucs_linear_func_t *perf, + char *time_str, size_t time_str_max, + char *bw_str, size_t bw_str_max) +{ + /* Estimated time */ + snprintf(time_str, time_str_max, "%.0f + %.3f * N", perf->c * 1e9, + perf->m * 1e9); + + /* Estimated bandwidth (MiB/s) */ + snprintf(bw_str, bw_str_max, "%.2f", 1.0 / (perf->m * UCS_MBYTE)); +} + + static ucs_status_t ucp_proto_thresholds_select_best(ucp_proto_id_mask_t proto_mask, const ucs_linear_func_t *proto_perf, ucs_array_t(ucp_proto_thresh) *thresh_list, ucs_array_t(ucp_proto_perf) *perf_list, - size_t start, size_t end, - const char *title) + size_t start, size_t end) { + char time_str[64], bw_str[64], num_str[64]; struct { ucp_proto_id_t proto_id; double result; @@ -131,15 +143,13 @@ ucp_proto_thresholds_select_best(ucp_proto_id_mask_t proto_mask, ucs_status_t status; double x_intersect; size_t midpoint; - char buf[64]; - ucs_trace("candidate protocols for %s %s:", title, - ucs_memunits_range_str(start, end, buf, sizeof(buf))); + ucs_trace(" %-16s %-20s %-18s", "PROTOCOL", "TIME", "BANDWIDTH (MB/s)"); ucs_for_each_bit(curr.proto_id, proto_mask) { - ucs_trace("%24s %.0f+%.3f*X nsec", - ucp_proto_id_field(curr.proto_id, name), - proto_perf[curr.proto_id].c * UCS_NSEC_PER_SEC, - proto_perf[curr.proto_id].m * UCS_NSEC_PER_SEC); + ucp_proto_select_perf_str(&proto_perf[curr.proto_id], time_str, + sizeof(time_str), bw_str, sizeof(bw_str)); + ucs_trace(" %-16s %-20s %-18s", + ucp_proto_id_field(curr.proto_id, name), time_str, bw_str); } do { @@ -152,7 +162,8 @@ ucp_proto_thresholds_select_best(ucp_proto_id_mask_t proto_mask, curr.result = ucs_linear_func_apply(proto_perf[curr.proto_id], start + UCP_PROTO_MSGLEN_EPSILON); ucs_assert(curr.result != DBL_MAX); - if (curr.result < best.result) { + if ((best.proto_id == UCP_PROTO_ID_INVALID) || + (curr.result < best.result)) { best = curr; } } @@ -160,9 +171,10 @@ ucp_proto_thresholds_select_best(ucp_proto_id_mask_t proto_mask, /* Since 
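ucp_proto_select_perf_str() renders the linear cost model used throughout this file: the estimated time for an N-byte message is c + m*N seconds, so the asymptotic bandwidth is 1/m bytes per second. A self-contained sketch of the same formatting, assuming a struct that mirrors ucs_linear_func_t:

    #include <stdio.h>

    typedef struct {
        double c;                  /* fixed overhead, seconds */
        double m;                  /* seconds per byte        */
    } linear_func_t;

    static void perf_report(const linear_func_t *f)
    {
        /* estimated time for an N-byte message: c + m * N, in nanoseconds */
        printf("time (ns):         %.0f + %.3f * N\n", f->c * 1e9, f->m * 1e9);
        /* asymptotic bandwidth: one byte per m seconds, shown in MiB/s */
        printf("bandwidth (MiB/s): %.2f\n", 1.0 / (f->m * 1024.0 * 1024.0));
    }
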
proto_mask != 0, we should find at least one protocol */ ucs_assert(best.proto_id != UCP_PROTO_ID_INVALID); - ucs_trace(" best protocol at %s is %s", - ucs_memunits_to_str(start, buf, sizeof(buf)), + ucs_trace("best protocol at %s is %s", + ucs_memunits_to_str(start, num_str, sizeof(num_str)), ucp_proto_id_field(best.proto_id, name)); + ucs_log_indent(1); /* Find first (smallest) intersection point between the current best * protocol and any other protocol. This would be the point where that @@ -179,17 +191,18 @@ ucp_proto_thresholds_select_best(ucp_proto_id_mask_t proto_mask, * otherwise best.proto_id is better than curr.proto_id at * 'end' as well as at 'start'. */ - if (x_intersect < (double)SIZE_MAX) { - midpoint = ucs_min((size_t)x_intersect, midpoint); - } - ucs_trace(" - intersects with %s at %.2f, midpoint is %s", + midpoint = ucs_min(ucs_double_to_sizet(x_intersect, SIZE_MAX), + midpoint); + ucs_memunits_to_str(midpoint, num_str, sizeof(num_str)); + ucs_trace("intersects with %s at %.2f, midpoint is %s", ucp_proto_id_field(curr.proto_id, name), x_intersect, - ucs_memunits_to_str(midpoint, buf, sizeof(buf))); + num_str); } else { - ucs_trace(" - intersects with %s out of range", + ucs_trace("intersects with %s out of range", ucp_proto_id_field(curr.proto_id, name)); } } + ucs_log_indent(-1); status = ucp_proto_thresholds_append(thresh_list, midpoint, best.proto_id); @@ -217,16 +230,17 @@ ucp_proto_thresholds_select_next(ucp_proto_id_mask_t proto_mask, const ucp_proto_caps_t *proto_caps, ucs_array_t(ucp_proto_thresh) *thresh_list, ucs_array_t(ucp_proto_perf) *perf_list, - size_t msg_length, size_t *max_length_p, - const char *title) + size_t msg_length, size_t *max_length_p) { ucp_proto_id_mask_t valid_proto_mask, disabled_proto_mask; ucs_linear_func_t proto_perf[UCP_PROTO_MAX_COUNT]; + ucp_proto_id_t max_prio_proto_id; const ucp_proto_caps_t *caps; unsigned max_cfg_priority; ucp_proto_id_t proto_id; - size_t max_length; ucs_status_t status; + char range_str[64]; + size_t max_length; unsigned i; /* @@ -238,6 +252,7 @@ ucp_proto_thresholds_select_next(ucp_proto_id_mask_t proto_mask, disabled_proto_mask = 0; max_cfg_priority = 0; max_length = SIZE_MAX; + max_prio_proto_id = UCP_PROTO_ID_INVALID; ucs_for_each_bit(proto_id, proto_mask) { caps = &proto_caps[proto_id]; @@ -274,7 +289,9 @@ ucp_proto_thresholds_select_next(ucp_proto_id_mask_t proto_mask, max_length = ucs_min(max_length, caps->cfg_thresh - 1); } else { /* The protocol is force-activated on 'msg_length' and above */ - max_cfg_priority = ucs_max(max_cfg_priority, caps->cfg_priority); + max_cfg_priority = ucs_max(max_cfg_priority, + caps->cfg_priority); + max_prio_proto_id = proto_id; } } } @@ -284,15 +301,24 @@ ucp_proto_thresholds_select_next(ucp_proto_id_mask_t proto_mask, return UCS_ERR_UNSUPPORTED; } + ucs_memunits_range_str(msg_length, max_length, range_str, + sizeof(range_str)); + ucs_trace("select best protocol for %s", range_str); + ucs_log_indent(1); + /* A protocol with configured threshold disables all inferior protocols */ ucs_for_each_bit(proto_id, valid_proto_mask) { if (proto_caps[proto_id].cfg_priority >= max_cfg_priority) { continue; } + ucs_assert(max_prio_proto_id != UCP_PROTO_ID_INVALID); disabled_proto_mask |= UCS_BIT(proto_id); - ucs_trace("skipping proto %d with priority %u since it's less than %u", - proto_id, proto_caps[proto_id].cfg_priority, max_cfg_priority); + ucs_trace("disable %s with priority %u: prefer %s with priority %u", + ucp_proto_id_field(proto_id, name), + 
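The midpoint computation above looks for the smallest message size at which another protocol overtakes the current best one; two linear cost curves c1 + m1*x and c2 + m2*x cross at x = (c2 - c1) / (m1 - m2). A sketch of that arithmetic, including the SIZE_MAX clamp that ucs_double_to_sizet performs (helper names are illustrative, and a non-negative intersection is assumed):

    #include <math.h>
    #include <stddef.h>
    #include <stdint.h>

    /* x where c1 + m1*x == c2 + m2*x; INFINITY if the lines are parallel */
    static double crossover(double c1, double m1, double c2, double m2)
    {
        return (m1 == m2) ? INFINITY : (c2 - c1) / (m1 - m2);
    }

    /* clamp the crossover point to a message size, assuming x >= 0 */
    static size_t crossover_to_length(double x)
    {
        return (x >= (double)SIZE_MAX) ? SIZE_MAX : (size_t)x;
    }
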
proto_caps[proto_id].cfg_priority, + ucp_proto_id_field(max_prio_proto_id, name), + max_cfg_priority); } /* Remove disabled protocols. 'disabled_proto_mask' must be contained in @@ -310,14 +336,14 @@ ucp_proto_thresholds_select_next(ucp_proto_id_mask_t proto_mask, ucs_assert(valid_proto_mask != 0); status = ucp_proto_thresholds_select_best(valid_proto_mask, proto_perf, - thresh_list, perf_list, msg_length, - max_length, title); - if (status != UCS_OK) { - return status; + thresh_list, perf_list, + msg_length, max_length); + if (status == UCS_OK) { + *max_length_p = max_length; } - *max_length_p = max_length; - return UCS_OK; + ucs_log_indent(-1); + return status; } static ucs_status_t @@ -325,8 +351,7 @@ ucp_proto_select_init_protocols(ucp_worker_h worker, ucp_worker_cfg_index_t ep_cfg_index, ucp_worker_cfg_index_t rkey_cfg_index, const ucp_proto_select_param_t *select_param, - ucp_proto_select_init_protocols_t *proto_init, - const char *title) + ucp_proto_select_init_protocols_t *proto_init) { ucp_proto_init_params_t init_params; ucp_proto_caps_t *proto_caps; @@ -342,6 +367,7 @@ ucp_proto_select_init_protocols(ucp_worker_h worker, init_params.worker = worker; init_params.select_param = select_param; + init_params.ep_cfg_index = ep_cfg_index; init_params.ep_config_key = &worker->ep_config[ep_cfg_index].key; if (rkey_cfg_index == UCP_WORKER_CFG_INDEX_NULL) { @@ -376,20 +402,33 @@ ucp_proto_select_init_protocols(ucp_worker_h worker, init_params.caps = proto_caps; init_params.proto_name = ucp_proto_id_field(proto_id, name); + ucs_trace("trying %s", ucp_proto_id_field(proto_id, name)); + ucs_log_indent(1); + status = ucp_proto_id_call(proto_id, init, &init_params); if (status != UCS_OK) { - ucs_trace("protocol %s on %s failed to initialize: %s", - ucp_proto_id_field(proto_id, name), title, - ucs_status_string(status)); + if (status != UCS_ERR_UNSUPPORTED) { + ucs_trace("protocol %s failed to initialize: %s", + ucp_proto_id_field(proto_id, name), + ucs_status_string(status)); + } + ucs_log_indent(-1); continue; } - ucs_trace("protocol %s on %s has %u ranges, min_length %s, cfg_thresh %s", - ucp_proto_id_field(proto_id, name), title, proto_caps->num_ranges, + ucs_string_buffer_init(&strb); + ucp_proto_id_call(proto_id, config_str, proto_caps->min_length, + SIZE_MAX, init_params.priv, &strb); + ucs_trace("protocol %s has %u ranges, min_length %s, cfg_thresh %s %s", + ucp_proto_id_field(proto_id, name), proto_caps->num_ranges, ucs_memunits_to_str(proto_caps->min_length, min_length_str, sizeof(min_length_str)), ucs_memunits_to_str(proto_caps->cfg_thresh, thresh_str, - sizeof(thresh_str))); + sizeof(thresh_str)), + ucs_string_buffer_cstr(&strb)); + ucs_string_buffer_cleanup(&strb); + + ucs_log_indent(-1); /* A successful protocol initialization must return non-empty * performance range */ @@ -403,6 +442,7 @@ ucp_proto_select_init_protocols(ucp_worker_h worker, if (proto_init->mask == 0) { /* No protocol can support the given selection parameters */ + ucs_string_buffer_init(&strb); ucp_proto_select_param_str(select_param, &strb); ucs_debug("no protocols found for %s", ucs_string_buffer_cstr(&strb)); ucs_string_buffer_cleanup(&strb); @@ -432,12 +472,11 @@ ucp_proto_select_init_protocols(ucp_worker_h worker, return status; } -static ucs_status_t -ucp_proto_select_elem_init_thresh(ucp_proto_select_elem_t *select_elem, - const ucp_proto_select_init_protocols_t *proto_init, - ucp_worker_cfg_index_t ep_cfg_index, - ucp_worker_cfg_index_t rkey_cfg_index, - const char *select_param_str) +static ucs_status_t 
ucp_proto_select_elem_init_thresh( + ucp_proto_select_elem_t *select_elem, + const ucp_proto_select_init_protocols_t *proto_init, + ucp_worker_cfg_index_t ep_cfg_index, + ucp_worker_cfg_index_t rkey_cfg_index) { UCS_ARRAY_DEFINE_ONSTACK(tmp_thresh_list, ucp_proto_thresh, UCP_PROTO_MAX_COUNT); @@ -466,13 +505,11 @@ ucp_proto_select_elem_init_thresh(ucp_proto_select_elem_t *select_elem, status = ucp_proto_thresholds_select_next(proto_init->mask, proto_init->caps, &tmp_thresh_list, - &tmp_perf_list, - msg_length, &max_length, - select_param_str); + &tmp_perf_list, msg_length, + &max_length); if (status != UCS_OK) { if (status == UCS_ERR_UNSUPPORTED) { - ucs_warn("no protocol for %s msg_length %zu", select_param_str, - msg_length); + ucs_debug("no protocol for msg_length %zu", msg_length); } goto err; } @@ -507,6 +544,8 @@ ucp_proto_select_elem_init_thresh(ucp_proto_select_elem_t *select_elem, proto_config = &thresholds[i].proto_config; proto_config->select_param = *proto_init->select_param; + proto_config->ep_cfg_index = ep_cfg_index; + proto_config->rkey_cfg_index = rkey_cfg_index; proto_config->proto = ucp_protocols[proto_id]; proto_config->priv = UCS_PTR_BYTE_OFFSET(select_elem->priv_buf, priv_offset); @@ -547,31 +586,37 @@ ucp_proto_select_elem_init(ucp_worker_h worker, const ucp_proto_select_param_t *select_param, ucp_proto_select_elem_t *select_elem) { + UCS_STRING_BUFFER_ONSTACK(sel_param_strb, UCP_PROTO_SELECT_PARAM_STR_MAX); ucp_proto_select_init_protocols_t *proto_init; - ucs_string_buffer_t strb; ucs_status_t status; - ucp_proto_select_param_str(select_param, &strb); + ucp_proto_select_param_str(select_param, &sel_param_strb); + if (rkey_cfg_index != UCP_WORKER_CFG_INDEX_NULL) { + ucs_string_buffer_appendf(&sel_param_strb, "->"); + ucp_rkey_config_dump_brief(&worker->rkey_config[rkey_cfg_index].key, + &sel_param_strb); + } + ucs_trace("worker %p: select protocols ep[%d]/rkey[%d] for %s", worker, + ep_cfg_index, rkey_cfg_index, + ucs_string_buffer_cstr(&sel_param_strb)); - ucs_trace("initialize selection for %s worker %p ep_config %d rkey_config %d", - ucs_string_buffer_cstr(&strb), worker, ep_cfg_index, rkey_cfg_index); + ucs_log_indent(1); proto_init = ucs_malloc(sizeof(*proto_init), "proto_init"); if (proto_init == NULL) { status = UCS_ERR_NO_MEMORY; - goto out_free_strb; + goto out; } - status = ucp_proto_select_init_protocols(worker, ep_cfg_index, rkey_cfg_index, - select_param, proto_init, - ucs_string_buffer_cstr(&strb)); + status = ucp_proto_select_init_protocols(worker, ep_cfg_index, + rkey_cfg_index, select_param, + proto_init); if (status != UCS_OK) { goto out_free_proto_init; } status = ucp_proto_select_elem_init_thresh(select_elem, proto_init, - ep_cfg_index, rkey_cfg_index, - ucs_string_buffer_cstr(&strb)); + ep_cfg_index, rkey_cfg_index); if (status != UCS_OK) { goto err_cleanup_protocols; } @@ -583,8 +628,8 @@ ucp_proto_select_elem_init(ucp_worker_h worker, ucs_free(proto_init->priv_buf); out_free_proto_init: ucs_free(proto_init); -out_free_strb: - ucs_string_buffer_cleanup(&strb); +out: + ucs_log_indent(-1); return status; } @@ -609,31 +654,41 @@ ucp_proto_select_lookup_slow(ucp_worker_h worker, ucp_worker_cfg_index_t rkey_cfg_index, const ucp_proto_select_param_t *select_param) { - ucp_proto_select_elem_t *select_elem; + ucp_proto_select_elem_t *select_elem, tmp_select_elem; ucp_proto_select_key_t key; ucs_status_t status; khiter_t khiter; int khret; key.param = *select_param; - khiter = kh_put(ucp_proto_select_hash, &proto_select->hash, key.u64, - &khret); - 
ucs_assert_always((khret == UCS_KH_PUT_BUCKET_EMPTY) || - (khret == UCS_KH_PUT_BUCKET_CLEAR)); + khiter = kh_get(ucp_proto_select_hash, &proto_select->hash, key.u64); + if (khiter != kh_end(&proto_select->hash)) { + select_elem = &kh_value(&proto_select->hash, khiter); + goto out; + } + + status = ucp_proto_select_elem_init(worker, ep_cfg_index, rkey_cfg_index, + select_param, &tmp_select_elem); + if (status != UCS_OK) { + return NULL; + } + + /* add to hash after initializing the temp element, since calling + * ucp_proto_select_elem_init() can recursively modify the hash + */ + khiter = kh_put(ucp_proto_select_hash, &proto_select->hash, key.u64, + &khret); + ucs_assert_always(khret == UCS_KH_PUT_BUCKET_EMPTY); + + select_elem = &kh_value(&proto_select->hash, khiter); + *select_elem = tmp_select_elem; /* Adding hash values may reallocate the array, so the cached pointer to * select_elem may not be valid anymore. */ ucp_proto_select_cache_reset(proto_select); - select_elem = &kh_value(&proto_select->hash, khiter); - status = ucp_proto_select_elem_init(worker, ep_cfg_index, rkey_cfg_index, - select_param, select_elem); - if (status != UCS_OK) { - kh_del(ucp_proto_select_hash, &proto_select->hash, khiter); - return NULL; - } - +out: return select_elem; } @@ -654,29 +709,16 @@ void ucp_proto_select_cleanup(ucp_proto_select_t *proto_select) kh_destroy_inplace(ucp_proto_select_hash, &proto_select->hash); } -static void ucp_proto_select_perf_str(const ucs_linear_func_t *perf, - char *time_str, size_t time_str_max, - char *bw_str, size_t bw_str_max) -{ - /* Estimated time */ - snprintf(time_str, time_str_max, "%5.0f + %.3f * N", - perf->c * 1e9, perf->m * 1e9); - - /* Estimated bandwidth (MiB/s) */ - snprintf(bw_str, bw_str_max, "%7.2f", 1.0 / (perf->m * UCS_MBYTE)); -} - -static void +static ucs_status_t ucp_proto_select_dump_all(ucp_worker_h worker, ucp_worker_cfg_index_t ep_cfg_index, ucp_worker_cfg_index_t rkey_cfg_index, const ucp_proto_select_param_t *select_param, - FILE *stream) + ucs_string_buffer_t *strb) { static const char *proto_info_fmt = - "# %-18s %-12s %-20s %-18s %-12s %s\n"; + " %-18s %-18s %-20s %-18s %-12s %s\n"; ucp_proto_select_init_protocols_t *proto_init; - ucs_string_buffer_t select_strb; ucs_string_buffer_t config_strb; size_t range_start, range_end; const ucp_proto_caps_t *caps; @@ -692,22 +734,20 @@ ucp_proto_select_dump_all(ucp_worker_h worker, /* Allocate on heap, since the structure is quite large */ proto_init = ucs_malloc(sizeof(*proto_init), "proto_init"); if (proto_init == NULL) { - fprintf(stream, "\n"); - return; + status = UCS_ERR_NO_MEMORY; + goto out; } - ucp_proto_select_param_str(select_param, &select_strb); - - status = ucp_proto_select_init_protocols(worker, ep_cfg_index, rkey_cfg_index, - select_param, proto_init, - ucs_string_buffer_cstr(&select_strb)); + status = ucp_proto_select_init_protocols(worker, ep_cfg_index, + rkey_cfg_index, select_param, + proto_init); if (status != UCS_OK) { - fprintf(stream, "<%s>\n", ucs_status_string(status)); goto out_free; } - fprintf(stream, proto_info_fmt, "PROTOCOL", "SIZE", "TIME (nsec)", - "BANDWIDTH (MiB/s)", "THRESHOLD", "CONIFURATION"); + ucs_string_buffer_appendf(strb, proto_info_fmt, "PROTOCOL", "SIZE", + "TIME (nsec)", "BANDWIDTH (MiB/s)", "THRESHOLD", + "CONFIGURATION"); ucs_for_each_bit(proto_id, proto_init->mask) { @@ -715,9 +755,6 @@ ucp_proto_select_dump_all(ucp_worker_h worker, proto_init->priv_offsets[proto_id]); caps = &proto_init->caps[proto_id]; - /* Get protocol configuration */ - 
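The reworked slow path above builds the element into a stack temporary and only then inserts it, because ucp_proto_select_elem_init() can recursively add entries and reallocate the hash, dangling any pointer obtained from an earlier kh_put() (hence also the ucp_proto_select_cache_reset() call). A minimal illustration of the underlying hazard, using a plain growable array rather than khash:

    #include <stdlib.h>

    typedef struct { unsigned long key; int value; } entry_t;
    typedef struct { entry_t *data; size_t len, cap; } table_t;

    static int table_put(table_t *t, entry_t e)
    {
        if (t->len == t->cap) {
            size_t   cap = (t->cap != 0) ? t->cap * 2 : 8;
            entry_t *p   = realloc(t->data, cap * sizeof(*p));

            if (p == NULL) {
                return -1;
            }
            t->data = p;   /* pointers into the old storage are now stale */
            t->cap  = cap;
        }
        t->data[t->len++] = e;   /* insert a fully initialized element */
        return 0;
    }
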
ucp_proto_id_call(proto_id, config_str, priv, &config_strb); - /* String for configured threshold */ ucs_memunits_to_str(caps->cfg_thresh, thresh_str, sizeof(thresh_str)); @@ -731,52 +768,62 @@ ucp_proto_select_dump_all(ucp_worker_h worker, ucp_proto_select_perf_str(&caps->ranges[i].perf, time_str, sizeof(time_str), bw_str, sizeof(bw_str)); + /* Get protocol configuration */ + ucs_string_buffer_init(&config_strb); + ucp_proto_id_call(proto_id, config_str, range_start, range_end, + priv, &config_strb); - fprintf(stream, proto_info_fmt, + ucs_string_buffer_appendf( + strb, proto_info_fmt, (i == 0) ? ucp_proto_id_field(proto_id, name) : "", - range_str, time_str, bw_str, - (i == 0) ? thresh_str : "", + range_str, time_str, bw_str, (i == 0) ? thresh_str : "", (i == 0) ? ucs_string_buffer_cstr(&config_strb) : ""); + ucs_string_buffer_cleanup(&config_strb); range_start = range_end + 1; } - ucs_string_buffer_cleanup(&config_strb); } - fprintf(stream, "#\n"); + + status = UCS_OK; ucs_free(proto_init->priv_buf); out_free: - ucs_string_buffer_cleanup(&select_strb); ucs_free(proto_init); +out: + return status; } static void ucp_proto_select_dump_thresholds(const ucp_proto_select_elem_t *select_elem, - FILE *stream) + ucs_string_buffer_t *strb) { - static const char *proto_info_fmt = "# %-16s %-18s %s\n"; + static const char *proto_info_fmt = " %-18s %-18s %s\n"; const ucp_proto_threshold_elem_t *thresh_elem; - ucs_string_buffer_t strb; + ucs_string_buffer_t proto_config_strb; size_t range_start, range_end; char range_str[128]; range_start = 0; thresh_elem = select_elem->thresholds; - fprintf(stream, proto_info_fmt, "SIZE", "PROTOCOL", "CONFIGURATION"); + ucs_string_buffer_appendf(strb, proto_info_fmt, "SIZE", "PROTOCOL", + "CONFIGURATION"); do { - thresh_elem->proto_config.proto->config_str( - thresh_elem->proto_config.priv, &strb); + ucs_string_buffer_init(&proto_config_strb); range_end = thresh_elem->max_msg_length; + thresh_elem->proto_config.proto->config_str( + range_start, range_end, thresh_elem->proto_config.priv, + &proto_config_strb); - fprintf(stream, proto_info_fmt, - ucs_memunits_range_str(range_start, range_end, range_str, - sizeof(range_str)), - thresh_elem->proto_config.proto->name, - ucs_string_buffer_cstr(&strb)); + ucs_memunits_range_str(range_start, range_end, range_str, + sizeof(range_str)); - ucs_string_buffer_cleanup(&strb); + ucs_string_buffer_appendf(strb, proto_info_fmt, range_str, + thresh_elem->proto_config.proto->name, + ucs_string_buffer_cstr(&proto_config_strb)); + + ucs_string_buffer_cleanup(&proto_config_strb); range_start = range_end + 1; ++thresh_elem; @@ -785,9 +832,9 @@ ucp_proto_select_dump_thresholds(const ucp_proto_select_elem_t *select_elem, static void ucp_proto_select_dump_perf(const ucp_proto_select_elem_t *select_elem, - FILE *stream) + ucs_string_buffer_t *strb) { - static const char *proto_info_fmt = "# %-16s %-20s %s\n"; + static const char *proto_info_fmt = " %-16s %-20s %s\n"; const ucp_proto_perf_range_t *perf_elem; size_t range_start, range_end; char range_str[128]; @@ -796,18 +843,19 @@ ucp_proto_select_dump_perf(const ucp_proto_select_elem_t *select_elem, range_start = 0; perf_elem = select_elem->perf_ranges; - fprintf(stream, proto_info_fmt, "SIZE", "TIME (nsec)", "BANDWIDTH (MiB/s)"); + ucs_string_buffer_appendf(strb, proto_info_fmt, "SIZE", "TIME (nsec)", + "BANDWIDTH (MiB/s)"); do { range_end = perf_elem->max_length; ucp_proto_select_perf_str(&perf_elem->perf, time_str, sizeof(time_str), bw_str, sizeof(bw_str)); + 
ucs_memunits_range_str(range_start, range_end, range_str, + sizeof(range_str)); - fprintf(stream, proto_info_fmt, - ucs_memunits_range_str(range_start, range_end, range_str, - sizeof(range_str)), - time_str, bw_str); + ucs_string_buffer_appendf(strb, proto_info_fmt, range_str, time_str, + bw_str); range_start = range_end + 1; ++perf_elem; @@ -820,75 +868,203 @@ ucp_proto_select_elem_dump(ucp_worker_h worker, ucp_worker_cfg_index_t rkey_cfg_index, const ucp_proto_select_param_t *select_param, const ucp_proto_select_elem_t *select_elem, - FILE *stream) + ucs_string_buffer_t *strb) { - ucs_string_buffer_t strb; + UCS_STRING_BUFFER_ONSTACK(sel_param_strb, UCP_PROTO_SELECT_PARAM_STR_MAX); + ucs_status_t status; size_t i; - fprintf(stream, "#\n"); + ucp_proto_select_param_str(select_param, &sel_param_strb); - ucp_proto_select_param_str(select_param, &strb); - fprintf(stream, "# %s:\n", ucs_string_buffer_cstr(&strb)); - fprintf(stream, "# "); - for (i = 0; i < ucs_string_buffer_length(&strb); ++i) { - fputc('=', stream); + ucs_string_buffer_appendf(strb, " %s\n ", + ucs_string_buffer_cstr(&sel_param_strb)); + for (i = 0; i < ucs_string_buffer_length(&sel_param_strb); ++i) { + ucs_string_buffer_appendf(strb, "="); } - fprintf(stream, "\n"); - ucs_string_buffer_cleanup(&strb); + ucs_string_buffer_appendf(strb, "\n"); - fprintf(stream, "#\n"); - fprintf(stream, "# Selected protocols:\n"); - ucp_proto_select_dump_thresholds(select_elem, stream); + ucs_string_buffer_appendf(strb, "\n Selected protocols:\n"); + ucp_proto_select_dump_thresholds(select_elem, strb); - fprintf(stream, "#\n"); - fprintf(stream, "# Performance estimation:\n"); - ucp_proto_select_dump_perf(select_elem, stream); + ucs_string_buffer_appendf(strb, "\n Performance estimation:\n"); + ucp_proto_select_dump_perf(select_elem, strb); - fprintf(stream, "#\n"); - fprintf(stream, "# Candidates:\n"); - ucp_proto_select_dump_all(worker, ep_cfg_index, rkey_cfg_index, - select_param, stream); + ucs_string_buffer_appendf(strb, "\n Candidates:\n"); + status = ucp_proto_select_dump_all(worker, ep_cfg_index, rkey_cfg_index, + select_param, strb); + if (status != UCS_OK) { + ucs_string_buffer_appendf(strb, "\n", + ucs_status_string(status)); + } } void ucp_proto_select_dump(ucp_worker_h worker, ucp_worker_cfg_index_t ep_cfg_index, ucp_worker_cfg_index_t rkey_cfg_index, - ucp_proto_select_t *proto_select, FILE *stream) + const ucp_proto_select_t *proto_select, + ucs_string_buffer_t *strb) { ucp_proto_select_elem_t select_elem; ucp_proto_select_key_t key; + char info[256]; + + ucp_worker_print_used_tls(&worker->ep_config[ep_cfg_index].key, + worker->context, ep_cfg_index, info, + sizeof(info)); + ucs_string_buffer_appendf(strb, "\nProtocol selection for %s", info); + + if (rkey_cfg_index != UCP_WORKER_CFG_INDEX_NULL) { + ucs_string_buffer_appendf(strb, "rkey_cfg[%d]: ", rkey_cfg_index); + ucp_rkey_config_dump_brief(&worker->rkey_config[rkey_cfg_index].key, + strb); + } + ucs_string_buffer_appendf(strb, "\n\n"); + + if (kh_size(&proto_select->hash) == 0) { + ucs_string_buffer_appendf(strb, " (No elements)\n"); + return; + } - fprintf(stream, "# \n"); - fprintf(stream, "# Protocols selection for ep_config[%d]/rkey_config[%d] " - "(%d items)\n", ep_cfg_index, rkey_cfg_index, - kh_size(&proto_select->hash)); - fprintf(stream, "# \n"); kh_foreach(&proto_select->hash, key.u64, select_elem, - ucp_proto_select_elem_dump(worker, ep_cfg_index, rkey_cfg_index, - &key.param, &select_elem, stream); - ) + ucp_proto_select_elem_dump(worker, ep_cfg_index, 
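This hunk continues the conversion of the dump path from fprintf() on a FILE* to ucs_string_buffer_appendf(), so the same report can be built once and routed to logs or other sinks. A minimal growing appendf in the same spirit, with illustrative names (not the ucs_string_buffer API):

    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct {
        char  *str;
        size_t len;
    } strb_t;                    /* start zero-initialized */

    static int strb_appendf(strb_t *b, const char *fmt, ...)
    {
        va_list ap, ap2;
        char   *p;
        int     n;

        va_start(ap, fmt);
        va_copy(ap2, ap);
        n = vsnprintf(NULL, 0, fmt, ap);               /* measure */
        va_end(ap);
        if ((n < 0) || ((p = realloc(b->str, b->len + n + 1)) == NULL)) {
            va_end(ap2);
            return -1;
        }
        b->str = p;
        vsnprintf(b->str + b->len, n + 1, fmt, ap2);   /* format */
        va_end(ap2);
        b->len += n;
        return 0;
    }
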
rkey_cfg_index, + &key.param, &select_elem, strb)) +} + +void ucp_proto_select_dump_short(const ucp_proto_select_short_t *select_short, + const char *name, ucs_string_buffer_t *strb) +{ + if (select_short->lane == UCP_NULL_LANE) { + return; + } + + ucs_string_buffer_appendf(strb, "\n%s: ", name); + + if (select_short->max_length_unknown_mem >= 0) { + ucs_string_buffer_appendf(strb, "<= %zd", + select_short->max_length_unknown_mem); + } else { + ucs_string_buffer_appendf(strb, "<= %zd and host memory", + select_short->max_length_host_mem); + } + + ucs_string_buffer_appendf(strb, ", using lane %d rkey_index %d\n", + select_short->lane, select_short->rkey_index); } void ucp_proto_select_param_str(const ucp_proto_select_param_t *select_param, ucs_string_buffer_t *strb) { + char sys_dev_name[32]; uint32_t op_attr_mask; - ucs_string_buffer_init(strb); - op_attr_mask = ucp_proto_select_op_attr_from_flags(select_param->op_flags); - ucs_string_buffer_appendf(strb, "%s()", + ucs_string_buffer_appendf(strb, "%s(", ucp_operation_names[select_param->op_id]); - ucs_string_buffer_appendf(strb, " on a %s data-type", + + ucs_string_buffer_appendf(strb, "%s", ucp_datatype_class_names[select_param->dt_class]); + if (select_param->sg_count > 1) { - ucs_string_buffer_appendf(strb, "with %u scatter-gather entries", - select_param->sg_count); + ucs_string_buffer_appendf(strb, "[%d]", select_param->sg_count); + } + + if (select_param->mem_type != UCS_MEMORY_TYPE_HOST) { + ucs_string_buffer_appendf( + strb, ", %s", ucs_memory_type_names[select_param->mem_type]); + } + + if (select_param->sys_dev != UCS_SYS_DEVICE_ID_UNKNOWN) { + ucs_topo_sys_device_bdf_name(select_param->sys_dev, sys_dev_name, + sizeof(sys_dev_name)); + ucs_string_buffer_appendf(strb, ", %s", sys_dev_name); } - ucs_string_buffer_appendf(strb, " in %s memory", - ucs_memory_type_names[select_param->mem_type]); if (op_attr_mask & UCP_OP_ATTR_FLAG_FAST_CMPL) { - ucs_string_buffer_appendf(strb, " and fast completion"); + ucs_string_buffer_appendf(strb, ", fast-completion"); } + + ucs_string_buffer_appendf(strb, ")"); +} + +void ucp_proto_select_short_disable(ucp_proto_select_short_t *proto_short) +{ + proto_short->max_length_unknown_mem = -1; + proto_short->max_length_host_mem = -1; + proto_short->lane = UCP_NULL_LANE; + proto_short->rkey_index = UCP_NULL_RESOURCE; +} + +void +ucp_proto_select_short_init(ucp_worker_h worker, ucp_proto_select_t *proto_select, + ucp_worker_cfg_index_t ep_cfg_index, + ucp_worker_cfg_index_t rkey_cfg_index, + ucp_operation_id_t op_id, uint32_t op_attr_mask, + unsigned proto_flags, + ucp_proto_select_short_t *proto_short) +{ + ucp_context_h context = worker->context; + const ucp_proto_t *proto = NULL; + const ucp_proto_threshold_elem_t *thresh; + ucp_proto_select_param_t select_param; + const ucp_proto_single_priv_t *spriv; + ucs_memory_info_t mem_info; + uint32_t op_attr; + + ucp_memory_info_set_host(&mem_info); + + /* + * Find the minimal threshold among all protocols for all possible + * combinations of bits in 'op_attr_mask'. For example, we are allowed to + * use fast-path short protocol only if the message size fits short protocol + * in both regular mode and UCP_OP_ATTR_FLAG_FAST_CMPL mode. 
+ */ + ucs_for_each_submask(op_attr, op_attr_mask) { + ucp_proto_select_param_init(&select_param, op_id, op_attr, + UCP_DATATYPE_CONTIG, &mem_info, 1); + thresh = ucp_proto_select_lookup(worker, proto_select, ep_cfg_index, + rkey_cfg_index, &select_param, 0); + if (thresh == NULL) { + /* no protocol for contig/host */ + goto out_disable; + } + + ucs_assert(thresh->proto_config.proto != NULL); + if (!ucs_test_all_flags(thresh->proto_config.proto->flags, proto_flags)) { + /* the protocol for smallest messages is not short */ + goto out_disable; + } + + /* Assume short protocol uses 'ucp_proto_single_priv_t' */ + spriv = thresh->proto_config.priv; + + if (proto == NULL) { + proto = thresh->proto_config.proto; + proto_short->max_length_host_mem = thresh->max_msg_length; + proto_short->lane = spriv->super.lane; + proto_short->rkey_index = spriv->super.rkey_index; + } else { + if ((proto != thresh->proto_config.proto) || + (proto_short->lane != spriv->super.lane) || + (proto_short->rkey_index != spriv->super.rkey_index)) { + /* not all op_attr options have same configuration */ + goto out_disable; + } + + /* Fast-path threshold is the minimal of all op_attr options */ + proto_short->max_length_host_mem = ucs_min( + proto_short->max_length_host_mem, thresh->max_msg_length); + } + } + + /* If we support only host memory, set max short for unknown memory type to + * be same as for host memory type. Otherwise, disable short if memory type + * is unknown. + */ + ucs_assert(proto_short->max_length_host_mem >= 0); + proto_short->max_length_unknown_mem = (context->num_mem_type_detect_mds > 0) ? + -1 : proto_short->max_length_host_mem; + return; + +out_disable: + ucp_proto_select_short_disable(proto_short); } diff --git a/src/ucp/proto/proto_select.h b/src/ucp/proto/proto_select.h index e3c9c317b4a..d8d75099df4 100644 --- a/src/ucp/proto/proto_select.h +++ b/src/ucp/proto/proto_select.h @@ -20,6 +20,10 @@ #define UCP_PROTO_SELECT_OP_ATTR_MASK UCP_OP_ATTR_FLAG_FAST_CMPL +/** Maximal length of ucp_proto_select_param_str() */ +#define UCP_PROTO_SELECT_PARAM_STR_MAX 128 + + /** * Entry which defines which protocol should be used for a message size range. 
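ucs_for_each_submask() above visits every combination of the bits in op_attr_mask, so the short-protocol thresholds end up as the minimum over all op_attr variants. A sketch of the standard bit trick such an iterator can be built on; the enumeration includes both the full mask and the empty mask:

    #include <stdint.h>
    #include <stdio.h>

    static void visit_all_submasks(uint32_t mask)
    {
        uint32_t s = mask;

        for (;;) {
            printf("op_attr submask: 0x%x\n", s);   /* visit, 0 included */
            if (s == 0) {
                break;
            }
            s = (s - 1) & mask;   /* next smaller submask of 'mask' */
        }
    }
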
*/ @@ -61,15 +65,34 @@ typedef struct { } ucp_proto_select_t; +/* + * Settings for short protocol + */ +typedef struct { + ssize_t max_length_host_mem; /* max length of short protocol for + host memory buffer */ + ssize_t max_length_unknown_mem; /* max length of short protocol + for unknown memory buffer */ + ucp_lane_index_t lane; /* lane for sending short message */ + ucp_md_index_t rkey_index; /* uct rkey index (for put_short) */ +} ucp_proto_select_short_t; + + ucs_status_t ucp_proto_select_init(ucp_proto_select_t *proto_select); void ucp_proto_select_cleanup(ucp_proto_select_t *proto_select); -void ucp_proto_select_dump(ucp_worker_h worker, ucp_worker_cfg_index_t ep_cfg_index, +void ucp_proto_select_dump(ucp_worker_h worker, + ucp_worker_cfg_index_t ep_cfg_index, ucp_worker_cfg_index_t rkey_cfg_index, - ucp_proto_select_t *proto_select, FILE *stream); + const ucp_proto_select_t *proto_select, + ucs_string_buffer_t *strb); + + +void ucp_proto_select_dump_short(const ucp_proto_select_short_t *select_short, + const char *name, ucs_string_buffer_t *strb); void ucp_proto_select_param_str(const ucp_proto_select_param_t *select_param, @@ -89,8 +112,15 @@ ucp_proto_thresholds_search_slow(const ucp_proto_threshold_elem_t *thresholds, size_t msg_length); -void ucp_proto_select_param_str(const ucp_proto_select_param_t *select_param, - ucs_string_buffer_t *strb); +void ucp_proto_select_short_disable(ucp_proto_select_short_t *proto_short); + +void +ucp_proto_select_short_init(ucp_worker_h worker, ucp_proto_select_t *proto_select, + ucp_worker_cfg_index_t ep_cfg_index, + ucp_worker_cfg_index_t rkey_cfg_index, + ucp_operation_id_t op_id, uint32_t op_attr_mask, + unsigned proto_flags, + ucp_proto_select_short_t *proto_short); #endif diff --git a/src/ucp/proto/proto_select.inl b/src/ucp/proto/proto_select.inl index fd928ca1fc5..741dfd34dea 100644 --- a/src/ucp/proto/proto_select.inl +++ b/src/ucp/proto/proto_select.inl @@ -101,8 +101,8 @@ ucp_proto_select_lookup(ucp_worker_h worker, ucp_proto_select_t *proto_select, static UCS_F_ALWAYS_INLINE void ucp_proto_select_param_init(ucp_proto_select_param_t *select_param, ucp_operation_id_t op_id, uint32_t op_attr_mask, - ucp_dt_class_t dt_class, ucs_memory_type_t mem_type, - uint8_t sg_count) + ucp_dt_class_t dt_class, + const ucs_memory_info_t *mem_info, uint8_t sg_count) { if (dt_class == UCP_DATATYPE_CONTIG) { ucs_assert(sg_count == 1); @@ -116,11 +116,21 @@ ucp_proto_select_param_init(ucp_proto_select_param_t *select_param, select_param->op_id = op_id; select_param->op_flags = ucp_proto_select_op_attr_to_flags(op_attr_mask); select_param->dt_class = dt_class; - select_param->mem_type = mem_type; - select_param->sys_dev = 0; + select_param->mem_type = mem_info->type; + select_param->sys_dev = mem_info->sys_dev; select_param->sg_count = sg_count; select_param->padding[0] = 0; select_param->padding[1] = 0; } +static UCS_F_ALWAYS_INLINE int +ucp_proto_select_is_short(ucp_ep_h ep, + const ucp_proto_select_short_t *proto_short, + ssize_t length) +{ + return ucs_likely(length <= proto_short->max_length_unknown_mem) || + ((length <= proto_short->max_length_host_mem) && + ucp_memory_type_cache_is_empty(ep->worker->context)); +} + #endif diff --git a/src/ucp/proto/proto_single.c b/src/ucp/proto/proto_single.c index 76e332ca33f..e9de134d52f 100644 --- a/src/ucp/proto/proto_single.c +++ b/src/ucp/proto/proto_single.c @@ -20,42 +20,48 @@ ucs_status_t ucp_proto_single_init(const ucp_proto_single_init_params_t *params) { ucp_proto_single_priv_t *spriv = 
params->super.super.priv; ucp_proto_common_perf_params_t perf_params; + const uct_iface_attr_t *iface_attr; ucp_lane_index_t num_lanes; ucp_md_map_t reg_md_map; ucp_lane_index_t lane; num_lanes = ucp_proto_common_find_lanes(¶ms->super, params->lane_type, - params->tl_cap_flags, 1, 0, &lane, - ®_md_map); + params->tl_cap_flags, 1, 0, &lane); if (num_lanes == 0) { ucs_trace("no lanes for %s", params->super.super.proto_name); return UCS_ERR_UNSUPPORTED; } + ucs_assert(num_lanes == 1); *params->super.super.priv_size = sizeof(ucp_proto_single_priv_t); - ucp_proto_common_lane_priv_init(¶ms->super, reg_md_map, lane, - &spriv->super); - - ucs_assert(ucs_popcount(reg_md_map) <= 1); + reg_md_map = ucp_proto_common_reg_md_map(¶ms->super, UCS_BIT(lane)); if (reg_md_map == 0) { - spriv->reg_md = UCP_NULL_RESOURCE; + spriv->reg_md = UCP_NULL_RESOURCE; } else { - spriv->reg_md = ucs_ffs64(reg_md_map); + ucs_assert(ucs_popcount(reg_md_map) == 1); + spriv->reg_md = ucs_ffs64(reg_md_map); } + ucp_proto_common_lane_priv_init(¶ms->super, reg_md_map, lane, + &spriv->super); + + iface_attr = ucp_proto_common_get_iface_attr(¶ms->super.super, lane); + perf_params.lane_map = UCS_BIT(lane); perf_params.reg_md_map = reg_md_map; - perf_params.lane0 = lane; + perf_params.frag_size = ucp_proto_common_get_max_frag(¶ms->super, + iface_attr); + perf_params.bandwidth = ucp_proto_common_iface_bandwidth(¶ms->super, + iface_attr); ucp_proto_common_calc_perf(¶ms->super, &perf_params); return UCS_OK; } -void ucp_proto_single_config_str(const void *priv, ucs_string_buffer_t *strb) +void ucp_proto_single_config_str(size_t min_length, size_t max_length, + const void *priv, ucs_string_buffer_t *strb) { const ucp_proto_single_priv_t *spriv = priv; - - ucs_string_buffer_init(strb); ucp_proto_common_lane_priv_str(&spriv->super, strb); } diff --git a/src/ucp/proto/proto_single.h b/src/ucp/proto/proto_single.h index 298b7d88493..8d7c59e2ccb 100644 --- a/src/ucp/proto/proto_single.h +++ b/src/ucp/proto/proto_single.h @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2020-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -27,6 +27,12 @@ typedef struct { ucs_status_t ucp_proto_single_init(const ucp_proto_single_init_params_t *params); -void ucp_proto_single_config_str(const void *priv, ucs_string_buffer_t *strb); +void ucp_proto_single_config_str(size_t min_length, size_t max_length, + const void *priv, ucs_string_buffer_t *strb); + + +typedef ucs_status_t (*ucp_proto_send_single_cb_t)( + ucp_request_t *req, const ucp_proto_single_priv_t *spriv, + const uct_iov_t *iov); #endif diff --git a/src/ucp/proto/proto_single.inl b/src/ucp/proto/proto_single.inl index 0f94037face..50e65fb34df 100644 --- a/src/ucp/proto/proto_single.inl +++ b/src/ucp/proto/proto_single.inl @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2020-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
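ucp_proto_select_is_short() in the .inl hunk above implements a two-level fast-path check: a length within the unknown-memory limit is always safe, while the host-memory limit applies only when the memory-type cache is empty and every buffer can be assumed to be host memory. A sketch of the same predicate; since a disabled limit is stored as -1, any non-negative length fails that branch for free (names are illustrative):

    #include <stdbool.h>
    #include <sys/types.h>   /* ssize_t */

    static bool msg_is_short(ssize_t length, ssize_t max_unknown_mem,
                             ssize_t max_host_mem, bool mem_cache_empty)
    {
        return (length <= max_unknown_mem) ||
               ((length <= max_host_mem) && mem_cache_empty);
    }
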
*/ @@ -45,79 +45,72 @@ ucp_proto_am_bcopy_single_send(ucp_request_t *req, ucp_am_id_t am_id, } static UCS_F_ALWAYS_INLINE ucs_status_t -ucp_proto_am_bcopy_single_progress(ucp_request_t *req, ucp_am_id_t am_id, - ucp_lane_index_t lane, - uct_pack_callback_t pack_func, void *pack_arg, - size_t max_packed_size, - ucp_proto_complete_cb_t complete_func, - ucp_proto_complete_cb_t error_func) +ucp_proto_single_status_handle(ucp_request_t *req, + ucp_proto_complete_cb_t complete_func, + ucp_lane_index_t lane, ucs_status_t status) { - ucs_status_t status; - - ucs_assert(error_func != NULL); - - status = ucp_proto_am_bcopy_single_send(req, am_id, lane, pack_func, - pack_arg, max_packed_size); if (ucs_likely(status == UCS_OK)) { if (complete_func != NULL) { - complete_func(req, status); + complete_func(req); } } else if (status == UCS_ERR_NO_RESOURCE) { + /* keep on pending queue */ req->send.lane = lane; return UCS_ERR_NO_RESOURCE; - } else { - ucs_assert(status != UCS_INPROGRESS); - error_func(req, status); + } else if (status != UCS_INPROGRESS) { + ucp_proto_request_abort(req, status); } - return UCS_OK; } static UCS_F_ALWAYS_INLINE ucs_status_t -ucp_proto_am_zcopy_single_progress(ucp_request_t *req, ucp_am_id_t am_id, - const void *hdr, size_t hdr_size) +ucp_proto_am_bcopy_single_progress(ucp_request_t *req, ucp_am_id_t am_id, + ucp_lane_index_t lane, + uct_pack_callback_t pack_func, void *pack_arg, + size_t max_packed_size, + ucp_proto_complete_cb_t complete_func) +{ + ucs_status_t status; + + status = ucp_proto_am_bcopy_single_send(req, am_id, lane, pack_func, + pack_arg, max_packed_size); + return ucp_proto_single_status_handle(req, complete_func, lane, status); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_proto_zcopy_single_progress(ucp_request_t *req, unsigned uct_mem_flags, + ucp_proto_send_single_cb_t send_func, + const char *name) { - ucp_ep_t *ep = req->send.ep; const ucp_proto_single_priv_t *spriv = req->send.proto_config->priv; ucp_datatype_iter_t next_iter; ucs_status_t status; ucp_md_map_t md_map; uct_iov_t iov; - ucs_assert(req->send.dt_iter.offset == 0); + ucs_assert(req->send.state.dt_iter.offset == 0); if (!(req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED)) { md_map = (spriv->reg_md == UCP_NULL_RESOURCE) ? 
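ucp_proto_single_status_handle() above centralizes the per-status outcomes that the bcopy and zcopy single-protocol progress functions used to duplicate. A sketch of the dispatch shape, with an illustrative enum standing in for ucs_status_t:

    typedef enum { ST_OK, ST_INPROGRESS, ST_NO_RESOURCE, ST_FAILED } st_t;

    /* The return value is what the pending callback reports upward: ST_OK
     * removes the request from the pending queue, ST_NO_RESOURCE keeps it
     * there for a retry. */
    static st_t handle_send_status(st_t status)
    {
        if (status == ST_OK) {
            /* complete the request immediately */
            return ST_OK;
        } else if (status == ST_NO_RESOURCE) {
            /* out of transport resources: stay on the pending queue */
            return ST_NO_RESOURCE;
        } else if (status == ST_INPROGRESS) {
            /* the completion callback will run later */
            return ST_OK;
        }
        /* hard failure: abort the request, then remove it from pending */
        return ST_OK;
    }
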
0 : UCS_BIT(spriv->reg_md); status = ucp_proto_request_zcopy_init(req, md_map, - ucp_proto_request_zcopy_completion); + ucp_proto_request_zcopy_completion, + uct_mem_flags); if (status != UCS_OK) { - ucp_proto_request_zcopy_complete(req, status); + ucp_proto_request_abort(req, status); return UCS_OK; /* remove from pending after request is completed */ } req->flags |= UCP_REQUEST_FLAG_PROTO_INITIALIZED; } - ucp_datatype_iter_next_iov(&req->send.dt_iter, spriv->super.memh_index, + ucp_datatype_iter_next_iov(&req->send.state.dt_iter, spriv->super.memh_index, SIZE_MAX, &next_iter, &iov); - status = uct_ep_am_zcopy(ep->uct_eps[spriv->super.lane], am_id, hdr, - hdr_size, &iov, 1, 0, &req->send.state.uct_comp); - UCS_PROFILE_REQUEST_EVENT_CHECK_STATUS(req, "am_zcopy_only", iov.length, - status); - if (ucs_likely(status == UCS_OK)) { - /* fastpath is UCS_OK */ - } else if (status == UCS_INPROGRESS) { - /* completion callback will be called */ - return UCS_OK; - } else if (status == UCS_ERR_NO_RESOURCE) { - /* keep on pending queue */ - req->send.lane = spriv->super.lane; - return UCS_ERR_NO_RESOURCE; - } + status = send_func(req, spriv, &iov); + UCS_PROFILE_REQUEST_EVENT_CHECK_STATUS(req, name, iov.length, status); - /* complete the request with OK or error */ - ucp_proto_request_zcopy_complete(req, status); - return UCS_OK; + return ucp_proto_single_status_handle( + req, ucp_proto_request_zcopy_complete_success, spriv->super.lane, + status); } #endif diff --git a/src/ucp/rma/amo_sw.c b/src/ucp/rma/amo_sw.c index 12f01dd5e64..2263f8e7eb6 100644 --- a/src/ucp/rma/amo_sw.c +++ b/src/ucp/rma/amo_sw.c @@ -23,10 +23,10 @@ static size_t ucp_amo_sw_pack(void *dest, void *arg, uint8_t fetch) size_t size = req->send.length; size_t length; - atomich->address = req->send.rma.remote_addr; + atomich->address = req->send.amo.remote_addr; atomich->req.ep_id = ucp_ep_remote_id(ep); atomich->req.req_id = fetch ? 
ucp_send_request_get_id(req) : - UCP_REQUEST_ID_INVALID; + UCS_PTR_MAP_KEY_INVALID; atomich->length = size; atomich->opcode = req->send.amo.uct_op; @@ -60,11 +60,23 @@ ucp_amo_sw_progress(uct_pending_req_t *self, uct_pack_callback_t pack_cb, ucs_status_t status; req->send.lane = ucp_ep_get_am_lane(req->send.ep); - status = ucp_rma_sw_do_am_bcopy(req, UCP_AM_ID_ATOMIC_REQ, - req->send.lane, pack_cb, req, NULL); - if (((status != UCS_ERR_NO_RESOURCE) && (status != UCS_OK)) || - ((status == UCS_OK) && !fetch)) { - ucp_request_complete_send(req, status); + if (fetch) { + ucp_send_request_id_alloc(req); + } + + status = ucp_rma_sw_do_am_bcopy(req, UCP_AM_ID_ATOMIC_REQ, + req->send.lane, pack_cb, req, NULL); + if ((status != UCS_OK) || ((status == UCS_OK) && !fetch)) { + if (fetch) { + ucp_send_request_id_release(req); + } + + if (status != UCS_ERR_NO_RESOURCE) { + /* completed with: + * - with error if a fetch/post operation + * - either with error or with success if a post operation */ + ucp_request_complete_send(req, status); + } } return status; @@ -91,7 +103,7 @@ static size_t ucp_amo_sw_pack_atomic_reply(void *dest, void *arg) ucp_rma_rep_hdr_t *hdr = dest; ucp_request_t *req = arg; - hdr->req_id = req->send.get_reply.req_id; + hdr->req_id = req->send.get_reply.remote_req_id; switch (req->send.length) { case sizeof(uint32_t): @@ -191,11 +203,15 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_atomic_req_handler, (arg, data, length, am_fl { ucp_atomic_req_hdr_t *atomicreqh = data; ucp_worker_h worker = arg; - ucp_ep_h ep = ucp_worker_get_ep_by_id(worker, - atomicreqh->req.ep_id); - ucp_rsc_index_t amo_rsc_idx = ucs_ffs64_safe(worker->atomic_tls); + ucp_rsc_index_t amo_rsc_idx = UCS_BITMAP_FFS(worker->atomic_tls); ucp_request_t *req; + ucp_ep_h ep; + /* allow getting closed EP to be used for sending a completion or AMO data to + * enable flush on a peer + */ + UCP_WORKER_GET_EP_BY_ID(&ep, worker, atomicreqh->req.ep_id, return UCS_OK, + "SW AMO request"); if (ucs_unlikely((amo_rsc_idx != UCP_MAX_RESOURCES) && (ucp_worker_iface_get_attr(worker, amo_rsc_idx)->cap.flags & @@ -208,7 +224,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_atomic_req_handler, (arg, data, length, am_fl * EP and continue SW AMO protocol */ } - if (atomicreqh->req.req_id == UCP_REQUEST_ID_INVALID) { + if (atomicreqh->req.req_id == UCS_PTR_MAP_KEY_INVALID) { /* atomic operation without result */ switch (atomicreqh->length) { case sizeof(uint32_t): @@ -240,10 +256,11 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_atomic_req_handler, (arg, data, length, am_fl ucs_fatal("invalid atomic length: %u", atomicreqh->length); } - req->send.ep = ep; - req->send.atomic_reply.req_id = atomicreqh->req.req_id; - req->send.length = atomicreqh->length; - req->send.uct.func = ucp_progress_atomic_reply; + req->flags = 0; + req->send.ep = ep; + req->send.atomic_reply.remote_req_id = atomicreqh->req.req_id; + req->send.length = atomicreqh->length; + req->send.uct.func = ucp_progress_atomic_reply; ucp_request_send(req, 0); } @@ -256,10 +273,12 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_atomic_rep_handler, (arg, data, length, am_fl ucp_worker_h worker = arg; ucp_rma_rep_hdr_t *hdr = data; size_t frag_length = length - sizeof(*hdr); - ucp_request_t *req = ucp_worker_extract_request_by_id(worker, - hdr->req_id); - ucp_ep_h ep = req->send.ep; + ucp_request_t *req; + ucp_ep_h ep; + UCP_SEND_REQUEST_GET_BY_ID(&req, worker, hdr->req_id, 1, return UCS_OK, + "ATOMIC_REP %p", hdr); + ep = req->send.ep; memcpy(req->send.buffer, hdr + 1, frag_length); ucp_request_complete_send(req, 
UCS_OK); ucp_ep_rma_remote_request_completed(ep); @@ -283,7 +302,7 @@ static void ucp_amo_sw_dump_packet(ucp_worker_h worker, uct_am_trace_type_t type " ep_id 0x%"PRIx64" op %d]", atomich->address, atomich->length, atomich->req.req_id, atomich->req.ep_id, atomich->opcode); - header_len = sizeof(*atomich);; + header_len = sizeof(*atomich); break; case UCP_AM_ID_ATOMIC_REP: reph = data; diff --git a/src/ucp/rma/flush.c b/src/ucp/rma/flush.c index 5aef50db9ce..df07fcdaf8f 100644 --- a/src/ucp/rma/flush.c +++ b/src/ucp/rma/flush.c @@ -17,12 +17,15 @@ static void ucp_ep_flush_error(ucp_request_t *req, ucs_status_t status) { - if (ucp_ep_config(req->send.ep)->key.err_mode != UCP_ERR_HANDLING_MODE_PEER) { - ucs_error("error during flush: %s", ucs_status_string(status)); - } + ucs_log_level_t level = (ucp_ep_config(req->send.ep)->key.err_mode == + UCP_ERR_HANDLING_MODE_PEER) ? + UCS_LOG_LEVEL_TRACE_REQ : UCS_LOG_LEVEL_ERROR; req->status = status; --req->send.state.uct_comp.count; + ucs_log(level, "req %p: error during flush: %s, flush comp %p count reduced to %d", + req, ucs_status_string(status), &req->send.state.uct_comp, + req->send.state.uct_comp.count); } static int ucp_ep_flush_is_completed(ucp_request_t *req) @@ -40,6 +43,7 @@ static void ucp_ep_flush_progress(ucp_request_t *req) ucs_status_t status; uct_ep_h uct_ep; int diff; + ucp_lane_map_t destroyed_lanes; /* If the number of lanes changed since flush operation was submitted, adjust * the number of expected completions */ @@ -54,18 +58,34 @@ static void ucp_ep_flush_progress(ucp_request_t *req) ep, diff); req->send.state.uct_comp.count += diff; } else { - /* If we have less lanes, it means we are in error flow and - * ucp_worker_set_ep_failed() was completed, so we should have - * completed the flush on all lanes. + /* If we have less lanes, it means we are in error flow: + * - if count == 0, we have completed the flush on all lanes + * - otherwise, flush progress was re-scheduled from flush progress + * pending right after ucp_worker_iface_err_handle_progress(), + * so remove destroyed/failed lanes from started_lanes and count + * them completed. 
*/ + ucs_assert(ep->flags & UCP_EP_FLAG_FAILED); + if (req->send.state.uct_comp.count > 0) { + destroyed_lanes = req->send.flush.started_lanes & ~all_lanes; + + ucs_debug("req %p: lanes 0x%x were destroyed so reducing comp " + "count by %d", req, destroyed_lanes, + ucs_popcount(destroyed_lanes)); + req->send.flush.started_lanes &= ~destroyed_lanes; + req->send.state.uct_comp.count -= ucs_popcount(destroyed_lanes); + } + ucs_assertv(req->send.state.uct_comp.count == 0, "uct_comp.count=%d num_lanes=%d", req->send.state.uct_comp.count, num_lanes); } } - ucs_trace("ep %p: progress flush req %p, started_lanes 0x%x count %d", ep, - req, req->send.flush.started_lanes, req->send.state.uct_comp.count); + ucs_trace("ep %p flags 0x%x: progress flush req %p, started_lanes 0x%x " + "count %d", + ep, ep->flags, req, req->send.flush.started_lanes, + req->send.state.uct_comp.count); while (req->send.flush.started_lanes < all_lanes) { @@ -115,7 +135,7 @@ static void ucp_ep_flush_progress(ucp_request_t *req) } } else { ucp_ep_flush_error(req, status); - break; + req->send.flush.started_lanes |= UCS_BIT(lane); } } @@ -142,9 +162,10 @@ static void ucp_ep_flush_progress(ucp_request_t *req) ucs_trace_req("flush request %p remote completions done", req); } else { req->send.flush.cmpl_sn = flush_state->send_sn; - ucs_queue_push(&flush_state->reqs, &req->send.flush.queue); - ucs_trace_req("added flush request %p to ep remote completion queue" - " with sn %d", req, req->send.flush.cmpl_sn); + ucs_hlist_add_tail(&flush_state->reqs, &req->send.list); + ucs_trace_req("added flush request %p to ep remote completion" + " queue with sn %d", + req, req->send.flush.cmpl_sn); } } req->send.flush.sw_started = 1; @@ -167,7 +188,7 @@ static int ucp_flush_check_completion(ucp_request_t *req) ucs_trace_req("flush req %p completed", req); ucp_ep_flush_slow_path_remove(req); - req->send.flush.flushed_cb(req); + req->send.flushed_cb(req); return 1; } @@ -181,7 +202,7 @@ static unsigned ucp_ep_flush_resume_slow_path_callback(void *arg) return 0; } -static ucs_status_t ucp_ep_flush_progress_pending(uct_pending_req_t *self) +ucs_status_t ucp_ep_flush_progress_pending(uct_pending_req_t *self) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); ucp_lane_index_t lane = req->send.lane; @@ -289,8 +310,7 @@ void ucp_ep_flush_remote_completed(ucp_request_t *req) } } -ucs_status_ptr_t ucp_ep_flush_internal(ucp_ep_h ep, unsigned uct_flags, - unsigned req_flags, +ucs_status_ptr_t ucp_ep_flush_internal(ucp_ep_h ep, unsigned req_flags, const ucp_request_param_t *param, ucp_request_t *worker_req, ucp_request_callback_t flushed_cb, @@ -301,10 +321,6 @@ ucs_status_ptr_t ucp_ep_flush_internal(ucp_ep_h ep, unsigned uct_flags, ucs_debug("%s ep %p", debug_name, ep); - if (ep->flags & UCP_EP_FLAG_FAILED) { - return NULL; - } - req = ucp_request_get_param(ep->worker, param, {return UCS_STATUS_PTR(UCS_ERR_NO_MEMORY);}); @@ -318,22 +334,21 @@ ucs_status_ptr_t ucp_ep_flush_internal(ucp_ep_h ep, unsigned uct_flags, */ req->flags = req_flags; req->status = UCS_OK; - req->super_req = worker_req; req->send.ep = ep; - req->send.flush.flushed_cb = flushed_cb; + req->send.flushed_cb = flushed_cb; req->send.flush.prog_id = UCS_CALLBACKQ_ID_NULL; - req->send.flush.uct_flags = uct_flags; + req->send.flush.uct_flags = UCT_FLUSH_FLAG_LOCAL; req->send.flush.sw_started = 0; req->send.flush.sw_done = 0; req->send.flush.num_lanes = ucp_ep_num_lanes(ep); req->send.flush.started_lanes = 0; - req->send.lane = UCP_NULL_LANE; req->send.uct.func = 
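The error-flow branch above prunes lanes that were started but no longer exist (started_lanes & ~all_lanes) and subtracts their number from the expected completions. The same arithmetic in isolation; __builtin_popcount is a GCC/Clang builtin standing in for ucs_popcount:

    #include <stdint.h>

    /* 'started' and 'alive' are lane bitmaps; returns the adjusted number
     * of completions still expected */
    static int drop_destroyed_lanes(uint32_t *started, uint32_t alive,
                                    int comp_count)
    {
        uint32_t destroyed = *started & ~alive;

        *started &= ~destroyed;
        return comp_count - __builtin_popcount(destroyed);
    }
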
ucp_ep_flush_progress_pending; req->send.state.uct_comp.func = ucp_ep_flush_completion; req->send.state.uct_comp.count = ucp_ep_num_lanes(ep); req->send.state.uct_comp.status = UCS_OK; + ucp_request_set_super(req, worker_req); ucp_request_set_send_callback_param(param, req, send); ucp_ep_flush_progress(req); @@ -373,9 +388,8 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_ep_flush_nbx, (ep, param), UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - request = ucp_ep_flush_internal(ep, UCT_FLUSH_FLAG_LOCAL, 0, param, - NULL, ucp_ep_flushed_callback, - "flush_nbx"); + request = ucp_ep_flush_internal(ep, 0, param, NULL, + ucp_ep_flushed_callback, "flush_nbx"); UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); @@ -409,6 +423,39 @@ static ucs_status_t ucp_worker_flush_check(ucp_worker_h worker) return UCS_OK; } +static UCS_F_ALWAYS_INLINE ucp_ep_h +ucp_worker_flush_req_set_next_ep(ucp_request_t *req, int is_current_ep_valid, + ucs_list_link_t *next_ep_iter) +{ + ucp_worker_h worker = req->flush_worker.worker; + ucp_ep_ext_gen_t *next_ep_ext = ucs_container_of(next_ep_iter, + ucp_ep_ext_gen_t, + ep_list); + ucp_ep_h next_ep = ucp_ep_from_ext_gen(next_ep_ext); + ucp_ep_ext_gen_t *current_ep_ext = req->flush_worker.next_ep; + ucp_ep_h current_ep; + + req->flush_worker.next_ep = next_ep_ext; + + if (next_ep_iter != &worker->all_eps) { + /* Increment UCP EP reference counter to avoid destroying UCP EP while + * it is being scheduled to be flushed */ + ucp_ep_add_ref(next_ep); + UCP_EP_ASSERT_COUNTER_INC(&next_ep->flush_iter_refcount); + } + + if (!is_current_ep_valid) { + return NULL; + } + + ucs_assert(¤t_ep_ext->ep_list != &worker->all_eps); + + current_ep = ucp_ep_from_ext_gen(current_ep_ext); + UCP_EP_ASSERT_COUNTER_DEC(¤t_ep->flush_iter_refcount); + + return ucp_ep_remove_ref(current_ep) ? NULL : current_ep; +} + static void ucp_worker_flush_complete_one(ucp_request_t *req, ucs_status_t status, int force_progress_unreg) { @@ -425,13 +472,19 @@ static void ucp_worker_flush_complete_one(ucp_request_t *req, ucs_status_t statu if (complete) { ucs_assert(status != UCS_INPROGRESS); + + if (&req->flush_worker.next_ep->ep_list != &worker->all_eps) { + /* Cleanup EP iterator */ + ucp_worker_flush_req_set_next_ep(req, 1, &worker->all_eps); + } + ucp_request_complete(req, flush_worker.cb, status, req->user_data); } } static void ucp_worker_flush_ep_flushed_cb(ucp_request_t *req) { - ucp_worker_flush_complete_one(req->super_req, UCS_OK, 0); + ucp_worker_flush_complete_one(ucp_request_get_super(req), UCS_OK, 0); ucp_request_put(req); } @@ -462,24 +515,24 @@ static unsigned ucp_worker_flush_progress(void *arg) } } - if ((worker->context->config.ext.flush_worker_eps) && + if (worker->context->config.ext.flush_worker_eps && (&next_ep->ep_list != &worker->all_eps)) { - /* Some endpoints are not flushed yet. Take next endpoint from the list - * and start flush operation on it. - */ - ep = ucp_ep_from_ext_gen(next_ep); - req->flush_worker.next_ep = ucs_list_next(&next_ep->ep_list, - ucp_ep_ext_gen_t, ep_list); + /* Some endpoints are not flushed yet. Take the endpoint from the list + * and start flush operation on it. 
*/ + ep = ucp_worker_flush_req_set_next_ep(req, 1, next_ep->ep_list.next); + if (ep == NULL) { + goto out; + } - ep_flush_request = ucp_ep_flush_internal(ep, UCT_FLUSH_FLAG_LOCAL, - UCP_REQUEST_FLAG_RELEASED, + ep_flush_request = ucp_ep_flush_internal(ep, UCP_REQUEST_FLAG_RELEASED, &ucp_request_null_param, req, ucp_worker_flush_ep_flushed_cb, "flush_worker"); if (UCS_PTR_IS_ERR(ep_flush_request)) { /* endpoint flush resulted in an error */ status = UCS_PTR_STATUS(ep_flush_request); - ucs_warn("ucp_ep_flush_internal() failed: %s", ucs_status_string(status)); + ucs_diag("ucp_ep_flush_internal() failed: %s", + ucs_status_string(status)); } else if (ep_flush_request != NULL) { /* endpoint flush started, increment refcount */ ++req->flush_worker.comp_count; @@ -514,9 +567,8 @@ ucp_worker_flush_nbx_internal(ucp_worker_h worker, req->flush_worker.comp_count = 1; /* counting starts from 1, and decremented when finished going over all endpoints */ req->flush_worker.prog_id = UCS_CALLBACKQ_ID_NULL; - req->flush_worker.next_ep = ucs_list_head(&worker->all_eps, - ucp_ep_ext_gen_t, ep_list); + ucp_worker_flush_req_set_next_ep(req, 0, worker->all_eps.next); ucp_request_set_send_callback_param(param, req, flush_worker); uct_worker_progress_register_safe(worker->uct, ucp_worker_flush_progress, req, 0, &req->flush_worker.prog_id); @@ -575,8 +627,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_ep_flush, (ep), ucp_ep_h ep) UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - request = ucp_ep_flush_internal(ep, UCT_FLUSH_FLAG_LOCAL, 0, - &ucp_request_null_param, NULL, + request = ucp_ep_flush_internal(ep, 0, &ucp_request_null_param, NULL, ucp_ep_flushed_callback, "flush"); status = ucp_flush_wait(ep->worker, request); @@ -592,7 +643,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_worker_fence, (worker), ucp_worker_h worker) UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); - ucs_for_each_bit(rsc_index, worker->context->tl_bitmap) { + UCS_BITMAP_FOR_EACH_BIT(worker->context->tl_bitmap, rsc_index) { wiface = ucp_worker_iface(worker, rsc_index); if (wiface->iface == NULL) { continue; diff --git a/src/ucp/rma/get_am.c b/src/ucp/rma/get_am.c index 79541f6b7f0..f153c7c52f2 100644 --- a/src/ucp/rma/get_am.c +++ b/src/ucp/rma/get_am.c @@ -22,7 +22,7 @@ static size_t ucp_proto_get_am_bcopy_pack(void *dest, void *arg) ucp_get_req_hdr_t *getreqh = dest; getreqh->address = req->send.rma.remote_addr; - getreqh->length = req->send.dt_iter.length; + getreqh->length = req->send.state.dt_iter.length; getreqh->req.ep_id = ucp_send_request_get_ep_remote_id(req); getreqh->req.req_id = ucp_send_request_get_id(req); getreqh->mem_type = req->send.rma.rkey->mem_type; @@ -30,18 +30,11 @@ static size_t ucp_proto_get_am_bcopy_pack(void *dest, void *arg) return sizeof(*getreqh); } -static UCS_F_ALWAYS_INLINE void -ucp_proto_get_am_bcopy_complete(ucp_request_t *req, ucs_status_t status) +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_proto_get_am_bcopy_complete(ucp_request_t *req) { - ucs_assert(status == UCS_OK); ucp_ep_rma_remote_request_sent(req->send.ep); -} - -static UCS_F_ALWAYS_INLINE void -ucp_proto_get_am_bcopy_error(ucp_request_t *req, ucs_status_t status) -{ - ucp_worker_flush_ops_count_dec(req->send.ep->worker); - ucp_request_complete_send(req, status); + return UCS_OK; } static ucs_status_t ucp_proto_get_am_bcopy_progress(uct_pending_req_t *self) @@ -58,21 +51,19 @@ static ucs_status_t ucp_proto_get_am_bcopy_progress(uct_pending_req_t *self) return status; } - /* initialize some request fields, for compatibility of get_reply + /* initialize 
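ucp_worker_flush_req_set_next_ep() above pins the next endpoint's reference count before releasing the current one, so neither endpoint can be destroyed while the worker flush walks the list. A sketch of that handoff, with an illustrative refcount field:

    typedef struct ep { int refcount; } ep_t;

    /* Pin 'next' before unpinning 'current'; in UCX the release may drop
     * the last reference and destroy the endpoint. */
    static ep_t *flush_iter_advance(ep_t *current, ep_t *next)
    {
        if (next != NULL) {
            next->refcount++;      /* keep it alive while queued for flush */
        }
        if (current != NULL) {
            current->refcount--;   /* done with the endpoint we just left */
        }
        return next;
    }
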
some request fields, for compatibility of get_reply * processing */ - req->send.buffer = req->send.dt_iter.type.contig.buffer; - req->send.length = req->send.dt_iter.length; - + req->send.buffer = req->send.state.dt_iter.type.contig.buffer; + req->send.length = req->send.state.dt_iter.length; req->flags |= UCP_REQUEST_FLAG_PROTO_INITIALIZED; + ucp_send_request_id_alloc(req); } ucp_worker_flush_ops_count_inc(worker); - status = ucp_proto_am_bcopy_single_progress(req, UCP_AM_ID_GET_REQ, - spriv->super.lane, - ucp_proto_get_am_bcopy_pack, - req, sizeof(ucp_get_req_hdr_t), - ucp_proto_get_am_bcopy_complete, - ucp_proto_get_am_bcopy_error); + status = ucp_proto_am_bcopy_single_progress( + req, UCP_AM_ID_GET_REQ, spriv->super.lane, + ucp_proto_get_am_bcopy_pack, req, sizeof(ucp_get_req_hdr_t), + ucp_proto_get_am_bcopy_complete); if (status != UCS_OK) { ucp_worker_flush_ops_count_dec(worker); } diff --git a/src/ucp/rma/get_offload.c b/src/ucp/rma/get_offload.c index bbbc29059e6..08d74915105 100644 --- a/src/ucp/rma/get_offload.c +++ b/src/ucp/rma/get_offload.c @@ -33,11 +33,12 @@ ucp_proto_get_offload_bcopy_send_func(ucp_request_t *req, void *dest; max_length = ucp_proto_multi_max_payload(req, lpriv, 0); - length = ucp_datatype_iter_next_ptr(&req->send.dt_iter, max_length, - next_iter, &dest); + length = ucp_datatype_iter_next_ptr(&req->send.state.dt_iter, + max_length, next_iter, &dest); return uct_ep_get_bcopy(req->send.ep->uct_eps[lpriv->super.lane], ucp_proto_get_offload_bcopy_unpack, dest, length, - req->send.rma.remote_addr + req->send.dt_iter.offset, + req->send.rma.remote_addr + + req->send.state.dt_iter.offset, tl_rkey, &req->send.state.uct_comp); } @@ -45,7 +46,8 @@ static void ucp_proto_get_offload_bcopy_completion(uct_completion_t *self) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.state.uct_comp); - ucp_proto_request_bcopy_complete(req, req->send.state.uct_comp.status); + ucp_datatype_iter_cleanup(&req->send.state.dt_iter, UINT_MAX); + ucp_request_complete_send(req, req->send.state.uct_comp.status); } static ucs_status_t ucp_proto_get_offload_bcopy_progress(uct_pending_req_t *self) @@ -54,13 +56,14 @@ static ucs_status_t ucp_proto_get_offload_bcopy_progress(uct_pending_req_t *self if (!(req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED)) { ucp_proto_multi_request_init(req); - ucp_proto_request_completion_init(req, - ucp_proto_get_offload_bcopy_completion); + ucp_proto_completion_init(&req->send.state.uct_comp, + ucp_proto_get_offload_bcopy_completion); req->flags |= UCP_REQUEST_FLAG_PROTO_INITIALIZED; } - return ucp_proto_multi_progress(req, ucp_proto_get_offload_bcopy_send_func, - ucp_request_invoke_uct_completion, + return ucp_proto_multi_progress(req, req->send.proto_config->priv, + ucp_proto_get_offload_bcopy_send_func, + ucp_request_invoke_uct_completion_success, UCS_BIT(UCP_DATATYPE_CONTIG)); } @@ -110,17 +113,22 @@ ucp_proto_get_offload_zcopy_send_func(ucp_request_t *req, lpriv->super.rkey_index); uct_iov_t iov; - ucp_datatype_iter_next_iov(&req->send.dt_iter, lpriv->super.memh_index, + ucp_datatype_iter_next_iov(&req->send.state.dt_iter, + lpriv->super.memh_index, ucp_proto_multi_max_payload(req, lpriv, 0), next_iter, &iov); return uct_ep_get_zcopy(req->send.ep->uct_eps[lpriv->super.lane], &iov, 1, - req->send.rma.remote_addr + req->send.dt_iter.offset, + req->send.rma.remote_addr + + req->send.state.dt_iter.offset, tl_rkey, &req->send.state.uct_comp); } static ucs_status_t ucp_proto_get_offload_zcopy_progress(uct_pending_req_t *self) { - return 
ucp_proto_multi_zcopy_progress(self, NULL, + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + + return ucp_proto_multi_zcopy_progress(req, req->send.proto_config->priv, + NULL, UCT_MD_MEM_ACCESS_LOCAL_WRITE, ucp_proto_get_offload_zcopy_send_func, ucp_proto_request_zcopy_completion); } diff --git a/src/ucp/rma/put_am.c b/src/ucp/rma/put_am.c index cd4e1b1ea34..b2859556354 100644 --- a/src/ucp/rma/put_am.c +++ b/src/ucp/rma/put_am.c @@ -22,7 +22,8 @@ static size_t ucp_proto_put_am_bcopy_pack(void *dest, void *arg) ucp_request_t *req = pack_ctx->req; ucp_put_hdr_t *puth = dest; - puth->address = req->send.rma.remote_addr + req->send.dt_iter.offset; + puth->address = req->send.rma.remote_addr + + req->send.state.dt_iter.offset; puth->ep_id = ucp_send_request_get_ep_remote_id(req); puth->mem_type = req->send.rma.rkey->mem_type; @@ -63,8 +64,9 @@ static ucs_status_t ucp_proto_put_am_bcopy_progress(uct_pending_req_t *self) req->flags |= UCP_REQUEST_FLAG_PROTO_INITIALIZED; } - return ucp_proto_multi_progress(req, ucp_proto_put_am_bcopy_send_func, - ucp_proto_request_bcopy_complete, + return ucp_proto_multi_progress(req, mpriv, + ucp_proto_put_am_bcopy_send_func, + ucp_proto_request_bcopy_complete_success, UCS_BIT(UCP_DATATYPE_CONTIG)); } diff --git a/src/ucp/rma/put_offload.c b/src/ucp/rma/put_offload.c index d5af9b0ee90..7b01eb9775e 100644 --- a/src/ucp/rma/put_offload.c +++ b/src/ucp/rma/put_offload.c @@ -28,8 +28,8 @@ static ucs_status_t ucp_proto_put_offload_short_progress(uct_pending_req_t *self tl_rkey = ucp_rma_request_get_tl_rkey(req, spriv->super.rkey_index); status = uct_ep_put_short(ep->uct_eps[spriv->super.lane], - req->send.dt_iter.type.contig.buffer, - req->send.dt_iter.length, + req->send.state.dt_iter.type.contig.buffer, + req->send.state.dt_iter.length, req->send.rma.remote_addr, tl_rkey); if (ucs_unlikely(status == UCS_ERR_NO_RESOURCE)) { req->send.lane = spriv->super.lane; /* for pending add */ @@ -39,7 +39,8 @@ static ucs_status_t ucp_proto_put_offload_short_progress(uct_pending_req_t *self /* UCS_INPROGRESS is not expected */ ucs_assert((status == UCS_OK) || UCS_STATUS_IS_ERR(status)); - ucp_datatype_iter_cleanup(&req->send.dt_iter, UCS_BIT(UCP_DATATYPE_CONTIG)); + ucp_datatype_iter_cleanup(&req->send.state.dt_iter, + UCS_BIT(UCP_DATATYPE_CONTIG)); ucp_request_complete_send(req, status); return UCS_OK; } @@ -100,7 +101,8 @@ ucp_proto_put_offload_bcopy_send_func(ucp_request_t *req, tl_rkey = ucp_rma_request_get_tl_rkey(req, lpriv->super.rkey_index); packed_size = uct_ep_put_bcopy(ep->uct_eps[lpriv->super.lane], ucp_proto_put_offload_bcopy_pack, &pack_ctx, - req->send.rma.remote_addr + req->send.dt_iter.offset, + req->send.rma.remote_addr + + req->send.state.dt_iter.offset, tl_rkey); if (ucs_likely(packed_size >= 0)) { return UCS_OK; @@ -118,8 +120,9 @@ static ucs_status_t ucp_proto_put_offload_bcopy_progress(uct_pending_req_t *self req->flags |= UCP_REQUEST_FLAG_PROTO_INITIALIZED; } - return ucp_proto_multi_progress(req, ucp_proto_put_offload_bcopy_send_func, - ucp_proto_request_bcopy_complete, + return ucp_proto_multi_progress(req, req->send.proto_config->priv, + ucp_proto_put_offload_bcopy_send_func, + ucp_proto_request_bcopy_complete_success, UCS_BIT(UCP_DATATYPE_CONTIG)); } @@ -167,17 +170,23 @@ ucp_proto_put_offload_zcopy_send_func(ucp_request_t *req, uct_rkey_t tl_rkey = ucp_rma_request_get_tl_rkey(req, lpriv->super.rkey_index); uct_iov_t iov; - ucp_datatype_iter_next_iov(&req->send.dt_iter, lpriv->super.memh_index, + 
ucp_datatype_iter_next_iov(&req->send.state.dt_iter, + lpriv->super.memh_index, ucp_proto_multi_max_payload(req, lpriv, 0), next_iter, &iov); return uct_ep_put_zcopy(req->send.ep->uct_eps[lpriv->super.lane], &iov, 1, - req->send.rma.remote_addr + req->send.dt_iter.offset, + req->send.rma.remote_addr + + req->send.state.dt_iter.offset, tl_rkey, &req->send.state.uct_comp); } -static ucs_status_t ucp_proto_put_offload_zcopy_progress(uct_pending_req_t *self) +static ucs_status_t +ucp_proto_put_offload_zcopy_progress(uct_pending_req_t *self) { - return ucp_proto_multi_zcopy_progress(self, NULL, + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + + return ucp_proto_multi_zcopy_progress(req, req->send.proto_config->priv, + NULL, UCT_MD_MEM_ACCESS_LOCAL_READ, ucp_proto_put_offload_zcopy_send_func, ucp_proto_request_zcopy_completion); } diff --git a/src/ucp/rma/rma.h b/src/ucp/rma/rma.h index 6f47651d1c7..8f5dbbf8265 100644 --- a/src/ucp/rma/rma.h +++ b/src/ucp/rma/rma.h @@ -83,7 +83,8 @@ extern ucp_amo_proto_t ucp_amo_sw_proto; ucs_status_t ucp_rma_request_advance(ucp_request_t *req, ssize_t frag_length, - ucs_status_t status); + ucs_status_t status, + ucs_ptr_map_key_t req_id); void ucp_ep_flush_remote_completed(ucp_request_t *req); diff --git a/src/ucp/rma/rma.inl b/src/ucp/rma/rma.inl index e2989798c0e..3f443c9a078 100644 --- a/src/ucp/rma/rma.inl +++ b/src/ucp/rma/rma.inl @@ -76,12 +76,12 @@ static inline ucs_status_t ucp_rma_wait(ucp_worker_h worker, void *user_req, } } -static inline void ucp_ep_rma_remote_request_sent(ucp_ep_t *ep) +static inline void ucp_ep_rma_remote_request_sent(ucp_ep_h ep) { ++ucp_ep_flush_state(ep)->send_sn; } -static inline void ucp_ep_rma_remote_request_completed(ucp_ep_t *ep) +static inline void ucp_ep_rma_remote_request_completed(ucp_ep_h ep) { ucp_ep_flush_state_t *flush_state = ucp_ep_flush_state(ep); ucp_request_t *req; @@ -89,10 +89,10 @@ static inline void ucp_ep_rma_remote_request_completed(ucp_ep_t *ep) ucp_worker_flush_ops_count_dec(ep->worker); ++flush_state->cmpl_sn; - ucs_queue_for_each_extract(req, &flush_state->reqs, send.flush.queue, - UCS_CIRCULAR_COMPARE32(req->send.flush.cmpl_sn, - <= , - flush_state->cmpl_sn)) { + ucs_hlist_for_each_extract_if(req, &flush_state->reqs, send.list, + UCS_CIRCULAR_COMPARE32( + req->send.flush.cmpl_sn, <=, + flush_state->cmpl_sn)) { ucp_ep_flush_remote_completed(req); } } diff --git a/src/ucp/rma/rma_basic.c b/src/ucp/rma/rma_basic.c index cc1ac441d9a..ca292e27069 100644 --- a/src/ucp/rma/rma_basic.c +++ b/src/ucp/rma/rma_basic.c @@ -28,10 +28,10 @@ static ucs_status_t ucp_rma_basic_progress_put(uct_pending_req_t *self) ucs_assert(rkey->cache.ep_cfg_index == ep->cfg_index); ucs_assert(rkey->cache.rma_lane == lane); - if ((req->send.length <= rma_config->max_put_short) || + if (((ssize_t)req->send.length <= rma_config->max_put_short) || (req->send.length <= ucp_ep_config(ep)->bcopy_thresh)) { - packed_len = ucs_min(req->send.length, rma_config->max_put_short); + packed_len = ucs_min((ssize_t)req->send.length, rma_config->max_put_short); status = UCS_PROFILE_CALL(uct_ep_put_short, ep->uct_eps[lane], req->send.buffer, @@ -70,7 +70,8 @@ static ucs_status_t ucp_rma_basic_progress_put(uct_pending_req_t *self) status); } - return ucp_rma_request_advance(req, packed_len, status); + return ucp_rma_request_advance(req, packed_len, status, + UCS_PTR_MAP_KEY_INVALID); } static ucs_status_t ucp_rma_basic_progress_get(uct_pending_req_t *self) @@ -86,7 +87,7 @@ static ucs_status_t 
ucp_rma_basic_progress_get(uct_pending_req_t *self) ucs_assert(rkey->cache.ep_cfg_index == ep->cfg_index); ucs_assert(rkey->cache.rma_lane == lane); - if (ucs_likely(req->send.length < rma_config->get_zcopy_thresh)) { + if (ucs_likely((ssize_t)req->send.length < rma_config->get_zcopy_thresh)) { frag_length = ucs_min(rma_config->max_get_bcopy, req->send.length); status = UCS_PROFILE_CALL(uct_ep_get_bcopy, ep->uct_eps[lane], @@ -117,7 +118,8 @@ static ucs_status_t ucp_rma_basic_progress_get(uct_pending_req_t *self) UCS_INPROGRESS); } - return ucp_rma_request_advance(req, frag_length, status); + return ucp_rma_request_advance(req, frag_length, status, + UCS_PTR_MAP_KEY_INVALID); } ucp_rma_proto_t ucp_rma_basic_proto = { diff --git a/src/ucp/rma/rma_send.c b/src/ucp/rma/rma_send.c index 11324975bac..58dfc17d2bc 100644 --- a/src/ucp/rma/rma_send.c +++ b/src/ucp/rma/rma_send.c @@ -76,7 +76,8 @@ * next_partial_send; (oops req already freed) */ ucs_status_t ucp_rma_request_advance(ucp_request_t *req, ssize_t frag_length, - ucs_status_t status) + ucs_status_t status, + ucs_ptr_map_key_t req_id) { ucs_assert(status != UCS_ERR_NOT_IMPLEMENTED); @@ -96,6 +97,9 @@ ucs_status_t ucp_rma_request_advance(ucp_request_t *req, ssize_t frag_length, if (req->send.length == 0) { /* bcopy is the fast path */ if (ucs_likely(req->send.state.uct_comp.count == 0)) { + if (req_id != UCS_PTR_MAP_KEY_INVALID) { + ucp_send_request_id_release(req); + } ucp_request_send_buffer_dereg(req); ucp_request_complete_send(req, UCS_OK); } @@ -130,9 +134,9 @@ static void ucp_rma_request_zcopy_completion(uct_completion_t *self) static UCS_F_ALWAYS_INLINE ucs_status_t ucp_rma_request_init(ucp_request_t *req, ucp_ep_h ep, const void *buffer, size_t length, uint64_t remote_addr, ucp_rkey_h rkey, - uct_pending_callback_t cb, size_t zcopy_thresh, int flags) + uct_pending_callback_t cb, size_t zcopy_thresh) { - req->flags = flags; /* Implicit release */ + req->flags = 0; req->send.ep = ep; req->send.buffer = (void*)buffer; req->send.datatype = ucp_dt_make_contig(1); @@ -171,7 +175,7 @@ ucp_rma_nonblocking(ucp_ep_h ep, const void *buffer, size_t length, {return UCS_STATUS_PTR(UCS_ERR_NO_MEMORY);}); status = ucp_rma_request_init(req, ep, buffer, length, remote_addr, rkey, - progress_cb, zcopy_thresh, 0); + progress_cb, zcopy_thresh); if (ucs_unlikely(status != UCS_OK)) { return UCS_STATUS_PTR(status); } @@ -207,6 +211,31 @@ ucs_status_ptr_t ucp_put_nb(ucp_ep_h ep, const void *buffer, size_t length, return ucp_put_nbx(ep, buffer, length, remote_addr, rkey, ¶m); } +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_put_send_short(ucp_ep_h ep, const void *buffer, size_t length, + uint64_t remote_addr, ucp_rkey_h rkey, + const ucp_request_param_t *param) +{ + const ucp_rkey_config_t *rkey_config; + uct_rkey_t tl_rkey; + + if (ucs_unlikely(param->op_attr_mask & (UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FLAG_NO_IMM_CMPL))) { + return UCS_ERR_NO_RESOURCE; + } + + rkey_config = ucp_rkey_config(ep->worker, rkey); + if (ucs_unlikely(!ucp_proto_select_is_short(ep, &rkey_config->put_short, + length))) { + return UCS_ERR_NO_RESOURCE; + } + + tl_rkey = rkey->tl_rkey[rkey_config->put_short.rkey_index].rkey.rkey; + return UCS_PROFILE_CALL(uct_ep_put_short, + ep->uct_eps[rkey_config->put_short.lane], + buffer, length, remote_addr, tl_rkey); +} + ucs_status_ptr_t ucp_put_nbx(ucp_ep_h ep, const void *buffer, size_t count, uint64_t remote_addr, ucp_rkey_h rkey, const ucp_request_param_t *param) @@ -227,6 +256,12 @@ ucs_status_ptr_t ucp_put_nbx(ucp_ep_h ep, const 
void *buffer, size_t count, param->cb.send : NULL); if (worker->context->config.ext.proto_enable) { + status = ucp_put_send_short(ep, buffer, count, remote_addr, rkey, param); + if (ucs_likely(status != UCS_ERR_NO_RESOURCE)) { + ret = UCS_STATUS_PTR(status); + goto out_unlock; + } + req = ucp_request_get_param(worker, param, {ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); goto out_unlock;}); diff --git a/src/ucp/rma/rma_sw.c b/src/ucp/rma/rma_sw.c index 14381a0dc49..40ed8edff2a 100644 --- a/src/ucp/rma/rma_sw.c +++ b/src/ucp/rma/rma_sw.c @@ -28,7 +28,7 @@ static size_t ucp_rma_sw_put_pack_cb(void *dest, void *arg) puth->ep_id = ucp_ep_remote_id(ep); puth->mem_type = UCS_MEMORY_TYPE_HOST; - ucs_assert(puth->ep_id != UCP_EP_ID_INVALID); + ucs_assert(puth->ep_id != UCS_PTR_MAP_KEY_INVALID); length = ucs_min(req->send.length, ucp_ep_config(ep)->am.max_bcopy - sizeof(*puth)); @@ -48,7 +48,7 @@ static ucs_status_t ucp_rma_sw_progress_put(uct_pending_req_t *self) ucp_rma_sw_put_pack_cb, req, &packed_len); return ucp_rma_request_advance(req, packed_len - sizeof(ucp_put_hdr_t), - status); + status, UCS_PTR_MAP_KEY_INVALID); } static size_t ucp_rma_sw_get_req_pack_cb(void *dest, void *arg) @@ -61,8 +61,7 @@ static size_t ucp_rma_sw_get_req_pack_cb(void *dest, void *arg) getreqh->req.ep_id = ucp_send_request_get_ep_remote_id(req); getreqh->mem_type = req->send.rma.rkey->mem_type; getreqh->req.req_id = ucp_send_request_get_id(req); - - ucs_assert(getreqh->req.ep_id != UCP_EP_ID_INVALID); + ucs_assert(getreqh->req.ep_id != UCS_PTR_MAP_KEY_INVALID); return sizeof(*getreqh); } @@ -74,13 +73,17 @@ static ucs_status_t ucp_rma_sw_progress_get(uct_pending_req_t *self) ucs_status_t status; req->send.lane = ucp_ep_get_am_lane(req->send.ep); - status = ucp_rma_sw_do_am_bcopy(req, UCP_AM_ID_GET_REQ, - req->send.lane, - ucp_rma_sw_get_req_pack_cb, req, - &packed_len); - if ((status != UCS_OK) && (status != UCS_ERR_NO_RESOURCE)) { - /* completed with error */ - ucp_request_complete_send(req, status); + ucp_send_request_id_alloc(req); + + status = ucp_rma_sw_do_am_bcopy(req, UCP_AM_ID_GET_REQ, req->send.lane, + ucp_rma_sw_get_req_pack_cb, req, + &packed_len); + if (status != UCS_OK) { + ucp_send_request_id_release(req); + if (ucs_unlikely(status != UCS_ERR_NO_RESOURCE)) { + /* completed with error */ + ucp_request_complete_send(req, status); + } } /* If completed with UCS_OK, it means that get request packet sent, @@ -133,6 +136,7 @@ void ucp_rma_sw_send_cmpl(ucp_ep_h ep) return; } + req->flags = 0; req->send.ep = ep; req->send.uct.func = ucp_progress_rma_cmpl; ucp_request_send(req, 0); @@ -143,10 +147,16 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_put_handler, (arg, data, length, am_flags), { ucp_put_hdr_t *puth = data; ucp_worker_h worker = arg; + ucp_ep_h ep; + /* allow getting closed EP to be used for sending a completion to enable flush + * on a peer + */ + UCP_WORKER_GET_EP_BY_ID(&ep, worker, puth->ep_id, return UCS_OK, + "SW PUT request"); ucp_dt_contig_unpack(worker, (void*)puth->address, puth + 1, length - sizeof(*puth), puth->mem_type); - ucp_rma_sw_send_cmpl(ucp_worker_get_ep_by_id(worker, puth->ep_id)); + ucp_rma_sw_send_cmpl(ep); return UCS_OK; } @@ -155,8 +165,13 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rma_cmpl_handler, (arg, data, length, am_flag { ucp_cmpl_hdr_t *putackh = data; ucp_worker_h worker = arg; - ucp_ep_h ep = ucp_worker_get_ep_by_id(worker, putackh->ep_id); + ucp_ep_h ep; + /* allow getting closed EP to be used for handling a completion to enable flush + * on a peer + */ + UCP_WORKER_GET_EP_BY_ID(&ep, 
worker, putackh->ep_id, return UCS_OK, + "SW RMA completion"); ucp_ep_rma_remote_request_completed(ep); return UCS_OK; } @@ -170,7 +185,7 @@ static size_t ucp_rma_sw_pack_get_reply(void *dest, void *arg) length = ucs_min(req->send.length, ucp_ep_config(req->send.ep)->am.max_bcopy - sizeof(*hdr)); - hdr->req_id = req->send.get_reply.req_id; + hdr->req_id = req->send.get_reply.remote_req_id; ucp_dt_contig_pack(req->send.ep->worker, hdr + 1, req->send.buffer, length, req->send.mem_type); @@ -209,25 +224,30 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_get_req_handler, (arg, data, length, am_flags { ucp_get_req_hdr_t *getreqh = data; ucp_worker_h worker = arg; - ucp_ep_h ep = ucp_worker_get_ep_by_id(worker, - getreqh->req.ep_id); + ucp_ep_h ep; ucp_request_t *req; + /* allow getting closed EP to be used for sending GET operation data to enable + * flush on a peer + */ + UCP_WORKER_GET_EP_BY_ID(&ep, worker, getreqh->req.ep_id, return UCS_OK, + "SW GET request"); req = ucp_request_get(worker); if (req == NULL) { ucs_error("failed to allocate get reply"); return UCS_OK; } - req->send.ep = ep; - req->send.buffer = (void*)getreqh->address; - req->send.length = getreqh->length; - req->send.get_reply.req_id = getreqh->req.req_id; - req->send.uct.func = ucp_progress_get_reply; + req->flags = 0; + req->send.ep = ep; + req->send.buffer = (void*)getreqh->address; + req->send.length = getreqh->length; + req->send.get_reply.remote_req_id = getreqh->req.req_id; + req->send.uct.func = ucp_progress_get_reply; if (ep->worker->context->config.ext.proto_enable) { - req->send.mem_type = getreqh->mem_type; + req->send.mem_type = getreqh->mem_type; } else { - req->send.mem_type = UCS_MEMORY_TYPE_HOST; + req->send.mem_type = UCS_MEMORY_TYPE_HOST; } ucp_request_send(req, 0); @@ -240,27 +260,31 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_get_rep_handler, (arg, data, length, am_flags ucp_worker_h worker = arg; ucp_rma_rep_hdr_t *getreph = data; size_t frag_length = length - sizeof(*getreph); - ucp_request_t *req = ucp_worker_get_request_by_id(worker, - getreph->req_id); - ucp_ep_h ep = req->send.ep; + ucp_request_t *req; + ucp_ep_h ep; + void *ptr; + UCP_SEND_REQUEST_GET_BY_ID(&req, worker, getreph->req_id, 0, return UCS_OK, + "GET reply data %p", getreph); + ep = req->send.ep; if (ep->worker->context->config.ext.proto_enable) { - ucp_dt_contig_unpack(ep->worker, - req->send.dt_iter.type.contig.buffer + - req->send.dt_iter.offset, - getreph + 1, frag_length, - req->send.dt_iter.mem_type); - req->send.dt_iter.offset += frag_length; - if (req->send.dt_iter.offset == req->send.dt_iter.length) { - ucp_proto_request_bcopy_complete(req, UCS_OK); + /* TODO: use dt_iter.inl unpack */ + ptr = UCS_PTR_BYTE_OFFSET(req->send.state.dt_iter.type.contig.buffer, + req->send.state.dt_iter.offset); + ucp_dt_contig_unpack(ep->worker, ptr, getreph + 1, frag_length, + req->send.state.dt_iter.mem_info.type); + req->send.state.dt_iter.offset += frag_length; + if (req->send.state.dt_iter.offset == req->send.state.dt_iter.length) { + ucp_send_request_id_release(req); + ucp_proto_request_bcopy_complete_success(req); + ucp_ep_rma_remote_request_completed(ep); } } else { memcpy(req->send.buffer, getreph + 1, frag_length); /* complete get request on last fragment of the reply */ - if (ucp_rma_request_advance(req, frag_length, UCS_OK) == UCS_OK) { - ucp_worker_del_request_id(worker, getreph->req_id); + if (ucp_rma_request_advance(req, frag_length, UCS_OK, + getreph->req_id) == UCS_OK) { + ucp_ep_rma_remote_request_completed(ep); } } diff --git 
a/src/ucp/rndv/proto_rndv.c b/src/ucp/rndv/proto_rndv.c new file mode 100644 index 00000000000..0be3d0b9c0f --- /dev/null +++ b/src/ucp/rndv/proto_rndv.c @@ -0,0 +1,615 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "proto_rndv.inl" + +#include + + +static ucp_md_map_t +ucp_proto_rndv_ctrl_reg_md_map(const ucp_proto_rndv_ctrl_init_params_t *params) +{ + ucp_worker_h worker = params->super.super.worker; + const ucp_ep_config_key_t *ep_config_key = params->super.super.ep_config_key; + const uct_iface_attr_t *iface_attr; + const uct_md_attr_t *md_attr; + ucp_md_index_t md_index; + ucp_md_map_t reg_md_map; + ucp_lane_index_t lane; + + /* md_map contains all lanes which support get_zcopy on the given mem_type + * and require a remote key + */ + reg_md_map = 0; + for (lane = 0; lane < ep_config_key->num_lanes; ++lane) { + if (ep_config_key->lanes[lane].rsc_index == UCP_NULL_RESOURCE) { + continue; + } + + /* Check the lane supports get_zcopy */ + iface_attr = ucp_proto_common_get_iface_attr(&params->super.super, + lane); + if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_GET_ZCOPY)) { + continue; + } + + /* Check that the memory domain requires a remote key and is capable of + * registering the memory type + */ + md_index = ucp_proto_common_get_md_index(&params->super.super, lane); + md_attr = &worker->context->tl_mds[md_index].attr; + if (!(md_attr->cap.flags & UCT_MD_FLAG_NEED_RKEY) || + !(md_attr->cap.reg_mem_types & UCS_BIT(params->mem_info.type))) { + continue; + } + + reg_md_map |= UCS_BIT(md_index); + } + + return reg_md_map; +} + +/* + * Select (guess) the protocol that would be used by the remote peer. + * We report the rendezvous protocol performance according to the protocol we + * think the remote peer would select. 
+ */ +static ucs_status_t ucp_proto_rndv_ctrl_select_remote_proto( + const ucp_proto_rndv_ctrl_init_params_t *params, + const ucp_proto_select_param_t *remote_select_param, + ucp_proto_rndv_ctrl_priv_t *rpriv) +{ + ucp_worker_h worker = params->super.super.worker; + ucp_worker_cfg_index_t ep_cfg_index = params->super.super.ep_cfg_index; + ucp_rkey_config_key_t rkey_config_key; + ucp_worker_cfg_index_t rkey_cfg_index; + ucp_proto_select_elem_t *select_elem; + ucp_rkey_config_t *rkey_config; + ucs_status_t status; + + /* Construct remote key for remote protocol lookup according to the local + * buffer properties (since remote side is expected to access the local + * buffer) + */ + rkey_config_key.md_map = rpriv->md_map; + rkey_config_key.ep_cfg_index = ep_cfg_index; + rkey_config_key.mem_type = params->mem_info.type; + rkey_config_key.sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; + + status = ucp_worker_rkey_config_get(worker, &rkey_config_key, NULL, + &rkey_cfg_index); + if (status != UCS_OK) { + return status; + } + + rkey_config = &worker->rkey_config[rkey_cfg_index]; + select_elem = ucp_proto_select_lookup_slow(worker, + &rkey_config->proto_select, + ep_cfg_index, rkey_cfg_index, + remote_select_param); + if (select_elem == NULL) { + ucs_debug("%s: did not find protocol for %s", + params->super.super.proto_name, + ucp_operation_names[params->remote_op_id]); + return UCS_ERR_UNSUPPORTED; + } + + rpriv->remote_proto = *select_elem; + return UCS_OK; +} + +ucs_status_t +ucp_proto_rndv_ctrl_init(const ucp_proto_rndv_ctrl_init_params_t *params) +{ + ucp_context_h context = params->super.super.worker->context; + ucp_proto_rndv_ctrl_priv_t *rpriv = params->super.super.priv; + const ucp_proto_perf_range_t *remote_perf_range; + ucp_proto_select_param_t remote_select_param; + ucp_proto_perf_range_t *perf_range; + const uct_iface_attr_t *iface_attr; + ucs_linear_func_t send_overheads; + ucs_memory_info_t mem_info; + ucp_md_index_t md_index; + ucp_proto_caps_t *caps; + ucs_status_t status; + double rts_latency; + + ucs_assert(params->super.flags & UCP_PROTO_COMMON_INIT_FLAG_RESPONSE); + ucs_assert(!(params->super.flags & UCP_PROTO_COMMON_INIT_FLAG_MAX_FRAG)); + + /* Find lane to send the initial message */ + rpriv->lane = ucp_proto_common_find_am_bcopy_lane(&params->super.super); + if (rpriv->lane == UCP_NULL_LANE) { + return UCS_ERR_NO_ELEM; + } + + /* Construct select parameter for the remote protocol */ + if (params->super.super.rkey_config_key == NULL) { + /* Remote buffer is unknown, assume same params as local */ + remote_select_param = *params->super.super.select_param; + remote_select_param.op_id = params->remote_op_id; + remote_select_param.op_flags = 0; + } else { + /* If we know the remote buffer parameters, these are actually the local + * parameters for the remote protocol + */ + mem_info.type = params->super.super.rkey_config_key->mem_type; + mem_info.sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; + ucp_proto_select_param_init(&remote_select_param, params->remote_op_id, + 0, UCP_DATATYPE_CONTIG, &mem_info, 1); + } + + /* Initialize estimated memory registration map */ + rpriv->md_map = ucp_proto_rndv_ctrl_reg_md_map(params); + rpriv->packed_rkey_size = ucp_rkey_packed_size(context, rpriv->md_map, + UCS_SYS_DEVICE_ID_UNKNOWN, + 0); + + /* Guess the protocol the remote side will select */ + status = ucp_proto_rndv_ctrl_select_remote_proto(params, + &remote_select_param, + rpriv); + if (status != UCS_OK) { + return status; + } + + /* Set send_overheads to the time to send and receive RTS message */ + iface_attr 
= ucp_proto_common_get_iface_attr(&params->super.super, + rpriv->lane); + rts_latency = (iface_attr->overhead * 2) + + ucp_tl_iface_latency(context, &iface_attr->latency); + send_overheads = ucs_linear_func_make(rts_latency, 0.0); + + /* Add registration cost to send_overheads */ + ucs_for_each_bit(md_index, rpriv->md_map) { + ucs_linear_func_add_inplace(&send_overheads, + context->tl_mds[md_index].attr.reg_cost); + } + + /* Set rendezvous protocol properties */ + *params->super.super.priv_size = sizeof(ucp_proto_rndv_ctrl_priv_t); + params->super.super.caps->cfg_thresh = params->super.cfg_thresh; + params->super.super.caps->cfg_priority = params->super.cfg_priority; + params->super.super.caps->min_length = params->min_length; + params->super.super.caps->num_ranges = 0; + + /* Copy performance ranges from the remote protocol, and add overheads */ + remote_perf_range = rpriv->remote_proto.perf_ranges; + caps = params->super.super.caps; + do { + perf_range = &caps->ranges[caps->num_ranges]; + perf_range->max_length = remote_perf_range->max_length; + + /* Add send overheads and apply perf_bias */ + perf_range->perf = ucs_linear_func_compose( + ucs_linear_func_make(0, 1.0 - params->perf_bias), + ucs_linear_func_add(remote_perf_range->perf, send_overheads)); + + ++caps->num_ranges; + } while ((remote_perf_range++)->max_length != SIZE_MAX); + + return UCS_OK; +} + +void ucp_proto_rndv_ctrl_config_str(size_t min_length, size_t max_length, + const void *priv, ucs_string_buffer_t *strb) +{ + const ucp_proto_rndv_ctrl_priv_t *rpriv = priv; + const ucp_proto_threshold_elem_t *thresh_elem; + size_t range_start, range_end; + const ucp_proto_t *proto; + ucp_md_index_t md_index; + char str[64]; + + /* Print message lane and memory domains list */ + ucs_string_buffer_appendf(strb, "cln:%d md:", rpriv->lane); + ucs_for_each_bit(md_index, rpriv->md_map) { + ucs_string_buffer_appendf(strb, "%d,", md_index); + } + ucs_string_buffer_rtrim(strb, ","); + ucs_string_buffer_appendf(strb, " "); + + /* Print estimated remote protocols for each message size */ + thresh_elem = rpriv->remote_proto.thresholds; + range_start = 0; + do { + range_end = thresh_elem->max_msg_length; + + /* Print only protocols within the range provided by {min,max}_length */ + if ((range_end >= min_length) && (range_start <= max_length)) { + proto = thresh_elem->proto_config.proto; + ucs_string_buffer_appendf(strb, "%s(", proto->name); + proto->config_str(range_start, range_end, + thresh_elem->proto_config.priv, strb); + ucs_string_buffer_appendf(strb, ")"); + + if (range_end < max_length) { + ucs_memunits_to_str(thresh_elem->max_msg_length, str, + sizeof(str)); + ucs_string_buffer_appendf(strb, "<=%s<", str); + } + } + + ++thresh_elem; + range_start = range_end + 1; + } while (range_end < max_length); + + ucs_string_buffer_rtrim(strb, "<"); +} + +ucs_status_t ucp_proto_rndv_rts_init(const ucp_proto_init_params_t *init_params) +{ + ucp_context_h context = init_params->worker->context; + ucp_proto_rndv_ctrl_init_params_t params = { + .super.super = *init_params, + .super.latency = 0, + .super.overhead = 40e-9, + .super.cfg_thresh = context->config.ext.rndv_thresh, + .super.cfg_priority = 60, + .super.flags = UCP_PROTO_COMMON_INIT_FLAG_RESPONSE, + .remote_op_id = UCP_OP_ID_RNDV_RECV, + .perf_bias = context->config.ext.rndv_perf_diff / 100.0, + .mem_info.type = init_params->select_param->mem_type, + .mem_info.sys_dev = init_params->select_param->sys_dev, + .min_length = 0 + }; + + UCP_RMA_PROTO_INIT_CHECK(init_params, UCP_OP_ID_TAG_SEND); + + 
return ucp_proto_rndv_ctrl_init(&params); +} + +ucs_status_t ucp_proto_rndv_ack_init(const ucp_proto_init_params_t *init_params) +{ + ucp_proto_rndv_ack_priv_t *apriv = init_params->priv; + + apriv->lane = ucp_proto_common_find_am_bcopy_lane(init_params); + if (apriv->lane == UCP_NULL_LANE) { + return UCS_ERR_NO_ELEM; + } + + return UCS_OK; +} + +ucs_linear_func_t +ucp_proto_rndv_ack_time(const ucp_proto_init_params_t *init_params) +{ + ucp_context_t *context = init_params->worker->context; + ucp_proto_rndv_ack_priv_t *apriv = init_params->priv; + const uct_iface_attr_t *iface_attr; + double ack_time; + + iface_attr = ucp_proto_common_get_iface_attr(init_params, apriv->lane); + ack_time = (iface_attr->overhead * 2) + + ucp_tl_iface_latency(context, &iface_attr->latency); + + return ucs_linear_func_make(ack_time, 0); +} + +void ucp_proto_rndv_ack_config_str(size_t min_length, size_t max_length, + const void *priv, ucs_string_buffer_t *strb) +{ + const ucp_proto_rndv_ack_priv_t *apriv = priv; + + ucs_string_buffer_appendf(strb, "aln:%d", apriv->lane); +} + +ucs_status_t +ucp_proto_rndv_bulk_init(const ucp_proto_multi_init_params_t *init_params) +{ + ucp_proto_rndv_bulk_priv_t *rpriv = init_params->super.super.priv; + ucp_proto_multi_init_params_t params = *init_params; + ucs_status_t status; + size_t mpriv_size; + + /* Change priv pointer, since proto_multi priv is not the first element in + * ucp_proto_rndv_bulk_priv_t struct. Later on, we also update priv size. + */ + params.super.super.priv = &rpriv->mpriv; + params.super.super.priv_size = &mpriv_size; + + status = ucp_proto_multi_init(&params); + if (status != UCS_OK) { + return status; + } + + status = ucp_proto_rndv_ack_init(&init_params->super.super); + if (status != UCS_OK) { + return status; + } + + /* Update private data size based on ucp_proto_multi_priv_t variable size */ + *init_params->super.super.priv_size = + ucs_offsetof(ucp_proto_rndv_bulk_priv_t, mpriv) + mpriv_size; + return UCS_OK; +} + +size_t ucp_proto_rndv_pack_ack(void *dest, void *arg) +{ + ucp_request_t *req = arg; + ucp_reply_hdr_t *ack_hdr = dest; + + ack_hdr->req_id = req->send.rndv.remote_req_id; + ack_hdr->status = UCS_OK; + + return sizeof(*ack_hdr); +} + +void ucp_proto_rndv_bulk_config_str(size_t min_length, size_t max_length, + const void *priv, ucs_string_buffer_t *strb) +{ + const ucp_proto_rndv_bulk_priv_t *rpriv = priv; + + ucp_proto_multi_config_str(min_length, max_length, &rpriv->mpriv, strb); + ucs_string_buffer_appendf(strb, " "); + ucp_proto_rndv_ack_config_str(min_length, max_length, &rpriv->super, strb); +} + +static ucs_status_t +ucp_proto_rndv_send_reply(ucp_worker_h worker, ucp_request_t *req, + ucp_operation_id_t op_id, uint8_t sg_count, + size_t length, const void *rkey_buffer, + size_t rkey_length) +{ + ucp_worker_cfg_index_t rkey_cfg_index; + ucp_proto_select_param_t sel_param; + ucp_proto_select_t *proto_select; + ucs_status_t status; + ucp_rkey_h rkey; + + ucs_assert((op_id == UCP_OP_ID_RNDV_RECV) || + (op_id == UCP_OP_ID_RNDV_SEND)); + ucs_assert(sg_count == 1); + + if (rkey_length > 0) { + ucs_assert(rkey_buffer != NULL); + status = ucp_ep_rkey_unpack_internal(req->send.ep, rkey_buffer, + rkey_length, &rkey); + if (status != UCS_OK) { + goto err; + } + + proto_select = &ucp_rkey_config(worker, rkey)->proto_select; + rkey_cfg_index = rkey->cfg_index; + } else { + /* No remote key, use endpoint protocols */ + proto_select = &ucp_ep_config(req->send.ep)->proto_select; + rkey_cfg_index = UCP_WORKER_CFG_INDEX_NULL; + rkey = NULL; + } + + 
ucp_proto_select_param_init(&sel_param, op_id, 0, + req->send.state.dt_iter.dt_class, + &req->send.state.dt_iter.mem_info, sg_count); + + status = ucp_proto_request_set_proto(worker, req->send.ep, req, + proto_select, rkey_cfg_index, + &sel_param, length); + if (status != UCS_OK) { + goto err_destroy_rkey; + } + + req->send.rndv.rkey = rkey; + + ucp_trace_req(req, + "%s rva 0x%" PRIx64 " rreq_id 0x%" PRIx64 " with protocol %s", + ucp_operation_names[op_id], req->send.rndv.remote_address, + req->send.rndv.remote_req_id, + req->send.proto_config->proto->name); + + ucp_request_send(req, 0); + return UCS_OK; + +err_destroy_rkey: + if (rkey != NULL) { + ucp_rkey_destroy(rkey); + } +err: + return status; +} + +static UCS_F_ALWAYS_INLINE ucp_request_t * +ucp_request_get_super_req(void *request, void *user_data) +{ + ucp_request_t UCS_V_UNUSED *req = (ucp_request_t*)request - 1; + ucp_request_t *super_req = user_data; + + ucs_assert(ucp_request_get_super(req) == super_req); + return super_req; +} + +static void ucp_proto_rndv_recv_completion(void *request, ucs_status_t status, + void *user_data) +{ + ucp_request_t *recv_req = ucp_request_get_super_req(request, user_data); + + ucp_request_complete_tag_recv(recv_req, status); +} + +static UCS_F_ALWAYS_INLINE void +ucp_proto_rndv_check_rkey_length(uint64_t address, size_t rkey_length, + const char *title) +{ + ucs_assertv((ssize_t)rkey_length >= 0, "%s rkey_length=%zd", title, + (ssize_t)rkey_length); + ucs_assertv((address != 0) == (rkey_length > 0), + "%s rts->address=0x%" PRIx64 " rkey_length=%zu", title, address, + rkey_length); +} + +void ucp_proto_rndv_receive(ucp_worker_h worker, ucp_request_t *recv_req, + const ucp_rndv_rts_hdr_t *rts, + const void *rkey_buffer, size_t rkey_length) +{ + ucs_status_t status; + ucp_request_t *req; + uint8_t sg_count; + size_t length; + ucp_ep_h ep; + + UCP_WORKER_GET_VALID_EP_BY_ID(&ep, worker, rts->sreq.ep_id, return, + "RTS on non-existing endpoint"); + + if (!UCP_DT_IS_CONTIG(recv_req->recv.datatype)) { + ucs_fatal("non-contiguous types are not supported with rndv protocol"); + } + + req = ucp_request_get(worker); + if (req == NULL) { + ucs_error("failed to allocate rendezvous reply"); + return; + } + + /* Initialize send request */ + req->send.ep = ep; + req->send.rndv.remote_address = rts->address; + req->send.rndv.remote_req_id = rts->sreq.req_id; + + if (ucs_likely(rts->size <= recv_req->recv.length)) { + req->flags = UCP_REQUEST_FLAG_CALLBACK | UCP_REQUEST_FLAG_RELEASED; + req->send.cb = ucp_proto_rndv_recv_completion; + length = rts->size; + ucp_proto_rndv_check_rkey_length(rts->address, rkey_length, "rts"); + ucp_request_set_super(req, recv_req); + } else { + /* Short receive: complete with error, and send reply to sender */ + ucp_request_complete_tag_recv(recv_req, UCS_ERR_MESSAGE_TRUNCATED); + req->flags = UCP_REQUEST_FLAG_RELEASED; + length = 0; + rkey_length = 0; /* Override rkey length to disable data fetch */ + } + + ucp_datatype_iter_init(worker->context, recv_req->recv.buffer, length, + recv_req->recv.datatype, length, + &req->send.state.dt_iter, &sg_count); + + status = ucp_proto_rndv_send_reply(worker, req, UCP_OP_ID_RNDV_RECV, + sg_count, length, rkey_buffer, + rkey_length); + if (status != UCS_OK) { + ucp_datatype_iter_cleanup(&req->send.state.dt_iter, UINT_MAX); + ucs_mpool_put(req); + return; + } +} + +static ucs_status_t +ucp_proto_rndv_send_start(ucp_worker_h worker, ucp_request_t *req, + const ucp_rndv_rtr_hdr_t *rtr, size_t header_length) +{ + size_t rkey_length = header_length - 
sizeof(*rtr); + ucs_status_t status; + + ucp_proto_rndv_check_rkey_length(rtr->address, rkey_length, "rtr"); + req->send.rndv.remote_address = rtr->address; + req->send.rndv.remote_req_id = rtr->rreq_id; + + status = ucp_proto_rndv_send_reply(worker, req, UCP_OP_ID_RNDV_SEND, 1, + rtr->size, rtr + 1, rkey_length); + if (status != UCS_OK) { + return status; + } + + return UCS_OK; +} + +ucs_status_t +ucp_proto_rndv_handle_rtr(void *arg, void *data, size_t length, unsigned flags) +{ + ucp_worker_h worker = arg; + const ucp_rndv_rtr_hdr_t *rtr = data; + ucs_status_t status; + ucp_request_t *req; + + UCP_SEND_REQUEST_GET_BY_ID(&req, worker, rtr->sreq_id, 1, return UCS_OK, + "RTR %p", rtr); + + if (rtr->address == 0) { + ucs_fatal("RTR without remote address is currently unsupported"); + } + + /* RTR covers the whole send request - use the send request directly */ + ucs_assert(req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED); + ucs_assert(rtr->size == req->send.state.dt_iter.length); + ucs_assert(rtr->offset == 0); + + req->flags &= ~UCP_REQUEST_FLAG_PROTO_INITIALIZED; + status = ucp_proto_rndv_send_start(worker, req, rtr, length); + if (status != UCS_OK) { + goto err_request_fail; + } + + return UCS_OK; + +err_request_fail: + ucp_proto_request_abort(req, status); + return UCS_OK; +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_proto_rndv_rtr_uct_comp_from_id(ucp_worker_h worker, uint64_t id, + int extract, uct_completion_t **uct_comp_p) +{ + ucs_status_t status; + void *ptr; + + status = ucs_ptr_map_get(&worker->ptr_map, id, extract, &ptr); + if (ucs_unlikely((status != UCS_OK) && (status != UCS_ERR_NO_PROGRESS))) { + return status; + } + + *uct_comp_p = ptr; + return UCS_OK; +} + +ucs_status_t +ucp_proto_rndv_handle_data(void *arg, void *data, size_t length, unsigned flags) +{ + ucp_worker_h worker = arg; + ucp_rndv_data_hdr_t *rndv_data_hdr = data; + size_t recv_len = length - sizeof(*rndv_data_hdr); + ucp_request_t *req, *recv_req; + uct_completion_t *uct_comp; + ucs_status_t status; + + status = ucp_proto_rndv_rtr_uct_comp_from_id(worker, rndv_data_hdr->rreq_id, + 0, &uct_comp); + if (ucs_unlikely(status != UCS_OK)) { + ucs_trace_data("worker %p: completion id 0x%" PRIx64 + " was not found, drop RNDV data %p", + worker, rndv_data_hdr->rreq_id, rndv_data_hdr); + return UCS_OK; + } + + req = ucs_container_of(uct_comp, ucp_request_t, send.state.uct_comp); + recv_req = ucp_request_get_super(req); + UCS_PROFILE_REQUEST_EVENT(recv_req, "rndv_data_recv", recv_len); + + ucs_assertv(recv_req->recv.remaining >= recv_len, + "req->recv.remaining=%zu recv_len=%zu", + recv_req->recv.remaining, recv_len); + recv_req->recv.remaining -= recv_len; + + /* process data only if the request is not in error state */ + if (ucs_likely(recv_req->status == UCS_OK)) { + recv_req->status = ucp_request_recv_data_unpack( + recv_req, rndv_data_hdr + 1, recv_len, rndv_data_hdr->offset, + recv_req->recv.remaining == 0); + } + + if (recv_req->recv.remaining == 0) { + status = ucs_ptr_map_del(&worker->ptr_map, rndv_data_hdr->rreq_id); + ucs_assert((status == UCS_OK) || (status == UCS_ERR_NO_PROGRESS)); + + ucp_proto_rndv_rtr_common_complete(req, recv_req->status); + } + + return UCS_OK; +} diff --git a/src/ucp/rndv/proto_rndv.h b/src/ucp/rndv/proto_rndv.h new file mode 100644 index 00000000000..9ead33fffc9 --- /dev/null +++ b/src/ucp/rndv/proto_rndv.h @@ -0,0 +1,133 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifndef UCP_PROTO_RNDV_H_ +#define UCP_PROTO_RNDV_H_ + +#include "rndv.h" + +#include + + +/** + * Rendezvous protocol which sends a control message to the remote peer, without + * actually transferring bulk data. The remote peer is expected to perform the + * "remote_proto" protocol to complete data transfer. + * Typically, a rendezvous protocol will have one or two control message + * exchanges before the bulk transfer takes place. + */ +typedef struct { + /* Memory domains to send remote keys */ + ucp_md_map_t md_map; + + /* Total size of packed rkeys */ + size_t packed_rkey_size; + + /* Lane for sending the "remote_op" message */ + ucp_lane_index_t lane; + + /* Which protocol the remote side is expected to use, for performance + estimation and reporting purposes */ + ucp_proto_select_elem_t remote_proto; +} ucp_proto_rndv_ctrl_priv_t; + + +/* + * Private data for rendezvous protocol which sends an acknowledgement packet + */ +typedef struct { + /* Lane to send completion message (ATP, RTS, ATS) */ + ucp_lane_index_t lane; +} ucp_proto_rndv_ack_priv_t; + + +/* + * Private data for rendezvous protocol which sends bulk data followed by an + * acknowledgement packet + */ +typedef struct { + ucp_proto_rndv_ack_priv_t super; + + /* + * Multi-lane common part. + * Must be the last element in this struct, since it's variable-size and + * ends with a zero-size array. + */ + ucp_proto_multi_priv_t mpriv; +} ucp_proto_rndv_bulk_priv_t; + + +/** + * Rendezvous control-message protocol initialization parameters + */ +typedef struct { + ucp_proto_common_init_params_t super; + + /* Which operation the remote peer is expected to perform */ + ucp_operation_id_t remote_op_id; + + /* Reduce estimated time by this value (for example, 0.03 means to report + a 3% better time) */ + double perf_bias; + + /* Memory type of the transfer */ + ucs_memory_info_t mem_info; + + /* Minimal data length */ + size_t min_length; +} ucp_proto_rndv_ctrl_init_params_t; + + +ucs_status_t +ucp_proto_rndv_ctrl_init(const ucp_proto_rndv_ctrl_init_params_t *params); + + +void ucp_proto_rndv_ctrl_config_str(size_t min_length, size_t max_length, + const void *priv, + ucs_string_buffer_t *strb); + + +ucs_status_t +ucp_proto_rndv_rts_init(const ucp_proto_init_params_t *init_params); + + +ucs_status_t +ucp_proto_rndv_ack_init(const ucp_proto_init_params_t *init_params); + + +ucs_linear_func_t +ucp_proto_rndv_ack_time(const ucp_proto_init_params_t *init_params); + + +void ucp_proto_rndv_ack_config_str(size_t min_length, size_t max_length, + const void *priv, ucs_string_buffer_t *strb); + + +ucs_status_t +ucp_proto_rndv_bulk_init(const ucp_proto_multi_init_params_t *init_params); + + +size_t ucp_proto_rndv_pack_ack(void *dest, void *arg); + + +void ucp_proto_rndv_bulk_config_str(size_t min_length, size_t max_length, + const void *priv, + ucs_string_buffer_t *strb); + + +void ucp_proto_rndv_receive(ucp_worker_h worker, ucp_request_t *recv_req, + const ucp_rndv_rts_hdr_t *rts, + const void *rkey_buffer, size_t rkey_length); + + +ucs_status_t +ucp_proto_rndv_handle_rtr(void *arg, void *data, size_t length, unsigned flags); + + +ucs_status_t ucp_proto_rndv_handle_data(void *arg, void *data, size_t length, + unsigned flags); + +#endif diff --git a/src/ucp/rndv/proto_rndv.inl b/src/ucp/rndv/proto_rndv.inl new file mode 100644 index 00000000000..b3805bacbf2 --- /dev/null +++ b/src/ucp/rndv/proto_rndv.inl @@ -0,0 +1,120 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifndef UCP_PROTO_RNDV_INL_ +#define UCP_PROTO_RNDV_INL_ + +#include "proto_rndv.h" + +#include +#include +#include +#include + + +static UCS_F_ALWAYS_INLINE size_t +ucp_proto_rndv_cfg_thresh(ucp_context_h context, uint64_t rndv_modes) +{ + ucs_assert(!(rndv_modes & UCS_BIT(UCP_RNDV_MODE_AUTO))); + + if (context->config.ext.rndv_mode == UCP_RNDV_MODE_AUTO) { + return UCS_MEMUNITS_AUTO; /* automatic threshold */ + } else if (rndv_modes & UCS_BIT(context->config.ext.rndv_mode)) { + return 0; /* enabled by default */ + } else { + return UCS_MEMUNITS_INF; /* used only as last resort */ + } +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_proto_rndv_rts_request_init(ucp_request_t *req) +{ + const ucp_proto_rndv_ctrl_priv_t *rpriv = req->send.proto_config->priv; + ucp_ep_h ep = req->send.ep; + ucs_status_t status; + + if (req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED) { + return UCS_OK; + } + + status = ucp_ep_resolve_remote_id(req->send.ep, rpriv->lane); + if (status != UCS_OK) { + return status; + } + + status = ucp_datatype_iter_mem_reg(ep->worker->context, + &req->send.state.dt_iter, rpriv->md_map, + UCT_MD_MEM_ACCESS_RMA | + UCT_MD_MEM_FLAG_HIDE_ERRORS); + if (status != UCS_OK) { + return status; + } + + ucp_send_request_id_alloc(req); + req->flags |= UCP_REQUEST_FLAG_PROTO_INITIALIZED; + + return UCS_OK; +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_proto_rndv_ats_handler(void *arg, void *data, size_t length, unsigned flags) +{ + ucp_worker_h worker = arg; + const ucp_reply_hdr_t *ats = data; + ucp_request_t *req; + + UCP_SEND_REQUEST_GET_BY_ID(&req, worker, ats->req_id, 1, return UCS_OK, + "ATS %p", ats); + ucp_proto_request_zcopy_complete(req, ats->status); + return UCS_OK; +} + +static UCS_F_ALWAYS_INLINE size_t ucp_proto_rndv_rts_pack( + ucp_request_t *req, ucp_rndv_rts_hdr_t *rts, size_t hdr_len) +{ + void *rkey_buffer = UCS_PTR_BYTE_OFFSET(rts, hdr_len); + size_t rkey_size; + + rts->sreq.req_id = ucp_send_request_get_id(req); + rts->sreq.ep_id = ucp_send_request_get_ep_remote_id(req); + rts->size = req->send.state.dt_iter.length; + + if (req->send.state.dt_iter.type.contig.reg.md_map == 0) { + rts->address = 0; + rkey_size = 0; + } else { + rts->address = (uintptr_t)req->send.state.dt_iter.type.contig.buffer; + rkey_size = ucp_proto_request_pack_rkey(req, rkey_buffer); + } + + return hdr_len + rkey_size; +} + +static ucs_status_t UCS_F_ALWAYS_INLINE +ucp_proto_rndv_ack_progress(ucp_request_t *req, ucp_am_id_t am_id, + ucp_proto_complete_cb_t complete_func) +{ + const ucp_proto_rndv_ack_priv_t *apriv = req->send.proto_config->priv; + + ucs_assert(ucp_datatype_iter_is_end(&req->send.state.dt_iter)); + + return ucp_proto_am_bcopy_single_progress(req, am_id, apriv->lane, + ucp_proto_rndv_pack_ack, req, + sizeof(ucp_reply_hdr_t), + complete_func); +} + +static UCS_F_ALWAYS_INLINE void +ucp_proto_rndv_rtr_common_complete(ucp_request_t *req, ucs_status_t status) +{ + ucp_trace_req(req, "rndv_rtr_common_complete"); + if (req->send.rndv.rkey != NULL) { + ucp_rkey_destroy(req->send.rndv.rkey); + } + ucp_proto_request_zcopy_complete(req, status); +} + +#endif diff --git a/src/ucp/rndv/rndv.c b/src/ucp/rndv/rndv.c index ab2bfe82c28..75ffcab7100 100644 --- a/src/ucp/rndv/rndv.c +++ b/src/ucp/rndv/rndv.c @@ -9,6 +9,7 @@ #endif #include "rndv.h" +#include "proto_rndv.inl" /* TODO: Avoid dependency on tag (or other API) specifics, since this is common * basic rendezvous implementation. 
@@ -25,7 +26,7 @@ ucp_rndv_is_get_zcopy(ucp_request_t *req, ucp_context_h context) { return ((context->config.ext.rndv_mode == UCP_RNDV_MODE_GET_ZCOPY) || ((context->config.ext.rndv_mode == UCP_RNDV_MODE_AUTO) && - (!UCP_MEM_IS_CUDA(req->send.mem_type) || + (!UCP_MEM_IS_GPU(req->send.mem_type) || (req->send.length < context->config.ext.rndv_pipeline_send_thresh)))); } @@ -94,28 +95,31 @@ static int ucp_rndv_is_put_pipeline_needed(uintptr_t remote_address, } size_t ucp_rndv_rts_pack(ucp_request_t *sreq, ucp_rndv_rts_hdr_t *rndv_rts_hdr, - size_t rndv_rts_hdr_size, uint16_t flags) + ucp_rndv_rts_opcode_t opcode) { ucp_worker_h worker = sreq->send.ep->worker; + ucs_memory_info_t mem_info; ssize_t packed_rkey_size; void *rkey_buf; rndv_rts_hdr->sreq.ep_id = ucp_send_request_get_ep_remote_id(sreq); rndv_rts_hdr->sreq.req_id = ucp_send_request_get_id(sreq); rndv_rts_hdr->size = sreq->send.length; - rndv_rts_hdr->flags = flags; + rndv_rts_hdr->opcode = opcode; /* Pack remote keys (which can be empty list) */ if (UCP_DT_IS_CONTIG(sreq->send.datatype) && ucp_rndv_is_get_zcopy(sreq, worker->context)) { /* pack rkey, ask target to do get_zcopy */ + mem_info.type = sreq->send.mem_type; + mem_info.sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; rndv_rts_hdr->address = (uintptr_t)sreq->send.buffer; rkey_buf = UCS_PTR_BYTE_OFFSET(rndv_rts_hdr, - rndv_rts_hdr_size); - packed_rkey_size = ucp_rkey_pack_uct(worker->context, - sreq->send.state.dt.dt.contig.md_map, - sreq->send.state.dt.dt.contig.memh, - sreq->send.mem_type, rkey_buf); + sizeof(*rndv_rts_hdr)); + packed_rkey_size = ucp_rkey_pack_uct( + worker->context, sreq->send.state.dt.dt.contig.md_map, + sreq->send.state.dt.dt.contig.memh, &mem_info, 0, NULL, + rkey_buf); if (packed_rkey_size < 0) { ucs_fatal("failed to pack rendezvous remote key: %s", ucs_status_string((ucs_status_t)packed_rkey_size)); @@ -128,32 +132,35 @@ size_t ucp_rndv_rts_pack(ucp_request_t *sreq, ucp_rndv_rts_hdr_t *rndv_rts_hdr, packed_rkey_size = 0; } - return rndv_rts_hdr_size + packed_rkey_size; + return sizeof(*rndv_rts_hdr) + packed_rkey_size; } static size_t ucp_rndv_rtr_pack(void *dest, void *arg) { ucp_request_t *rndv_req = arg; ucp_rndv_rtr_hdr_t *rndv_rtr_hdr = dest; - ucp_request_t *rreq = rndv_req->super_req; + ucp_request_t *rreq = ucp_request_get_super(rndv_req); ucp_ep_h ep = rndv_req->send.ep; + ucs_memory_info_t mem_info; ssize_t packed_rkey_size; - rndv_rtr_hdr->sreq_id = rndv_req->send.rndv_rtr.req_id; - /* request of receiver side */ - rndv_rtr_hdr->rreq_id = ucp_worker_get_request_id(ep->worker, rreq, - ucp_ep_use_indirect_id(ep)); + /* Request ID of sender side (remote) */ + rndv_rtr_hdr->sreq_id = rreq->recv.remote_req_id; + /* Request ID of receiver side (local) */ + rndv_rtr_hdr->rreq_id = ucp_send_request_get_id(rndv_req); /* Pack remote keys (which can be empty list) */ if (UCP_DT_IS_CONTIG(rreq->recv.datatype)) { rndv_rtr_hdr->address = (uintptr_t)rreq->recv.buffer; rndv_rtr_hdr->size = rndv_req->send.rndv_rtr.length; rndv_rtr_hdr->offset = rndv_req->send.rndv_rtr.offset; + mem_info.type = rreq->recv.mem_type; + mem_info.sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; - packed_rkey_size = ucp_rkey_pack_uct(rndv_req->send.ep->worker->context, + packed_rkey_size = ucp_rkey_pack_uct(ep->worker->context, rreq->recv.state.dt.contig.md_map, rreq->recv.state.dt.contig.memh, - rreq->recv.mem_type, + &mem_info, 0, NULL, rndv_rtr_hdr + 1); if (packed_rkey_size < 0) { return packed_rkey_size; @@ -175,16 +182,22 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_proto_progress_rndv_rtr, (self), 
size_t packed_rkey_size; ucs_status_t status; - /* send the RTR. the pack_cb will pack all the necessary fields in the RTR */ + /* Send the RTR. The pack_cb will pack all the necessary fields in the RTR */ packed_rkey_size = ucp_ep_config(rndv_req->send.ep)->rndv.rkey_size; - status = ucp_do_am_single(self, UCP_AM_ID_RNDV_RTR, ucp_rndv_rtr_pack, - sizeof(ucp_rndv_rtr_hdr_t) + packed_rkey_size); - if (status == UCS_OK) { - /* release rndv request */ + status = ucp_do_am_single(self, UCP_AM_ID_RNDV_RTR, ucp_rndv_rtr_pack, + sizeof(ucp_rndv_rtr_hdr_t) + packed_rkey_size); + if (ucs_unlikely(status != UCS_OK)) { + if (ucs_likely(status == UCS_ERR_NO_RESOURCE)) { + return UCS_ERR_NO_RESOURCE; + } + ucp_request_put(rndv_req); } - return status; + /* Don't release rndv request in case of success, since it was sent to + * a peer as a remote request ID */ + + return UCS_OK; } ucs_status_t ucp_rndv_reg_send_buffer(ucp_request_t *sreq) @@ -219,8 +232,6 @@ ucp_rndv_adjust_zcopy_length(size_t min_zcopy, size_t max_zcopy, size_t align, { size_t result_length, tail; - ucs_assert(length > 0); - /* ensure that the current length is over min_zcopy */ result_length = ucs_max(length, min_zcopy); @@ -259,95 +270,29 @@ ucp_rndv_adjust_zcopy_length(size_t min_zcopy, size_t max_zcopy, size_t align, return result_length; } -static void ucp_rndv_complete_send(ucp_request_t *sreq, ucs_status_t status) -{ - ucp_request_send_generic_dt_finish(sreq); - ucp_request_send_buffer_dereg(sreq); - ucp_request_complete_send(sreq, status); -} - -void ucp_rndv_req_send_ats(ucp_request_t *rndv_req, ucp_request_t *rreq, - ucs_ptr_map_key_t remote_req_id, ucs_status_t status) -{ - ucp_trace_req(rndv_req, "send ats remote_req_id 0x%"PRIxPTR, remote_req_id); - UCS_PROFILE_REQUEST_EVENT(rreq, "send_ats", 0); - - rndv_req->send.lane = ucp_ep_get_am_lane(rndv_req->send.ep); - rndv_req->send.uct.func = ucp_proto_progress_am_single; - rndv_req->send.proto.am_id = UCP_AM_ID_RNDV_ATS; - rndv_req->send.proto.status = status; - rndv_req->send.proto.remote_req_id = remote_req_id; - rndv_req->send.proto.comp_cb = ucp_request_put; - - ucp_request_send(rndv_req, 0); -} - -UCS_PROFILE_FUNC_VOID(ucp_rndv_complete_rma_put_zcopy, (sreq), - ucp_request_t *sreq) -{ - ucp_trace_req(sreq, "rndv_put completed"); - UCS_PROFILE_REQUEST_EVENT(sreq, "complete_rndv_put", 0); - - ucp_request_send_buffer_dereg(sreq); - ucp_request_complete_send(sreq, UCS_OK); -} - -static void ucp_rndv_send_atp(ucp_request_t *sreq, - ucs_ptr_map_key_t remote_req_id) -{ - ucs_assertv(sreq->send.state.dt.offset == sreq->send.length, - "sreq=%p offset=%zu length=%zu", sreq, - sreq->send.state.dt.offset, sreq->send.length); - - ucp_trace_req(sreq, "send atp remote_req_id 0x%"PRIxPTR, remote_req_id); - UCS_PROFILE_REQUEST_EVENT(sreq, "send_atp", 0); - - /* destroy rkey before it gets overridden by ATP protocol data */ - ucp_rkey_destroy(sreq->send.rndv_put.rkey); - - sreq->send.lane = ucp_ep_get_am_lane(sreq->send.ep); - sreq->send.uct.func = ucp_proto_progress_am_single; - sreq->send.proto.am_id = UCP_AM_ID_RNDV_ATP; - sreq->send.proto.status = UCS_OK; - sreq->send.proto.remote_req_id = remote_req_id; - sreq->send.proto.comp_cb = ucp_rndv_complete_rma_put_zcopy; - - ucp_request_send(sreq, 0); -} - -UCS_PROFILE_FUNC_VOID(ucp_rndv_complete_frag_rma_put_zcopy, (fsreq), - ucp_request_t *fsreq) +void ucp_rndv_req_send_ack(ucp_request_t *ack_req, ucp_request_t *req, + ucs_ptr_map_key_t remote_req_id, ucs_status_t status, + ucp_am_id_t am_id, const char *ack_str) { - ucp_request_t *sreq = 
fsreq->super_req; - - sreq->send.state.dt.offset += fsreq->send.length; - - /* delete fragments send request */ - ucp_request_put(fsreq); - - /* complete send request after put completions of all fragments */ - if (sreq->send.state.dt.offset == sreq->send.length) { - ucp_rndv_complete_rma_put_zcopy(sreq); + if (am_id == UCP_AM_ID_RNDV_ATP) { + ucs_assertv(req->send.state.dt.offset == req->send.length, + "req=%p offset=%zu length=%zu", req, + req->send.state.dt.offset, req->send.length); } -} -static void ucp_rndv_send_frag_atp(ucp_request_t *fsreq, - ucs_ptr_map_key_t req_id) -{ - ucp_trace_req(fsreq, "send frag atp remote req_id 0x%"PRIxPTR, req_id); - UCS_PROFILE_REQUEST_EVENT(fsreq, "send_frag_atp", 0); - - /* destroy rkey before it gets overridden by ATP protocol data */ - ucp_rkey_destroy(fsreq->send.rndv_put.rkey); + ucp_trace_req(req, "%s remote_req_id 0x%"PRIxPTR, ack_str, remote_req_id); + UCS_PROFILE_REQUEST_EVENT(req, ack_str, 0); - fsreq->send.lane = ucp_ep_get_am_lane(fsreq->send.ep); - fsreq->send.uct.func = ucp_proto_progress_am_single; - fsreq->send.proto.am_id = UCP_AM_ID_RNDV_ATP; - fsreq->send.proto.status = UCS_OK; - fsreq->send.proto.remote_req_id = req_id; - fsreq->send.proto.comp_cb = ucp_rndv_complete_frag_rma_put_zcopy; + ack_req->send.lane = ucp_ep_get_am_lane(ack_req->send.ep); + ack_req->send.uct.func = ucp_proto_progress_am_single; + ack_req->send.proto.am_id = am_id; + ack_req->send.proto.status = status; + ack_req->send.proto.remote_req_id = remote_req_id; + ack_req->send.proto.comp_cb = ucp_request_put; + ucp_request_send_state_reset(ack_req, NULL, + UCP_REQUEST_SEND_PROTO_BCOPY_AM); - ucp_request_send(fsreq, 0); + ucp_request_send(ack_req, 0); } static UCS_F_ALWAYS_INLINE void @@ -368,32 +313,43 @@ static void ucp_rndv_zcopy_recv_req_complete(ucp_request_t *req, ucp_rndv_recv_req_complete(req, status); } -static void ucp_rndv_complete_rma_get_zcopy(ucp_request_t *rndv_req, - ucs_status_t status) +static void ucp_rndv_complete_rma_put_zcopy(ucp_request_t *sreq, int is_frag_put) { - ucp_request_t *rreq = rndv_req->super_req; + ucs_status_t status = sreq->send.state.uct_comp.status; + ucp_request_t *atp_req; - ucs_assertv(rndv_req->send.state.dt.offset == rndv_req->send.length, - "rndv_req=%p offset=%zu length=%zu", rndv_req, - rndv_req->send.state.dt.offset, rndv_req->send.length); + ucs_assertv(sreq->send.state.dt.offset <= sreq->send.length, + "sreq=%p offset=%zu length=%zu", sreq, + sreq->send.state.dt.offset, sreq->send.length); - ucp_trace_req(rndv_req, "rndv_get completed with status %s", - ucs_status_string(status)); - UCS_PROFILE_REQUEST_EVENT(rreq, "complete_rndv_get", 0); + /* complete send request after PUT completions of all fragments */ + if (sreq->send.state.dt.offset != sreq->send.length) { + return; + } - ucp_rkey_destroy(rndv_req->send.rndv_get.rkey); - ucp_request_send_buffer_dereg(rndv_req); + ucp_trace_req(sreq, "rndv_put completed with status %s", + ucs_status_string(status)); + UCS_PROFILE_REQUEST_EVENT(sreq, "complete_rndv_put", 0); - if (status == UCS_OK) { - ucp_rndv_req_send_ats(rndv_req, rreq, - rndv_req->send.rndv_get.remote_req_id, UCS_OK); + if (is_frag_put) { + ucp_send_request_id_release(sreq); } else { - /* if completing RNDV with the error, just release RNDV request */ - ucp_request_put(rndv_req); + ucp_rkey_destroy(sreq->send.rndv.rkey); + + atp_req = ucp_request_get(sreq->send.ep->worker); + if (ucs_unlikely(atp_req == NULL)) { + ucs_fatal("failed to allocate request for sending ATP"); + } + + atp_req->send.ep = 
sreq->send.ep; + atp_req->flags = 0; + ucp_rndv_req_send_ack(atp_req, sreq, sreq->send.rndv.remote_req_id, + status, UCP_AM_ID_RNDV_ATP, "send_atp"); } - ucs_assert(rreq->recv.state.dt.contig.md_map == 0); - ucp_rndv_recv_req_complete(rreq, status); + ucp_request_send_buffer_dereg(sreq); + ucs_assert(sreq->send.state.dt.dt.contig.md_map == 0); + ucp_request_complete_send(sreq, status); } static void ucp_rndv_recv_data_init(ucp_request_t *rreq, size_t size) @@ -402,6 +358,18 @@ static void ucp_rndv_recv_data_init(ucp_request_t *rreq, size_t size) rreq->recv.remaining = size; } +ucs_status_t ucp_rndv_send_rts(ucp_request_t *sreq, uct_pack_callback_t pack_cb, + size_t rts_size) +{ + size_t max_rts_size = ucp_ep_config(sreq->send.ep)->rndv.rkey_size + + rts_size; + ucs_status_t status; + + status = ucp_do_am_single(&sreq->send.uct, UCP_AM_ID_RNDV_RTS, pack_cb, + max_rts_size); + return ucp_rndv_rts_handle_status_from_pending(sreq, status); +} + static void ucp_rndv_req_send_rtr(ucp_request_t *rndv_req, ucp_request_t *rreq, ucs_ptr_map_key_t sender_req_id, size_t recv_length, size_t offset) @@ -409,196 +377,272 @@ static void ucp_rndv_req_send_rtr(ucp_request_t *rndv_req, ucp_request_t *rreq, ucp_trace_req(rndv_req, "send rtr remote sreq_id 0x%"PRIxPTR" rreq %p", sender_req_id, rreq); - rndv_req->super_req = rreq; + /* Reset super request and send state, since it may be set by the previous + * protocol (e.g. RNDV GET Zcopy) */ + ucp_request_reset_super(rndv_req); + ucp_request_send_state_reset(rndv_req, NULL, + UCP_REQUEST_SEND_PROTO_BCOPY_AM); + UCP_WORKER_STAT_RNDV(rndv_req->send.ep->worker, SEND_RTR, +1); + + rreq->recv.remote_req_id = sender_req_id; rndv_req->send.lane = ucp_ep_get_am_lane(rndv_req->send.ep); rndv_req->send.uct.func = ucp_proto_progress_rndv_rtr; - rndv_req->send.rndv_rtr.req_id = sender_req_id; rndv_req->send.rndv_rtr.length = recv_length; rndv_req->send.rndv_rtr.offset = offset; + ucp_request_set_super(rndv_req, rreq); + ucp_send_request_id_alloc(rndv_req); + ucp_request_send(rndv_req, 0); } -static ucp_lane_index_t -ucp_rndv_get_zcopy_get_lane(ucp_request_t *rndv_req, uct_rkey_t *uct_rkey) +static ucp_lane_index_t ucp_rndv_zcopy_get_lane(ucp_request_t *rndv_req, + uct_rkey_t *uct_rkey, + unsigned proto) { ucp_lane_index_t lane_idx; ucp_ep_config_t *ep_config; ucp_rkey_h rkey; uint8_t rkey_index; - if (ucs_unlikely(!rndv_req->send.rndv_get.lanes_map_all)) { + ucs_assert((proto == UCP_REQUEST_SEND_PROTO_RNDV_GET) || + (proto == UCP_REQUEST_SEND_PROTO_RNDV_PUT)); + + if (ucs_unlikely(!rndv_req->send.rndv.lanes_map_all)) { return UCP_NULL_LANE; } lane_idx = ucs_ffs64_safe(rndv_req->send.lanes_map_avail); ucs_assert(lane_idx < UCP_MAX_LANES); - rkey = rndv_req->send.rndv_get.rkey; - rkey_index = rndv_req->send.rndv_get.rkey_index[lane_idx]; + rkey = rndv_req->send.rndv.rkey; + rkey_index = rndv_req->send.rndv.rkey_index[lane_idx]; *uct_rkey = (rkey_index != UCP_NULL_RESOURCE) ? rkey->tl_rkey[rkey_index].rkey.rkey : UCT_INVALID_RKEY; ep_config = ucp_ep_config(rndv_req->send.ep); - return ep_config->rndv.get_zcopy_lanes[lane_idx]; + return (proto == UCP_REQUEST_SEND_PROTO_RNDV_GET) ? 
+ ep_config->rndv.get_zcopy.lanes[lane_idx] : + ep_config->rndv.put_zcopy.lanes[lane_idx]; } -static void ucp_rndv_get_zcopy_next_lane(ucp_request_t *rndv_req) +static void ucp_rndv_zcopy_next_lane(ucp_request_t *rndv_req) { rndv_req->send.lanes_map_avail &= rndv_req->send.lanes_map_avail - 1; if (!rndv_req->send.lanes_map_avail) { - rndv_req->send.lanes_map_avail = rndv_req->send.rndv_get.lanes_map_all; + rndv_req->send.lanes_map_avail = rndv_req->send.rndv.lanes_map_all; } } -UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_rma_get_zcopy, (self), - uct_pending_req_t *self) +static ucs_status_t +ucp_rndv_progress_rma_zcopy_common(ucp_request_t *req, ucp_lane_index_t lane, + uct_rkey_t uct_rkey, unsigned proto) { - ucp_request_t *rndv_req = ucs_container_of(self, ucp_request_t, send.uct); - ucp_ep_h ep = rndv_req->send.ep; - ucp_ep_config_t *config = ucp_ep_config(ep); const size_t max_iovcnt = 1; - uct_iface_attr_t* attrs; - ucs_status_t status; - size_t offset, length, ucp_mtu, remaining, align, chunk; + ucp_ep_h ep = req->send.ep; + ucp_ep_config_t *config = ucp_ep_config(ep); uct_iov_t iov[max_iovcnt]; size_t iovcnt; - ucp_rsc_index_t rsc_index; + uct_iface_attr_t *attrs; + ucs_status_t status; + size_t offset, length, ucp_mtu, remaining, align, chunk; ucp_dt_state_t state; - uct_rkey_t uct_rkey; + ucp_rsc_index_t rsc_index; size_t min_zcopy; size_t max_zcopy; + double scale; int pending_add_res; - ucp_lane_index_t lane; - /* Figure out which lane to use for get operation */ - rndv_req->send.lane = lane = ucp_rndv_get_zcopy_get_lane(rndv_req, &uct_rkey); - - if (lane == UCP_NULL_LANE) { - /* If can't perform get_zcopy - switch to active-message. - * NOTE: we do not register memory and do not send our keys. */ - ucp_trace_req(rndv_req, "remote memory unreachable, switch to rtr"); - ucp_rkey_destroy(rndv_req->send.rndv_get.rkey); - ucp_rndv_recv_data_init(rndv_req->super_req, - rndv_req->send.length); - /* Update statistics counters from get_zcopy to rtr */ - UCP_WORKER_STAT_RNDV(ep->worker, GET_ZCOPY, -1); - UCP_WORKER_STAT_RNDV(ep->worker, SEND_RTR, +1); - ucp_rndv_req_send_rtr(rndv_req, rndv_req->super_req, - rndv_req->send.rndv_get.remote_req_id, - rndv_req->send.length, 0ul); - return UCS_OK; - } - - ucs_assert_always(rndv_req->send.rndv_get.lanes_count > 0); - - if (!rndv_req->send.mdesc) { - status = ucp_send_request_add_reg_lane(rndv_req, lane); + ucs_assert_always(req->send.lane != UCP_NULL_LANE); + ucs_assert_always(req->send.rndv.lanes_count > 0); + + if (req->send.mdesc == NULL) { + status = ucp_send_request_add_reg_lane(req, lane); ucs_assert_always(status == UCS_OK); } rsc_index = ucp_ep_get_rsc_index(ep, lane); attrs = ucp_worker_iface_get_attr(ep->worker, rsc_index); - align = attrs->cap.get.opt_zcopy_align; - ucp_mtu = attrs->cap.get.align_mtu; - min_zcopy = config->rndv.min_get_zcopy; - max_zcopy = config->rndv.max_get_zcopy; - offset = rndv_req->send.state.dt.offset; - remaining = (uintptr_t)rndv_req->send.buffer % align; + if (proto == UCP_REQUEST_SEND_PROTO_RNDV_GET) { + align = attrs->cap.get.opt_zcopy_align; + ucp_mtu = attrs->cap.get.align_mtu; + min_zcopy = config->rndv.get_zcopy.min; + max_zcopy = config->rndv.get_zcopy.max; + scale = config->rndv.get_zcopy.scale[lane]; + } else { + align = attrs->cap.put.opt_zcopy_align; + ucp_mtu = attrs->cap.put.align_mtu; + min_zcopy = config->rndv.put_zcopy.min; + max_zcopy = config->rndv.put_zcopy.max; + scale = config->rndv.put_zcopy.scale[lane]; + } + + offset = req->send.state.dt.offset; + remaining = 
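/*
 * ucp_rndv_zcopy_get_lane() and ucp_rndv_zcopy_next_lane() above walk a lane
 * bitmap round-robin: pick the lowest set bit, clear it, and refill the
 * working mask from the full map once it empties. The same loop in
 * standalone form (invented lane map; __builtin_ctzll, a GCC/Clang builtin,
 * stands in for ucs_ffs64_safe):
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t lanes_map_all = 0x0b; /* lanes 0, 1 and 3 enabled */
    uint64_t lanes_map_avail     = lanes_map_all;
    int i, lane;

    for (i = 0; i < 6; i++) {
        lane = __builtin_ctzll(lanes_map_avail); /* lowest set bit */
        printf("fragment %d -> lane %d\n", i, lane);

        lanes_map_avail &= lanes_map_avail - 1;  /* clear that bit */
        if (lanes_map_avail == 0) {
            lanes_map_avail = lanes_map_all;     /* start a new round */
        }
    }
    return 0; /* prints lanes 0, 1, 3, 0, 1, 3 */
}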
(uintptr_t)req->send.buffer % align; - if ((offset == 0) && (remaining > 0) && (rndv_req->send.length > ucp_mtu)) { + if ((offset == 0) && (remaining > 0) && (req->send.length > ucp_mtu)) { length = ucp_mtu - remaining; } else { - chunk = ucs_align_up((size_t)(rndv_req->send.length / - rndv_req->send.rndv_get.lanes_count - * config->rndv.scale[lane]), - align); - length = ucs_min(chunk, rndv_req->send.length - offset); + chunk = ucs_align_up((size_t)(req->send.length / + req->send.rndv.lanes_count * scale), + align); + length = ucs_min(chunk, req->send.length - offset); } length = ucp_rndv_adjust_zcopy_length(min_zcopy, max_zcopy, align, - rndv_req->send.length, offset, - length); + req->send.length, offset, length); - ucs_trace_data("req %p: offset %zu remainder %zu rma-get to %p len %zu lane %d", - rndv_req, offset, remaining, - UCS_PTR_BYTE_OFFSET(rndv_req->send.buffer, offset), - length, lane); + ucs_trace_data("req %p: offset %zu remain %zu RMA-%s to %p len %zu lane %d", + req, offset, remaining, + (proto == UCP_REQUEST_SEND_PROTO_RNDV_GET) ? "GET" : "PUT", + UCS_PTR_BYTE_OFFSET(req->send.buffer, offset), length, lane); - state = rndv_req->send.state.dt; + state = req->send.state.dt; /* TODO: is this correct? memh array may skip MD's where * registration is not supported. for now SHM may avoid registration, * but it will work on single lane */ ucp_dt_iov_copy_uct(ep->worker->context, iov, &iovcnt, max_iovcnt, &state, - rndv_req->send.buffer, ucp_dt_make_contig(1), length, - ucp_ep_md_index(ep, lane), - rndv_req->send.mdesc); + req->send.buffer, ucp_dt_make_contig(1), length, + ucp_ep_md_index(ep, lane), req->send.mdesc); for (;;) { - status = uct_ep_get_zcopy(ep->uct_eps[lane], - iov, iovcnt, - rndv_req->send.rndv_get.remote_address + offset, - uct_rkey, - &rndv_req->send.state.uct_comp); - ucp_request_send_state_advance(rndv_req, &state, - UCP_REQUEST_SEND_PROTO_RNDV_GET, - status); - if (rndv_req->send.state.dt.offset == rndv_req->send.length) { - if (rndv_req->send.state.uct_comp.count == 0) { - uct_completion_update_status(&rndv_req->send.state.uct_comp, - status); - rndv_req->send.state.uct_comp.func(&rndv_req->send.state.uct_comp); + if (proto == UCP_REQUEST_SEND_PROTO_RNDV_GET) { + status = uct_ep_get_zcopy(ep->uct_eps[lane], iov, iovcnt, + req->send.rndv.remote_address + offset, + uct_rkey, &req->send.state.uct_comp); + } else { + status = uct_ep_put_zcopy(ep->uct_eps[lane], iov, iovcnt, + req->send.rndv.remote_address + offset, + uct_rkey, &req->send.state.uct_comp); + } + + ucp_request_send_state_advance(req, &state, proto, status); + if (req->send.state.dt.offset == req->send.length) { + if (req->send.state.uct_comp.count == 0) { + uct_completion_update_status(&req->send.state.uct_comp, status); + req->send.state.uct_comp.func(&req->send.state.uct_comp); } return UCS_OK; } else if (!UCS_STATUS_IS_ERR(status)) { - /* in case if not all chunks are transmitted - return in_progress - * status */ - ucp_rndv_get_zcopy_next_lane(rndv_req); + /* return in_progress status in case if not all chunks are transmitted */ + ucp_rndv_zcopy_next_lane(req); return UCS_INPROGRESS; - } else { - if (status == UCS_ERR_NO_RESOURCE) { - if (lane != rndv_req->send.pending_lane) { - /* switch to new pending lane */ - pending_add_res = ucp_request_pending_add(rndv_req, 0); - if (!pending_add_res) { - /* failed to switch req to pending queue, try again */ - continue; - } - return UCS_OK; + } else if (status == UCS_ERR_NO_RESOURCE) { + if (lane != req->send.pending_lane) { + /* switch to new pending 
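/*
 * The length computation above has two cases: a short first fragment that
 * restores buffer alignment, then per-lane chunks scaled by the lane's
 * bandwidth share. A simplified worked example with invented constants (the
 * real values come from the iface attributes and endpoint config):
 */
#include <stddef.h>
#include <stdio.h>

static size_t align_up(size_t v, size_t a)
{
    return (v + a - 1) / a * a;
}

int main(void)
{
    const size_t align   = 64;      /* cap.get.opt_zcopy_align     */
    const size_t ucp_mtu = 4096;    /* cap.get.align_mtu           */
    const size_t total   = 1 << 20; /* message size                */
    const int    lanes   = 2;       /* lanes participating         */
    const double scale   = 0.5;     /* this lane's bandwidth share */
    size_t offset = 0, misalign = 24, length;

    if ((offset == 0) && (misalign > 0) && (total > ucp_mtu)) {
        length = ucp_mtu - misalign; /* first chunk restores alignment */
    } else {
        length = align_up((size_t)(total / lanes * scale), align);
    }
    printf("first chunk: %zu bytes\n", length); /* 4096 - 24 = 4072 */
    return 0;
}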
lane */ + pending_add_res = ucp_request_pending_add(req, 0); + if (!pending_add_res) { + /* failed to switch req to pending queue, try again */ + continue; } + return UCS_OK; } - return status; + return UCS_ERR_NO_RESOURCE; + } else { + ucp_request_send_state_ff(req, status); + return UCS_OK; } } } -UCS_PROFILE_FUNC_VOID(ucp_rndv_get_completion, (self), - uct_completion_t *self) +UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_rma_get_zcopy, (self), + uct_pending_req_t *self) +{ + ucp_request_t *rndv_req = ucs_container_of(self, ucp_request_t, send.uct); + uct_rkey_t uct_rkey; + + /* Figure out which lane to use for get operation */ + rndv_req->send.lane = + ucp_rndv_zcopy_get_lane(rndv_req, &uct_rkey, + UCP_REQUEST_SEND_PROTO_RNDV_GET); + if (rndv_req->send.lane != UCP_NULL_LANE) { + return ucp_rndv_progress_rma_zcopy_common( + rndv_req, rndv_req->send.lane, uct_rkey, + UCP_REQUEST_SEND_PROTO_RNDV_GET); + } + + /* If can't perform get_zcopy - switch to active-message. + * NOTE: we do not register memory and do not send our keys. */ + ucp_trace_req(rndv_req, "remote memory unreachable, switch to rtr"); + ucp_rkey_destroy(rndv_req->send.rndv.rkey); + ucp_rndv_recv_data_init(ucp_request_get_super(rndv_req), + rndv_req->send.length); + /* Remove statistics related to get_zcopy */ + UCP_WORKER_STAT_RNDV(rndv_req->send.ep->worker, GET_ZCOPY, -1); + ucp_rndv_req_send_rtr(rndv_req, ucp_request_get_super(rndv_req), + rndv_req->send.rndv.remote_req_id, + rndv_req->send.length, 0ul); + return UCS_OK; +} + +UCS_PROFILE_FUNC_VOID(ucp_rndv_get_completion, (self), uct_completion_t *self) { ucp_request_t *rndv_req = ucs_container_of(self, ucp_request_t, send.state.uct_comp); + ucp_ep_h UCS_V_UNUSED ep; + ucp_request_t *rreq; + ucs_status_t status; + + if (rndv_req->send.state.dt.offset != rndv_req->send.length) { + return; + } + + rreq = ucp_request_get_super(rndv_req); + status = rndv_req->send.state.uct_comp.status; + ep = rndv_req->send.ep; + + ucs_assertv(rndv_req->send.state.dt.offset == rndv_req->send.length, + "rndv_req=%p offset=%zu length=%zu", rndv_req, + rndv_req->send.state.dt.offset, rndv_req->send.length); + + ucp_trace_req(rndv_req, "rndv_get completed with status %s", + ucs_status_string(status)); + UCS_PROFILE_REQUEST_EVENT(rreq, "complete_rndv_get", 0); + + ucp_rkey_destroy(rndv_req->send.rndv.rkey); + ucp_request_send_buffer_dereg(rndv_req); - if (rndv_req->send.state.dt.offset == rndv_req->send.length) { - ucp_rndv_complete_rma_get_zcopy(rndv_req, self->status); + if (status == UCS_OK) { + ucp_rndv_req_send_ack(rndv_req, rreq, rndv_req->send.rndv.remote_req_id, + UCS_OK, UCP_AM_ID_RNDV_ATS, "send_ats"); + } else { + /* if completing RNDV with the error, just release RNDV request */ + ucp_request_put(rndv_req); } + + ucs_assert((rreq->recv.state.dt.contig.md_map == 0) || + /* Request send state fast-forward after failure detection, i.e. + * it is called from ucp_request_send_state_ff() function. 
+ * md_map can be NULL, if GET Zcopy was started, but no fragments + * were really sent yet */ + ((ep->flags & UCP_EP_FLAG_FAILED) && (status != UCS_OK))); + ucp_rndv_recv_req_complete(rreq, status); } -static void ucp_rndv_put_completion(uct_completion_t *self) +UCS_PROFILE_FUNC_VOID(ucp_rndv_put_completion, (self), uct_completion_t *self) { ucp_request_t *sreq = ucs_container_of(self, ucp_request_t, send.state.uct_comp); + ucp_rndv_complete_rma_put_zcopy(sreq, 0); +} - if (sreq->send.state.dt.offset == sreq->send.length) { - ucp_rndv_send_atp(sreq, sreq->send.rndv_put.rreq_remote_id); - } +static void ucp_rndv_req_init_lanes(ucp_request_t *req, + ucp_lane_map_t lanes_map, + uint8_t lanes_count) +{ + req->send.lanes_map_avail = lanes_map; + req->send.rndv.lanes_map_all = lanes_map; + req->send.rndv.lanes_count = lanes_count; } -static void ucp_rndv_req_init_get_zcopy_lane_map(ucp_request_t *rndv_req) +static void ucp_rndv_req_init_zcopy_lane_map(ucp_request_t *rndv_req, + ucs_memory_type_t mem_type, + unsigned proto) { ucp_ep_h ep = rndv_req->send.ep; ucp_ep_config_t *ep_config = ucp_ep_config(ep); ucp_context_h context = ep->worker->context; - ucs_memory_type_t mem_type = rndv_req->send.mem_type; - ucp_rkey_h rkey = rndv_req->send.rndv_get.rkey; + ucp_rkey_h rkey = rndv_req->send.rndv.rkey; + ucp_lane_index_t *lanes; ucp_lane_map_t lane_map; ucp_lane_index_t lane, lane_idx; ucp_md_index_t md_index; @@ -609,10 +653,17 @@ static void ucp_rndv_req_init_get_zcopy_lane_map(ucp_request_t *rndv_req) double max_lane_bw, lane_bw; int i; + ucs_assert((proto == UCP_REQUEST_SEND_PROTO_RNDV_GET) || + (proto == UCP_REQUEST_SEND_PROTO_RNDV_PUT)); + + lanes = (proto == UCP_REQUEST_SEND_PROTO_RNDV_GET) ? + ep_config->rndv.get_zcopy.lanes : + ep_config->rndv.put_zcopy.lanes; + max_lane_bw = 0; lane_map = 0; for (i = 0; i < UCP_MAX_LANES; i++) { - lane = ep_config->rndv.get_zcopy_lanes[i]; + lane = lanes[i]; if (lane == UCP_NULL_LANE) { break; /* no more lanes */ } @@ -628,9 +679,9 @@ static void ucp_rndv_req_init_get_zcopy_lane_map(ucp_request_t *rndv_req) /* Lane does not need rkey, can use the lane with invalid rkey */ if (!rkey || ((md_attr->cap.access_mem_types & UCS_BIT(mem_type)) && (mem_type == rkey->mem_type))) { - rndv_req->send.rndv_get.rkey_index[i] = UCP_NULL_RESOURCE; - lane_map |= UCS_BIT(i); - max_lane_bw = ucs_max(max_lane_bw, lane_bw); + rndv_req->send.rndv.rkey_index[i] = UCP_NULL_RESOURCE; + lane_map |= UCS_BIT(i); + max_lane_bw = ucs_max(max_lane_bw, lane_bw); continue; } } @@ -643,10 +694,10 @@ static void ucp_rndv_req_init_get_zcopy_lane_map(ucp_request_t *rndv_req) dst_md_index = ep_config->key.lanes[lane].dst_md_index; if (rkey && ucs_likely(rkey->md_map & UCS_BIT(dst_md_index))) { /* Return first matching lane */ - rndv_req->send.rndv_get.rkey_index[i] = ucs_bitmap2idx(rkey->md_map, - dst_md_index); - lane_map |= UCS_BIT(i); - max_lane_bw = ucs_max(max_lane_bw, lane_bw); + rndv_req->send.rndv.rkey_index[i] = ucs_bitmap2idx(rkey->md_map, + dst_md_index); + lane_map |= UCS_BIT(i); + max_lane_bw = ucs_max(max_lane_bw, lane_bw); } } @@ -654,22 +705,79 @@ static void ucp_rndv_req_init_get_zcopy_lane_map(ucp_request_t *rndv_req) /* remove lanes if bandwidth is too less compare to best lane */ ucs_for_each_bit(lane_idx, lane_map) { ucs_assert(lane_idx < UCP_MAX_LANES); - lane = ep_config->rndv.get_zcopy_lanes[lane_idx]; + lane = lanes[lane_idx]; rsc_index = ep_config->key.lanes[lane].rsc_index; iface_attr = ucp_worker_iface_get_attr(ep->worker, rsc_index); lane_bw = 
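/*
 * Just below, a lane is dropped from the map when its bandwidth falls under
 * 1/multi_lane_max_ratio of the fastest selected lane. Worked numbers: with
 * a ratio limit of 4, a 10 Gb/s lane next to a 25 Gb/s one survives
 * (0.40 >= 0.25) while a 5 Gb/s lane is removed (0.20 < 0.25). The same
 * filter standalone, with invented bandwidths:
 */
#include <stdio.h>

int main(void)
{
    const double bw[]      = {25e9, 10e9, 5e9}; /* per-lane bandwidth, B/s */
    const double max_ratio = 4.0;               /* multi_lane_max_ratio    */
    const double max_bw    = 25e9;              /* best selected lane      */
    int i;

    for (i = 0; i < 3; i++) {
        if ((bw[i] / max_bw) < (1.0 / max_ratio)) {
            printf("lane %d dropped (%.2f of best)\n", i, bw[i] / max_bw);
        } else {
            printf("lane %d kept (%.2f of best)\n", i, bw[i] / max_bw);
        }
    }
    return 0;
}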
ucp_tl_iface_bandwidth(context, &iface_attr->bandwidth); - if ((lane_bw/max_lane_bw) < + if ((lane_bw / max_lane_bw) < (1. / context->config.ext.multi_lane_max_ratio)) { - lane_map &= ~UCS_BIT(lane_idx); - rndv_req->send.rndv_get.rkey_index[lane_idx] = UCP_NULL_RESOURCE; + lane_map &= ~UCS_BIT(lane_idx); + rndv_req->send.rndv.rkey_index[lane_idx] = UCP_NULL_RESOURCE; } } } - rndv_req->send.lanes_map_avail = - rndv_req->send.rndv_get.lanes_map_all = lane_map; - rndv_req->send.rndv_get.lanes_count = ucs_popcount(lane_map); + ucp_rndv_req_init_lanes(rndv_req, lane_map, ucs_popcount(lane_map)); +} + +static void ucp_rndv_req_init(ucp_request_t *req, ucp_request_t *super_req, + ucp_lane_map_t lanes_map, uint8_t lanes_count, + ucp_rkey_h rkey, uint64_t remote_address, + uint8_t *rkey_index) +{ + ucp_lane_index_t i; + + req->send.rndv.rkey = rkey; + req->send.rndv.remote_address = remote_address; + req->send.pending_lane = UCP_NULL_LANE; + + ucp_request_set_super(req, super_req); + ucp_rndv_req_init_lanes(req, lanes_map, lanes_count); + + if (rkey_index != NULL) { + memcpy(req->send.rndv.rkey_index, rkey_index, + sizeof(*req->send.rndv.rkey_index) * UCP_MAX_LANES); + } else { + for (i = 0; i < UCP_MAX_LANES; i++) { + req->send.rndv.rkey_index[i] = UCP_NULL_RESOURCE; + } + } +} + +static void +ucp_rndv_req_init_remote_from_super_req(ucp_request_t *req, + ucp_request_t *super_req, + size_t remote_address_offset) +{ + req->flags = 0; + req->send.ep = super_req->send.ep; + + ucp_rndv_req_init(req, super_req, super_req->send.rndv.lanes_map_all, + super_req->send.rndv.lanes_count, + super_req->send.rndv.rkey, + super_req->send.rndv.remote_address + + remote_address_offset, + super_req->send.rndv.rkey_index); +} + +static void ucp_rndv_req_init_from_super_req(ucp_request_t *req, + ucp_request_t *super_req, + size_t length, + size_t send_buffer_offset, + size_t remote_address_offset, + ucs_ptr_map_key_t remote_req_id) +{ + ucs_assert(length > 0); + + req->send.length = length; + req->send.buffer = UCS_PTR_BYTE_OFFSET(super_req->send.buffer, + send_buffer_offset); + + ucp_rndv_req_init_remote_from_super_req(req, super_req, + remote_address_offset); + + req->send.rndv.remote_req_id = remote_req_id; } static ucs_status_t ucp_rndv_req_send_rma_get(ucp_request_t *rndv_req, @@ -683,17 +791,18 @@ static ucs_status_t ucp_rndv_req_send_rma_get(ucp_request_t *rndv_req, ucp_trace_req(rndv_req, "start rma_get rreq %p", rreq); - rndv_req->super_req = rreq; - rndv_req->send.uct.func = ucp_rndv_progress_rma_get_zcopy; - rndv_req->send.buffer = rreq->recv.buffer; - rndv_req->send.mem_type = rreq->recv.mem_type; - rndv_req->send.datatype = ucp_dt_make_contig(1); - rndv_req->send.length = rndv_rts_hdr->size; - rndv_req->send.rndv_get.remote_req_id = rndv_rts_hdr->sreq.req_id; - rndv_req->send.rndv_get.remote_address = rndv_rts_hdr->address; - rndv_req->send.datatype = rreq->recv.datatype; - - status = ucp_ep_rkey_unpack(ep, rkey_buf, &rndv_req->send.rndv_get.rkey); + rndv_req->send.uct.func = ucp_rndv_progress_rma_get_zcopy; + rndv_req->send.buffer = rreq->recv.buffer; + rndv_req->send.mem_type = rreq->recv.mem_type; + rndv_req->send.datatype = ucp_dt_make_contig(1); + rndv_req->send.length = rndv_rts_hdr->size; + rndv_req->send.rndv.remote_req_id = rndv_rts_hdr->sreq.req_id; + rndv_req->send.rndv.remote_address = rndv_rts_hdr->address; + rndv_req->send.pending_lane = UCP_NULL_LANE; + + ucp_request_set_super(rndv_req, rreq); + + status = ucp_ep_rkey_unpack(ep, rkey_buf, &rndv_req->send.rndv.rkey); if (status != UCS_OK) { 
ucs_fatal("failed to unpack rendezvous remote key received from %s: %s", ucp_ep_peer_name(ep), ucs_status_string(status)); @@ -703,9 +812,12 @@ static ucs_status_t ucp_rndv_req_send_rma_get(ucp_request_t *rndv_req, ucp_request_send_state_reset(rndv_req, ucp_rndv_get_completion, UCP_REQUEST_SEND_PROTO_RNDV_GET); - ucp_rndv_req_init_get_zcopy_lane_map(rndv_req); + ucp_rndv_req_init_zcopy_lane_map(rndv_req, rndv_req->send.mem_type, + UCP_REQUEST_SEND_PROTO_RNDV_GET); - rndv_req->send.lane = ucp_rndv_get_zcopy_get_lane(rndv_req, &uct_rkey); + rndv_req->send.lane = + ucp_rndv_zcopy_get_lane(rndv_req, &uct_rkey, + UCP_REQUEST_SEND_PROTO_RNDV_GET); if (rndv_req->send.lane == UCP_NULL_LANE) { return UCS_ERR_UNREACHABLE; } @@ -719,43 +831,49 @@ static ucs_status_t ucp_rndv_req_send_rma_get(ucp_request_t *rndv_req, UCS_PROFILE_FUNC_VOID(ucp_rndv_recv_frag_put_completion, (self), uct_completion_t *self) { - ucp_request_t *freq = ucs_container_of(self, ucp_request_t, - send.state.uct_comp); - ucp_worker_h worker = freq->send.ep->worker; - ucs_ptr_map_key_t rreq_remote_id = freq->send.rndv_put.rreq_remote_id; - int is_put_proto = (rreq_remote_id == UCP_REQUEST_ID_INVALID); - ucp_request_t *req = freq->super_req; + ucp_request_t *freq = ucs_container_of(self, ucp_request_t, + send.state.uct_comp); + /* if the super request is a receive request, the RNDV scheme in use is the + * PUT pipeline protocol, otherwise it is the GET pipeline protocol (where + * the super request is an intermediate RNDV request) */ + int is_put_proto = ucp_request_get_super(freq)->flags & + (UCP_REQUEST_FLAG_RECV_TAG | + UCP_REQUEST_FLAG_RECV_AM); + ucp_request_t *rreq; ucp_request_t *rndv_req; - ucs_trace_req("freq:%p: recv_frag_put done. rreq:%p ", freq, req); - /* release memory descriptor */ - ucs_mpool_put_inline((void *)freq->send.mdesc); + ucs_mpool_put_inline((void*)freq->send.mdesc); /* rndv_req is NULL in case of put protocol */ if (!is_put_proto) { - rndv_req = ucp_worker_get_request_by_id(worker, rreq_remote_id); + rndv_req = ucp_request_get_super(freq); + rreq = ucp_request_get_super(rndv_req); + + ucs_trace_req("freq:%p: recv_frag_put done, rndv_req:%p rreq:%p ", freq, + rndv_req, rreq); + /* pipeline recv get protocol */ rndv_req->send.state.dt.offset += freq->send.length; /* send ATS for fragment get rndv completion */ if (rndv_req->send.length == rndv_req->send.state.dt.offset) { - ucp_rkey_destroy(rndv_req->send.rndv_get.rkey); - ucp_rndv_req_send_ats(rndv_req, req, - rndv_req->send.rndv_get.remote_req_id, - UCS_OK); + ucp_rkey_destroy(rndv_req->send.rndv.rkey); + ucp_rndv_req_send_ack(rndv_req, rreq, + rndv_req->send.rndv.remote_req_id, + UCS_OK, UCP_AM_ID_RNDV_ATS, "send_ats"); } + } else { + rreq = ucp_request_get_super(freq); + ucs_trace_req("freq:%p: recv_frag_put done, rreq:%p ", freq, rreq); } - ucs_assertv(req->recv.remaining >= freq->send.length, - "req->recv.remaining %zu, freq->send.length %zu", - req->recv.remaining, freq->send.length); - req->recv.remaining -= freq->send.length; - if (req->recv.remaining == 0) { - ucp_request_complete_tag_recv(req, UCS_OK); - if (!is_put_proto) { - ucp_worker_del_request_id(worker, rreq_remote_id); - } + ucs_assertv(rreq->recv.remaining >= freq->send.length, - "rreq->recv.remaining %zu, freq->send.length %zu", - rreq->recv.remaining, freq->send.length); hmm
worker, ucp_request_t *freq, int rn ucp_request_send_state_init(freq, ucp_dt_make_contig(1), 0); ucp_request_send_state_reset(freq, comp_cb, rndv_op); - freq->send.buffer = mdesc + 1; - freq->send.length = length; - freq->send.datatype = ucp_dt_make_contig(1); - freq->send.mem_type = mem_type; - freq->send.mdesc = mdesc; - freq->send.uct.func = uct_func; + freq->flags = 0; + freq->send.buffer = mdesc + 1; + freq->send.length = length; + freq->send.datatype = ucp_dt_make_contig(1); + freq->send.mem_type = mem_type; + freq->send.mdesc = mdesc; + freq->send.uct.func = uct_func; + freq->send.pending_lane = UCP_NULL_LANE; if (mem_type != UCS_MEMORY_TYPE_HOST) { mem_type_ep = worker->mem_type_ep[mem_type]; @@ -795,46 +915,81 @@ ucp_rndv_init_mem_type_frag_req(ucp_worker_h worker, ucp_request_t *freq, int rn } static void -ucp_rndv_recv_frag_put_mem_type(ucp_request_t *rreq, ucp_request_t *rndv_req, - ucp_request_t *freq, ucp_mem_desc_t *mdesc, - size_t length, size_t offset) +ucp_rndv_recv_frag_put_mem_type(ucp_request_t *rreq, ucp_request_t *freq, + ucp_mem_desc_t *mdesc, size_t length, + size_t offset) { - ucs_assert_always(!UCP_MEM_IS_ACCESSIBLE_FROM_CPU(rreq->recv.mem_type)); + ucs_assert_always(!UCP_MEM_IS_HOST(rreq->recv.mem_type)); /* PUT on memtype endpoint to stage from * frag recv buffer to memtype recv buffer */ - ucp_rndv_init_mem_type_frag_req(rreq->recv.worker, freq, UCP_REQUEST_SEND_PROTO_RNDV_PUT, - ucp_rndv_recv_frag_put_completion, mdesc, rreq->recv.mem_type, - length, ucp_rndv_progress_rma_put_zcopy); + ucp_rndv_init_mem_type_frag_req(rreq->recv.worker, freq, + UCP_REQUEST_SEND_PROTO_RNDV_PUT, + ucp_rndv_recv_frag_put_completion, mdesc, + rreq->recv.mem_type, length, + ucp_rndv_progress_rma_put_zcopy); - freq->super_req = rreq; - freq->send.rndv_put.rkey = NULL; - freq->send.rndv_put.remote_address = (uintptr_t)rreq->recv.buffer + offset; - if (rndv_req == NULL) { - freq->send.rndv_put.rreq_remote_id = UCP_REQUEST_ID_INVALID; - } else { - freq->send.rndv_put.rreq_remote_id = - ucp_worker_get_request_id(rreq->recv.worker, rndv_req, - ucp_ep_use_indirect_id(freq->send.ep)); - } + ucp_rndv_req_init(freq, rreq, 0, 0, NULL, + (uintptr_t)UCS_PTR_BYTE_OFFSET(rreq->recv.buffer, offset), + NULL); + + ucp_rndv_req_init_zcopy_lane_map(freq, freq->send.mem_type, + UCP_REQUEST_SEND_PROTO_RNDV_PUT); ucp_request_send(freq, 0); } static void -ucp_rndv_send_frag_get_mem_type(ucp_request_t *sreq, ucs_ptr_map_key_t rreq_id, - size_t length, uint64_t remote_address, - ucs_memory_type_t remote_mem_type, ucp_rkey_h rkey, - uint8_t *rkey_index, ucp_lane_map_t lanes_map, +ucp_rndv_send_frag_update_get_rkey(ucp_worker_h worker, ucp_request_t *freq, + ucp_mem_desc_t *mdesc, + ucs_memory_type_t mem_type) +{ + ucp_rkey_h *rkey_p = &freq->send.rndv.rkey; + uint8_t *rkey_index = freq->send.rndv.rkey_index; + void *rkey_buffer; + size_t rkey_size; + ucs_status_t status; + ucp_ep_h mem_type_ep; + ucp_md_index_t md_index; + uct_md_attr_t *md_attr; + ucp_lane_index_t mem_type_rma_lane; + + mem_type_ep = worker->mem_type_ep[mem_type]; + mem_type_rma_lane = ucp_ep_config(mem_type_ep)->key.rma_bw_lanes[0]; + ucs_assert(mem_type_rma_lane != UCP_NULL_LANE); + + md_index = ucp_ep_md_index(mem_type_ep, mem_type_rma_lane); + md_attr = &mem_type_ep->worker->context->tl_mds[md_index].attr; + + if (!(md_attr->cap.flags & UCT_MD_FLAG_NEED_RKEY)) { + return; + } + + status = ucp_rkey_pack(mem_type_ep->worker->context, mdesc->memh, + &rkey_buffer, &rkey_size); + ucs_assert_always(status == UCS_OK); + + status = 
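/*
 * The pack/unpack pair here is the usual way to let another endpoint (the
 * memtype ep) perform RMA on a locally registered fragment: serialize the
 * memh into an rkey buffer, unpack it on the endpoint that issues the
 * transfer, then release the buffer. A loose standalone imitation of that
 * flow; the mini_* types are placeholders, the real calls being
 * ucp_rkey_pack(), ucp_ep_rkey_unpack() and ucp_rkey_buffer_release():
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct { char key[16]; } mini_memh_t; /* local registration */
typedef struct { char key[16]; } mini_rkey_t; /* remote-key handle  */

/* "pack": serialize the registration into a transportable buffer */
static void *mini_rkey_pack(const mini_memh_t *memh, size_t *size_p)
{
    void *buf = malloc(sizeof(memh->key));
    memcpy(buf, memh->key, sizeof(memh->key));
    *size_p = sizeof(memh->key);
    return buf;
}

/* "unpack": rebuild a usable key on the endpoint doing the RMA */
static mini_rkey_t mini_rkey_unpack(const void *buf, size_t size)
{
    mini_rkey_t rkey;
    memcpy(rkey.key, buf, (size < sizeof(rkey.key)) ? size : sizeof(rkey.key));
    return rkey;
}

int main(void)
{
    mini_memh_t memh = {"frag-memh"};
    size_t size;
    void *buf = mini_rkey_pack(&memh, &size);
    mini_rkey_t rkey = mini_rkey_unpack(buf, size);

    free(buf); /* ucp_rkey_buffer_release() at this point in the real flow */
    printf("rkey rebuilt: %s\n", rkey.key);
    return 0;
}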
ucp_ep_rkey_unpack(mem_type_ep, rkey_buffer, rkey_p); + ucs_assert_always(status == UCS_OK); + ucp_rkey_buffer_release(rkey_buffer); + + memset(rkey_index, 0, UCP_MAX_LANES * sizeof(uint8_t)); +} + +static void +ucp_rndv_send_frag_get_mem_type(ucp_request_t *sreq, size_t length, + uint64_t remote_address, + ucs_memory_type_t remote_mem_type, + ucp_rkey_h rkey, uint8_t *rkey_index, + ucp_lane_map_t lanes_map, int update_get_rkey, uct_completion_callback_t comp_cb) { ucp_worker_h worker = sreq->send.ep->worker; ucp_request_t *freq; ucp_mem_desc_t *mdesc; - ucp_lane_index_t i; /* GET fragment to stage buffer */ @@ -853,18 +1008,11 @@ ucp_rndv_send_frag_get_mem_type(ucp_request_t *sreq, ucs_ptr_map_key_t rreq_id, ucp_rndv_init_mem_type_frag_req(worker, freq, UCP_REQUEST_SEND_PROTO_RNDV_GET, comp_cb, mdesc, remote_mem_type, length, ucp_rndv_progress_rma_get_zcopy); + ucp_rndv_req_init(freq, sreq, lanes_map, ucs_popcount(lanes_map), rkey, + remote_address, rkey_index); - freq->super_req = sreq; - freq->send.lanes_map_avail = - freq->send.rndv_get.lanes_map_all = lanes_map; - freq->send.rndv_get.lanes_count = ucs_popcount(lanes_map); - freq->send.rndv_get.rkey = rkey; - freq->send.rndv_get.remote_address = remote_address; - freq->send.rndv_get.remote_req_id = rreq_id; - - for (i = 0; i < UCP_MAX_LANES; i++) { - freq->send.rndv_get.rkey_index[i] = rkey_index ? rkey_index[i] - : UCP_NULL_RESOURCE; + if (update_get_rkey) { + ucp_rndv_send_frag_update_get_rkey(worker, freq, mdesc, remote_mem_type); } freq->status = UCS_INPROGRESS; @@ -874,22 +1022,29 @@ ucp_rndv_send_frag_get_mem_type(ucp_request_t *sreq, ucs_ptr_map_key_t rreq_id, UCS_PROFILE_FUNC_VOID(ucp_rndv_recv_frag_get_completion, (self), uct_completion_t *self) { - ucp_request_t *freq = ucs_container_of(self, ucp_request_t, - send.state.uct_comp); - ucp_request_t *rndv_req = freq->super_req; - ucp_request_t *rreq = rndv_req->super_req; + ucp_request_t *freq = ucs_container_of(self, ucp_request_t, + send.state.uct_comp); + ucp_request_t *rndv_req, *rreq; + uint64_t offset; - ucs_trace_req("freq:%p: recv_frag_get done. rreq:%p length:%ld" + if (freq->send.state.dt.offset != freq->send.length) { + return; + } + + rndv_req = ucp_request_get_super(freq); + rreq = ucp_request_get_super(rndv_req); + offset = freq->send.rndv.remote_address - + rndv_req->send.rndv.remote_address; + + ucs_trace_req("freq:%p: recv_frag_get done. 
rreq:%p length:%"PRIu64 " offset:%"PRIu64, - freq, rndv_req, freq->send.length, - freq->send.rndv_get.remote_address - rndv_req->send.rndv_get.remote_address); + freq, rndv_req, freq->send.length, offset); /* fragment GET completed from remote to staging buffer, issue PUT from * staging buffer to recv buffer */ - ucp_rndv_recv_frag_put_mem_type(rreq, rndv_req, freq, - (ucp_mem_desc_t *)freq->send.buffer -1, - freq->send.length, (freq->send.rndv_get.remote_address - - rndv_req->send.rndv_get.remote_address)); + ucp_rndv_recv_frag_put_mem_type(rreq, freq, + (ucp_mem_desc_t*)freq->send.buffer - 1, + freq->send.length, offset); } static ucs_status_t @@ -907,16 +1062,18 @@ ucp_rndv_recv_start_get_pipeline(ucp_worker_h worker, ucp_request_t *rndv_req, size_t max_frag_size, offset, length; size_t min_zcopy, max_zcopy; - min_zcopy = config->rndv.min_get_zcopy; - max_zcopy = config->rndv.max_get_zcopy; - max_frag_size = ucs_min(context->config.ext.rndv_frag_size, - max_zcopy); - rndv_req->super_req = rreq; - rndv_req->send.rndv_get.remote_req_id = remote_req_id; - rndv_req->send.rndv_get.remote_address = remote_address - base_offset; - rndv_req->send.length = size; - rndv_req->send.state.dt.offset = 0; - rndv_req->send.mem_type = rreq->recv.mem_type; + min_zcopy = config->rndv.get_zcopy.min; + max_zcopy = config->rndv.get_zcopy.max; + max_frag_size = ucs_min(context->config.ext.rndv_frag_size, + max_zcopy); + rndv_req->send.rndv.remote_req_id = remote_req_id; + rndv_req->send.rndv.remote_address = remote_address - base_offset; + rndv_req->send.length = size; + rndv_req->send.state.dt.offset = 0; + rndv_req->send.mem_type = rreq->recv.mem_type; + rndv_req->send.pending_lane = UCP_NULL_LANE; + + ucp_request_set_super(rndv_req, rreq); /* Protocol: * Step 1: GET remote fragment into HOST fragment buffer @@ -925,13 +1082,14 @@ ucp_rndv_recv_start_get_pipeline(ucp_worker_h worker, ucp_request_t *rndv_req, */ status = ucp_ep_rkey_unpack(rndv_req->send.ep, rkey_buffer, - &rndv_req->send.rndv_get.rkey); + &rndv_req->send.rndv.rkey); if (ucs_unlikely(status != UCS_OK)) { ucs_fatal("failed to unpack rendezvous remote key received from %s: %s", ucp_ep_peer_name(rndv_req->send.ep), ucs_status_string(status)); } - ucp_rndv_req_init_get_zcopy_lane_map(rndv_req); + ucp_rndv_req_init_zcopy_lane_map(rndv_req, rndv_req->send.mem_type, + UCP_REQUEST_SEND_PROTO_RNDV_GET); offset = 0; while (offset != size) { @@ -939,11 +1097,12 @@ ucp_rndv_recv_start_get_pipeline(ucp_worker_h worker, ucp_request_t *rndv_req, size, offset, size - offset); /* GET remote fragment into HOST fragment buffer */ - ucp_rndv_send_frag_get_mem_type(rndv_req, remote_req_id, length, - remote_address + offset, UCS_MEMORY_TYPE_HOST, - rndv_req->send.rndv_get.rkey, - rndv_req->send.rndv_get.rkey_index, - rndv_req->send.rndv_get.lanes_map_all, + ucp_rndv_send_frag_get_mem_type(rndv_req, length, + remote_address + offset, + UCS_MEMORY_TYPE_HOST, + rndv_req->send.rndv.rkey, + rndv_req->send.rndv.rkey_index, + rndv_req->send.rndv.lanes_map_all, 0, ucp_rndv_recv_frag_get_completion); offset += length; @@ -999,8 +1158,9 @@ static void ucp_rndv_send_frag_rtr(ucp_worker_h worker, ucp_request_t *rndv_req, freq->recv.length = frag_size; freq->recv.state.dt.contig.md_map = 0; freq->recv.frag.offset = offset; - freq->super_req = rreq; - freq->flags |= UCP_REQUEST_FLAG_RNDV_FRAG; + freq->flags = UCP_REQUEST_FLAG_RNDV_FRAG; + + ucp_request_set_super(freq, rreq); memh_index = 0; ucs_for_each_bit(md_index, @@ -1011,6 +1171,7 @@ static void 
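/*
 * The loop above slices the rendezvous payload into staging-buffer sized
 * pieces, one fragment GET per iteration, with each fragment's completion
 * later chaining the PUT into the user buffer. The slicing alone, with
 * invented sizes (the real cap is the minimum of rndv_frag_size and the
 * transport maximum):
 */
#include <stddef.h>
#include <stdio.h>

int main(void)
{
    const size_t frag_size = 512 * 1024;  /* staging fragment capacity */
    const size_t size      = 1300 * 1024; /* rendezvous payload        */
    size_t offset = 0, length;

    while (offset != size) {
        length = (size - offset < frag_size) ? (size - offset) : frag_size;
        printf("GET frag: offset %zu length %zu\n", offset, length);
        offset += length; /* three fragments: 512K, 512K, 276K */
    }
    return 0;
}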
ucp_rndv_send_frag_rtr(ucp_worker_h worker, ucp_request_t *rndv_req, } ucs_assert(memh_index <= UCP_MAX_OP_MDS); + frndv_req->flags = 0; frndv_req->send.ep = rndv_req->send.ep; frndv_req->send.pending_lane = UCP_NULL_LANE; @@ -1047,7 +1208,7 @@ static unsigned ucp_rndv_progress_rkey_ptr(void *arg) ucp_request_t *rndv_req = ucs_queue_head_elem_non_empty(&worker->rkey_ptr_reqs, ucp_request_t, send.rkey_ptr.queue_elem); - ucp_request_t *rreq = rndv_req->super_req; + ucp_request_t *rreq = ucp_request_get_super(rndv_req); size_t seg_size = ucs_min(worker->context->config.ext.rkey_ptr_seg_size, rndv_req->send.length - rreq->recv.state.offset); ucs_status_t status; @@ -1065,8 +1226,9 @@ static unsigned ucp_rndv_progress_rkey_ptr(void *arg) ucs_queue_pull_non_empty(&worker->rkey_ptr_reqs); ucp_rndv_recv_req_complete(rreq, status); ucp_rkey_destroy(rndv_req->send.rkey_ptr.rkey); - ucp_rndv_req_send_ats(rndv_req, rreq, - rndv_req->send.rkey_ptr.req_id, status); + ucp_rndv_req_send_ack(rndv_req, rreq, + rndv_req->send.rkey_ptr.remote_req_id, status, + UCP_AM_ID_RNDV_ATS, "send_ats"); if (ucs_queue_is_empty(&worker->rkey_ptr_reqs)) { uct_worker_progress_unregister_safe(worker->uct, &worker->rkey_ptr_cb_id); @@ -1123,9 +1285,10 @@ static void ucp_rndv_do_rkey_ptr(ucp_request_t *rndv_req, ucp_request_t *rreq, &rkey->tl_rkey[rkey_index].rkey, rndv_rts_hdr->address, &local_ptr); if (status != UCS_OK) { - ucp_request_complete_tag_recv(rreq, status); + ucp_rndv_recv_req_complete(rreq, status); ucp_rkey_destroy(rkey); - ucp_rndv_req_send_ats(rndv_req, rreq, rndv_rts_hdr->sreq.req_id, status); + ucp_rndv_req_send_ack(rndv_req, rreq, rndv_rts_hdr->sreq.req_id, status, + UCP_AM_ID_RNDV_ATS, "send_ats"); return; } @@ -1133,12 +1296,12 @@ static void ucp_rndv_do_rkey_ptr(ucp_request_t *rndv_req, ucp_request_t *rreq, ucp_trace_req(rndv_req, "obtained a local pointer to remote buffer: %p", local_ptr); - rndv_req->super_req = rreq; - rndv_req->send.buffer = local_ptr; - rndv_req->send.length = rndv_rts_hdr->size; - rndv_req->send.rkey_ptr.rkey = rkey; - rndv_req->send.rkey_ptr.req_id = rndv_rts_hdr->sreq.req_id; + rndv_req->send.buffer = local_ptr; + rndv_req->send.length = rndv_rts_hdr->size; + rndv_req->send.rkey_ptr.rkey = rkey; + rndv_req->send.rkey_ptr.remote_req_id = rndv_rts_hdr->sreq.req_id; + ucp_request_set_super(rndv_req, rreq); UCP_WORKER_STAT_RNDV(ep->worker, RKEY_PTR, 1); ucs_queue_push(&worker->rkey_ptr_reqs, &rndv_req->send.rkey_ptr.queue_elem); @@ -1171,25 +1334,32 @@ UCS_PROFILE_FUNC_VOID(ucp_rndv_receive, (worker, rreq, rndv_rts_hdr, rkey_buf), ucp_ep_config_t *ep_config; ucs_status_t status; int is_get_zcopy_failed; + ucp_ep_rndv_zcopy_config_t *get_zcopy; + ucs_memory_type_t src_mem_type; UCS_ASYNC_BLOCK(&worker->async); UCS_PROFILE_REQUEST_EVENT(rreq, "rndv_receive", 0); + /* if receiving a message on an already closed endpoint, stop processing */ + UCP_WORKER_GET_VALID_EP_BY_ID(&ep, worker, rndv_rts_hdr->sreq.ep_id, + { status = UCS_ERR_CANCELED; goto err; }, + "RNDV rts"); + /* the internal send request allocated on receiver side (to perform a "get" * operation, send "ATS" and "RTR") */ rndv_req = ucp_request_get(worker); if (rndv_req == NULL) { ucs_error("failed to allocate rendezvous reply"); - goto out; + status = UCS_ERR_NO_MEMORY; + goto err; } - rndv_req->send.ep = ucp_worker_get_ep_by_id(worker, - rndv_rts_hdr->sreq.ep_id); - rndv_req->flags = 0; - rndv_req->send.mdesc = NULL; - rndv_req->send.pending_lane = UCP_NULL_LANE; - is_get_zcopy_failed = 0; + rndv_req->flags = 0; + 
rndv_req->send.ep = ep; + rndv_req->send.mdesc = NULL; + is_get_zcopy_failed = 0; + src_mem_type = UCS_MEMORY_TYPE_HOST; ucp_trace_req(rreq, "rndv matched remote {address 0x%"PRIx64" size %zu sreq_id " @@ -1200,15 +1370,16 @@ UCS_PROFILE_FUNC_VOID(ucp_rndv_receive, (worker, rreq, rndv_rts_hdr, rkey_buf), ucp_trace_req(rndv_req, "rndv truncated remote size %zu local size %zu rreq %p", rndv_rts_hdr->size, rreq->recv.length, rreq); - ucp_rndv_req_send_ats(rndv_req, rreq, rndv_rts_hdr->sreq.req_id, UCS_OK); + ucp_rndv_req_send_ack(rndv_req, rreq, rndv_rts_hdr->sreq.req_id, UCS_OK, + UCP_AM_ID_RNDV_ATS, "send_ats"); ucp_request_recv_generic_dt_finish(rreq); ucp_rndv_zcopy_recv_req_complete(rreq, UCS_ERR_MESSAGE_TRUNCATED); goto out; } /* if the receive side is not connected yet then the RTS was received on a stub ep */ - ep = rndv_req->send.ep; ep_config = ucp_ep_config(ep); + get_zcopy = &ep_config->rndv.get_zcopy; rndv_mode = worker->context->config.ext.rndv_mode; if (ucp_rndv_is_rkey_ptr(rndv_rts_hdr, rkey_buf, ep, rreq->recv.mem_type, @@ -1220,9 +1391,8 @@ UCS_PROFILE_FUNC_VOID(ucp_rndv_receive, (worker, rreq, rndv_rts_hdr, rkey_buf), if (UCP_DT_IS_CONTIG(rreq->recv.datatype)) { if ((rndv_rts_hdr->address != 0) && ucp_rndv_test_zcopy_scheme_support(rndv_rts_hdr->size, - ep_config->rndv.min_get_zcopy, - ep_config->rndv.max_get_zcopy, - ep_config->rndv.get_zcopy_split)) { + get_zcopy->min, get_zcopy->max, + get_zcopy->split)) { /* try to fetch the data with a get_zcopy operation */ status = ucp_rndv_req_send_rma_get(rndv_req, rreq, rndv_rts_hdr, rkey_buf); @@ -1231,21 +1401,22 @@ UCS_PROFILE_FUNC_VOID(ucp_rndv_receive, (worker, rreq, rndv_rts_hdr, rkey_buf), } /* fallback to non get zcopy protocol */ - ucp_rkey_destroy(rndv_req->send.rndv_get.rkey); + ucp_rkey_destroy(rndv_req->send.rndv.rkey); is_get_zcopy_failed = 1; + src_mem_type = ucp_rkey_packed_mem_type(rkey_buf); } if (rndv_mode == UCP_RNDV_MODE_AUTO) { /* check if we need pipelined memtype staging */ - if (UCP_MEM_IS_CUDA(rreq->recv.mem_type) && + if (UCP_MEM_IS_GPU(rreq->recv.mem_type) && ucp_rndv_is_recv_pipeline_needed(rndv_req, rndv_rts_hdr, rkey_buf, rreq->recv.mem_type, is_get_zcopy_failed)) { ucp_rndv_recv_data_init(rreq, rndv_rts_hdr->size); if (ucp_rndv_is_put_pipeline_needed(rndv_rts_hdr->address, rndv_rts_hdr->size, - ep_config->rndv.min_get_zcopy, - ep_config->rndv.max_get_zcopy, + get_zcopy->min, + get_zcopy->max, is_get_zcopy_failed)) { /* send FRAG RTR for sender to PUT the fragment. */ ucp_rndv_send_frag_rtr(worker, rndv_req, rreq, rndv_rts_hdr); @@ -1261,9 +1432,10 @@ UCS_PROFILE_FUNC_VOID(ucp_rndv_receive, (worker, rreq, rndv_rts_hdr, rkey_buf), } } - if ((rndv_mode == UCP_RNDV_MODE_PUT_ZCOPY) || - UCP_MEM_IS_CUDA(rreq->recv.mem_type)) { - /* put protocol is allowed - register receive buffer memory for rma */ + if (!is_get_zcopy_failed || !UCP_MEM_IS_HOST(src_mem_type)) { + /* register receive buffer for + * put protocol (or) pipeline rndv for non-host memory type + */ ucs_assert(rndv_rts_hdr->size <= rreq->recv.length); ucp_request_recv_buffer_reg(rreq, ep_config->key.rma_bw_md_map, rndv_rts_hdr->size); @@ -1274,12 +1446,16 @@ UCS_PROFILE_FUNC_VOID(ucp_rndv_receive, (worker, rreq, rndv_rts_hdr, rkey_buf), * configured to PUT, or GET rndv mode is unsupported - send an RTR and * the sender will send the data with active message or put_zcopy. 
*/ ucp_rndv_recv_data_init(rreq, rndv_rts_hdr->size); - UCP_WORKER_STAT_RNDV(ep->worker, SEND_RTR, 1); ucp_rndv_req_send_rtr(rndv_req, rreq, rndv_rts_hdr->sreq.req_id, rndv_rts_hdr->size, 0ul); out: UCS_ASYNC_UNBLOCK(&worker->async); + return; + +err: + ucp_rndv_recv_req_complete(rreq, status); + goto out; } UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_rts_handler, @@ -1289,11 +1465,11 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_rts_handler, ucp_worker_h worker = arg; ucp_rndv_rts_hdr_t *rts_hdr = data; - if (rts_hdr->flags & UCP_RNDV_RTS_FLAG_TAG) { - return ucp_tag_rndv_process_rts(worker, rts_hdr, length, tl_flags); - } else { - ucs_assert(rts_hdr->flags & UCP_RNDV_RTS_FLAG_AM); + if (ucp_rndv_rts_is_am(rts_hdr)) { return ucp_am_rndv_process_rts(arg, data, length, tl_flags); + } else { + ucs_assert(ucp_rndv_rts_is_tag(rts_hdr)); + return ucp_tag_rndv_process_rts(worker, rts_hdr, length, tl_flags); } } @@ -1303,15 +1479,41 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_ats_handler, { ucp_worker_h worker = arg; ucp_reply_hdr_t *rep_hdr = data; - ucp_request_t *sreq = ucp_worker_extract_request_by_id(worker, - rep_hdr->req_id); + ucp_request_t *sreq; + + if (worker->context->config.ext.proto_enable) { + return ucp_proto_rndv_ats_handler(arg, data, length, flags); + } + + UCP_SEND_REQUEST_GET_BY_ID(&sreq, worker, rep_hdr->req_id, 1, return UCS_OK, + "RNDV ATS %p", rep_hdr); /* dereg the original send request and set it to complete */ UCS_PROFILE_REQUEST_EVENT(sreq, "rndv_ats_recv", 0); if (sreq->flags & UCP_REQUEST_FLAG_OFFLOADED) { ucp_tag_offload_cancel_rndv(sreq); } - ucp_rndv_complete_send(sreq, rep_hdr->status); + + ucp_request_complete_and_dereg_send(sreq, rep_hdr->status); + return UCS_OK; +} + +ucs_status_t ucp_rndv_rts_handle_status_from_pending(ucp_request_t *sreq, + ucs_status_t status) +{ + /* we rely on the fact that the RTS isn't being sent by an AM Bcopy multi */ + ucs_assert((status != UCP_STATUS_PENDING_SWITCH) && + (status != UCS_INPROGRESS)); + + if (ucs_unlikely(status != UCS_OK)) { + if (status == UCS_ERR_NO_RESOURCE) { + return UCS_ERR_NO_RESOURCE; + } + + ucp_send_request_id_release(sreq); + ucp_request_complete_and_dereg_send(sreq, status); + } + return UCS_OK; } @@ -1322,7 +1524,7 @@ static size_t ucp_rndv_pack_data(void *dest, void *arg) size_t length, offset; offset = sreq->send.state.dt.offset; - hdr->rreq_id = sreq->send.msg_proto.rreq_id; + hdr->rreq_id = sreq->send.rndv_data.remote_req_id; hdr->offset = offset; length = ucs_min(sreq->send.length - offset, ucp_ep_get_max_bcopy(sreq->send.ep, sreq->send.lane) - sizeof(*hdr)); @@ -1337,87 +1539,55 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_am_bcopy, (self), { ucp_request_t *sreq = ucs_container_of(self, ucp_request_t, send.uct); ucp_ep_t *ep = sreq->send.ep; + int single = (sreq->send.length + sizeof(ucp_rndv_data_hdr_t)) <= + ucp_ep_config(ep)->am.max_bcopy; ucs_status_t status; - if (sreq->send.length <= ucp_ep_config(ep)->am.max_bcopy - sizeof(ucp_rndv_data_hdr_t)) { + if (single) { /* send a single bcopy message */ status = ucp_do_am_bcopy_single(self, UCP_AM_ID_RNDV_DATA, ucp_rndv_pack_data); + ucs_assert(status != UCS_INPROGRESS); } else { status = ucp_do_am_bcopy_multi(self, UCP_AM_ID_RNDV_DATA, UCP_AM_ID_RNDV_DATA, ucp_rndv_pack_data, ucp_rndv_pack_data, 1); + + if (status == UCS_INPROGRESS) { + return UCS_INPROGRESS; + } else if (ucs_unlikely(status == UCP_STATUS_PENDING_SWITCH)) { + return UCS_OK; + } } - if (status == UCS_OK) { - ucp_rndv_complete_send(sreq, UCS_OK); - } else if (status == 
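/*
 * In the active-message fallback above, ucp_rndv_pack_data() caps every
 * RNDV_DATA payload at max_bcopy minus the header size. The per-message
 * arithmetic in standalone form, with invented numbers:
 */
#include <stddef.h>
#include <stdio.h>

int main(void)
{
    const size_t max_bcopy = 8192;  /* per-AM bcopy limit                   */
    const size_t hdr_size  = 16;    /* header, cf. ucp_rndv_data_hdr_t      */
    const size_t total     = 20000; /* rendezvous payload                   */
    const size_t payload   = max_bcopy - hdr_size;
    size_t offset = 0, length;

    while (offset < total) {
        length = (total - offset < payload) ? (total - offset) : payload;
        printf("RNDV_DATA offset %zu length %zu\n", offset, length);
        offset += length; /* 8176 + 8176 + 3648 = 20000 */
    }
    return 0;
}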
UCP_STATUS_PENDING_SWITCH) { - status = UCS_OK; + + if (ucs_unlikely(status == UCS_ERR_NO_RESOURCE)) { + return UCS_ERR_NO_RESOURCE; } - return status; + ucp_request_complete_and_dereg_send(sreq, status); + + return UCS_OK; } UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_progress_rma_put_zcopy, (self), uct_pending_req_t *self) { - ucp_request_t *sreq = ucs_container_of(self, ucp_request_t, send.uct); - const size_t max_iovcnt = 1; - ucp_ep_h ep = sreq->send.ep; - ucs_status_t status; - size_t offset, ucp_mtu, align, remaining, length; - uct_iface_attr_t *attrs; - uct_iov_t iov[max_iovcnt]; - size_t iovcnt; - ucp_dt_state_t state; - - if (!sreq->send.mdesc) { - status = ucp_request_send_buffer_reg_lane(sreq, sreq->send.lane, 0); - ucs_assert_always(status == UCS_OK); - } - - attrs = ucp_worker_iface_get_attr(ep->worker, - ucp_ep_get_rsc_index(ep, sreq->send.lane)); - align = attrs->cap.put.opt_zcopy_align; - ucp_mtu = attrs->cap.put.align_mtu; + ucp_request_t *sreq = ucs_container_of(self, ucp_request_t, send.uct); + uct_rkey_t uct_rkey; - offset = sreq->send.state.dt.offset; - remaining = (uintptr_t)sreq->send.buffer % align; + ucs_assert_always(sreq->send.rndv.lanes_count > 0); - if ((offset == 0) && (remaining > 0) && (sreq->send.length > ucp_mtu)) { - length = ucp_mtu - remaining; - } else { - length = ucs_min(sreq->send.length - offset, - ucp_ep_config(ep)->rndv.max_put_zcopy); + /* Figure out which lane to use for put operation */ + sreq->send.lane = ucp_rndv_zcopy_get_lane(sreq, &uct_rkey, + UCP_REQUEST_SEND_PROTO_RNDV_PUT); + if (sreq->send.lane == UCP_NULL_LANE) { + /* Unexpected behavior */ + ucs_fatal("sreq %p: unable to get PUT Zcopy lane", sreq); } - ucs_trace_data("req %p: offset %zu remainder %zu. read to %p len %zu", - sreq, offset, (uintptr_t)sreq->send.buffer % align, - UCS_PTR_BYTE_OFFSET(sreq->send.buffer, offset), length); - - state = sreq->send.state.dt; - ucp_dt_iov_copy_uct(ep->worker->context, iov, &iovcnt, max_iovcnt, &state, - sreq->send.buffer, ucp_dt_make_contig(1), length, - ucp_ep_md_index(ep, sreq->send.lane), sreq->send.mdesc); - status = uct_ep_put_zcopy(ep->uct_eps[sreq->send.lane], - iov, iovcnt, - sreq->send.rndv_put.remote_address + offset, - sreq->send.rndv_put.uct_rkey, - &sreq->send.state.uct_comp); - ucp_request_send_state_advance(sreq, &state, - UCP_REQUEST_SEND_PROTO_RNDV_PUT, - status); - if (sreq->send.state.dt.offset == sreq->send.length) { - if (sreq->send.state.uct_comp.count == 0) { - uct_completion_update_status(&sreq->send.state.uct_comp, status); - sreq->send.state.uct_comp.func(&sreq->send.state.uct_comp); - } - return UCS_OK; - } else if (!UCS_STATUS_IS_ERR(status)) { - return UCS_INPROGRESS; - } else { - return status; - } + return ucp_rndv_progress_rma_zcopy_common(sreq, sreq->send.lane, uct_rkey, + UCP_REQUEST_SEND_PROTO_RNDV_PUT); } static void ucp_rndv_am_zcopy_send_req_complete(ucp_request_t *req, @@ -1436,8 +1606,6 @@ static void ucp_rndv_am_zcopy_completion(uct_completion_t *self) if (sreq->send.state.dt.offset == sreq->send.length) { ucp_rndv_am_zcopy_send_req_complete(sreq, status); - } else if (status != UCS_OK) { - ucs_fatal("error handling is unsupported with rendezvous protocol"); } } @@ -1446,7 +1614,7 @@ static ucs_status_t ucp_rndv_progress_am_zcopy_single(uct_pending_req_t *self) ucp_request_t *sreq = ucs_container_of(self, ucp_request_t, send.uct); ucp_rndv_data_hdr_t hdr; - hdr.rreq_id = sreq->send.msg_proto.rreq_id; + hdr.rreq_id = sreq->send.rndv_data.remote_req_id; hdr.offset = 0; return ucp_do_am_zcopy_single(self, 
UCP_AM_ID_RNDV_DATA, &hdr, sizeof(hdr), NULL, 0ul, @@ -1458,7 +1626,7 @@ static ucs_status_t ucp_rndv_progress_am_zcopy_multi(uct_pending_req_t *self) { ucp_request_t *sreq = ucs_container_of(self, ucp_request_t, send.uct); ucp_rndv_data_hdr_t hdr; - hdr.rreq_id = sreq->send.msg_proto.rreq_id; + hdr.rreq_id = sreq->send.rndv_data.remote_req_id; hdr.offset = sreq->send.state.dt.offset; return ucp_do_am_zcopy_multi(self, UCP_AM_ID_RNDV_DATA, @@ -1474,21 +1642,39 @@ UCS_PROFILE_FUNC_VOID(ucp_rndv_send_frag_put_completion, (self), { ucp_request_t *freq = ucs_container_of(self, ucp_request_t, send.state.uct_comp); - ucp_request_t *req = freq->super_req; + ucp_request_t *fsreq, *sreq; + + if (freq->send.state.dt.offset != freq->send.length) { + return; + } /* release memory descriptor */ - if (freq->send.mdesc) { - ucs_mpool_put_inline((void *)freq->send.mdesc); + if (freq->send.mdesc != NULL) { + ucs_mpool_put_inline((void*)freq->send.mdesc); } - req->send.state.dt.offset += freq->send.length; - ucs_assert(req->send.state.dt.offset <= req->send.length); + fsreq = ucp_request_get_super(freq); + sreq = ucp_request_get_super(fsreq); + fsreq->send.state.dt.offset += freq->send.length; + ucs_assert(fsreq->send.state.dt.offset <= fsreq->send.length); /* send ATP for last fragment of the rndv request */ - if (req->send.length == req->send.state.dt.offset) { - ucp_rndv_send_frag_atp(req, req->send.rndv_put.rreq_remote_id); + if (fsreq->send.length == fsreq->send.state.dt.offset) { + ucp_rkey_destroy(fsreq->send.rndv.rkey); + + sreq->send.state.dt.offset += fsreq->send.length; + + /* keep the status of the send request up to date by updating it with the + * status of the request that tracks the UCT PUT Zcopy operation */ + uct_completion_update_status(&sreq->send.state.uct_comp, self->status); + ucp_rndv_complete_rma_put_zcopy(sreq, 1); + + ucp_rndv_req_send_ack(fsreq, fsreq, fsreq->send.rndv.remote_req_id, + self->status, UCP_AM_ID_RNDV_ATP, "send_frag_atp"); } + /* release the memory registered for the PUT operation of this fragment */ + ucp_request_send_buffer_dereg(freq); ucp_request_put(freq); } @@ -1497,18 +1683,21 @@ UCS_PROFILE_FUNC_VOID(ucp_rndv_put_pipeline_frag_get_completion, (self), { ucp_request_t *freq = ucs_container_of(self, ucp_request_t, send.state.uct_comp); - ucp_request_t *fsreq = freq->super_req; + ucp_request_t *fsreq = ucp_request_get_super(freq); + + /* get rkey can be NULL if memtype ep doesn't need RKEY */ + if (freq->send.rndv.rkey != NULL) { + ucp_rkey_destroy(freq->send.rndv.rkey); + } /* get completed on memtype endpoint to stage on host. 
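/*
 * uct_completion_update_status() above applies a "first error wins" rule
 * when many fragment completions feed one parent request. A minimal sketch
 * of that rule under mini_* naming (the real helper lives in UCT):
 */
#include <stdio.h>

/* keep the first non-OK status, ignore later ones */
static void mini_update_status(int *comp_status, int status)
{
    if ((*comp_status == 0) && (status != 0)) {
        *comp_status = status;
    }
}

int main(void)
{
    int comp_status = 0;

    mini_update_status(&comp_status, 0);  /* fragment 1 completed ok   */
    mini_update_status(&comp_status, -3); /* fragment 2 failed         */
    mini_update_status(&comp_status, -7); /* later error does not win  */
    printf("parent status: %d\n", comp_status); /* -3 */
    return 0;
}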
send put request to receiver*/ ucp_request_send_state_reset(freq, ucp_rndv_send_frag_put_completion, UCP_REQUEST_SEND_PROTO_RNDV_PUT); - freq->super_req = fsreq; - freq->send.rndv_put.remote_address = fsreq->send.rndv_put.remote_address + - (freq->send.rndv_get.remote_address - (uint64_t)fsreq->send.buffer); - freq->send.ep = fsreq->send.ep; + ucp_rndv_req_init_remote_from_super_req(freq, fsreq, + freq->send.rndv.remote_address - + (uint64_t)fsreq->send.buffer); + freq->send.uct.func = ucp_rndv_progress_rma_put_zcopy; - freq->send.rndv_put.rkey = fsreq->send.rndv_put.rkey; - freq->send.rndv_put.uct_rkey = fsreq->send.rndv_put.uct_rkey; freq->send.lane = fsreq->send.lane; freq->send.state.dt.dt.contig.md_map = 0; @@ -1525,10 +1714,10 @@ static ucs_status_t ucp_rndv_send_start_put_pipeline(ucp_request_t *sreq, const uct_md_attr_t *md_attr; ucp_request_t *freq; ucp_request_t *fsreq; - ucp_md_index_t md_index; size_t max_frag_size, rndv_size, length; size_t offset, rndv_base_offset; size_t min_zcopy, max_zcopy; + uct_rkey_t uct_rkey; ucp_trace_req(sreq, "using put rndv pipeline protocol"); @@ -1538,23 +1727,42 @@ static ucs_status_t ucp_rndv_send_start_put_pipeline(ucp_request_t *sreq, * Step 3: send ATP for each fragment request */ - /* check if lane supports host memory, to stage sends through host memory */ - md_attr = ucp_ep_md_attr(sreq->send.ep, sreq->send.lane); - if (!(md_attr->cap.reg_mem_types & UCS_BIT(UCS_MEMORY_TYPE_HOST))) { - return UCS_ERR_UNSUPPORTED; - } - - min_zcopy = config->rndv.min_put_zcopy; - max_zcopy = config->rndv.max_put_zcopy; + min_zcopy = config->rndv.put_zcopy.min; + max_zcopy = config->rndv.put_zcopy.max; rndv_size = ucs_min(rndv_rtr_hdr->size, sreq->send.length); max_frag_size = ucs_min(context->config.ext.rndv_frag_size, max_zcopy); rndv_base_offset = rndv_rtr_hdr->offset; /* initialize send req state on first fragment rndv request */ if (rndv_base_offset == 0) { - ucp_request_send_state_reset(sreq, NULL, UCP_REQUEST_SEND_PROTO_RNDV_PUT); + ucp_request_send_state_reset(sreq, NULL, + UCP_REQUEST_SEND_PROTO_RNDV_PUT); + ucp_rndv_req_init_zcopy_lane_map(sreq, sreq->send.rndv.rkey->mem_type, + UCP_REQUEST_SEND_PROTO_RNDV_PUT); + + /* check if a lane can be allocated */ + sreq->send.lane = + ucp_rndv_zcopy_get_lane(sreq, &uct_rkey, + UCP_REQUEST_SEND_PROTO_RNDV_PUT); + if (sreq->send.lane == UCP_NULL_LANE) { + return UCS_ERR_UNSUPPORTED; + } + + /* check if lane supports host memory, to stage sends through host memory */ + md_attr = ucp_ep_md_attr(sreq->send.ep, sreq->send.lane); + if (!(md_attr->cap.reg_mem_types & UCS_BIT(UCS_MEMORY_TYPE_HOST))) { + return UCS_ERR_UNSUPPORTED; + } + + /* check if the memory type endpoint exists */ + if (!UCP_MEM_IS_HOST(sreq->send.mem_type) && + (worker->mem_type_ep[sreq->send.mem_type] == NULL)) { + return UCS_ERR_UNSUPPORTED; + } } + sreq->send.rndv.remote_address = rndv_rtr_hdr->address; + /* internal send request allocated on sender side to handle send fragments for RTR */ fsreq = ucp_request_get(worker); if (fsreq == NULL) { @@ -1562,25 +1770,18 @@ static ucs_status_t ucp_rndv_send_start_put_pipeline(ucp_request_t *sreq, ucp_request_send_state_init(fsreq, ucp_dt_make_contig(1), 0); - fsreq->super_req = sreq; - fsreq->send.buffer = UCS_PTR_BYTE_OFFSET(sreq->send.buffer, - rndv_base_offset); - fsreq->send.length = rndv_size; - fsreq->send.mem_type = sreq->send.mem_type; - fsreq->send.ep = sreq->send.ep; - fsreq->send.lane = sreq->send.lane; - fsreq->send.rndv_put.rkey = sreq->send.rndv_put.rkey; - fsreq->send.rndv_put.uct_rkey 
= sreq->send.rndv_put.uct_rkey; - fsreq->send.rndv_put.rreq_remote_id = rndv_rtr_hdr->rreq_id; - fsreq->send.rndv_put.remote_address = rndv_rtr_hdr->address; - fsreq->send.state.dt.offset = 0; + ucp_rndv_req_init_from_super_req(fsreq, sreq, rndv_size, rndv_base_offset, + 0, rndv_rtr_hdr->rreq_id); + fsreq->send.mem_type = sreq->send.mem_type; + fsreq->send.state.dt.offset = 0; offset = 0; while (offset != rndv_size) { length = ucp_rndv_adjust_zcopy_length(min_zcopy, max_frag_size, 0, - rndv_size, offset, rndv_size - offset); + rndv_size, offset, + rndv_size - offset); - if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(sreq->send.mem_type)) { + if (UCP_MEM_IS_HOST(sreq->send.mem_type)) { /* sbuf is in host, directly do put */ freq = ucp_request_get(worker); if (ucs_unlikely(freq == NULL)) { @@ -1588,35 +1789,25 @@ static ucs_status_t ucp_rndv_send_start_put_pipeline(ucp_request_t *sreq, return UCS_ERR_NO_MEMORY; } + ucp_request_send_state_init(freq, ucp_dt_make_contig(1), 0); ucp_request_send_state_reset(freq, ucp_rndv_send_frag_put_completion, UCP_REQUEST_SEND_PROTO_RNDV_PUT); - md_index = ucp_ep_md_index(sreq->send.ep, - sreq->send.lane); - freq->super_req = fsreq; - freq->send.ep = fsreq->send.ep; - freq->send.buffer = UCS_PTR_BYTE_OFFSET(fsreq->send.buffer, - offset); - freq->send.datatype = ucp_dt_make_contig(1); - freq->send.mem_type = UCS_MEMORY_TYPE_HOST; - freq->send.state.dt.dt.contig.memh[0] = - ucp_memh_map2uct(sreq->send.state.dt.dt.contig.memh, - sreq->send.state.dt.dt.contig.md_map, md_index); - freq->send.state.dt.dt.contig.md_map = UCS_BIT(md_index); - freq->send.length = length; - freq->send.uct.func = ucp_rndv_progress_rma_put_zcopy; - freq->send.rndv_put.rkey = fsreq->send.rndv_put.rkey; - freq->send.rndv_put.uct_rkey = fsreq->send.rndv_put.uct_rkey; - freq->send.rndv_put.remote_address = rndv_rtr_hdr->address + offset; - freq->send.rndv_put.rreq_remote_id = rndv_rtr_hdr->rreq_id; - freq->send.lane = fsreq->send.lane; - freq->send.mdesc = NULL; + + ucp_rndv_req_init_from_super_req(freq, fsreq, length, offset, + offset, UCS_PTR_MAP_KEY_INVALID); + freq->send.datatype = ucp_dt_make_contig(1); + freq->send.mem_type = UCS_MEMORY_TYPE_HOST; + freq->send.uct.func = ucp_rndv_progress_rma_put_zcopy; + freq->send.mdesc = NULL; + freq->send.pending_lane = UCP_NULL_LANE; ucp_request_send(freq, 0); } else { - ucp_rndv_send_frag_get_mem_type(fsreq, 0, length, - (uint64_t)UCS_PTR_BYTE_OFFSET(fsreq->send.buffer, offset), - fsreq->send.mem_type, NULL, NULL, UCS_BIT(0), - ucp_rndv_put_pipeline_frag_get_completion); + ucp_rndv_send_frag_get_mem_type( + fsreq, length, + (uint64_t)UCS_PTR_BYTE_OFFSET(fsreq->send.buffer, offset), + fsreq->send.mem_type, NULL, NULL, UCS_BIT(0), 1, + ucp_rndv_put_pipeline_frag_get_completion); } offset += length; @@ -1629,20 +1820,26 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_atp_handler, (arg, data, length, flags), void *arg, void *data, size_t length, unsigned flags) { + ucp_worker_h worker = arg; ucp_reply_hdr_t *rep_hdr = data; - ucp_request_t *req = ucp_worker_get_request_by_id(arg, - rep_hdr->req_id); + ucp_request_t *rtr_sreq, *req; + + UCP_SEND_REQUEST_GET_BY_ID(&rtr_sreq, worker, rep_hdr->req_id, 1, + return UCS_OK, "RNDV ATP %p", rep_hdr); + + req = ucp_request_get_super(rtr_sreq); + ucs_assert(req != NULL); + ucp_request_put(rtr_sreq); if (req->flags & UCP_REQUEST_FLAG_RNDV_FRAG) { /* received ATP for frag RTR request */ - ucs_assert(req->super_req != NULL); UCS_PROFILE_REQUEST_EVENT(req, "rndv_frag_atp_recv", 0); - ucp_rndv_recv_frag_put_mem_type(req->super_req, 
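/*
 * The fragment loop above picks one of two paths: host send buffers are PUT
 * directly to the peer, while device buffers are first staged into a host
 * fragment through the memtype endpoint and PUT from there. The decision in
 * standalone form, with memory types reduced to an illustrative enum:
 */
#include <stdio.h>

typedef enum { MINI_MEM_HOST, MINI_MEM_CUDA } mini_mem_type_t;

static void mini_send_fragment(mini_mem_type_t mem_type, size_t offset)
{
    if (mem_type == MINI_MEM_HOST) {
        printf("frag@%zu: direct PUT from user buffer\n", offset);
    } else {
        printf("frag@%zu: GET into host staging buffer, then PUT\n", offset);
    }
}

int main(void)
{
    mini_send_fragment(MINI_MEM_HOST, 0);
    mini_send_fragment(MINI_MEM_CUDA, 0);
    return 0;
}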
NULL, req, - ((ucp_mem_desc_t*) req->recv.buffer - 1), - req->recv.length, req->recv.frag.offset); + ucp_rndv_recv_frag_put_mem_type(ucp_request_get_super(req), req, + (ucp_mem_desc_t*)req->recv.buffer - 1, + req->recv.length, + req->recv.frag.offset); } else { UCS_PROFILE_REQUEST_EVENT(req, "rndv_atp_recv", 0); - ucp_worker_del_request_id(arg, rep_hdr->req_id); ucp_rndv_zcopy_recv_req_complete(req, UCS_OK); } @@ -1653,14 +1850,26 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_rtr_handler, (arg, data, length, flags), void *arg, void *data, size_t length, unsigned flags) { + ucp_worker_h worker = arg; + ucp_context_h context = worker->context; ucp_rndv_rtr_hdr_t *rndv_rtr_hdr = data; - ucp_request_t *sreq = ucp_worker_get_request_by_id(arg, - rndv_rtr_hdr->sreq_id); - ucp_ep_h ep = sreq->send.ep; - ucp_ep_config_t *ep_config = ucp_ep_config(ep); - ucp_context_h context = ep->worker->context; + ucp_ep_rndv_zcopy_config_t *put_zcopy; + ucp_request_t *sreq; + ucp_ep_h ep; + ucp_ep_config_t *ep_config; ucs_status_t status; int is_pipeline_rndv; + uct_rkey_t uct_rkey; + + if (context->config.ext.proto_enable) { + return ucp_proto_rndv_handle_rtr(arg, data, length, flags); + } + + UCP_SEND_REQUEST_GET_BY_ID(&sreq, arg, rndv_rtr_hdr->sreq_id, 0, + return UCS_OK, "RNDV RTR %p", rndv_rtr_hdr); + ep = sreq->send.ep; + ep_config = ucp_ep_config(ep); + put_zcopy = &ep_config->rndv.put_zcopy; ucp_trace_req(sreq, "received rtr address 0x%"PRIx64" remote rreq_id" "0x%"PRIx64, rndv_rtr_hdr->address, rndv_rtr_hdr->rreq_id); @@ -1670,67 +1879,64 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_rtr_handler, /* Do not deregister memory here, because am zcopy rndv may * need it registered (if am and tag is the same lane). */ ucp_tag_offload_cancel_rndv(sreq); + ucs_assert(!ucp_ep_use_indirect_id(ep)); } if (UCP_DT_IS_CONTIG(sreq->send.datatype) && rndv_rtr_hdr->address) { status = ucp_ep_rkey_unpack(ep, rndv_rtr_hdr + 1, - &sreq->send.rndv_put.rkey); + &sreq->send.rndv.rkey); if (status != UCS_OK) { ucs_fatal("failed to unpack rendezvous remote key received from %s: %s", ucp_ep_peer_name(ep), ucs_status_string(status)); } - is_pipeline_rndv = ((!UCP_MEM_IS_ACCESSIBLE_FROM_CPU(sreq->send.mem_type) || + is_pipeline_rndv = ((!UCP_MEM_IS_HOST(sreq->send.mem_type) || (sreq->send.length != rndv_rtr_hdr->size)) && (context->config.ext.rndv_mode != UCP_RNDV_MODE_PUT_ZCOPY)); - sreq->send.lane = ucp_rkey_find_rma_lane(ep->worker->context, ep_config, - (is_pipeline_rndv ? - sreq->send.rndv_put.rkey->mem_type : - sreq->send.mem_type), - ep_config->rndv.put_zcopy_lanes, - sreq->send.rndv_put.rkey, 0, - &sreq->send.rndv_put.uct_rkey); - if (sreq->send.lane != UCP_NULL_LANE) { - /* - * Try pipeline protocol for non-host memory, if PUT_ZCOPY protocol is - * not explicitly required. If pipeline is UNSUPPORTED, fallback to - * PUT_ZCOPY anyway. - */ - if (is_pipeline_rndv) { - status = ucp_rndv_send_start_put_pipeline(sreq, rndv_rtr_hdr); - if (status != UCS_ERR_UNSUPPORTED) { - return status; - } - /* If we get here, it means that RNDV pipeline protocol is - * unsupported and we have to use PUT_ZCOPY RNDV scheme instead */ + /* + * Try pipeline protocol for non-host memory, if PUT_ZCOPY protocol is + * not explicitly required. If pipeline is UNSUPPORTED, fallback to + * PUT_ZCOPY anyway. 
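A note on the super-request linkage used by the ATP handler above: each fragment request keeps a pointer to its parent, and the parent is completed only once all fragment bytes have been accounted for. Below is a minimal self-contained sketch of that relationship; toy_request_t and the helper names are illustrative stand-ins, not UCX API (the real handler also returns the looked-up request to its pool and releases its ID, which the sketch omits):

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Miniature model of the parent/fragment ("super request") pattern */
    typedef struct toy_request {
        struct toy_request *super;     /* parent request, NULL at top level */
        size_t              remaining; /* bytes still in flight (parent only) */
    } toy_request_t;

    static void toy_frag_complete(toy_request_t *freq, size_t frag_len)
    {
        toy_request_t *parent = freq->super;

        assert(parent != NULL);
        parent->remaining -= frag_len;
        if (parent->remaining == 0) {
            printf("parent request complete\n");
        }
    }

    int main(void)
    {
        toy_request_t parent = { .super = NULL, .remaining = 8192 };
        toy_request_t frag1  = { .super = &parent };
        toy_request_t frag2  = { .super = &parent };

        toy_frag_complete(&frag1, 4096);
        toy_frag_complete(&frag2, 4096); /* prints: parent request complete */
        return 0;
    }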
+ */ + if (is_pipeline_rndv) { + status = ucp_rndv_send_start_put_pipeline(sreq, rndv_rtr_hdr); + if (status != UCS_ERR_UNSUPPORTED) { + return status; } + /* If we get here, it means that RNDV pipeline protocol is unsupported + * and we have to use PUT_ZCOPY RNDV scheme instead */ + } - if ((context->config.ext.rndv_mode != UCP_RNDV_MODE_GET_ZCOPY) && - ucp_rndv_test_zcopy_scheme_support(sreq->send.length, - ep_config->rndv.min_put_zcopy, - ep_config->rndv.max_put_zcopy, - ep_config->rndv.put_zcopy_split)) { - ucp_request_send_state_reset(sreq, ucp_rndv_put_completion, + if ((context->config.ext.rndv_mode != UCP_RNDV_MODE_GET_ZCOPY) && + ucp_rndv_test_zcopy_scheme_support(sreq->send.length, + put_zcopy->min, put_zcopy->max, + put_zcopy->split)) { + ucp_request_send_state_reset(sreq, ucp_rndv_put_completion, + UCP_REQUEST_SEND_PROTO_RNDV_PUT); + sreq->send.uct.func = ucp_rndv_progress_rma_put_zcopy; + sreq->send.rndv.remote_req_id = rndv_rtr_hdr->rreq_id; + sreq->send.rndv.remote_address = rndv_rtr_hdr->address; + sreq->send.mdesc = NULL; + sreq->send.pending_lane = UCP_NULL_LANE; + + ucp_rndv_req_init_zcopy_lane_map(sreq, sreq->send.mem_type, UCP_REQUEST_SEND_PROTO_RNDV_PUT); - sreq->send.uct.func = ucp_rndv_progress_rma_put_zcopy; - sreq->send.rndv_put.rreq_remote_id = rndv_rtr_hdr->rreq_id; - sreq->send.rndv_put.remote_address = rndv_rtr_hdr->address; - sreq->send.mdesc = NULL; + + sreq->send.lane = + ucp_rndv_zcopy_get_lane(sreq, &uct_rkey, + UCP_REQUEST_SEND_PROTO_RNDV_PUT); + if (sreq->send.lane != UCP_NULL_LANE) { goto out_send; - } else { - ucp_rkey_destroy(sreq->send.rndv_put.rkey); } - } else { - ucp_rkey_destroy(sreq->send.rndv_put.rkey); } + + ucp_rkey_destroy(sreq->send.rndv.rkey); } ucp_trace_req(sreq, "using rdnv_data protocol"); /* switch to AM */ - sreq->send.msg_proto.rreq_id = rndv_rtr_hdr->rreq_id; - if (UCP_DT_IS_CONTIG(sreq->send.datatype) && (sreq->send.length >= ep_config->am.mem_type_zcopy_thresh[sreq->send.mem_type])) @@ -1754,7 +1960,11 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_rtr_handler, sreq->send.am_bw_index = 1; } + sreq->send.rndv_data.remote_req_id = rndv_rtr_hdr->rreq_id; + out_send: + /* if it is not a PUT pipeline protocol, delete the send request ID */ + ucp_send_request_id_release(sreq); ucp_request_send(sreq, 0); return UCS_OK; } @@ -1765,36 +1975,45 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_rndv_data_handler, { ucp_worker_h worker = arg; ucp_rndv_data_hdr_t *rndv_data_hdr = data; - ucp_request_t *rreq; - ucs_status_t status; + ucp_request_t *rreq, *rndv_req; size_t recv_len; + ucs_status_t status; + + if (worker->context->config.ext.proto_enable) { + return ucp_proto_rndv_handle_data(arg, data, length, flags); + } + + UCP_SEND_REQUEST_GET_BY_ID(&rndv_req, worker, rndv_data_hdr->rreq_id, 0, + return UCS_OK, "RNDV data %p", rndv_data_hdr); - rreq = ucp_worker_get_request_by_id(worker, rndv_data_hdr->rreq_id); - ucs_assert(!(rreq->flags & UCP_REQUEST_FLAG_RNDV_FRAG) && - (rreq->flags & (UCP_REQUEST_FLAG_RECV_AM | - UCP_REQUEST_FLAG_RECV_TAG))); + rreq = ucp_request_get_super(rndv_req); + ucs_assert(rreq != NULL); + ucs_assert(!(rreq->flags & UCP_REQUEST_FLAG_RNDV_FRAG)); + ucs_assert(rreq->flags & + (UCP_REQUEST_FLAG_RECV_AM | UCP_REQUEST_FLAG_RECV_TAG)); recv_len = length - sizeof(*rndv_data_hdr); UCS_PROFILE_REQUEST_EVENT(rreq, "rndv_data_recv", recv_len); status = ucp_request_process_recv_data(rreq, rndv_data_hdr + 1, recv_len, rndv_data_hdr->offset, 1, - rreq->flags & UCP_REQUEST_FLAG_RECV_AM); + rreq->flags & + UCP_REQUEST_FLAG_RECV_AM); if (status 
!= UCS_INPROGRESS) { - ucp_worker_del_request_id(worker, rndv_data_hdr->rreq_id); + ucp_send_request_id_release(rndv_req); + ucp_request_put(rndv_req); } + return UCS_OK; } -static void ucp_rndv_dump_rkey(const void *packed_rkey, char *buffer, size_t max) +static void ucp_rndv_dump_rkey(const void *packed_rkey, size_t rkey_size, + char *buffer, size_t max) { - char *p = buffer; - char *endp = buffer + max; - - snprintf(p, endp - p, " rkey "); - p += strlen(p); + UCS_STRING_BUFFER_FIXED(strb, buffer, max); - ucp_rkey_dump_packed(packed_rkey, p, endp - p); + ucs_string_buffer_appendf(&strb, " rkey "); + ucp_rkey_dump_packed(packed_rkey, rkey_size, &strb); } static void ucp_rndv_dump(ucp_worker_h worker, uct_am_trace_type_t type, @@ -1806,56 +2025,47 @@ static void ucp_rndv_dump(ucp_worker_h worker, uct_am_trace_type_t type, const ucp_rndv_rtr_hdr_t *rndv_rtr_hdr = data; const ucp_rndv_data_hdr_t *rndv_data = data; const ucp_reply_hdr_t *rep_hdr = data; - ucp_tag_rndv_rts_hdr_t *tag_rts; - ucp_am_rndv_rts_hdr_t *am_rts; - ucs_string_buffer_t rts_info; - void *rkey_buf; + UCS_STRING_BUFFER_ONSTACK(rts_info, 128); + const void *rkey_buf; switch (id) { case UCP_AM_ID_RNDV_RTS: - ucs_assert(rndv_rts_hdr->sreq.ep_id != UCP_EP_ID_INVALID); - - ucs_string_buffer_init(&rts_info); - - if (rndv_rts_hdr->flags & UCP_RNDV_RTS_FLAG_AM) { - am_rts = ucs_derived_of(rndv_rts_hdr, ucp_am_rndv_rts_hdr_t); - rkey_buf = am_rts + 1; + if (ucp_rndv_rts_is_am(rndv_rts_hdr)) { ucs_string_buffer_appendf(&rts_info, "AM am_id %u", - am_rts->am.am_id); + ucp_am_hdr_from_rts(rndv_rts_hdr)->am_id); } else { - ucs_assert(rndv_rts_hdr->flags & UCP_RNDV_RTS_FLAG_TAG); - - tag_rts = ucs_derived_of(rndv_rts_hdr, ucp_tag_rndv_rts_hdr_t); - rkey_buf = tag_rts + 1; - - ucs_string_buffer_appendf(&rts_info, "TAG tag %"PRIx64"", - tag_rts->tag.tag); + ucs_assert(ucp_rndv_rts_is_tag(rndv_rts_hdr)); + ucs_string_buffer_appendf(&rts_info, "TAG tag %" PRIx64, + ucp_tag_hdr_from_rts(rndv_rts_hdr)->tag); } + rkey_buf = rndv_rts_hdr + 1; snprintf(buffer, max, "RNDV_RTS %s ep_id 0x%"PRIx64" sreq_id" " 0x%"PRIx64" address 0x%"PRIx64" size %zu", ucs_string_buffer_cstr(&rts_info), rndv_rts_hdr->sreq.ep_id, rndv_rts_hdr->sreq.req_id, rndv_rts_hdr->address, rndv_rts_hdr->size); - if (rndv_rts_hdr->address) { - ucp_rndv_dump_rkey(rkey_buf, buffer + strlen(buffer), - max - strlen(buffer)); + if (rndv_rts_hdr->address != 0) { + ucp_rndv_dump_rkey(rkey_buf, + length - UCS_PTR_BYTE_DIFF(data, rkey_buf), + buffer + strlen(buffer), max - strlen(buffer)); } - - ucs_string_buffer_cleanup(&rts_info); break; case UCP_AM_ID_RNDV_ATS: snprintf(buffer, max, "RNDV_ATS sreq_id 0x%"PRIx64" status '%s'", rep_hdr->req_id, ucs_status_string(rep_hdr->status)); break; case UCP_AM_ID_RNDV_RTR: - snprintf(buffer, max, "RNDV_RTR sreq_id 0x%"PRIx64" rreq_id 0x%"PRIx64 - " address 0x%"PRIx64, rndv_rtr_hdr->sreq_id, - rndv_rtr_hdr->rreq_id, rndv_rtr_hdr->address); - if (rndv_rtr_hdr->address) { - ucp_rndv_dump_rkey(rndv_rtr_hdr + 1, buffer + strlen(buffer), - max - strlen(buffer)); + snprintf(buffer, max, + "RNDV_RTR sreq_id 0x%" PRIx64 " rreq_id 0x%" PRIx64 + " address 0x%" PRIx64 " size %zu offset %zu", + rndv_rtr_hdr->sreq_id, rndv_rtr_hdr->rreq_id, + rndv_rtr_hdr->address, rndv_rtr_hdr->size, + rndv_rtr_hdr->offset); + if (rndv_rtr_hdr->address != 0) { + ucp_rndv_dump_rkey(rndv_rtr_hdr + 1, length - sizeof(*rndv_rtr_hdr), + buffer + strlen(buffer), max - strlen(buffer)); } break; case UCP_AM_ID_RNDV_DATA: diff --git a/src/ucp/rndv/rndv.h b/src/ucp/rndv/rndv.h index 
ca7712ba93a..44eedd6a666 100644 --- a/src/ucp/rndv/rndv.h +++ b/src/ucp/rndv/rndv.h @@ -12,21 +12,36 @@ #include -enum ucp_rndv_rts_flags { - UCP_RNDV_RTS_FLAG_TAG = UCS_BIT(0), - UCP_RNDV_RTS_FLAG_AM = UCS_BIT(1) -}; +typedef enum { + /* RNDV TAG operation with status UCS_OK (kept for wire compatibility with + * the previous UCP versions) */ + UCP_RNDV_RTS_TAG_OK = UCS_OK, + /* RNDV TAG operation with status UCS_ERR_CANCELED (kept for wire + * compatibility with the previous UCP versions) */ + UCP_RNDV_RTS_TAG_CANCELED = (uint8_t)UCS_ERR_CANCELED, + /* RNDV AM operation */ + UCP_RNDV_RTS_AM = 1 +} UCS_S_PACKED ucp_rndv_rts_opcode_t; /* * Rendezvous RTS */ typedef struct { - ucp_request_hdr_t sreq; /* send request on the rndv initiator side */ - uint64_t address; /* holds the address of the data buffer on the sender's side */ - size_t size; /* size of the data for sending */ - uint16_t flags; /* rndv proto flags, as defined by - ucp_rndv_rts_flags */ + /* Protocol-specific header */ + uint64_t hdr; + /* Send request on the rndv initiator side */ + ucp_request_hdr_t sreq; + /* Holds the address of the data buffer on the sender's side */ + uint64_t address; + /* Size of the data for sending */ + size_t size; + /* RNDV proto opcode */ + uint8_t opcode; + /* + * 1. Packed rkeys follow + * 2. AM only: User header follows, if am->header_length is not 0 + */ } UCS_S_PACKED ucp_rndv_rts_hdr_t; @@ -44,7 +59,7 @@ typedef struct { /* - * RNDV_DATA + * Rendezvous data */ typedef struct { uint64_t rreq_id; /* request ID on the rndv receiver side */ @@ -52,12 +67,19 @@ } UCS_S_PACKED ucp_rndv_data_hdr_t; +ucs_status_t ucp_rndv_send_rts(ucp_request_t *sreq, uct_pack_callback_t pack_cb, + size_t rts_body_size); + +void ucp_rndv_req_send_ack(ucp_request_t *ack_req, ucp_request_t *req, + ucs_ptr_map_key_t remote_req_id, ucs_status_t status, + ucp_am_id_t am_id, const char *ack_str); + ucs_status_t ucp_rndv_progress_rma_get_zcopy(uct_pending_req_t *self); ucs_status_t ucp_rndv_progress_rma_put_zcopy(uct_pending_req_t *self); size_t ucp_rndv_rts_pack(ucp_request_t *sreq, ucp_rndv_rts_hdr_t *rndv_rts_hdr, - size_t rndv_rts_hdr_size, uint16_t flags); + ucp_rndv_rts_opcode_t opcode); ucs_status_t ucp_rndv_reg_send_buffer(ucp_request_t *sreq); @@ -65,7 +87,20 @@ void ucp_rndv_receive(ucp_worker_h worker, ucp_request_t *rreq, const ucp_rndv_rts_hdr_t *rndv_rts_hdr, const void *rkey_buf); -void ucp_rndv_req_send_ats(ucp_request_t *rndv_req, ucp_request_t *rreq, - ucs_ptr_map_key_t remote_req_id, ucs_status_t status); +ucs_status_t ucp_rndv_rts_handle_status_from_pending(ucp_request_t *sreq, + ucs_status_t status); + +static UCS_F_ALWAYS_INLINE int +ucp_rndv_rts_is_am(const ucp_rndv_rts_hdr_t *rts_hdr) +{ + return rts_hdr->opcode == UCP_RNDV_RTS_AM; +} + +static UCS_F_ALWAYS_INLINE int +ucp_rndv_rts_is_tag(const ucp_rndv_rts_hdr_t *rts_hdr) +{ + return (rts_hdr->opcode == UCP_RNDV_RTS_TAG_OK) || + (rts_hdr->opcode == UCP_RNDV_RTS_TAG_CANCELED); +} #endif diff --git a/src/ucp/rndv/rndv_am.c b/src/ucp/rndv/rndv_am.c new file mode 100644 index 00000000000..aaa7585ab32 --- /dev/null +++ b/src/ucp/rndv/rndv_am.c @@ -0,0 +1,118 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
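The opcode byte introduced in rndv.h above deliberately reuses the status values the old wire format carried in the same position, so an updated receiver still classifies packets from older senders correctly. A compact sketch of that scheme follows; the TOY_ names are illustrative, and the value -16 is assumed to be UCS_ERR_CANCELED's numeric value truncated to one byte:

    #include <stdint.h>
    #include <stdio.h>

    enum {
        TOY_RTS_TAG_OK       = 0,            /* old wire: status UCS_OK */
        TOY_RTS_AM           = 1,            /* new: AM rendezvous */
        TOY_RTS_TAG_CANCELED = (uint8_t)-16  /* old wire: UCS_ERR_CANCELED */
    };

    static int toy_rts_is_am(uint8_t opcode)
    {
        return opcode == TOY_RTS_AM;
    }

    static int toy_rts_is_tag(uint8_t opcode)
    {
        return (opcode == TOY_RTS_TAG_OK) || (opcode == TOY_RTS_TAG_CANCELED);
    }

    int main(void)
    {
        printf("opcode 0: am=%d tag=%d\n", toy_rts_is_am(0), toy_rts_is_tag(0));
        printf("opcode 1: am=%d tag=%d\n", toy_rts_is_am(1), toy_rts_is_tag(1));
        return 0;
    }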
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "proto_rndv.inl" + + +static ucs_status_t +ucp_proto_rndv_am_init_common(ucp_proto_multi_init_params_t *params) +{ + ucp_context_h context = params->super.super.worker->context; + + if (params->super.super.select_param->op_id != UCP_OP_ID_RNDV_SEND) { + return UCS_ERR_UNSUPPORTED; + } + + params->super.cfg_thresh = + ucp_proto_rndv_cfg_thresh(context, UCS_BIT(UCP_RNDV_MODE_AM)); + params->super.overhead = 10e-9; /* for multiple lanes management */ + params->super.latency = 0; + params->first.lane_type = UCP_LANE_TYPE_AM; + params->middle.lane_type = UCP_LANE_TYPE_AM_BW; + params->super.hdr_size = sizeof(ucp_rndv_data_hdr_t); + params->max_lanes = context->config.ext.max_rndv_lanes; + + return ucp_proto_multi_init(params); +} + +static size_t ucp_proto_rndv_am_bcopy_pack(void *dest, void *arg) +{ + ucp_rndv_data_hdr_t *hdr = dest; + ucp_proto_multi_pack_ctx_t *pack_ctx = arg; + ucp_request_t *req = pack_ctx->req; + + hdr->rreq_id = req->send.rndv.remote_req_id; + hdr->offset = req->send.state.dt_iter.offset; + + return sizeof(*hdr) + ucp_proto_multi_data_pack(pack_ctx, hdr + 1); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t ucp_proto_rndv_am_bcopy_send_func( + ucp_request_t *req, const ucp_proto_multi_lane_priv_t *lpriv, + ucp_datatype_iter_t *next_iter) +{ + static const size_t hdr_size = sizeof(ucp_rndv_data_hdr_t); + ucp_ep_t *ep = req->send.ep; + ucp_proto_multi_pack_ctx_t pack_ctx = { + .req = req, + .next_iter = next_iter + }; + ssize_t packed_size; + + pack_ctx.max_payload = ucp_proto_multi_max_payload(req, lpriv, hdr_size); + + packed_size = uct_ep_am_bcopy(ep->uct_eps[lpriv->super.lane], + UCP_AM_ID_RNDV_DATA, + ucp_proto_rndv_am_bcopy_pack, &pack_ctx, 0); + if (ucs_unlikely(packed_size < 0)) { + return (ucs_status_t)packed_size; + } + + ucs_assert(packed_size >= hdr_size); + return UCS_OK; +} + +static UCS_F_ALWAYS_INLINE void +ucp_proto_rndv_am_request_init(ucp_request_t *req) +{ + if (req->send.rndv.rkey != NULL) { + ucp_rkey_destroy(req->send.rndv.rkey); + } + ucp_proto_msg_multi_request_init(req); + /* Memory could be registered when we sent the RTS */ + ucp_datatype_iter_mem_dereg(req->send.ep->worker->context, + &req->send.state.dt_iter); +} + +static ucs_status_t ucp_proto_rndv_am_bcopy_progress(uct_pending_req_t *uct_req) +{ + ucp_request_t *req = ucs_container_of(uct_req, ucp_request_t, send.uct); + + return ucp_proto_multi_bcopy_progress( + req, req->send.proto_config->priv, ucp_proto_rndv_am_request_init, + ucp_proto_rndv_am_bcopy_send_func, + ucp_proto_request_bcopy_complete_success); +} + +static ucs_status_t +ucp_proto_rndv_am_bcopy_init(const ucp_proto_init_params_t *init_params) +{ + ucp_proto_multi_init_params_t params = { + .super.super = *init_params, + .super.cfg_thresh = UCS_MEMUNITS_AUTO, + .super.cfg_priority = 0, + .super.min_frag_offs = UCP_PROTO_COMMON_OFFSET_INVALID, + .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.max_bcopy), + .super.flags = UCP_PROTO_COMMON_INIT_FLAG_MEM_TYPE, + .first.tl_cap_flags = UCT_IFACE_FLAG_AM_BCOPY, + .middle.tl_cap_flags = UCT_IFACE_FLAG_AM_BCOPY, + }; + + return ucp_proto_rndv_am_init_common(&params); +} + +static ucp_proto_t ucp_rndv_am_bcopy_proto = { + .name = "rndv/am/bcopy", + .flags = 0, + .init = ucp_proto_rndv_am_bcopy_init, + .config_str = ucp_proto_multi_config_str, + .progress = ucp_proto_rndv_am_bcopy_progress, +}; +UCP_PROTO_REGISTER(&ucp_rndv_am_bcopy_proto); \ No newline at end of file diff --git a/src/ucp/rndv/rndv_get.c 
b/src/ucp/rndv/rndv_get.c new file mode 100644 index 00000000000..b8478f799c2 --- /dev/null +++ b/src/ucp/rndv/rndv_get.c @@ -0,0 +1,158 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "proto_rndv.inl" + + +static ucs_status_t +ucp_proto_rndv_get_zcopy_init(const ucp_proto_init_params_t *init_params) +{ + static const uint64_t rndv_modes = UCS_BIT(UCP_RNDV_MODE_GET_ZCOPY); + ucp_context_t *context = init_params->worker->context; + ucp_proto_multi_init_params_t params = { + .super.super = *init_params, + .super.cfg_thresh = ucp_proto_rndv_cfg_thresh(context, rndv_modes), + .super.cfg_priority = 0, + .super.flags = UCP_PROTO_COMMON_INIT_FLAG_SEND_ZCOPY | + UCP_PROTO_COMMON_INIT_FLAG_RECV_ZCOPY | + UCP_PROTO_COMMON_INIT_FLAG_REMOTE_ACCESS | + UCP_PROTO_COMMON_INIT_FLAG_RESPONSE, + .super.overhead = 0, + .super.latency = 0, + .max_lanes = context->config.ext.max_rndv_lanes, + .first.tl_cap_flags = UCT_IFACE_FLAG_GET_ZCOPY, + .super.min_frag_offs = ucs_offsetof(uct_iface_attr_t, + cap.get.min_zcopy), + .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, + cap.get.max_zcopy), + .first.lane_type = UCP_LANE_TYPE_RMA_BW, + .super.hdr_size = 0, + .middle.tl_cap_flags = UCT_IFACE_FLAG_GET_ZCOPY, + .middle.lane_type = UCP_LANE_TYPE_RMA_BW + }; + + if ((init_params->select_param->op_id != UCP_OP_ID_RNDV_RECV) || + (init_params->select_param->dt_class != UCP_DATATYPE_CONTIG)) { + return UCS_ERR_UNSUPPORTED; + } + + return ucp_proto_rndv_bulk_init(&params); +} + +static ucs_status_t ucp_proto_rndv_get_complete(ucp_request_t *req) +{ + ucp_rkey_destroy(req->send.rndv.rkey); + ucp_proto_request_zcopy_complete(req, req->send.state.uct_comp.status); + return UCS_OK; +} + +static void ucp_proto_rndv_get_completion(uct_completion_t *uct_comp) +{ + ucp_request_t *req = ucs_container_of(uct_comp, ucp_request_t, + send.state.uct_comp); + + ucp_trace_req(req, "%s completed", req->send.proto_config->proto->name); + ucp_request_send(req, 0); /* reschedule to send ATS */ +} + +static UCS_F_ALWAYS_INLINE ucs_status_t ucp_proto_rndv_get_zcopy_send_func( + ucp_request_t *req, const ucp_proto_multi_lane_priv_t *lpriv, + ucp_datatype_iter_t *next_iter) +{ + ucp_rkey_h rkey = req->send.rndv.rkey; + uct_rkey_t tl_rkey = rkey->tl_rkey[lpriv->super.rkey_index].rkey.rkey; + uct_iov_t iov; + + ucp_datatype_iter_next_iov(&req->send.state.dt_iter, + lpriv->super.memh_index, + ucp_proto_multi_max_payload(req, lpriv, 0), + next_iter, &iov); + return uct_ep_get_zcopy(req->send.ep->uct_eps[lpriv->super.lane], &iov, 1, + req->send.rndv.remote_address + + req->send.state.dt_iter.offset, + tl_rkey, &req->send.state.uct_comp); +} + +static ucs_status_t ucp_proto_rndv_get_zcopy_progress(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + const ucp_proto_rndv_bulk_priv_t *rpriv = req->send.proto_config->priv; + + if (ucp_datatype_iter_is_end(&req->send.state.dt_iter)) { + if (req->send.state.dt_iter.length > 0) { + ucs_assert(req->send.state.uct_comp.count == 0); + } + return ucp_proto_rndv_ack_progress(req, UCP_AM_ID_RNDV_ATS, + ucp_proto_rndv_get_complete); + } else { + return ucp_proto_multi_zcopy_progress(req, &rpriv->mpriv, NULL, + UCT_MD_MEM_ACCESS_LOCAL_WRITE, + ucp_proto_rndv_get_zcopy_send_func, + ucp_proto_rndv_get_completion); + } +} + +static ucp_proto_t ucp_rndv_get_zcopy_proto = { + .name = "rndv/get/zcopy", + .flags = 0, + .init = 
ucp_proto_rndv_get_zcopy_init, + .config_str = ucp_proto_rndv_bulk_config_str, + .progress = ucp_proto_rndv_get_zcopy_progress +}; +UCP_PROTO_REGISTER(&ucp_rndv_get_zcopy_proto); + + +static ucs_status_t +ucp_proto_rndv_ats_init(const ucp_proto_init_params_t *params) +{ + ucs_status_t status; + + if (params->select_param->op_id != UCP_OP_ID_RNDV_RECV) { + return UCS_ERR_UNSUPPORTED; + } + + if (params->rkey_config_key != NULL) { + /* This ATS-only protocol will not take care of releasing the remote key, + so disqualify it if a remote key is present */ + return UCS_ERR_UNSUPPORTED; + } + + status = ucp_proto_rndv_ack_init(params); + if (status != UCS_OK) { + return status; + } + + /* Support only 0-length messages */ + *params->priv_size = sizeof(ucp_proto_rndv_ack_priv_t); + params->caps->cfg_thresh = 0; + params->caps->cfg_priority = 1; + params->caps->min_length = 0; + params->caps->num_ranges = 1; + params->caps->ranges[0].max_length = 0; + params->caps->ranges[0].perf = ucp_proto_rndv_ack_time(params); + return UCS_OK; +} + +static ucs_status_t ucp_proto_rndv_ats_progress(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + + return ucp_proto_rndv_ack_progress( + req, UCP_AM_ID_RNDV_ATS, ucp_proto_request_zcopy_complete_success); +} + +static ucp_proto_t ucp_rndv_ats_proto = { + .name = "rndv/ats", + .flags = 0, + .init = ucp_proto_rndv_ats_init, + .config_str = ucp_proto_rndv_ack_config_str, + .progress = ucp_proto_rndv_ats_progress +}; +UCP_PROTO_REGISTER(&ucp_rndv_ats_proto); diff --git a/src/ucp/rndv/rndv_rtr.c b/src/ucp/rndv/rndv_rtr.c new file mode 100644 index 00000000000..e3f5dcc25ee --- /dev/null +++ b/src/ucp/rndv/rndv_rtr.c @@ -0,0 +1,145 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
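The rndv/get/zcopy protocol above finishes in two stages: when the counter inside the request's uct_completion_t drains, the completion callback only re-schedules the request, and the next progress call sends the ATS acknowledgement. A self-contained model of that counter-plus-callback pattern follows; toy_comp_t, toy_req_t and the helpers are illustrative, not the UCT API:

    #include <stddef.h>
    #include <stdio.h>

    #define toy_container_of(ptr, type, member) \
        ((type*)((char*)(ptr) - offsetof(type, member)))

    typedef struct toy_comp {
        int  count;                          /* outstanding operations */
        void (*func)(struct toy_comp *comp); /* fired when count hits zero */
    } toy_comp_t;

    typedef struct {
        toy_comp_t comp;     /* embedded completion object */
        int        send_ack; /* set when the next progress emits the ACK */
    } toy_req_t;

    static void toy_get_completion(toy_comp_t *comp)
    {
        /* recover the request from the embedded completion and reschedule */
        toy_req_t *req = toy_container_of(comp, toy_req_t, comp);
        req->send_ack  = 1;
    }

    static void toy_comp_done(toy_comp_t *comp)
    {
        if (--comp->count == 0) {
            comp->func(comp);
        }
    }

    int main(void)
    {
        toy_req_t req = { .comp = { .count = 2, .func = toy_get_completion } };

        toy_comp_done(&req.comp); /* first fragment GET done */
        toy_comp_done(&req.comp); /* last fragment GET done: reschedule */
        printf("send_ack=%d\n", req.send_ack);
        return 0;
    }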
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "proto_rndv.inl" + +#include + + +static ucs_status_t +ucp_proto_rndv_rtr_common_init(const ucp_proto_init_params_t *init_params, + uint64_t rndv_modes, ucs_memory_type_t mem_type, + ucs_sys_device_t sys_dev) +{ + ucp_context_h context = init_params->worker->context; + ucp_proto_rndv_ctrl_init_params_t params = { + .super.super = *init_params, + .super.latency = 0, + .super.overhead = 40e-9, + .super.cfg_thresh = ucp_proto_rndv_cfg_thresh(context, rndv_modes), + .super.cfg_priority = 0, + .super.flags = UCP_PROTO_COMMON_INIT_FLAG_RESPONSE, + .remote_op_id = UCP_OP_ID_RNDV_SEND, + .perf_bias = 0.0, + .mem_info.type = mem_type, + .mem_info.sys_dev = sys_dev, + .min_length = 1 + }; + + return ucp_proto_rndv_ctrl_init(&params); +} + +static UCS_F_ALWAYS_INLINE void +ucp_proto_rtr_common_request_init(ucp_request_t *req) +{ + ucp_request_t *recv_req = ucp_request_get_super(req); + + recv_req->status = UCS_OK; + recv_req->recv.remaining = req->send.state.dt_iter.length; +} + +static UCS_F_ALWAYS_INLINE void +ucp_proto_rndv_rtr_common_comp_init(ucp_ep_h ep, uct_completion_t *comp, + ucs_ptr_map_key_t *ptr_id_p, + uct_completion_callback_t comp_func) +{ + /* RTR sends the id of its &req->comp field, and not of the request, to + support fragmented protocol with multiple RTRs per request */ + ucp_proto_completion_init(comp, comp_func); + ucp_ep_ptr_id_alloc(ep, comp, ptr_id_p); +} + +static ucs_status_t +ucp_proto_rndv_rtr_common_send(ucp_request_t *req, uct_pack_callback_t pack_cb) +{ + const ucp_proto_rndv_ctrl_priv_t *rpriv = req->send.proto_config->priv; + size_t max_rtr_size = sizeof(ucp_rndv_rtr_hdr_t) + rpriv->packed_rkey_size; + + return ucp_proto_am_bcopy_single_progress(req, UCP_AM_ID_RNDV_RTR, + rpriv->lane, pack_cb, req, + max_rtr_size, NULL); +} + +static void ucp_proto_rndv_rtr_common_completion(uct_completion_t *uct_comp) +{ + ucp_request_t *req = ucs_container_of(uct_comp, ucp_request_t, + send.state.uct_comp); + ucp_proto_rndv_rtr_common_complete(req, req->send.state.uct_comp.status); +} + +static size_t ucp_proto_rndv_rtr_pack(void *dest, void *arg) +{ + ucp_rndv_rtr_hdr_t *rtr = dest; + ucp_request_t *req = arg; + const UCS_V_UNUSED ucp_proto_rndv_ctrl_priv_t *rpriv; + + rtr->sreq_id = req->send.rndv.remote_req_id; + rtr->rreq_id = req->send.rndv.rtr.rreq_id; + rtr->size = req->send.state.dt_iter.length; + rtr->offset = 0; + rtr->address = (uintptr_t)req->send.state.dt_iter.type.contig.buffer; + + rpriv = req->send.proto_config->priv; + ucs_assert(rtr->size > 0); + ucs_assert(rpriv->md_map == req->send.state.dt_iter.type.contig.reg.md_map); + return sizeof(*rtr) + ucp_proto_request_pack_rkey(req, rtr + 1); +} + +static ucs_status_t ucp_proto_rndv_rtr_progress(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + const ucp_proto_rndv_ctrl_priv_t *rpriv = req->send.proto_config->priv; + ucs_status_t status; + + if (!(req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED)) { + status = ucp_datatype_iter_mem_reg(req->send.ep->worker->context, + &req->send.state.dt_iter, + rpriv->md_map, + UCT_MD_MEM_ACCESS_REMOTE_PUT); + if (status != UCS_OK) { + ucp_proto_request_abort(req, status); + return UCS_OK; + } + + ucp_proto_rtr_common_request_init(req); + ucp_proto_rndv_rtr_common_comp_init( + req->send.ep, &req->send.state.uct_comp, + &req->send.rndv.rtr.rreq_id, + ucp_proto_rndv_rtr_common_completion); + + req->flags |= UCP_REQUEST_FLAG_PROTO_INITIALIZED; + } + + return 
ucp_proto_rndv_rtr_common_send(req, ucp_proto_rndv_rtr_pack); +} + +static ucs_status_t +ucp_proto_rndv_rtr_init(const ucp_proto_init_params_t *init_params) +{ + static const uint64_t rndv_modes = UCS_BIT(UCP_RNDV_MODE_PUT_ZCOPY) | + UCS_BIT(UCP_RNDV_MODE_AM); + + if (init_params->select_param->op_id != UCP_OP_ID_RNDV_RECV) { + return UCS_ERR_UNSUPPORTED; + } + + return ucp_proto_rndv_rtr_common_init(init_params, rndv_modes, + init_params->select_param->mem_type, + init_params->select_param->sys_dev); +} + +static ucp_proto_t ucp_rndv_rtr_proto = { + .name = "rndv/rtr", + .flags = 0, + .init = ucp_proto_rndv_rtr_init, + .config_str = ucp_proto_rndv_ctrl_config_str, + .progress = ucp_proto_rndv_rtr_progress +}; +UCP_PROTO_REGISTER(&ucp_rndv_rtr_proto); diff --git a/src/ucp/stream/stream_recv.c b/src/ucp/stream/stream_recv.c index 7836b3fe256..6d384f78c94 100644 --- a/src/ucp/stream/stream_recv.c +++ b/src/ucp/stream/stream_recv.c @@ -196,17 +196,18 @@ ucp_stream_rdesc_advance(ucp_recv_desc_t *rdesc, ssize_t offset, return UCS_OK; } -static UCS_F_ALWAYS_INLINE ucs_status_t -ucp_stream_process_rdesc_inplace(ucp_recv_desc_t *rdesc, ucp_datatype_t dt, - void *buffer, size_t count, size_t length, - ucs_memory_type_t mem_type, - ucp_ep_ext_proto_t *ep_ext) +static UCS_F_ALWAYS_INLINE ucs_status_t ucp_stream_process_rdesc_inplace( + ucp_recv_desc_t *rdesc, ucp_datatype_t dt, void *buffer, size_t count, + size_t length, const ucp_request_param_t *param, + ucp_ep_ext_proto_t *ep_ext) { ucp_worker_h worker = ucp_ep_from_ext_proto(ep_ext)->worker; + ucs_memory_type_t mem_type; ucs_status_t status; ssize_t unpacked; - mem_type = ucp_get_memory_type(worker->context, buffer, length, mem_type); + mem_type = ucp_request_get_memory_type(worker->context, buffer, length, + param); status = ucp_dt_unpack_only(worker, buffer, count, dt, mem_type, ucp_stream_rdesc_payload(rdesc), length, 0); @@ -231,7 +232,7 @@ ucp_stream_process_rdesc(ucp_recv_desc_t *rdesc, ucp_ep_ext_proto_t *ep_ext, static UCS_F_ALWAYS_INLINE void ucp_stream_recv_request_init(ucp_request_t *req, ucp_ep_h ep, void *buffer, size_t count, size_t length, - ucp_datatype_t datatype, ucs_memory_type_t memory_type, + ucp_datatype_t datatype, const ucp_request_param_t *param) { uint32_t flags = ucp_request_param_flags(param); @@ -252,8 +253,9 @@ ucp_stream_recv_request_init(ucp_request_t *req, ucp_ep_h ep, void *buffer, req->recv.datatype = datatype; req->recv.length = ucs_likely(!UCP_DT_IS_GENERIC(datatype)) ? 
length : ucp_dt_length(datatype, count, NULL, &req->recv.state); - req->recv.mem_type = ucp_get_memory_type(ep->worker->context, (void*)buffer, - req->recv.length, memory_type); + req->recv.mem_type = ucp_request_get_memory_type(ep->worker->context, + (void*)buffer, + req->recv.length, param); if (param->op_attr_mask & UCP_OP_ATTR_FIELD_CALLBACK) { req->flags |= UCP_REQUEST_FLAG_CALLBACK; @@ -293,29 +295,30 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_stream_recv_nbx, ucp_ep_h ep, void *buffer, size_t count, size_t *length, const ucp_request_param_t *param) { - ucs_status_t status = UCS_OK; - ucp_ep_ext_proto_t *ep_ext = ucp_ep_ext_proto(ep); - ucp_datatype_t datatype; - ucs_memory_type_t memory_type; - size_t dt_length; - ucp_request_t *req; - ucp_recv_desc_t *rdesc; - uint32_t attr_mask; + ucs_status_t status = UCS_OK; + ucp_ep_ext_proto_t *ep_ext = ucp_ep_ext_proto(ep); + ucp_datatype_t datatype; + size_t dt_length; + ucp_request_t *req; + ucp_recv_desc_t *rdesc; + uint32_t attr_mask; UCP_CONTEXT_CHECK_FEATURE_FLAGS(ep->worker->context, UCP_FEATURE_STREAM, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); + UCP_REQUEST_CHECK_PARAM(param); + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); - memory_type = ucp_request_param_mem_type(param); - attr_mask = param->op_attr_mask & (UCP_OP_ATTR_FIELD_DATATYPE | - UCP_OP_ATTR_FLAG_NO_IMM_CMPL); + attr_mask = param->op_attr_mask & + (UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FLAG_NO_IMM_CMPL); if (ucs_likely(attr_mask == 0)) { datatype = ucp_dt_make_contig(1); dt_length = count; /* use dt_lendth to suppress coverity false positive */ if (ucs_likely(ucp_stream_recv_nb_is_inplace(ep_ext, count))) { - status = ucp_stream_process_rdesc_inplace(ucp_stream_rdesc_get(ep_ext), - datatype, buffer, count, - dt_length, memory_type, ep_ext); + rdesc = ucp_stream_rdesc_get(ep_ext); + status = ucp_stream_process_rdesc_inplace(rdesc, datatype, buffer, + count, dt_length, param, + ep_ext); *length = count; goto out_status; } @@ -324,9 +327,11 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_stream_recv_nbx, if (!UCP_DT_IS_GENERIC(datatype)) { dt_length = ucp_dt_length(datatype, count, buffer, NULL); if (ucp_stream_recv_nb_is_inplace(ep_ext, dt_length)) { - status = ucp_stream_process_rdesc_inplace(ucp_stream_rdesc_get(ep_ext), - datatype, buffer, count, - dt_length, memory_type, ep_ext); + rdesc = ucp_stream_rdesc_get(ep_ext); + status = ucp_stream_process_rdesc_inplace(rdesc, datatype, + buffer, count, + dt_length, param, + ep_ext); *length = dt_length; goto out_status; } @@ -350,7 +355,7 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_stream_recv_nbx, }); ucp_stream_recv_request_init(req, ep, buffer, count, dt_length, datatype, - memory_type, param); + param); /* OK, lets obtain all arrived data which matches the recv size */ while ((req->recv.stream.offset < req->recv.length) && @@ -528,7 +533,8 @@ ucp_stream_am_handler(void *am_arg, void *am_data, size_t am_length, ucs_assert(am_length >= sizeof(ucp_stream_am_hdr_t)); - ep = ucp_worker_get_ep_by_id(worker, data->hdr.ep_id); + UCP_WORKER_GET_VALID_EP_BY_ID(&ep, worker, data->hdr.ep_id, return UCS_OK, + "stream data"); ep_ext = ucp_ep_ext_proto(ep); if (ucs_unlikely(ep->flags & (UCP_EP_FLAG_CLOSED | @@ -566,7 +572,7 @@ static void ucp_stream_am_dump(ucp_worker_h worker, uct_am_trace_type_t type, snprintf(buffer, max, "STREAM ep_id 0x%"PRIx64, hdr->ep_id); p = buffer + strlen(buffer); - ucs_assert(hdr->ep_id != UCP_EP_ID_INVALID); + ucs_assert(hdr->ep_id != UCS_PTR_MAP_KEY_INVALID); ucp_dump_payload(worker->context, p, buffer + 
max - p, UCS_PTR_BYTE_OFFSET(data, hdr_len), length - hdr_len); } diff --git a/src/ucp/stream/stream_send.c b/src/ucp/stream/stream_send.c index c22067c1a11..dde01a9a972 100644 --- a/src/ucp/stream/stream_send.c +++ b/src/ucp/stream/stream_send.c @@ -33,10 +33,10 @@ ucp_stream_send_am_short(ucp_ep_t *ep, const void *buffer, size_t length) ucp_ep_remote_id(ep), buffer, length); } -static void ucp_stream_send_req_init(ucp_request_t* req, ucp_ep_h ep, - const void* buffer, uintptr_t datatype, - ucs_memory_type_t memory_type, size_t count, - uint32_t flags) +static void ucp_stream_send_req_init(ucp_request_t *req, ucp_ep_h ep, + const void *buffer, uintptr_t datatype, + size_t count, uint32_t flags, + const ucp_request_param_t *param) { req->flags = flags; req->send.ep = ep; @@ -47,8 +47,9 @@ static void ucp_stream_send_req_init(ucp_request_t* req, ucp_ep_h ep, req->send.length = ucp_dt_length(req->send.datatype, count, req->send.buffer, &req->send.state.dt); - req->send.mem_type = ucp_get_memory_type(ep->worker->context, (void*)buffer, - req->send.length, memory_type); + req->send.mem_type = ucp_request_get_memory_type(ep->worker->context, + (void*)buffer, + req->send.length, param); VALGRIND_MAKE_MEM_UNDEFINED(&req->send.msg_proto.tag, sizeof(req->send.msg_proto.tag)); } @@ -119,17 +120,18 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_stream_send_nbx, ucp_ep_h ep, const void *buffer, size_t count, const ucp_request_param_t *param) { - ucp_datatype_t datatype; - ucp_request_t *req; - size_t length; - ucs_status_t status; + ucp_datatype_t datatype; + ucp_request_t *req; + size_t length; + ucs_status_t status; ucs_status_ptr_t ret; - uint32_t attr_mask; - uint32_t flags; - ucs_memory_type_t memory_type; + uint32_t attr_mask; + uint32_t flags; UCP_CONTEXT_CHECK_FEATURE_FLAGS(ep->worker->context, UCP_FEATURE_STREAM, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); + UCP_REQUEST_CHECK_PARAM(param); + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); flags = ucp_request_param_flags(param); @@ -176,15 +178,13 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_stream_send_nbx, goto out; } - memory_type = ucp_request_param_mem_type(param); - req = ucp_request_get_param(ep->worker, param, { ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); goto out; }); - ucp_stream_send_req_init(req, ep, buffer, datatype, memory_type, count, flags); + ucp_stream_send_req_init(req, ep, buffer, datatype, count, flags, param); ret = ucp_stream_send_req(req, count, &ucp_ep_config(ep)->am, param, ucp_ep_config(ep)->stream.proto); @@ -200,10 +200,7 @@ static ucs_status_t ucp_stream_contig_am_short(uct_pending_req_t *self) ucs_status_t status = ucp_stream_send_am_short(req->send.ep, req->send.buffer, req->send.length); - if (ucs_likely(status == UCS_OK)) { - ucp_request_complete_send(req, UCS_OK); - } - return status; + return ucp_am_short_handle_status_from_pending(req, status); } static size_t ucp_stream_pack_am_single_dt(void *dest, void *arg) diff --git a/src/ucp/tag/eager.h b/src/ucp/tag/eager.h index 7ae9be45086..c27f54bf319 100644 --- a/src/ucp/tag/eager.h +++ b/src/ucp/tag/eager.h @@ -69,6 +69,9 @@ void ucp_tag_eager_sync_send_ack(ucp_worker_h worker, void *hdr, uint16_t recv_f void ucp_tag_eager_sync_completion(ucp_request_t *req, uint32_t flag, ucs_status_t status); +void ucp_proto_eager_sync_ack_handler(ucp_worker_h worker, + const ucp_reply_hdr_t *rep_hdr); + void ucp_tag_eager_zcopy_completion(uct_completion_t *self); void ucp_tag_eager_zcopy_req_complete(ucp_request_t *req, ucs_status_t status); @@ -77,4 +80,13 @@ void 
ucp_tag_eager_sync_zcopy_req_complete(ucp_request_t *req, ucs_status_t stat void ucp_tag_eager_sync_zcopy_completion(uct_completion_t *self); +static UCS_F_ALWAYS_INLINE int +ucp_proto_eager_check_op_id(const ucp_proto_init_params_t *init_params, + int offload_enabled) +{ + return (init_params->select_param->op_id == UCP_OP_ID_TAG_SEND) && + (offload_enabled == + ucp_ep_config_key_has_tag_lane(init_params->ep_config_key)); +} + #endif diff --git a/src/ucp/tag/eager_multi.c b/src/ucp/tag/eager_multi.c index b3dbb6f32d9..b3b0d8ed8ed 100644 --- a/src/ucp/tag/eager_multi.c +++ b/src/ucp/tag/eager_multi.c @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2020-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -14,38 +14,37 @@ #include -static UCS_F_ALWAYS_INLINE -void ucp_eager_multi_proto_request_init(ucp_request_t *req) -{ - req->send.msg_proto.message_id = req->send.ep->worker->am_message_id++; -} - static UCS_F_ALWAYS_INLINE void -ucp_eager_proto_set_first_hdr(ucp_request_t *req, ucp_eager_first_hdr_t *hdr) +ucp_proto_eager_set_first_hdr(ucp_request_t *req, ucp_eager_first_hdr_t *hdr) { - hdr->super.super.tag = req->send.msg_proto.tag.tag; - hdr->total_len = req->send.dt_iter.length; + hdr->super.super.tag = req->send.msg_proto.tag; + hdr->total_len = req->send.state.dt_iter.length; hdr->msg_id = req->send.msg_proto.message_id; } static UCS_F_ALWAYS_INLINE void -ucp_eager_proto_set_middle_hdr(ucp_request_t *req, ucp_eager_middle_hdr_t *hdr) +ucp_proto_eager_set_middle_hdr(ucp_request_t *req, ucp_eager_middle_hdr_t *hdr) { hdr->msg_id = req->send.msg_proto.message_id; - hdr->offset = req->send.dt_iter.offset; + hdr->offset = req->send.state.dt_iter.offset; } static ucs_status_t -ucp_proto_eager_multi_init_common(ucp_proto_multi_init_params_t *params) +ucp_proto_eager_multi_init_common(ucp_proto_multi_init_params_t *params, + ucp_proto_id_t op_id) { - if (params->super.super.select_param->op_id != UCP_OP_ID_TAG_SEND) { + /* TODO: Disable AM based protocols if tag lane is present! It can be done + * when tag offload rndv is implemented (so any msg size can be sent with + * tag offload). I. e. 
would need to check one more condition below: + * ucp_ep_config_key_has_tag_lane(params->super.super.ep_config_key) + */ + if (params->super.super.select_param->op_id != op_id) { return UCS_ERR_UNSUPPORTED; } params->super.overhead = 10e-9; /* for multiple lanes management */ params->super.latency = 0; params->first.lane_type = UCP_LANE_TYPE_AM; - params->super.hdr_size = sizeof(ucp_eager_first_hdr_t); params->middle.lane_type = UCP_LANE_TYPE_AM_BW; params->max_lanes = params->super.super.worker->context->config.ext.max_eager_lanes; @@ -53,8 +52,9 @@ ucp_proto_eager_multi_init_common(ucp_proto_multi_init_params_t *params) return ucp_proto_multi_init(params); } -static ucs_status_t -ucp_proto_eager_bcopy_multi_init(const ucp_proto_init_params_t *init_params) +static ucs_status_t ucp_proto_eager_bcopy_multi_common_init( + const ucp_proto_init_params_t *init_params, ucp_proto_id_t op_id, + size_t hdr_size) { ucp_context_t *context = init_params->worker->context; ucp_proto_multi_init_params_t params = { @@ -63,36 +63,38 @@ ucp_proto_eager_bcopy_multi_init(const ucp_proto_init_params_t *init_params) .super.cfg_priority = 20, .super.min_frag_offs = UCP_PROTO_COMMON_OFFSET_INVALID, .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.max_bcopy), - .super.flags = 0, + .super.hdr_size = hdr_size, + .super.flags = UCP_PROTO_COMMON_INIT_FLAG_MEM_TYPE, .first.tl_cap_flags = UCT_IFACE_FLAG_AM_BCOPY, .middle.tl_cap_flags = UCT_IFACE_FLAG_AM_BCOPY, }; - return ucp_proto_eager_multi_init_common(&params); + return ucp_proto_eager_multi_init_common(&params, op_id); } -static size_t ucp_eager_bcopy_pack_first(void *dest, void *arg) +static size_t ucp_proto_eager_bcopy_pack_first(void *dest, void *arg) { ucp_eager_first_hdr_t *hdr = dest; ucp_proto_multi_pack_ctx_t *pack_ctx = arg; - ucp_eager_proto_set_first_hdr(pack_ctx->req, hdr); + ucp_proto_eager_set_first_hdr(pack_ctx->req, hdr); return sizeof(*hdr) + ucp_proto_multi_data_pack(pack_ctx, hdr + 1); } -static size_t ucp_eager_bcopy_pack_middle(void *dest, void *arg) +static size_t ucp_proto_eager_bcopy_pack_middle(void *dest, void *arg) { ucp_eager_middle_hdr_t *hdr = dest; ucp_proto_multi_pack_ctx_t *pack_ctx = arg; - ucp_eager_proto_set_middle_hdr(pack_ctx->req, hdr); + ucp_proto_eager_set_middle_hdr(pack_ctx->req, hdr); return sizeof(*hdr) + ucp_proto_multi_data_pack(pack_ctx, hdr + 1); } static UCS_F_ALWAYS_INLINE ucs_status_t -ucp_eager_bcopy_multi_send_func(ucp_request_t *req, - const ucp_proto_multi_lane_priv_t *lpriv, - ucp_datatype_iter_t *next_iter) +ucp_proto_eager_bcopy_multi_common_send_func( + ucp_request_t *req, const ucp_proto_multi_lane_priv_t *lpriv, + ucp_datatype_iter_t *next_iter, ucp_am_id_t am_id_first, + uct_pack_callback_t pack_cb_first, size_t hdr_size_first) { ucp_ep_t *ep = req->send.ep; ucp_proto_multi_pack_ctx_t pack_ctx = { @@ -104,13 +106,13 @@ ucp_eager_bcopy_multi_send_func(ucp_request_t *req, ucp_am_id_t am_id; size_t hdr_size; - if (req->send.dt_iter.offset == 0) { - am_id = UCP_AM_ID_EAGER_FIRST; - pack_cb = ucp_eager_bcopy_pack_first; - hdr_size = sizeof(ucp_eager_first_hdr_t); + if (req->send.state.dt_iter.offset == 0) { + am_id = am_id_first; + pack_cb = pack_cb_first; + hdr_size = hdr_size_first; } else { - am_id = UCP_AM_ID_EAGER_MIDDLE; - pack_cb = ucp_eager_bcopy_pack_middle; + am_id = UCP_AM_ID_EAGER_MIDDLE; + pack_cb = ucp_proto_eager_bcopy_pack_middle; hdr_size = sizeof(ucp_eager_middle_hdr_t); } pack_ctx.max_payload = ucp_proto_multi_max_payload(req, lpriv, hdr_size); @@ -126,18 +128,31 @@ 
ucp_eager_bcopy_multi_send_func(ucp_request_t *req, } static ucs_status_t -ucp_eager_bcopy_multi_proto_progress(uct_pending_req_t *uct_req) +ucp_proto_eager_bcopy_multi_init(const ucp_proto_init_params_t *init_params) { - ucp_request_t *req = ucs_container_of(uct_req, ucp_request_t, send.uct); + return ucp_proto_eager_bcopy_multi_common_init( + init_params, UCP_OP_ID_TAG_SEND, sizeof(ucp_eager_first_hdr_t)); +} - if (!(req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED)) { - ucp_proto_multi_request_init(req); - ucp_eager_multi_proto_request_init(req); - req->flags |= UCP_REQUEST_FLAG_PROTO_INITIALIZED; - } +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_proto_eager_bcopy_multi_send_func(ucp_request_t *req, + const ucp_proto_multi_lane_priv_t *lpriv, + ucp_datatype_iter_t *next_iter) +{ + return ucp_proto_eager_bcopy_multi_common_send_func( + req, lpriv, next_iter, UCP_AM_ID_EAGER_FIRST, + ucp_proto_eager_bcopy_pack_first, sizeof(ucp_eager_first_hdr_t)); +} - return ucp_proto_multi_progress(req, ucp_eager_bcopy_multi_send_func, - ucp_proto_request_bcopy_complete, UINT_MAX); +static ucs_status_t +ucp_proto_eager_bcopy_multi_progress(uct_pending_req_t *uct_req) +{ + ucp_request_t *req = ucs_container_of(uct_req, ucp_request_t, send.uct); + + return ucp_proto_multi_bcopy_progress( + req, req->send.proto_config->priv, ucp_proto_msg_multi_request_init, + ucp_proto_eager_bcopy_multi_send_func, + ucp_proto_request_bcopy_complete_success); } static ucp_proto_t ucp_eager_bcopy_multi_proto = { @@ -145,10 +160,96 @@ static ucp_proto_t ucp_eager_bcopy_multi_proto = { .flags = 0, .init = ucp_proto_eager_bcopy_multi_init, .config_str = ucp_proto_multi_config_str, - .progress = ucp_eager_bcopy_multi_proto_progress + .progress = ucp_proto_eager_bcopy_multi_progress }; UCP_PROTO_REGISTER(&ucp_eager_bcopy_multi_proto); +static ucs_status_t +ucp_proto_eager_sync_bcopy_multi_init(const ucp_proto_init_params_t *init_params) +{ + return ucp_proto_eager_bcopy_multi_common_init( + init_params, UCP_OP_ID_TAG_SEND_SYNC, + sizeof(ucp_eager_sync_first_hdr_t)); +} + +static size_t ucp_eager_sync_bcopy_pack_first(void *dest, void *arg) +{ + ucp_eager_sync_first_hdr_t *hdr = dest; + ucp_proto_multi_pack_ctx_t *pack_ctx = arg; + ucp_request_t *req = pack_ctx->req; + + ucp_proto_eager_set_first_hdr(req, &hdr->super); + hdr->req.ep_id = ucp_send_request_get_ep_remote_id(req); + hdr->req.req_id = ucp_send_request_get_id(req); + + return sizeof(*hdr) + ucp_proto_multi_data_pack(pack_ctx, hdr + 1); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_proto_eager_sync_bcopy_multi_send_func( + ucp_request_t *req, const ucp_proto_multi_lane_priv_t *lpriv, + ucp_datatype_iter_t *next_iter) +{ + return ucp_proto_eager_bcopy_multi_common_send_func( + req, lpriv, next_iter, UCP_AM_ID_EAGER_SYNC_FIRST, + ucp_eager_sync_bcopy_pack_first, + sizeof(ucp_eager_sync_first_hdr_t)); +} + +static UCS_F_ALWAYS_INLINE ucs_status_t +ucp_proto_eager_sync_bcopy_send_completed(ucp_request_t *req) +{ + ucp_datatype_iter_cleanup(&req->send.state.dt_iter, UINT_MAX); + + req->flags |= UCP_REQUEST_FLAG_SYNC_LOCAL_COMPLETED; + if (req->flags & UCP_REQUEST_FLAG_SYNC_REMOTE_COMPLETED) { + ucp_request_complete_send(req, UCS_OK); + } + return UCS_OK; +} + +void ucp_proto_eager_sync_ack_handler(ucp_worker_h worker, + const ucp_reply_hdr_t *rep_hdr) +{ + ucp_request_t *req; + + UCP_SEND_REQUEST_GET_BY_ID(&req, worker, rep_hdr->req_id, 1, return, + "EAGER_S ACK %p", rep_hdr); + + req->flags |= UCP_REQUEST_FLAG_SYNC_REMOTE_COMPLETED; + if (req->flags & 
UCP_REQUEST_FLAG_SYNC_LOCAL_COMPLETED) { + ucp_request_complete_send(req, rep_hdr->status); + } +} + +static UCS_F_ALWAYS_INLINE void +ucp_proto_eager_sync_bcopy_request_init(ucp_request_t *req) +{ + ucp_proto_msg_multi_request_init(req); + ucp_send_request_id_alloc(req); +} + +static ucs_status_t +ucp_proto_eager_sync_bcopy_multi_progress(uct_pending_req_t *uct_req) +{ + ucp_request_t *req = ucs_container_of(uct_req, ucp_request_t, send.uct); + + return ucp_proto_multi_bcopy_progress( + req, req->send.proto_config->priv, + ucp_proto_eager_sync_bcopy_request_init, + ucp_proto_eager_sync_bcopy_multi_send_func, + ucp_proto_eager_sync_bcopy_send_completed); +} + +static ucp_proto_t ucp_eager_sync_bcopy_multi_proto = { + .name = "egrsnc/multi/bcopy", + .flags = 0, + .init = ucp_proto_eager_sync_bcopy_multi_init, + .config_str = ucp_proto_multi_config_str, + .progress = ucp_proto_eager_sync_bcopy_multi_progress +}; +UCP_PROTO_REGISTER(&ucp_eager_sync_bcopy_multi_proto); + static ucs_status_t ucp_proto_eager_zcopy_multi_init(const ucp_proto_init_params_t *init_params) { @@ -157,14 +258,15 @@ ucp_proto_eager_zcopy_multi_init(const ucp_proto_init_params_t *init_params) .super.super = *init_params, .super.cfg_thresh = context->config.ext.zcopy_thresh, .super.cfg_priority = 30, - .super.min_frag_offs = UCP_PROTO_COMMON_OFFSET_INVALID, + .super.min_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.min_zcopy), .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.max_zcopy), + .super.hdr_size = sizeof(ucp_eager_first_hdr_t), .super.flags = UCP_PROTO_COMMON_INIT_FLAG_SEND_ZCOPY, .first.tl_cap_flags = UCT_IFACE_FLAG_AM_ZCOPY, .middle.tl_cap_flags = UCT_IFACE_FLAG_AM_ZCOPY, }; - return ucp_proto_eager_multi_init_common(¶ms); + return ucp_proto_eager_multi_init_common(¶ms, UCP_OP_ID_TAG_SEND); } static UCS_F_ALWAYS_INLINE ucs_status_t @@ -180,17 +282,17 @@ ucp_proto_eager_zcopy_multi_send_func(ucp_request_t *req, size_t hdr_size; uct_iov_t iov; - if (req->send.dt_iter.offset == 0) { + if (req->send.state.dt_iter.offset == 0) { am_id = UCP_AM_ID_EAGER_FIRST; hdr_size = sizeof(hdr.first); - ucp_eager_proto_set_first_hdr(req, &hdr.first); + ucp_proto_eager_set_first_hdr(req, &hdr.first); } else { am_id = UCP_AM_ID_EAGER_MIDDLE; hdr_size = sizeof(hdr.middle); - ucp_eager_proto_set_middle_hdr(req, &hdr.middle); + ucp_proto_eager_set_middle_hdr(req, &hdr.middle); } - ucp_datatype_iter_next_iov(&req->send.dt_iter, lpriv->super.memh_index, + ucp_datatype_iter_next_iov(&req->send.state.dt_iter, lpriv->super.memh_index, ucp_proto_multi_max_payload(req, lpriv, hdr_size), next_iter, &iov); return uct_ep_am_zcopy(req->send.ep->uct_eps[lpriv->super.lane], am_id, &hdr, @@ -199,8 +301,11 @@ ucp_proto_eager_zcopy_multi_send_func(ucp_request_t *req, static ucs_status_t ucp_proto_eager_zcopy_multi_progress(uct_pending_req_t *self) { - return ucp_proto_multi_zcopy_progress(self, - ucp_eager_multi_proto_request_init, + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + + return ucp_proto_multi_zcopy_progress(req, req->send.proto_config->priv, + ucp_proto_msg_multi_request_init, + UCT_MD_MEM_ACCESS_LOCAL_READ, ucp_proto_eager_zcopy_multi_send_func, ucp_proto_request_zcopy_completion); } diff --git a/src/ucp/tag/eager_rcv.c b/src/ucp/tag/eager_rcv.c index 905c77b4aeb..466b0f784a2 100644 --- a/src/ucp/tag/eager_rcv.c +++ b/src/ucp/tag/eager_rcv.c @@ -258,9 +258,11 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_offload_sync_ack_handler, ucs_queue_for_each_safe(sreq, iter, queue, send.tag_offload.queue) { 
if ((sreq->send.tag_offload.ssend_tag == rep_hdr->sender_tag) && + !(sreq->send.ep->flags & UCP_EP_FLAG_FAILED) && (ucp_ep_local_id(sreq->send.ep) == rep_hdr->ep_id)) { - ucp_tag_eager_sync_completion(sreq, UCP_REQUEST_FLAG_REMOTE_COMPLETED, - UCS_OK); + ucp_send_request_id_release(sreq); + ucp_tag_eager_sync_completion( + sreq, UCP_REQUEST_FLAG_SYNC_REMOTE_COMPLETED, UCS_OK); ucs_queue_del_iter(queue, iter); return UCS_OK; } @@ -277,10 +279,18 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_eager_sync_ack_handler, { ucp_worker_h worker = arg; ucp_reply_hdr_t *rep_hdr = data; - ucp_request_t *req = ucp_worker_extract_request_by_id(worker, - rep_hdr->req_id); + ucp_request_t *req; + + if (worker->context->config.ext.proto_enable) { + ucp_proto_eager_sync_ack_handler(worker, rep_hdr); + } else { + UCP_SEND_REQUEST_GET_BY_ID(&req, worker, rep_hdr->req_id, 1, + return UCS_OK, "EAGER_S ACK %p", rep_hdr); + ucp_tag_eager_sync_completion(req, + UCP_REQUEST_FLAG_SYNC_REMOTE_COMPLETED, + UCS_OK); + } - ucp_tag_eager_sync_completion(req, UCP_REQUEST_FLAG_REMOTE_COMPLETED, UCS_OK); return UCS_OK; } @@ -419,7 +429,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_tag_offload_unexp_eager, priv_len = sizeof(*priv); priv = ucp_tag_eager_offload_priv(tl_flags, data, length, ucp_eager_sync_hdr_t); - priv->req.req_id = UCP_REQUEST_ID_INVALID; + priv->req.req_id = UCS_PTR_MAP_KEY_INVALID; priv->req.ep_id = imm; priv->super.super.tag = stag; return ucp_eager_tagged_handler(worker, priv, length + priv_len, @@ -457,10 +467,12 @@ static void ucp_eager_dump(ucp_worker_h worker, uct_am_trace_type_t type, header_len = sizeof(*eager_mid_hdr); break; case UCP_AM_ID_EAGER_SYNC_ONLY: - ucs_assert(eagers_hdr->req.ep_id != UCP_EP_ID_INVALID); - snprintf(buffer, max, "EGRS tag %"PRIx64" ep_id 0x%"PRIx64" req_id 0x%"PRIx64, + ucs_assert(eagers_hdr->req.ep_id != UCS_PTR_MAP_KEY_INVALID); + snprintf(buffer, max, + "EGRS tag %" PRIx64 " ep_id 0x%" PRIx64 " req_id 0x%" PRIx64 + " len %zu", eagers_hdr->super.super.tag, eagers_hdr->req.ep_id, - eagers_hdr->req.req_id); + eagers_hdr->req.req_id, length - sizeof(*eagers_hdr)); header_len = sizeof(*eagers_hdr); break; case UCP_AM_ID_EAGER_SYNC_FIRST: diff --git a/src/ucp/tag/eager_single.c b/src/ucp/tag/eager_single.c index b4d0aac86f9..946fecf4121 100644 --- a/src/ucp/tag/eager_single.c +++ b/src/ucp/tag/eager_single.c @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2020-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
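The hunks above rename the completion flags to UCP_REQUEST_FLAG_SYNC_{LOCAL,REMOTE}_COMPLETED: a synchronous eager send completes only after both the local data transfer and the peer's ACK have been observed, and the two events may arrive in either order. A minimal self-contained sketch of that two-flag completion, with illustrative names:

    #include <stdio.h>

    enum {
        TOY_SYNC_LOCAL_COMPLETED  = 1u << 0, /* local transfer finished */
        TOY_SYNC_REMOTE_COMPLETED = 1u << 1  /* ACK received from peer */
    };

    #define TOY_SYNC_ALL (TOY_SYNC_LOCAL_COMPLETED | TOY_SYNC_REMOTE_COMPLETED)

    static unsigned toy_sync_event(unsigned flags, unsigned event)
    {
        flags |= event;
        if ((flags & TOY_SYNC_ALL) == TOY_SYNC_ALL) {
            printf("request complete\n"); /* only on the second event */
        }
        return flags;
    }

    int main(void)
    {
        unsigned flags = 0;

        flags = toy_sync_event(flags, TOY_SYNC_REMOTE_COMPLETED); /* ACK first */
        flags = toy_sync_event(flags, TOY_SYNC_LOCAL_COMPLETED);  /* then data */
        return 0;
    }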
*/ @@ -27,15 +27,16 @@ static ucs_status_t ucp_eager_short_progress(uct_pending_req_t *self) ucs_status_t status; status = uct_ep_am_short(req->send.ep->uct_eps[spriv->super.lane], - UCP_AM_ID_EAGER_ONLY, req->send.msg_proto.tag.tag, - req->send.dt_iter.type.contig.buffer, - req->send.dt_iter.length); + UCP_AM_ID_EAGER_ONLY, req->send.msg_proto.tag, + req->send.state.dt_iter.type.contig.buffer, + req->send.state.dt_iter.length); if (ucs_unlikely(status == UCS_ERR_NO_RESOURCE)) { req->send.lane = spriv->super.lane; /* for pending add */ return status; } - ucp_datatype_iter_cleanup(&req->send.dt_iter, UCS_BIT(UCP_DATATYPE_CONTIG)); + ucp_datatype_iter_cleanup(&req->send.state.dt_iter, + UCS_BIT(UCP_DATATYPE_CONTIG)); ucs_assert(status != UCS_INPROGRESS); ucp_request_complete_send(req, status); @@ -60,8 +61,9 @@ ucp_proto_eager_short_init(const ucp_proto_init_params_t *init_params) .tl_cap_flags = UCT_IFACE_FLAG_AM_SHORT }; - /* short protocol requires contig/host */ - if ((select_param->op_id != UCP_OP_ID_TAG_SEND) || + /* AM based proto can not be used if tag offload lane configured */ + if (!ucp_proto_eager_check_op_id(init_params, 0) || + /* short protocol requires contig/host */ (select_param->dt_class != UCP_DATATYPE_CONTIG) || !UCP_MEM_IS_HOST(select_param->mem_type)) { return UCS_ERR_UNSUPPORTED; @@ -86,9 +88,9 @@ static size_t ucp_eager_single_pack(void *dest, void *arg) ucp_datatype_iter_t next_iter; size_t packed_size; - ucs_assert(req->send.dt_iter.offset == 0); - hdr->super.tag = req->send.msg_proto.tag.tag; - packed_size = ucp_datatype_iter_next_pack(&req->send.dt_iter, + ucs_assert(req->send.state.dt_iter.offset == 0); + hdr->super.tag = req->send.msg_proto.tag; + packed_size = ucp_datatype_iter_next_pack(&req->send.state.dt_iter, req->send.ep->worker, SIZE_MAX, &next_iter, hdr + 1); return sizeof(*hdr) + packed_size; @@ -100,12 +102,9 @@ static ucs_status_t ucp_eager_bcopy_single_progress(uct_pending_req_t *self) send.uct); const ucp_proto_single_priv_t *spriv = req->send.proto_config->priv; - return ucp_proto_am_bcopy_single_progress(req, UCP_AM_ID_EAGER_ONLY, - spriv->super.lane, - ucp_eager_single_pack, req, - SIZE_MAX, - ucp_proto_request_bcopy_complete, - ucp_proto_request_bcopy_complete); + return ucp_proto_am_bcopy_single_progress( + req, UCP_AM_ID_EAGER_ONLY, spriv->super.lane, ucp_eager_single_pack, + req, SIZE_MAX, ucp_proto_request_bcopy_complete_success); } static ucs_status_t @@ -126,7 +125,8 @@ ucp_proto_eager_bcopy_single_init(const ucp_proto_init_params_t *init_params) .tl_cap_flags = UCT_IFACE_FLAG_AM_BCOPY }; - if (init_params->select_param->op_id != UCP_OP_ID_TAG_SEND) { + /* AM based proto can not be used if tag offload lane configured */ + if (!ucp_proto_eager_check_op_id(init_params, 0)) { return UCS_ERR_UNSUPPORTED; } @@ -152,7 +152,7 @@ ucp_proto_eager_zcopy_single_init(const ucp_proto_init_params_t *init_params) .super.overhead = 0, .super.cfg_thresh = context->config.ext.zcopy_thresh, .super.cfg_priority = 30, - .super.min_frag_offs = UCP_PROTO_COMMON_OFFSET_INVALID, + .super.min_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.min_zcopy), .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, cap.am.max_zcopy), .super.hdr_size = sizeof(ucp_tag_hdr_t), .super.flags = UCP_PROTO_COMMON_INIT_FLAG_SEND_ZCOPY | @@ -161,23 +161,36 @@ ucp_proto_eager_zcopy_single_init(const ucp_proto_init_params_t *init_params) .tl_cap_flags = UCT_IFACE_FLAG_AM_ZCOPY }; - if (init_params->select_param->op_id != UCP_OP_ID_TAG_SEND) { + /* AM based proto can not be used if tag 
offload lane configured */ + if (!ucp_proto_eager_check_op_id(init_params, 0)) { return UCS_ERR_UNSUPPORTED; } return ucp_proto_single_init(&params); } -static ucs_status_t ucp_eager_zcopy_single_progress(uct_pending_req_t *self) +static ucs_status_t +ucp_proto_eager_zcopy_send_func(ucp_request_t *req, + const ucp_proto_single_priv_t *spriv, + const uct_iov_t *iov) { - ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); ucp_eager_hdr_t hdr = { - .super.tag = req->send.msg_proto.tag.tag + .super.tag = req->send.msg_proto.tag }; - hdr.super.tag = req->send.msg_proto.tag.tag; - return ucp_proto_am_zcopy_single_progress(req, UCP_AM_ID_EAGER_ONLY, - &hdr, sizeof(ucp_eager_hdr_t)); + return uct_ep_am_zcopy(req->send.ep->uct_eps[spriv->super.lane], + UCP_AM_ID_EAGER_ONLY, &hdr, sizeof(hdr), iov, 1, 0, + &req->send.state.uct_comp); +} + +static ucs_status_t +ucp_proto_eager_zcopy_single_progress(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + + return ucp_proto_zcopy_single_progress(req, UCT_MD_MEM_ACCESS_LOCAL_READ, + ucp_proto_eager_zcopy_send_func, + "am_zcopy_only"); } static ucp_proto_t ucp_eager_zcopy_single_proto = { @@ -185,6 +198,6 @@ static ucp_proto_t ucp_eager_zcopy_single_proto = { .flags = 0, .init = ucp_proto_eager_zcopy_single_init, .config_str = ucp_proto_single_config_str, - .progress = ucp_eager_zcopy_single_progress, + .progress = ucp_proto_eager_zcopy_single_progress, }; UCP_PROTO_REGISTER(&ucp_eager_zcopy_single_proto); diff --git a/src/ucp/tag/eager_snd.c b/src/ucp/tag/eager_snd.c index 110af2030e7..76b32947aae 100644 --- a/src/ucp/tag/eager_snd.c +++ b/src/ucp/tag/eager_snd.c @@ -37,9 +37,9 @@ ucp_tag_pack_eager_common(ucp_request_t *req, void *dest, static size_t ucp_tag_pack_eager_only_dt(void *dest, void *arg) { ucp_eager_hdr_t *hdr = dest; - ucp_request_t *req = arg; + ucp_request_t *req = arg; - hdr->super.tag = req->send.msg_proto.tag.tag; + hdr->super.tag = req->send.msg_proto.tag; return ucp_tag_pack_eager_common(req, hdr + 1, req->send.length, sizeof(*hdr), 1); @@ -48,9 +48,9 @@ static size_t ucp_tag_pack_eager_only_dt(void *dest, void *arg) static size_t ucp_tag_pack_eager_sync_only_dt(void *dest, void *arg) { ucp_eager_sync_hdr_t *hdr = dest; - ucp_request_t *req = arg; + ucp_request_t *req = arg; - hdr->super.super.tag = req->send.msg_proto.tag.tag; + hdr->super.super.tag = req->send.msg_proto.tag; hdr->req.ep_id = ucp_send_request_get_ep_remote_id(req); hdr->req.req_id = ucp_send_request_get_id(req); @@ -69,7 +69,7 @@ static size_t ucp_tag_pack_eager_first_dt(void *dest, void *arg) length = ucp_ep_get_max_bcopy(req->send.ep, req->send.lane) - sizeof(*hdr); length = ucs_min(length, req->send.length); - hdr->super.super.tag = req->send.msg_proto.tag.tag; + hdr->super.super.tag = req->send.msg_proto.tag; hdr->total_len = req->send.length; hdr->msg_id = req->send.msg_proto.message_id; @@ -79,7 +79,7 @@ static size_t ucp_tag_pack_eager_sync_first_dt(void *dest, void *arg) { ucp_eager_sync_first_hdr_t *hdr = dest; - ucp_request_t *req = arg; + ucp_request_t *req = arg; size_t length; ucs_assert(req->send.lane == ucp_ep_get_am_lane(req->send.ep)); @@ -88,7 +88,7 @@ static size_t ucp_tag_pack_eager_sync_first_dt(void *dest, void *arg) req->send.lane) - sizeof(*hdr); length = ucs_min(length, req->send.length); - hdr->super.super.super.tag = req->send.msg_proto.tag.tag; + hdr->super.super.super.tag = req->send.msg_proto.tag; 
hdr->super.total_len = req->send.length; hdr->req.ep_id = ucp_send_request_get_ep_remote_id(req); hdr->super.msg_id = req->send.msg_proto.message_id; @@ -117,19 +117,15 @@ static size_t ucp_tag_pack_eager_middle_dt(void *dest, void *arg) static ucs_status_t ucp_tag_eager_contig_short(uct_pending_req_t *self) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); - ucp_ep_t *ep = req->send.ep; + ucp_ep_t *ep = req->send.ep; ucs_status_t status; req->send.lane = ucp_ep_get_am_lane(ep); - status = uct_ep_am_short(ep->uct_eps[req->send.lane], UCP_AM_ID_EAGER_ONLY, - req->send.msg_proto.tag.tag, req->send.buffer, - req->send.length); - if (status != UCS_OK) { - return status; - } - - ucp_request_complete_send(req, UCS_OK); - return UCS_OK; + status = uct_ep_am_short(ep->uct_eps[req->send.lane], + UCP_AM_ID_EAGER_ONLY, + req->send.msg_proto.tag, req->send.buffer, + req->send.length); + return ucp_am_short_handle_status_from_pending(req, status); } static ucs_status_t ucp_tag_eager_bcopy_single(uct_pending_req_t *self) @@ -156,7 +152,7 @@ static ucs_status_t ucp_tag_eager_zcopy_single(uct_pending_req_t *self) ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); ucp_eager_hdr_t hdr; - hdr.super.tag = req->send.msg_proto.tag.tag; + hdr.super.tag = req->send.msg_proto.tag; return ucp_do_am_zcopy_single(self, UCP_AM_ID_EAGER_ONLY, &hdr, sizeof(hdr), NULL, 0ul, ucp_proto_am_zcopy_req_complete); } @@ -167,7 +163,7 @@ static ucs_status_t ucp_tag_eager_zcopy_multi(uct_pending_req_t *self) ucp_eager_first_hdr_t first_hdr; ucp_eager_middle_hdr_t middle_hdr; - first_hdr.super.super.tag = req->send.msg_proto.tag.tag; + first_hdr.super.super.tag = req->send.msg_proto.tag; first_hdr.total_len = req->send.length; first_hdr.msg_id = req->send.msg_proto.message_id; middle_hdr.msg_id = req->send.msg_proto.message_id; @@ -198,8 +194,9 @@ const ucp_request_send_proto_t ucp_tag_eager_proto = { void ucp_tag_eager_sync_completion(ucp_request_t *req, uint32_t flag, ucs_status_t status) { - static const uint16_t all_completed = UCP_REQUEST_FLAG_LOCAL_COMPLETED | - UCP_REQUEST_FLAG_REMOTE_COMPLETED; + static const uint16_t all_completed = + UCP_REQUEST_FLAG_SYNC_LOCAL_COMPLETED | + UCP_REQUEST_FLAG_SYNC_REMOTE_COMPLETED; ucs_assertv(!(req->flags & flag), "req->flags=%d flag=%d", req->flags, flag); req->flags |= flag; @@ -231,8 +228,9 @@ void ucp_tag_eager_sync_zcopy_req_complete(ucp_request_t *req, ucs_status_t status) { ucs_assert(req->send.state.uct_comp.count == 0); + ucp_request_send_buffer_dereg(req); /* TODO register+lane change */ - ucp_tag_eager_sync_completion(req, UCP_REQUEST_FLAG_LOCAL_COMPLETED, + ucp_tag_eager_sync_completion(req, UCP_REQUEST_FLAG_SYNC_LOCAL_COMPLETED, status); } @@ -254,7 +252,7 @@ static ucs_status_t ucp_tag_eager_sync_zcopy_single(uct_pending_req_t *self) ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); ucp_eager_sync_hdr_t hdr; - hdr.super.super.tag = req->send.msg_proto.tag.tag; + hdr.super.super.tag = req->send.msg_proto.tag; hdr.req.ep_id = ucp_send_request_get_ep_remote_id(req); hdr.req.req_id = ucp_send_request_get_id(req); @@ -269,19 +267,25 @@ static ucs_status_t ucp_tag_eager_sync_zcopy_multi(uct_pending_req_t *self) ucp_eager_sync_first_hdr_t first_hdr; ucp_eager_middle_hdr_t middle_hdr; - first_hdr.super.super.super.tag = req->send.msg_proto.tag.tag; + if (req->send.state.dt.offset != 0) { + middle_hdr.msg_id = req->send.msg_proto.message_id; + middle_hdr.offset = req->send.state.dt.offset; + + return 
ucp_do_am_zcopy_multi(self, UCP_AM_ID_LAST, + UCP_AM_ID_EAGER_MIDDLE, NULL, 0, + &middle_hdr, sizeof(middle_hdr), NULL, 0ul, + ucp_tag_eager_sync_zcopy_req_complete, 1); + } + + first_hdr.super.super.super.tag = req->send.msg_proto.tag; first_hdr.super.total_len = req->send.length; first_hdr.req.ep_id = ucp_send_request_get_ep_remote_id(req); first_hdr.req.req_id = ucp_send_request_get_id(req); first_hdr.super.msg_id = req->send.msg_proto.message_id; - middle_hdr.msg_id = req->send.msg_proto.message_id; - middle_hdr.offset = req->send.state.dt.offset; - return ucp_do_am_zcopy_multi(self, - UCP_AM_ID_EAGER_SYNC_FIRST, - UCP_AM_ID_EAGER_MIDDLE, - &first_hdr, sizeof(first_hdr), - &middle_hdr, sizeof(middle_hdr), NULL, 0ul, + return ucp_do_am_zcopy_multi(self, UCP_AM_ID_EAGER_SYNC_FIRST, + UCP_AM_ID_LAST, &first_hdr, sizeof(first_hdr), + NULL, 0, NULL, 0ul, ucp_tag_eager_sync_zcopy_req_complete, 1); } @@ -299,6 +303,7 @@ void ucp_tag_eager_sync_send_ack(ucp_worker_h worker, void *hdr, uint16_t recv_f { ucp_request_hdr_t *reqhdr; ucp_request_t *req; + ucp_ep_h ep; ucs_assert(recv_flags & UCP_RECV_DESC_FLAG_EAGER_SYNC); @@ -315,8 +320,11 @@ void ucp_tag_eager_sync_send_ack(ucp_worker_h worker, void *hdr, uint16_t recv_f return; } - ucs_assert(reqhdr->req_id != UCP_REQUEST_ID_INVALID); - req = ucp_proto_ssend_ack_request_alloc(worker, reqhdr->ep_id); + ucs_assert(reqhdr->req_id != UCS_PTR_MAP_KEY_INVALID); + UCP_WORKER_GET_VALID_EP_BY_ID(&ep, worker, reqhdr->ep_id, return, + "ACK for sync-send"); + + req = ucp_proto_ssend_ack_request_alloc(worker, ep); if (req == NULL) { ucs_fatal("could not allocate request"); } diff --git a/src/ucp/tag/offload.c b/src/ucp/tag/offload.c index f4514428a57..eb7fed1c577 100644 --- a/src/ucp/tag/offload.c +++ b/src/ucp/tag/offload.c @@ -90,9 +90,10 @@ UCS_PROFILE_FUNC_VOID(ucp_tag_offload_tag_consumed, (self), /* Message is scattered to user buffer by the transport, complete the request */ UCS_PROFILE_FUNC_VOID(ucp_tag_offload_completed, - (self, stag, imm, length, status), + (self, stag, imm, length, inline_data, status), uct_tag_context_t *self, uct_tag_t stag, - uint64_t imm, size_t length, ucs_status_t status) + uint64_t imm, size_t length, void *inline_data, + ucs_status_t status) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, recv.uct_ctx); ucp_eager_sync_hdr_t hdr; @@ -107,7 +108,7 @@ UCS_PROFILE_FUNC_VOID(ucp_tag_offload_completed, if (ucs_unlikely(imm)) { hdr.req.ep_id = imm; - hdr.req.req_id = UCP_REQUEST_ID_INVALID; /* unused */ + hdr.req.req_id = UCS_PTR_MAP_KEY_INVALID; /* unused */ hdr.super.super.tag = stag; /* Sync send - need to send a reply */ @@ -117,7 +118,10 @@ UCS_PROFILE_FUNC_VOID(ucp_tag_offload_completed, UCP_RECV_DESC_FLAG_EAGER_OFFLOAD); } - if (req->recv.tag.rdesc != NULL) { + if (ucs_unlikely(inline_data != NULL)) { + status = ucp_request_recv_data_unpack(req, inline_data, length, 0, 1); + ucp_tag_offload_release_buf(req); + } else if (req->recv.tag.rdesc != NULL) { status = ucp_request_recv_data_unpack(req, req->recv.tag.rdesc + 1, length, 0, 1); ucs_mpool_put_inline(req->recv.tag.rdesc); @@ -133,10 +137,10 @@ UCS_PROFILE_FUNC_VOID(ucp_tag_offload_completed, /* RNDV request matched by the transport. 
Need to proceed with SW based RNDV */ UCS_PROFILE_FUNC_VOID(ucp_tag_offload_rndv_cb, - (self, stag, header, header_length, status), + (self, stag, header, header_length, status, flags), uct_tag_context_t *self, uct_tag_t stag, const void *header, unsigned header_length, - ucs_status_t status) + ucs_status_t status, unsigned flags) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, recv.uct_ctx); void *header_host_copy; @@ -151,8 +155,9 @@ UCS_PROFILE_FUNC_VOID(ucp_tag_offload_rndv_cb, ucs_assert(header_length >= sizeof(ucp_rndv_rts_hdr_t)); - if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(req->recv.mem_type)) { - ucp_tag_rndv_matched(req->recv.worker, req, header); + if (UCP_MEM_IS_HOST(req->recv.mem_type) || + (flags & UCT_TAG_RECV_CB_INLINE_DATA)) { + ucp_tag_rndv_matched(req->recv.worker, req, header, header_length); } else { /* SW rendezvous request is stored in the user buffer (temporarily) when matched. If user buffer allocated on GPU memory, need to "pack" @@ -160,7 +165,8 @@ UCS_PROFILE_FUNC_VOID(ucp_tag_offload_rndv_cb, header_host_copy = ucs_alloca(header_length); ucp_mem_type_pack(req->recv.worker, header_host_copy, header, header_length, req->recv.mem_type); - ucp_tag_rndv_matched(req->recv.worker, req, header_host_copy); + ucp_tag_rndv_matched(req->recv.worker, req, header_host_copy, + header_length); } out: @@ -177,8 +183,9 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_tag_offload_unexp_rndv, ucp_worker_t *worker = iface->worker; const void *uct_rkeys[] = { rkey_buf }; const ucp_tag_offload_unexp_rndv_hdr_t *rndv_hdr; - ucp_tag_rndv_rts_hdr_t *dummy_rts; - ucp_md_index_t md_index; + ucp_rndv_rts_hdr_t *dummy_rts; + ucp_tag_hdr_t *tag; + ucp_md_map_t md_map; size_t dummy_rts_size; size_t rkey_size; @@ -188,26 +195,28 @@ UCS_PROFILE_FUNC(ucs_status_t, ucp_tag_offload_unexp_rndv, rndv_hdr = hdr; /* Calculate size for dummy (on-stack) RTS packet */ - md_index = rndv_hdr->md_index; - rkey_size = ucp_rkey_packed_size(worker->context, UCS_BIT(md_index)); + md_map = UCS_BIT(rndv_hdr->md_index); + rkey_size = ucp_rkey_packed_size(worker->context, md_map, + UCS_SYS_DEVICE_ID_UNKNOWN, 0); dummy_rts_size = sizeof(*dummy_rts) + rkey_size; /* Build the dummy RTS packet, copy meta-data from unexpected rndv header * and remote key from rkey_buf. */ - dummy_rts = ucs_alloca(dummy_rts_size); - dummy_rts->tag.tag = stag; - dummy_rts->super.sreq.ep_id = rndv_hdr->ep_id; - dummy_rts->super.sreq.req_id = rndv_hdr->req_id; - dummy_rts->super.address = remote_addr; - dummy_rts->super.size = length; - dummy_rts->super.flags = UCP_RNDV_RTS_FLAG_TAG; - - ucp_rkey_packed_copy(worker->context, UCS_BIT(md_index), - UCS_MEMORY_TYPE_HOST, dummy_rts + 1, uct_rkeys); + dummy_rts = ucs_alloca(dummy_rts_size); + dummy_rts->sreq.ep_id = rndv_hdr->ep_id; + dummy_rts->sreq.req_id = rndv_hdr->req_id; + dummy_rts->address = remote_addr; + dummy_rts->size = length; + dummy_rts->opcode = UCP_RNDV_RTS_TAG_OK; + tag = ucp_tag_hdr_from_rts(dummy_rts); + tag->tag = stag; + + ucp_rkey_packed_copy(worker->context, md_map, UCS_MEMORY_TYPE_HOST, + dummy_rts + 1, uct_rkeys); UCP_WORKER_STAT_TAG_OFFLOAD(worker, RX_UNEXP_RNDV); - ucp_tag_rndv_process_rts(worker, &dummy_rts->super, dummy_rts_size, 0); + ucp_tag_rndv_process_rts(worker, dummy_rts, dummy_rts_size, 0); } else { /* Unexpected tag offload rndv request. Sender buffer is either non-contig or it's length > rndv.max_zcopy capability of tag lane. 
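The unexpected-rendezvous hunk above now builds its on-stack dummy RTS as a plain `ucp_rndv_rts_hdr_t`: the tag is written through `ucp_tag_hdr_from_rts()` into the header itself rather than into a trailing `ucp_tag_hdr_t`, and the packed remote key is copied directly after the fixed header. Below is a minimal, self-contained sketch of that header-plus-packed-tail construction; the struct layout, field names, and values are hypothetical stand-ins for illustration, not the real UCX wire format (the real code sizes the buffer with `ucp_rkey_packed_size()` and allocates it with `ucs_alloca()`):

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for ucp_rndv_rts_hdr_t; not the real UCX layout */
typedef struct {
    uint64_t ep_id;    /* sender endpoint ID (from the unexpected rndv hdr) */
    uint64_t req_id;   /* sender request ID */
    uint64_t address;  /* remote buffer address */
    uint64_t size;     /* rendezvous payload length */
    uint8_t  opcode;   /* e.g. a "tag RTS" opcode */
    uint64_t tag;      /* real code writes this via ucp_tag_hdr_from_rts() */
} rts_hdr_t;

int main(void)
{
    uint8_t    rkey_buf[16] = {0};          /* pretend packed remote key */
    size_t     rts_size     = sizeof(rts_hdr_t) + sizeof(rkey_buf);
    rts_hdr_t *rts          = calloc(1, rts_size); /* real code: ucs_alloca */

    rts->ep_id   = 0x100;
    rts->req_id  = 0x200;
    rts->address = 0x7f0000000000ull;
    rts->size    = 65536;
    rts->opcode  = 1;
    rts->tag     = 0xdeadbeefull;           /* the matched tag */
    /* variable-size tail: packed rkey lands right after the fixed header */
    memcpy(rts + 1, rkey_buf, sizeof(rkey_buf));

    printf("dummy RTS: %zu bytes (%zu header + %zu rkey)\n",
           rts_size, sizeof(rts_hdr_t), sizeof(rkey_buf));
    free(rts);
    return 0;
}
```

The pattern also shows why the rkey size must be computed before the buffer is allocated: the header is fixed-size, but the packed-rkey tail varies with the memory-domain map being advertised.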
@@ -274,7 +283,7 @@ ucp_tag_offload_do_post(ucp_request_t *req) /* Do not use bounce buffer for receives to GPU memory to avoid * cost of h2d transfers (i.e. cuda_copy from staging to dest memory). */ if ((length >= worker->tm.offload.zcopy_thresh) || - !UCP_MEM_IS_ACCESSIBLE_FROM_CPU(req->recv.mem_type)) { + !UCP_MEM_IS_HOST(req->recv.mem_type)) { if (length > wiface->attr.cap.tag.recv.max_zcopy) { /* Post maximum allowed length. If sender sends smaller message * (which is allowed per MPI standard), max recv should fit it. @@ -464,7 +473,7 @@ static ucs_status_t ucp_tag_offload_eager_short(uct_pending_req_t *self) req->send.lane = ucp_ep_get_tag_lane(ep); status = uct_ep_tag_eager_short(ep->uct_eps[req->send.lane], - req->send.msg_proto.tag.tag, + req->send.msg_proto.tag, req->send.buffer, req->send.length); if (status == UCS_OK) { @@ -483,8 +492,8 @@ ucp_do_tag_offload_bcopy(uct_pending_req_t *self, uint64_t imm_data, req->send.lane = ucp_ep_get_tag_lane(ep); packed_len = uct_ep_tag_eager_bcopy(ep->uct_eps[req->send.lane], - req->send.msg_proto.tag.tag, - imm_data, pack_cb, req, 0); + req->send.msg_proto.tag, imm_data, + pack_cb, req, 0); if (packed_len < 0) { return (ucs_status_t)packed_len; } @@ -510,9 +519,8 @@ ucp_do_tag_offload_zcopy(uct_pending_req_t *self, uint64_t imm_data, ucp_ep_md_index(ep, req->send.lane), NULL); status = uct_ep_tag_eager_zcopy(ep->uct_eps[req->send.lane], - req->send.msg_proto.tag.tag, - imm_data, iov, iovcnt, 0, - &req->send.state.uct_comp); + req->send.msg_proto.tag, imm_data, iov, + iovcnt, 0, &req->send.state.uct_comp); return ucp_am_zcopy_single_handle_status(req, &dt_state, status, complete); } @@ -538,6 +546,7 @@ ucs_status_t ucp_tag_offload_sw_rndv(uct_pending_req_t *self) ucp_rndv_rts_hdr_t *rndv_rts_hdr; unsigned rndv_hdr_len; size_t packed_len; + ucs_status_t status; ucs_assert((UCP_DT_IS_CONTIG(req->send.datatype) && (req->send.length > ucp_ep_config(ep)->tag.offload.max_rndv_zcopy)) || @@ -551,15 +560,18 @@ ucs_status_t ucp_tag_offload_sw_rndv(uct_pending_req_t *self) rndv_rts_hdr = ucs_alloca(rndv_hdr_len); packed_len = ucp_tag_rndv_rts_pack(rndv_rts_hdr, req); - return uct_ep_tag_rndv_request(ep->uct_eps[req->send.lane], - req->send.msg_proto.tag.tag, - rndv_rts_hdr, packed_len, 0); + status = uct_ep_tag_rndv_request(ep->uct_eps[req->send.lane], + req->send.msg_proto.tag, rndv_rts_hdr, + packed_len, 0); + return ucp_rndv_rts_handle_status_from_pending(req, status); } static void ucp_tag_offload_rndv_zcopy_completion(uct_completion_t *self) { ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.state.uct_comp); + + ucp_send_request_id_release(req); ucp_proto_am_zcopy_req_complete(req, self->status); } @@ -570,18 +582,18 @@ ucs_status_t ucp_tag_offload_rndv_zcopy(uct_pending_req_t *self) size_t max_iov = ucp_ep_config(ep)->tag.eager.max_iov; uct_iov_t *iov = ucs_alloca(max_iov * sizeof(uct_iov_t)); size_t iovcnt = 0; - ucp_md_index_t md_index; ucp_dt_state_t dt_state; void *rndv_op; + ucs_status_t status; - md_index = ucp_ep_md_index(ep, req->send.lane); ucp_tag_offload_unexp_rndv_hdr_t rndv_hdr = { .ep_id = ucp_send_request_get_ep_remote_id(req), .req_id = ucp_send_request_get_id(req), - .md_index = md_index + .md_index = ucp_ep_md_index(ep, req->send.lane) }; + ucs_assert(!ucp_ep_use_indirect_id(req->send.ep)); dt_state = req->send.state.dt; UCS_STATIC_ASSERT(sizeof(ucp_rsc_index_t) <= sizeof(rndv_hdr.md_index)); @@ -592,12 +604,14 @@ ucs_status_t ucp_tag_offload_rndv_zcopy(uct_pending_req_t *self) ucp_ep_md_index(ep, 
req->send.lane), NULL); rndv_op = uct_ep_tag_rndv_zcopy(ep->uct_eps[req->send.lane], - req->send.msg_proto.tag.tag, &rndv_hdr, + req->send.msg_proto.tag, &rndv_hdr, sizeof(rndv_hdr), iov, iovcnt, 0, &req->send.state.uct_comp); - if (UCS_PTR_IS_ERR(rndv_op)) { - return UCS_PTR_STATUS(rndv_op); + if (ucs_unlikely(UCS_PTR_IS_ERR(rndv_op))) { + status = UCS_PTR_STATUS(rndv_op); + return ucp_rndv_rts_handle_status_from_pending(req, status); } + ucp_request_send_state_advance(req, &dt_state, UCP_REQUEST_SEND_PROTO_RNDV_GET, UCS_INPROGRESS); @@ -679,7 +693,7 @@ const ucp_request_send_proto_t ucp_tag_offload_proto = { static UCS_F_ALWAYS_INLINE void ucp_tag_offload_sync_posted(ucp_worker_t *worker, ucp_request_t *req) { - req->send.tag_offload.ssend_tag = req->send.msg_proto.tag.tag; + req->send.tag_offload.ssend_tag = req->send.msg_proto.tag; ucs_queue_push(&worker->tm.offload.sync_reqs, &req->send.tag_offload.queue); } @@ -718,10 +732,14 @@ void ucp_tag_offload_sync_send_ack(ucp_worker_h worker, ucs_ptr_map_key_t ep_id, ucp_tag_t stag, uint16_t recv_flags) { ucp_request_t *req; + ucp_ep_h ep; ucs_assert(recv_flags & UCP_RECV_DESC_FLAG_EAGER_OFFLOAD); - req = ucp_proto_ssend_ack_request_alloc(worker, ep_id); + UCP_WORKER_GET_VALID_EP_BY_ID(&ep, worker, ep_id, return, + "ACK for sync-send"); + + req = ucp_proto_ssend_ack_request_alloc(worker, ep); if (req == NULL) { ucs_fatal("could not allocate request"); } diff --git a/src/ucp/tag/offload.h b/src/ucp/tag/offload.h index ad35f2eca90..ae724bdfca4 100644 --- a/src/ucp/tag/offload.h +++ b/src/ucp/tag/offload.h @@ -160,5 +160,4 @@ ucp_tag_offload_unexp(ucp_worker_iface_t *wiface, ucp_tag_t tag, size_t length) } } - #endif diff --git a/src/ucp/tag/offload/eager.c b/src/ucp/tag/offload/eager.c new file mode 100644 index 00000000000..325e62759f5 --- /dev/null +++ b/src/ucp/tag/offload/eager.c @@ -0,0 +1,203 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include + + +static ucs_status_t +ucp_proto_eager_tag_offload_short_progress(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_ep_t *ep = req->send.ep; + const ucp_proto_single_priv_t *spriv = req->send.proto_config->priv; + ucs_status_t status; + + status = uct_ep_tag_eager_short(ep->uct_eps[spriv->super.lane], + req->send.msg_proto.tag, + req->send.state.dt_iter.type.contig.buffer, + req->send.state.dt_iter.length); + if (status == UCS_ERR_NO_RESOURCE) { + req->send.lane = spriv->super.lane; /* for pending add */ + return status; + } + + ucp_datatype_iter_cleanup(&req->send.state.dt_iter, + UCS_BIT(UCP_DATATYPE_CONTIG)); + + ucs_assert(status != UCS_INPROGRESS); + ucp_request_complete_send(req, status); + + return UCS_OK; +} + +static ucs_status_t ucp_proto_eager_tag_offload_short_init( + const ucp_proto_init_params_t *init_params) +{ + const ucp_proto_select_param_t *select_param = init_params->select_param; + ucp_proto_single_init_params_t params = { + .super.super = *init_params, + .super.latency = -150e-9, /* no extra memory access to fetch data */ + .super.overhead = 0, + .super.cfg_thresh = UCS_MEMUNITS_AUTO, + .super.cfg_priority = 0, + .super.min_frag_offs = UCP_PROTO_COMMON_OFFSET_INVALID, + .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, + cap.tag.eager.max_short), + .super.hdr_size = sizeof(ucp_tag_t), + .super.flags = UCP_PROTO_COMMON_INIT_FLAG_MAX_FRAG | + UCP_PROTO_COMMON_INIT_FLAG_RECV_ZCOPY, + .lane_type = UCP_LANE_TYPE_TAG, + .tl_cap_flags = UCT_IFACE_FLAG_TAG_EAGER_SHORT + }; + + if (!ucp_proto_eager_check_op_id(init_params, 1) || + /* short protocol requires contig/host */ + (select_param->dt_class != UCP_DATATYPE_CONTIG) || + !UCP_MEM_IS_HOST(select_param->mem_type)) { + return UCS_ERR_UNSUPPORTED; + } + + return ucp_proto_single_init(&params); +} + +static ucp_proto_t ucp_eager_tag_offload_short_proto = { + .name = "egr/offload/short", + .flags = UCP_PROTO_FLAG_TAG_SHORT, + .init = ucp_proto_eager_tag_offload_short_init, + .config_str = ucp_proto_single_config_str, + .progress = ucp_proto_eager_tag_offload_short_progress +}; +UCP_PROTO_REGISTER(&ucp_eager_tag_offload_short_proto); + +static size_t ucp_eager_tag_offload_pack(void *dest, void *arg) +{ + ucp_request_t *req = arg; + ucp_datatype_iter_t next_iter; + + ucs_assert(req->send.state.dt_iter.offset == 0); + + return ucp_datatype_iter_next_pack(&req->send.state.dt_iter, + req->send.ep->worker, SIZE_MAX, + &next_iter, dest); +} + +static ucs_status_t +ucp_proto_eager_tag_offload_bcopy_progress(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + const ucp_proto_single_priv_t *spriv = req->send.proto_config->priv; + ssize_t packed_len; + ucs_status_t status; + + packed_len = uct_ep_tag_eager_bcopy(req->send.ep->uct_eps[spriv->super.lane], + req->send.msg_proto.tag, 0ul, + ucp_eager_tag_offload_pack, req, 0); + status = ucs_likely(packed_len >= 0) ?
UCS_OK : packed_len; + + return ucp_proto_single_status_handle( + req, ucp_proto_request_bcopy_complete_success, spriv->super.lane, + status); +} + +static ucs_status_t ucp_proto_eager_tag_offload_bcopy_init( + const ucp_proto_init_params_t *init_params) +{ + ucp_context_t *context = init_params->worker->context; + ucp_proto_single_init_params_t params = { + .super.super = *init_params, + .super.latency = 0, + .super.overhead = 5e-9, + .super.cfg_thresh = context->config.ext.bcopy_thresh, + .super.cfg_priority = 20, + .super.min_frag_offs = UCP_PROTO_COMMON_OFFSET_INVALID, + .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, + cap.tag.eager.max_bcopy), + .super.hdr_size = sizeof(ucp_tag_t), + .super.flags = UCP_PROTO_COMMON_INIT_FLAG_MAX_FRAG | + UCP_PROTO_COMMON_INIT_FLAG_RECV_ZCOPY, + .lane_type = UCP_LANE_TYPE_TAG, + .tl_cap_flags = UCT_IFACE_FLAG_TAG_EAGER_BCOPY + }; + + /* offload proto can not be used if no tag offload lane configured */ + if (!ucp_proto_eager_check_op_id(init_params, 1)) { + return UCS_ERR_UNSUPPORTED; + } + + return ucp_proto_single_init(&params); +} + +static ucp_proto_t ucp_eager_bcopy_single_proto = { + .name = "egr/offload/bcopy", + .flags = 0, + .init = ucp_proto_eager_tag_offload_bcopy_init, + .config_str = ucp_proto_single_config_str, + .progress = ucp_proto_eager_tag_offload_bcopy_progress +}; +UCP_PROTO_REGISTER(&ucp_eager_bcopy_single_proto); + +static ucs_status_t ucp_proto_eager_tag_offload_zcopy_init( + const ucp_proto_init_params_t *init_params) +{ + ucp_context_t *context = init_params->worker->context; + ucp_proto_single_init_params_t params = { + .super.super = *init_params, + .super.latency = 0, + .super.overhead = 0, + .super.cfg_thresh = context->config.ext.zcopy_thresh, + .super.cfg_priority = 30, + .super.min_frag_offs = UCP_PROTO_COMMON_OFFSET_INVALID, + .super.max_frag_offs = ucs_offsetof(uct_iface_attr_t, + cap.tag.eager.max_zcopy), + .super.hdr_size = sizeof(ucp_tag_t), + .super.flags = UCP_PROTO_COMMON_INIT_FLAG_SEND_ZCOPY | + UCP_PROTO_COMMON_INIT_FLAG_RECV_ZCOPY | + UCP_PROTO_COMMON_INIT_FLAG_MAX_FRAG, + .lane_type = UCP_LANE_TYPE_TAG, + .tl_cap_flags = UCT_IFACE_FLAG_TAG_EAGER_ZCOPY + }; + + /* offload proto can not be used if no tag offload lane configured */ + if (!ucp_proto_eager_check_op_id(init_params, 1)) { + return UCS_ERR_UNSUPPORTED; + } + + return ucp_proto_single_init(&params); +} + +static ucs_status_t +ucp_proto_tag_offload_zcopy_send_func(ucp_request_t *req, + const ucp_proto_single_priv_t *spriv, + const uct_iov_t *iov) +{ + return uct_ep_tag_eager_zcopy(req->send.ep->uct_eps[spriv->super.lane], + req->send.msg_proto.tag, 0ul, iov, 1, 0, + &req->send.state.uct_comp); +} + +static ucs_status_t +ucp_proto_eager_tag_offload_zcopy_progress(uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + + return ucp_proto_zcopy_single_progress(req, UCT_MD_MEM_ACCESS_LOCAL_READ, + ucp_proto_tag_offload_zcopy_send_func, + "tag_eager_zcopy"); +} + +static ucp_proto_t ucp_eager_zcopy_single_proto = { + .name = "egr/offload/zcopy", + .flags = 0, + .init = ucp_proto_eager_tag_offload_zcopy_init, + .config_str = ucp_proto_single_config_str, + .progress = ucp_proto_eager_tag_offload_zcopy_progress, +}; +UCP_PROTO_REGISTER(&ucp_eager_zcopy_single_proto); diff --git a/src/ucp/tag/probe.c b/src/ucp/tag/probe.c index ff4db815d40..971c57fd1f6 100644 --- a/src/ucp/tag/probe.c +++ b/src/ucp/tag/probe.c @@ -28,7 +28,6 @@ UCS_PROFILE_FUNC(ucp_tag_message_h, ucp_tag_probe_nb,
UCP_CONTEXT_CHECK_FEATURE_FLAGS(worker->context, UCP_FEATURE_TAG, return NULL); - UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); ucs_trace_req("probe_nb tag %"PRIx64"/%"PRIx64" remove=%d", tag, tag_mask, rem); @@ -44,11 +43,10 @@ UCS_PROFILE_FUNC(ucp_tag_message_h, ucp_tag_probe_nb, info->length = ((ucp_eager_first_hdr_t*)(rdesc + 1))->total_len; } else { ucs_assert(flags & UCP_RECV_DESC_FLAG_RNDV); - info->length = ucp_tag_rndv_rts_from_rdesc(rdesc)->super.size; + info->length = ucp_tag_rndv_rts_from_rdesc(rdesc)->size; } } - UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); return rdesc; } diff --git a/src/ucp/tag/tag_recv.c b/src/ucp/tag/tag_recv.c index c0a19d1aa18..e2e3bc734d6 100644 --- a/src/ucp/tag/tag_recv.c +++ b/src/ucp/tag/tag_recv.c @@ -39,8 +39,6 @@ ucp_tag_recv_common(ucp_worker_h worker, void *buffer, size_t count, ucp_trace_req(req, "%s buffer %p dt 0x%lx count %zu tag %"PRIx64"/%"PRIx64, debug_name, buffer, datatype, count, tag, tag_mask); - memory_type = ucp_request_param_mem_type(param); - /* First, check the fast path case - single fragment * in this case avoid initializing most of request fields * */ @@ -59,12 +57,13 @@ ucp_tag_recv_common(ucp_worker_h worker, void *buffer, size_t count, recv_len = rdesc->length - hdr_len; req->recv.tag.info.sender_tag = ucp_rdesc_get_tag(rdesc); req->recv.tag.info.length = recv_len; - memory_type = ucp_get_memory_type(worker->context, buffer, - recv_len, memory_type); - status = ucp_dt_unpack_only(worker, buffer, count, datatype, memory_type, - UCS_PTR_BYTE_OFFSET(rdesc + 1, hdr_len), - recv_len, 1); + memory_type = ucp_request_get_memory_type(worker->context, buffer, + recv_len, param); + status = ucp_dt_unpack_only(worker, buffer, count, datatype, + memory_type, + UCS_PTR_BYTE_OFFSET(rdesc + 1, hdr_len), + recv_len, 1); ucp_recv_desc_release(rdesc); req->status = status; @@ -95,13 +94,19 @@ ucp_tag_recv_common(ucp_worker_h worker, void *buffer, size_t count, req->flags = common_flags | req_flags; req->recv.length = ucp_dt_length(datatype, count, buffer, &req->recv.state); - req->recv.mem_type = ucp_get_memory_type(worker->context, buffer, - req->recv.length, memory_type); + req->recv.mem_type = ucp_request_get_memory_type(worker->context, buffer, + req->recv.length, param); + req->recv.tag.tag = tag; req->recv.tag.tag_mask = tag_mask; if (param->op_attr_mask & UCP_OP_ATTR_FIELD_CALLBACK) { req->recv.tag.cb = param->cb.recv; - req->user_data = param->user_data; + + if (param->op_attr_mask & UCP_OP_ATTR_FIELD_USER_DATA) { + req->user_data = param->user_data; + } else { + req->user_data = NULL; + } } if (ucs_log_is_enabled(UCS_LOG_LEVEL_TRACE_REQ)) { @@ -126,7 +131,8 @@ ucp_tag_recv_common(ucp_worker_h worker, void *buffer, size_t count, /* Check rendezvous case */ if (ucs_unlikely(rdesc->flags & UCP_RECV_DESC_FLAG_RNDV)) { - ucp_tag_rndv_matched(worker, req, ucp_tag_rndv_rts_from_rdesc(rdesc)); + ucp_tag_rndv_matched(worker, req, ucp_tag_rndv_rts_from_rdesc(rdesc), + rdesc->length); UCP_WORKER_STAT_RNDV(worker, UNEXP, 1); ucp_recv_desc_release(rdesc); return req + 1; @@ -205,6 +211,8 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_recv_nbx, UCP_CONTEXT_CHECK_FEATURE_FLAGS(worker->context, UCP_FEATURE_TAG, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); + UCP_REQUEST_CHECK_PARAM(param); + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); datatype = ucp_request_param_datatype(param); @@ -221,34 +229,46 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_recv_nbx, return ret; } -UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_msg_recv_nb, - (worker, 
buffer, count, datatype, message, cb), - ucp_worker_h worker, void *buffer, size_t count, - ucp_datatype_t datatype, ucp_tag_message_h message, - ucp_tag_recv_callback_t cb) +ucs_status_ptr_t ucp_tag_msg_recv_nb(ucp_worker_h worker, void *buffer, size_t count, + ucp_datatype_t datatype, ucp_tag_message_h message, + ucp_tag_recv_callback_t cb) { ucp_request_param_t param = { .op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FLAG_NO_IMM_CMPL, + .datatype = datatype, .cb.recv = (ucp_tag_recv_nbx_callback_t)cb }; + + return ucp_tag_msg_recv_nbx(worker, buffer, count, message, &param); +} + +UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_msg_recv_nbx, + (worker, buffer, count, message, param), + ucp_worker_h worker, void *buffer, size_t count, + ucp_tag_message_h message, const ucp_request_param_t *param) +{ ucp_recv_desc_t *rdesc = message; ucs_status_ptr_t ret; ucp_request_t *req; + ucp_datatype_t datatype; UCP_CONTEXT_CHECK_FEATURE_FLAGS(worker->context, UCP_FEATURE_TAG, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); + UCP_REQUEST_CHECK_PARAM(param); + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); - req = ucp_request_get(worker); - if (ucs_likely(req != NULL)) { - ret = ucp_tag_recv_common(worker, buffer, count, datatype, - ucp_rdesc_get_tag(rdesc), UCP_TAG_MASK_FULL, - req, rdesc, &param, "msg_recv_nb"); - } else { - ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); - } + req = ucp_request_get_param(worker, param, + {ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + goto out;}); + datatype = ucp_request_param_datatype(param); + ret = ucp_tag_recv_common(worker, buffer, count, datatype, + ucp_rdesc_get_tag(rdesc), UCP_TAG_MASK_FULL, + req, rdesc, param, "msg_recv_nbx"); +out: UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); return ret; } diff --git a/src/ucp/tag/tag_rndv.c b/src/ucp/tag/tag_rndv.c index 6a9779f183b..20e5f547d34 100644 --- a/src/ucp/tag/tag_rndv.c +++ b/src/ucp/tag/tag_rndv.c @@ -11,39 +11,42 @@ #include "tag_rndv.h" #include "tag_match.inl" +#include +#include + void ucp_tag_rndv_matched(ucp_worker_h worker, ucp_request_t *rreq, - const ucp_tag_rndv_rts_hdr_t *rts_hdr) + const ucp_rndv_rts_hdr_t *rts_hdr, size_t hdr_length) { - ucs_assert(rts_hdr->super.flags & UCP_RNDV_RTS_FLAG_TAG); - /* rreq is the receive request on the receiver's side */ - rreq->recv.tag.info.sender_tag = rts_hdr->tag.tag; - rreq->recv.tag.info.length = rts_hdr->super.size; + ucs_assert(ucp_rndv_rts_is_tag(rts_hdr)); + rreq->recv.tag.info.sender_tag = ucp_tag_hdr_from_rts(rts_hdr)->tag; + rreq->recv.tag.info.length = rts_hdr->size; - ucp_rndv_receive(worker, rreq, &rts_hdr->super, rts_hdr + 1); + if (worker->context->config.ext.proto_enable) { + ucp_proto_rndv_receive(worker, rreq, rts_hdr, rts_hdr + 1, + hdr_length - sizeof(*rts_hdr)); + } else { + ucp_rndv_receive(worker, rreq, rts_hdr, rts_hdr + 1); + } } ucs_status_t ucp_tag_rndv_process_rts(ucp_worker_h worker, - ucp_rndv_rts_hdr_t *common_rts_hdr, + ucp_rndv_rts_hdr_t *rts_hdr, size_t length, unsigned tl_flags) { - ucp_tag_rndv_rts_hdr_t *rts_hdr = ucs_derived_of(common_rts_hdr, - ucp_tag_rndv_rts_hdr_t); ucp_recv_desc_t *rdesc; - ucp_tag_t *rdesc_hdr; ucp_request_t *rreq; ucs_status_t status; - ucs_assert(rts_hdr->super.flags & UCP_RNDV_RTS_FLAG_TAG); + ucs_assert(ucp_rndv_rts_is_tag(rts_hdr)); - rreq = ucp_tag_exp_search(&worker->tm, rts_hdr->tag.tag); + rreq = ucp_tag_exp_search(&worker->tm, ucp_tag_hdr_from_rts(rts_hdr)->tag); if (rreq != NULL) { - ucp_tag_rndv_matched(worker, rreq, rts_hdr); - /* Cancel req in transport if it was
offloaded, because it arrived as unexpected */ ucp_tag_offload_try_cancel(worker, rreq, UCP_TAG_OFFLOAD_CANCEL_FORCE); + ucp_tag_rndv_matched(worker, rreq, rts_hdr, length); UCP_WORKER_STAT_RNDV(worker, EXP, 1); return UCS_OK; @@ -51,17 +54,14 @@ ucs_status_t ucp_tag_rndv_process_rts(ucp_worker_h worker, ucs_assert(length >= sizeof(*rts_hdr)); - /* Include tag before the header as well, to keep ucp_rdesc_get_tag() fast - * (and therefore keep fast search by ucp_tag_unexp_search()) - */ - status = ucp_recv_desc_init(worker, rts_hdr, length, sizeof(*rdesc_hdr), - tl_flags, sizeof(*rts_hdr) + sizeof(*rdesc_hdr), - UCP_RECV_DESC_FLAG_RNDV, - sizeof(*rdesc_hdr), &rdesc); + status = ucp_recv_desc_init(worker, rts_hdr, length, 0, tl_flags, + sizeof(*rts_hdr), UCP_RECV_DESC_FLAG_RNDV, 0, + &rdesc); if (!UCS_STATUS_IS_ERR(status)) { - rdesc_hdr = (ucp_tag_t*)(rdesc + 1); - *rdesc_hdr = rts_hdr->tag.tag; - ucp_tag_unexp_recv(&worker->tm, rdesc, rts_hdr->tag.tag); + ucs_assert(ucp_rdesc_get_tag(rdesc) == + ucp_tag_hdr_from_rts(rts_hdr)->tag); + ucp_tag_unexp_recv(&worker->tm, rdesc, + ucp_tag_hdr_from_rts(rts_hdr)->tag); } return status; @@ -69,25 +69,21 @@ ucs_status_t ucp_tag_rndv_process_rts(ucp_worker_h worker, size_t ucp_tag_rndv_rts_pack(void *dest, void *arg) { - ucp_request_t *sreq = arg; - ucp_tag_rndv_rts_hdr_t *tag_rts_hdr = dest; + ucp_request_t *sreq = arg; + ucp_rndv_rts_hdr_t *rts_hdr = dest; - tag_rts_hdr->tag.tag = sreq->send.msg_proto.tag.tag; + ucp_tag_hdr_from_rts(rts_hdr)->tag = sreq->send.msg_proto.tag; - return ucp_rndv_rts_pack(sreq, &tag_rts_hdr->super, sizeof(*tag_rts_hdr), - UCP_RNDV_RTS_FLAG_TAG); + return ucp_rndv_rts_pack(sreq, rts_hdr, UCP_RNDV_RTS_TAG_OK); } UCS_PROFILE_FUNC(ucs_status_t, ucp_proto_progress_rndv_rts, (self), uct_pending_req_t *self) { ucp_request_t *sreq = ucs_container_of(self, ucp_request_t, send.uct); - size_t packed_rkey_size; - /* send the RTS. 
the pack_cb will pack all the necessary fields in the RTS */ - packed_rkey_size = ucp_ep_config(sreq->send.ep)->rndv.rkey_size; - return ucp_do_am_single(self, UCP_AM_ID_RNDV_RTS, ucp_tag_rndv_rts_pack, - sizeof(ucp_tag_rndv_rts_hdr_t) + packed_rkey_size); + return ucp_rndv_send_rts(sreq, ucp_tag_rndv_rts_pack, + sizeof(ucp_rndv_rts_hdr_t)); } ucs_status_t ucp_tag_send_start_rndv(ucp_request_t *sreq) @@ -95,9 +91,9 @@ ucs_status_t ucp_tag_send_start_rndv(ucp_request_t *sreq) ucp_ep_h ep = sreq->send.ep; ucs_status_t status; - ucp_trace_req(sreq, "start_rndv to %s buffer %p length %zu", + ucp_trace_req(sreq, "start_rndv to %s buffer %p length %zu mem_type:%s", ucp_ep_peer_name(ep), sreq->send.buffer, - sreq->send.length); + sreq->send.length, ucs_memory_type_names[sreq->send.mem_type]); UCS_PROFILE_REQUEST_EVENT(sreq, "start_rndv", sreq->send.length); status = ucp_ep_resolve_remote_id(ep, sreq->send.lane); @@ -105,7 +101,9 @@ ucs_status_t ucp_tag_send_start_rndv(ucp_request_t *sreq) return status; } - if (ucp_ep_is_tag_offload_enabled(ucp_ep_config(ep))) { + ucp_send_request_id_alloc(sreq); + + if (ucp_ep_config_key_has_tag_lane(&ucp_ep_config(ep)->key)) { status = ucp_tag_offload_start_rndv(sreq); } else { ucs_assert(sreq->send.lane == ucp_ep_get_am_lane(ep)); @@ -116,3 +114,45 @@ ucs_status_t ucp_tag_send_start_rndv(ucp_request_t *sreq) return status; } +static size_t ucp_tag_rndv_proto_rts_pack(void *dest, void *arg) +{ + ucp_rndv_rts_hdr_t *tag_rts = dest; + ucp_request_t *req = arg; + + tag_rts->opcode = UCP_RNDV_RTS_TAG_OK; + ucp_tag_hdr_from_rts(tag_rts)->tag = req->send.msg_proto.tag; + + return ucp_proto_rndv_rts_pack(req, tag_rts, sizeof(*tag_rts)); +} + +UCS_PROFILE_FUNC(ucs_status_t, ucp_tag_rndv_rts_progress, (self), + uct_pending_req_t *self) +{ + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + const ucp_proto_rndv_ctrl_priv_t *rpriv; + size_t max_rts_size; + ucs_status_t status; + + rpriv = req->send.proto_config->priv; + max_rts_size = sizeof(ucp_rndv_rts_hdr_t) + rpriv->packed_rkey_size; + + status = UCS_PROFILE_CALL(ucp_proto_rndv_rts_request_init, req); + if (status != UCS_OK) { + ucp_proto_request_abort(req, status); + return UCS_OK; + } + + return UCS_PROFILE_CALL(ucp_proto_am_bcopy_single_progress, req, + UCP_AM_ID_RNDV_RTS, rpriv->lane, + ucp_tag_rndv_proto_rts_pack, req, max_rts_size, + NULL); +} + +static ucp_proto_t ucp_tag_rndv_proto = { + .name = "tag/rndv", + .flags = 0, + .init = ucp_proto_rndv_rts_init, + .config_str = ucp_proto_rndv_ctrl_config_str, + .progress = ucp_tag_rndv_rts_progress +}; +UCP_PROTO_REGISTER(&ucp_tag_rndv_proto); diff --git a/src/ucp/tag/tag_rndv.h b/src/ucp/tag/tag_rndv.h index 4f238944282..bc54f5cb368 100644 --- a/src/ucp/tag/tag_rndv.h +++ b/src/ucp/tag/tag_rndv.h @@ -12,20 +12,17 @@ #include -/* - * TAG API Rendezvous RTS header - */ -typedef struct { - ucp_rndv_rts_hdr_t super; - ucp_tag_hdr_t tag; - /* packed rkeys follows */ -} UCS_S_PACKED ucp_tag_rndv_rts_hdr_t; +#define ucp_tag_hdr_from_rts(_rts) \ + ({ \ + UCS_STATIC_ASSERT(sizeof((_rts)->hdr) == sizeof(ucp_tag_hdr_t)); \ + ((ucp_tag_hdr_t*)&(_rts)->hdr); \ + }) ucs_status_t ucp_tag_send_start_rndv(ucp_request_t *req); void ucp_tag_rndv_matched(ucp_worker_h worker, ucp_request_t *req, - const ucp_tag_rndv_rts_hdr_t *rndv_rts_hdr); + const ucp_rndv_rts_hdr_t *rts_hdr, size_t hdr_length); ucs_status_t ucp_tag_rndv_process_rts(ucp_worker_h worker, ucp_rndv_rts_hdr_t *rts_hdr, @@ -33,17 +30,13 @@ ucs_status_t ucp_tag_rndv_process_rts(ucp_worker_h worker, 
size_t ucp_tag_rndv_rts_pack(void *dest, void *arg); -/* In case of RNDV, there is a tag(ucp_tag_t) right after rdesc and before - * the TAG RTS header (ucp_tag_rndv_rts_hdr_t). It is needed to unify all - * TAG API protocol headers and make search in unexpected queue fast. - */ -static UCS_F_ALWAYS_INLINE ucp_tag_rndv_rts_hdr_t* + +static UCS_F_ALWAYS_INLINE ucp_rndv_rts_hdr_t * ucp_tag_rndv_rts_from_rdesc(ucp_recv_desc_t *rdesc) { - ucs_assert(rdesc->payload_offset == - (sizeof(ucp_tag_rndv_rts_hdr_t) + sizeof(ucp_tag_t))); + ucs_assert(rdesc->payload_offset == sizeof(ucp_rndv_rts_hdr_t)); - return UCS_PTR_BYTE_OFFSET(rdesc + 1, sizeof(ucp_tag_t)); + return (ucp_rndv_rts_hdr_t*)(rdesc + 1); } #endif diff --git a/src/ucp/tag/tag_send.c b/src/ucp/tag/tag_send.c index a346df79b5c..650dde9cc05 100644 --- a/src/ucp/tag/tag_send.c +++ b/src/ucp/tag/tag_send.c @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -29,7 +29,7 @@ ucp_tag_get_rndv_threshold(const ucp_request_t *req, size_t count, switch (req->send.datatype & UCP_DATATYPE_CLASS_MASK) { case UCP_DATATYPE_IOV: if ((count > max_iov) && - ucp_ep_is_tag_offload_enabled(ucp_ep_config(req->send.ep))) { + ucp_ep_config_key_has_tag_lane(&ucp_ep_config(req->send.ep)->key)) { /* Make sure SW RNDV will be used, because tag offload does * not support multi-packet eager protocols. */ return 1; @@ -68,7 +68,7 @@ ucp_tag_send_req(ucp_request_t *req, size_t dt_count, rndv_rma_thresh, rndv_am_thresh); if (!(param->op_attr_mask & UCP_OP_ATTR_FLAG_FAST_CMPL) || - ucs_unlikely(!UCP_MEM_IS_ACCESSIBLE_FROM_CPU(req->send.mem_type))) { + ucs_unlikely(!UCP_MEM_IS_HOST(req->send.mem_type))) { zcopy_thresh = ucp_proto_get_zcopy_threshold(req, msg_config, dt_count, rndv_thresh); } else { @@ -76,34 +76,35 @@ ucp_tag_send_req(ucp_request_t *req, size_t dt_count, } ucs_trace_req("select tag request(%p) progress algorithm datatype=0x%"PRIx64 - " buffer=%p length=%zu max_short=%zd rndv_thresh=%zu " + " buffer=%p length=%zu mem_type:%s max_short=%zd rndv_thresh=%zu " "zcopy_thresh=%zu zcopy_enabled=%d", req, req->send.datatype, req->send.buffer, req->send.length, + ucs_memory_type_names[req->send.mem_type], max_short, rndv_thresh, zcopy_thresh, !(param->op_attr_mask & UCP_OP_ATTR_FLAG_FAST_CMPL)); status = ucp_request_send_start(req, max_short, zcopy_thresh, rndv_thresh, dt_count, 0, req->send.length, msg_config, proto); - if (ucs_unlikely(status != UCS_OK)) { - if (status == UCS_ERR_NO_PROGRESS) { - /* RMA/AM rendezvous */ - ucs_assert(req->send.length >= rndv_thresh); - status = ucp_tag_send_start_rndv(req); - if (status != UCS_OK) { - return UCS_STATUS_PTR(status); - } - - UCP_EP_STAT_TAG_OP(req->send.ep, RNDV); + if (ucs_likely(status == UCS_OK)) { + /* Eager send initialized successfully */ + if (req->flags & UCP_REQUEST_FLAG_SYNC) { + ucp_send_request_id_alloc(req); + UCP_EP_STAT_TAG_OP(req->send.ep, EAGER_SYNC); } else { + UCP_EP_STAT_TAG_OP(req->send.ep, EAGER); + } + } else if (status == UCS_ERR_NO_PROGRESS) { + /* RMA/AM rendezvous */ + ucs_assert(req->send.length >= rndv_thresh); + status = ucp_tag_send_start_rndv(req); + if (status != UCS_OK) { return UCS_STATUS_PTR(status); } - } - if (req->flags & UCP_REQUEST_FLAG_SYNC) { - UCP_EP_STAT_TAG_OP(req->send.ep, EAGER_SYNC); + UCP_EP_STAT_TAG_OP(req->send.ep, RNDV); } else { - UCP_EP_STAT_TAG_OP(req->send.ep, EAGER); + return UCS_STATUS_PTR(status); } 
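The `ucp_tag_send_req()` rework visible above flips the branch ordering so the common case reads first: when `ucp_request_send_start()` returns UCS_OK the eager path is taken (sync eager now also allocating a request ID for the remote ACK), while UCS_ERR_NO_PROGRESS means the length reached the rendezvous threshold and execution falls through to `ucp_tag_send_start_rndv()`. Below is a minimal sketch of that dispatch shape; the helper names and status codes are invented stand-ins, not the UCX API:

```c
#include <stdio.h>

/* Illustrative status codes standing in for ucs_status_t values */
typedef enum { ST_OK = 0, ST_NO_PROGRESS = -1, ST_FAIL = -2 } st_t;

/* Hypothetical stand-in for ucp_request_send_start(): eager setup refuses
 * lengths at/above the rendezvous threshold by returning ST_NO_PROGRESS. */
static st_t start_eager(size_t length, size_t rndv_thresh)
{
    return (length < rndv_thresh) ? ST_OK : ST_NO_PROGRESS;
}

static st_t start_rndv(size_t length)
{
    (void)length;   /* would pick RMA/AM rendezvous and send an RTS */
    return ST_OK;
}

/* Mirrors the reordered control flow: the likely eager branch comes first,
 * and only ST_NO_PROGRESS falls through to rendezvous. */
static st_t send_req(size_t length, size_t rndv_thresh, int sync)
{
    st_t status = start_eager(length, rndv_thresh);

    if (status == ST_OK) {
        if (sync) {
            /* sync eager additionally keeps a request ID for the remote ACK */
        }
        return ST_OK;
    } else if (status == ST_NO_PROGRESS) {
        return start_rndv(length);  /* length >= rndv_thresh */
    }
    return status;                  /* hard failure */
}

int main(void)
{
    printf("4KB -> %d, 1MB -> %d\n",
           send_req(4096, 8192, 0), send_req(1 << 20, 8192, 1));
    return 0;
}
```

Putting the eager branch first matches the `ucs_likely()` annotation in the real code: most tagged sends are short, so the rendezvous test stays off the hot path.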
/* @@ -123,47 +124,39 @@ ucp_tag_send_req(ucp_request_t *req, size_t dt_count, } static UCS_F_ALWAYS_INLINE void -ucp_tag_send_req_init(ucp_request_t* req, ucp_ep_h ep, const void* buffer, - uintptr_t datatype, ucs_memory_type_t memory_type, - size_t count, ucp_tag_t tag, uint32_t flags) +ucp_tag_send_req_init(ucp_request_t *req, ucp_ep_h ep, const void *buffer, + uintptr_t datatype, size_t count, ucp_tag_t tag, + uint32_t flags, const ucp_request_param_t *param) { - req->flags = flags | UCP_REQUEST_FLAG_SEND_TAG; - req->send.ep = ep; - req->send.buffer = (void*)buffer; - req->send.datatype = datatype; - req->send.msg_proto.tag.tag = tag; + req->flags = flags | UCP_REQUEST_FLAG_SEND_TAG; + req->send.ep = ep; + req->send.buffer = (void*)buffer; + req->send.datatype = datatype; + req->send.msg_proto.tag = tag; ucp_request_send_state_init(req, datatype, count); req->send.length = ucp_dt_length(req->send.datatype, count, req->send.buffer, &req->send.state.dt); - req->send.mem_type = ucp_get_memory_type(ep->worker->context, (void*)buffer, - req->send.length, memory_type); + req->send.mem_type = ucp_request_get_memory_type(ep->worker->context, + (void*)buffer, + req->send.length, param); req->send.lane = ucp_ep_config(ep)->tag.lane; req->send.pending_lane = UCP_NULL_LANE; } -static UCS_F_ALWAYS_INLINE int -ucp_tag_eager_is_inline(ucp_ep_h ep, const ucp_memtype_thresh_t *max_eager_short, - ssize_t length) -{ - return (ucs_likely(length <= max_eager_short->memtype_off) || - (length <= max_eager_short->memtype_on && - ucp_memory_type_cache_is_empty(ep->worker->context))); -} - static UCS_F_ALWAYS_INLINE ucs_status_t ucp_tag_send_inline(ucp_ep_h ep, const void *buffer, size_t length, ucp_tag_t tag) { ucs_status_t status; - if (ucp_tag_eager_is_inline(ep, &ucp_ep_config(ep)->tag.max_eager_short, - length)) { + if (ucp_proto_is_inline(ep, &ucp_ep_config(ep)->tag.max_eager_short, length)) { UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(ucp_eager_hdr_t)); UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(uint64_t)); status = uct_ep_am_short(ucp_ep_get_am_uct_ep(ep), UCP_AM_ID_EAGER_ONLY, tag, buffer, length); - } else if (ucp_tag_eager_is_inline(ep, &ucp_ep_config(ep)->tag.offload.max_eager_short, - length)) { + } else if (ucp_proto_is_inline(ep, + &ucp_ep_config(ep)->tag.offload.max_eager_short, + length)) { UCS_STATIC_ASSERT(sizeof(ucp_tag_t) == sizeof(uct_tag_t)); status = uct_ep_tag_eager_short(ucp_ep_get_tag_uct_ep(ep), tag, buffer, length); @@ -240,12 +233,13 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nbx, ucp_request_t *req; ucs_status_ptr_t ret; uintptr_t datatype; - ucs_memory_type_t memory_type; uint32_t attr_mask; ucp_worker_h worker; UCP_CONTEXT_CHECK_FEATURE_FLAGS(ep->worker->context, UCP_FEATURE_TAG, return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); + UCP_REQUEST_CHECK_PARAM(param); + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); ucs_trace_req("send_nbx buffer %p count %zu tag %"PRIx64" to %s", @@ -276,22 +270,21 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_nbx, goto out; } - worker = ep->worker; - memory_type = ucp_request_param_mem_type(param); - req = ucp_request_get_param(worker, param, - {ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); - goto out;}); + worker = ep->worker; + req = ucp_request_get_param(worker, param, { + ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); + goto out; + }); if (worker->context->config.ext.proto_enable) { - req->send.msg_proto.tag.tag = tag; + req->send.msg_proto.tag = tag; ret = ucp_proto_request_send_op(ep, &ucp_ep_config(ep)->proto_select, 
UCP_WORKER_CFG_INDEX_NULL, req, UCP_OP_ID_TAG_SEND, buffer, count, datatype, contig_length, param); } else { - ucp_tag_send_req_init(req, ep, buffer, datatype, memory_type, count, - tag, 0); + ucp_tag_send_req_init(req, ep, buffer, datatype, count, tag, 0, param); ret = ucp_tag_send_req(req, count, &ucp_ep_config(ep)->tag.eager, param, ucp_ep_config(ep)->tag.proto); } @@ -305,22 +298,23 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_sync_nbx, ucp_ep_h ep, const void *buffer, size_t count, ucp_tag_t tag, const ucp_request_param_t *param) { + ucp_worker_h worker = ep->worker; ucs_status_t status; ucp_request_t *req; ucs_status_ptr_t ret; uintptr_t datatype; - ucs_memory_type_t memory_type; - UCP_CONTEXT_CHECK_FEATURE_FLAGS(ep->worker->context, UCP_FEATURE_TAG, - return UCS_STATUS_PTR(UCS_ERR_INVALID_PARAM)); - UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(ep->worker); + UCP_CONTEXT_CHECK_FEATURE_FLAGS(worker->context, UCP_FEATURE_TAG, + return UCS_STATUS_PTR( + UCS_ERR_INVALID_PARAM)); + UCP_REQUEST_CHECK_PARAM(param); + + UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker); ucs_trace_req("send_sync_nbx buffer %p count %zu tag %"PRIx64" to %s", buffer, count, tag, ucp_ep_peer_name(ep)); - datatype = ucp_request_param_datatype(param); - memory_type = ucp_request_param_mem_type(param); - + datatype = ucp_request_param_datatype(param); if (!ucp_ep_config_test_rndv_support(ucp_ep_config(ep))) { ret = UCS_STATUS_PTR(UCS_ERR_UNSUPPORTED); goto out; @@ -332,15 +326,26 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_tag_send_sync_nbx, goto out; } - req = ucp_request_get_param(ep->worker, param, + req = ucp_request_get_param(worker, param, {ret = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); goto out;}); - ucp_tag_send_req_init(req, ep, buffer, datatype, memory_type, count, tag, - UCP_REQUEST_FLAG_SYNC); - ret = ucp_tag_send_req(req, count, &ucp_ep_config(ep)->tag.eager, - param, ucp_ep_config(ep)->tag.sync_proto); + if (worker->context->config.ext.proto_enable) { + req->send.msg_proto.tag = tag; + ret = ucp_proto_request_send_op(ep, &ucp_ep_config(ep)->proto_select, + UCP_WORKER_CFG_INDEX_NULL, req, + UCP_OP_ID_TAG_SEND_SYNC, buffer, count, + datatype, + ucp_contig_dt_length(datatype, count), + param); + } else { + ucp_tag_send_req_init(req, ep, buffer, datatype, count, tag, + UCP_REQUEST_FLAG_SYNC, param); + ret = ucp_tag_send_req(req, count, &ucp_ep_config(ep)->tag.eager, param, + ucp_ep_config(ep)->tag.sync_proto); + } + out: - UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(ep->worker); + UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker); return ret; } diff --git a/src/ucp/wireup/address.c b/src/ucp/wireup/address.c index d61bfe2430c..c8aba24a451 100644 --- a/src/ucp/wireup/address.c +++ b/src/ucp/wireup/address.c @@ -49,10 +49,11 @@ typedef struct { size_t dev_addr_len; - uint64_t tl_bitmap; + ucp_tl_bitmap_t tl_bitmap; ucp_rsc_index_t rsc_index; ucp_rsc_index_t tl_count; unsigned num_paths; + ucs_sys_device_t sys_dev; size_t tl_addrs_size; } ucp_address_packed_device_t; @@ -68,7 +69,7 @@ typedef struct { * 2 hsb : * - amo32 * - amo64 */ -} ucp_address_packed_iface_attr_t; +} UCS_S_PACKED ucp_address_packed_iface_attr_t; /* In unified mode we pack resource index instead of iface attrs to the address, @@ -86,7 +87,7 @@ typedef struct { typedef struct { ucp_rsc_index_t rsc_index; float lat_ovh; -} ucp_address_unified_iface_attr_t; +} UCS_S_PACKED ucp_address_unified_iface_attr_t; #define UCP_ADDRESS_FLAG_ATOMIC32 UCS_BIT(30) /* 32bit atomic operations */ @@ -96,15 +97,26 @@ typedef struct { #define UCP_ADDRESS_FLAG_HAS_EP_ADDR 0x40u 
/* For iface address: Indicates that ep addr is packed right after iface addr */ -#define UCP_ADDRESS_FLAG_HAVE_PATHS 0x40u /* For device address: +#define UCP_ADDRESS_FLAG_NUM_PATHS 0x40u /* For device address: Indicates that number of paths on the device is packed right after device address, otherwise number of paths defaults to 1. */ -#define UCP_ADDRESS_FLAG_LEN_MASK (UCS_MASK(8) ^ \ - (UCP_ADDRESS_FLAG_HAS_EP_ADDR | \ - UCP_ADDRESS_FLAG_HAVE_PATHS | \ - UCP_ADDRESS_FLAG_LAST)) +#define UCP_ADDRESS_FLAG_SYS_DEVICE 0x20u /* For device address: + Indicates that system device is + packed after device address or + number of paths (if present) */ + +/* Mask for iface and endpoint address length */ +#define UCP_ADDRESS_IFACE_LEN_MASK (UCS_MASK(8) ^ \ + (UCP_ADDRESS_FLAG_HAS_EP_ADDR | \ + UCP_ADDRESS_FLAG_LAST)) + +/* Mask for device address length */ +#define UCP_ADDRESS_DEVICE_LEN_MASK (UCS_MASK(8) ^ \ + (UCP_ADDRESS_FLAG_SYS_DEVICE | \ + UCP_ADDRESS_FLAG_NUM_PATHS | \ + UCP_ADDRESS_FLAG_LAST)) #define UCP_ADDRESS_FLAG_MD_EMPTY_DEV 0x80u /* Device without TL addresses */ #define UCP_ADDRESS_FLAG_MD_ALLOC 0x40u /* MD can register */ @@ -144,12 +156,13 @@ static uint64_t ucp_worker_iface_can_connect(uct_iface_attr_t *attrs) } /* Pack a string and return a pointer to storage right after the string */ -static void* ucp_address_pack_worker_name(ucp_worker_h worker, void *dest) +static void * +ucp_address_pack_worker_address_name(ucp_worker_h worker, void *dest) { const char *s; size_t length; - s = ucp_worker_get_name(worker); + s = ucp_worker_get_address_name(worker); length = strlen(s); ucs_assert(length <= UINT8_MAX); *(uint8_t*)dest = length; @@ -158,13 +171,13 @@ static void* ucp_address_pack_worker_name(ucp_worker_h worker, void *dest) } /* Unpack a string and return pointer to next storage byte */ -static const void* -ucp_address_unpack_worker_name(const void *src, char *s) +static const void * +ucp_address_unpack_worker_address_name(const void *src, char *s) { size_t length, avail; length = *(const uint8_t*)src; - avail = ucs_min(length, UCP_WORKER_NAME_MAX - 1); + avail = ucs_min(length, UCP_WORKER_ADDRESS_NAME_MAX - 1); memcpy(s, UCS_PTR_TYPE_OFFSET(src, uint8_t), avail); s[avail] = '\0'; return UCS_PTR_TYPE_OFFSET(UCS_PTR_BYTE_OFFSET(src, length), uint8_t); @@ -180,8 +193,7 @@ ucp_address_get_device(ucp_context_h context, ucp_rsc_index_t rsc_index, for (dev = devices; dev < devices + *num_devices_p; ++dev) { if ((tl_rsc[rsc_index].md_index == tl_rsc[dev->rsc_index].md_index) && - !strcmp(tl_rsc[rsc_index].tl_rsc.dev_name, - tl_rsc[dev->rsc_index].tl_rsc.dev_name)) { + (tl_rsc[rsc_index].dev_index == tl_rsc[dev->rsc_index].dev_index)) { goto out; } } @@ -193,11 +205,13 @@ ucp_address_get_device(ucp_context_h context, ucp_rsc_index_t rsc_index, } static ucs_status_t -ucp_address_gather_devices(ucp_worker_h worker, ucp_ep_h ep, uint64_t tl_bitmap, - uint64_t flags, ucp_address_packed_device_t **devices_p, +ucp_address_gather_devices(ucp_worker_h worker, ucp_ep_h ep, + const ucp_tl_bitmap_t *tl_bitmap, uint64_t flags, + ucp_address_packed_device_t **devices_p, ucp_rsc_index_t *num_devices_p) { ucp_context_h context = worker->context; + ucp_tl_bitmap_t current_tl_bitmap = *tl_bitmap; ucp_address_packed_device_t *dev, *devices; uct_iface_attr_t *iface_attr; ucp_rsc_index_t num_devices; @@ -210,8 +224,8 @@ ucp_address_gather_devices(ucp_worker_h worker, ucp_ep_h ep, uint64_t tl_bitmap, } num_devices = 0; - tl_bitmap &= context->tl_bitmap; - ucs_for_each_bit(rsc_index, tl_bitmap) { + 
UCS_BITMAP_AND_INPLACE(&current_tl_bitmap, context->tl_bitmap); + UCS_BITMAP_FOR_EACH_BIT(current_tl_bitmap, rsc_index) { iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); if (!ucp_worker_iface_can_connect(iface_attr)) { continue; @@ -250,6 +264,12 @@ ucp_address_gather_devices(ucp_worker_h worker, ucp_ep_h ep, uint64_t tl_bitmap, dev->dev_addr_len = 0; } + if (flags & UCP_ADDRESS_PACK_FLAG_SYS_DEVICE) { + dev->sys_dev = context->tl_rscs[rsc_index].tl_rsc.sys_device; + } else { + dev->sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; + } + if (iface_attr->dev_num_paths > UINT8_MAX) { ucs_error("only up to %d paths are supported by address pack (got: %u)", UINT8_MAX, iface_attr->dev_num_paths); @@ -258,7 +278,7 @@ ucp_address_gather_devices(ucp_worker_h worker, ucp_ep_h ep, uint64_t tl_bitmap, dev->rsc_index = rsc_index; - dev->tl_bitmap |= UCS_BIT(rsc_index); + UCS_BITMAP_SET(dev->tl_bitmap, rsc_index); dev->num_paths = iface_attr->dev_num_paths; } @@ -284,7 +304,7 @@ static size_t ucp_address_packed_size(ucp_worker_h worker, if ((worker->context->config.ext.address_debug_info) && (pack_flags & UCP_ADDRESS_PACK_FLAG_WORKER_NAME)) { - size += strlen(ucp_worker_get_name(worker)) + 1; + size += strlen(ucp_worker_get_address_name(worker)) + 1; } if (num_devices == 0) { @@ -297,7 +317,10 @@ static size_t ucp_address_packed_size(ucp_worker_h worker, size += dev->dev_addr_len; /* device address */ } if (dev->num_paths > 1) { - size += 1; /* number of paths */ + size += 1; /* number of paths */ + } + if (dev->sys_dev != UCS_SYS_DEVICE_ID_UNKNOWN) { + size += 1; /* system device */ } size += dev->tl_addrs_size; /* transport addresses */ } @@ -452,10 +475,10 @@ ucp_address_unpack_iface_attr(ucp_worker_t *worker, if (ucp_worker_is_unified_mode(worker)) { /* Address contains resources index and iface latency overhead * (not all iface attrs).
*/ - unified = ptr; - rsc_idx = unified->rsc_index & UCP_ADDRESS_FLAG_LEN_MASK; - iface_attr->lat_ovh = fabs(unified->lat_ovh); - if (!(worker->context->tl_bitmap & UCS_BIT(rsc_idx))) { + unified = ptr; + rsc_idx = unified->rsc_index & UCP_ADDRESS_IFACE_LEN_MASK; + iface_attr->lat_ovh = fabs(unified->lat_ovh); + if (!UCS_BITMAP_GET(worker->context->tl_bitmap, rsc_idx)) { if (!(unpack_flags & UCP_ADDRESS_PACK_FLAG_NO_TRACE)) { ucs_error("failed to unpack address, resource[%d] is not valid", rsc_idx); @@ -505,7 +528,7 @@ ucp_address_unpack_iface_attr(ucp_worker_t *worker, iface_attr->atomic.atomic32.op_flags |= UCP_ATOMIC_OP_MASK; iface_attr->atomic.atomic32.fop_flags |= UCP_ATOMIC_FOP_MASK; } - + /* Unpack iface 64-bit atomic operations */ if (packed->prio_cap_flags & UCP_ADDRESS_FLAG_ATOMIC64) { iface_attr->atomic.atomic64.op_flags |= UCP_ATOMIC_OP_MASK; @@ -541,23 +564,24 @@ ucp_address_iface_flags_ptr(ucp_worker_h worker, void *attr_ptr, int attr_len) return UCS_PTR_BYTE_OFFSET(attr_ptr, attr_len); } -static void* -ucp_address_pack_length(ucp_worker_h worker, void *ptr, size_t addr_length) +static void *ucp_address_pack_iface_length(ucp_worker_h worker, void *ptr, + size_t addr_length) { if (ucp_worker_is_unified_mode(worker)) { return ptr; } - ucs_assertv(addr_length <= UCP_ADDRESS_FLAG_LEN_MASK, "addr_length=%zu", + ucs_assertv(addr_length <= UCP_ADDRESS_IFACE_LEN_MASK, "addr_length=%zu", addr_length); *(uint8_t*)ptr = addr_length; return UCS_PTR_TYPE_OFFSET(ptr, uint8_t); } -static const void* -ucp_address_unpack_length(ucp_worker_h worker, const void* flags_ptr, const void *ptr, - size_t *addr_length, int is_ep_addr, int *is_last_iface) +static const void * +ucp_address_unpack_iface_length(ucp_worker_h worker, const void *flags_ptr, + const void *ptr, size_t *addr_length, + int is_ep_addr, int *is_last_iface) { ucp_rsc_index_t rsc_index; uct_iface_attr_t *attr; @@ -573,7 +597,7 @@ ucp_address_unpack_length(ucp_worker_h worker, const void* flags_ptr, const void * - iface and ep addr lengths are not packed, need to take them from * local iface attrs */ unified = flags_ptr; - rsc_index = unified->rsc_index & UCP_ADDRESS_FLAG_LEN_MASK; + rsc_index = unified->rsc_index & UCP_ADDRESS_IFACE_LEN_MASK; attr = ucp_worker_iface_get_attr(worker, rsc_index); ucs_assert(&unified->rsc_index == flags_ptr); @@ -591,17 +615,16 @@ ucp_address_unpack_length(ucp_worker_h worker, const void* flags_ptr, const void *is_last_iface = *(uint8_t*)ptr & UCP_ADDRESS_FLAG_LAST; } - *addr_length = *(uint8_t*)ptr & UCP_ADDRESS_FLAG_LEN_MASK; + *addr_length = *(uint8_t*)ptr & UCP_ADDRESS_IFACE_LEN_MASK; return UCS_PTR_TYPE_OFFSET(ptr, uint8_t); } -static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, - void *buffer, size_t size, - uint64_t tl_bitmap, unsigned pack_flags, - const ucp_lane_index_t *lanes2remote, - const ucp_address_packed_device_t *devices, - ucp_rsc_index_t num_devices) +static ucs_status_t +ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, void *buffer, size_t size, + unsigned pack_flags, const ucp_lane_index_t *lanes2remote, + const ucp_address_packed_device_t *devices, + ucp_rsc_index_t num_devices) { ucp_context_h context = worker->context; uint64_t md_flags_pack_mask = (UCT_MD_FLAG_REG | UCT_MD_FLAG_ALLOC); @@ -612,7 +635,7 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, ucp_worker_iface_t *wiface; ucp_rsc_index_t rsc_index; ucp_lane_index_t lane, remote_lane; - uint64_t dev_tl_bitmap; + ucp_tl_bitmap_t dev_tl_bitmap; unsigned num_ep_addrs; 
ucs_status_t status; size_t iface_addr_len; @@ -643,7 +666,7 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, *address_header_p |= UCP_ADDRESS_HEADER_FLAG_DEBUG_INFO; if (pack_flags & UCP_ADDRESS_PACK_FLAG_WORKER_NAME) { - ptr = ucp_address_pack_worker_name(worker, ptr); + ptr = ucp_address_pack_worker_address_name(worker, ptr); } } @@ -654,8 +677,8 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, } for (dev = devices; dev < (devices + num_devices); ++dev) { - - dev_tl_bitmap = context->tl_bitmap & dev->tl_bitmap; + dev_tl_bitmap = context->tl_bitmap; + UCS_BITMAP_AND_INPLACE(&dev_tl_bitmap, dev->tl_bitmap); /* MD index */ md_index = context->tl_rscs[dev->rsc_index].md_index; @@ -663,30 +686,47 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, ucs_assertv_always(md_index <= UCP_ADDRESS_FLAG_MD_MASK, "md_index=%d", md_index); - *(uint8_t*)ptr = md_index | - ((dev_tl_bitmap == 0) ? UCP_ADDRESS_FLAG_MD_EMPTY_DEV : 0) | - ((md_flags & UCT_MD_FLAG_ALLOC) ? UCP_ADDRESS_FLAG_MD_ALLOC : 0) | - ((md_flags & UCT_MD_FLAG_REG) ? UCP_ADDRESS_FLAG_MD_REG : 0); + *(uint8_t*)ptr = md_index; + + if (UCS_BITMAP_IS_ZERO_INPLACE(&dev_tl_bitmap)) { + *(uint8_t*)ptr |= UCP_ADDRESS_FLAG_MD_EMPTY_DEV; + } + + if (md_flags & UCT_MD_FLAG_ALLOC) { + *(uint8_t*)ptr |= UCP_ADDRESS_FLAG_MD_ALLOC; + } + + if (md_flags & UCT_MD_FLAG_REG) { + *(uint8_t*)ptr |= UCP_ADDRESS_FLAG_MD_REG; + } + ptr = UCS_PTR_TYPE_OFFSET(ptr, md_index); /* Device address length */ *(uint8_t*)ptr = (dev == (devices + num_devices - 1)) ? UCP_ADDRESS_FLAG_LAST : 0; if (pack_flags & UCP_ADDRESS_PACK_FLAG_DEVICE_ADDR) { - ucs_assert(dev->dev_addr_len <= UCP_ADDRESS_FLAG_LEN_MASK); + ucs_assert(dev->dev_addr_len <= UCP_ADDRESS_DEVICE_LEN_MASK); *(uint8_t*)ptr |= dev->dev_addr_len; } + flags_ptr = ptr; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); /* Device number of paths flag and value */ ucs_assert(dev->num_paths >= 1); - ucs_assert(dev->num_paths <= UINT8_MAX); - if (dev->num_paths > 1) { - *(uint8_t*)ptr |= UCP_ADDRESS_FLAG_HAVE_PATHS; - ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); - *(uint8_t*)ptr = dev->num_paths; + ucs_assert(dev->num_paths <= UINT8_MAX); + *(uint8_t*)flags_ptr |= UCP_ADDRESS_FLAG_NUM_PATHS; + *(uint8_t*)ptr = dev->num_paths; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); + } + + /* System device */ + if (dev->sys_dev != UCS_SYS_DEVICE_ID_UNKNOWN) { + *(uint8_t*)flags_ptr |= UCP_ADDRESS_FLAG_SYS_DEVICE; + *(uint8_t*)ptr = dev->sys_dev; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); } - ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); /* Device address */ if (pack_flags & UCP_ADDRESS_PACK_FLAG_DEVICE_ADDR) { @@ -702,8 +742,7 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, } flags_ptr = NULL; - ucs_for_each_bit(rsc_index, dev_tl_bitmap) { - + UCS_BITMAP_FOR_EACH_BIT(dev_tl_bitmap, rsc_index) { wiface = ucp_worker_iface(worker, rsc_index); iface_attr = &wiface->attr; @@ -717,7 +756,7 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, context->tl_rscs[rsc_index].tl_name_csum); /* Transport information */ - enable_amo = worker->atomic_tls & UCS_BIT(rsc_index); + enable_amo = UCS_BITMAP_GET(worker->atomic_tls, rsc_index); attr_len = ucp_address_pack_iface_attr(worker, ptr, rsc_index, iface_attr, pack_flags, enable_amo); @@ -737,7 +776,7 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, ptr = UCS_PTR_BYTE_OFFSET(ptr, attr_len); /* Pack iface address */ - ptr = ucp_address_pack_length(worker, 
ptr, iface_addr_len); + ptr = ucp_address_pack_iface_length(worker, ptr, iface_addr_len); if (pack_flags & UCP_ADDRESS_PACK_FLAG_IFACE_ADDR) { status = uct_iface_get_address(wiface->iface, (uct_iface_addr_t*)ptr); @@ -760,12 +799,14 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, ep_lane_ptr = NULL; ucs_for_each_bit(lane, ucp_ep_config(ep)->p2p_lanes) { + ucs_assert(lane < UCP_MAX_LANES); if (ucp_ep_get_rsc_index(ep, lane) != rsc_index) { continue; } /* pack ep address length and save pointer to flags */ - ptr = ucp_address_pack_length(worker, ptr, ep_addr_len); + ptr = ucp_address_pack_iface_length(worker, ptr, + ep_addr_len); /* pack ep address */ status = uct_ep_get_address(ep->uct_eps[lane], ptr); @@ -781,7 +822,7 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, */ remote_lane = (lanes2remote == NULL) ? lane : lanes2remote[lane]; - ucs_assertv(remote_lane <= UCP_ADDRESS_FLAG_LEN_MASK, + ucs_assertv(remote_lane <= UCP_ADDRESS_IFACE_LEN_MASK, "remote_lane=%d", remote_lane); ep_lane_ptr = ptr; *ep_lane_ptr = remote_lane; @@ -809,18 +850,21 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, !(*(uint8_t*)flags_ptr & UCP_ADDRESS_FLAG_HAS_EP_ADDR)); if (!(pack_flags & UCP_ADDRESS_PACK_FLAG_NO_TRACE)) { - ucs_trace("pack addr[%d] : "UCT_TL_RESOURCE_DESC_FMT" " - "eps %u md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64 - " bw %e + %e/n ovh %e lat_ovh %e dev_priority %d a32 " - "0x%"PRIx64"/0x%"PRIx64" a64 0x%"PRIx64"/0x%"PRIx64, + ucs_trace("pack addr[%d] : " UCT_TL_RESOURCE_DESC_FMT + " sysdev %d paths %d eps %u md_flags 0x%" PRIx64 + " tl_flags 0x%" PRIx64 " bw %.2f+%.2f/nMBs" + " ovh %.0fns lat_ovh %.0fns dev_priority %d" + " a32 0x%" PRIx64 "/0x%" PRIx64 " a64 0x%" PRIx64 + "/0x%" PRIx64, addr_index, - UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[rsc_index].tl_rsc), - num_ep_addrs, md_flags, iface_attr->cap.flags, - iface_attr->bandwidth.dedicated, - iface_attr->bandwidth.shared, - iface_attr->overhead, - iface_attr->latency.c, - iface_attr->priority, + UCT_TL_RESOURCE_DESC_ARG( + &context->tl_rscs[rsc_index].tl_rsc), + dev->sys_dev, dev->num_paths, num_ep_addrs, md_flags, + iface_attr->cap.flags, + iface_attr->bandwidth.dedicated / UCS_MBYTE, + iface_attr->bandwidth.shared / UCS_MBYTE, + iface_attr->overhead * 1e9, + iface_attr->latency.c * 1e9, iface_attr->priority, iface_attr->cap.atomic32.op_flags, iface_attr->cap.atomic32.fop_flags, iface_attr->cap.atomic64.op_flags, @@ -835,11 +879,11 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, * during the above loop So, set the LAST flag for the flags_ptr * from the last iteration */ if (flags_ptr != NULL) { - ucs_assert(dev_tl_bitmap != 0); + ucs_assert(!UCS_BITMAP_IS_ZERO_INPLACE(&dev_tl_bitmap)); *(uint8_t*)flags_ptr |= UCP_ADDRESS_FLAG_LAST; } else { /* cppcheck-suppress internalAstError */ - ucs_assert(dev_tl_bitmap == 0); + ucs_assert(UCS_BITMAP_IS_ZERO_INPLACE(&dev_tl_bitmap)); } } @@ -851,7 +895,8 @@ static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep, } ucs_status_t ucp_address_pack(ucp_worker_h worker, ucp_ep_h ep, - uint64_t tl_bitmap, unsigned pack_flags, + const ucp_tl_bitmap_t *tl_bitmap, + unsigned pack_flags, const ucp_lane_index_t *lanes2remote, size_t *size_p, void **buffer_p) { @@ -885,7 +930,7 @@ ucs_status_t ucp_address_pack(ucp_worker_h worker, ucp_ep_h ep, memset(buffer, 0, size); /* Pack the address */ - status = ucp_address_do_pack(worker, ep, buffer, size, tl_bitmap, pack_flags, + status = 
ucp_address_do_pack(worker, ep, buffer, size, pack_flags, lanes2remote, devices, num_devices); if (status != UCS_OK) { ucs_free(buffer); @@ -914,6 +959,7 @@ ucs_status_t ucp_address_unpack(ucp_worker_t *worker, const void *buffer, int last_dev, last_tl, last_ep_addr; const uct_device_addr_t *dev_addr; ucp_rsc_index_t dev_index; + ucs_sys_device_t sys_dev; ucp_md_index_t md_index; unsigned dev_num_paths; ucs_status_t status; @@ -952,7 +998,8 @@ ucs_status_t ucp_address_unpack(ucp_worker_t *worker, const void *buffer, if ((address_header & UCP_ADDRESS_HEADER_FLAG_DEBUG_INFO) && (unpack_flags & UCP_ADDRESS_PACK_FLAG_WORKER_NAME)) { - ptr = ucp_address_unpack_worker_name(ptr, unpacked_address->name); + ptr = ucp_address_unpack_worker_address_name(ptr, + unpacked_address->name); } else { ucs_strncpy_safe(unpacked_address->name, UCP_WIREUP_EMPTY_PEER_NAME, sizeof(unpacked_address->name)); @@ -985,15 +1032,22 @@ ucs_status_t ucp_address_unpack(ucp_worker_t *worker, const void *buffer, ptr = UCS_PTR_TYPE_OFFSET(ptr, md_byte); /* device address length */ - dev_addr_len = (*(uint8_t*)ptr) & UCP_ADDRESS_FLAG_LEN_MASK; - last_dev = (*(uint8_t*)ptr) & UCP_ADDRESS_FLAG_LAST; - if ((*(uint8_t*)ptr) & UCP_ADDRESS_FLAG_HAVE_PATHS) { - ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); + flags_ptr = ptr; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); + dev_addr_len = (*(uint8_t*)flags_ptr) & UCP_ADDRESS_DEVICE_LEN_MASK; + last_dev = (*(uint8_t*)flags_ptr) & UCP_ADDRESS_FLAG_LAST; + if ((*(uint8_t*)flags_ptr) & UCP_ADDRESS_FLAG_NUM_PATHS) { dev_num_paths = *(uint8_t*)ptr; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); } else { dev_num_paths = 1; } - ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); + if ((*(uint8_t*)flags_ptr) & UCP_ADDRESS_FLAG_SYS_DEVICE) { + sys_dev = *(uint8_t*)ptr; + ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); + } else { + sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; + } dev_addr = ptr; ptr = UCS_PTR_BYTE_OFFSET(ptr, dev_addr_len); @@ -1003,7 +1057,7 @@ ucs_status_t ucp_address_unpack(ucp_worker_t *worker, const void *buffer, if (address >= &address_list[UCP_MAX_RESOURCES]) { if (!(unpack_flags & UCP_ADDRESS_PACK_FLAG_NO_TRACE)) { ucs_error("failed to parse address: number of addresses" - "exceeds %d", UCP_MAX_RESOURCES); + " exceeds %d", UCP_MAX_RESOURCES); } goto err_free; } @@ -1014,6 +1068,7 @@ ucs_status_t ucp_address_unpack(ucp_worker_t *worker, const void *buffer, address->dev_addr = (dev_addr_len > 0) ? dev_addr : NULL; address->md_index = md_index; + address->sys_dev = sys_dev; address->dev_index = dev_index; address->md_flags = md_flags; address->dev_num_paths = dev_num_paths; @@ -1026,8 +1081,8 @@ ucs_status_t ucp_address_unpack(ucp_worker_t *worker, const void *buffer, flags_ptr = ucp_address_iface_flags_ptr(worker, (void*)ptr, attr_len); ptr = UCS_PTR_BYTE_OFFSET(ptr, attr_len); - ptr = ucp_address_unpack_length(worker, flags_ptr, ptr, - &iface_addr_len, 0, &last_tl); + ptr = ucp_address_unpack_iface_length(worker, flags_ptr, ptr, + &iface_addr_len, 0, &last_tl); address->iface_addr = (iface_addr_len > 0) ? 
ptr : NULL; address->num_ep_addrs = 0; ptr = UCS_PTR_BYTE_OFFSET(ptr, iface_addr_len); @@ -1042,13 +1097,14 @@ ucs_status_t ucp_address_unpack(ucp_worker_t *worker, const void *buffer, goto err_free; } + ptr = ucp_address_unpack_iface_length(worker, flags_ptr, ptr, + &ep_addr_len, 1, NULL); + ep_addr = &address->ep_addrs[address->num_ep_addrs++]; - ptr = ucp_address_unpack_length(worker, flags_ptr, ptr, - &ep_addr_len, 1, NULL); ep_addr->addr = ptr; ptr = UCS_PTR_BYTE_OFFSET(ptr, ep_addr_len); - ep_addr->lane = *(uint8_t*)ptr & UCP_ADDRESS_FLAG_LEN_MASK; + ep_addr->lane = *(uint8_t*)ptr & UCP_ADDRESS_IFACE_LEN_MASK; last_ep_addr = *(uint8_t*)ptr & UCP_ADDRESS_FLAG_LAST; if (!(unpack_flags & UCP_ADDRESS_PACK_FLAG_NO_TRACE)) { @@ -1062,17 +1118,20 @@ ucs_status_t ucp_address_unpack(ucp_worker_t *worker, const void *buffer, } if (!(unpack_flags & UCP_ADDRESS_PACK_FLAG_NO_TRACE)) { - ucs_trace("unpack addr[%d] : eps %u md_flags 0x%"PRIx64 - " tl_iface_flags 0x%"PRIx64" tl_event_flags 0x%"PRIx64 - " bw %e + %e/n ovh %e lat_ovh %e dev_priority %d a32 " - "0x%"PRIx64"/0x%"PRIx64" a64 0x%"PRIx64"/0x%"PRIx64, - (int)(address - address_list), address->num_ep_addrs, + ucs_trace("unpack addr[%d] : sysdev %d paths %d eps %u" + " md_flags 0x%" PRIx64 " tl_iface_flags 0x%" PRIx64 + " tl_event_flags 0x%" PRIx64 " bw %.2f+%.2f/nMBs" + " ovh %.0fns lat_ovh %.0fns dev_priority %d" + " a32 0x%" PRIx64 "/0x%" PRIx64 " a64 0x%" PRIx64 + "/0x%" PRIx64, + (int)(address - address_list), address->sys_dev, + address->dev_num_paths, address->num_ep_addrs, address->md_flags, address->iface_attr.cap_flags, address->iface_attr.event_flags, - address->iface_attr.bandwidth.dedicated, - address->iface_attr.bandwidth.shared, - address->iface_attr.overhead, - address->iface_attr.lat_ovh, + address->iface_attr.bandwidth.dedicated / UCS_MBYTE, + address->iface_attr.bandwidth.shared / UCS_MBYTE, + address->iface_attr.overhead * 1e9, + address->iface_attr.lat_ovh * 1e9, address->iface_attr.priority, address->iface_attr.atomic.atomic32.op_flags, address->iface_attr.atomic.atomic32.fop_flags, diff --git a/src/ucp/wireup/address.h b/src/ucp/wireup/address.h index 09d7de908f2..b18f3ef1343 100644 --- a/src/ucp/wireup/address.h +++ b/src/ucp/wireup/address.h @@ -7,8 +7,6 @@ #ifndef UCP_ADDRESS_H_ #define UCP_ADDRESS_H_ -#include "wireup.h" - #include #include #include @@ -41,12 +39,26 @@ enum { enum { - UCP_ADDRESS_PACK_FLAG_WORKER_UUID = UCS_BIT(0), /* Add worker UUID */ - UCP_ADDRESS_PACK_FLAG_WORKER_NAME = UCS_BIT(1), /* Pack worker name */ - UCP_ADDRESS_PACK_FLAG_DEVICE_ADDR = UCS_BIT(2), /* Pack device addresses */ - UCP_ADDRESS_PACK_FLAG_IFACE_ADDR = UCS_BIT(3), /* Pack interface addresses */ - UCP_ADDRESS_PACK_FLAG_EP_ADDR = UCS_BIT(4), /* Pack endpoint addresses */ - UCP_ADDRESS_PACK_FLAG_TL_RSC_IDX = UCS_BIT(5), /* Pack TL resource index */ + /* Add worker UUID */ + UCP_ADDRESS_PACK_FLAG_WORKER_UUID = UCS_BIT(0), + + /* Pack worker name */ + UCP_ADDRESS_PACK_FLAG_WORKER_NAME = UCS_BIT(1), + + /* Pack device addresses */ + UCP_ADDRESS_PACK_FLAG_DEVICE_ADDR = UCS_BIT(2), + + /* Pack interface addresses */ + UCP_ADDRESS_PACK_FLAG_IFACE_ADDR = UCS_BIT(3), + + /* Pack endpoint addresses */ + UCP_ADDRESS_PACK_FLAG_EP_ADDR = UCS_BIT(4), + + /* Pack TL resource index */ + UCP_ADDRESS_PACK_FLAG_TL_RSC_IDX = UCS_BIT(5), + + /* Pack system device id */ + UCP_ADDRESS_PACK_FLAG_SYS_DEVICE = UCS_BIT(6), UCP_ADDRESS_PACK_FLAG_LAST, @@ -54,16 +66,14 @@ enum { * so UCP_ADDRESS_PACK_FLAG_LAST<<1 is the next bit plus 2. 
If we subtract 3 * we get the next bit minus 1. */ - UCP_ADDRESS_PACK_FLAGS_ALL = (UCP_ADDRESS_PACK_FLAG_LAST << 1) - 3, - - UCP_ADDRESS_PACK_FLAGS_WORKER_DEFAULT = UCP_ADDRESS_PACK_FLAGS_ALL & - ~UCP_ADDRESS_PACK_FLAG_TL_RSC_IDX, + UCP_ADDRESS_PACK_FLAGS_ALL = (UCP_ADDRESS_PACK_FLAG_LAST << 1) - 3, - UCP_ADDRESS_PACK_FLAGS_CM_DEFAULT = UCP_ADDRESS_PACK_FLAG_IFACE_ADDR | - UCP_ADDRESS_PACK_FLAG_EP_ADDR | - UCP_ADDRESS_PACK_FLAG_TL_RSC_IDX, + /* Default packing flags for client-server protocol */ + UCP_ADDRESS_PACK_FLAGS_CM_DEFAULT = UCP_ADDRESS_PACK_FLAG_IFACE_ADDR | + UCP_ADDRESS_PACK_FLAG_EP_ADDR, - UCP_ADDRESS_PACK_FLAG_NO_TRACE = UCS_BIT(16) /* Suppress debug tracing */ + /* Suppress debug tracing */ + UCP_ADDRESS_PACK_FLAG_NO_TRACE = UCS_BIT(16) }; @@ -99,6 +109,7 @@ struct ucp_address_entry { unsigned dev_num_paths; /* Number of paths on the device */ uint16_t tl_name_csum; /* Checksum of transport name */ ucp_md_index_t md_index; /* Memory domain index */ + ucs_sys_device_t sys_dev; /* System device id */ ucp_rsc_index_t dev_index; /* Device index */ }; @@ -107,10 +118,11 @@ struct ucp_address_entry { * Unpacked remote address */ struct ucp_unpacked_address { - uint64_t uuid; /* Remote worker UUID */ - char name[UCP_WORKER_NAME_MAX]; /* Remote worker name */ - unsigned address_count; /* Length of address list */ - ucp_address_entry_t *address_list; /* Pointer to address list */ + uint64_t uuid; /* Remote worker UUID */ + /* Remote worker address name */ + char name[UCP_WORKER_ADDRESS_NAME_MAX]; + unsigned address_count; /* Length of address list */ + ucp_address_entry_t *address_list; /* Pointer to address list */ }; @@ -149,7 +161,8 @@ struct ucp_unpacked_address { * released by ucs_free(). */ ucs_status_t ucp_address_pack(ucp_worker_h worker, ucp_ep_h ep, - uint64_t tl_bitmap, unsigned pack_flags, + const ucp_tl_bitmap_t *tl_bitmap, + unsigned pack_flags, const ucp_lane_index_t *lanes2remote, size_t *size_p, void **buffer_p); diff --git a/src/ucp/wireup/ep_match.c b/src/ucp/wireup/ep_match.c index 8b01b76abbe..ddd906aa5fd 100644 --- a/src/ucp/wireup/ep_match.c +++ b/src/ucp/wireup/ep_match.c @@ -75,11 +75,10 @@ void ucp_ep_match_insert(ucp_worker_h worker, ucp_ep_h ep, uint64_t dest_uuid, !(ep->flags & UCP_EP_FLAG_REMOTE_ID)); /* NOTE: protect union */ ucs_assert(!(ep->flags & (UCP_EP_FLAG_ON_MATCH_CTX | - UCP_EP_FLAG_FLUSH_STATE_VALID | - UCP_EP_FLAG_LISTENER))); + UCP_EP_FLAG_FLUSH_STATE_VALID))); /* EP matching is not used in CM flow */ ucs_assert(!ucp_ep_has_cm_lane(ep)); - ep->flags |= UCP_EP_FLAG_ON_MATCH_CTX; + ucp_ep_update_flags(ep, UCP_EP_FLAG_ON_MATCH_CTX, 0); ucp_ep_ext_gen(ep)->ep_match.dest_uuid = dest_uuid; ucs_conn_match_insert(&worker->conn_match_ctx, &dest_uuid, @@ -115,7 +114,8 @@ ucp_ep_h ucp_ep_match_retrieve(ucp_worker_h worker, uint64_t dest_uuid, ucs_assertv(ucs_test_all_flags(ep->flags, exp_ep_flags), "ep=%p flags=0x%x exp_flags=0x%x", ep, ep->flags, exp_ep_flags); - ep->flags &= ~UCP_EP_FLAG_ON_MATCH_CTX; + ucp_ep_update_flags(ep, 0, UCP_EP_FLAG_ON_MATCH_CTX); + return ep; } @@ -131,5 +131,5 @@ void ucp_ep_match_remove_ep(ucp_worker_h worker, ucp_ep_h ep) UCS_CONN_MATCH_QUEUE_UNEXP : UCS_CONN_MATCH_QUEUE_EXP); - ep->flags &= ~UCP_EP_FLAG_ON_MATCH_CTX; + ucp_ep_update_flags(ep, 0, UCP_EP_FLAG_ON_MATCH_CTX); } diff --git a/src/ucp/wireup/select.c b/src/ucp/wireup/select.c index 70f0c6f70c8..ae0d758a3a8 100644 --- a/src/ucp/wireup/select.c +++ b/src/ucp/wireup/select.c @@ -44,13 +44,13 @@ typedef struct ucp_wireup_atomic_flag { typedef struct { - 
ucp_rsc_index_t rsc_index; - ucp_rsc_index_t dst_rsc_index; - unsigned addr_index; - unsigned path_index; - ucp_md_index_t dst_md_index; - ucp_lane_type_mask_t lane_types; - double score[UCP_LANE_TYPE_LAST]; + ucp_rsc_index_t rsc_index; + unsigned addr_index; + unsigned path_index; + ucp_md_index_t dst_md_index; + ucs_sys_device_t dst_sys_dev; + ucp_lane_type_mask_t lane_types; + double score[UCP_LANE_TYPE_LAST]; } ucp_wireup_lane_desc_t; @@ -70,7 +70,7 @@ typedef struct { typedef struct { ucp_ep_h ep; /* UCP Endpoint */ unsigned ep_init_flags; /* Endpoint init flags */ - uint64_t tl_bitmap; /* TLs bitmap which can be selected */ + ucp_tl_bitmap_t tl_bitmap; /* TLs bitmap which can be selected */ const ucp_unpacked_address_t *address; /* Remote addresses */ int allow_am; /* Shows whether emulation over AM * is allowed or not for RMA/AMO */ @@ -115,7 +115,9 @@ static const char *ucp_wireup_iface_flags[] = { [ucs_ilog2(UCT_IFACE_FLAG_TAG_EAGER_SHORT)] = "tag eager short", [ucs_ilog2(UCT_IFACE_FLAG_TAG_EAGER_BCOPY)] = "tag eager bcopy", [ucs_ilog2(UCT_IFACE_FLAG_TAG_EAGER_ZCOPY)] = "tag eager zcopy", - [ucs_ilog2(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)] = "tag rndv zcopy" + [ucs_ilog2(UCT_IFACE_FLAG_TAG_RNDV_ZCOPY)] = "tag rndv zcopy", + [ucs_ilog2(UCT_IFACE_FLAG_EP_CHECK)] = "ep check", + [ucs_ilog2(UCT_IFACE_FLAG_EP_KEEPALIVE)] = "ep keepalive" }; static const char *ucp_wireup_event_flags[] = { @@ -130,7 +132,7 @@ static ucp_wireup_atomic_flag_t ucp_wireup_atomic_desc[] = { [UCT_ATOMIC_OP_OR] = {.name = "or", .fetch = "fetch-"}, [UCT_ATOMIC_OP_XOR] = {.name = "xor", .fetch = "fetch-"}, [UCT_ATOMIC_OP_SWAP] = {.name = "swap", .fetch = ""}, - [UCT_ATOMIC_OP_CSWAP] = {.name = "cscap", .fetch = ""} + [UCT_ATOMIC_OP_CSWAP] = {.name = "cswap", .fetch = ""} }; @@ -225,6 +227,36 @@ static int ucp_wireup_check_amo_flags(const uct_tl_resource_desc_t *resource, return 0; } +static int +ucp_wireup_check_keepalive(const ucp_wireup_select_params_t *select_params, + const uct_tl_resource_desc_t *resource, + uint64_t flags, uint64_t required_flags, + const char *title, const char **flag_descs, + char *reason, size_t max) +{ + ucp_worker_h worker = select_params->ep->worker; + char title_keepalive[128]; + char title_ep_check[128]; + + ucs_snprintf_safe(title_keepalive, sizeof(title_keepalive), + "%s with keepalive", title); + ucs_snprintf_safe(title_ep_check, sizeof(title_ep_check), + "%s with ep_check", title); + return /* if error handling and keepalive were requested, UCT iface has to + * support peer failure (i.e. UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE) + * and either built-in keepalive (i.e. UCT_IFACE_FLAG_EP_KEEPALIVE) + * or EP checking (i.e. 
UCT_IFACE_FLAG_EP_CHECK) */ + !ucp_worker_keepalive_is_enabled(worker) || + !(select_params->ep_init_flags & + UCP_EP_INIT_ERR_MODE_PEER_FAILURE) || + ucp_wireup_check_flags(resource, flags, UCT_IFACE_FLAG_EP_KEEPALIVE, + title_keepalive, ucp_wireup_iface_flags, + reason, max) || + ucp_wireup_check_flags(resource, flags, UCT_IFACE_FLAG_EP_CHECK, + title_ep_check, ucp_wireup_iface_flags, + reason, max); +} + static void ucp_wireup_init_select_info(double score, unsigned addr_index, ucp_rsc_index_t rsc_index, @@ -245,14 +277,13 @@ ucp_wireup_init_select_info(double score, unsigned addr_index, /** * Select a local and remote transport */ -static UCS_F_NOINLINE ucs_status_t -ucp_wireup_select_transport(const ucp_wireup_select_params_t *select_params, - const ucp_wireup_criteria_t *criteria, - uint64_t tl_bitmap, uint64_t remote_md_map, - uint64_t local_dev_bitmap, - uint64_t remote_dev_bitmap, - int show_error, - ucp_wireup_select_info_t *select_info) +static UCS_F_NOINLINE ucs_status_t ucp_wireup_select_transport( + const ucp_wireup_select_context_t *select_ctx, + const ucp_wireup_select_params_t *select_params, + const ucp_wireup_criteria_t *criteria, ucp_tl_bitmap_t tl_bitmap, + uint64_t remote_md_map, uint64_t local_dev_bitmap, + uint64_t remote_dev_bitmap, int show_error, + ucp_wireup_select_info_t *select_info) { const ucp_unpacked_address_t *address = select_params->address; ucp_ep_h ep = select_params->ep; @@ -260,15 +291,18 @@ ucp_wireup_select_transport(const ucp_wireup_select_params_t *select_params, ucp_context_h context = worker->context; ucp_wireup_select_info_t sinfo = {0}; int found = 0; + uint64_t local_iface_flags = criteria->local_iface_flags; + uint64_t addr_index_map, rsc_addr_index_map; + const ucp_wireup_lane_desc_t *lane_desc; unsigned addr_index; uct_tl_resource_desc_t *resource; const ucp_address_entry_t *ae; ucp_rsc_index_t rsc_index; + ucp_lane_index_t lane; char tls_info[256]; char *p, *endp; uct_iface_attr_t *iface_attr; uct_md_attr_t *md_attr; - uint64_t addr_index_map; int is_reachable; double score; uint8_t priority; @@ -276,7 +310,8 @@ ucp_wireup_select_transport(const ucp_wireup_select_params_t *select_params, p = tls_info; endp = tls_info + sizeof(tls_info) - 1; tls_info[0] = '\0'; - tl_bitmap &= (select_params->tl_bitmap & context->tl_bitmap); + UCS_BITMAP_AND_INPLACE(&tl_bitmap, select_params->tl_bitmap); + UCS_BITMAP_AND_INPLACE(&tl_bitmap, context->tl_bitmap); show_error = (select_params->show_error && show_error); /* Check which remote addresses satisfy the criteria */ @@ -287,7 +322,8 @@ ucp_wireup_select_transport(const ucp_wireup_select_params_t *select_params, ucs_trace("addr[%d]: not in use, because on device[%d]", addr_index, ae->dev_index); continue; - } else if (!(remote_md_map & UCS_BIT(ae->md_index))) { + } else if ((ae->md_index != UCP_NULL_RESOURCE) && + !(remote_md_map & UCS_BIT(ae->md_index))) { ucs_trace("addr[%d]: not in use, because on md[%d]", addr_index, ae->md_index); continue; @@ -344,7 +380,7 @@ ucp_wireup_select_transport(const ucp_wireup_select_params_t *select_params, * Pick the best local resource to satisfy the criteria. 
* best one has the highest score (from the dedicated score_func) and * has a reachable tl on the remote peer */ - ucs_for_each_bit(rsc_index, tl_bitmap) { + UCS_BITMAP_FOR_EACH_BIT(tl_bitmap, rsc_index) { resource = &context->tl_rscs[rsc_index].tl_rsc; iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); md_attr = &context->tl_mds[context->tl_rscs[rsc_index].md_index].attr; @@ -354,13 +390,21 @@ ucp_wireup_select_transport(const ucp_wireup_select_params_t *select_params, continue; } + if (select_params->ep_init_flags & UCP_EP_INIT_CONNECT_TO_IFACE_ONLY) { + local_iface_flags |= UCT_IFACE_FLAG_CONNECT_TO_IFACE; + } + /* Check that local md and interface satisfy the criteria */ if (!ucp_wireup_check_flags(resource, md_attr->cap.flags, criteria->local_md_flags, criteria->title, ucp_wireup_md_flags, p, endp - p) || !ucp_wireup_check_flags(resource, iface_attr->cap.flags, - criteria->local_iface_flags, criteria->title, + local_iface_flags, criteria->title, ucp_wireup_iface_flags, p, endp - p) || + !ucp_wireup_check_keepalive(select_params, resource, + iface_attr->cap.flags, + criteria->local_iface_flags, criteria->title, + ucp_wireup_iface_flags, p, endp - p) || !ucp_wireup_check_flags(resource, iface_attr->cap.event_flags, criteria->local_event_flags, criteria->title, ucp_wireup_event_flags, p, endp - p) || @@ -384,14 +428,15 @@ ucp_wireup_select_transport(const ucp_wireup_select_params_t *select_params, } /* Check supplied tl & device bitmap */ - if (!(tl_bitmap & UCS_BIT(rsc_index))) { + if (!UCS_BITMAP_GET(tl_bitmap, rsc_index)) { ucs_trace(UCT_TL_RESOURCE_DESC_FMT " : disabled by tl_bitmap", UCT_TL_RESOURCE_DESC_ARG(resource)); snprintf(p, endp - p, UCT_TL_RESOURCE_DESC_FMT" - disabled for %s, ", UCT_TL_RESOURCE_DESC_ARG(resource), criteria->title); p += strlen(p); continue; - } else if (!(local_dev_bitmap & UCS_BIT(context->tl_rscs[rsc_index].dev_index))) { + } else if (!(local_dev_bitmap & + UCS_BIT(context->tl_rscs[rsc_index].dev_index))) { ucs_trace(UCT_TL_RESOURCE_DESC_FMT " : disabled by device bitmap", UCT_TL_RESOURCE_DESC_ARG(resource)); snprintf(p, endp - p, UCT_TL_RESOURCE_DESC_FMT" - disabled for %s, ", @@ -400,11 +445,30 @@ ucp_wireup_select_transport(const ucp_wireup_select_params_t *select_params, continue; } + if (select_ctx->num_lanes < UCP_MAX_LANES) { + /* If we have not reached the lanes limit, we can select any + combination of rsc_index/addr_index */ + rsc_addr_index_map = addr_index_map; + } else { + /* If we reached the lanes limit, select only existing combinations + * of rsc_index/addr_index, to make sure lane selection result will + * be the same when connecting to worker address and when connecting + * to a remote ep by wireup protocol. 
+ */ + rsc_addr_index_map = 0; + for (lane = 0; lane < select_ctx->num_lanes; ++lane) { + lane_desc = &select_ctx->lane_descs[lane]; + if (lane_desc->rsc_index == rsc_index) { + rsc_addr_index_map |= UCS_BIT(lane_desc->addr_index); + } + } + rsc_addr_index_map &= addr_index_map; + } + is_reachable = 0; - ucp_unpacked_address_for_each(ae, address) { - addr_index = ucp_unpacked_address_index(address, ae); - if (!(addr_index_map & UCS_BIT(addr_index)) || - !ucp_wireup_is_reachable(ep, select_params->ep_init_flags, + ucs_for_each_bit(addr_index, rsc_addr_index_map) { + ae = &address->address_list[addr_index]; + if (!ucp_wireup_is_reachable(ep, select_params->ep_init_flags, rsc_index, ae)) { /* Must be reachable device address, on same transport */ continue; @@ -474,12 +538,10 @@ static inline double ucp_wireup_tl_iface_latency(ucp_context_h context, (iface_attr->latency.m * context->config.est_num_eps); } -static UCS_F_NOINLINE ucs_status_t -ucp_wireup_add_lane_desc(const ucp_wireup_select_info_t *select_info, - ucp_md_index_t dst_md_index, - ucp_rsc_index_t dst_rsc_index, - ucp_lane_type_t lane_type, - ucp_wireup_select_context_t *select_ctx) +static UCS_F_NOINLINE ucs_status_t ucp_wireup_add_lane_desc( + const ucp_wireup_select_info_t *select_info, + ucp_md_index_t dst_md_index, ucs_sys_device_t dst_sys_dev, + ucp_lane_type_t lane_type, ucp_wireup_select_context_t *select_ctx) { ucp_wireup_lane_desc_t *lane_desc; ucp_lane_type_t lane_type_iter; @@ -495,9 +557,6 @@ ucp_wireup_add_lane_desc(const ucp_wireup_select_info_t *select_info, (lane_desc->path_index == select_info->path_index)) { lane = lane_desc - select_ctx->lane_descs; - ucs_assertv_always(dst_rsc_index == lane_desc->dst_rsc_index, - "lane[%d].dst_rsc_index=%d, dst_rsc_index=%d", - lane, lane_desc->dst_rsc_index, dst_rsc_index); ucs_assertv_always(dst_md_index == lane_desc->dst_md_index, "lane[%d].dst_md_index=%d, dst_md_index=%d", lane, lane_desc->dst_md_index, dst_md_index); @@ -521,12 +580,12 @@ ucp_wireup_add_lane_desc(const ucp_wireup_select_info_t *select_info, lane_desc = &select_ctx->lane_descs[select_ctx->num_lanes]; ++select_ctx->num_lanes; - lane_desc->rsc_index = select_info->rsc_index; - lane_desc->dst_rsc_index = dst_rsc_index; - lane_desc->addr_index = select_info->addr_index; - lane_desc->path_index = select_info->path_index; - lane_desc->dst_md_index = dst_md_index; - lane_desc->lane_types = UCS_BIT(lane_type); + lane_desc->rsc_index = select_info->rsc_index; + lane_desc->addr_index = select_info->addr_index; + lane_desc->path_index = select_info->path_index; + lane_desc->dst_md_index = dst_md_index; + lane_desc->dst_sys_dev = dst_sys_dev; + lane_desc->lane_types = UCS_BIT(lane_type); for (lane_type_iter = UCP_LANE_TYPE_FIRST; lane_type_iter < UCP_LANE_TYPE_LAST; ++lane_type_iter) { @@ -545,15 +604,12 @@ ucp_wireup_add_lane(const ucp_wireup_select_params_t *select_params, ucp_lane_type_t lane_type, ucp_wireup_select_context_t *select_ctx) { - ucp_md_index_t dst_md_index; - ucp_rsc_index_t dst_rsc_index; - - dst_md_index = select_params->address->address_list - [select_info->addr_index].md_index; - dst_rsc_index = select_params->address->address_list - [select_info->addr_index].iface_attr.dst_rsc_index; - return ucp_wireup_add_lane_desc(select_info, dst_md_index, dst_rsc_index, - lane_type, select_ctx); + ucp_address_entry_t *addr_list = select_params->address->address_list; + unsigned addr_index = select_info->addr_index; + + return ucp_wireup_add_lane_desc(select_info, addr_list[addr_index].md_index, + 
addr_list[addr_index].sys_dev, lane_type, + select_ctx); } static int ucp_wireup_compare_score(const void *elem1, const void *elem2, @@ -595,10 +651,10 @@ static int ucp_wireup_compare_lane_amo_score(const void *elem1, const void *elem return ucp_wireup_compare_score(elem1, elem2, arg, UCP_LANE_TYPE_AMO); } -static void -ucp_wireup_unset_tl_by_md(const ucp_wireup_select_params_t *sparams, - const ucp_wireup_select_info_t *sinfo, - uint64_t *tl_bitmap, uint64_t *remote_md_map) +static void ucp_wireup_unset_tl_by_md(const ucp_wireup_select_params_t *sparams, + const ucp_wireup_select_info_t *sinfo, + ucp_tl_bitmap_t *tl_bitmap, + uint64_t *remote_md_map) { ucp_context_h context = sparams->ep->worker->context; const ucp_address_entry_t *ae = &sparams->address-> @@ -609,18 +665,17 @@ ucp_wireup_unset_tl_by_md(const ucp_wireup_select_params_t *sparams, *remote_md_map &= ~UCS_BIT(dst_md_index); - ucs_for_each_bit(i, context->tl_bitmap) { + UCS_BITMAP_FOR_EACH_BIT(context->tl_bitmap, i) { if (context->tl_rscs[i].md_index == md_index) { - *tl_bitmap &= ~UCS_BIT(i); + UCS_BITMAP_UNSET(*tl_bitmap, i); } } } -static UCS_F_NOINLINE ucs_status_t -ucp_wireup_add_memaccess_lanes(const ucp_wireup_select_params_t *select_params, - const ucp_wireup_criteria_t *criteria, - uint64_t tl_bitmap, ucp_lane_type_t lane_type, - ucp_wireup_select_context_t *select_ctx) +static UCS_F_NOINLINE ucs_status_t ucp_wireup_add_memaccess_lanes( + const ucp_wireup_select_params_t *select_params, + const ucp_wireup_criteria_t *criteria, ucp_tl_bitmap_t tl_bitmap, + ucp_lane_type_t lane_type, ucp_wireup_select_context_t *select_ctx) { ucp_wireup_criteria_t mem_criteria = *criteria; ucp_wireup_select_info_t select_info = {0}; @@ -636,9 +691,9 @@ ucp_wireup_add_memaccess_lanes(const ucp_wireup_select_params_t *select_params, snprintf(title, sizeof(title), criteria->title, "registered"); mem_criteria.title = title; mem_criteria.remote_md_flags = UCT_MD_FLAG_REG | criteria->remote_md_flags; - status = ucp_wireup_select_transport(select_params, &mem_criteria, - tl_bitmap, remote_md_map, - UINT64_MAX, UINT64_MAX, + status = ucp_wireup_select_transport(select_ctx, select_params, + &mem_criteria, tl_bitmap, + remote_md_map, UINT64_MAX, UINT64_MAX, show_error, &select_info); if (status == UCS_OK) { /* Add to the list of lanes */ @@ -673,10 +728,10 @@ ucp_wireup_add_memaccess_lanes(const ucp_wireup_select_params_t *select_params, criteria->remote_md_flags; for (;;) { - status = ucp_wireup_select_transport(select_params, &mem_criteria, - tl_bitmap, remote_md_map, - UINT64_MAX, UINT64_MAX, 0, - &select_info); + status = ucp_wireup_select_transport(select_ctx, select_params, + &mem_criteria, tl_bitmap, + remote_md_map, UINT64_MAX, + UINT64_MAX, 0, &select_info); /* Break if: */ /* - transport selection wasn't OK */ if ((status != UCS_OK) || @@ -722,6 +777,8 @@ static void ucp_wireup_fill_peer_err_criteria(ucp_wireup_criteria_t *criteria, { if (ep_init_flags & UCP_EP_INIT_ERR_MODE_PEER_FAILURE) { criteria->local_iface_flags |= UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE; + /* the transport selection procedure will additionally check for + * keepalive (KA) or EP-check support */ } } @@ -745,7 +802,6 @@ static void ucp_wireup_fill_aux_criteria(ucp_wireup_criteria_t *criteria, criteria->calc_score = ucp_wireup_aux_score_func; criteria->tl_rsc_flags = UCP_TL_RSC_FLAG_AUX; /* Can use aux transports */ - /* TODO: add evaluation for err handling/keepalive mode */ ucp_wireup_fill_peer_err_criteria(criteria, ep_init_flags); } @@ -762,19 +818,7 @@ static void
ucp_wireup_clean_amo_criteria(ucp_wireup_criteria_t *criteria) */ static int ucp_wireup_allow_am_emulation_layer(unsigned ep_init_flags) { - if (ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE) { - return 0; - } - - /* disable emulation layer if err handling is required due to lack of - * keep alive protocol, unless we have CM which handles disconnect - */ - if ((ep_init_flags & UCP_EP_INIT_ERR_MODE_PEER_FAILURE) && - !ucp_ep_init_flags_has_cm(ep_init_flags)) { - return 0; - } - - return 1; + return !(ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE); } static unsigned @@ -794,11 +838,12 @@ ucp_wireup_add_cm_lane(const ucp_wireup_select_params_t *select_params, return UCS_OK; } - ucp_wireup_init_select_info(0., 0, UCP_NULL_RESOURCE, 0, &select_info); + ucp_wireup_init_select_info(0., UINT_MAX, UCP_NULL_RESOURCE, 0, + &select_info); /* server is not a proxy because it can create all lanes connected */ return ucp_wireup_add_lane_desc(&select_info, UCP_NULL_RESOURCE, - UCP_NULL_RESOURCE, UCP_LANE_TYPE_CM, + UCS_SYS_DEVICE_ID_UNKNOWN, UCP_LANE_TYPE_CM, select_ctx); } @@ -810,8 +855,9 @@ ucp_wireup_add_rma_lanes(const ucp_wireup_select_params_t *select_params, unsigned ep_init_flags = ucp_wireup_ep_init_flags(select_params, select_ctx); - if (!(ucp_ep_get_context_features(select_params->ep) & UCP_FEATURE_RMA) && - !(ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE)) { + if ((!(ucp_ep_get_context_features(select_params->ep) & UCP_FEATURE_RMA) && + !(ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE)) || + (ep_init_flags & UCP_EP_INIT_CM_PHASE)) { return UCS_OK; } @@ -833,8 +879,9 @@ ucp_wireup_add_rma_lanes(const ucp_wireup_select_params_t *select_params, criteria.tl_rsc_flags = 0; ucp_wireup_fill_peer_err_criteria(&criteria, ep_init_flags); - return ucp_wireup_add_memaccess_lanes(select_params, &criteria, UINT64_MAX, - UCP_LANE_TYPE_RMA, select_ctx); + return ucp_wireup_add_memaccess_lanes(select_params, &criteria, + ucp_tl_bitmap_max, UCP_LANE_TYPE_RMA, + select_ctx); } double ucp_wireup_amo_score_func(ucp_context_h context, @@ -857,11 +904,11 @@ ucp_wireup_add_amo_lanes(const ucp_wireup_select_params_t *select_params, unsigned ep_init_flags = ucp_wireup_ep_init_flags(select_params, select_ctx); ucp_rsc_index_t rsc_index; - uint64_t tl_bitmap; + ucp_tl_bitmap_t tl_bitmap; - if (!ucs_test_flags(context->config.features, - UCP_FEATURE_AMO32, UCP_FEATURE_AMO64) || - (ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE)) { + if (!ucs_test_flags(context->config.features, UCP_FEATURE_AMO32, + UCP_FEATURE_AMO64) || + (ep_init_flags & (UCP_EP_INIT_FLAG_MEM_TYPE | UCP_EP_INIT_CM_PHASE))) { return UCS_OK; } @@ -881,9 +928,9 @@ ucp_wireup_add_amo_lanes(const ucp_wireup_select_params_t *select_params, * connect back on p2p transport. 
*/ tl_bitmap = worker->atomic_tls; - ucs_for_each_bit(rsc_index, context->tl_bitmap) { + UCS_BITMAP_FOR_EACH_BIT(context->tl_bitmap, rsc_index) { if (ucp_worker_is_tl_2iface(worker, rsc_index)) { - tl_bitmap |= UCS_BIT(rsc_index); + UCS_BITMAP_SET(tl_bitmap, rsc_index); } } @@ -974,7 +1021,7 @@ ucp_wireup_add_am_lane(const ucp_wireup_select_params_t *select_params, ucp_wireup_select_context_t *select_ctx) { ucp_worker_h worker = select_params->ep->worker; - uint64_t tl_bitmap = select_params->tl_bitmap; + ucp_tl_bitmap_t tl_bitmap = select_params->tl_bitmap; ucp_wireup_criteria_t criteria = {0}; const uct_iface_attr_t *iface_attr; ucs_status_t status; @@ -1002,9 +1049,10 @@ ucp_wireup_add_am_lane(const ucp_wireup_select_params_t *select_params, criteria.local_event_flags = UCP_WIREUP_UCT_EVENT_CAP_FLAGS; } - status = ucp_wireup_select_transport(select_params, &criteria, tl_bitmap, - UINT64_MAX, UINT64_MAX, UINT64_MAX, - 1, am_info); + status = ucp_wireup_select_transport(select_ctx, select_params, + &criteria, tl_bitmap, UINT64_MAX, + UINT64_MAX, UINT64_MAX, 1, + am_info); if (status != UCS_OK) { return status; } @@ -1015,7 +1063,7 @@ ucp_wireup_add_am_lane(const ucp_wireup_select_params_t *select_params, ucs_debug("ep %p: rsc_index[%d] am.max_bcopy is too small: %zu, " "expected: >= %d", select_params->ep, am_info->rsc_index, iface_attr->cap.am.max_bcopy, UCP_MIN_BCOPY); - tl_bitmap &= ~UCS_BIT(am_info->rsc_index); + UCS_BITMAP_UNSET(tl_bitmap, am_info->rsc_index); continue; } @@ -1042,7 +1090,7 @@ static double ucp_wireup_am_bw_score_func(ucp_context_h context, static unsigned ucp_wireup_add_bw_lanes(const ucp_wireup_select_params_t *select_params, const ucp_wireup_select_bw_info_t *bw_info, - uint64_t tl_bitmap, ucp_lane_index_t excl_lane, + ucp_tl_bitmap_t tl_bitmap, ucp_lane_index_t excl_lane, ucp_wireup_select_context_t *select_ctx) { ucp_ep_h ep = select_params->ep; @@ -1072,10 +1120,10 @@ ucp_wireup_add_bw_lanes(const ucp_wireup_select_params_t *select_params, while ((num_lanes < bw_info->max_lanes) && (ucs_popcount(md_map) < UCP_MAX_OP_MDS)) { if (excl_lane == UCP_NULL_LANE) { - status = ucp_wireup_select_transport(select_params, &bw_info->criteria, - tl_bitmap, UINT64_MAX, - local_dev_bitmap, remote_dev_bitmap, - 0, &sinfo); + status = ucp_wireup_select_transport(select_ctx, select_params, + &bw_info->criteria, tl_bitmap, + UINT64_MAX, local_dev_bitmap, + remote_dev_bitmap, 0, &sinfo); if (status != UCS_OK) { break; } @@ -1125,19 +1173,19 @@ static ucs_status_t ucp_wireup_add_am_bw_lanes(const ucp_wireup_select_params_t *select_params, ucp_wireup_select_context_t *select_ctx) { - ucp_ep_h ep = select_params->ep; + ucp_ep_h ep = select_params->ep; ucp_context_h context = ep->worker->context; unsigned ep_init_flags = ucp_wireup_ep_init_flags(select_params, select_ctx); - + ucp_lane_index_t lane_desc_idx, am_lane; ucp_wireup_select_bw_info_t bw_info; unsigned num_am_bw_lanes; /* Check if we need active message BW lanes */ - if (!(ucp_ep_get_context_features(ep) & (UCP_FEATURE_TAG | - UCP_FEATURE_AM)) || - (ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE) || + if (!(ucp_ep_get_context_features(ep) & + (UCP_FEATURE_TAG | UCP_FEATURE_AM)) || + (ep_init_flags & (UCP_EP_INIT_FLAG_MEM_TYPE | UCP_EP_INIT_CM_PHASE)) || (context->config.ext.max_eager_lanes < 2)) { return UCS_OK; } @@ -1179,8 +1227,9 @@ ucp_wireup_add_am_bw_lanes(const ucp_wireup_select_params_t *select_params, } } - num_am_bw_lanes = ucp_wireup_add_bw_lanes(select_params, &bw_info, UINT64_MAX, - am_lane, select_ctx); + 
num_am_bw_lanes = ucp_wireup_add_bw_lanes(select_params, &bw_info, + ucp_tl_bitmap_max, am_lane, + select_ctx); return ((am_lane != UCP_NULL_LANE) || (num_am_bw_lanes > 0)) ? UCS_OK : UCS_ERR_UNREACHABLE; } @@ -1217,8 +1266,13 @@ ucp_wireup_add_rma_bw_lanes(const ucp_wireup_select_params_t *select_params, ucs_memory_type_t mem_type; size_t added_lanes; uint64_t md_reg_flag; + ucp_tl_bitmap_t tl_bitmap; uint8_t i; + if (ep_init_flags & UCP_EP_INIT_CM_PHASE) { + return UCS_OK; + } + if (ep_init_flags & UCP_EP_INIT_FLAG_MEM_TYPE) { md_reg_flag = 0; } else if (ucp_ep_get_context_features(ep) & @@ -1294,16 +1348,23 @@ ucp_wireup_add_rma_bw_lanes(const ucp_wireup_select_params_t *select_params, bw_info.criteria.local_iface_flags |= iface_rma_flags; added_lanes = 0; + UCS_BITMAP_CLEAR(&tl_bitmap); for (mem_type = UCS_MEMORY_TYPE_HOST; mem_type < UCS_MEMORY_TYPE_LAST; mem_type++) { - if (!context->mem_type_access_tls[mem_type]) { + if (UCS_BITMAP_IS_ZERO_INPLACE( + &context->mem_type_access_tls[mem_type])) { continue; } - added_lanes += ucp_wireup_add_bw_lanes(select_params, &bw_info, - context->mem_type_access_tls[mem_type], - UCP_NULL_LANE, select_ctx); + added_lanes += ucp_wireup_add_bw_lanes( + select_params, &bw_info, + UCP_TL_BITMAP_AND_NOT( + context->mem_type_access_tls[mem_type], tl_bitmap), + UCP_NULL_LANE, select_ctx); + + UCS_BITMAP_OR_INPLACE(&tl_bitmap, + context->mem_type_access_tls[mem_type]); } if (added_lanes /* There are selected lanes */ || @@ -1327,9 +1388,12 @@ ucp_wireup_add_tag_lane(const ucp_wireup_select_params_t *select_params, ucp_ep_h ep = select_params->ep; ucp_wireup_criteria_t criteria = {0}; ucp_wireup_select_info_t select_info = {0}; + unsigned ep_init_flags = ucp_wireup_ep_init_flags( + select_params, select_ctx); ucs_status_t status; if (!(ucp_ep_get_context_features(ep) & UCP_FEATURE_TAG) || + (ep_init_flags & (UCP_EP_INIT_FLAG_MEM_TYPE | UCP_EP_INIT_CM_PHASE)) || /* TODO: remove check below when UCP_ERR_HANDLING_MODE_PEER supports * RNDV-protocol or HW TM supports fragmented protocols */ @@ -1354,10 +1418,11 @@ ucp_wireup_add_tag_lane(const ucp_wireup_select_params_t *select_params, } /* Do not add tag offload lane, if selected tag lane score is lower - * than AM score. In this case AM will be used for tag macthing. */ - status = ucp_wireup_select_transport(select_params, &criteria, - UINT64_MAX, UINT64_MAX, UINT64_MAX, - UINT64_MAX, 0, &select_info); + * than AM score. In this case AM will be used for tag matching. 
*/ + status = ucp_wireup_select_transport(select_ctx, select_params, &criteria, + ucp_tl_bitmap_max, UINT64_MAX, + UINT64_MAX, UINT64_MAX, 0, + &select_info); if ((status == UCS_OK) && (ucp_score_cmp(select_info.score, am_info->score) >= 0)) { @@ -1427,7 +1492,7 @@ static UCS_F_NOINLINE void ucp_wireup_select_params_init(ucp_wireup_select_params_t *select_params, ucp_ep_h ep, unsigned ep_init_flags, const ucp_unpacked_address_t *remote_address, - uint64_t tl_bitmap, int show_error) + ucp_tl_bitmap_t tl_bitmap, int show_error) { select_params->ep = ep; select_params->ep_init_flags = ep_init_flags; @@ -1502,7 +1567,9 @@ ucp_wireup_search_lanes(const ucp_wireup_select_params_t *select_params, static void ucp_wireup_init_keepalive_map(ucp_worker_h worker, ucp_ep_config_key_t *key) { - ucp_context_h context = worker->context; + ucp_context_h context = worker->context; + int shm_added_ep_check = 0; + uct_tl_resource_desc_t *resource; ucp_lane_index_t lane; ucp_rsc_index_t rsc_index; ucp_rsc_index_t dev_index; @@ -1516,6 +1583,22 @@ static void ucp_wireup_init_keepalive_map(ucp_worker_h worker, dev_map_used = 0; + /* find all devices with built-in keepalive support */ + for (lane = 0; lane < key->num_lanes; ++lane) { + rsc_index = key->lanes[lane].rsc_index; + if (rsc_index == UCP_NULL_RESOURCE) { + continue; + } + + dev_index = context->tl_rscs[rsc_index].dev_index; + ucs_assert(dev_index < (sizeof(dev_map_used) * 8)); + iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); + if (iface_attr->cap.flags & UCT_IFACE_FLAG_EP_KEEPALIVE) { + dev_map_used |= UCS_BIT(dev_index); + } + } + + /* send ep_check on devices without built-in keepalive */ for (lane = 0; lane < key->num_lanes; ++lane) { /* add lanes to ep_check map */ rsc_index = key->lanes[lane].rsc_index; @@ -1523,14 +1606,28 @@ static void ucp_wireup_init_keepalive_map(ucp_worker_h worker, continue; } + resource = &context->tl_rscs[rsc_index].tl_rsc; dev_index = context->tl_rscs[rsc_index].dev_index; ucs_assert(dev_index < (sizeof(dev_map_used) * 8)); + iface_attr = ucp_worker_iface_get_attr(worker, rsc_index); if (!(UCS_BIT(dev_index) & dev_map_used) && /* TODO: convert to assert to make sure iface supports * both err handling & ep_check */ (iface_attr->cap.flags & UCT_IFACE_FLAG_EP_CHECK)) { ucs_assert(!(key->ep_check_map & UCS_BIT(lane))); + + if (resource->dev_type & UCT_DEVICE_TYPE_SHM) { + if (shm_added_ep_check) { + /* Skip if an SHM device was already added to the EP check + * map - add only one SHM device in order to simplify error + * handling, since all SHM transports perform the same + * peer-existence check */ + continue; + } + + shm_added_ep_check = 1; + } + key->ep_check_map |= UCS_BIT(lane); dev_map_used |= UCS_BIT(dev_index); } @@ -1558,12 +1655,12 @@ ucp_wireup_construct_lanes(const ucp_wireup_select_params_t *select_params, */ for (lane = 0; lane < key->num_lanes; ++lane) { ucs_assert(select_ctx->lane_descs[lane].lane_types != 0); - key->lanes[lane].rsc_index = select_ctx->lane_descs[lane].rsc_index; - key->lanes[lane].dst_rsc_index = select_ctx->lane_descs[lane].dst_rsc_index; - key->lanes[lane].dst_md_index = select_ctx->lane_descs[lane].dst_md_index; - key->lanes[lane].path_index = select_ctx->lane_descs[lane].path_index; - key->lanes[lane].lane_types = select_ctx->lane_descs[lane].lane_types; - addr_indices[lane] = select_ctx->lane_descs[lane].addr_index; + key->lanes[lane].rsc_index = select_ctx->lane_descs[lane].rsc_index; + key->lanes[lane].dst_md_index = select_ctx->lane_descs[lane].dst_md_index; + key->lanes[lane].dst_sys_dev =
select_ctx->lane_descs[lane].dst_sys_dev; + key->lanes[lane].path_index = select_ctx->lane_descs[lane].path_index; + key->lanes[lane].lane_types = select_ctx->lane_descs[lane].lane_types; + addr_indices[lane] = select_ctx->lane_descs[lane].addr_index; if (select_ctx->lane_descs[lane].lane_types & UCS_BIT(UCP_LANE_TYPE_CM)) { ucs_assert(key->cm_lane == UCP_NULL_LANE); @@ -1612,7 +1709,7 @@ ucp_wireup_construct_lanes(const ucp_wireup_select_params_t *select_params, /* Select lane for wireup messages, if: */ if (/* - no CM support was requested */ !ucp_ep_init_flags_has_cm(select_params->ep_init_flags) || - /* - CM support was reuested, but not locally connected yet */ + /* - CM support was requested, but not locally connected yet */ !(ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED)) { key->wireup_msg_lane = ucp_wireup_select_wireup_msg_lane(worker, @@ -1654,17 +1751,20 @@ ucp_wireup_construct_lanes(const ucp_wireup_select_params_t *select_params, } ucs_status_t -ucp_wireup_select_lanes(ucp_ep_h ep, unsigned ep_init_flags, uint64_t tl_bitmap, +ucp_wireup_select_lanes(ucp_ep_h ep, unsigned ep_init_flags, + ucp_tl_bitmap_t tl_bitmap, const ucp_unpacked_address_t *remote_address, unsigned *addr_indices, ucp_ep_config_key_t *key) { - ucp_worker_h worker = ep->worker; - uint64_t scalable_tl_bitmap = worker->scalable_tl_bitmap & tl_bitmap; + ucp_worker_h worker = ep->worker; + ucp_tl_bitmap_t scalable_tl_bitmap = worker->scalable_tl_bitmap; ucp_wireup_select_context_t select_ctx; ucp_wireup_select_params_t select_params; ucs_status_t status; - if (scalable_tl_bitmap) { + UCS_BITMAP_AND_INPLACE(&scalable_tl_bitmap, tl_bitmap); + + if (!UCS_BITMAP_IS_ZERO_INPLACE(&scalable_tl_bitmap)) { ucp_wireup_select_params_init(&select_params, ep, ep_init_flags, remote_address, scalable_tl_bitmap, 0); status = ucp_wireup_search_lanes(&select_params, key->err_mode, @@ -1688,6 +1788,11 @@ ucp_wireup_select_lanes(ucp_ep_h ep, unsigned ep_init_flags, uint64_t tl_bitmap, out: ucp_wireup_construct_lanes(&select_params, &select_ctx, addr_indices, key); + + /* Only two lanes must be created during CM phase (CM lane and TL lane) of + * connection setup between two peers */ + ucs_assert(!(ep_init_flags & UCP_EP_INIT_CM_PHASE) || key->num_lanes == 2); + return UCS_OK; } @@ -1703,59 +1808,18 @@ static double ucp_wireup_aux_score_func(ucp_context_h context, ucs_status_t ucp_wireup_select_aux_transport(ucp_ep_h ep, unsigned ep_init_flags, - uint64_t tl_bitmap, + ucp_tl_bitmap_t tl_bitmap, const ucp_unpacked_address_t *remote_address, ucp_wireup_select_info_t *select_info) { - ucp_wireup_criteria_t criteria = {0}; + ucp_wireup_select_context_t select_ctx = {}; + ucp_wireup_criteria_t criteria = {}; ucp_wireup_select_params_t select_params; ucp_wireup_select_params_init(&select_params, ep, ep_init_flags, remote_address, tl_bitmap, 1); ucp_wireup_fill_aux_criteria(&criteria, ep_init_flags); - return ucp_wireup_select_transport(&select_params, &criteria, - UINT64_MAX, UINT64_MAX, UINT64_MAX, - UINT64_MAX, 1, select_info); -} - -ucs_status_t -ucp_wireup_select_sockaddr_transport(const ucp_context_h context, - const ucs_sock_addr_t *sockaddr, - ucp_rsc_index_t *rsc_index_p) -{ - char saddr_str[UCS_SOCKADDR_STRING_LEN]; - ucp_tl_resource_desc_t *resource; - ucp_rsc_index_t tl_id; - ucp_md_index_t md_index; - uct_md_h md; - int i; - - /* Go over the sockaddr transports priority array and try to use the transports - * one by one for the client side */ - for (i = 0; i < context->config.num_sockaddr_tls; i++) { - tl_id = 
context->config.sockaddr_tl_ids[i]; - resource = &context->tl_rscs[tl_id]; - md_index = resource->md_index; - md = context->tl_mds[md_index].md; - - ucs_assert(context->tl_mds[md_index].attr.cap.flags & - UCT_MD_FLAG_SOCKADDR); - - /* The client selects the transport for sockaddr according to the - * configuration. We rely on the server having this transport available - * as well */ - if (uct_md_is_sockaddr_accessible(md, sockaddr, - UCT_SOCKADDR_ACC_REMOTE)) { - *rsc_index_p = tl_id; - ucs_debug("sockaddr transport selected: %s", resource->tl_rsc.tl_name); - return UCS_OK; - } - - ucs_debug("md %s cannot reach %s", - context->tl_mds[md_index].rsc.md_name, - ucs_sockaddr_str(sockaddr->addr, saddr_str, - sizeof(saddr_str))); - } - - return UCS_ERR_UNREACHABLE; + return ucp_wireup_select_transport(&select_ctx, &select_params, &criteria, + ucp_tl_bitmap_max, UINT64_MAX, + UINT64_MAX, UINT64_MAX, 1, select_info); } diff --git a/src/ucp/wireup/wireup.c b/src/ucp/wireup/wireup.c index c3619c0c1e5..2052304539a 100644 --- a/src/ucp/wireup/wireup.c +++ b/src/ucp/wireup/wireup.c @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -18,21 +18,40 @@ #include #include #include +#include #include #include #include +#include /* * Description of the protocol in UCX wiki: * https://github.com/openucx/ucx/wiki/Connection-establishment */ -static size_t ucp_wireup_msg_pack(void *dest, void *arg) + +/* Validate wireup message, implemented as a macro to prevent static checker + * warnings */ +#define UCP_WIREUP_MSG_CHECK(_msg, _ep, _msg_type) \ + do { \ + ucs_assert((_msg)->type == (_msg_type)); \ + if ((_msg_type) == UCP_WIREUP_MSG_REQUEST) { \ + ucs_assert(((_msg)->dst_ep_id == UCS_PTR_MAP_KEY_INVALID) != \ + ((_ep) != NULL)); \ + } else { \ + ucs_assert((_msg)->dst_ep_id != UCS_PTR_MAP_KEY_INVALID); \ + ucs_assert((_ep) != NULL); \ + } \ + } while (0) + + +size_t ucp_wireup_msg_pack(void *dest, void *arg) { - ucp_request_t *req = arg; - *(ucp_wireup_msg_t*)dest = req->send.wireup; - memcpy((ucp_wireup_msg_t*)dest + 1, req->send.buffer, req->send.length); - return sizeof(ucp_wireup_msg_t) + req->send.length; + struct iovec *wireup_msg_iov = (struct iovec*)arg; + + return ucs_iov_copy(wireup_msg_iov, 2, 0, dest, + wireup_msg_iov[0].iov_len + wireup_msg_iov[1].iov_len, + UCS_IOV_COPY_TO_BUF); } static const char* ucp_wireup_msg_str(uint8_t msg_type) @@ -46,6 +65,10 @@ static const char* ucp_wireup_msg_str(uint8_t msg_type) return "REP"; case UCP_WIREUP_MSG_ACK: return "ACK"; + case UCP_WIREUP_MSG_EP_CHECK: + return "EP_CHECK"; + case UCP_WIREUP_MSG_EP_REMOVED: + return "EP_REMOVED"; default: return ""; } @@ -79,16 +102,21 @@ static ucp_lane_index_t ucp_wireup_get_msg_lane(ucp_ep_h ep, uint8_t msg_type) ucs_status_t ucp_wireup_msg_progress(uct_pending_req_t *self) { - ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); - ucp_ep_h ep = req->send.ep; + ucp_request_t *req = ucs_container_of(self, ucp_request_t, send.uct); + ucp_ep_h ep = req->send.ep; + ucs_status_t status; ssize_t packed_len; unsigned am_flags; + struct iovec wireup_msg_iov[2]; + + UCS_ASYNC_BLOCK(&ep->worker->async); if (req->send.wireup.type == UCP_WIREUP_MSG_REQUEST) { if (ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED) { ucs_trace("ep %p: not sending wireup message - remote already connected", ep); - goto out; + status = UCS_OK; + goto out_free_req; } } else if 
(req->send.wireup.type == UCP_WIREUP_MSG_PRE_REQUEST) { ucs_assert (!(ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED)); @@ -106,104 +134,140 @@ ucs_status_t ucp_wireup_msg_progress(uct_pending_req_t *self) VALGRIND_CHECK_MEM_IS_DEFINED(&req->send.wireup, sizeof(req->send.wireup)); VALGRIND_CHECK_MEM_IS_DEFINED(req->send.buffer, req->send.length); + wireup_msg_iov[0].iov_base = &req->send.wireup; + wireup_msg_iov[0].iov_len = sizeof(req->send.wireup); + + wireup_msg_iov[1].iov_base = req->send.buffer; + wireup_msg_iov[1].iov_len = req->send.length; + packed_len = uct_ep_am_bcopy(ep->uct_eps[req->send.lane], UCP_AM_ID_WIREUP, - ucp_wireup_msg_pack, req, am_flags); - if (packed_len < 0) { - if (packed_len != UCS_ERR_NO_RESOURCE) { - ucs_error("failed to send wireup: %s", - ucs_status_string((ucs_status_t)packed_len)); + ucp_wireup_msg_pack, wireup_msg_iov, am_flags); + if (ucs_unlikely(packed_len < 0)) { + status = (ucs_status_t)packed_len; + if (ucs_likely(status == UCS_ERR_NO_RESOURCE)) { + goto out; } - return (ucs_status_t)packed_len; + + ucs_diag("failed to send wireup: %s", ucs_status_string(status)); + status = UCS_OK; + goto out_free_req; + } else { + status = UCS_OK; } switch (req->send.wireup.type) { case UCP_WIREUP_MSG_PRE_REQUEST: - ep->flags |= UCP_EP_FLAG_CONNECT_PRE_REQ_SENT; + ucp_ep_update_flags(ep, UCP_EP_FLAG_CONNECT_PRE_REQ_SENT, 0); break; case UCP_WIREUP_MSG_REQUEST: - ep->flags |= UCP_EP_FLAG_CONNECT_REQ_SENT; + ucp_ep_update_flags(ep, UCP_EP_FLAG_CONNECT_REQ_SENT, 0); break; case UCP_WIREUP_MSG_REPLY: - ep->flags |= UCP_EP_FLAG_CONNECT_REP_SENT; + ucp_ep_update_flags(ep, UCP_EP_FLAG_CONNECT_REP_SENT, 0); break; case UCP_WIREUP_MSG_ACK: - ep->flags |= UCP_EP_FLAG_CONNECT_ACK_SENT; + ucp_ep_update_flags(ep, UCP_EP_FLAG_CONNECT_ACK_SENT, 0); break; } +out_free_req: + ucs_free(req->send.buffer); + ucp_request_mem_free(req); out: - ucs_free((void*)req->send.buffer); - ucs_free(req); - return UCS_OK; + UCS_ASYNC_UNBLOCK(&ep->worker->async); + return status; } -static inline int ucp_wireup_is_ep_needed(ucp_ep_h ep) +ucs_status_t +ucp_wireup_msg_prepare(ucp_ep_h ep, uint8_t type, + const ucp_tl_bitmap_t *tl_bitmap, + const ucp_lane_index_t *lanes2remote, + ucp_wireup_msg_t *msg_hdr, void **address_p, + size_t *address_length_p) { - return (ep != NULL) && !(ep->flags & UCP_EP_FLAG_LISTENER); + unsigned pack_flags = ucp_worker_default_address_pack_flags(ep->worker) | + UCP_ADDRESS_PACK_FLAG_TL_RSC_IDX; + ucs_status_t status; + + msg_hdr->type = type; + msg_hdr->err_mode = ucp_ep_config(ep)->key.err_mode; + msg_hdr->conn_sn = ep->conn_sn; + msg_hdr->src_ep_id = ucp_ep_local_id(ep); + if (ep->flags & UCP_EP_FLAG_REMOTE_ID) { + msg_hdr->dst_ep_id = ucp_ep_remote_id(ep); + } else { + msg_hdr->dst_ep_id = UCS_PTR_MAP_KEY_INVALID; + } + + /* pack all addresses */ + status = ucp_address_pack(ep->worker, ep, tl_bitmap, pack_flags, + lanes2remote, address_length_p, address_p); + if (status != UCS_OK) { + ucs_error("failed to pack address: %s", ucs_status_string(status)); + } + + return status; } /* * @param [in] rsc_tli Resource index for every lane. 
*/ + static ucs_status_t -ucp_wireup_msg_send(ucp_ep_h ep, uint8_t type, uint64_t tl_bitmap, +ucp_wireup_msg_send(ucp_ep_h ep, uint8_t type, const ucp_tl_bitmap_t *tl_bitmap, const ucp_lane_index_t *lanes2remote) { - ucp_request_t* req; + ucp_request_t *req; ucs_status_t status; - void *address; ucs_assert(ep->cfg_index != UCP_WORKER_CFG_INDEX_NULL); + if (ep->flags & UCP_EP_FLAG_FAILED) { + ucs_debug("ep %p: not sending WIREUP message (%u), because ep failed", + ep, type); + return UCS_ERR_CONNECTION_RESET; + } + /* We cannot allocate from memory pool because it's not thread safe * and this function may be called from any thread */ - req = ucs_malloc(sizeof(*req), "wireup_msg_req"); + req = ucp_request_mem_alloc("wireup_msg_req"); if (req == NULL) { + ucs_error("failed to allocate request for sending WIREUP message"); return UCS_ERR_NO_MEMORY; } - req->flags = 0; - req->send.ep = ep; - req->send.wireup.type = type; - req->send.wireup.err_mode = ucp_ep_config(ep)->key.err_mode; - req->send.wireup.conn_sn = ep->conn_sn; - req->send.wireup.src_ep_id = ucp_ep_local_id(ep); - if (ep->flags & UCP_EP_FLAG_REMOTE_ID) { - req->send.wireup.dst_ep_id = ucp_ep_remote_id(ep); - } else { - req->send.wireup.dst_ep_id = UCP_EP_ID_INVALID; - } - - req->send.uct.func = ucp_wireup_msg_progress; - req->send.datatype = ucp_dt_make_contig(1); + req->flags = 0; + req->send.ep = ep; + req->send.uct.func = ucp_wireup_msg_progress; + req->send.datatype = ucp_dt_make_contig(1); ucp_request_send_state_init(req, ucp_dt_make_contig(1), 0); - /* pack all addresses */ - status = ucp_address_pack(ep->worker, - ucp_wireup_is_ep_needed(ep) ? ep : NULL, - tl_bitmap, UCP_ADDRESS_PACK_FLAGS_ALL, - lanes2remote, &req->send.length, &address); + status = ucp_wireup_msg_prepare(ep, type, tl_bitmap, lanes2remote, + &req->send.wireup, &req->send.buffer, + &req->send.length); if (status != UCS_OK) { - ucs_free(req); - ucs_error("failed to pack address: %s", ucs_status_string(status)); + ucp_request_mem_free(req); return status; } - req->send.buffer = address; - ucp_request_send(req, 0); return UCS_OK; } -static uint64_t ucp_wireup_get_ep_tl_bitmap(ucp_ep_h ep, ucp_lane_map_t lane_map) +static ucp_tl_bitmap_t +ucp_wireup_get_ep_tl_bitmap(ucp_ep_h ep, ucp_lane_map_t lane_map) { - uint64_t tl_bitmap = 0; + ucp_tl_bitmap_t tl_bitmap = UCS_BITMAP_ZERO; ucp_lane_index_t lane; ucs_for_each_bit(lane, lane_map) { ucs_assert(lane < UCP_MAX_LANES); - tl_bitmap |= UCS_BIT(ucp_ep_get_rsc_index(ep, lane)); + if (ucp_ep_get_rsc_index(ep, lane) == UCP_NULL_RESOURCE) { + continue; + } + + UCS_BITMAP_SET(tl_bitmap, ucp_ep_get_rsc_index(ep, lane)); } return tl_bitmap; @@ -303,6 +367,7 @@ ucp_wireup_connect_local(ucp_ep_h ep, ucs_status_t status; ucs_trace("ep %p: connect local transports", ep); + ucs_log_indent(1); for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) { if (!ucp_ep_is_lane_p2p(ep, lane)) { @@ -316,16 +381,20 @@ ucp_wireup_connect_local(ucp_ep_h ep, if (status != UCS_OK) { ucs_error("ep %p: no remote ep address for lane[%d]->remote_lane[%d]", ep, lane, remote_lane); - return status; + goto out; } status = uct_ep_connect_to_ep(ep->uct_eps[lane], dev_addr, ep_addr); if (status != UCS_OK) { - return status; + goto out; } } - return UCS_OK; + status = UCS_OK; + +out: + ucs_log_indent(-1); + return status; } void ucp_wireup_remote_connected(ucp_ep_h ep) @@ -337,12 +406,17 @@ void ucp_wireup_remote_connected(ucp_ep_h ep) } ucs_trace("ep %p: remote connected", ep); - ep->flags |= UCP_EP_FLAG_REMOTE_CONNECTED; + if (!(ep->flags & 
UCP_EP_FLAG_CLOSED)) { + /* set the REMOTE_CONNECTED flag only if the EP is not closed; otherwise + * just mark the UCT EPs as remote-connected, so that their WIREUP_EPs + * are removed and the flush(LOCAL) operation of the UCP EP close + * procedure can complete (the flag itself is not set, to avoid wrong + * behavior in ucp_ep_close_flushed_callback() if the peer has already + * disconnected and the EP would be marked remote-connected again) */ + ucp_ep_update_flags(ep, UCP_EP_FLAG_REMOTE_CONNECTED, 0); + } for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) { - if (ucp_ep_is_lane_p2p(ep, lane)) { - ucs_assert(ucp_wireup_ep_test(ep->uct_eps[lane])); - } if (ucp_wireup_ep_test(ep->uct_eps[lane])) { ucp_wireup_ep_remote_connected(ep->uct_eps[lane]); } @@ -351,14 +425,14 @@ void ucp_wireup_remote_connected(ucp_ep_h ep) ucs_assert(ep->flags & UCP_EP_FLAG_REMOTE_ID); } - static ucs_status_t ucp_wireup_init_lanes_by_request(ucp_worker_h worker, ucp_ep_h ep, unsigned ep_init_flags, const ucp_unpacked_address_t *remote_address, unsigned *addr_indices) { - ucs_status_t status = ucp_wireup_init_lanes(ep, ep_init_flags, UINT64_MAX, + ucs_status_t status = ucp_wireup_init_lanes(ep, ep_init_flags, + &ucp_tl_bitmap_max, remote_address, addr_indices); if (status == UCS_OK) { return UCS_OK; } @@ -368,28 +442,26 @@ ucp_wireup_init_lanes_by_request(ucp_worker_h worker, ucp_ep_h ep, return status; } - static UCS_F_NOINLINE void -ucp_wireup_process_pre_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, +ucp_wireup_process_pre_request(ucp_worker_h worker, ucp_ep_h ep, + const ucp_wireup_msg_t *msg, const ucp_unpacked_address_t *remote_address) { - unsigned ep_init_flags = UCP_EP_INIT_CREATE_AM_LANE; + unsigned ep_init_flags = UCP_EP_INIT_CREATE_AM_LANE | + UCP_EP_INIT_CM_WIREUP_CLIENT; unsigned addr_indices[UCP_MAX_LANES]; ucs_status_t status; - ucp_ep_h ep; - ucs_assert(msg->type == UCP_WIREUP_MSG_PRE_REQUEST); - ucs_assert(msg->dst_ep_id != UCP_EP_ID_INVALID); + UCP_WIREUP_MSG_CHECK(msg, ep, UCP_WIREUP_MSG_PRE_REQUEST); ucs_trace("got wireup pre_request from 0x%"PRIx64" src_ep_id 0x%"PRIx64 " dst_ep_id 0x%"PRIx64" conn_sn %u", remote_address->uuid, msg->src_ep_id, msg->dst_ep_id, msg->conn_sn); - /* wireup pre_request for a specific ep */ - ep = ucp_worker_get_ep_by_id(worker, msg->dst_ep_id); - ucs_assert((ep->flags & UCP_EP_FLAG_SOCKADDR_PARTIAL_ADDR) || - ucp_ep_has_cm_lane(ep)); + ucs_assert(ucp_ep_get_cm_wireup_ep(ep) != NULL); + /* restore the EP here to avoid access to incomplete configuration before + this point */ ucp_ep_update_remote_id(ep, msg->src_ep_id); ucp_ep_flush_state_reset(ep); @@ -397,10 +469,6 @@ ucp_wireup_process_pre_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, ep_init_flags |= UCP_EP_INIT_ERR_MODE_PEER_FAILURE; } - if (ucp_ep_has_cm_lane(ep)) { - ep_init_flags |= UCP_EP_INIT_CM_WIREUP_CLIENT; - } - /* initialize transport endpoints */ status = ucp_wireup_init_lanes_by_request(worker, ep, ep_init_flags, remote_address, addr_indices); @@ -415,37 +483,31 @@ static UCS_F_NOINLINE void -ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, +ucp_wireup_process_request(ucp_worker_h worker, ucp_ep_h ep, + const ucp_wireup_msg_t *msg, const ucp_unpacked_address_t *remote_address) { - uint64_t remote_uuid = remote_address->uuid; - uint64_t tl_bitmap = 0; - int send_reply = 0; - unsigned ep_init_flags = 0; + uint64_t remote_uuid = remote_address->uuid; + int send_reply = 0; + unsigned ep_init_flags = 0;
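For reference, the UCP_WIREUP_MSG_CHECK invariant used by these handlers can be read as a small truth table: a REQUEST either arrives as first contact (no destination ep id packed, so no local ep has been resolved yet) or is directed at a specific ep (valid id, ep already looked up) - exactly one of the two - while every other message type must carry a valid id and a resolved ep. Below is a compilable sketch under those assumptions; wireup_msg_t, the MSG_* constants and PTR_MAP_KEY_INVALID are simplified stand-ins for the UCP/UCS definitions, not the real ones:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define PTR_MAP_KEY_INVALID ((uint64_t)0) /* stand-in for UCS_PTR_MAP_KEY_INVALID */

enum { MSG_REQUEST, MSG_REPLY }; /* simplified message types */

typedef struct {
    uint8_t  type;
    uint64_t dst_ep_id;
} wireup_msg_t; /* simplified stand-in for ucp_wireup_msg_t */

/* mirrors the assertions of UCP_WIREUP_MSG_CHECK */
static void wireup_msg_check(const wireup_msg_t *msg, const void *ep,
                             uint8_t msg_type)
{
    assert(msg->type == msg_type);
    if (msg_type == MSG_REQUEST) {
        /* exactly one of "no destination id" / "ep resolved" may hold */
        assert((msg->dst_ep_id == PTR_MAP_KEY_INVALID) != (ep != NULL));
    } else {
        assert(msg->dst_ep_id != PTR_MAP_KEY_INVALID);
        assert(ep != NULL);
    }
}

int main(void)
{
    wireup_msg_t first_contact = {MSG_REQUEST, PTR_MAP_KEY_INVALID};
    wireup_msg_t directed      = {MSG_REQUEST, 0x1234};
    wireup_msg_t reply         = {MSG_REPLY, 0x1234};
    int ep; /* dummy object standing in for a resolved ucp_ep */

    wireup_msg_check(&first_contact, NULL, MSG_REQUEST); /* ok: no id, no ep */
    wireup_msg_check(&directed, &ep, MSG_REQUEST);       /* ok: id and ep */
    wireup_msg_check(&reply, &ep, MSG_REPLY);            /* ok: id and ep */
    return 0;
}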
+ ucp_tl_bitmap_t tl_bitmap = UCS_BITMAP_ZERO; ucp_rsc_index_t lanes2remote[UCP_MAX_LANES]; unsigned addr_indices[UCP_MAX_LANES]; ucs_status_t status; - ucp_ep_flags_t listener_flag; - ucp_ep_h ep; int has_cm_lane; - ucs_assert(msg->type == UCP_WIREUP_MSG_REQUEST); + UCP_WIREUP_MSG_CHECK(msg, ep, UCP_WIREUP_MSG_REQUEST); ucs_trace("got wireup request from 0x%"PRIx64" src_ep_id 0x%"PRIx64"" " dst_ep_id 0x%"PRIx64" conn_sn %d", remote_address->uuid, msg->src_ep_id, msg->dst_ep_id, msg->conn_sn); - if (msg->dst_ep_id != UCP_EP_ID_INVALID) { - /* wireup request for a specific ep */ - ep = ucp_worker_get_ep_by_id(worker, msg->dst_ep_id); + if (ep != NULL) { + ucs_assert(msg->dst_ep_id != UCS_PTR_MAP_KEY_INVALID); ucp_ep_update_remote_id(ep, msg->src_ep_id); - if (!(ep->flags & UCP_EP_FLAG_LISTENER)) { - /* Reset flush state only if it's not a client-server wireup on - * server side with long address exchange when listener (united with - * flush state) should be valid until user's callback invoking */ - ucp_ep_flush_state_reset(ep); - } + ucp_ep_flush_state_reset(ep); ep_init_flags |= UCP_EP_INIT_CREATE_AM_LANE; } else { + ucs_assert(msg->dst_ep_id == UCS_PTR_MAP_KEY_INVALID); ep = ucp_ep_match_retrieve(worker, remote_uuid, msg->conn_sn ^ (remote_uuid == worker->uuid), @@ -477,24 +539,16 @@ ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, * instead of each other. We use the uniqueness of worker uuid to decide * which connect request should be ignored. */ - if ((ep->flags & UCP_EP_FLAG_CONNECT_REQ_QUEUED) && (remote_uuid > worker->uuid)) { + if ((ep->flags & UCP_EP_FLAG_CONNECT_REQ_QUEUED) && + (remote_uuid > worker->uuid)) { ucs_trace("ep %p: ignoring simultaneous connect request", ep); - ep->flags |= UCP_EP_FLAG_CONNECT_REQ_IGNORED; + ucp_ep_update_flags(ep, UCP_EP_FLAG_CONNECT_REQ_IGNORED, 0); return; } } has_cm_lane = ucp_ep_has_cm_lane(ep); - if (ep->flags & UCP_EP_FLAG_LISTENER) { - /* If this is an ep on a listener (server) that received a partial - * worker address from the client, then the following lanes initialization - * will be done after an aux lane was already created on this ep. 
- * Therefore, remove the existing aux endpoint since will need to create - * new lanes now */ - ucp_ep_cleanup_lanes(ep); - } - if (msg->err_mode == UCP_ERR_HANDLING_MODE_PEER) { ep_init_flags |= UCP_EP_INIT_ERR_MODE_PEER_FAILURE; } @@ -517,9 +571,8 @@ ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, */ send_reply = /* Always send the reply in case of CM, the client's EP has to * be marked as REMOTE_CONNECTED */ - has_cm_lane || - (msg->dst_ep_id == UCP_EP_ID_INVALID) || - ucp_ep_config(ep)->p2p_lanes; + has_cm_lane || (msg->dst_ep_id == UCS_PTR_MAP_KEY_INVALID) || + ucp_ep_config(ep)->p2p_lanes; /* Connect p2p addresses to remote endpoint, if at least one is true: */ if (/* - EP has not been connected locally yet */ @@ -534,7 +587,7 @@ ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, tl_bitmap = ucp_wireup_get_ep_tl_bitmap(ep, ucp_ep_config(ep)->p2p_lanes); - ep->flags |= UCP_EP_FLAG_LOCAL_CONNECTED; + ucp_ep_update_flags(ep, UCP_EP_FLAG_LOCAL_CONNECTED, 0); ucs_assert(send_reply); } @@ -548,27 +601,8 @@ ucp_wireup_process_request(ucp_worker_h worker, const ucp_wireup_msg_t *msg, } if (send_reply) { - listener_flag = ep->flags & UCP_EP_FLAG_LISTENER; - /* Remove this flag at this point if it's set - * (so that address packing would be correct) */ - ep->flags &= ~UCP_EP_FLAG_LISTENER; - ucs_trace("ep %p: sending wireup reply", ep); - status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REPLY, tl_bitmap, - lanes2remote); - if (status != UCS_OK) { - return; - } - - /* Restore saved flag value */ - ep->flags |= listener_flag; - } else { - /* if in client-server flow, schedule invoking the user's callback - * (if server is connected) from the main thread */ - if (ucs_test_all_flags(ep->flags, - (UCP_EP_FLAG_LISTENER | UCP_EP_FLAG_LOCAL_CONNECTED))) { - ucp_listener_schedule_accept_cb(ep); - } + ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REPLY, &tl_bitmap, lanes2remote); } } @@ -582,7 +616,8 @@ static unsigned ucp_wireup_send_msg_ack(void *arg) ucs_trace("ep %p: sending wireup ack", ep); memset(rsc_tli, UCP_NULL_RESOURCE, sizeof(rsc_tli)); - status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_ACK, 0, rsc_tli); + status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_ACK, &ucp_tl_bitmap_min, + rsc_tli); return (status == UCS_OK); } @@ -592,18 +627,15 @@ int ucp_wireup_msg_ack_cb_pred(const ucs_callbackq_elem_t *elem, void *arg) } static UCS_F_NOINLINE void -ucp_wireup_process_reply(ucp_worker_h worker, const ucp_wireup_msg_t *msg, +ucp_wireup_process_reply(ucp_worker_h worker, ucp_ep_h ep, + const ucp_wireup_msg_t *msg, const ucp_unpacked_address_t *remote_address) { uct_worker_cb_id_t cb_id = UCS_CALLBACKQ_ID_NULL; ucs_status_t status; - ucp_ep_h ep; int ack; - ep = ucp_worker_get_ep_by_id(worker, msg->dst_ep_id); - - ucs_assert(msg->type == UCP_WIREUP_MSG_REPLY); - ucs_assert((!(ep->flags & UCP_EP_FLAG_LISTENER))); + UCP_WIREUP_MSG_CHECK(msg, ep, UCP_WIREUP_MSG_REPLY); ucs_trace("ep %p: got wireup reply src_ep_id 0x%"PRIx64 " dst_ep_id 0x%"PRIx64" sn %d", ep, msg->src_ep_id, msg->dst_ep_id, msg->conn_sn); @@ -626,7 +658,7 @@ ucp_wireup_process_reply(ucp_worker_h worker, const ucp_wireup_msg_t *msg, return; } - ep->flags |= UCP_EP_FLAG_LOCAL_CONNECTED; + ucp_ep_update_flags(ep, UCP_EP_FLAG_LOCAL_CONNECTED, 0); ack = 1; } else { ack = 0; @@ -643,21 +675,71 @@ ucp_wireup_process_reply(ucp_worker_h worker, const ucp_wireup_msg_t *msg, } } -static UCS_F_NOINLINE -void ucp_wireup_process_ack(ucp_worker_h worker, const ucp_wireup_msg_t *msg) +static UCS_F_NOINLINE 
void +ucp_wireup_send_ep_removed(ucp_worker_h worker, const ucp_wireup_msg_t *msg, + const ucp_unpacked_address_t *remote_address) { - ucp_ep_h ep; + /* Request peer failure detection support from the reply EP, so that its + * lanes can be discarded when all UCP EPs are destroyed in UCP worker + * destroy. Also, create the UCP EP with CONNECT_TO_IFACE connection mode + * to avoid the WIREUP_MSG phase between peers, which would require a + * direct EP ID */ + unsigned ep_init_flags = UCP_EP_INIT_ERR_MODE_PEER_FAILURE | + UCP_EP_INIT_FLAG_INTERNAL | + UCP_EP_INIT_CONNECT_TO_IFACE_ONLY; + ucs_status_t status; + ucp_ep_h reply_ep; + unsigned addr_indices[UCP_MAX_LANES]; + ucs_status_ptr_t req; + + /* If endpoint does not exist - create a temporary endpoint to send a + * UCP_WIREUP_MSG_EP_REMOVED reply */ + status = ucp_worker_create_ep(worker, ep_init_flags, remote_address->name, + "wireup ep_check reply", &reply_ep); + if (status != UCS_OK) { + ucs_error("failed to create EP: %s", ucs_status_string(status)); + return; + } + + /* Initialize lanes of the reply EP */ + status = ucp_wireup_init_lanes(reply_ep, ep_init_flags, &ucp_tl_bitmap_max, + remote_address, addr_indices); + if (status != UCS_OK) { + goto out_delete_ep; + } + + ucp_ep_update_remote_id(reply_ep, msg->src_ep_id); + ucp_ep_flush_state_reset(reply_ep); + status = ucp_wireup_msg_send(reply_ep, UCP_WIREUP_MSG_EP_REMOVED, + &ucp_tl_bitmap_min, NULL); + if (status != UCS_OK) { + goto out_cleanup_lanes; + } + + req = ucp_ep_flush_internal(reply_ep, UCP_REQUEST_FLAG_RELEASED, + &ucp_request_null_param, NULL, + ucp_ep_register_disconnect_progress, "close"); + if (UCS_PTR_IS_PTR(req)) { + return; + } - ep = ucp_worker_get_ep_by_id(worker, msg->dst_ep_id); +out_cleanup_lanes: + ucp_ep_cleanup_lanes(reply_ep); +out_delete_ep: + ucp_ep_delete(reply_ep); +} - ucs_assert(msg->type == UCP_WIREUP_MSG_ACK); +static UCS_F_NOINLINE +void ucp_wireup_process_ack(ucp_worker_h worker, ucp_ep_h ep, + const ucp_wireup_msg_t *msg) +{ + UCP_WIREUP_MSG_CHECK(msg, ep, UCP_WIREUP_MSG_ACK); ucs_trace("ep %p: got wireup ack", ep); ucs_assert(ep->flags & UCP_EP_FLAG_REMOTE_ID); ucs_assert(ep->flags & UCP_EP_FLAG_CONNECT_REP_SENT); if (!(ep->flags & UCP_EP_FLAG_LOCAL_CONNECTED)) { - /* drop the procesing of ACK since close protocol or error + /* drop the processing of ACK since close protocol or error * handling is started */ ucs_assert(ucp_ep_has_cm_lane(ep) && (ep->flags & UCP_EP_FLAG_DISCONNECTED_CM_LANE)); @@ -665,13 +747,6 @@ void ucp_wireup_process_ack(ucp_worker_h worker, const ucp_wireup_msg_t *msg) } ucp_wireup_remote_connected(ep); - - /* if this ack is received as part of the client-server flow, when handling - * a large worker address from the client, invoke the cached user callback - * from the main thread */ - if (ep->flags & UCP_EP_FLAG_LISTENER) { - ucp_listener_schedule_accept_cb(ep); - } } static ucs_status_t ucp_wireup_msg_handler(void *arg, void *data, @@ -679,19 +754,22 @@ static ucs_status_t ucp_wireup_msg_handler(void *arg, void *data, { ucp_worker_h worker = arg; ucp_wireup_msg_t *msg = data; + ucp_ep_h ep = NULL; ucp_unpacked_address_t remote_address; - ucp_ep_h ep UCS_V_UNUSED; ucs_status_t status; UCS_ASYNC_BLOCK(&worker->async); - if (msg->dst_ep_id != UCP_EP_ID_INVALID) { - ep = ucp_worker_get_ep_by_id(worker, msg->dst_ep_id); - if (ep == NULL) { - ucs_diag("got wireup msg %d src_ep_id 0x%"PRIx64" for" - " non-existing dst_ep_id 0x%"PRIx64" sn %d," - " ignoring it", - msg->type, msg->src_ep_id, msg->dst_ep_id, msg->conn_sn); + if (msg->dst_ep_id != 
UCS_PTR_MAP_KEY_INVALID) { + UCP_WORKER_GET_EP_BY_ID( + &ep, worker, msg->dst_ep_id, + if (msg->type != UCP_WIREUP_MSG_EP_CHECK) { goto out; }, + "WIREUP message (%d src_ep_id 0x%" PRIx64 " sn %d)", msg->type, + msg->src_ep_id, msg->conn_sn); + + if ((msg->type == UCP_WIREUP_MSG_EP_CHECK) && (ep != NULL)) { + /* UCP EP is valid, no need for any other actions when handling + * EP_CHECK message (e.g. can avoid remote address unpacking) */ goto out; } } @@ -706,13 +784,20 @@ static ucs_status_t ucp_wireup_msg_handler(void *arg, void *data, if (msg->type == UCP_WIREUP_MSG_ACK) { ucs_assert(remote_address.address_count == 0); - ucp_wireup_process_ack(worker, msg); + ucp_wireup_process_ack(worker, ep, msg); } else if (msg->type == UCP_WIREUP_MSG_PRE_REQUEST) { - ucp_wireup_process_pre_request(worker, msg, &remote_address); + ucp_wireup_process_pre_request(worker, ep, msg, &remote_address); } else if (msg->type == UCP_WIREUP_MSG_REQUEST) { - ucp_wireup_process_request(worker, msg, &remote_address); + ucp_wireup_process_request(worker, ep, msg, &remote_address); } else if (msg->type == UCP_WIREUP_MSG_REPLY) { - ucp_wireup_process_reply(worker, msg, &remote_address); + ucp_wireup_process_reply(worker, ep, msg, &remote_address); + } else if (msg->type == UCP_WIREUP_MSG_EP_CHECK) { + ucs_assert((msg->dst_ep_id != UCS_PTR_MAP_KEY_INVALID) && (ep == NULL)); + ucp_wireup_send_ep_removed(worker, msg, &remote_address); + } else if (msg->type == UCP_WIREUP_MSG_EP_REMOVED) { + ucs_assert(msg->dst_ep_id != UCS_PTR_MAP_KEY_INVALID); + ucp_worker_set_ep_failed(worker, ep, NULL, UCP_NULL_LANE, + UCS_ERR_CONNECTION_RESET); } else { ucs_bug("invalid wireup message"); } @@ -724,24 +809,6 @@ static ucs_status_t ucp_wireup_msg_handler(void *arg, void *data, return UCS_OK; } -void ucp_wireup_assign_lane(ucp_ep_h ep, ucp_lane_index_t lane, uct_ep_h uct_ep, - const char *info) -{ - /* If ep already exists, it's a wireup proxy, and we need to update its - * next_ep instead of replacing it. - */ - if (ep->uct_eps[lane] == NULL) { - ucs_trace("ep %p: assign uct_ep[%d]=%p%s", ep, lane, uct_ep, info); - ep->uct_eps[lane] = uct_ep; - } else { - ucs_assert(ucp_wireup_ep_test(ep->uct_eps[lane])); - ucs_trace("ep %p: wireup uct_ep[%d]=%p next set to %p%s", ep, lane, - ep->uct_eps[lane], uct_ep, info); - ucp_wireup_ep_set_next_ep(ep->uct_eps[lane], uct_ep); - ucp_wireup_ep_remote_connected(ep->uct_eps[lane]); - } -} - uct_ep_h ucp_wireup_extract_lane(ucp_ep_h ep, ucp_lane_index_t lane) { uct_ep_h uct_ep = ep->uct_eps[lane]; @@ -774,6 +841,15 @@ void ucp_wireup_replay_pending_requests(ucp_ep_h ucp_ep, } } +static void +ucp_wireup_ep_lane_set_next_ep(ucp_ep_h ep, ucp_lane_index_t lane, + uct_ep_h uct_ep) +{ + ucs_trace("ep %p: wireup uct_ep[%d]=%p next set to %p", ep, lane, + ep->uct_eps[lane], uct_ep); + ucp_wireup_ep_set_next_ep(ep->uct_eps[lane], uct_ep); +} + static ucs_status_t ucp_wireup_connect_lane_to_iface(ucp_ep_h ep, ucp_lane_index_t lane, unsigned path_index, @@ -808,7 +884,35 @@ ucp_wireup_connect_lane_to_iface(ucp_ep_h ep, ucp_lane_index_t lane, return status; } - ucp_wireup_assign_lane(ep, lane, uct_ep, ""); + if (ep->uct_eps[lane] == NULL) { + if (ucp_ep_has_cm_lane(ep)) { + /* Create wireup EP in case of CM lane is used, since a WIREUP EP is + * used to keep user's pending requests and send WIREUP MSGs (if it + * is WIREUP MSG lane) until CM and WIREUP_MSG phases are done. 
The + * lane is added during WIREUP_MSG exchange or created as an initial + * configuration after a connection request on a server side */ + status = ucp_wireup_ep_create(ep, &ep->uct_eps[lane]); + if (status != UCS_OK) { + /* coverity[leaked_storage] */ + return status; + } + ucp_wireup_ep_lane_set_next_ep(ep, lane, uct_ep); + } else { + /* Assign the lane without wireup EP when out-of-band address + * exchange is used */ + ucs_trace("ep %p: assign uct_ep[%d]=%p", ep, lane, uct_ep); + ep->uct_eps[lane] = uct_ep; + } + } else { + /* If EP already exists, it's a wireup proxy, and we need to update + * its next_ep instead of replacing it. The wireup EP was created + * during CM pack_cb() on a client side */ + ucs_assert(ucp_wireup_ep_test(ep->uct_eps[lane])); + ucs_assert(ucp_proxy_ep_extract(ep->uct_eps[lane]) == NULL); + ucs_assert(ucp_ep_has_cm_lane(ep)); + ucp_wireup_ep_lane_set_next_ep(ep, lane, uct_ep); + } + ucp_worker_iface_progress_ep(wiface); return UCS_OK; } @@ -911,7 +1015,6 @@ static void ucp_wireup_print_config(ucp_worker_h worker, ucp_rsc_index_t cm_index, ucs_log_level_t log_level) { - char lane_info[128] = {0}; char am_lane_str[8]; char wireup_msg_lane_str[8]; char cm_lane_str[8]; @@ -922,25 +1025,27 @@ static void ucp_wireup_print_config(ucp_worker_h worker, } ucs_log(log_level, - "%s: am_lane %s wireup_msg_lane %s cm_lane %s reachable_mds 0x%"PRIx64, - title, ucp_wireup_get_lane_index_str(key->am_lane, am_lane_str, - sizeof(am_lane_str)), - ucp_wireup_get_lane_index_str(key->wireup_msg_lane, wireup_msg_lane_str, + "%s: am_lane %s wireup_msg_lane %s cm_lane %s reachable_mds " + "0x%" PRIx64 " ep_check_map 0x%x", + title, + ucp_wireup_get_lane_index_str(key->am_lane, am_lane_str, + sizeof(am_lane_str)), + ucp_wireup_get_lane_index_str(key->wireup_msg_lane, + wireup_msg_lane_str, sizeof(wireup_msg_lane_str)), ucp_wireup_get_lane_index_str(key->cm_lane, cm_lane_str, sizeof(cm_lane_str)), - key->reachable_md_map); + key->reachable_md_map, key->ep_check_map); for (lane = 0; lane < key->num_lanes; ++lane) { + UCS_STRING_BUFFER_ONSTACK(strb, 128); if (lane == key->cm_lane) { - ucp_ep_config_cm_lane_info_str(worker, key, lane, cm_index, - lane_info, sizeof(lane_info)); + ucp_ep_config_cm_lane_info_str(worker, key, lane, cm_index, &strb); } else { ucp_ep_config_lane_info_str(worker, key, addr_indices, lane, - UCP_NULL_RESOURCE, lane_info, - sizeof(lane_info)); + UCP_NULL_RESOURCE, &strb); } - ucs_log(log_level, "%s: %s", title, lane_info); + ucs_log(log_level, "%s: %s", title, ucs_string_buffer_cstr(&strb)); } } @@ -975,7 +1080,7 @@ ucp_wireup_get_reachable_mds(ucp_ep_h ep, unsigned ep_init_flags, unsigned num_dst_mds; ae_dst_md_map = 0; - ucs_for_each_bit(rsc_index, context->tl_bitmap) { + UCS_BITMAP_FOR_EACH_BIT(context->tl_bitmap, rsc_index) { ucp_unpacked_address_for_each(ae, remote_address) { if (ucp_wireup_is_reachable(ep, ep_init_flags, rsc_index, ae)) { ae_dst_md_map |= UCS_BIT(ae->md_index); @@ -1022,34 +1127,66 @@ ucp_wireup_get_reachable_mds(ucp_ep_h ep, unsigned ep_init_flags, } static void -ucp_wireup_check_config_intersect(ucp_ep_h ep, - ucp_ep_config_key_t *new_key, +ucp_wireup_check_config_intersect(ucp_ep_h ep, ucp_ep_config_key_t *new_key, + const ucp_unpacked_address_t *remote_address, + const unsigned *addr_indices, ucp_lane_index_t *connect_lane_bitmap, ucs_queue_head_t *replay_pending_queue) { - ucp_worker_h worker = ep->worker; - uct_ep_h new_uct_eps[UCP_MAX_LANES] = { NULL }; - ucp_lane_index_t reuse_lane_map[UCP_MAX_LANES] = { UCP_NULL_LANE }; + uct_ep_h 
new_uct_eps[UCP_MAX_LANES] = { NULL }; + ucp_lane_index_t reuse_lane_map[UCP_MAX_LANES] = { UCP_NULL_LANE }; + ucp_rsc_index_t old_dst_rsc_indices[UCP_MAX_LANES] = { UCP_NULL_RESOURCE }; + ucp_rsc_index_t new_dst_rsc_indices[UCP_MAX_LANES] = { UCP_NULL_RESOURCE }; + ucp_wireup_ep_t *cm_wireup_ep = NULL; ucp_ep_config_key_t *old_key; ucp_lane_index_t lane, reuse_lane; - ucp_wireup_ep_t *cm_wireup_ep; + ucp_address_entry_t *ae; + unsigned addr_index; + ucp_rsc_index_t dst_rsc_index; *connect_lane_bitmap = UCS_MASK(new_key->num_lanes); ucs_queue_head_init(replay_pending_queue); - if ((ep->cfg_index == UCP_WORKER_CFG_INDEX_NULL) || - !ucp_ep_has_cm_lane(ep)) { + if (!ucp_ep_has_cm_lane(ep) || + (ep->cfg_index == UCP_WORKER_CFG_INDEX_NULL)) { + /* nothing to intersect with */ return; } - old_key = &ucp_ep_config(ep)->key; - - ucp_ep_config_lanes_intersect(old_key, new_key, reuse_lane_map); + ucs_assert(!(ep->flags & UCP_EP_FLAG_INTERNAL)); for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) { ucs_assert(ep->uct_eps[lane] != NULL); } + cm_wireup_ep = ucp_ep_get_cm_wireup_ep(ep); + ucs_assert(cm_wireup_ep != NULL); + + memcpy(old_dst_rsc_indices, cm_wireup_ep->dst_rsc_indices, + sizeof(old_dst_rsc_indices)); + for (lane = 0; lane < new_key->num_lanes; ++lane) { + addr_index = addr_indices[lane]; + + if (lane == ucp_ep_get_cm_lane(ep)) { + ucs_assert(addr_index == UINT_MAX); + dst_rsc_index = UCP_NULL_RESOURCE; + } else { + ucs_assert(addr_index != UINT_MAX); + ae = &remote_address->address_list[addr_index]; + dst_rsc_index = ae->iface_attr.dst_rsc_index; + } + + /* save destination resource index in the CM wireup EP for doing + * further intersections, if needed */ + cm_wireup_ep->dst_rsc_indices[lane] = dst_rsc_index; + new_dst_rsc_indices[lane] = dst_rsc_index; + } + + old_key = &ucp_ep_config(ep)->key; + + ucp_ep_config_lanes_intersect(old_key, old_dst_rsc_indices, new_key, + new_dst_rsc_indices, reuse_lane_map); + /* CM lane has to be re-used by the new EP configuration */ ucs_assert(reuse_lane_map[ucp_ep_get_cm_lane(ep)] != UCP_NULL_LANE); /* wireup lane has to be selected for the old configuration */ @@ -1067,28 +1204,36 @@ ucp_wireup_check_config_intersect(ucp_ep_h ep, /* previous wireup lane is not part of new configuration, so add it as * auxiliary endpoint inside cm lane, to be able to continue wireup * messages exchange */ - cm_wireup_ep = ucp_ep_get_cm_wireup_ep(ep); + ucs_assert(cm_wireup_ep != NULL); + new_key->wireup_msg_lane = new_key->cm_lane; reuse_lane = old_key->wireup_msg_lane; - ucp_wireup_ep_set_aux(cm_wireup_ep, ep->uct_eps[reuse_lane], + ucp_wireup_ep_set_aux(cm_wireup_ep, + ucp_wireup_ep_extract_next_ep(ep->uct_eps[reuse_lane]), old_key->lanes[reuse_lane].rsc_index); - /* reset the UCT EP from the previous WIREUP lane to not - * destroy it, since it's not needed anymore in the new - * configuration, but will be used for WIREUP MSG */ + ucp_wireup_ep_pending_queue_purge(ep->uct_eps[reuse_lane], + ucp_wireup_pending_purge_cb, + replay_pending_queue); + + /* reset the UCT EP from the previous WIREUP lane and destroy its WIREUP EP, + * since it's not needed anymore in the new configuration, UCT EP will be + * used for sending WIREUP MSGs in the new configuration */ + uct_ep_destroy(ep->uct_eps[reuse_lane]); ep->uct_eps[reuse_lane] = NULL; } /* Need to discard only old lanes that won't be used anymore in the new - * configuraton. Also, UCT EPs with the lane index >= old_key->num_lanes + * configuration. 
Also, UCT EPs with the lane index >= old_key->num_lanes * could be set in case of CM, we have to not reset them */ for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) { reuse_lane = reuse_lane_map[lane]; if (reuse_lane == UCP_NULL_RESOURCE) { if (ep->uct_eps[lane] != NULL) { - ucp_worker_discard_uct_ep(worker, ep->uct_eps[lane], - UCT_FLUSH_FLAG_LOCAL, - ucp_wireup_pending_purge_cb, - replay_pending_queue); + ucs_assert(lane != ucp_ep_get_cm_lane(ep)); + ucp_worker_discard_uct_ep( + ep, ep->uct_eps[lane], UCT_FLUSH_FLAG_LOCAL, + ucp_wireup_pending_purge_cb, replay_pending_queue, + (ucp_send_nbx_callback_t)ucs_empty_function, NULL); ep->uct_eps[lane] = NULL; } } else if (ep->uct_eps[lane] != NULL) { @@ -1109,13 +1254,14 @@ ucp_wireup_check_config_intersect(ucp_ep_h ep, } ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, unsigned ep_init_flags, - uint64_t local_tl_bitmap, + const ucp_tl_bitmap_t *local_tl_bitmap, const ucp_unpacked_address_t *remote_address, unsigned *addr_indices) { ucp_worker_h worker = ep->worker; - uint64_t tl_bitmap = local_tl_bitmap & - worker->context->tl_bitmap; + ucp_tl_bitmap_t tl_bitmap = UCS_BITMAP_AND(*local_tl_bitmap, + worker->context->tl_bitmap, + UCP_MAX_RESOURCES); ucp_rsc_index_t cm_idx = UCP_NULL_RESOURCE; ucp_lane_index_t connect_lane_bitmap; ucp_ep_config_key_t key; @@ -1123,12 +1269,13 @@ ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, unsigned ep_init_flags, ucp_lane_index_t lane; ucs_status_t status; char str[32]; - ucp_wireup_ep_t *cm_wireup_ep; ucs_queue_head_t replay_pending_queue; - ucs_assert(tl_bitmap != 0); + UCS_BITMAP_AND_INPLACE(&tl_bitmap, worker->context->tl_bitmap); + ucs_assert(!UCS_BITMAP_IS_ZERO_INPLACE(&tl_bitmap)); ucs_trace("ep %p: initialize lanes", ep); + ucs_log_indent(1); ucp_ep_config_key_reset(&key); ucp_ep_config_key_set_err_mode(&key, ep_init_flags); @@ -1136,10 +1283,11 @@ ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, unsigned ep_init_flags, status = ucp_wireup_select_lanes(ep, ep_init_flags, tl_bitmap, remote_address, addr_indices, &key); if (status != UCS_OK) { - return status; + goto out; } - ucp_wireup_check_config_intersect(ep, &key, &connect_lane_bitmap, + ucp_wireup_check_config_intersect(ep, &key, remote_address, addr_indices, + &connect_lane_bitmap, &replay_pending_queue); /* Get all reachable MDs from full remote address list and join with @@ -1151,7 +1299,7 @@ ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, unsigned ep_init_flags, /* Load new configuration */ status = ucp_worker_get_ep_config(worker, &key, 1, &new_cfg_index); if (status != UCS_OK) { - return status; + goto out; } if (ep->cfg_index == new_cfg_index) { @@ -1160,17 +1308,15 @@ ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, unsigned ep_init_flags, ucs_assert(ep->uct_eps[lane] != NULL); } #endif - return UCS_OK; /* No change */ + status = UCS_OK; /* No change */ + goto out; } - cm_wireup_ep = ucp_ep_get_cm_wireup_ep(ep); - if (cm_wireup_ep != NULL) { - cm_idx = cm_wireup_ep->cm_idx; - } + cm_idx = ucp_ep_ext_control(ep)->cm_idx; if ((ep->cfg_index != UCP_WORKER_CFG_INDEX_NULL) && - /* reconfiguration is allowed for CM and sockaddr flows */ - !ucp_ep_is_sockaddr_stub(ep) && !ucp_ep_has_cm_lane(ep)) { + /* reconfiguration is allowed for CM flow */ + !ucp_ep_has_cm_lane(ep)) { /* * TODO handle a case where we have to change lanes and reconfigure the ep: * @@ -1208,7 +1354,7 @@ ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, unsigned ep_init_flags, key.lanes[lane].path_index, remote_address, addr_indices[lane]); if (status != UCS_OK) { - return status; 
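Right here the diff turns ucp_wireup_init_lanes' early `return status` exits into `goto out` jumps, so the ucs_log_indent(1) added at function entry is always balanced by ucs_log_indent(-1) on the single exit path. A minimal sketch of that single-exit pattern follows; the helpers (log_indent, step) are hypothetical stand-ins for the real UCS calls, not UCS APIs.

/* Illustrative sketch of the single-exit pattern introduced in
 * ucp_wireup_init_lanes; all names here are stand-ins, not UCS APIs. */
#include <stdio.h>

typedef enum { EX_OK = 0, EX_ERR = -1 } ex_status_t;

static int indent_level = 0;

static void log_indent(int delta)
{
    indent_level += delta; /* stand-in for ucs_log_indent() */
}

static ex_status_t step(int fail)
{
    return fail ? EX_ERR : EX_OK;
}

static ex_status_t init_lanes_example(int fail_select, int fail_config)
{
    ex_status_t status;

    log_indent(1);              /* every exit must undo this */

    status = step(fail_select); /* e.g. selecting lanes */
    if (status != EX_OK) {
        goto out;               /* was: return status; */
    }

    status = step(fail_config); /* e.g. loading the new EP config */
    if (status != EX_OK) {
        goto out;
    }

    status = EX_OK;
out:
    log_indent(-1);             /* balanced on success and failure */
    return status;
}

int main(void)
{
    ex_status_t ok  = init_lanes_example(0, 0);
    ex_status_t err = init_lanes_example(1, 0);

    /* indent_level is back to 0 on both paths */
    printf("ok=%d err=%d indent=%d\n", ok, err, indent_level);
    return 0;
}

ucp_wireup_send_ep_removed above uses the same idea with staged labels (out_cleanup_lanes, out_delete_ep) that unwind in reverse order of setup.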
+ goto out; } } @@ -1217,20 +1363,24 @@ ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, unsigned ep_init_flags, /* If we don't have a p2p transport, we're connected */ if (!ucp_ep_config(ep)->p2p_lanes) { - ep->flags |= UCP_EP_FLAG_LOCAL_CONNECTED; + ucp_ep_update_flags(ep, UCP_EP_FLAG_LOCAL_CONNECTED, 0); } ucp_wireup_replay_pending_requests(ep, &replay_pending_queue); ucp_worker_keepalive_add_ep(ep); - return UCS_OK; + status = UCS_OK; + +out: + ucs_log_indent(-1); + return status; } ucs_status_t ucp_wireup_send_request(ucp_ep_h ep) { ucp_rsc_index_t rsc_index; ucs_status_t status; - uint64_t tl_bitmap; + ucp_tl_bitmap_t tl_bitmap; tl_bitmap = ucp_wireup_get_ep_tl_bitmap(ep, UCS_MASK(ucp_ep_num_lanes(ep))); @@ -1238,13 +1388,13 @@ ucs_status_t ucp_wireup_send_request(ucp_ep_h ep) rsc_index = ucp_wireup_ep_get_aux_rsc_index( ep->uct_eps[ucp_ep_get_wireup_msg_lane(ep)]); if (rsc_index != UCP_NULL_RESOURCE) { - tl_bitmap |= UCS_BIT(rsc_index); + UCS_BITMAP_SET(tl_bitmap, rsc_index); } ucs_debug("ep %p: send wireup request (flags=0x%x)", ep, ep->flags); - status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REQUEST, tl_bitmap, NULL); + status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_REQUEST, &tl_bitmap, NULL); - ep->flags |= UCP_EP_FLAG_CONNECT_REQ_QUEUED; + ucp_ep_update_flags(ep, UCP_EP_FLAG_CONNECT_REQ_QUEUED, 0); return status; } @@ -1261,18 +1411,17 @@ void ucp_wireup_pending_purge_cb(uct_pending_req_t *self, void *arg) ucs_status_t ucp_wireup_send_pre_request(ucp_ep_h ep) { - uint64_t tl_bitmap = UINT64_MAX; /* pack full worker address */ ucs_status_t status; - ucs_assert((ep->flags & UCP_EP_FLAG_LISTENER) || - ucp_ep_has_cm_lane(ep)); + ucs_assert(ucp_ep_has_cm_lane(ep)); ucs_assert(!(ep->flags & UCP_EP_FLAG_CONNECT_PRE_REQ_QUEUED)); ucs_debug("ep %p: send wireup pre-request (flags=0x%x)", ep, ep->flags); status = ucp_wireup_msg_send(ep, UCP_WIREUP_MSG_PRE_REQUEST, - tl_bitmap, NULL); + &ucp_tl_bitmap_max, NULL); + + ucp_ep_update_flags(ep, UCP_EP_FLAG_CONNECT_PRE_REQ_QUEUED, 0); - ep->flags |= UCP_EP_FLAG_CONNECT_PRE_REQ_QUEUED; return status; } @@ -1289,9 +1438,10 @@ ucs_status_t ucp_wireup_connect_remote(ucp_ep_h ep, ucp_lane_index_t lane) UCS_ASYNC_BLOCK(&ep->worker->async); - /* checking again, with lock held, if already connected or connection is - * in progress */ - if ((ep->flags & UCP_EP_FLAG_REMOTE_ID) || + /* Checking again, with lock held, if already connected, connection is in + * progress, or the endpoint is in failed state. 
+ */ + if ((ep->flags & (UCP_EP_FLAG_REMOTE_ID | UCP_EP_FLAG_FAILED)) || ucp_wireup_ep_test(ep->uct_eps[lane])) { status = UCS_OK; goto out_unlock; @@ -1376,7 +1526,8 @@ static void ucp_wireup_msg_dump(ucp_worker_h worker, uct_am_trace_type_t type, UCP_ADDRESS_PACK_FLAG_NO_TRACE, &unpacked_address); if (status != UCS_OK) { - strncpy(unpacked_address.name, "", UCP_WORKER_NAME_MAX); + strncpy(unpacked_address.name, "", + UCP_WORKER_ADDRESS_NAME_MAX); unpacked_address.uuid = 0; unpacked_address.address_count = 0; unpacked_address.address_list = NULL; @@ -1398,7 +1549,7 @@ static void ucp_wireup_msg_dump(ucp_worker_h worker, uct_am_trace_type_t type, } ucp_unpacked_address_for_each(ae, &unpacked_address) { - ucs_for_each_bit(tl, context->tl_bitmap) { + UCS_BITMAP_FOR_EACH_BIT(context->tl_bitmap, tl) { rsc = &context->tl_rscs[tl]; if (ae->tl_name_csum == rsc->tl_name_csum) { snprintf(p, end - p, " "UCT_TL_RESOURCE_DESC_FMT, @@ -1430,11 +1581,11 @@ ucp_ep_params_err_handling_mode(const ucp_ep_params_t *params) unsigned ucp_ep_init_flags(const ucp_worker_h worker, const ucp_ep_params_t *params) { - unsigned flags = ucp_cm_ep_init_flags(worker, params); + unsigned flags = ucp_cm_ep_init_flags(params); - if ((ucp_worker_sockaddr_is_cm_proto(worker) && - worker->context->config.ext.cm_use_all_devices) || - (params->field_mask & UCP_EP_PARAM_FIELD_SOCK_ADDR)) { + if (ucp_ep_init_flags_has_cm(flags) && + worker->context->config.ext.cm_use_all_devices) { + /* request AM lane for wireup MSG protocol which enables all devices */ flags |= UCP_EP_INIT_CREATE_AM_LANE; } diff --git a/src/ucp/wireup/wireup.h b/src/ucp/wireup/wireup.h index 4ada9143198..70186619f64 100644 --- a/src/ucp/wireup/wireup.h +++ b/src/ucp/wireup/wireup.h @@ -27,6 +27,8 @@ enum { UCP_WIREUP_MSG_REQUEST, UCP_WIREUP_MSG_REPLY, UCP_WIREUP_MSG_ACK, + UCP_WIREUP_MSG_EP_CHECK, + UCP_WIREUP_MSG_EP_REMOVED, UCP_WIREUP_MSG_LAST }; @@ -68,13 +70,13 @@ typedef struct { * Packet structure for wireup requests. 
*/ typedef struct ucp_wireup_msg { - uint8_t type; /* Message type */ - uint8_t err_mode; /* Peer error handling mode defined in - @ucp_err_handling_mode_t */ - ucp_ep_match_conn_sn_t conn_sn; /* Connection sequence number */ - uint64_t src_ep_id; /* Endpoint ID of source */ - uint64_t dst_ep_id; /* Endpoint ID of destination, can be - UCP_EP_ID_INVALID */ + uint8_t type; /* Message type */ + uint8_t err_mode; /* Peer error handling mode defined in + @ucp_err_handling_mode_t */ + ucp_ep_match_conn_sn_t conn_sn; /* Connection sequence number */ + uint64_t src_ep_id; /* Endpoint ID of source */ + uint64_t dst_ep_id; /* Endpoint ID of destination, can be + UCS_PTR_MAP_KEY_INVALID */ /* packed addresses follow */ } UCS_S_PACKED ucp_wireup_msg_t; @@ -96,22 +98,26 @@ ucs_status_t ucp_wireup_connect_remote(ucp_ep_h ep, ucp_lane_index_t lane); ucs_status_t ucp_wireup_select_aux_transport(ucp_ep_h ep, unsigned ep_init_flags, - uint64_t tl_bitmap, + ucp_tl_bitmap_t tl_bitmap, const ucp_unpacked_address_t *remote_address, ucp_wireup_select_info_t *select_info); -ucs_status_t -ucp_wireup_select_sockaddr_transport(const ucp_context_h context, - const ucs_sock_addr_t *sockaddr, - ucp_rsc_index_t *rsc_index_p); - double ucp_wireup_amo_score_func(ucp_context_h context, const uct_md_attr_t *md_attr, const uct_iface_attr_t *iface_attr, const ucp_address_iface_attr_t *remote_iface_attr); +size_t ucp_wireup_msg_pack(void *dest, void *arg); + ucs_status_t ucp_wireup_msg_progress(uct_pending_req_t *self); +ucs_status_t +ucp_wireup_msg_prepare(ucp_ep_h ep, uint8_t type, + const ucp_tl_bitmap_t *tl_bitmap, + const ucp_lane_index_t *lanes2remote, + ucp_wireup_msg_t *msg_hdr, void **address_p, + size_t *address_length_p); + int ucp_wireup_msg_ack_cb_pred(const ucs_callbackq_elem_t *elem, void *arg); int ucp_wireup_is_reachable(ucp_ep_h ep, unsigned ep_init_flags, @@ -119,18 +125,16 @@ int ucp_wireup_is_reachable(ucp_ep_h ep, unsigned ep_init_flags, const ucp_address_entry_t *ae); ucs_status_t ucp_wireup_init_lanes(ucp_ep_h ep, unsigned ep_init_flags, - uint64_t local_tl_bitmap, + const ucp_tl_bitmap_t *local_tl_bitmap, const ucp_unpacked_address_t *remote_address, unsigned *addr_indices); ucs_status_t -ucp_wireup_select_lanes(ucp_ep_h ep, unsigned ep_init_flags, uint64_t tl_bitmap, +ucp_wireup_select_lanes(ucp_ep_h ep, unsigned ep_init_flags, + ucp_tl_bitmap_t tl_bitmap, const ucp_unpacked_address_t *remote_address, unsigned *addr_indices, ucp_ep_config_key_t *key); -void ucp_wireup_assign_lane(ucp_ep_h ep, ucp_lane_index_t lane, uct_ep_h uct_ep, - const char *info); - void ucp_wireup_replay_pending_requests(ucp_ep_h ucp_ep, ucs_queue_head_t *tmp_pending_queue); diff --git a/src/ucp/wireup/wireup_cm.c b/src/ucp/wireup/wireup_cm.c index cc5457b6e45..04f703b70ba 100644 --- a/src/ucp/wireup/wireup_cm.c +++ b/src/ucp/wireup/wireup_cm.c @@ -17,13 +17,31 @@ #include +/** + * @brief Check whether CM callback should be called or not. + * + * @param [in] _ucp_ep UCP Endpoint for which CM callback is called. + * @param [in] _uct_cm_ep UCT CM Endpoint which calls CM callback. + * @param [in] _failed_action Action to do if UCP EP is in a FAILED state. + * This action should stop macro execution. 
+ */ +#define UCP_EP_CM_CALLBACK_ENTER(_ucp_ep, _uct_cm_ep, _failed_action) \ + do { \ + ucs_assert(ucs_async_is_blocked(&(_ucp_ep)->worker->async)); \ + if ((_ucp_ep)->flags & UCP_EP_FLAG_FAILED) { \ + _failed_action; \ + } \ + \ + ucs_assertv_always((_uct_cm_ep) == ucp_ep_get_cm_uct_ep(_ucp_ep), \ + "%p: uct_cm_ep=%p vs found_uct_ep=%p", \ + _ucp_ep, _uct_cm_ep, \ + ucp_ep_get_cm_uct_ep(_ucp_ep)); \ + } while (0) + + unsigned -ucp_cm_ep_init_flags(const ucp_worker_h worker, const ucp_ep_params_t *params) +ucp_cm_ep_init_flags(const ucp_ep_params_t *params) { - if (!ucp_worker_sockaddr_is_cm_proto(worker)) { - return 0; - } - if (params->field_mask & UCP_EP_PARAM_FIELD_SOCK_ADDR) { return UCP_EP_INIT_CM_WIREUP_CLIENT | UCP_EP_INIT_CM_PHASE; } @@ -40,19 +58,6 @@ int ucp_ep_init_flags_has_cm(unsigned ep_init_flags) UCP_EP_INIT_CM_WIREUP_SERVER)); } -static int ucp_cm_ep_should_use_wireup_msg(ucp_ep_h ucp_ep) -{ - ucp_context_t *context = ucp_ep->worker->context; - ucp_wireup_ep_t *cm_wireup_ep = ucp_ep_get_cm_wireup_ep(ucp_ep); - - return context->config.ext.cm_use_all_devices && - /* TCP doesn't have CONNECT_TO_EP support and has internal connection - * matching that could lead to unexpected behavior when connections - * are accepted in the reverse order. - * TODO: remove it, when CONNECT_TO_EP support is added to TCP */ - strcmp(ucp_context_cm_name(context, cm_wireup_ep->cm_idx), "tcp"); -} - /* * The main thread progress part of attempting connecting the client to the server * through the next available cm. @@ -64,22 +69,26 @@ static unsigned ucp_cm_client_try_next_cm_progress(void *arg) ucp_context_h context = worker->context; ucp_wireup_ep_t *cm_wireup_ep; ucs_status_t status; + ucp_rsc_index_t cm_idx; UCS_ASYNC_BLOCK(&worker->async); + cm_idx = ucp_ep_ext_control(ucp_ep)->cm_idx; + ucs_assert(cm_idx != UCP_NULL_RESOURCE); + cm_wireup_ep = ucp_ep_get_cm_wireup_ep(ucp_ep); ucs_assert_always(cm_wireup_ep != NULL); ucp_wireup_ep_destroy_next_ep(cm_wireup_ep); - ucs_debug("client switching from %s to %s in attempt to connect to the server", - ucp_context_cm_name(context, cm_wireup_ep->cm_idx - 1), - ucp_context_cm_name(context, cm_wireup_ep->cm_idx)); + ucs_debug("client switching from %s to %s in attempt to connect to the" + " server", + ucp_context_cm_name(context, cm_idx - 1), + ucp_context_cm_name(context, cm_idx)); status = ucp_ep_client_cm_create_uct_ep(ucp_ep); if (status != UCS_OK) { ucs_error("failed to create a uct sockaddr endpoint on %s cm %p", - ucp_context_cm_name(context, cm_wireup_ep->cm_idx), - worker->cms[cm_wireup_ep->cm_idx].cm); + ucp_context_cm_name(context, cm_idx), worker->cms[cm_idx].cm); ucp_worker_set_ep_failed(worker, ucp_ep, &cm_wireup_ep->super.super, ucp_ep_get_cm_lane(ucp_ep), status); @@ -89,22 +98,54 @@ static unsigned ucp_cm_client_try_next_cm_progress(void *arg) return 1; } +static int ucp_cm_client_get_next_cm_idx(ucp_ep_h ep) +{ + ucp_worker_h worker = ep->worker; + ucp_rsc_index_t next_cm_idx = ucp_ep_ext_control(ep)->cm_idx + 1; + ucp_rsc_index_t num_cm_cmpts = ucp_worker_num_cm_cmpts(worker); + + for (; next_cm_idx < num_cm_cmpts; ++next_cm_idx) { + if (worker->cms[next_cm_idx].cm != NULL) { + return next_cm_idx; + } + } + + return UCP_NULL_RESOURCE; +} + static int ucp_cm_client_try_fallback_cms(ucp_ep_h ep) { - ucp_worker_h worker = ep->worker; - ucp_wireup_ep_t *cm_wireup_ep = ucp_ep_get_cm_wireup_ep(ep); - ucp_rsc_index_t next_cm_idx = cm_wireup_ep->cm_idx + 1; - uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL; - - if (next_cm_idx >= 
ucp_worker_num_cm_cmpts(worker)) { - ucs_debug("reached the end of the cms priority list, no cms left to" - " check (sockaddr_cm=%s, cm_idx=%d).", - ucp_context_cm_name(worker->context, cm_wireup_ep->cm_idx), - cm_wireup_ep->cm_idx); + ucp_worker_h worker = ep->worker; + uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL; + ucp_rsc_index_t num_cm_cmpts = ucp_worker_num_cm_cmpts(worker); + UCS_STRING_BUFFER_ONSTACK(cms_strb, 64); + char addr_str[UCS_SOCKADDR_STRING_LEN]; + ucp_wireup_ep_t *cm_wireup_ep; + ucp_rsc_index_t next_cm_idx; + int i; + + next_cm_idx = ucp_cm_client_get_next_cm_idx(ep); + if (next_cm_idx == UCP_NULL_RESOURCE) { + for (i = 0; i < num_cm_cmpts; ++i) { + ucs_string_buffer_appendf(&cms_strb, "%s,", + ucp_context_cm_name(worker->context, i)); + } + ucs_string_buffer_rtrim(&cms_strb, ","); + + cm_wireup_ep = ucp_ep_get_cm_wireup_ep(ep); + ucs_assert_always(cm_wireup_ep != NULL); + + ucs_diag("client ep %p failed to connect to %s using %s cms", + ep, + ucs_sockaddr_str( + (struct sockaddr*)&cm_wireup_ep->cm_remote_sockaddr, + addr_str, sizeof(addr_str)), + ucs_string_buffer_cstr(&cms_strb)); + return 0; } - cm_wireup_ep->cm_idx = next_cm_idx; + ucp_ep_ext_control(ep)->cm_idx = next_cm_idx; uct_worker_progress_register_safe(worker->uct, ucp_cm_client_try_next_cm_progress, ep, UCS_CALLBACKQ_FLAG_ONESHOT, @@ -114,18 +155,19 @@ static int ucp_cm_client_try_fallback_cms(ucp_ep_h ep) } static ucp_rsc_index_t -ucp_cm_tl_bitmap_get_dev_idx(ucp_context_h context, uint64_t tl_bitmap) -{ - ucp_rsc_index_t rsc_index; +ucp_cm_tl_bitmap_get_dev_idx(ucp_context_h context, + const ucp_tl_bitmap_t *tl_bitmap) +{ + ucp_rsc_index_t rsc_index = UCS_BITMAP_FFS(*tl_bitmap); ucp_rsc_index_t dev_index; - ucs_assert(tl_bitmap != 0); + ucs_assert(!UCS_BITMAP_IS_ZERO_INPLACE(tl_bitmap)); + ucs_assert(rsc_index < context->num_tls); - rsc_index = ucs_ffs64_safe(tl_bitmap); dev_index = context->tl_rscs[rsc_index].dev_index; /* check that all TL resources in the TL bitmap have the same dev_index */ - ucs_for_each_bit(rsc_index, tl_bitmap) { + UCS_BITMAP_FOR_EACH_BIT(*tl_bitmap, rsc_index) { ucs_assert(dev_index == context->tl_rscs[rsc_index].dev_index); } @@ -133,28 +175,24 @@ ucp_cm_tl_bitmap_get_dev_idx(ucp_context_h context, uint64_t tl_bitmap) } static ucs_status_t -ucp_cm_ep_client_initial_config_get(ucp_ep_h ucp_ep, const char *dev_name, +ucp_cm_ep_client_initial_config_get(ucp_ep_h ucp_ep, + const ucp_tl_bitmap_t *tl_bitmap, ucp_ep_config_key_t *key) { ucp_worker_h worker = ucp_ep->worker; - uint64_t addr_pack_flags = UCP_ADDRESS_PACK_FLAG_DEVICE_ADDR | + unsigned addr_pack_flags = ucp_worker_common_address_pack_flags(worker) | + UCP_ADDRESS_PACK_FLAG_DEVICE_ADDR | UCP_ADDRESS_PACK_FLAG_IFACE_ADDR; ucp_wireup_ep_t *wireup_ep = ucp_ep_get_cm_wireup_ep(ucp_ep); - uint64_t tl_bitmap = ucp_context_dev_tl_bitmap(worker->context, - dev_name); void *ucp_addr; size_t ucp_addr_size; ucp_unpacked_address_t unpacked_addr; + ucp_address_entry_t *ae; unsigned addr_indices[UCP_MAX_RESOURCES]; ucs_status_t status; ucs_assert_always(wireup_ep != NULL); - if (tl_bitmap == 0) { - ucs_debug("tl_bitmap for %s is empty", dev_name); - return UCS_ERR_UNREACHABLE; - } - /* Construct local dummy address for lanes selection taking an assumption * that server has the transports which are the best from client's * perspective. 
*/ @@ -170,11 +208,17 @@ ucp_cm_ep_client_initial_config_get(ucp_ep_h ucp_ep, const char *dev_name, goto free_ucp_addr; } + /* Update destination MD and RSC indices in the unpacked address list */ + ucp_unpacked_address_for_each(ae, &unpacked_addr) { + ae->md_index = UCP_NULL_RESOURCE; + ae->iface_attr.dst_rsc_index = UCP_NULL_RESOURCE; + } + ucs_assert(unpacked_addr.address_count <= UCP_MAX_RESOURCES); ucp_ep_config_key_reset(key); ucp_ep_config_key_set_err_mode(key, wireup_ep->ep_init_flags); status = ucp_wireup_select_lanes(ucp_ep, wireup_ep->ep_init_flags, - tl_bitmap, &unpacked_addr, addr_indices, + *tl_bitmap, &unpacked_addr, addr_indices, key); ucs_free(unpacked_addr.address_list); @@ -184,75 +228,130 @@ ucp_cm_ep_client_initial_config_get(ucp_ep_h ucp_ep, const char *dev_name, return status; } -static void ucp_cm_priv_data_pack(ucp_wireup_sockaddr_data_t *sa_data, - ucp_ep_h ep, ucp_rsc_index_t dev_index, - const ucp_address_t *addr, size_t addr_size) +static size_t ucp_cm_priv_data_length(size_t addr_size) { + return sizeof(ucp_wireup_sockaddr_data_t) + addr_size; +} + +static unsigned ucp_cm_address_pack_flags(ucp_worker_h worker) +{ + return ucp_worker_common_address_pack_flags(worker) | + UCP_ADDRESS_PACK_FLAGS_CM_DEFAULT; +} + +static ucs_status_t +ucp_cm_ep_priv_data_pack(ucp_ep_h ep, const ucp_tl_bitmap_t *tl_bitmap, + ucp_rsc_index_t dev_index, + void **data_buf_p, size_t *data_buf_length_p) +{ + ucp_worker_h worker = ep->worker; + void *ucp_addr = NULL; + ucp_wireup_sockaddr_data_t *sa_data; + size_t ucp_addr_size; + ucp_rsc_index_t cm_idx; + ucs_status_t status; + ucs_log_level_t log_level; + ucs_assert((int)ucp_ep_config(ep)->key.err_mode <= UINT8_MAX); ucs_assert(dev_index != UCP_NULL_RESOURCE); + /* Don't pack the device address to reduce address size, it will be + * delivered by uct_cm_listener_conn_request_callback_t in + * uct_cm_remote_data_t */ + status = ucp_address_pack(worker, ep, tl_bitmap, + ucp_cm_address_pack_flags(worker), NULL, + &ucp_addr_size, &ucp_addr); + if (status != UCS_OK) { + goto err; + } + + cm_idx = ucp_ep_ext_control(ep)->cm_idx; + if (worker->cms[cm_idx].attr.max_conn_priv < + ucp_cm_priv_data_length(ucp_addr_size)) { + log_level = (ucp_cm_client_get_next_cm_idx(ep) != UCP_NULL_RESOURCE) ? 
+ UCS_LOG_LEVEL_DIAG : UCS_LOG_LEVEL_ERROR; + ucs_log(log_level, + "CM private data buffer is too small to pack UCP endpoint" + " info, ep %p service data %lu, address length %lu, cm %p" + " max_conn_priv %lu", ep, sizeof(ucp_wireup_sockaddr_data_t), + ucp_addr_size, worker->cms[cm_idx].cm, + worker->cms[cm_idx].attr.max_conn_priv); + status = UCS_ERR_BUFFER_TOO_SMALL; + goto err; + } + + sa_data = ucs_malloc(ucp_cm_priv_data_length(ucp_addr_size), + "client_priv_data"); + if (sa_data == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err; + } + sa_data->ep_id = ucp_ep_local_id(ep); sa_data->err_mode = ucp_ep_config(ep)->key.err_mode; sa_data->addr_mode = UCP_WIREUP_SA_DATA_CM_ADDR; sa_data->dev_index = dev_index; - memcpy(sa_data + 1, addr, addr_size); + memcpy(sa_data + 1, ucp_addr, ucp_addr_size); + + *data_buf_p = sa_data; + *data_buf_length_p = ucp_cm_priv_data_length(ucp_addr_size); + status = UCS_OK; + +err: + ucs_free(ucp_addr); + return status; } -static void uct_wireup_cm_tmp_ep_cleanup(ucp_ep_h tmp_ep, ucs_queue_head_t *queue) +static void +ucp_wireup_cm_ep_cleanup(ucp_ep_t *ucp_ep, ucs_queue_head_t *queue) { ucp_lane_index_t lane_idx; uct_ep_h uct_ep; - if (tmp_ep == NULL) { - return; - } - - for (lane_idx = 0; lane_idx < ucp_ep_num_lanes(tmp_ep); ++lane_idx) { - if (lane_idx == ucp_ep_get_cm_lane(tmp_ep)) { + for (lane_idx = 0; lane_idx < ucp_ep_num_lanes(ucp_ep); ++lane_idx) { + if (lane_idx == ucp_ep_get_cm_lane(ucp_ep)) { continue; } - /* transfer the pending queues content from the previous tmp_ep to - * a temporary queue */ - uct_ep_pending_purge(tmp_ep->uct_eps[lane_idx], + /* Transfer the pending queues content from the previously configured + * UCP EP to a temporary queue for further replaying */ + uct_ep_pending_purge(ucp_ep->uct_eps[lane_idx], ucp_wireup_pending_purge_cb, &queue); - if (ucp_ep_config(tmp_ep)->p2p_lanes & UCS_BIT(lane_idx)) { - uct_ep = ucp_wireup_extract_lane(tmp_ep, lane_idx); - /* destroy the transport ep */ + if (ucp_ep_config(ucp_ep)->p2p_lanes & UCS_BIT(lane_idx)) { + uct_ep = ucp_wireup_extract_lane(ucp_ep, lane_idx); + /* Destroy the transport ep */ uct_ep_destroy(uct_ep); } - /* destroy the wireup ep */ - uct_ep_destroy(tmp_ep->uct_eps[lane_idx]); + /* Destroy the wireup ep */ + uct_ep_destroy(ucp_ep->uct_eps[lane_idx]); + ucp_ep->uct_eps[lane_idx] = NULL; } - - ucs_trace("deleting tmp_ep %p", tmp_ep); - ucp_ep_destroy_base(tmp_ep); } -static ucs_status_t ucp_cm_ep_init_lanes(ucp_ep_h ep, uint64_t *tl_bitmap, +static ucs_status_t ucp_cm_ep_init_lanes(ucp_ep_h ep, + ucp_tl_bitmap_t *tl_bitmap, ucp_rsc_index_t *dev_index) { ucp_worker_h worker = ep->worker; - ucp_ep_h tmp_ep = ucp_ep_get_cm_wireup_ep(ep)->tmp_ep; ucs_status_t status = UCS_ERR_NO_RESOURCE; ucp_lane_index_t lane_idx; ucp_rsc_index_t rsc_idx; uint8_t path_index; - *tl_bitmap = 0; - for (lane_idx = 0; lane_idx < ucp_ep_num_lanes(tmp_ep); ++lane_idx) { - if (lane_idx == ucp_ep_get_cm_lane(tmp_ep)) { + UCS_BITMAP_CLEAR(tl_bitmap); + for (lane_idx = 0; lane_idx < ucp_ep_num_lanes(ep); ++lane_idx) { + if (lane_idx == ucp_ep_get_cm_lane(ep)) { continue; } - rsc_idx = ucp_ep_get_rsc_index(tmp_ep, lane_idx); + rsc_idx = ucp_ep_get_rsc_index(ep, lane_idx); if (rsc_idx == UCP_NULL_RESOURCE) { continue; } - status = ucp_wireup_ep_create(tmp_ep, &tmp_ep->uct_eps[lane_idx]); + status = ucp_wireup_ep_create(ep, &ep->uct_eps[lane_idx]); if (status != UCS_OK) { goto out; } @@ -261,10 +360,10 @@ static ucs_status_t ucp_cm_ep_init_lanes(ucp_ep_h ep, uint64_t *tl_bitmap, (*dev_index == 
worker->context->tl_rscs[rsc_idx].dev_index)); *dev_index = worker->context->tl_rscs[rsc_idx].dev_index; - *tl_bitmap |= UCS_BIT(rsc_idx); - if (ucp_ep_config(tmp_ep)->p2p_lanes & UCS_BIT(lane_idx)) { - path_index = ucp_ep_get_path_index(tmp_ep, lane_idx); - status = ucp_wireup_ep_connect(tmp_ep->uct_eps[lane_idx], 0, + UCS_BITMAP_SET(*tl_bitmap, rsc_idx); + if (ucp_ep_config(ep)->p2p_lanes & UCS_BIT(lane_idx)) { + path_index = ucp_ep_get_path_index(ep, lane_idx); + status = ucp_wireup_ep_connect(ep->uct_eps[lane_idx], 0, rsc_idx, path_index, 0, NULL); if (status != UCS_OK) { goto out; @@ -280,147 +379,158 @@ static ucs_status_t ucp_cm_ep_init_lanes(ucp_ep_h ep, uint64_t *tl_bitmap, return status; } -static ssize_t ucp_cm_client_priv_pack_cb(void *arg, - const uct_cm_ep_priv_data_pack_args_t - *pack_args, void *priv_data) +static unsigned ucp_cm_client_uct_connect_progress(void *arg) { - ucp_wireup_sockaddr_data_t *sa_data = priv_data; - ucp_ep_h ep = arg; - ucp_worker_h worker = ep->worker; - ucp_rsc_index_t dev_index = UCP_NULL_RESOURCE; + ucp_ep_h ep = arg; + ucp_worker_h worker = ep->worker; + ucp_wireup_ep_t *cm_wireup_ep = ucp_ep_get_cm_wireup_ep(ep); + ucp_rsc_index_t dev_index = UCP_NULL_RESOURCE; + void *ucp_addr = NULL; /* Set to NULL to call ucs_free + safely */ + ucp_tl_bitmap_t tl_bitmap; + uct_ep_connect_params_t params; + void *priv_data; + size_t priv_data_length; ucp_ep_config_key_t key; - uint64_t tl_bitmap; - ucp_wireup_ep_t *cm_wireup_ep; - void* ucp_addr; - size_t ucp_addr_size; - ucs_status_t status; - const char *dev_name; ucs_queue_head_t tmp_pending_queue; + ucs_status_t status; UCS_ASYNC_BLOCK(&worker->async); - ucs_assert_always(pack_args->field_mask & - UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME); - - dev_name = pack_args->dev_name; - - /* At this point the ep has only CM lane */ - ucs_assert((ucp_ep_num_lanes(ep) == 1) && ucp_ep_has_cm_lane(ep)); - cm_wireup_ep = ucp_ep_get_cm_wireup_ep(ep); - ucs_assert(cm_wireup_ep != NULL); - - status = ucp_cm_ep_client_initial_config_get(ep, dev_name, &key); - if (status != UCS_OK) { - if (ucp_cm_client_try_fallback_cms(ep)) { - goto out; - } else { - goto out_check_err; - } - } - ucs_queue_head_init(&tmp_pending_queue); - /* cleanup the previously created cm_wireup_ep->tmp_ep. the one that was - * created on the previous call to this client's pack_cb */ - uct_wireup_cm_tmp_ep_cleanup(cm_wireup_ep->tmp_ep, &tmp_pending_queue); - cm_wireup_ep->tmp_ep = NULL; + /* Cleanup the previously created UCP EP. The one that was created on the + * previous call to this client's resolve_cb */ + ucp_wireup_cm_ep_cleanup(ep, &tmp_pending_queue); - /* Create tmp ep which will hold local tl addresses until connect - * event arrives, to avoid asynchronous ep reconfiguration. 
*/ - status = ucp_ep_create_base(worker, "tmp_cm", "tmp cm client", - &cm_wireup_ep->tmp_ep); + status = ucp_cm_ep_client_initial_config_get(ep, + &cm_wireup_ep->cm_resolve_tl_bitmap, &key); if (status != UCS_OK) { - goto out_check_err; + goto try_fallback; } - cm_wireup_ep->tmp_ep->flags |= UCP_EP_FLAG_TEMPORARY; - - status = ucp_worker_get_ep_config(worker, &key, 0, - &cm_wireup_ep->tmp_ep->cfg_index); + status = ucp_worker_get_ep_config(worker, &key, 0, &ep->cfg_index); if (status != UCS_OK) { - goto out_check_err; + goto err; } + ep->am_lane = key.am_lane; + status = ucp_cm_ep_init_lanes(ep, &tl_bitmap, &dev_index); if (status != UCS_OK) { - goto out_check_err; + goto err; } /* Replay pending requests from the tmp_pending_queue */ ucp_wireup_replay_pending_requests(ep, &tmp_pending_queue); - /* Don't pack the device address to reduce address size, it will be - * delivered by uct_cm_listener_conn_request_callback_t in - * uct_cm_remote_data_t */ - status = ucp_address_pack(worker, cm_wireup_ep->tmp_ep, tl_bitmap, - UCP_ADDRESS_PACK_FLAGS_CM_DEFAULT, - NULL, &ucp_addr_size, &ucp_addr); - if (status != UCS_OK) { - goto out_check_err; + status = ucp_cm_ep_priv_data_pack(ep, &tl_bitmap, dev_index, &priv_data, + &priv_data_length); + if (status == UCS_ERR_BUFFER_TOO_SMALL) { + goto try_fallback; + } else if (status != UCS_OK) { + goto err; } - if (worker->cms[cm_wireup_ep->cm_idx].attr.max_conn_priv < - (sizeof(*sa_data) + ucp_addr_size)) { - ucs_error("CM private data buffer is too small to pack UCP endpoint info, " - "ep %p/%p service data %lu, address length %lu, cm %p max_conn_priv %lu", - ep, cm_wireup_ep->tmp_ep, sizeof(*sa_data), ucp_addr_size, - worker->cms[cm_wireup_ep->cm_idx].cm, - worker->cms[cm_wireup_ep->cm_idx].attr.max_conn_priv); - status = UCS_ERR_BUFFER_TOO_SMALL; - goto free_addr; + params.field_mask = UCT_EP_CONNECT_PARAM_FIELD_PRIVATE_DATA | + UCT_EP_CONNECT_PARAM_FIELD_PRIVATE_DATA_LENGTH; + params.private_data = priv_data; + params.private_data_length = priv_data_length; + status = uct_ep_connect(ucp_ep_get_cm_uct_ep(ep), + ¶ms); + ucs_free(priv_data); + + if (status != UCS_OK) { + goto err; } - ucs_debug("client ep %p created on device %s idx %d, tl_bitmap 0x%"PRIx64 - "on cm %s", ep, dev_name, dev_index, tl_bitmap, - ucp_context_cm_name(worker->context, cm_wireup_ep->cm_idx)); - /* Pass real ep (not cm_wireup_ep->tmp_ep), because only its pointer and - * err_mode is taken from the config. */ - ucp_cm_priv_data_pack(sa_data, ep, dev_index, ucp_addr, ucp_addr_size); + ucp_ep_update_flags(ep, UCP_EP_FLAG_LOCAL_CONNECTED, 0); + goto out; -free_addr: - ucs_free(ucp_addr); -out_check_err: - if (status == UCS_OK) { - ep->flags |= UCP_EP_FLAG_LOCAL_CONNECTED; - } else { - ucp_worker_set_ep_failed(worker, ep, - &ucp_ep_get_cm_wireup_ep(ep)->super.super, - ucp_ep_get_cm_lane(ep), status); +try_fallback: + if (ucp_cm_client_try_fallback_cms(ep)) { + /* Can fallback to the next CM to retry getting CM initial config to + * fit to CM private data */ + goto out; } +err: + ucp_worker_set_ep_failed(worker, ep, + &ucp_ep_get_cm_wireup_ep(ep)->super.super, + ucp_ep_get_cm_lane(ep), status); out: + ucs_free(ucp_addr); UCS_ASYNC_UNBLOCK(&worker->async); - return (status == UCS_OK) ? 
(sizeof(*sa_data) + ucp_addr_size) : status; + return 1; } -static void -ucp_cm_client_connect_prog_arg_free(ucp_cm_client_connect_progress_arg_t *arg) +static ucs_status_t +ucp_cm_client_resolve_cb(void *user_data, const uct_cm_ep_resolve_args_t *args) { - ucs_free(arg->sa_data); - ucs_free(arg->dev_addr); - ucs_free(arg); -} + ucp_ep_h ep = user_data; + ucp_worker_h worker = ep->worker; + ucs_status_t status = UCS_OK; + uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL; + ucp_wireup_ep_t *cm_wireup_ep; + char addr_str[UCS_SOCKADDR_STRING_LEN]; -static void ucp_cm_client_restore_ep(ucp_wireup_ep_t *wireup_cm_ep, - ucp_ep_h ucp_ep) -{ - ucp_ep_h tmp_ep = wireup_cm_ep->tmp_ep; - ucp_wireup_ep_t *w_ep; - ucp_lane_index_t lane_idx; + UCS_ASYNC_BLOCK(&worker->async); + ucs_assert_always(args->field_mask & UCT_CM_EP_RESOLVE_ARGS_FIELD_DEV_NAME); - ucp_ep->cfg_index = tmp_ep->cfg_index; + UCP_EP_CM_CALLBACK_ENTER(ep, ucp_ep_get_cm_uct_ep(ep), + { + ucs_assert(ep->flags & UCP_EP_FLAG_CLOSED); + status = UCS_ERR_CANCELED; + goto out; + }); + + cm_wireup_ep = ucp_ep_get_cm_wireup_ep(ep); + ucs_assert(cm_wireup_ep != NULL); + ucp_context_dev_tl_bitmap(worker->context, args->dev_name, + &cm_wireup_ep->cm_resolve_tl_bitmap); + + if (UCS_BITMAP_IS_ZERO_INPLACE(&cm_wireup_ep->cm_resolve_tl_bitmap)) { + ucs_diag("client ep %p connect to %s failed: device %s is not enabled, " + "enable it in UCX_NET_DEVICES or use corresponding ip address", + ep, + ucs_sockaddr_str( + (struct sockaddr*)&cm_wireup_ep->cm_remote_sockaddr, + addr_str, sizeof(addr_str)), + args->dev_name); + status = UCS_ERR_UNREACHABLE; + if (!ucp_cm_client_try_fallback_cms(ep)) { + ucp_worker_set_ep_failed(worker, ep, + &cm_wireup_ep->super.super, + ucp_ep_get_cm_lane(ep), status); - for (lane_idx = 0; lane_idx < ucp_ep_num_lanes(tmp_ep); ++lane_idx) { - if (tmp_ep->uct_eps[lane_idx] != NULL) { - ucs_assert(ucp_ep->uct_eps[lane_idx] == NULL); - ucp_ep->uct_eps[lane_idx] = tmp_ep->uct_eps[lane_idx]; - w_ep = ucs_derived_of(ucp_ep->uct_eps[lane_idx], ucp_wireup_ep_t); - w_ep->super.ucp_ep = ucp_ep; } + goto out; } - ucp_ep_destroy_base(tmp_ep); /* not needed anymore */ - wireup_cm_ep->tmp_ep = NULL; + ucs_debug("client created ep %p on device %s, " + "tl_bitmap " UCT_TL_BITMAP_FMT " on cm %s", + ep, args->dev_name, + UCT_TL_BITMAP_ARG(&cm_wireup_ep->cm_resolve_tl_bitmap), + ucp_context_cm_name(worker->context, + ucp_ep_ext_control(ep)->cm_idx)); + + uct_worker_progress_register_safe(worker->uct, + ucp_cm_client_uct_connect_progress, + ep, UCS_CALLBACKQ_FLAG_ONESHOT, + &prog_id); + ucp_worker_signal_internal(worker); + +out: + UCS_ASYNC_UNBLOCK(&worker->async); + return status; +} + +static void +ucp_cm_client_connect_prog_arg_free(ucp_cm_client_connect_progress_arg_t *arg) +{ + ucs_free(arg->sa_data); + ucs_free(arg->dev_addr); + ucs_free(arg); } /* @@ -435,7 +545,7 @@ static unsigned ucp_cm_client_connect_progress(void *arg) uct_ep_h uct_cm_ep = ucp_ep_get_cm_uct_ep(ucp_ep); ucp_wireup_ep_t *wireup_ep; ucp_unpacked_address_t addr; - uint64_t tl_bitmap; + ucp_tl_bitmap_t tl_bitmap; ucp_rsc_index_t dev_index; ucp_rsc_index_t UCS_V_UNUSED rsc_index; unsigned addr_idx; @@ -444,12 +554,16 @@ static unsigned ucp_cm_client_connect_progress(void *arg) UCS_ASYNC_BLOCK(&worker->async); + ucs_debug("ep %p flags 0x%x cfg_index %d: client connect progress", ucp_ep, + ucp_ep->flags, ucp_ep->cfg_index); + ucs_log_indent(1); + wireup_ep = ucp_ep_get_cm_wireup_ep(ucp_ep); ucs_assert(wireup_ep != NULL); ucs_assert(wireup_ep->ep_init_flags & 
UCP_EP_INIT_CM_WIREUP_CLIENT); status = ucp_address_unpack(worker, progress_arg->sa_data + 1, - UCP_ADDRESS_PACK_FLAGS_CM_DEFAULT, &addr); + ucp_cm_address_pack_flags(worker), &addr); if (status != UCS_OK) { goto out; } @@ -467,34 +581,35 @@ static unsigned ucp_cm_client_connect_progress(void *arg) ucs_assert(addr.address_count <= UCP_MAX_RESOURCES); ucp_ep_update_remote_id(ucp_ep, progress_arg->sa_data->ep_id); - /* Get tl bitmap from tmp_ep, because it contains initial configuration. */ - tl_bitmap = ucp_ep_get_tl_bitmap(wireup_ep->tmp_ep); - dev_index = ucp_cm_tl_bitmap_get_dev_idx(worker->context, tl_bitmap); + ucp_ep_get_tl_bitmap(ucp_ep, &tl_bitmap); + dev_index = ucp_cm_tl_bitmap_get_dev_idx(worker->context, &tl_bitmap); - /* Restore initial configuration from tmp_ep created for packing local - * addresses. */ - ucp_cm_client_restore_ep(wireup_ep, ucp_ep); - - tl_bitmap = ucp_context_dev_idx_tl_bitmap(context, dev_index); + ucp_context_dev_idx_tl_bitmap(context, dev_index, &tl_bitmap); status = ucp_wireup_init_lanes(ucp_ep, wireup_ep->ep_init_flags, - tl_bitmap, &addr, addr_indices); + &tl_bitmap, &addr, addr_indices); if (status != UCS_OK) { + ucs_debug("ep %p: failed to initialize lanes: %s", ucp_ep, + ucs_status_string(status)); goto out_free_addr; } status = ucp_wireup_connect_local(ucp_ep, &addr, NULL); if (status != UCS_OK) { + ucs_debug("ep %p: failed to connect lanes: %s", ucp_ep, + ucs_status_string(status)); goto out_free_addr; } status = uct_cm_client_ep_conn_notify(uct_cm_ep); if (status != UCS_OK) { + ucs_debug("ep %p: failed to send notify: %s", ucp_ep, + ucs_status_string(status)); /* connection can't be established by UCT, no need to disconnect */ - ucp_ep->flags &= ~UCP_EP_FLAG_LOCAL_CONNECTED; + ucp_ep_update_flags(ucp_ep, 0, UCP_EP_FLAG_LOCAL_CONNECTED); goto out_free_addr; } - if (!ucp_cm_ep_should_use_wireup_msg(ucp_ep)) { + if (!context->config.ext.cm_use_all_devices) { ucp_wireup_remote_connected(ucp_ep); } @@ -506,6 +621,7 @@ static unsigned ucp_cm_client_connect_progress(void *arg) ucp_ep_get_cm_lane(ucp_ep), status); } + ucs_log_indent(-1); UCS_ASYNC_UNBLOCK(&worker->async); ucp_cm_client_connect_prog_arg_free(progress_arg); return 1; @@ -544,27 +660,33 @@ static void ucp_cm_client_connect_cb(uct_ep_h uct_cm_ep, void *arg, ucs_assert_always(ucs_test_all_flags(connect_args->field_mask, (UCT_CM_EP_CLIENT_CONNECT_ARGS_FIELD_REMOTE_DATA | UCT_CM_EP_CLIENT_CONNECT_ARGS_FIELD_STATUS))); - remote_data = connect_args->remote_data; status = connect_args->status; + ucp_ep_update_flags(ucp_ep, UCP_EP_FLAG_CLIENT_CONNECT_CB, 0); + + ucs_debug("ep %p flags 0x%x cfg_index %d: client connected status %s", + ucp_ep, ucp_ep->flags, ucp_ep->cfg_index, + ucs_status_string(status)); + UCP_EP_CM_CALLBACK_ENTER(ucp_ep, uct_cm_ep, return); if (((status == UCS_ERR_NOT_CONNECTED) || (status == UCS_ERR_UNREACHABLE) || (status == UCS_ERR_CONNECTION_RESET)) && /* try connecting through another cm (next one in the priority list) */ ucp_cm_client_try_fallback_cms(ucp_ep)) { /* connection can't be established by UCT, no need to disconnect */ - ucp_ep->flags &= ~UCP_EP_FLAG_LOCAL_CONNECTED; + ucp_ep_update_flags(ucp_ep, 0, UCP_EP_FLAG_LOCAL_CONNECTED); /* cms fallback has started */ return; } else if (status != UCS_OK) { /* connection can't be established by UCT, no need to disconnect */ - ucp_ep->flags &= ~UCP_EP_FLAG_LOCAL_CONNECTED; + ucp_ep_update_flags(ucp_ep, 0, UCP_EP_FLAG_LOCAL_CONNECTED); ucs_debug("failed status on client connect callback: %s " - "(sockaddr_cm=%s, 
cms_used_idx=%d)", ucs_status_string(status), + "(sockaddr_cm=%s, cms_used_idx=%d)", + ucs_status_string(status), ucp_context_cm_name(worker->context, - ucp_ep_get_cm_wireup_ep(ucp_ep)->cm_idx), - ucp_ep_get_cm_wireup_ep(ucp_ep)->cm_idx); + ucp_ep_ext_control(ucp_ep)->cm_idx), + ucp_ep_ext_control(ucp_ep)->cm_idx); goto err_out; } @@ -618,44 +740,9 @@ static void ucp_cm_client_connect_cb(uct_ep_h uct_cm_ep, void *arg, UCS_ASYNC_UNBLOCK(&worker->async); } -/* - * Internal flush completion callback which is a part of close protocol, - * this flush was initiated by remote peer in disconnect callback on CM lane. - */ -static void ucp_ep_cm_disconnect_flushed_cb(ucp_request_t *req) -{ - ucp_ep_h ucp_ep = req->send.ep; - /* the EP can be closed/destroyed from err callback */ - ucs_async_context_t *async = &ucp_ep->worker->async; - - UCS_ASYNC_BLOCK(async); - if (req->status == UCS_OK) { - ucs_assert(ucp_ep_is_cm_local_connected(ucp_ep)); - ucp_ep_cm_disconnect_cm_lane(ucp_ep); - } else if (ucp_ep->flags & UCP_EP_FLAG_FAILED) { - ucs_assert(!ucp_ep_is_cm_local_connected(ucp_ep)); - } else { - /* 1) ucp_ep_close(force) is called from err callback which was invoked - on remote connection reset - TODO: remove this case when IB flush cancel is fixed (#4743), - moving QP to err state should move UCP EP to error state, - then ucp_worker_set_ep_failed disconnects CM lane - 2) transport err is also possible on flush - */ - ucs_assert((req->status == UCS_ERR_CANCELED) || - (req->status == UCS_ERR_ENDPOINT_TIMEOUT)); - } - - ucs_assert(!(req->flags & UCP_REQUEST_FLAG_CALLBACK)); - ucp_request_put(req); - UCS_ASYNC_UNBLOCK(async); -} - -static unsigned ucp_ep_cm_remote_disconnect_progress(void *arg) +static void ucp_ep_cm_remote_disconnect_progress(ucp_ep_h ucp_ep) { ucs_status_t status = UCS_ERR_CONNECTION_RESET; - ucp_ep_h ucp_ep = arg; - void *req; ucs_trace("ep %p: flags 0x%x cm_remote_disconnect_progress", ucp_ep, ucp_ep->flags); @@ -667,52 +754,30 @@ static unsigned ucp_ep_cm_remote_disconnect_progress(void *arg) UCP_EP_FLAG_CLOSE_REQ_VALID)) { ucp_request_complete_send(ucp_ep_ext_control(ucp_ep)->close_req.req, UCS_OK); - return 1; - } - - if (ucp_ep->flags & UCP_EP_FLAG_CLOSED) { - /* the ep is closed by API but close req is not valid yet (checked - * above), it will be set later from scheduled - * @ref ucp_ep_close_flushed_callback */ - ucs_debug("ep %p: ep closed but request is not set, waiting for" - " the flush callback", ucp_ep); - goto err; + return; } if (!(ucp_ep->flags & UCP_EP_FLAG_REMOTE_CONNECTED)) { - /* CM disconnect happens during WIREUP MSGs exchange phase, when EP - * is locally connected to the peer */ - goto err; + /* CM disconnect happens during WIREUP MSGs exchange phase, when EP is + * locally connected to the peer, so UCP EP should not wait for flush + * completion even if it was started from the close EP procedure, + * because it will never be completed while the peer is unreachable */ + goto set_ep_failed; } - /* - * TODO: set the ucp_ep to error state to prevent user from sending more - * ops. 
- */ - ucs_assert(ucp_ep->flags & UCP_EP_FLAG_FLUSH_STATE_VALID); - ucs_assert(!(ucp_ep->flags & UCP_EP_FLAG_CLOSED)); - req = ucp_ep_flush_internal(ucp_ep, UCT_FLUSH_FLAG_LOCAL, 0, - &ucp_request_null_param, NULL, - ucp_ep_cm_disconnect_flushed_cb, - "cm_disconnected_cb"); - if (req == NULL) { - /* flush is successfully completed in place, notify remote peer - * that we are disconnected, the EP will be destroyed from API call */ - ucp_ep_cm_disconnect_cm_lane(ucp_ep); - } else if (UCS_PTR_IS_ERR(req)) { - status = UCS_PTR_STATUS(req); - ucs_error("ucp_ep_flush_internal completed with error: %s", - ucs_status_string(status)); - goto err; + if (ucp_ep->flags & UCP_EP_FLAG_CLOSED) { + /* the ep is remote connected (checked above) and closed by API but + * close req is not valid yet (checked above), it will be set later + * from scheduled @ref ucp_ep_close_flushed_callback */ + ucs_debug("ep %p: ep is remote connected and closed, but request is" + " not set, waiting for the flush callback", ucp_ep); + return; } - return 1; - -err: +set_ep_failed: ucp_worker_set_ep_failed(ucp_ep->worker, ucp_ep, ucp_ep_get_cm_uct_ep(ucp_ep), ucp_ep_get_cm_lane(ucp_ep), status); - return 1; } static unsigned ucp_ep_cm_disconnect_progress(void *arg) @@ -728,8 +793,6 @@ static unsigned ucp_ep_cm_disconnect_progress(void *arg) uct_cm_ep, ucp_ep->flags); ucs_assert(ucp_ep_get_cm_uct_ep(ucp_ep) == uct_cm_ep); - ucp_ep->flags &= ~UCP_EP_FLAG_REMOTE_CONNECTED; - if (ucp_ep->flags & UCP_EP_FLAG_FAILED) { /* - ignore close event on failed ep, since all lanes are destroyed in generic err flow @@ -745,13 +808,29 @@ static unsigned ucp_ep_cm_disconnect_progress(void *arg) /* if the EP is not local connected, the EP has been closed and flushed, CM lane is disconnected, complete close request and destroy EP */ ucs_assert(ucp_ep->flags & UCP_EP_FLAG_CLOSED); + ucp_ep_update_flags(ucp_ep, 0, UCP_EP_FLAG_REMOTE_CONNECTED); close_req = ucp_ep_ext_control(ucp_ep)->close_req.req; ucp_ep_local_disconnect_progress(close_req); + /* don't touch UCP EP after local disconnect, since it is not valid + * anymore */ + goto out; + } else if (ucp_ep->flags & UCP_EP_FLAG_CLOSED) { + /* if an EP was closed and not local connected anymore (i.e. + * ucp_ep_cm_disconnect_cm_lane() was called from ucp_ep_close_nbx()), + * not failed and no CLOSE request is set, it means that an EP was + * disconnected from a peer */ + ucs_assert(ucp_ep->flags & UCP_EP_FLAG_DISCONNECTED_CM_LANE); + ucs_assert(!(ucp_ep->flags & UCP_EP_FLAG_ERR_HANDLER_INVOKED)); } else { ucs_warn("ep %p: unexpected state on disconnect, flags: 0x%u", ucp_ep, ucp_ep->flags); } + /* don't remove the flag at the beginning of the function, some functions + * may rely on that flag (e.g. 
ucp_ep_cm_remote_disconnect_progress()) */ + ucp_ep_update_flags(ucp_ep, 0, UCP_EP_FLAG_REMOTE_CONNECTED); + +out: UCS_ASYNC_UNBLOCK(async); return 1; } @@ -762,42 +841,29 @@ static void ucp_cm_disconnect_cb(uct_ep_h uct_cm_ep, void *arg) uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL; ucp_worker_h worker = ucp_ep->worker; uct_ep_h uct_ep; - int discard_uct_ep; - - ucs_trace("ep %p: CM remote disconnect callback invoked, flags 0x%x", - ucp_ep, ucp_ep->flags); - uct_ep = ucp_ep_get_cm_uct_ep(ucp_ep); - if (uct_ep == NULL) { - UCS_ASYNC_BLOCK(&worker->async); - discard_uct_ep = ucp_worker_is_uct_ep_discarding(worker, uct_cm_ep); - UCS_ASYNC_UNBLOCK(&worker->async); - - if (discard_uct_ep) { - /* The CM lane couldn't exist if the error was detected on the - * transport lane and all UCT lanes have already been discraded */ - ucs_diag("ep %p: UCT EP %p for CM lane doesn't exist, it" - " has already been discarded", ucp_ep, uct_cm_ep); - return; - } + ucp_ep_update_flags(ucp_ep, UCP_EP_FLAG_DISCONNECT_CB_CALLED, 0); + ucs_trace("ep %p flags 0x%x: remote disconnect callback invoked", ucp_ep, + ucp_ep->flags); - ucs_fatal("ep %p: UCT EP for CM lane doesn't exist", ucp_ep); - } + UCP_EP_CM_CALLBACK_ENTER(ucp_ep, uct_cm_ep, return); + uct_ep = ucp_ep_get_cm_uct_ep(ucp_ep); ucs_assertv_always(uct_cm_ep == uct_ep, "%p: uct_cm_ep=%p vs found_uct_ep=%p", ucp_ep, uct_cm_ep, uct_ep); - uct_worker_progress_register_safe(ucp_ep->worker->uct, + uct_worker_progress_register_safe(worker->uct, ucp_ep_cm_disconnect_progress, ucp_ep, UCS_CALLBACKQ_FLAG_ONESHOT, &prog_id); - ucp_worker_signal_internal(ucp_ep->worker); + ucp_worker_signal_internal(worker); } ucs_status_t ucp_ep_client_cm_create_uct_ep(ucp_ep_h ucp_ep) { ucp_wireup_ep_t *wireup_ep = ucp_ep_get_cm_wireup_ep(ucp_ep); + ucp_rsc_index_t cm_idx = ucp_ep_ext_control(ucp_ep)->cm_idx; ucp_worker_h worker = ucp_ep->worker; uct_ep_params_t cm_lane_params; ucs_sock_addr_t remote_addr; @@ -809,7 +875,7 @@ ucs_status_t ucp_ep_client_cm_create_uct_ep(ucp_ep_h ucp_ep) UCT_EP_PARAM_FIELD_USER_DATA | UCT_EP_PARAM_FIELD_SOCKADDR | UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS | - UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB | + UCT_EP_PARAM_FIELD_CM_RESOLVE_CB | UCT_EP_PARAM_FIELD_SOCKADDR_CONNECT_CB_CLIENT | UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB; @@ -825,10 +891,10 @@ ucs_status_t ucp_ep_client_cm_create_uct_ep(ucp_ep_h ucp_ep) cm_lane_params.sockaddr = &remote_addr; cm_lane_params.user_data = ucp_ep; cm_lane_params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; - cm_lane_params.sockaddr_pack_cb = ucp_cm_client_priv_pack_cb; + cm_lane_params.cm_resolve_cb = ucp_cm_client_resolve_cb; cm_lane_params.sockaddr_cb_client = ucp_cm_client_connect_cb; cm_lane_params.disconnect_cb = ucp_cm_disconnect_cb; - cm_lane_params.cm = worker->cms[wireup_ep->cm_idx].cm; + cm_lane_params.cm = worker->cms[cm_idx].cm; status = uct_ep_create(&cm_lane_params, &cm_ep); if (status != UCS_OK) { @@ -845,14 +911,17 @@ ucs_status_t ucp_ep_client_cm_create_uct_ep(ucp_ep_h ucp_ep) ucs_status_t ucp_ep_client_cm_connect_start(ucp_ep_h ucp_ep, const ucp_ep_params_t *params) { + ucp_worker_h worker = ucp_ep->worker; ucp_wireup_ep_t *wireup_ep = ucp_ep_get_cm_wireup_ep(ucp_ep); ucs_status_t status; - wireup_ep->ep_init_flags = ucp_ep_init_flags(ucp_ep->worker, params); - wireup_ep->cm_idx = 0; + ucs_assert(ucp_ep_ext_control(ucp_ep)->cm_idx == UCP_NULL_RESOURCE); + + ucp_ep_ext_control(ucp_ep)->cm_idx = 0; + wireup_ep->ep_init_flags = ucp_ep_init_flags(worker, params); /* save the address from the ep_params on the 
wireup_ep */ - status = ucs_sockaddr_copy((struct sockaddr *)&wireup_ep->cm_remote_sockaddr, + status = ucs_sockaddr_copy((struct sockaddr*)&wireup_ep->cm_remote_sockaddr, params->sockaddr.addr); if (status != UCS_OK) { return status; @@ -863,8 +932,6 @@ ucs_status_t ucp_ep_client_cm_connect_start(ucp_ep_h ucp_ep, return status; } - ucp_ep_flush_state_reset(ucp_ep); - return UCS_OK; } @@ -882,6 +949,7 @@ static unsigned ucp_cm_server_conn_request_progress(void *arg) return 1; } + ucs_assert(listener->accept_cb != NULL); UCS_ASYNC_BLOCK(&worker->async); ucp_ep_create_server_accept(worker, conn_request, &ep); UCS_ASYNC_UNBLOCK(&worker->async); @@ -903,6 +971,25 @@ static ucp_rsc_index_t ucp_listener_get_cm_index(uct_listener_h listener, return UCP_NULL_RESOURCE; } +int ucp_cm_server_conn_request_progress_cb_pred(const ucs_callbackq_elem_t *elem, + void *arg) +{ + ucp_listener_h listener = arg; + ucp_conn_request_h conn_request; + + if (elem->cb != ucp_cm_server_conn_request_progress) { + return 0; + } + + conn_request = elem->arg; + if (conn_request->listener != listener) { + return 0; + } + + ucp_listener_reject(listener, conn_request); + return 1; +} + void ucp_cm_server_conn_request_cb(uct_listener_h listener, void *arg, const uct_cm_listener_conn_request_args_t *conn_req_args) @@ -962,9 +1049,10 @@ void ucp_cm_server_conn_request_cb(uct_listener_h listener, void *arg, } ucp_conn_request->listener = ucp_listener; - ucp_conn_request->uct.listener = listener; + ucp_conn_request->uct_listener = listener; ucp_conn_request->uct_req = conn_request; ucp_conn_request->cm_idx = cm_idx; + ucp_conn_request->ep = NULL; status = ucs_sockaddr_copy((struct sockaddr *)&ucp_conn_request->client_address, conn_req_args->client_address.addr); @@ -996,6 +1084,7 @@ void ucp_cm_server_conn_request_cb(uct_listener_h listener, void *arg, err_reject: status = uct_listener_reject(listener, conn_request); if (status != UCS_OK) { + /* coverity[pass_freed_arg] */ ucs_warn("failed to reject connect request %p on listener %p", conn_request, listener); } @@ -1007,15 +1096,16 @@ ucp_ep_cm_server_create_connected(ucp_worker_h worker, unsigned ep_init_flags, ucp_conn_request_h conn_request, ucp_ep_h *ep_p) { - uint64_t tl_bitmap = ucp_context_dev_tl_bitmap(worker->context, - conn_request->dev_name); + ucp_tl_bitmap_t tl_bitmap; ucp_ep_h ep; ucs_status_t status; char client_addr_str[UCS_SOCKADDR_STRING_LEN]; ep_init_flags |= UCP_EP_INIT_CM_WIREUP_SERVER | UCP_EP_INIT_CM_PHASE; - if (tl_bitmap == 0) { + ucp_context_dev_tl_bitmap(worker->context, conn_request->dev_name, + &tl_bitmap); + if (UCS_BITMAP_IS_ZERO_INPLACE(&tl_bitmap)) { ucs_error("listener %p: got connection request from %s on a device %s " "which was not present during UCP initialization", conn_request->listener, @@ -1023,112 +1113,98 @@ ucp_ep_cm_server_create_connected(ucp_worker_h worker, unsigned ep_init_flags, client_addr_str, sizeof(client_addr_str)), conn_request->dev_name); status = UCS_ERR_UNREACHABLE; - goto out; + goto out_free_request; } /* Create and connect TL part */ - status = ucp_ep_create_to_worker_addr(worker, tl_bitmap, remote_addr, + status = ucp_ep_create_to_worker_addr(worker, &tl_bitmap, remote_addr, ep_init_flags, "conn_request on uct_listener", &ep); if (status != UCS_OK) { ucs_warn("failed to create server ep and connect to worker address on " - "device %s, tl_bitmap 0x%"PRIx64", status %s", - conn_request->dev_name, tl_bitmap, ucs_status_string(status)); - uct_listener_reject(conn_request->uct.listener, conn_request->uct_req); - goto 
out; + "device %s, tl_bitmap " UCT_TL_BITMAP_FMT ", status %s", + conn_request->dev_name, UCT_TL_BITMAP_ARG(&tl_bitmap), + ucs_status_string(status)); + uct_listener_reject(conn_request->uct_listener, conn_request->uct_req); + goto out_free_request; } status = ucp_wireup_connect_local(ep, remote_addr, NULL); if (status != UCS_OK) { ucs_warn("server ep %p failed to connect to remote address on " - "device %s, tl_bitmap 0x%"PRIx64", status %s", - ep, conn_request->dev_name, tl_bitmap, - ucs_status_string(status)); - uct_listener_reject(conn_request->uct.listener, conn_request->uct_req); + "device %s, tl_bitmap " UCT_TL_BITMAP_FMT ", status %s", + ep, conn_request->dev_name, tl_bitmap.bits[0], + tl_bitmap.bits[1], ucs_status_string(status)); + uct_listener_reject(conn_request->uct_listener, conn_request->uct_req); goto err_destroy_ep; } - status = ucp_ep_cm_connect_server_lane(ep, conn_request->uct.listener, + status = ucp_ep_cm_connect_server_lane(ep, conn_request->uct_listener, conn_request->uct_req, - conn_request->cm_idx); + conn_request->cm_idx, + conn_request->dev_name); if (status != UCS_OK) { ucs_warn("server ep %p failed to connect CM lane on device %s, " - "tl_bitmap 0x%"PRIx64", status %s", - ep, conn_request->dev_name, tl_bitmap, + "tl_bitmap " UCT_TL_BITMAP_FMT ", status %s", + ep, conn_request->dev_name, UCT_TL_BITMAP_ARG(&tl_bitmap), ucs_status_string(status)); goto err_destroy_ep; } - ep->flags |= UCP_EP_FLAG_LISTENER; - ucp_ep_ext_control(ep)->listener = conn_request->listener; ucp_ep_update_remote_id(ep, conn_request->sa_data.ep_id); - ucp_listener_schedule_accept_cb(ep); - *ep_p = ep; + ucp_ep_flush_state_reset(ep); -out: + if (conn_request->listener->accept_cb == NULL) { + goto out_free_request; + } else { + conn_request->ep = ep; + ucp_listener_schedule_accept_cb(conn_request); + goto out; + } + +err_destroy_ep: + ucp_ep_destroy_internal(ep); +out_free_request: ucs_free(conn_request->remote_dev_addr); ucs_free(conn_request); +out: + if (status == UCS_OK) { + *ep_p = ep; + } return status; - -err_destroy_ep: - ucp_ep_destroy_internal(ep); - goto out; } -static ssize_t ucp_cm_server_priv_pack_cb(void *arg, - const uct_cm_ep_priv_data_pack_args_t - *pack_args, void *priv_data) +static ucs_status_t +ucp_ep_server_init_priv_data(ucp_ep_h ep, const char *dev_name, + const void **data_buf_p, size_t *data_buf_size_p) { - ucp_wireup_sockaddr_data_t *sa_data = priv_data; - ucp_ep_h ep = arg; - ucp_worker_h worker = ep->worker; - ucp_wireup_ep_t *cm_wireup_ep = ucp_ep_get_cm_wireup_ep(ep); - uint64_t tl_bitmap; - void* ucp_addr; - size_t ucp_addr_size; + ucp_worker_h worker = ep->worker; + ucp_tl_bitmap_t tl_bitmap; + ucp_tl_bitmap_t ctx_tl_bitmap; ucp_rsc_index_t dev_index; ucs_status_t status; UCS_ASYNC_BLOCK(&worker->async); - tl_bitmap = ucp_ep_get_tl_bitmap(ep); - /* make sure that all lanes are created on correct device */ - ucs_assert_always(pack_args->field_mask & - UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME); - ucs_assert(!(tl_bitmap & ~ucp_context_dev_tl_bitmap(worker->context, - pack_args->dev_name))); + UCP_EP_CM_CALLBACK_ENTER(ep, ucp_ep_get_cm_uct_ep(ep), + { + status = UCS_ERR_NOT_CONNECTED; + goto out; + }); - status = ucp_address_pack(worker, ep, tl_bitmap, - UCP_ADDRESS_PACK_FLAGS_CM_DEFAULT, NULL, - &ucp_addr_size, &ucp_addr); - if (status != UCS_OK) { - goto out; - } + ucp_ep_get_tl_bitmap(ep, &tl_bitmap); - if (worker->cms[cm_wireup_ep->cm_idx].attr.max_conn_priv < - (sizeof(*sa_data) + ucp_addr_size)) { - status = UCS_ERR_BUFFER_TOO_SMALL; - goto 
free_addr; - } + ucp_context_dev_tl_bitmap(worker->context, dev_name, &ctx_tl_bitmap); + ucp_tl_bitmap_validate(&tl_bitmap, &ctx_tl_bitmap); - dev_index = ucp_cm_tl_bitmap_get_dev_idx(worker->context, tl_bitmap); - ucp_cm_priv_data_pack(sa_data, ep, dev_index, ucp_addr, ucp_addr_size); + dev_index = ucp_cm_tl_bitmap_get_dev_idx(worker->context, &tl_bitmap); + status = ucp_cm_ep_priv_data_pack(ep, &tl_bitmap, dev_index, + (void **)data_buf_p, data_buf_size_p); -free_addr: - ucs_free(ucp_addr); out: - if (status == UCS_OK) { - ep->flags |= UCP_EP_FLAG_LOCAL_CONNECTED; - } else { - ucp_worker_set_ep_failed(worker, ep, - &ucp_ep_get_cm_wireup_ep(ep)->super.super, - ucp_ep_get_cm_lane(ep), status); - } - UCS_ASYNC_UNBLOCK(&worker->async); - - return (status == UCS_OK) ? (sizeof(*sa_data) + ucp_addr_size) : status; + return status; } /* @@ -1140,7 +1216,7 @@ static unsigned ucp_cm_server_conn_notify_progress(void *arg) ucs_status_t status; UCS_ASYNC_BLOCK(&ucp_ep->worker->async); - if (!ucp_cm_ep_should_use_wireup_msg(ucp_ep)) { + if (!ucp_ep->worker->context->config.ext.cm_use_all_devices) { ucp_wireup_remote_connected(ucp_ep); } else { status = ucp_wireup_send_pre_request(ucp_ep); @@ -1153,9 +1229,9 @@ static unsigned ucp_cm_server_conn_notify_progress(void *arg) /* * Async callback on a server side which notifies that client is connected. */ -static void ucp_cm_server_conn_notify_cb(uct_ep_h ep, void *arg, - const uct_cm_ep_server_conn_notify_args_t - *notify_args) +static void ucp_cm_server_conn_notify_cb( + uct_ep_h uct_cm_ep, void *arg, + const uct_cm_ep_server_conn_notify_args_t *notify_args) { ucp_ep_h ucp_ep = arg; uct_worker_cb_id_t prog_id = UCS_CALLBACKQ_ID_NULL; @@ -1166,6 +1242,11 @@ static void ucp_cm_server_conn_notify_cb(uct_ep_h ep, void *arg, UCT_CM_EP_SERVER_CONN_NOTIFY_ARGS_FIELD_STATUS); status = notify_args->status; + ucp_ep_update_flags(ucp_ep, UCP_EP_FLAG_SERVER_NOTIFY_CB, 0); + ucs_trace("ep %p flags 0x%x: notify callback invoked, status %s", ucp_ep, + ucp_ep->flags, ucs_status_string(status)); + + UCP_EP_CM_CALLBACK_ENTER(ucp_ep, uct_cm_ep, return); if (status == UCS_OK) { uct_worker_progress_register_safe(ucp_ep->worker->uct, @@ -1185,12 +1266,12 @@ static void ucp_cm_server_conn_notify_cb(uct_ep_h ep, void *arg, ucs_status_t ucp_ep_cm_connect_server_lane(ucp_ep_h ep, uct_listener_h uct_listener, uct_conn_request_h uct_conn_req, - ucp_rsc_index_t cm_idx) + ucp_rsc_index_t cm_idx, + const char *dev_name) { ucp_worker_h worker = ep->worker; ucp_lane_index_t lane = ucp_ep_get_cm_lane(ep); uct_ep_params_t uct_ep_params; - ucp_wireup_ep_t *cm_wireup_ep; uct_ep_h uct_ep; ucs_status_t status; @@ -1203,11 +1284,10 @@ ucs_status_t ucp_ep_cm_connect_server_lane(ucp_ep_h ep, ucs_warn("server ep %p failed to create wireup CM lane, status %s", ep, ucs_status_string(status)); uct_listener_reject(uct_listener, uct_conn_req); - return status; + goto err; } - cm_wireup_ep = ucs_derived_of(ep->uct_eps[lane], ucp_wireup_ep_t); - cm_wireup_ep->cm_idx = cm_idx; + ucp_ep_ext_control(ep)->cm_idx = cm_idx; /* create a server side CM endpoint */ ucs_trace("server ep %p: uct_ep[%d], worker %p, cm_idx=%d, cm=%s", @@ -1217,26 +1297,38 @@ ucs_status_t ucp_ep_cm_connect_server_lane(ucp_ep_h ep, UCT_EP_PARAM_FIELD_CONN_REQUEST | UCT_EP_PARAM_FIELD_USER_DATA | UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS | - UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB | UCT_EP_PARAM_FIELD_SOCKADDR_NOTIFY_CB_SERVER | - UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB; + UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB | + 
UCT_EP_PARAM_FIELD_PRIV_DATA | + UCT_EP_PARAM_FIELD_PRIV_DATA_LENGTH; uct_ep_params.cm = worker->cms[cm_idx].cm; uct_ep_params.user_data = ep; uct_ep_params.conn_request = uct_conn_req; uct_ep_params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; - uct_ep_params.sockaddr_pack_cb = ucp_cm_server_priv_pack_cb; uct_ep_params.sockaddr_cb_server = ucp_cm_server_conn_notify_cb; uct_ep_params.disconnect_cb = ucp_cm_disconnect_cb; + status = ucp_ep_server_init_priv_data(ep, dev_name, + &uct_ep_params.private_data, + &uct_ep_params.private_data_length); + if (status != UCS_OK) { + goto err; + } status = uct_ep_create(&uct_ep_params, &uct_ep); + ucs_free((void*)uct_ep_params.private_data); if (status != UCS_OK) { - /* coverity[leaked_storage] */ - return status; + goto err; } ucp_wireup_ep_set_next_ep(ep->uct_eps[lane], uct_ep); + ucp_ep_update_flags(ep, UCP_EP_FLAG_LOCAL_CONNECTED, 0); return UCS_OK; + +err: + ucp_worker_set_ep_failed(worker, ep, ep->uct_eps[lane], lane, status); + /* coverity[leaked_storage] (uct_ep) */ + return status; } void ucp_ep_cm_disconnect_cm_lane(ucp_ep_h ucp_ep) @@ -1248,9 +1340,11 @@ void ucp_ep_cm_disconnect_cm_lane(ucp_ep_h ucp_ep) /* No reason to try disconnect twice */ ucs_assert(!(ucp_ep->flags & UCP_EP_FLAG_DISCONNECTED_CM_LANE)); ucs_assert(!(ucp_ep->flags & UCP_EP_FLAG_FAILED)); + ucs_assert(ucp_ep_is_cm_local_connected(ucp_ep)); + + ucp_ep_update_flags(ucp_ep, UCP_EP_FLAG_DISCONNECTED_CM_LANE, + UCP_EP_FLAG_LOCAL_CONNECTED); - ucp_ep->flags &= ~UCP_EP_FLAG_LOCAL_CONNECTED; - ucp_ep->flags |= UCP_EP_FLAG_DISCONNECTED_CM_LANE; /* this will invoke @ref ucp_cm_disconnect_cb on remote side */ status = uct_ep_disconnect(uct_cm_ep, 0); if (status != UCS_OK) { @@ -1259,9 +1353,9 @@ void ucp_ep_cm_disconnect_cm_lane(ucp_ep_h ucp_ep) } } -ucp_request_t* ucp_ep_cm_close_request_get(ucp_ep_h ep) +ucp_request_t* ucp_ep_cm_close_request_get(ucp_ep_h ep, const ucp_request_param_t *param) { - ucp_request_t *request = ucp_request_get(ep->worker); + ucp_request_t *request = ucp_request_get_param(ep->worker, param, {return NULL;}); if (request == NULL) { ucs_error("failed to allocate close request for ep %p", ep); @@ -1273,6 +1367,8 @@ ucp_request_t* ucp_ep_cm_close_request_get(ucp_ep_h ep) request->send.ep = ep; request->send.flush.uct_flags = UCT_FLUSH_FLAG_LOCAL; + ucp_request_set_send_callback_param(param, request, send); + return request; } @@ -1288,8 +1384,9 @@ static int ucp_cm_cbs_remove_filter(const ucs_callbackq_elem_t *elem, void *arg) } else { return 0; } - } else if ((elem->cb == ucp_ep_cm_disconnect_progress) || - (elem->cb == ucp_cm_server_conn_notify_progress)) { + } else if ((elem->cb == ucp_ep_cm_disconnect_progress) || + (elem->cb == ucp_cm_server_conn_notify_progress) || + (elem->cb == ucp_cm_client_uct_connect_progress)) { return arg == elem->arg; } else { return 0; diff --git a/src/ucp/wireup/wireup_cm.h b/src/ucp/wireup/wireup_cm.h index edd2648c385..34c924a72d4 100644 --- a/src/ucp/wireup/wireup_cm.h +++ b/src/ucp/wireup/wireup_cm.h @@ -19,21 +19,26 @@ typedef struct ucp_cm_client_connect_progress_arg { } ucp_cm_client_connect_progress_arg_t; -unsigned ucp_cm_ep_init_flags(const ucp_worker_h worker, - const ucp_ep_params_t *params); +unsigned ucp_cm_ep_init_flags(const ucp_ep_params_t *params); int ucp_ep_init_flags_has_cm(unsigned ep_init_flags); +void ucp_cm_client_restore_ep(ucp_wireup_ep_t *wireup_cm_ep, ucp_ep_h ucp_ep); + ucs_status_t ucp_ep_cm_connect_server_lane(ucp_ep_h ep, uct_listener_h uct_listener, uct_conn_request_h uct_conn_req, - ucp_rsc_index_t 
cm_idx); + ucp_rsc_index_t cm_idx, + const char *dev_name); ucs_status_t ucp_ep_client_cm_connect_start(ucp_ep_h ucp_ep, const ucp_ep_params_t *params); ucs_status_t ucp_ep_client_cm_create_uct_ep(ucp_ep_h ucp_ep); +int ucp_cm_server_conn_request_progress_cb_pred(const ucs_callbackq_elem_t *elem, + void *arg); + void ucp_cm_server_conn_request_cb(uct_listener_h listener, void *arg, const uct_cm_listener_conn_request_args_t *conn_req_args); @@ -46,7 +51,8 @@ ucp_ep_cm_server_create_connected(ucp_worker_h worker, unsigned ep_init_flags, void ucp_ep_cm_disconnect_cm_lane(ucp_ep_h ucp_ep); -ucp_request_t* ucp_ep_cm_close_request_get(ucp_ep_h ep); +ucp_request_t* ucp_ep_cm_close_request_get(ucp_ep_h ep, + const ucp_request_param_t *param); void ucp_ep_cm_slow_cbq_cleanup(ucp_ep_h ep); diff --git a/src/ucp/wireup/wireup_ep.c b/src/ucp/wireup/wireup_ep.c index 7a5d95c0589..1f494cd1cd5 100644 --- a/src/ucp/wireup/wireup_ep.c +++ b/src/ucp/wireup/wireup_ep.c @@ -138,7 +138,7 @@ ucs_status_t ucp_wireup_ep_progress_pending(uct_pending_req_t *self) status = req->func(req); if (status == UCS_OK) { ucs_atomic_sub32(&wireup_ep->pending_count, 1); - ucs_free(proxy_req); + ucp_request_mem_free(proxy_req); } return status; } @@ -152,12 +152,12 @@ ucp_wireup_ep_pending_req_release(uct_pending_req_t *self, void *arg) ucp_request_t *req; ucs_atomic_sub32(&wireup_ep->pending_count, 1); - + if (proxy_req->send.proxy.req->func == ucp_wireup_msg_progress) { req = ucs_container_of(proxy_req->send.proxy.req, ucp_request_t, send.uct); - ucs_free((void*)req->send.buffer); - ucs_free(req); + ucs_free(req->send.buffer); + ucp_request_mem_free(req); } ucs_free(proxy_req); @@ -176,7 +176,7 @@ static ucs_status_t ucp_wireup_ep_pending_add(uct_ep_h uct_ep, UCS_ASYNC_BLOCK(&worker->async); if (req->func == ucp_wireup_msg_progress) { - proxy_req = ucs_malloc(sizeof(*proxy_req), "ucp_wireup_proxy_req"); + proxy_req = ucp_request_mem_alloc("ucp_wireup_proxy_req"); if (proxy_req == NULL) { status = UCS_ERR_NO_MEMORY; goto out; @@ -194,7 +194,7 @@ static ucs_status_t ucp_wireup_ep_pending_add(uct_ep_h uct_ep, if (status == UCS_OK) { ucs_atomic_add32(&wireup_ep->pending_count, +1); } else { - ucs_free(proxy_req); + ucp_request_mem_free(proxy_req); } } else { ucs_queue_push(&wireup_ep->pending_q, ucp_wireup_ep_req_priv(req)); @@ -207,9 +207,9 @@ static ucs_status_t ucp_wireup_ep_pending_add(uct_ep_h uct_ep, return status; } -static void -ucp_wireup_ep_pending_purge(uct_ep_h uct_ep, uct_pending_purge_callback_t cb, - void *arg) +void ucp_wireup_ep_pending_queue_purge(uct_ep_h uct_ep, + uct_pending_purge_callback_t cb, + void *arg) { ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); ucp_worker_h worker = wireup_ep->super.ucp_ep->worker; @@ -223,6 +223,15 @@ ucp_wireup_ep_pending_purge(uct_ep_h uct_ep, uct_pending_purge_callback_t cb, UCS_ASYNC_UNBLOCK(&worker->async); cb(&ucp_req->send.uct, arg); } +} + +static void +ucp_wireup_ep_pending_purge(uct_ep_h uct_ep, uct_pending_purge_callback_t cb, + void *arg) +{ + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); + + ucp_wireup_ep_pending_queue_purge(uct_ep, cb, arg); if (wireup_ep->pending_count > 0) { uct_ep_pending_purge(ucp_wireup_ep_get_msg_ep(wireup_ep), @@ -253,6 +262,7 @@ UCS_CLASS_DEFINE_NAMED_NEW_FUNC(ucp_wireup_ep_create, ucp_wireup_ep_t, uct_ep_t, void ucp_wireup_ep_set_aux(ucp_wireup_ep_t *wireup_ep, uct_ep_h uct_ep, ucp_rsc_index_t rsc_index) { + ucs_assert(!ucp_wireup_ep_test(uct_ep)); wireup_ep->aux_ep = uct_ep; wireup_ep->aux_rsc_index = rsc_index; } @@ -273,8 
+283,9 @@ ucp_wireup_ep_connect_aux(ucp_wireup_ep_t *wireup_ep, unsigned ep_init_flags, /* select an auxiliary transport which would be used to pass connection * establishment messages. */ - status = ucp_wireup_select_aux_transport(ucp_ep, ep_init_flags, UINT64_MAX, - remote_address, &select_info); + status = ucp_wireup_select_aux_transport(ucp_ep, ep_init_flags, + ucp_tl_bitmap_max, remote_address, + &select_info); if (status != UCS_OK) { return status; } @@ -307,6 +318,38 @@ ucp_wireup_ep_connect_aux(ucp_wireup_ep_t *wireup_ep, unsigned ep_init_flags, return UCS_OK; } +static void ucp_wireup_ep_aux_ep_discarded(void *request, ucs_status_t status, + void *user_data) +{ + ucp_worker_iface_t *wiface = (ucp_worker_iface_t*)user_data; + + /* Make Coverity happy */ + ucs_assert(user_data != NULL); + + ucp_worker_iface_unprogress_ep(wiface); +} + +void ucp_wireup_ep_discard_aux_ep(ucp_wireup_ep_t *wireup_ep, + unsigned ep_flush_flags, + uct_pending_purge_callback_t purge_cb, + void *purge_arg) +{ + ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; + ucp_worker_h worker = ucp_ep->worker; + uct_ep_h aux_ep = wireup_ep->aux_ep; + ucp_worker_iface_t *wiface; + + if (aux_ep == NULL) { + return; + } + + wiface = ucp_worker_iface(worker, wireup_ep->aux_rsc_index); + ucp_wireup_ep_disown(&wireup_ep->super.super, aux_ep); + ucp_worker_discard_uct_ep(ucp_ep, aux_ep, ep_flush_flags, purge_cb, + purge_arg, ucp_wireup_ep_aux_ep_discarded, + wiface); +} + static ucs_status_t ucp_wireup_ep_flush(uct_ep_h uct_ep, unsigned flags, uct_completion_t *comp) { @@ -321,13 +364,56 @@ static ucs_status_t ucp_wireup_ep_flush(uct_ep_h uct_ep, unsigned flags, return UCS_ERR_NO_RESOURCE; } +static ucs_status_t +ucp_wireup_ep_do_check(ucp_ep_h ucp_ep, uct_ep_h uct_ep, + ucp_rsc_index_t rsc_idx, unsigned flags, + uct_completion_t *comp) +{ + ucp_worker_h worker = ucp_ep->worker; + ucp_worker_iface_t *wiface; + + ucs_assert(rsc_idx != UCP_NULL_RESOURCE); + + wiface = ucp_worker_iface(worker, rsc_idx); + if (wiface->attr.cap.flags & UCT_IFACE_FLAG_EP_CHECK) { + return ucp_ep_do_uct_ep_keepalive(ucp_ep, uct_ep, rsc_idx, flags, + comp); + } + + /* if EP_CHECK is not supported by UCT transport, it has to support a + * built-in keepalive mechanism to be able to detect peer failure during + * wireup + */ + ucs_assert(wiface->attr.cap.flags & UCT_IFACE_FLAG_EP_KEEPALIVE); + return UCS_OK; +} + +static ucs_status_t ucp_wireup_ep_check(uct_ep_h uct_ep, unsigned flags, + uct_completion_t *comp) +{ + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); + ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; + + if (wireup_ep->flags & UCP_WIREUP_EP_FLAG_READY) { + return uct_ep_check(wireup_ep->super.uct_ep, flags, comp); + } + + if (wireup_ep->aux_ep != NULL) { + return ucp_wireup_ep_do_check(ucp_ep, wireup_ep->aux_ep, + wireup_ep->aux_rsc_index, + flags, comp); + } + + return UCS_OK; +} + UCS_CLASS_INIT_FUNC(ucp_wireup_ep_t, ucp_ep_h ucp_ep) { static uct_iface_ops_t ops = { .ep_connect_to_ep = ucp_wireup_ep_connect_to_ep, .ep_flush = ucp_wireup_ep_flush, - .ep_check = ucs_empty_function_return_success, + .ep_check = ucp_wireup_ep_check, .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(ucp_wireup_ep_t), .ep_pending_add = ucp_wireup_ep_pending_add, .ep_pending_purge = ucp_wireup_ep_pending_purge, @@ -338,6 +424,7 @@ UCS_CLASS_INIT_FUNC(ucp_wireup_ep_t, ucp_ep_h ucp_ep) .ep_get_bcopy = (uct_ep_get_bcopy_func_t)ucs_empty_function_return_no_resource, .ep_get_zcopy = (uct_ep_get_zcopy_func_t)ucs_empty_function_return_no_resource, .ep_am_short = 
(uct_ep_am_short_func_t)ucs_empty_function_return_no_resource, + .ep_am_short_iov = (uct_ep_am_short_iov_func_t)ucs_empty_function_return_no_resource, .ep_am_bcopy = ucp_wireup_ep_am_bcopy, .ep_am_zcopy = (uct_ep_am_zcopy_func_t)ucs_empty_function_return_no_resource, .ep_tag_eager_short = (uct_ep_tag_eager_short_func_t)ucs_empty_function_return_no_resource, @@ -352,19 +439,21 @@ UCS_CLASS_INIT_FUNC(ucp_wireup_ep_t, ucp_ep_h ucp_ep) .ep_atomic32_fetch = (uct_ep_atomic32_fetch_func_t)ucs_empty_function_return_no_resource, .ep_atomic_cswap32 = (uct_ep_atomic_cswap32_func_t)ucs_empty_function_return_no_resource }; + ucp_lane_index_t lane; UCS_CLASS_CALL_SUPER_INIT(ucp_proxy_ep_t, &ops, ucp_ep, NULL, 0); - self->aux_ep = NULL; - self->sockaddr_ep = NULL; - self->tmp_ep = NULL; - self->aux_rsc_index = UCP_NULL_RESOURCE; - self->sockaddr_rsc_index = UCP_NULL_RESOURCE; - self->pending_count = 0; - self->flags = 0; - self->progress_id = UCS_CALLBACKQ_ID_NULL; - self->cm_idx = UCP_NULL_RESOURCE; + self->aux_ep = NULL; + self->aux_rsc_index = UCP_NULL_RESOURCE; + self->pending_count = 0; + self->flags = 0; + self->progress_id = UCS_CALLBACKQ_ID_NULL; ucs_queue_head_init(&self->pending_q); + UCS_BITMAP_CLEAR(&self->cm_resolve_tl_bitmap); + + for (lane = 0; lane < UCP_MAX_LANES; ++lane) { + self->dst_rsc_indices[lane] = UCP_NULL_RESOURCE; + } UCS_ASYNC_BLOCK(&ucp_ep->worker->async); ucp_worker_flush_ops_count_inc(ucp_ep->worker); @@ -388,25 +477,19 @@ static UCS_CLASS_CLEANUP_FUNC(ucp_wireup_ep_t) uct_worker_progress_unregister_safe(worker->uct, &self->progress_id); if (self->aux_ep != NULL) { - ucp_worker_iface_unprogress_ep(ucp_worker_iface(worker, - self->aux_rsc_index)); ucs_queue_head_init(&tmp_pending_queue); - uct_ep_pending_purge(self->aux_ep, ucp_wireup_pending_purge_cb, - &tmp_pending_queue); - uct_ep_destroy(self->aux_ep); + /* Discard AUX UCT EP to purge all outstanding/pending operations. 
+ * Normally, WIREUP EP should complete all outstanding operations prior to + * destroying WIREUP EP - so, doing flush(CANCEL) won't have any effect, + * but it will make sure that no completions will be received if some + * error was detected */ + ucp_wireup_ep_discard_aux_ep(self, UCT_FLUSH_FLAG_CANCEL, + ucp_wireup_pending_purge_cb, + &tmp_pending_queue); self->aux_ep = NULL; ucp_wireup_replay_pending_requests(ucp_ep, &tmp_pending_queue); } - if (self->sockaddr_ep != NULL) { - uct_ep_destroy(self->sockaddr_ep); - } - - if (self->tmp_ep != NULL) { - ucs_assert(!(self->tmp_ep->flags & UCP_EP_FLAG_USED)); - ucp_ep_disconnected(self->tmp_ep, 1); - } - UCS_ASYNC_BLOCK(&worker->async); ucp_worker_flush_ops_count_dec(worker); UCS_ASYNC_UNBLOCK(&worker->async); @@ -456,9 +539,12 @@ ucs_status_t ucp_wireup_ep_connect(uct_ep_h uct_ep, unsigned ep_init_flags, ucp_proxy_ep_set_uct_ep(&wireup_ep->super, next_ep, 1); - ucs_debug("ep %p: created next_ep %p to %s using " UCT_TL_RESOURCE_DESC_FMT, - ucp_ep, wireup_ep->super.uct_ep, ucp_ep_peer_name(ucp_ep), - UCT_TL_RESOURCE_DESC_ARG(&worker->context->tl_rscs[rsc_index].tl_rsc)); + ucs_debug("ep %p: wireup_ep %p created next_ep %p to %s " + "using " UCT_TL_RESOURCE_DESC_FMT, + ucp_ep, wireup_ep, wireup_ep->super.uct_ep, + ucp_ep_peer_name(ucp_ep), + UCT_TL_RESOURCE_DESC_ARG( + &worker->context->tl_rscs[rsc_index].tl_rsc)); /* we need to create an auxiliary transport only for active messages */ if (connect_aux) { @@ -478,198 +564,17 @@ ucs_status_t ucp_wireup_ep_connect(uct_ep_h uct_ep, unsigned ep_init_flags, return status; } -static ucs_status_t ucp_wireup_ep_pack_sockaddr_aux_tls(ucp_worker_h worker, - const char *dev_name, - uint64_t *tl_bitmap_p, - ucp_address_t **address_p, - size_t *address_length_p) -{ - ucp_context_h context = worker->context; - int tl_id, found_supported_tl = 0; - ucs_status_t status; - uint64_t tl_bitmap = 0; - - /* Find a transport which matches the given dev_name and the user's configuration. 
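The flush(CANCEL) teardown described in the cleanup comment above replaces a direct uct_ep_destroy(): pending requests are purged first, then a cancelling flush is issued, and the endpoint is destroyed only once that flush completes. A self-contained toy model of that ordering, with made-up types rather than the real UCT API:

#include <stddef.h>
#include <stdio.h>

/* Made-up stand-ins for uct_ep_h / completion callbacks, for illustration */
typedef struct toy_ep toy_ep_t;
typedef void (*toy_comp_cb_t)(toy_ep_t *ep);

struct toy_ep {
    int           outstanding; /* in-flight operations */
    toy_comp_cb_t on_flushed;  /* deferred destroy hook */
};

static void toy_ep_destroy(toy_ep_t *ep)
{
    printf("ep %p destroyed\n", (void*)ep);
}

/* Called from the progress loop: once all outstanding operations are
 * reaped, the cancelling flush completes and the EP can be destroyed */
static void toy_ep_progress(toy_ep_t *ep)
{
    if ((ep->outstanding == 0) && (ep->on_flushed != NULL)) {
        ep->on_flushed(ep);
        ep->on_flushed = NULL;
    }
}

/* Discard = purge the pending queue, cancel in-flight operations, and
 * defer the destroy to the flush completion instead of destroying here */
static void toy_ep_discard(toy_ep_t *ep)
{
    ep->outstanding = 0; /* flush(CANCEL): drop in-flight operations */
    ep->on_flushed  = toy_ep_destroy;
    toy_ep_progress(ep);
}

int main(void)
{
    toy_ep_t ep = {.outstanding = 2, .on_flushed = NULL};
    toy_ep_discard(&ep);
    return 0;
}

Deferring the destroy to the completion is the point of the change: it guarantees no late completion can reference an already-freed endpoint.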
- * It also has to be a UCT_IFACE_FLAG_CONNECT_TO_IFACE transport and support - * active messaging for sending a wireup message */ - ucs_for_each_bit(tl_id, context->config.sockaddr_aux_rscs_bitmap) { - if ((!strncmp(context->tl_rscs[tl_id].tl_rsc.dev_name, dev_name, - UCT_DEVICE_NAME_MAX)) && - (ucs_test_all_flags(ucp_worker_iface_get_attr(worker, tl_id)->cap.flags, - UCT_IFACE_FLAG_CONNECT_TO_IFACE | - UCT_IFACE_FLAG_AM_BCOPY))) { - found_supported_tl = 1; - tl_bitmap |= UCS_BIT(tl_id); - } - } - - if (found_supported_tl) { - status = ucp_address_pack(worker, NULL, tl_bitmap, - UCP_ADDRESS_PACK_FLAGS_ALL, NULL, - address_length_p, (void**)address_p); - } else { - ucs_error("no supported sockaddr auxiliary transports found for %s", dev_name); - status = UCS_ERR_UNREACHABLE; - } - - *tl_bitmap_p = tl_bitmap; - return status; -} - -ssize_t ucp_wireup_ep_sockaddr_fill_private_data(void *arg, - const uct_cm_ep_priv_data_pack_args_t - *pack_args, void *priv_data) -{ - ucp_wireup_sockaddr_data_t *sa_data = priv_data; - ucp_wireup_ep_t *wireup_ep = arg; - ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; - ucp_rsc_index_t sockaddr_rsc = wireup_ep->sockaddr_rsc_index; - ucp_worker_h worker = ucp_ep->worker; - ucp_context_h context = worker->context; - size_t address_length, conn_priv_len; - ucp_address_t *worker_address, *rsc_address; - uct_iface_attr_t *attrs; - ucs_status_t status; - uint64_t tl_bitmap; - char aux_tls_str[64]; - const char *dev_name; - - ucs_assert_always(pack_args->field_mask & - UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME); - - dev_name = pack_args->dev_name; - - status = ucp_address_pack(worker, NULL, UINT64_MAX, - UCP_ADDRESS_PACK_FLAGS_ALL, NULL, - &address_length, (void**)&worker_address); - if (status != UCS_OK) { - goto err; - } - - conn_priv_len = sizeof(*sa_data) + address_length; - - /* pack client data */ - ucs_assert((int)ucp_ep_config(ucp_ep)->key.err_mode <= UINT8_MAX); - sa_data->err_mode = ucp_ep_config(ucp_ep)->key.err_mode; - sa_data->ep_id = ucp_ep_local_id(ucp_ep); - sa_data->dev_index = UCP_NULL_RESOURCE; /* Not used */ - - attrs = ucp_worker_iface_get_attr(worker, sockaddr_rsc); - - /* check private data length limitation */ - if (conn_priv_len > attrs->max_conn_priv) { - - /* since the full worker address is too large to fit into the trasnport's - * private data, try to pack sockaddr aux tls to pass in the address */ - status = ucp_wireup_ep_pack_sockaddr_aux_tls(worker, dev_name, - &tl_bitmap, &rsc_address, - &address_length); - if (status != UCS_OK) { - goto err_free_address; - } - - conn_priv_len = sizeof(*sa_data) + address_length; - - /* check the private data length limitation again, now with partial - * resources packed (and not the entire worker address) */ - if (conn_priv_len > attrs->max_conn_priv) { - ucs_error("sockaddr aux resources addresses (%s transports)" - " information (%zu) exceeds max_priv on " - UCT_TL_RESOURCE_DESC_FMT" (%zu)", - ucp_tl_bitmap_str(context, tl_bitmap, aux_tls_str, - sizeof(aux_tls_str)), - conn_priv_len, - UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[sockaddr_rsc].tl_rsc), - attrs->max_conn_priv); - status = UCS_ERR_UNREACHABLE; - ucs_free(rsc_address); - goto err_free_address; - } - - sa_data->addr_mode = UCP_WIREUP_SA_DATA_PARTIAL_ADDR; - memcpy(sa_data + 1, rsc_address, address_length); - ucp_ep->flags |= UCP_EP_FLAG_SOCKADDR_PARTIAL_ADDR; - - ucs_free(rsc_address); - - ucs_trace("sockaddr tl ("UCT_TL_RESOURCE_DESC_FMT") sending partial address: " - "(%s transports) (len=%zu) to server. 
" - "total client priv data len: %zu", - UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[sockaddr_rsc].tl_rsc), - ucp_tl_bitmap_str(context, tl_bitmap, aux_tls_str, - sizeof(aux_tls_str)), - address_length, conn_priv_len); - } else { - sa_data->addr_mode = UCP_WIREUP_SA_DATA_FULL_ADDR; - memcpy(sa_data + 1, worker_address, address_length); - } - - ucp_worker_release_address(worker, worker_address); - return conn_priv_len; - -err_free_address: - ucp_worker_release_address(worker, worker_address); -err: - return status; -} - -ucs_status_t ucp_wireup_ep_connect_to_sockaddr(uct_ep_h uct_ep, - const ucp_ep_params_t *params) -{ - ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); - ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; - ucp_worker_h worker = ucp_ep->worker; - char saddr_str[UCS_SOCKADDR_STRING_LEN]; - uct_ep_params_t uct_ep_params; - ucp_rsc_index_t sockaddr_rsc; - ucp_worker_iface_t *wiface; - ucs_status_t status; - - ucs_assert(ucp_wireup_ep_test(uct_ep)); - - status = ucp_wireup_select_sockaddr_transport(worker->context, - ¶ms->sockaddr, - &sockaddr_rsc); - if (status != UCS_OK) { - goto out; - } - - wiface = ucp_worker_iface(worker, sockaddr_rsc); - - wireup_ep->sockaddr_rsc_index = sockaddr_rsc; - - /* Fill parameters and send connection request using the transport */ - uct_ep_params.field_mask = UCT_EP_PARAM_FIELD_IFACE | - UCT_EP_PARAM_FIELD_USER_DATA | - UCT_EP_PARAM_FIELD_SOCKADDR | - UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS | - UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB; - uct_ep_params.iface = wiface->iface; - uct_ep_params.sockaddr = ¶ms->sockaddr; - uct_ep_params.user_data = wireup_ep; - uct_ep_params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; - uct_ep_params.sockaddr_pack_cb = ucp_wireup_ep_sockaddr_fill_private_data; - status = uct_ep_create(&uct_ep_params, &wireup_ep->sockaddr_ep); - if (status != UCS_OK) { - goto out; - } - - ucs_debug("ep %p connecting to %s", ucp_ep, - ucs_sockaddr_str(params->sockaddr.addr, saddr_str, sizeof(saddr_str))); - status = UCS_OK; - -out: - return status; -} - void ucp_wireup_ep_set_next_ep(uct_ep_h uct_ep, uct_ep_h next_ep) { ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); ucs_assert(wireup_ep != NULL); ucs_assert(wireup_ep->super.uct_ep == NULL); + ucs_assert(!ucp_wireup_ep_test(next_ep)); wireup_ep->flags |= UCP_WIREUP_EP_FLAG_LOCAL_CONNECTED; ucp_proxy_ep_set_uct_ep(&wireup_ep->super, next_ep, 1); + ucs_debug("ep %p: wireup_ep %p set next_ep %p", wireup_ep->super.ucp_ep, + wireup_ep, wireup_ep->super.uct_ep); } uct_ep_h ucp_wireup_ep_extract_next_ep(uct_ep_h uct_ep) @@ -696,19 +601,28 @@ void ucp_wireup_ep_destroy_next_ep(ucp_wireup_ep_t *wireup_ep) ucs_assert(wireup_ep->flags == 0); } -void ucp_wireup_ep_remote_connected(uct_ep_h uct_ep) +void ucp_wireup_ep_mark_ready(uct_ep_h uct_ep) { ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); - ucp_ep_h ucp_ep; ucs_assert(wireup_ep != NULL); ucs_assert(wireup_ep->super.uct_ep != NULL); ucs_assert(wireup_ep->flags & UCP_WIREUP_EP_FLAG_LOCAL_CONNECTED); - ucp_ep = wireup_ep->super.ucp_ep; + ucs_trace("ep %p: wireup ep %p is ready", wireup_ep->super.ucp_ep, + wireup_ep); + wireup_ep->flags |= UCP_WIREUP_EP_FLAG_READY; +} +void ucp_wireup_ep_remote_connected(uct_ep_h uct_ep) +{ + ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); + ucp_ep_h ucp_ep; + + ucp_wireup_ep_mark_ready(uct_ep); + + ucp_ep = wireup_ep->super.ucp_ep; ucs_trace("ep %p: wireup ep %p is remote-connected", ucp_ep, wireup_ep); - wireup_ep->flags |= UCP_WIREUP_EP_FLAG_READY; uct_worker_progress_register_safe(ucp_ep->worker->uct, 
ucp_wireup_ep_progress, wireup_ep, 0, &wireup_ep->progress_id); @@ -721,17 +635,38 @@ int ucp_wireup_ep_test(uct_ep_h uct_ep) { UCS_CLASS_DELETE_FUNC_NAME(ucp_wireup_ep_t); } +int ucp_wireup_aux_ep_is_owner(ucp_wireup_ep_t *wireup_ep, uct_ep_h owned_ep) +{ + ucp_ep_h ucp_ep = wireup_ep->super.ucp_ep; + ucp_lane_index_t cm_lane_idx = ucp_ep_get_cm_lane(ucp_ep); + + return (wireup_ep->aux_ep == owned_ep) || + /* Auxiliary EP can be a WIREUP EP in case it is on the CM lane */ + ((wireup_ep->aux_ep != NULL) && + (cm_lane_idx != UCP_NULL_LANE) && + (ucp_ep->uct_eps[cm_lane_idx] == &wireup_ep->super.super) && + ucp_wireup_ep_is_owner(wireup_ep->aux_ep, owned_ep)); +} + int ucp_wireup_ep_is_owner(uct_ep_h uct_ep, uct_ep_h owned_ep) { - ucp_wireup_ep_t *wireup_ep = ucp_wireup_ep(uct_ep); + ucp_wireup_ep_t *wireup_ep; + + if (uct_ep == NULL) { + return 0; + } + wireup_ep = ucp_wireup_ep(uct_ep); if (wireup_ep == NULL) { return 0; } - return (wireup_ep->aux_ep == owned_ep) || - (wireup_ep->sockaddr_ep == owned_ep) || - (wireup_ep->super.uct_ep == owned_ep); + if ((ucp_wireup_aux_ep_is_owner(wireup_ep, owned_ep)) || + (wireup_ep->super.uct_ep == owned_ep)) { + return 1; + } + + return 0; } void ucp_wireup_ep_disown(uct_ep_h uct_ep, uct_ep_h owned_ep) @@ -741,8 +676,6 @@ void ucp_wireup_ep_disown(uct_ep_h uct_ep, uct_ep_h owned_ep) { ucs_assert_always(wireup_ep != NULL); if (wireup_ep->aux_ep == owned_ep) { wireup_ep->aux_ep = NULL; - } else if (wireup_ep->sockaddr_ep == owned_ep) { - wireup_ep->sockaddr_ep = NULL; } else if (wireup_ep->super.uct_ep == owned_ep) { ucp_proxy_ep_extract(uct_ep); } diff --git a/src/ucp/wireup/wireup_ep.h b/src/ucp/wireup/wireup_ep.h index 1c211a5d083..40f8858901a 100644 --- a/src/ucp/wireup/wireup_ep.h +++ b/src/ucp/wireup/wireup_ep.h @@ -33,20 +33,19 @@ struct ucp_wireup_ep { ucp_proxy_ep_t super; /**< Derive from ucp_proxy_ep_t */ ucs_queue_head_t pending_q; /**< Queue of pending operations */ uct_ep_h aux_ep; /**< Used to wireup the "real" endpoint */ - uct_ep_h sockaddr_ep; /**< Used for client-server wireup */ - ucp_ep_h tmp_ep; /**< Used by the client for local tls setup */ struct sockaddr_storage cm_remote_sockaddr; /**< sockaddr of the remote peer - used only on the client side in a client-server flow */ - ucp_rsc_index_t cm_idx; /**< If this ucp_wireup_ep wraps a CM ep, - this is the index of the CM resource - on which it was created */ ucp_rsc_index_t aux_rsc_index; /**< Index of auxiliary transport */ - ucp_rsc_index_t sockaddr_rsc_index; /**< Index of sockaddr transport */ volatile uint32_t pending_count; /**< Number of pending wireup operations */ volatile uint32_t flags; /**< Connection state flags */ uct_worker_cb_id_t progress_id; /**< ID of progress function */ unsigned ep_init_flags; /**< UCP wireup EP init flags */ + /**< TLs which are available on client side resolved device */ + ucp_tl_bitmap_t cm_resolve_tl_bitmap; + /**< Destination resource indices used for checking intersection + between two configurations in case of CM */ + ucp_rsc_index_t dst_rsc_indices[UCP_MAX_LANES]; }; @@ -81,8 +80,9 @@ ucs_status_t ucp_wireup_ep_connect(uct_ep_h uct_ep, unsigned ucp_ep_init_flags, unsigned path_index, int connect_aux, const ucp_unpacked_address_t *remote_address); -ucs_status_t ucp_wireup_ep_connect_to_sockaddr(uct_ep_h uct_ep, - const ucp_ep_params_t *params); +void ucp_wireup_ep_pending_queue_purge(uct_ep_h uct_ep, + uct_pending_purge_callback_t cb, + void *arg); void ucp_wireup_ep_set_aux(ucp_wireup_ep_t *wireup_ep, uct_ep_h uct_ep, ucp_rsc_index_t 
rsc_index); @@ -91,16 +91,25 @@ ucs_status_t ucp_wireup_ep_connect_aux(ucp_wireup_ep_t *wireup_ep, unsigned ep_init_flags, const ucp_unpacked_address_t *remote_address); +void ucp_wireup_ep_discard_aux_ep(ucp_wireup_ep_t *wireup_ep, + unsigned ep_flush_flags, + uct_pending_purge_callback_t purge_cb, + void *purge_arg); + void ucp_wireup_ep_set_next_ep(uct_ep_h uct_ep, uct_ep_h next_ep); uct_ep_h ucp_wireup_ep_extract_next_ep(uct_ep_h uct_ep); void ucp_wireup_ep_destroy_next_ep(ucp_wireup_ep_t *wireup_ep); +void ucp_wireup_ep_mark_ready(uct_ep_h uct_ep); + void ucp_wireup_ep_remote_connected(uct_ep_h uct_ep); int ucp_wireup_ep_test(uct_ep_h uct_ep); +int ucp_wireup_aux_ep_is_owner(ucp_wireup_ep_t *wireup_ep, uct_ep_h owned_ep); + int ucp_wireup_ep_is_owner(uct_ep_h uct_ep, uct_ep_h owned_ep); void ucp_wireup_ep_disown(uct_ep_h uct_ep, uct_ep_h owned_ep); diff --git a/src/ucs/Makefile.am b/src/ucs/Makefile.am index 688ffce0f5e..4515884ad90 100644 --- a/src/ucs/Makefile.am +++ b/src/ucs/Makefile.am @@ -5,11 +5,14 @@ # See file LICENSE for terms. # +SUBDIRS = vfs/sock . vfs/fuse + AUTOMAKE_OPTIONS = nostdinc # avoid collision with built-in debug.h lib_LTLIBRARIES = libucs.la bin_PROGRAMS = -libucs_la_CPPFLAGS = $(BASE_CPPFLAGS) -DUCX_MODULE_DIR=\"$(moduledir)\" +libucs_la_CPPFLAGS = $(BASE_CPPFLAGS) -DUCX_MODULE_DIR=\"$(moduledir)\" \ + -DUCX_CONF_DIR=\"$(ucx_conf_dir)\" libucs_la_CFLAGS = $(BASE_CFLAGS) libucs_la_LDFLAGS = -ldl $(NUMA_LIBS) -version-info $(SOVERSION) libucs_ladir = $(includedir)/ucs @@ -24,6 +27,7 @@ nobase_dist_libucs_la_HEADERS = \ algorithm/qsort_r.h \ async/async_fwd.h \ config/global_opts.h \ + config/ini.h \ config/parser.h \ config/types.h \ datastruct/array.h \ @@ -40,6 +44,7 @@ nobase_dist_libucs_la_HEADERS = \ datastruct/string_buffer.h \ datastruct/string_set.h \ debug/log_def.h \ + debug/debug.h \ memory/rcache.h \ memory/memory_type.h \ memory/memtype_cache.h \ @@ -58,11 +63,13 @@ nobase_dist_libucs_la_HEADERS = \ sys/stubs.h \ time/time_def.h \ type/class.h \ + type/param.h \ type/init_once.h \ type/spinlock.h \ type/status.h \ type/thread_mode.h \ type/cpu_set.h \ + vfs/base/vfs_obj.h \ arch/atomic.h \ arch/x86_64/global_opts.h \ arch/x86_64/atomic.h \ @@ -78,6 +85,7 @@ noinst_HEADERS = \ arch/x86_64/cpu.h \ arch/cpu.h \ datastruct/arbiter.h \ + datastruct/bitmap.h \ datastruct/frag_list.h \ datastruct/mpmc.h \ datastruct/mpool.inl \ @@ -89,7 +97,7 @@ noinst_HEADERS = \ datastruct/ptr_map.h \ datastruct/ptr_map.inl \ debug/assert.h \ - debug/debug.h \ + debug/debug_int.h \ debug/log.h \ debug/memtrack.h \ memory/numa.h \ @@ -105,6 +113,8 @@ noinst_HEADERS = \ time/time.h \ time/timerq.h \ time/timer_wheel.h \ + type/serialize.h \ + type/float8.h \ async/async.h \ async/pipe.h \ async/signal.h \ @@ -127,6 +137,7 @@ libucs_la_SOURCES = \ async/thread.c \ config/global_opts.c \ config/ucm_opts.c \ + config/ini.c \ config/parser.c \ datastruct/arbiter.c \ datastruct/array.c \ @@ -165,8 +176,9 @@ libucs_la_SOURCES = \ time/timerq.c \ type/class.c \ type/status.c \ - type/init_once.c \ - type/spinlock.c + type/spinlock.c \ + type/thread_mode.c \ + vfs/base/vfs_obj.c if HAVE_AARCH64_THUNDERX2 libucs_la_SOURCES += \ diff --git a/src/ucs/arch/aarch64/cpu.h b/src/ucs/arch/aarch64/cpu.h index b7a4d277cab..46826330d8d 100644 --- a/src/ucs/arch/aarch64/cpu.h +++ b/src/ucs/arch/aarch64/cpu.h @@ -1,6 +1,7 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016-2020. ALL RIGHTS RESERVED. 
+* Copyright (C) Stony Brook University. 2016-2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -19,6 +20,9 @@ #ifdef __ARM_NEON #include <arm_neon.h> #endif +#ifdef __ARM_FEATURE_SVE +#include <arm_sve.h> +#endif #define UCS_ARCH_CACHE_LINE_SIZE 64 @@ -142,10 +146,11 @@ static inline void ucs_cpu_init() static inline void ucs_arch_wait_mem(void *address) { unsigned long tmp; - asm volatile ("ldxrb %w0, %1 \n" + asm volatile ("ldaxrb %w0, [%1] \n" "wfe \n" : "=&r"(tmp) - : "Q"(address)); + : "r"(address) + : "memory"); } #if !HAVE___CLEAR_CACHE @@ -230,10 +235,30 @@ static inline void ucs_arch_clear_cache(void *start, void *end) { } #endif +#if defined(__ARM_FEATURE_SVE) +static inline void *memcpy_aarch64_sve(void *dest, const void *src, size_t len) +{ + uint8_t *dest_u8 = (uint8_t*) dest; + const uint8_t *src_u8 = (uint8_t*) src; + uint64_t i = 0; + svbool_t pg = svwhilelt_b8_u64(i, (uint64_t)len); + + do { + svst1_u8(pg, &dest_u8[i], svld1_u8(pg, &src_u8[i])); + i += svcntb(); + pg = svwhilelt_b8_u64(i, (uint64_t)len); + } while (svptest_first(svptrue_b8(), pg)); + + return dest; +} +#endif + static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len) { #if defined(HAVE_AARCH64_THUNDERX2) - return __memcpy_thunderx2(dst, src,len); + return __memcpy_thunderx2(dst, src, len); +#elif defined(__ARM_FEATURE_SVE) + return memcpy_aarch64_sve(dst, src, len); #else return memcpy(dst, src, len); #endif @@ -244,6 +269,8 @@ ucs_memcpy_nontemporal(void *dst, const void *src, size_t len) { #if defined(HAVE_AARCH64_THUNDERX2) __memcpy_thunderx2(dst, src,len); +#elif defined(__ARM_FEATURE_SVE) + memcpy_aarch64_sve(dst, src, len); #else memcpy(dst, src, len); #endif diff --git a/src/ucs/arch/cpu.c b/src/ucs/arch/cpu.c index f030036106e..210a49c8e71 100644 --- a/src/ucs/arch/cpu.c +++ b/src/ucs/arch/cpu.c @@ -1,5 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* Copyright (C) Shanghai Zhaoxin Semiconductor Co., Ltd. 2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -63,6 +64,10 @@ const ucs_cpu_builtin_memcpy_t ucs_cpu_builtin_memcpy[UCS_CPU_VENDOR_LAST] = { [UCS_CPU_VENDOR_FUJITSU_ARM] = { .min = UCS_MEMUNITS_INF, .max = UCS_MEMUNITS_INF + }, + [UCS_CPU_VENDOR_ZHAOXIN] = { + .min = UCS_MEMUNITS_INF, + .max = UCS_MEMUNITS_INF } }; diff --git a/src/ucs/arch/cpu.h b/src/ucs/arch/cpu.h index cb317a8db3a..e06f6b95ebb 100644 --- a/src/ucs/arch/cpu.h +++ b/src/ucs/arch/cpu.h @@ -1,6 +1,7 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2016. ALL RIGHTS RESERVED. +* Copyright (C) Shanghai Zhaoxin Semiconductor Co., Ltd. 2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -30,6 +31,9 @@ typedef enum ucs_cpu_model { UCS_CPU_MODEL_ARM_AARCH64, UCS_CPU_MODEL_AMD_NAPLES, UCS_CPU_MODEL_AMD_ROME, + UCS_CPU_MODEL_ZHAOXIN_ZHANGJIANG, + UCS_CPU_MODEL_ZHAOXIN_WUDAOKOU, + UCS_CPU_MODEL_ZHAOXIN_LUJIAZUI, UCS_CPU_MODEL_LAST } ucs_cpu_model_t; @@ -59,6 +63,7 @@ typedef enum ucs_cpu_vendor { UCS_CPU_VENDOR_GENERIC_ARM, UCS_CPU_VENDOR_GENERIC_PPC, UCS_CPU_VENDOR_FUJITSU_ARM, + UCS_CPU_VENDOR_ZHAOXIN, UCS_CPU_VENDOR_LAST } ucs_cpu_vendor_t; diff --git a/src/ucs/arch/x86_64/cpu.c b/src/ucs/arch/x86_64/cpu.c index 5618a2c9a98..29cbc00feef 100644 --- a/src/ucs/arch/x86_64/cpu.c +++ b/src/ucs/arch/x86_64/cpu.c @@ -1,6 +1,7 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. * Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. 
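The predicate-driven SVE copy loop added above can be exercised in isolation; the following standalone harness (not part of the patch) mirrors the same loop, assuming a toolchain with SVE support, e.g. gcc -O2 -march=armv8-a+sve:

#include <arm_sve.h>
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Same loop as memcpy_aarch64_sve() above: the whilelt predicate keeps all
 * lanes active for full vectors and masks off the tail on the last pass */
static void *sve_copy(void *dest, const void *src, size_t len)
{
    uint8_t *dest_u8      = (uint8_t*)dest;
    const uint8_t *src_u8 = (const uint8_t*)src;
    uint64_t i            = 0;
    svbool_t pg           = svwhilelt_b8_u64(i, (uint64_t)len);

    do {
        svst1_u8(pg, &dest_u8[i], svld1_u8(pg, &src_u8[i]));
        i  += svcntb(); /* number of 8-bit lanes in one SVE vector */
        pg  = svwhilelt_b8_u64(i, (uint64_t)len);
    } while (svptest_first(svptrue_b8(), pg));

    return dest;
}

int main(void)
{
    uint8_t src[1000], dst[1000];
    size_t i;

    for (i = 0; i < sizeof(src); ++i) {
        src[i] = (uint8_t)i;
    }

    /* odd length to exercise the predicated tail iteration */
    sve_copy(dst, src, 999);
    assert(memcmp(dst, src, 999) == 0);
    return 0;
}

Because the tail is handled by the predicate rather than a scalar epilogue, the loop is vector-length agnostic and runs unchanged on any SVE implementation.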
+* Copyright (C) Shanghai Zhaoxin Semiconductor Co., Ltd. 2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -19,6 +20,8 @@ #define X86_CPUID_GENUINEINTEL "GenuntelineI" /* GenuineIntel in magic notation */ #define X86_CPUID_AUTHENTICAMD "AuthcAMDenti" /* AuthenticAMD in magic notation */ +#define X86_CPUID_CENTAURHAULS "CentaulsaurH" /* CentaurHauls in magic notation */ +#define X86_CPUID_SHANGHAI " Shai angh" /* Shanghai in magic notation */ #define X86_CPUID_GET_MODEL 0x00000001u #define X86_CPUID_GET_BASE_VALUE 0x00000000u #define X86_CPUID_GET_EXTD_VALUE 0x00000007u @@ -103,7 +106,7 @@ typedef struct ucs_x86_cpu_cache_size_codes { } ucs_x86_cpu_cache_size_codes_t; -ucs_ternary_value_t ucs_arch_x86_enable_rdtsc = UCS_TRY; +ucs_ternary_auto_value_t ucs_arch_x86_enable_rdtsc = UCS_TRY; static const ucs_x86_cpu_cache_info_t x86_cpu_cache[] = { [UCS_CPU_CACHE_L1d] = {.level = 1, .type = X86_CPU_CACHE_TYPE_DATA}, @@ -320,7 +323,7 @@ double ucs_arch_get_clocks_per_sec() ucs_cpu_model_t ucs_arch_get_cpu_model() { - ucs_x86_cpu_version_t version; + ucs_x86_cpu_version_t version = {}; /* Silence static checker */ uint32_t _ebx, _ecx, _edx; uint32_t model, family; @@ -334,53 +337,72 @@ ucs_cpu_model_t ucs_arch_get_cpu_model() if (family == 0xf) { family += version.ext_family; } - if ((family == 0x6) || (family == 0xf) || (family == 0x17)) { + if ((family == 0x6) || (family == 0x7) || (family == 0xf) || (family == 0x17)) { model = (version.ext_model << 4) | model; } - /* Check known CPUs */ - if (family == 0x06) { - switch (model) { - case 0x3a: - case 0x3e: - return UCS_CPU_MODEL_INTEL_IVYBRIDGE; - case 0x2a: - case 0x2d: - return UCS_CPU_MODEL_INTEL_SANDYBRIDGE; - case 0x1a: - case 0x1e: - case 0x1f: - case 0x2e: - return UCS_CPU_MODEL_INTEL_NEHALEM; - case 0x25: - case 0x2c: - case 0x2f: - return UCS_CPU_MODEL_INTEL_WESTMERE; - case 0x3c: - case 0x3f: - case 0x45: - case 0x46: - return UCS_CPU_MODEL_INTEL_HASWELL; - case 0x3d: - case 0x47: - case 0x4f: - case 0x56: - return UCS_CPU_MODEL_INTEL_BROADWELL; - case 0x5e: - case 0x4e: - case 0x55: - return UCS_CPU_MODEL_INTEL_SKYLAKE; - } - } + if (ucs_arch_get_cpu_vendor() == UCS_CPU_VENDOR_ZHAOXIN) { + if (family == 0x06) { + switch (model) { + case 0x0f: + return UCS_CPU_MODEL_ZHAOXIN_ZHANGJIANG; + } + } + + if (family == 0x07) { + switch (model) { + case 0x1b: + return UCS_CPU_MODEL_ZHAOXIN_WUDAOKOU; + case 0x3b: + return UCS_CPU_MODEL_ZHAOXIN_LUJIAZUI; + } + } + } else { + /* Check known CPUs */ + if (family == 0x06) { + switch (model) { + case 0x3a: + case 0x3e: + return UCS_CPU_MODEL_INTEL_IVYBRIDGE; + case 0x2a: + case 0x2d: + return UCS_CPU_MODEL_INTEL_SANDYBRIDGE; + case 0x1a: + case 0x1e: + case 0x1f: + case 0x2e: + return UCS_CPU_MODEL_INTEL_NEHALEM; + case 0x25: + case 0x2c: + case 0x2f: + return UCS_CPU_MODEL_INTEL_WESTMERE; + case 0x3c: + case 0x3f: + case 0x45: + case 0x46: + return UCS_CPU_MODEL_INTEL_HASWELL; + case 0x3d: + case 0x47: + case 0x4f: + case 0x56: + return UCS_CPU_MODEL_INTEL_BROADWELL; + case 0x5e: + case 0x4e: + case 0x55: + return UCS_CPU_MODEL_INTEL_SKYLAKE; + } + } - if (family == 0x17) { - switch (model) { - case 0x29: - return UCS_CPU_MODEL_AMD_NAPLES; - case 0x31: - return UCS_CPU_MODEL_AMD_ROME; + if (family == 0x17) { + switch (model) { + case 0x29: + return UCS_CPU_MODEL_AMD_NAPLES; + case 0x31: + return UCS_CPU_MODEL_AMD_ROME; + } } - } + } + return UCS_CPU_MODEL_UNKNOWN; } @@ -447,7 +469,7 @@ int ucs_arch_get_cpu_flag() ucs_cpu_vendor_t ucs_arch_get_cpu_vendor() { - ucs_x86_cpu_registers 
reg; + ucs_x86_cpu_registers reg = {}; /* Silence static checker */ ucs_x86_cpuid(X86_CPUID_GET_BASE_VALUE, ucs_unaligned_ptr(®.eax), ucs_unaligned_ptr(®.ebx), @@ -456,6 +478,9 @@ ucs_cpu_vendor_t ucs_arch_get_cpu_vendor() return UCS_CPU_VENDOR_INTEL; } else if (!memcmp(reg.id, X86_CPUID_AUTHENTICAMD, sizeof(X86_CPUID_AUTHENTICAMD) - 1)) { return UCS_CPU_VENDOR_AMD; + } else if (!memcmp(reg.id, X86_CPUID_CENTAURHAULS, sizeof(X86_CPUID_CENTAURHAULS) - 1) || + !memcmp(reg.id, X86_CPUID_SHANGHAI, sizeof(X86_CPUID_SHANGHAI) - 1)) { + return UCS_CPU_VENDOR_ZHAOXIN; } return UCS_CPU_VENDOR_UNKNOWN; @@ -470,7 +495,8 @@ static size_t ucs_cpu_memcpy_thresh(size_t user_val, size_t auto_val) if (((ucs_arch_get_cpu_vendor() == UCS_CPU_VENDOR_INTEL) && (ucs_arch_get_cpu_model() >= UCS_CPU_MODEL_INTEL_HASWELL)) || - (ucs_arch_get_cpu_vendor() == UCS_CPU_VENDOR_AMD)) { + (ucs_arch_get_cpu_vendor() == UCS_CPU_VENDOR_AMD) || + (ucs_arch_get_cpu_vendor() == UCS_CPU_VENDOR_ZHAOXIN)) { return auto_val; } else { return UCS_MEMUNITS_INF; @@ -492,9 +518,9 @@ void ucs_cpu_init() ucs_status_t ucs_arch_get_cache_size(size_t *cache_sizes) { + ucs_x86_cpu_registers reg = {}; /* Silence static checker */ ucs_x86_cache_line_reg_info_t cache_info; ucs_x86_cache_line_reg_info_t line_info; - ucs_x86_cpu_registers reg; uint32_t sets; uint32_t i, t, r, l4; uint32_t max_iter; diff --git a/src/ucs/arch/x86_64/cpu.h b/src/ucs/arch/x86_64/cpu.h index 7fc00627e6e..4cb5c0eafbf 100644 --- a/src/ucs/arch/x86_64/cpu.h +++ b/src/ucs/arch/x86_64/cpu.h @@ -42,7 +42,7 @@ BEGIN_C_DECLS #define ucs_memory_cpu_load_fence() ucs_compiler_fence() #define ucs_memory_cpu_wc_fence() asm volatile ("sfence" ::: "memory") -extern ucs_ternary_value_t ucs_arch_x86_enable_rdtsc; +extern ucs_ternary_auto_value_t ucs_arch_x86_enable_rdtsc; double ucs_arch_get_clocks_per_sec(); double ucs_x86_init_tsc_freq(); diff --git a/src/ucs/async/async.c b/src/ucs/async/async.c index b6e361403b4..b0f782909d9 100644 --- a/src/ucs/async/async.c +++ b/src/ucs/async/async.c @@ -11,7 +11,7 @@ #include "async_int.h" #include -#include +#include #include #include @@ -23,8 +23,6 @@ #define UCS_ASYNC_HANDLER_ARG(_h) (_h), (_h)->id, (_h)->refcount, \ ucs_debug_get_symbol_name((_h)->cb) -#define UCS_ASYNC_HANDLER_CALLER_NULL ((pthread_t)-1) - #define UCS_ASYNC_MISSED_QUEUE_SHIFT 32 #define UCS_ASYNC_MISSED_QUEUE_MASK UCS_MASK(UCS_ASYNC_MISSED_QUEUE_SHIFT) @@ -246,10 +244,10 @@ static void ucs_async_handler_invoke(ucs_async_handler_t *handler, * the handler must always be called with async context blocked, so no need * for atomic operations here. 
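A note on the "magic notation" used by the X86_CPUID_* vendor-string constants earlier in this hunk: CPUID leaf 0 returns the 12-byte vendor ID split across EBX, EDX and ECX, while the comparison buffer reads the registers in EBX, ECX, EDX order, so the string literals appear permuted. A standalone illustration of the permutation:

#include <stdio.h>
#include <string.h>

int main(void)
{
    /* CPUID leaf 0: vendor string pieces come back as EBX, EDX, ECX */
    const char vendor[] = "GenuineIntel";
    char magic[12 + 1]  = {0};

    memcpy(magic + 0, vendor + 0, 4); /* EBX = "Genu" */
    memcpy(magic + 4, vendor + 8, 4); /* ECX = "ntel" */
    memcpy(magic + 8, vendor + 4, 4); /* EDX = "ineI" */

    /* prints "GenuntelineI", matching X86_CPUID_GENUINEINTEL above */
    printf("%s\n", magic);
    return 0;
}

The same permutation turns "CentaurHauls" into "CentaulsaurH", which is why a single memcmp() of the raw register block against these constants is sufficient for vendor detection.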
*/ - ucs_assert(handler->caller == UCS_ASYNC_HANDLER_CALLER_NULL); + ucs_assert(handler->caller == UCS_ASYNC_PTHREAD_ID_NULL); handler->caller = pthread_self(); handler->cb(handler->id, events, handler->arg); - handler->caller = UCS_ASYNC_HANDLER_CALLER_NULL; + handler->caller = UCS_ASYNC_PTHREAD_ID_NULL; } static ucs_status_t ucs_async_handler_dispatch(ucs_async_handler_t *handler, @@ -447,7 +445,7 @@ ucs_async_alloc_handler(int min_id, int max_id, ucs_async_mode_t mode, handler->mode = mode; handler->events = events; - handler->caller = UCS_ASYNC_HANDLER_CALLER_NULL; + handler->caller = UCS_ASYNC_PTHREAD_ID_NULL; handler->cb = cb; handler->arg = arg; handler->async = async; diff --git a/src/ucs/async/async.h b/src/ucs/async/async.h index e324574f4cc..9c561e37af2 100644 --- a/src/ucs/async/async.h +++ b/src/ucs/async/async.h @@ -90,12 +90,31 @@ static inline int ucs_async_check_miss(ucs_async_context_t *async) return 0; } +/** + * Returns whether a context is blocked or not. + * + * @param async Event context to check `is_blocked` status for. + */ +static inline int ucs_async_is_blocked(const ucs_async_context_t *async) +{ + if (async->mode == UCS_ASYNC_MODE_THREAD_SPINLOCK) { + return ucs_recursive_spin_is_owner(&async->thread.spinlock, + pthread_self()); + } else if (async->mode == UCS_ASYNC_MODE_THREAD_MUTEX) { + return ucs_recursive_mutex_is_blocked(&async->thread.mutex); + } else if (async->mode == UCS_ASYNC_MODE_SIGNAL) { + return UCS_ASYNC_SIGNAL_IS_RECURSIVELY_BLOCKED(async); + } + + return async->poll_block > 0; +} + /** * Block the async handler (if it's currently running, wait until it exits and * block it then). Used to serialize accesses with the async handler. * - * @param event Event context to block events for. + * @param _async Event context to block events for. * @note This function might wait until a currently running callback returns. */ #define UCS_ASYNC_BLOCK(_async) \ do { \ if ((_async)->mode == UCS_ASYNC_MODE_THREAD_SPINLOCK) { \ ucs_recursive_spin_lock(&(_async)->thread.spinlock); \ } else if ((_async)->mode == UCS_ASYNC_MODE_THREAD_MUTEX) { \ - (void)pthread_mutex_lock(&(_async)->thread.mutex); \ + ucs_recursive_mutex_block(&(_async)->thread.mutex); \ } else if ((_async)->mode == UCS_ASYNC_MODE_SIGNAL) { \ UCS_ASYNC_SIGNAL_BLOCK(_async); \ } else { \ @@ -115,14 +134,14 @@ static inline int ucs_async_check_miss(ucs_async_context_t *async) /** * Unblock asynchronous event delivery, and invoke pending callbacks. * - * @param event Event context to unblock events for. + * @param _async Event context to unblock events for. 
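The new ucs_async_is_blocked() above is mainly useful for assertions. A hypothetical call site (my_ep_t and both functions below are made up for illustration; only the UCS calls are real):

#include <ucs/async/async.h>
#include <ucs/debug/assert.h>

typedef struct my_ep {
    int state; /* state also touched by the async handler */
} my_ep_t;

static void my_ep_cleanup(ucs_async_context_t *async, my_ep_t *ep)
{
    /* callers must already hold the async lock */
    ucs_assert(ucs_async_is_blocked(async));
    ep->state = 0;
}

static void my_ep_close(ucs_async_context_t *async, my_ep_t *ep)
{
    UCS_ASYNC_BLOCK(async); /* serialize with the async handler */
    my_ep_cleanup(async, ep);
    UCS_ASYNC_UNBLOCK(async);
}

This lets internal functions document and enforce their locking contract instead of silently relying on callers to block the context.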
*/ #define UCS_ASYNC_UNBLOCK(_async) \ do { \ if ((_async)->mode == UCS_ASYNC_MODE_THREAD_SPINLOCK) { \ ucs_recursive_spin_unlock(&(_async)->thread.spinlock); \ } else if ((_async)->mode == UCS_ASYNC_MODE_THREAD_MUTEX) { \ - (void)pthread_mutex_unlock(&(_async)->thread.mutex); \ + ucs_recursive_mutex_unblock(&(_async)->thread.mutex); \ } else if ((_async)->mode == UCS_ASYNC_MODE_SIGNAL) { \ UCS_ASYNC_SIGNAL_UNBLOCK(_async); \ } else { \ diff --git a/src/ucs/async/async_fwd.h b/src/ucs/async/async_fwd.h index 68f7f85a149..0078dfe299f 100644 --- a/src/ucs/async/async_fwd.h +++ b/src/ucs/async/async_fwd.h @@ -16,6 +16,10 @@ BEGIN_C_DECLS /** @file async_fwd.h */ + +#define UCS_ASYNC_PTHREAD_ID_NULL ((pthread_t)-1) + + typedef struct ucs_async_context ucs_async_context_t; diff --git a/src/ucs/async/signal.c b/src/ucs/async/signal.c index c4689a8174d..c9fac015936 100644 --- a/src/ucs/async/signal.c +++ b/src/ucs/async/signal.c @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include #include @@ -67,7 +67,7 @@ static pid_t ucs_async_signal_context_tid(ucs_async_context_t *async) if (pid == -1) { pid = getpid(); } - return (async == NULL) ? pid : async->signal.tid;; + return (async == NULL) ? pid : async->signal.tid; } static ucs_status_t diff --git a/src/ucs/async/thread.c b/src/ucs/async/thread.c index 37bf46c6da1..3f5c1f0f832 100644 --- a/src/ucs/async/thread.c +++ b/src/ucs/async/thread.c @@ -105,6 +105,8 @@ static void *ucs_async_thread_func(void *arg) cb_arg.thread = thread; cb_arg.is_missed = &is_missed; + ucs_log_set_thread_name("async"); + while (!thread->stop) { num_events = ucs_min(UCS_ASYNC_EPOLL_MAX_EVENTS, ucs_sys_event_set_max_wait_events); @@ -278,11 +280,16 @@ static void ucs_async_thread_spinlock_unblock(ucs_async_context_t *async) static ucs_status_t ucs_async_thread_mutex_init(ucs_async_context_t *async) { pthread_mutexattr_t attr; - int ret; + int ret; + +#if UCS_ENABLE_ASSERT + async->thread.mutex.owner = UCS_ASYNC_PTHREAD_ID_NULL; + async->thread.mutex.count = 0; +#endif pthread_mutexattr_init(&attr); pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); - ret = pthread_mutex_init(&async->thread.mutex, &attr); + ret = pthread_mutex_init(&async->thread.mutex.lock, &attr); if (ret == 0) { return UCS_OK; } @@ -293,7 +300,7 @@ static ucs_status_t ucs_async_thread_mutex_init(ucs_async_context_t *async) static void ucs_async_thread_mutex_cleanup(ucs_async_context_t *async) { - int ret = pthread_mutex_destroy(&async->thread.mutex); + int ret = pthread_mutex_destroy(&async->thread.mutex.lock); if (ret != 0) { ucs_warn("failed to destroy async lock: %s", strerror(ret)); @@ -355,12 +362,24 @@ ucs_async_thread_modify_event_fd(ucs_async_context_t *async, int event_fd, static int ucs_async_thread_mutex_try_block(ucs_async_context_t *async) { - return !pthread_mutex_trylock(&async->thread.mutex); + if (pthread_mutex_trylock(&async->thread.mutex.lock)) { + /* not locked */ + return 0; + } + +#if UCS_ENABLE_ASSERT + /* locked */ + if (async->thread.mutex.count++ == 0) { + async->thread.mutex.owner = pthread_self(); + } +#endif + + return 1; } static void ucs_async_thread_mutex_unblock(ucs_async_context_t *async) { - (void)pthread_mutex_unlock(&async->thread.mutex); + ucs_recursive_mutex_unblock(&async->thread.mutex); } static ucs_status_t ucs_async_thread_add_timer(ucs_async_context_t *async, diff --git a/src/ucs/async/thread.h b/src/ucs/async/thread.h index 73c48536712..29b96fd3c7b 100644 --- a/src/ucs/async/thread.h +++ b/src/ucs/async/thread.h @@ -9,13 +9,60 @@ 
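/*
 * Editorial aside (not part of the diff): the intended calling pattern for
 * the UCS_ASYNC_BLOCK/UCS_ASYNC_UNBLOCK pair documented above. The shared
 * counter and the helper are hypothetical; handlers themselves always run
 * with the context blocked, which is exactly what the assertion in
 * ucs_async_handler_invoke() checks via the new caller field.
 */
#include <ucs/async/async.h>

static int shared_counter; /* hypothetical state also touched by handlers */

static void bump_from_user_thread(ucs_async_context_t *async)
{
    UCS_ASYNC_BLOCK(async);   /* may wait for a running handler to return */
    ++shared_counter;         /* now serialized against async callbacks */
    ucs_assert(ucs_async_is_blocked(async));
    UCS_ASYNC_UNBLOCK(async); /* unblocks and dispatches missed events */
}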
#include #include +#include + + +typedef struct ucs_async_thread_mutex { + pthread_mutex_t lock; +#if UCS_ENABLE_ASSERT + pthread_t owner; + unsigned count; +#endif +} ucs_async_thread_mutex_t; typedef struct ucs_async_thread_context { union { ucs_recursive_spinlock_t spinlock; - pthread_mutex_t mutex; + ucs_async_thread_mutex_t mutex; }; } ucs_async_thread_context_t; + +static UCS_F_ALWAYS_INLINE int +ucs_recursive_mutex_is_blocked(const ucs_async_thread_mutex_t *mutex) +{ +#if UCS_ENABLE_ASSERT + return mutex->owner == pthread_self(); +#else + ucs_fatal("must not be called without assertion"); +#endif +} + +static UCS_F_ALWAYS_INLINE void +ucs_recursive_mutex_block(ucs_async_thread_mutex_t *mutex) +{ + (void)pthread_mutex_lock(&mutex->lock); + +#if UCS_ENABLE_ASSERT + if (mutex->count++ == 0) { + mutex->owner = pthread_self(); + } +#endif +} + +static UCS_F_ALWAYS_INLINE void +ucs_recursive_mutex_unblock(ucs_async_thread_mutex_t *mutex) +{ + ucs_assert(ucs_recursive_mutex_is_blocked(mutex)); + +#if UCS_ENABLE_ASSERT + if (--mutex->count == 0) { + mutex->owner = UCS_ASYNC_PTHREAD_ID_NULL; + } +#endif + + (void)pthread_mutex_unlock(&mutex->lock); +} + #endif diff --git a/src/ucs/config/global_opts.c b/src/ucs/config/global_opts.c index c2f79fc2fa6..930c4ed0195 100644 --- a/src/ucs/config/global_opts.c +++ b/src/ucs/config/global_opts.c @@ -46,6 +46,7 @@ ucs_global_opts_t ucs_global_opts = { .profile_file = "", .stats_filter = { NULL, 0 }, .stats_format = UCS_STATS_FULL, + .vfs_enable = 1, .rcache_check_pfn = 0, .module_dir = UCX_MODULE_DIR, /* defined in Makefile.am */ .module_log_level = UCS_LOG_LEVEL_TRACE, @@ -56,6 +57,7 @@ static const char *ucs_handle_error_modes[] = { [UCS_HANDLE_ERROR_BACKTRACE] = "bt", [UCS_HANDLE_ERROR_FREEZE] = "freeze", [UCS_HANDLE_ERROR_DEBUG] = "debug", + [UCS_HANDLE_ERROR_NONE] = "none", [UCS_HANDLE_ERROR_LAST] = NULL }; @@ -115,8 +117,11 @@ static ucs_config_field_t ucs_global_opts_table[] = { #else "bt", #endif - "Error handling mode. A combination of: 'bt' (print backtrace),\n" - "'freeze' (freeze and wait for a debugger), 'debug' (attach debugger)", + "Error signal handling mode. Either 'none' to disable signal interception,\n" + "or a combination of:\n" + " - 'bt' : Print backtrace\n" + " - 'freeze' : Freeze and wait for a debugger\n" + " - 'debug' : Attach a debugger", ucs_offsetof(ucs_global_opts_t, handle_errors), UCS_CONFIG_TYPE_BITMAP(ucs_handle_error_modes)}, @@ -192,9 +197,12 @@ static ucs_config_field_t ucs_global_opts_table[] = { " agg - like full but there will also be an aggregation between similar counters\n" " summary - all counters will be printed in the same line.", ucs_offsetof(ucs_global_opts_t, stats_format), UCS_CONFIG_TYPE_ENUM(ucs_stats_formats_names)}, - #endif + {"VFS_ENABLE", "y", + "Enable virtual monitoring filesystem", + ucs_offsetof(ucs_global_opts_t, vfs_enable), UCS_CONFIG_TYPE_BOOL}, + #ifdef ENABLE_MEMTRACK {"MEMTRACK_DEST", "", "Destination to output memory tracking report to. 
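/*
 * Editorial aside (not part of the diff): the owner/count bookkeeping that
 * ucs_async_thread_mutex_t adds above, reduced to standard pthreads. The
 * fields exist only under UCS_ENABLE_ASSERT so release builds pay nothing;
 * they can be written without extra synchronization because only the thread
 * that currently holds the lock ever touches them. All names here are
 * hypothetical.
 */
#include <assert.h>
#include <pthread.h>

typedef struct {
    pthread_mutex_t lock;  /* initialized with PTHREAD_MUTEX_RECURSIVE */
    pthread_t       owner; /* meaningful only while count > 0 */
    unsigned        count; /* recursion depth of the owning thread */
} demo_recursive_mutex_t;

static void demo_lock(demo_recursive_mutex_t *m)
{
    pthread_mutex_lock(&m->lock);
    if (m->count++ == 0) {
        m->owner = pthread_self(); /* first acquisition on this thread */
    }
}

static int demo_is_mine(const demo_recursive_mutex_t *m)
{
    /* Exact while we hold the lock; same pattern as
     * ucs_recursive_mutex_is_blocked() above */
    return (m->count > 0) && pthread_equal(m->owner, pthread_self());
}

static void demo_unlock(demo_recursive_mutex_t *m)
{
    assert(demo_is_mine(m));
    if (--m->count == 0) {
        m->owner = (pthread_t)-1; /* mirrors UCS_ASYNC_PTHREAD_ID_NULL */
    }
    pthread_mutex_unlock(&m->lock);
}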
If the value is empty,\n" diff --git a/src/ucs/config/global_opts.h b/src/ucs/config/global_opts.h index 03d54b20b19..d03ed50bb4d 100644 --- a/src/ucs/config/global_opts.h +++ b/src/ucs/config/global_opts.h @@ -115,6 +115,9 @@ typedef struct { /* statistics format options */ ucs_stats_formats_t stats_format; + /* Enable VFS monitoring */ + int vfs_enable; + /* registration cache checks if physical pages are not moved */ unsigned rcache_check_pfn; diff --git a/src/ucs/config/ini.c b/src/ucs/config/ini.c new file mode 100644 index 00000000000..917f6b29beb --- /dev/null +++ b/src/ucs/config/ini.c @@ -0,0 +1,302 @@ +/* inih -- simple .INI file parser + +SPDX-License-Identifier: BSD-3-Clause + +Copyright (C) 2009-2020, Ben Hoyt + +inih is released under the New BSD license (see LICENSE.txt). Go to the project +home page for more info: + +https://github.com/benhoyt/inih + +*/ + +#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS) +#define _CRT_SECURE_NO_WARNINGS +#endif + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include +#include + +#include "ini.h" + +#if !INI_USE_STACK +#if INI_CUSTOM_ALLOCATOR +#include +void* ini_malloc(size_t size); +void ini_free(void* ptr); +void* ini_realloc(void* ptr, size_t size); +#else +#include +#define ini_malloc malloc +#define ini_free free +#define ini_realloc realloc +#endif +#endif + +#define MAX_SECTION 50 +#define MAX_NAME 50 + +/* Used by ini_parse_string() to keep track of string parsing state. */ +typedef struct { + const char* ptr; + size_t num_left; +} ini_parse_string_ctx; + +/* Strip whitespace chars off end of given string, in place. Return s. */ +static char* rstrip(char* s) +{ + char* p = s + strlen(s); + while (p > s && isspace((unsigned char)(*--p))) + *p = '\0'; + return s; +} + +/* Return pointer to first non-whitespace char in given string. */ +static char* lskip(const char* s) +{ + while (*s && isspace((unsigned char)(*s))) + s++; + return (char*)s; +} + +/* Return pointer to first char (of chars) or inline comment in given string, + or pointer to NUL at end of string if neither found. Inline comment must + be prefixed by a whitespace character to register as a comment. */ +static char* find_chars_or_comment(const char* s, const char* chars) +{ +#if INI_ALLOW_INLINE_COMMENTS + int was_space = 0; + while (*s && (!chars || !strchr(chars, *s)) && + !(was_space && strchr(INI_INLINE_COMMENT_PREFIXES, *s))) { + was_space = isspace((unsigned char)(*s)); + s++; + } +#else + while (*s && (!chars || !strchr(chars, *s))) { + s++; + } +#endif + return (char*)s; +} + +/* Similar to strncpy, but ensures dest (size bytes) is + NUL-terminated, and doesn't pad with NULs. */ +static char* strncpy0(char* dest, const char* src, size_t size) +{ + /* Could use strncpy internally, but it causes gcc warnings (see issue #91) */ + size_t i; + for (i = 0; i < size - 1 && src[i]; i++) + dest[i] = src[i]; + dest[i] = '\0'; + return dest; +} + +/* See documentation in header file. 
*/ +int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler, + void* user) +{ + /* Uses a fair bit of stack (use heap instead if you need to) */ +#if INI_USE_STACK + char line[INI_MAX_LINE]; + int max_line = INI_MAX_LINE; +#else + char* line; + size_t max_line = INI_INITIAL_ALLOC; +#endif +#if INI_ALLOW_REALLOC && !INI_USE_STACK + char* new_line; + size_t offset; +#endif + char section[MAX_SECTION] = ""; + char prev_name[MAX_NAME] = ""; + + char* start; + char* end; + char* name; + char* value; + int lineno = 0; + int error = 0; + +#if !INI_USE_STACK + line = (char*)ini_malloc(INI_INITIAL_ALLOC); + if (!line) { + return -2; + } +#endif + +#if INI_HANDLER_LINENO +#define HANDLER(u, s, n, v) handler(u, s, n, v, lineno) +#else +#define HANDLER(u, s, n, v) handler(u, s, n, v) +#endif + + /* Scan through stream line by line */ + while (reader(line, (int)max_line, stream) != NULL) { +#if INI_ALLOW_REALLOC && !INI_USE_STACK + offset = strlen(line); + while (offset == max_line - 1 && line[offset - 1] != '\n') { + max_line *= 2; + if (max_line > INI_MAX_LINE) + max_line = INI_MAX_LINE; + new_line = ini_realloc(line, max_line); + if (!new_line) { + ini_free(line); + return -2; + } + line = new_line; + if (reader(line + offset, (int)(max_line - offset), stream) == NULL) + break; + if (max_line >= INI_MAX_LINE) + break; + offset += strlen(line + offset); + } +#endif + + lineno++; + + start = line; +#if INI_ALLOW_BOM + if (lineno == 1 && (unsigned char)start[0] == 0xEF && + (unsigned char)start[1] == 0xBB && + (unsigned char)start[2] == 0xBF) { + start += 3; + } +#endif + start = lskip(rstrip(start)); + + if (strchr(INI_START_COMMENT_PREFIXES, *start)) { + /* Start-of-line comment */ + } +#if INI_ALLOW_MULTILINE + else if (*prev_name && *start && start > line) { + /* Non-blank line with leading whitespace, treat as continuation + of previous name's value (as per Python configparser). */ + if (!HANDLER(user, section, prev_name, start) && !error) + error = lineno; + } +#endif + else if (*start == '[') { + /* A "[section]" line */ + end = find_chars_or_comment(start + 1, "]"); + if (*end == ']') { + *end = '\0'; + strncpy0(section, start + 1, sizeof(section)); + *prev_name = '\0'; +#if INI_CALL_HANDLER_ON_NEW_SECTION + if (!HANDLER(user, section, NULL, NULL) && !error) + error = lineno; +#endif + } + else if (!error) { + /* No ']' found on section line */ + error = lineno; + } + } + else if (*start) { + /* Not a comment, must be a name[=:]value pair */ + end = find_chars_or_comment(start, "=:"); + if (*end == '=' || *end == ':') { + *end = '\0'; + name = rstrip(start); + value = end + 1; +#if INI_ALLOW_INLINE_COMMENTS + end = find_chars_or_comment(value, NULL); + if (*end) + *end = '\0'; +#endif + value = lskip(value); + rstrip(value); + + /* Valid name[=:]value pair found, call handler */ + strncpy0(prev_name, name, sizeof(prev_name)); + if (!HANDLER(user, section, name, value) && !error) + error = lineno; + } + else if (!error) { + /* No '=' or ':' found on name[=:]value line */ +#if INI_ALLOW_NO_VALUE + *end = '\0'; + name = rstrip(start); + if (!HANDLER(user, section, name, NULL) && !error) + error = lineno; +#else + error = lineno; +#endif + } + } + +#if INI_STOP_ON_FIRST_ERROR + if (error) + break; +#endif + } + +#if !INI_USE_STACK + ini_free(line); +#endif + + return error; +} + +/* See documentation in header file. 
*/ +int ini_parse_file(FILE* file, ini_handler handler, void* user) +{ + return ini_parse_stream((ini_reader)fgets, file, handler, user); +} + +/* See documentation in header file. */ +int ini_parse(const char* filename, ini_handler handler, void* user) +{ + FILE* file; + int error; + + file = fopen(filename, "r"); + if (!file) + return -1; + error = ini_parse_file(file, handler, user); + fclose(file); + return error; +} + +/* An ini_reader function to read the next line from a string buffer. This + is the fgets() equivalent used by ini_parse_string(). */ +static char* ini_reader_string(char* str, int num, void* stream) { + ini_parse_string_ctx* ctx = (ini_parse_string_ctx*)stream; + const char* ctx_ptr = ctx->ptr; + size_t ctx_num_left = ctx->num_left; + char* strp = str; + char c; + + if (ctx_num_left == 0 || num < 2) + return NULL; + + while (num > 1 && ctx_num_left != 0) { + c = *ctx_ptr++; + ctx_num_left--; + *strp++ = c; + if (c == '\n') + break; + num--; + } + + *strp = '\0'; + ctx->ptr = ctx_ptr; + ctx->num_left = ctx_num_left; + return str; +} + +/* See documentation in header file. */ +int ini_parse_string(const char* string, ini_handler handler, void* user) { + ini_parse_string_ctx ctx; + + ctx.ptr = string; + ctx.num_left = strlen(string); + return ini_parse_stream((ini_reader)ini_reader_string, &ctx, handler, + user); +} diff --git a/src/ucs/config/ini.h b/src/ucs/config/ini.h new file mode 100644 index 00000000000..7ca3be73e54 --- /dev/null +++ b/src/ucs/config/ini.h @@ -0,0 +1,157 @@ +/* inih -- simple .INI file parser + +SPDX-License-Identifier: BSD-3-Clause + +Copyright (C) 2009-2020, Ben Hoyt + +inih is released under the New BSD license (see LICENSE.txt). Go to the project +home page for more info: + +https://github.com/benhoyt/inih + +*/ + +#ifndef UCS_CONFIG_INI_H +#define UCS_CONFIG_INI_H + +/* Make this header file easier to include in C++ code */ +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* Nonzero if ini_handler callback should accept lineno parameter. */ +#ifndef INI_HANDLER_LINENO +#define INI_HANDLER_LINENO 0 +#endif + +/* Typedef for prototype of handler function. */ +#if INI_HANDLER_LINENO +typedef int (*ini_handler)(void* user, const char* section, + const char* name, const char* value, + int lineno); +#else +typedef int (*ini_handler)(void* user, const char* section, + const char* name, const char* value); +#endif + +/* Typedef for prototype of fgets-style reader function. */ +typedef char* (*ini_reader)(char* str, int num, void* stream); + +/* Parse given INI-style file. May have [section]s, name=value pairs + (whitespace stripped), and comments starting with ';' (semicolon). Section + is "" if name=value pair parsed before any section heading. name:value + pairs are also supported as a concession to Python's configparser. + + For each name=value pair parsed, call handler function with given user + pointer as well as section, name, and value (data only valid for duration + of handler call). Handler should return nonzero on success, zero on error. + + Returns 0 on success, line number of first error on parse error (doesn't + stop on first error), -1 on file open error, or -2 on memory allocation + error (only when INI_USE_STACK is zero). +*/ +int ini_parse(const char* filename, ini_handler handler, void* user); + +/* Same as ini_parse(), but takes a FILE* instead of filename. This doesn't + close the file when it's finished -- the caller must do that. 
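/*
 * Editorial aside (not part of the diff): minimal use of the bundled inih
 * parser. Per the contract above, the handler returns nonzero to continue
 * and zero to report an error, and the section is "" for pairs that appear
 * before any [section] heading.
 */
#include <stdio.h>
#include "ini.h"

static int print_pair(void *user, const char *section, const char *name,
                      const char *value)
{
    printf("[%s] %s = %s\n", section, name, value);
    return 1; /* nonzero: keep parsing */
}

int main(void)
{
    const char *cfg = "; a comment\n"
                      "UCX_TLS = rc,ud\n"
                      "[net]\n"
                      "mtu = 4096\n";

    /* Returns 0 on success, or the line number of the first error */
    return ini_parse_string(cfg, print_pair, NULL);
}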
*/ +int ini_parse_file(FILE* file, ini_handler handler, void* user); + +/* Same as ini_parse(), but takes an ini_reader function pointer instead of + filename. Used for implementing custom or string-based I/O (see also + ini_parse_string). */ +int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler, + void* user); + +/* Same as ini_parse(), but takes a zero-terminated string with the INI data +instead of a file. Useful for parsing INI data from a network socket or +already in memory. */ +int ini_parse_string(const char* string, ini_handler handler, void* user); + +/* Nonzero to allow multi-line value parsing, in the style of Python's + configparser. If allowed, ini_parse() will call the handler with the same + name for each subsequent line parsed. */ +#ifndef INI_ALLOW_MULTILINE +#define INI_ALLOW_MULTILINE 1 +#endif + +/* Nonzero to allow a UTF-8 BOM sequence (0xEF 0xBB 0xBF) at the start of + the file. See https://github.com/benhoyt/inih/issues/21 */ +#ifndef INI_ALLOW_BOM +#define INI_ALLOW_BOM 1 +#endif + +/* Chars that begin a start-of-line comment. Per Python configparser, allow + both ; and # comments at the start of a line by default. */ +#ifndef INI_START_COMMENT_PREFIXES +#define INI_START_COMMENT_PREFIXES ";#" +#endif + +/* Nonzero to allow inline comments (with valid inline comment characters + specified by INI_INLINE_COMMENT_PREFIXES). Set to 0 to turn off and match + Python 3.2+ configparser behaviour. */ +#ifndef INI_ALLOW_INLINE_COMMENTS +#define INI_ALLOW_INLINE_COMMENTS 1 +#endif +#ifndef INI_INLINE_COMMENT_PREFIXES +#define INI_INLINE_COMMENT_PREFIXES ";" +#endif + +/* Nonzero to use stack for line buffer, zero to use heap (malloc/free). */ +#ifndef INI_USE_STACK +#define INI_USE_STACK 1 +#endif + +/* Maximum line length for any line in INI file (stack or heap). Note that + this must be 3 more than the longest line (due to '\r', '\n', and '\0'). */ +#ifndef INI_MAX_LINE +#define INI_MAX_LINE 200 +#endif + +/* Nonzero to allow heap line buffer to grow via realloc(), zero for a + fixed-size buffer of INI_MAX_LINE bytes. Only applies if INI_USE_STACK is + zero. */ +#ifndef INI_ALLOW_REALLOC +#define INI_ALLOW_REALLOC 0 +#endif + +/* Initial size in bytes for heap line buffer. Only applies if INI_USE_STACK + is zero. */ +#ifndef INI_INITIAL_ALLOC +#define INI_INITIAL_ALLOC 200 +#endif + +/* Stop parsing on first error (default is to keep parsing). */ +#ifndef INI_STOP_ON_FIRST_ERROR +#define INI_STOP_ON_FIRST_ERROR 0 +#endif + +/* Nonzero to call the handler at the start of each new section (with + name and value NULL). Default is to only call the handler on + each name=value pair. */ +#ifndef INI_CALL_HANDLER_ON_NEW_SECTION +#define INI_CALL_HANDLER_ON_NEW_SECTION 0 +#endif + +/* Nonzero to allow a name without a value (no '=' or ':' on the line) and + call the handler with value NULL in this case. Default is to treat + no-value lines as an error. */ +#ifndef INI_ALLOW_NO_VALUE +#define INI_ALLOW_NO_VALUE 0 +#endif + +/* Nonzero to use custom ini_malloc, ini_free, and ini_realloc memory + allocation functions (INI_USE_STACK must also be 0). These functions must + have the same signatures as malloc/free/realloc and behave in a similar + way. ini_realloc is only needed if INI_ALLOW_REALLOC is set. 
*/ +#ifndef INI_CUSTOM_ALLOCATOR +#define INI_CUSTOM_ALLOCATOR 0 +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* UCS_CONFIG_INI_H */ diff --git a/src/ucs/config/parser.c b/src/ucs/config/parser.c index 1bf3214e8fc..b3457af3280 100644 --- a/src/ucs/config/parser.c +++ b/src/ucs/config/parser.c @@ -16,8 +16,10 @@ #include #include #include -#include +#include #include +#include +#include #include #include @@ -25,6 +27,8 @@ /* width of titles in docstring */ #define UCS_CONFIG_PARSER_DOCSTR_WIDTH 10 +/* String literal for allow-list */ +#define UCS_CONFIG_PARSER_ALL "all" /* list of prefixes for a configuration variable, used to dump all possible * aliases. @@ -39,6 +43,8 @@ typedef UCS_CONFIG_ARRAY_FIELD(void, data) ucs_config_array_field_t; KHASH_SET_INIT_STR(ucs_config_env_vars) +KHASH_MAP_INIT_STR(ucs_config_map, char*) + /* Process environment variables */ extern char **environ; @@ -46,7 +52,9 @@ extern char **environ; UCS_LIST_HEAD(ucs_config_global_list); static khash_t(ucs_config_env_vars) ucs_config_parser_env_vars = {0}; +static khash_t(ucs_config_map) ucs_config_file_vars = {0}; static pthread_mutex_t ucs_config_parser_env_vars_hash_lock = PTHREAD_MUTEX_INITIALIZER; +static char ucs_config_parser_negate = '^'; const char *ucs_async_mode_names[] = { @@ -236,19 +244,31 @@ int ucs_config_sscanf_ternary(const char *buf, void *dest, const void *arg) if (!strcasecmp(buf, "try") || !strcasecmp(buf, "maybe")) { *(int*)dest = UCS_TRY; return 1; - } else { - return ucs_config_sscanf_bool(buf, dest, arg); } + + return ucs_config_sscanf_bool(buf, dest, arg); } -int ucs_config_sprintf_ternary(char *buf, size_t max, - const void *src, const void *arg) +int ucs_config_sscanf_ternary_auto(const char *buf, void *dest, const void *arg) { - if (*(int*)src == UCS_TRY) { + if (!strcasecmp(buf, UCS_VALUE_AUTO_STR)) { + *(int*)dest = UCS_AUTO; + return 1; + } + + return ucs_config_sscanf_ternary(buf, dest, arg); +} + +int ucs_config_sprintf_ternary_auto(char *buf, size_t max, + const void *src, const void *arg) +{ + if (*(int*)src == UCS_AUTO) { + return snprintf(buf, max, UCS_VALUE_AUTO_STR); + } else if (*(int*)src == UCS_TRY) { return snprintf(buf, max, "try"); - } else { - return ucs_config_sprintf_bool(buf, max, src, arg); } + + return ucs_config_sprintf_bool(buf, max, src, arg); } int ucs_config_sscanf_on_off(const char *buf, void *dest, const void *arg) @@ -433,8 +453,7 @@ int ucs_config_sscanf_time(const char *buf, void *dest, const void *arg) int ucs_config_sprintf_time(char *buf, size_t max, const void *src, const void *arg) { - snprintf(buf, max, "%.2fus", *(double*)src * UCS_USEC_PER_SEC); - return 1; + return snprintf(buf, max, "%.2fus", *(double*)src * UCS_USEC_PER_SEC); } int ucs_config_sscanf_time_units(const char *buf, void *dest, const void *arg) @@ -442,6 +461,14 @@ int ucs_config_sscanf_time_units(const char *buf, void *dest, const void *arg) double value; int ret; + if (!strcmp(buf, "inf")) { + *(ucs_time_t*)dest = UCS_TIME_INFINITY; + return 1; + } else if (!strcmp(buf, "auto")) { + *(ucs_time_t*)dest = UCS_TIME_AUTO; + return 1; + } + ret = ucs_config_sscanf_time(buf, &value, arg); if (ret == 0) { return 0; @@ -454,8 +481,15 @@ int ucs_config_sscanf_time_units(const char *buf, void *dest, const void *arg) int ucs_config_sprintf_time_units(char *buf, size_t max, const void *src, const void *arg) { - double value = ucs_time_to_sec(*(ucs_time_t*)src); + double value; + if (*(ucs_time_t*)src == UCS_TIME_INFINITY) { + return snprintf(buf, max, "inf"); + } else if 
(*(ucs_time_t*)src == UCS_TIME_AUTO) { + return snprintf(buf, max, "auto"); + } + + value = ucs_time_to_sec(*(ucs_time_t*)src); return ucs_config_sprintf_time(buf, max, &value, arg); } @@ -788,10 +822,14 @@ ucs_status_t ucs_config_clone_array(const void *src, void *dest, const void *arg ucs_status_t status; unsigned i; - dest_array->data = ucs_calloc(src_array->count, array->elem_size, - "config array"); - if (dest_array->data == NULL) { - return UCS_ERR_NO_MEMORY; + if (src_array->count > 0) { + dest_array->data = ucs_calloc(src_array->count, array->elem_size, + "config array"); + if (dest_array->data == NULL) { + return UCS_ERR_NO_MEMORY; + } + } else { + dest_array->data = NULL; } dest_array->count = src_array->count; @@ -829,6 +867,86 @@ void ucs_config_help_array(char *buf, size_t max, const void *arg) array->parser.help(buf + strlen(buf), max - strlen(buf), array->parser.arg); } +int ucs_config_sscanf_allow_list(const char *buf, void *dest, const void *arg) +{ + ucs_config_allow_list_t *field = dest; + unsigned offset = 0; + + if (buf[0] == ucs_config_parser_negate) { + field->mode = UCS_CONFIG_ALLOW_LIST_NEGATE; + offset++; + } else { + field->mode = UCS_CONFIG_ALLOW_LIST_ALLOW; + } + + if (!ucs_config_sscanf_array(&buf[offset], &field->array, arg)) { + return 0; + } + + if ((field->array.count >= 1) && + !strcmp(field->array.names[0], UCS_CONFIG_PARSER_ALL)) { + field->mode = UCS_CONFIG_ALLOW_LIST_ALLOW_ALL; + ucs_config_release_array(&field->array, arg); + if (field->array.count != 1) { + return 0; + } + + field->array.count = 0; + } + + return 1; +} + +int ucs_config_sprintf_allow_list(char *buf, size_t max, const void *src, + const void *arg) +{ + const ucs_config_allow_list_t *allow_list = src; + size_t offset = 0; + + if (allow_list->mode == UCS_CONFIG_ALLOW_LIST_ALLOW_ALL) { + snprintf(buf, max, UCS_CONFIG_PARSER_ALL); + return 1; + } + + if (allow_list->mode == UCS_CONFIG_ALLOW_LIST_NEGATE) { + buf[offset++] = ucs_config_parser_negate; + max--; + } + + return ucs_config_sprintf_array(&buf[offset], max, &allow_list->array, arg); +} + +ucs_status_t ucs_config_clone_allow_list(const void *src, void *dest, const void *arg) +{ + const ucs_config_allow_list_t *src_list = src; + ucs_config_allow_list_t *dest_list = dest; + + dest_list->mode = src_list->mode; + return ucs_config_clone_array(&src_list->array, &dest_list->array, arg); +} + +void ucs_config_release_allow_list(void *ptr, const void *arg) +{ + ucs_config_allow_list_t *allow_list = ptr; + + if (allow_list->mode == UCS_CONFIG_ALLOW_LIST_ALLOW_ALL) { + return; + } + + ucs_config_release_array(&allow_list->array, arg); +} + +void ucs_config_help_allow_list(char *buf, size_t max, const void *arg) +{ + const ucs_config_array_t *array = arg; + + snprintf( + buf, + max, "comma-separated list (use \"all\" for including " + "all items or \'^\' for negation) of: "); + array->parser.help(buf + strlen(buf), max - strlen(buf), array->parser.arg); +} + int ucs_config_sscanf_table(const char *buf, void *dest, const void *arg) { char *tokens; @@ -1007,9 +1125,12 @@ ucs_config_parser_set_value_internal(void *opts, ucs_config_field_t *fields, const char *name, const char *value, const char *table_prefix, int recurse) { + char value_buf[256] = ""; ucs_config_field_t *field, *sub_fields; size_t prefix_len; ucs_status_t status; + ucs_status_t UCS_V_UNUSED status_restore; + int UCS_V_UNUSED ret; unsigned count; void *var; @@ -1053,9 +1174,17 @@ ucs_config_parser_set_value_internal(void *opts, ucs_config_field_t *fields, return UCS_ERR_NO_ELEM; 
} + /* backup current value to restore it in case the new value + * is not accepted */ + ret = field->parser.write(value_buf, sizeof(value_buf) - 1, var, + field->parser.arg); + ucs_assert(ret != 0); /* write success */ ucs_config_parser_release_field(field, var); status = ucs_config_parser_parse_field(field, value, var); if (status != UCS_OK) { + status_restore = ucs_config_parser_parse_field(field, value_buf, var); + /* current value must be valid */ + ucs_assert(status_restore == UCS_OK); return status; } ++count; @@ -1065,6 +1194,12 @@ ucs_config_parser_set_value_internal(void *opts, ucs_config_field_t *fields, return (count == 0) ? UCS_ERR_NO_ELEM : UCS_OK; } +static int ucs_config_parser_env_vars_track() +{ + return ucs_global_opts.warn_unused_env_vars || + ucs_log_is_enabled(UCS_LOG_LEVEL_INFO); +} + static void ucs_config_parser_mark_env_var_used(const char *name, int *added) { khiter_t iter; @@ -1073,7 +1208,7 @@ static void ucs_config_parser_mark_env_var_used(const char *name, int *added) *added = 0; - if (!ucs_global_opts.warn_unused_env_vars) { + if (!ucs_config_parser_env_vars_track()) { return; } @@ -1110,9 +1245,71 @@ static void ucs_config_parser_mark_env_var_used(const char *name, int *added) pthread_mutex_unlock(&ucs_config_parser_env_vars_hash_lock); } -static ucs_status_t ucs_config_apply_env_vars(void *opts, ucs_config_field_t *fields, - const char *prefix, const char *table_prefix, - int recurse, int ignore_errors) +static char *ucs_config_get_value_from_config_file(const char *name) +{ + khiter_t iter = kh_get(ucs_config_map, &ucs_config_file_vars, name); + + if (iter == kh_end(&ucs_config_file_vars)) { + return NULL; + } + + return kh_val(&ucs_config_file_vars, iter); +} + +static int ucs_config_parse_config_file_line(void *arg, const char *section, + const char *name, + const char *value) +{ + khiter_t iter = kh_get(ucs_config_map, &ucs_config_file_vars, name); + int override = *(int*)arg; + int result; + + if (iter != kh_end(&ucs_config_file_vars)) { + if (override) { + ucs_free(kh_val(&ucs_config_file_vars, iter)); + } else { + ucs_error("found duplicate '%s' in config map", name); + return 0; + } + } else { + iter = kh_put(ucs_config_map, &ucs_config_file_vars, + ucs_strdup(name, "config_var_name"), &result); + if (result == UCS_KH_PUT_FAILED) { + ucs_error("inserting '%s' to config map failed", name); + return 0; + } + } + + kh_val(&ucs_config_file_vars, iter) = ucs_strdup(value, "config_value"); + return 1; +} + +ucs_status_t ucs_config_parse_config_file(const char *path, int override) +{ + ucs_status_t result = UCS_OK; + int parse_result; + FILE* file; + + file = fopen(path, "r"); + if (file == NULL) { + ucs_debug("Could not open config file: %s, skipping parsing", path); + return UCS_OK; + } + + parse_result = ini_parse_file(file, ucs_config_parse_config_file_line, + &override); + if (parse_result != 0) { + result = UCS_ERR_INVALID_PARAM; + } + + fclose(file); + return result; +} + +static ucs_status_t +ucs_config_apply_config_vars(void *opts, ucs_config_field_t *fields, + const char *prefix, const char *table_prefix, + int recurse, int ignore_errors) { ucs_config_field_t *field, *sub_fields; ucs_status_t status; @@ -1136,8 +1333,9 @@ static ucs_status_t ucs_config_apply_env_vars(void *opts, ucs_config_field_t *fi /* Parse with sub-table prefix */ if (recurse) { - status = ucs_config_apply_env_vars(var, sub_fields, prefix, - field->name, 1, ignore_errors); + status = ucs_config_apply_config_vars(var, sub_fields, prefix, + field->name, 1, + ignore_errors); if 
(status != UCS_OK) { return status; } @@ -1145,8 +1343,9 @@ static ucs_status_t ucs_config_apply_env_vars(void *opts, ucs_config_field_t *fi /* Possible override with my prefix */ if (table_prefix) { - status = ucs_config_apply_env_vars(var, sub_fields, prefix, - table_prefix, 0, ignore_errors); + status = ucs_config_apply_config_vars(var, sub_fields, prefix, + table_prefix, 0, + ignore_errors); if (status != UCS_OK) { return status; } @@ -1154,7 +1353,13 @@ static ucs_status_t ucs_config_apply_env_vars(void *opts, ucs_config_field_t *fi } else { /* Read and parse environment variable */ strncpy(buf + prefix_len, field->name, sizeof(buf) - prefix_len - 1); + + /* Env variable has precedence over file config */ env_value = getenv(buf); + if (env_value == NULL) { + env_value = ucs_config_get_value_from_config_file(buf); + } + if (env_value == NULL) { continue; } @@ -1220,6 +1425,7 @@ ucs_status_t ucs_config_parser_fill_opts(void *opts, ucs_config_field_t *fields, int ignore_errors) { const char *sub_prefix = NULL; + static ucs_init_once_t config_file_parse = UCS_INIT_ONCE_INITIALIZER; ucs_status_t status; /* Set default values */ @@ -1234,18 +1440,24 @@ ucs_status_t ucs_config_parser_fill_opts(void *opts, ucs_config_field_t *fields, goto err; } + UCS_INIT_ONCE(&config_file_parse) { + if (ucs_config_parse_config_file(UCX_CONF_FILE, 0) != UCS_OK) { + ucs_warn("could not parse config file: %s", UCX_CONF_FILE); + } + } + /* Apply environment variables */ if (sub_prefix != NULL) { - status = ucs_config_apply_env_vars(opts, fields, sub_prefix, table_prefix, - 1, ignore_errors); + status = ucs_config_apply_config_vars(opts, fields, sub_prefix, + table_prefix, 1, ignore_errors); if (status != UCS_OK) { goto err_free; } } /* Apply environment variables with custom prefix */ - status = ucs_config_apply_env_vars(opts, fields, env_prefix, table_prefix, - 1, ignore_errors); + status = ucs_config_apply_config_vars(opts, fields, env_prefix, + table_prefix, 1, ignore_errors); if (status != UCS_OK) { goto err_free; } @@ -1386,15 +1598,29 @@ static void __print_stream_cb(int num, const char *line, void *arg) fprintf(stream, "# %s\n", line); } +static int ucs_config_parser_is_default(const char *env_prefix, + const char *prefix, const char *name) +{ + char var_name[128] = {0}; + khiter_t iter; + + ucs_snprintf_safe(var_name, sizeof(var_name) - 1, "%s%s%s", env_prefix, + prefix, name); + iter = kh_get(ucs_config_map, &ucs_config_file_vars, var_name); + return (iter == kh_end(&ucs_config_file_vars)) && + (getenv(var_name) == NULL); +} + static void ucs_config_parser_print_field(FILE *stream, const void *opts, const char *env_prefix, ucs_list_link_t *prefix_list, const char *name, const ucs_config_field_t *field, unsigned long flags, const char *docstr, ...) 
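/*
 * Editorial aside (not part of the diff): how the file-based configuration
 * above is expected to behave. A ucx.conf such as
 *
 *     # UCX library configuration file
 *     UCX_TLS = rc,ud
 *     UCX_LOG_LEVEL = info
 *
 * is parsed once per process (under UCS_INIT_ONCE) into
 * ucs_config_file_vars, and ucs_config_apply_config_vars() consults
 * getenv() first, so running "UCX_LOG_LEVEL=debug ./app" overrides the
 * file's log level while UCX_TLS still comes from the file. A second,
 * hypothetical file can be layered on top with override enabled:
 */
static void demo_layer_local_overrides(void)
{
    /* the path is illustrative only; override=1 replaces duplicate keys */
    if (ucs_config_parse_config_file("/etc/ucx/ucx.conf.local", 1) != UCS_OK) {
        ucs_warn("failed to parse local configuration overrides");
    }
}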
{ - ucs_config_parser_prefix_t *prefix, *head; char value_buf[128] = {0}; char syntax_buf[256] = {0}; + ucs_config_parser_prefix_t *prefix, *head; + char *default_config_prefix; va_list ap; ucs_assert(!ucs_list_is_empty(prefix_list)); @@ -1411,6 +1637,13 @@ ucs_config_parser_print_field(FILE *stream, const void *opts, const char *env_pr field->parser.help(syntax_buf, sizeof(syntax_buf) - 1, field->parser.arg); } + if ((flags & UCS_CONFIG_PRINT_COMMENT_DEFAULT) && + ucs_config_parser_is_default(env_prefix, head->prefix, name)) { + default_config_prefix = "# "; + } else { + default_config_prefix = ""; + } + if (flags & UCS_CONFIG_PRINT_DOC) { fprintf(stream, "#\n"); ucs_config_print_doc_line_by_line(field, __print_stream_cb, stream); @@ -1446,7 +1679,8 @@ ucs_config_parser_print_field(FILE *stream, const void *opts, const char *env_pr fprintf(stream, "#\n"); } - fprintf(stream, "%s%s%s%s\n", env_prefix, head->prefix, name, value_buf); + fprintf(stream, "%s%s%s%s%s\n", default_config_prefix, env_prefix, + head->prefix, name, value_buf); if (flags & UCS_CONFIG_PRINT_DOC) { fprintf(stream, "\n"); @@ -1531,6 +1765,11 @@ void ucs_config_parser_print_opts(FILE *stream, const char *title, const void *o ucs_config_parser_prefix_t table_prefix_elem; UCS_LIST_HEAD(prefix_list); + if (flags & UCS_CONFIG_PRINT_DOC) { + fprintf(stream, "# UCX library configuration file\n"); + fprintf(stream, "# Uncomment to modify values\n"); + } + if (flags & UCS_CONFIG_PRINT_HEADER) { fprintf(stream, "\n"); fprintf(stream, "#\n"); @@ -1589,33 +1828,31 @@ void ucs_config_parser_print_all_opts(FILE *stream, const char *prefix, } } -static void ucs_config_parser_warn_unused_env_vars(const char *prefix) +static void ucs_config_parser_print_env_vars(const char *prefix) { - char unused_env_vars_names[40]; - int num_unused_vars; + int num_unused_vars, num_used_vars; char **envp, *envstr; size_t prefix_len; char *var_name; - char *p, *endp; khiter_t iter; char *saveptr; - int truncated; - int ret; + ucs_string_buffer_t used_vars_strb; + ucs_string_buffer_t unused_vars_strb; - if (!ucs_global_opts.warn_unused_env_vars) { + if (!ucs_config_parser_env_vars_track()) { return; } - pthread_mutex_lock(&ucs_config_parser_env_vars_hash_lock); - prefix_len = strlen(prefix); - p = unused_env_vars_names; - endp = p + sizeof(unused_env_vars_names) - 1; - *endp = '\0'; - truncated = 0; num_unused_vars = 0; + num_used_vars = 0; + + ucs_string_buffer_init(&unused_vars_strb); + ucs_string_buffer_init(&used_vars_strb); + + pthread_mutex_lock(&ucs_config_parser_env_vars_hash_lock); - for (envp = environ; !truncated && (*envp != NULL); ++envp) { + for (envp = environ; *envp != NULL; ++envp) { envstr = ucs_strdup(*envp, "env_str"); if (envstr == NULL) { continue; @@ -1629,33 +1866,40 @@ static void ucs_config_parser_warn_unused_env_vars(const char *prefix) iter = kh_get(ucs_config_env_vars, &ucs_config_parser_env_vars, var_name); if (iter == kh_end(&ucs_config_parser_env_vars)) { - ret = snprintf(p, endp - p, " %s,", var_name); - if (ret > endp - p) { - truncated = 1; - *p = '\0'; - } else { - p += strlen(p); + if (ucs_global_opts.warn_unused_env_vars) { + ucs_string_buffer_appendf(&unused_vars_strb, "%s,", var_name); ++num_unused_vars; } + } else { + ucs_string_buffer_appendf(&used_vars_strb, "%s ", *envp); + ++num_used_vars; } ucs_free(envstr); } + pthread_mutex_unlock(&ucs_config_parser_env_vars_hash_lock); + if (num_unused_vars > 0) { - if (!truncated) { - p[-1] = '\0'; /* remove trailing comma */ - } - ucs_warn("unused env variable%s:%s%s 
(set %s%s=n to suppress this warning)", - (num_unused_vars > 1) ? "s" : "", unused_env_vars_names, - truncated ? "..." : "", UCS_DEFAULT_ENV_PREFIX, - UCS_GLOBAL_OPTS_WARN_UNUSED_CONFIG); + ucs_string_buffer_rtrim(&unused_vars_strb, ","); + ucs_warn("unused env variable%s: %s (set %s%s=n to suppress this warning)", + (num_unused_vars > 1) ? "s" : "", + ucs_string_buffer_cstr(&unused_vars_strb), + UCS_DEFAULT_ENV_PREFIX, UCS_GLOBAL_OPTS_WARN_UNUSED_CONFIG); } - pthread_mutex_unlock(&ucs_config_parser_env_vars_hash_lock); + if (num_used_vars > 0) { + ucs_string_buffer_rtrim(&used_vars_strb, " "); + ucs_info("%s* env variable%s: %s", prefix, + (num_used_vars > 1) ? "s" : "", + ucs_string_buffer_cstr(&used_vars_strb)); + } + + ucs_string_buffer_cleanup(&unused_vars_strb); + ucs_string_buffer_cleanup(&used_vars_strb); } -void ucs_config_parser_warn_unused_env_vars_once(const char *env_prefix) +void ucs_config_parser_print_env_vars_once(const char *env_prefix) { const char *sub_prefix = NULL; int added; @@ -1669,8 +1913,8 @@ void ucs_config_parser_warn_unused_env_vars_once(const char *env_prefix) return; } - ucs_config_parser_warn_unused_env_vars(env_prefix); - + ucs_config_parser_print_env_vars(env_prefix); + status = ucs_config_parser_get_sub_prefix(env_prefix, &sub_prefix); if (status != UCS_OK) { return; @@ -1685,7 +1929,7 @@ void ucs_config_parser_warn_unused_env_vars_once(const char *env_prefix) return; } - ucs_config_parser_warn_unused_env_vars(sub_prefix); + ucs_config_parser_print_env_vars(sub_prefix); } size_t ucs_config_memunits_get(size_t config_size, size_t auto_size, @@ -1714,9 +1958,16 @@ int ucs_config_names_search(ucs_config_names_array_t config_names, UCS_STATIC_CLEANUP { const char *key; + char *value; kh_foreach_key(&ucs_config_parser_env_vars, key, { ucs_free((void*)key); }) kh_destroy_inplace(ucs_config_env_vars, &ucs_config_parser_env_vars); + + kh_foreach(&ucs_config_file_vars, key, value, { + ucs_free((void*)key); + ucs_free(value); + }) + kh_destroy_inplace(ucs_config_map, &ucs_config_file_vars); } diff --git a/src/ucs/config/parser.h b/src/ucs/config/parser.h index 238335e62c6..9cb357d5ecb 100644 --- a/src/ucs/config/parser.h +++ b/src/ucs/config/parser.h @@ -18,6 +18,7 @@ #define UCS_DEFAULT_ENV_PREFIX "UCX_" #define UCS_CONFIG_ARRAY_MAX 128 +#define UCX_CONF_FILE UCX_CONF_DIR "/ucx.conf" BEGIN_C_DECLS @@ -151,7 +152,8 @@ int ucs_config_sscanf_bool(const char *buf, void *dest, const void *arg); int ucs_config_sprintf_bool(char *buf, size_t max, const void *src, const void *arg); int ucs_config_sscanf_ternary(const char *buf, void *dest, const void *arg); -int ucs_config_sprintf_ternary(char *buf, size_t max, const void *src, const void *arg); +int ucs_config_sscanf_ternary_auto(const char *buf, void *dest, const void *arg); +int ucs_config_sprintf_ternary_auto(char *buf, size_t max, const void *src, const void *arg); int ucs_config_sscanf_on_off(const char *buf, void *dest, const void *arg); @@ -202,6 +204,13 @@ ucs_status_t ucs_config_clone_array(const void *src, void *dest, const void *arg void ucs_config_release_array(void *ptr, const void *arg); void ucs_config_help_array(char *buf, size_t max, const void *arg); +int ucs_config_sscanf_allow_list(const char *buf, void *dest, const void *arg); +int ucs_config_sprintf_allow_list(char *buf, size_t max, const void *src, + const void *arg); +ucs_status_t ucs_config_clone_allow_list(const void *src, void *dest, const void *arg); +void ucs_config_release_allow_list(void *ptr, const void *arg); +void 
ucs_config_help_allow_list(char *buf, size_t max, const void *arg); + int ucs_config_sscanf_table(const char *buf, void *dest, const void *arg); ucs_status_t ucs_config_clone_table(const void *src, void *dest, const void *arg); void ucs_config_release_table(void *ptr, const void *arg); @@ -256,9 +265,13 @@ void ucs_config_help_generic(char *buf, size_t max, const void *arg); ucs_config_clone_int, ucs_config_release_nop, \ ucs_config_help_generic, ""} -#define UCS_CONFIG_TYPE_TERNARY {ucs_config_sscanf_ternary, ucs_config_sprintf_ternary, \ - ucs_config_clone_int, ucs_config_release_nop, \ - ucs_config_help_generic, ""} +#define UCS_CONFIG_TYPE_TERNARY {ucs_config_sscanf_ternary, ucs_config_sprintf_ternary_auto, \ + ucs_config_clone_int, ucs_config_release_nop, \ + ucs_config_help_generic, ""} + +#define UCS_CONFIG_TYPE_TERNARY_AUTO {ucs_config_sscanf_ternary_auto, ucs_config_sprintf_ternary_auto, \ + ucs_config_clone_int, ucs_config_release_nop, \ + ucs_config_help_generic, ""} #define UCS_CONFIG_TYPE_ON_OFF {ucs_config_sscanf_on_off, ucs_config_sprintf_on_off_auto, \ ucs_config_clone_int, ucs_config_release_nop, \ @@ -286,7 +299,8 @@ void ucs_config_help_generic(char *buf, size_t max, const void *arg); #define UCS_CONFIG_TYPE_TIME_UNITS {ucs_config_sscanf_time_units, ucs_config_sprintf_time_units, \ ucs_config_clone_ulong, ucs_config_release_nop, \ - ucs_config_help_generic, "time value: [s|us|ms|ns]"} + ucs_config_help_generic, \ + "time value: [s|us|ms|ns], \"inf\", or \"auto\""} #define UCS_CONFIG_TYPE_BW {ucs_config_sscanf_bw, ucs_config_sprintf_bw, \ ucs_config_clone_double, ucs_config_release_nop, \ @@ -315,6 +329,10 @@ void ucs_config_help_generic(char *buf, size_t max, const void *arg); ucs_config_clone_array, ucs_config_release_array, \ ucs_config_help_array, &ucs_config_array_##a} +#define UCS_CONFIG_TYPE_ALLOW_LIST {ucs_config_sscanf_allow_list, ucs_config_sprintf_allow_list, \ + ucs_config_clone_allow_list, ucs_config_release_allow_list, \ + ucs_config_help_allow_list, &ucs_config_array_string} + #define UCS_CONFIG_TYPE_TABLE(t) {ucs_config_sscanf_table, NULL, \ ucs_config_clone_table, ucs_config_release_table, \ ucs_config_help_table, t} @@ -356,6 +374,15 @@ ucs_status_t ucs_config_parser_set_default_values(void *opts, ucs_config_field_t *fields); +/** + * Parse INI configuration file with UCX options. + * + * @param path Path to the configuration file to parse. + * @param override Whether to override values from a previously parsed file + */ +ucs_status_t ucs_config_parse_config_file(const char *path, int override); + + /** * Fill existing opts structure. * @@ -441,14 +468,14 @@ ucs_status_t ucs_config_parser_set_value(void *opts, ucs_config_field_t *fields, const char *name, const char *value); /** - * Wrapper for `ucs_config_parser_warn_unused_env_vars` + * Wrapper for `ucs_config_parser_print_env_vars` * that ensures it is called only once * * @param env_prefix Environment variable prefix. * env_prefix may consist of multiple sub-prefixes */ -void ucs_config_parser_warn_unused_env_vars_once(const char *env_prefix); +void ucs_config_parser_print_env_vars_once(const char *env_prefix); /** * Translate configuration value of "MEMUNITS" type to actual value. diff --git a/src/ucs/config/types.h b/src/ucs/config/types.h index 4a8659aa690..9bc28f8b419 100644 --- a/src/ucs/config/types.h +++ b/src/ucs/config/types.h @@ -48,14 +48,15 @@ extern const char *ucs_async_mode_names[]; /** - * Ternary logic value. + * Ternary logic or Auto value. 
*/ -typedef enum ucs_ternary_value { - UCS_NO = 0, - UCS_YES = 1, - UCS_TRY = 2, +typedef enum ucs_ternary_auto_value { + UCS_NO = 0, + UCS_YES = 1, + UCS_TRY = 2, + UCS_AUTO = 3, UCS_TERNARY_LAST -} ucs_ternary_value_t; +} ucs_ternary_auto_value_t; /** @@ -76,6 +77,7 @@ typedef enum { UCS_HANDLE_ERROR_BACKTRACE, /* Print backtrace */ UCS_HANDLE_ERROR_FREEZE, /* Freeze and wait for a debugger */ UCS_HANDLE_ERROR_DEBUG, /* Attach debugger */ + UCS_HANDLE_ERROR_NONE, /* Do not take any action */ UCS_HANDLE_ERROR_LAST } ucs_handle_error_t; @@ -84,10 +86,11 @@ typedef enum { * Configuration printing flags */ typedef enum { - UCS_CONFIG_PRINT_CONFIG = UCS_BIT(0), - UCS_CONFIG_PRINT_HEADER = UCS_BIT(1), - UCS_CONFIG_PRINT_DOC = UCS_BIT(2), - UCS_CONFIG_PRINT_HIDDEN = UCS_BIT(3) + UCS_CONFIG_PRINT_CONFIG = UCS_BIT(0), + UCS_CONFIG_PRINT_HEADER = UCS_BIT(1), + UCS_CONFIG_PRINT_DOC = UCS_BIT(2), + UCS_CONFIG_PRINT_HIDDEN = UCS_BIT(3), + UCS_CONFIG_PRINT_COMMENT_DEFAULT = UCS_BIT(4) } ucs_config_print_flags_t; @@ -110,6 +113,23 @@ typedef enum { typedef UCS_CONFIG_STRING_ARRAY_FIELD(names) ucs_config_names_array_t; + +/** + * Enum for representing possible modes of an "allow-list" + */ +typedef enum { + UCS_CONFIG_ALLOW_LIST_ALLOW_ALL, /* Allow all possible options */ + UCS_CONFIG_ALLOW_LIST_ALLOW, /* Allow only the specified options */ + UCS_CONFIG_ALLOW_LIST_NEGATE /* Negate (forbid) the specified options */ +} ucs_config_allow_list_mode_t; + + +typedef struct { + ucs_config_names_array_t array; + ucs_config_allow_list_mode_t mode; +} ucs_config_allow_list_t; + + /** * @ingroup UCS_RESOURCE * BSD socket address specification. diff --git a/src/ucs/config/ucm_opts.c b/src/ucs/config/ucm_opts.c index d96ef3846df..bee035a4a9c 100644 --- a/src/ucs/config/ucm_opts.c +++ b/src/ucs/config/ucm_opts.c @@ -49,12 +49,13 @@ static ucs_config_field_t ucm_global_config_table[] = { {"MMAP_HOOK_MODE", UCM_DEFAULT_HOOK_MODE_STR, "MMAP hook mode\n" - " none - don't set mmap hooks.\n" - " reloc - use ELF relocation table to set hooks.\n" + " none - Don't set mmap hooks.\n" + " reloc - Use ELF relocation table to set hooks.\n" #if UCM_BISTRO_HOOKS - " bistro - use binary instrumentation to set hooks.\n" + " bistro - Use binary instrumentation to set hooks.\n" #endif - ,ucs_offsetof(ucm_global_config_t, mmap_hook_mode), UCS_CONFIG_TYPE_ENUM(ucm_mmap_hook_modes)}, + ,ucs_offsetof(ucm_global_config_t, mmap_hook_mode), + UCS_CONFIG_TYPE_ENUM(ucm_mmap_hook_modes)}, {"MALLOC_HOOKS", "yes", "Enable using glibc malloc hooks", @@ -68,10 +69,24 @@ static ucs_config_field_t ucm_global_config_table[] = { "which would use the original implementation and not ours.", ucs_offsetof(ucm_global_config_t, enable_malloc_reloc), UCS_CONFIG_TYPE_BOOL}, + {"CUDA_HOOK_MODE", UCM_DEFAULT_HOOK_MODE_STR, + "Cuda memory hook mode\n" + " none - Don't set Cuda hooks.\n" + " reloc - Use ELF relocation table to set hooks. In this mode, if any\n" + " part of the application is linked with Cuda runtime statically,\n" + " some memory events may be missed and not reported." +#if UCM_BISTRO_HOOKS + " bistro - Use binary instrumentation to set hooks. In this mode, it's\n" + " possible to intercept calls from the Cuda runtime library to\n" + " Cuda driver APIs, so memory events are reported properly even\n" + " for statically-linked applications." 
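/*
 * Editorial aside (not part of the diff): the strings accepted by the new
 * UCS_CONFIG_TYPE_ALLOW_LIST parser earlier in this diff, and the mode each
 * selects in ucs_config_allow_list_t:
 *
 *     "all"      -> UCS_CONFIG_ALLOW_LIST_ALLOW_ALL (array left empty)
 *     "tcp,rc"   -> UCS_CONFIG_ALLOW_LIST_ALLOW     (only listed items)
 *     "^tcp,ud"  -> UCS_CONFIG_ALLOW_LIST_NEGATE    (all but listed items)
 *
 * A hedged sketch of parsing one value directly:
 */
static void demo_parse_allow_list(void)
{
    ucs_config_allow_list_t list;

    if (ucs_config_sscanf_allow_list("^tcp,ud", &list,
                                     &ucs_config_array_string)) {
        ucs_assert(list.mode == UCS_CONFIG_ALLOW_LIST_NEGATE);
        ucs_assert(list.array.count == 2); /* "tcp" and "ud" */
        ucs_config_release_allow_list(&list, &ucs_config_array_string);
    }
}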
+#endif + ,ucs_offsetof(ucm_global_config_t, cuda_hook_mode), + UCS_CONFIG_TYPE_ENUM(ucm_mmap_hook_modes)}, + {"CUDA_RELOC", "yes", - "Enable installing CUDA symbols in the relocation table", - ucs_offsetof(ucm_global_config_t, enable_cuda_reloc), - UCS_CONFIG_TYPE_BOOL}, + "The configuration parameter replaced by UCX_MEM_CUDA_HOOK_MODE", + UCS_CONFIG_DEPRECATED_FIELD_OFFSET, UCS_CONFIG_TYPE_DEPRECATED}, {"DYNAMIC_MMAP_THRESH", "yes", "Enable dynamic mmap threshold: for every released block, the\n" @@ -101,6 +116,9 @@ UCS_CONFIG_REGISTER_TABLE(ucm_global_config_table, "UCM", UCM_CONFIG_PREFIX, ucm_global_config_t, &ucs_config_global_list) UCS_STATIC_INIT { - (void)ucs_config_parser_fill_opts(&ucm_global_opts, ucm_global_config_table, - UCS_DEFAULT_ENV_PREFIX, UCM_CONFIG_PREFIX, 0); + ucm_global_config_t ucm_opts; + (void)ucs_config_parser_fill_opts(&ucm_opts, ucm_global_config_table, + UCS_DEFAULT_ENV_PREFIX, UCM_CONFIG_PREFIX, + 0); + ucm_library_init(&ucm_opts); } diff --git a/config/m4/ucs.m4 b/src/ucs/configure.m4 similarity index 74% rename from config/m4/ucs.m4 rename to src/ucs/configure.m4 index 6722a70d5eb..cc20bb6a33a 100644 --- a/config/m4/ucs.m4 +++ b/src/ucs/configure.m4 @@ -5,6 +5,10 @@ # See file LICENSE for terms. # +ucs_modules="" +m4_include([src/ucs/vfs/sock/configure.m4]) +m4_include([src/ucs/vfs/fuse/configure.m4]) +AC_DEFINE_UNQUOTED([ucs_MODULES], ["${ucs_modules}"], [UCS loadable modules]) # # Internal profiling support. @@ -141,10 +145,19 @@ AC_ARG_ENABLE([logging], [Enable debug logging, default: YES]) ) -AS_IF([test "x$enable_logging" != xno], - [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_TRACE_POLL], [Highest log level])], - [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_INFO], [Highest log level])] - ) +AS_CASE([$enable_logging], + [no], [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_DEBUG], [Highest log level])], + [warn], [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_WARN], [Highest log level])], + [diag], [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_DIAG], [Highest log level])], + [info], [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_INFO], [Highest log level])], + [debug], [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_DEBUG], [Highest log level])], + [trace], [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_TRACE], [Highest log level])], + [trace_req], [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_TRACE_REQ], [Highest log level])], + [trace_data], [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_TRACE_DATA], [Highest log level])], + [trace_async], [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_TRACE_ASYNC], [Highest log level])], + [trace_func], [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_TRACE_FUNC], [Highest log level])], + [trace_poll], [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_TRACE_POLL], [Highest log level])], + [AC_DEFINE([UCS_MAX_LOG_LEVEL], [UCS_LOG_LEVEL_TRACE_POLL], [Highest log level])]) # # Disable assertions @@ -210,17 +223,19 @@ AS_IF([test "x$with_cache_line_size" != xno],[ case ${host} in aarch64*) AC_MSG_CHECKING([support for CNTVCT_EL0 on aarch64]) - AC_RUN_IFELSE([AC_LANG_PROGRAM( - [[#include ]], - [[uint64_t tmp; asm volatile("mrs %0, cntvct_el0" : "=r" (tmp));]])], - [AC_MSG_RESULT([yes])] - [AC_DEFINE([HAVE_HW_TIMER], [1], [high-resolution hardware timer enabled])], - [AC_MSG_RESULT([no])] - [AC_DEFINE([HAVE_HW_TIMER], [0], [high-resolution hardware timer disabled])] - );; + AC_RUN_IFELSE([AC_LANG_PROGRAM([[#include ]], + [[uint64_t tmp; asm volatile("mrs %0, cntvct_el0" : "=r" (tmp)); + ]])], + [AC_MSG_RESULT([yes]) + 
AC_DEFINE([HAVE_HW_TIMER], [1], [high-resolution hardware timer enabled])], + [AC_MSG_RESULT([no]) + AC_DEFINE([HAVE_HW_TIMER], [0], [high-resolution hardware timer disabled])], + [AC_MSG_RESULT([no - cross-compiling detected]) + AC_DEFINE([HAVE_HW_TIMER], [0], [high-resolution hardware timer disabled])] + );; *) # HW timer is supported for all other architectures - AC_DEFINE([HAVE_HW_TIMER], [1], [high-resolution hardware timer disabled]) + AC_DEFINE([HAVE_HW_TIMER], [1], [high-resolution hardware timer enabled]) esac # @@ -240,3 +255,6 @@ AS_IF([test "x$enable_builtin_memcpy" != xno], AC_CHECK_FUNCS([__clear_cache], [], []) AC_CHECK_FUNCS([__aarch64_sync_cache_range], [], []) + + +AC_CONFIG_FILES([src/ucs/Makefile]) diff --git a/src/ucs/datastruct/arbiter.c b/src/ucs/datastruct/arbiter.c index ac327863f8c..049728f72cd 100644 --- a/src/ucs/datastruct/arbiter.c +++ b/src/ucs/datastruct/arbiter.c @@ -77,6 +77,7 @@ void ucs_arbiter_group_push_head_elem_always(ucs_arbiter_group_t *group, ucs_arbiter_elem_t *head; UCS_ARBITER_GROUP_GUARD_CHECK(group); + ucs_assert(!ucs_arbiter_elem_is_scheduled(elem)); ucs_arbiter_group_head_reset(elem); ucs_arbiter_elem_set_scheduled(elem, group); @@ -216,7 +217,7 @@ void ucs_arbiter_group_schedule_nonempty(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *tail = group->tail; ucs_arbiter_elem_t *head; - ucs_assert(tail != NULL); + ucs_assert(!ucs_arbiter_group_is_empty(group)); head = tail->next; ucs_assert(head != NULL); diff --git a/src/ucs/datastruct/array.h b/src/ucs/datastruct/array.h index cffcb4d49be..17362e1ce22 100644 --- a/src/ucs/datastruct/array.h +++ b/src/ucs/datastruct/array.h @@ -41,11 +41,6 @@ BEGIN_C_DECLS * @param _scope Scope for array's functions (e.g 'static inline') */ #define UCS_ARRAY_DECLARE_FUNCS(_name, _index_type, _value_type, _scope) \ - _scope void \ - UCS_ARRAY_IDENTIFIER(_name, _init_dynamic)(ucs_array_t(_name) *array); \ - \ - _scope void \ - UCS_ARRAY_IDENTIFIER(_name, _cleanup_dynamic)(ucs_array_t(_name) *array); \ \ _scope ucs_status_t \ UCS_ARRAY_IDENTIFIER(_name, _reserve)(ucs_array_t(_name) *array, \ @@ -56,6 +51,14 @@ BEGIN_C_DECLS _index_type *index_p) +/** + * Dynamic array initializer. The array storage should be released explicitly by + * calling @ref ucs_array_cleanup_dynamic() + */ +#define UCS_ARRAY_DYNAMIC_INITIALIZER \ + { NULL, 0, 0 } + + /** * Static initializer to create a fixed-length array with existing static buffer * as backing storage. Such array can track the number of elements and check for @@ -73,8 +76,7 @@ BEGIN_C_DECLS * @endcode */ #define UCS_ARRAY_FIXED_INITIALIZER(_buffer, _capacity) \ - { (_buffer), 0, \ - ((_capacity) & UCS_ARRAY_CAP_MASK) | UCS_ARRAY_CAP_FLAG_FIXED } + { (_buffer), 0, ucs_array_init_fixed_capacity(_capacity) } /** @@ -133,25 +135,53 @@ BEGIN_C_DECLS UCS_ARRAY_IDENTIFIER(_name, _t) +/** + * Helper function to initialize capacity field of a fixed-size array + */ +#define ucs_array_init_fixed_capacity(_capacity) \ + (((_capacity) & UCS_ARRAY_CAP_MASK) | UCS_ARRAY_CAP_FLAG_FIXED) + + + /** * Initialize a dynamic array. Such array can grow automatically to accommodate * for more elements. 
* - * @param _name Array name * @param _array Pointer to the array to initialize */ -#define ucs_array_init_dynamic(_name, _array) \ - UCS_ARRAY_IDENTIFIER(_name, _init_dynamic)(_array) +#define ucs_array_init_dynamic(_array) \ + { \ + (_array)->buffer = NULL; \ + (_array)->length = 0; \ + (_array)->capacity = 0; \ + } + + +/** + * Initialize a fixed-size array with existing buffer as backing storage. + * + * @param _array Pointer to the array to initialize + * @param _buffer Buffer to use as backing store + * @param _capacity Buffer capacity + */ +#define ucs_array_init_fixed(_array, _buffer, _capacity) \ + { \ + (_array)->buffer = (_buffer); \ + (_array)->length = 0; \ + (_array)->capacity = ucs_array_init_fixed_capacity(_capacity); \ + } /* * Cleanup a dynamic array. * - * @param _name Array name * @param _array Array to cleanup */ -#define ucs_array_cleanup_dynamic(_name, _array) \ - UCS_ARRAY_IDENTIFIER(_name, _cleanup_dynamic)(_array) +#define ucs_array_cleanup_dynamic(_array) \ + { \ + ucs_assert(!ucs_array_is_fixed(_array)); \ + ucs_free((_array)->buffer); \ + } /* diff --git a/src/ucs/datastruct/array.inl b/src/ucs/datastruct/array.inl index d95d364764b..01f03debbc7 100644 --- a/src/ucs/datastruct/array.inl +++ b/src/ucs/datastruct/array.inl @@ -19,21 +19,6 @@ * @param _scope Scope for array's functions (e.g 'static inline') */ #define UCS_ARRAY_IMPL(_name, _index_type, _value_type, _scope) \ - \ - _scope UCS_F_MAYBE_UNUSED void \ - UCS_ARRAY_IDENTIFIER(_name, _init_dynamic)(ucs_array_t(_name) *array) \ - { \ - array->buffer = NULL; \ - array->length = 0; \ - array->capacity = 0; \ - } \ - \ - _scope UCS_F_MAYBE_UNUSED void \ - UCS_ARRAY_IDENTIFIER(_name, _cleanup_dynamic)(ucs_array_t(_name) *array) \ - { \ - ucs_assert(!ucs_array_is_fixed(array)); \ - ucs_free(array->buffer); \ - } \ \ _scope UCS_F_MAYBE_UNUSED ucs_status_t \ UCS_ARRAY_IDENTIFIER(_name, _grow)(ucs_array_t(_name) *array, \ diff --git a/src/ucs/datastruct/bitmap.h b/src/ucs/datastruct/bitmap.h new file mode 100644 index 00000000000..1962e4412b7 --- /dev/null +++ b/src/ucs/datastruct/bitmap.h @@ -0,0 +1,562 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_BITMAP_H_ +#define UCS_BITMAP_H_ + +#include +#include +#include +#include +#include +#include +#include + +BEGIN_C_DECLS + + +typedef uint64_t ucs_bitmap_word_t; + + +/* + * Number of bits in a single bitmap word + */ +#define UCS_BITMAP_BITS_IN_WORD \ + (sizeof(ucs_bitmap_word_t) * 8) + + +/* + * Fully-set bitmap word + */ +#define UCS_BITMAP_WORD_MASK \ + (~((ucs_bitmap_word_t)0)) + + +/** + * Get the number of words in a given bitmap + * + * @param _bitmap Bitmap to count the words of + * + * @return Number of words + */ +#define _UCS_BITMAP_NUM_WORDS(_bitmap) ucs_static_array_size((_bitmap).bits) + + +/** + * Word index of a bit in bitmap. 
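/*
 * Editorial aside (not part of the diff): the new type-independent
 * init/cleanup convention for ucs_array_t shown above. This assumes an
 * array type named 'idx' with int elements was declared elsewhere via the
 * UCS_ARRAY_DECLARE_* macros; before this change, the array type name had
 * to be repeated at every init/cleanup call site.
 */
static void demo_dynamic_array(void)
{
    ucs_array_t(idx) array;

    ucs_array_init_dynamic(&array);    /* storage grows on demand */
    /* ... append elements ... */
    ucs_array_cleanup_dynamic(&array); /* asserts the array is not fixed */
}

static void demo_fixed_array(void)
{
    int buffer[16];
    ucs_array_t(idx) array;

    /* Borrows 'buffer'; never reallocates, so no cleanup call is needed */
    ucs_array_init_fixed(&array, buffer, 16);
}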
Assert the bitmap is big enough + * + * @param _bitmap Bitmap the bit belongs to + * @param _bit_index Index of this bit relative to the bitmap + * + * @return Index of the word this bit belongs to + */ +#define UCS_BITMAP_WORD_INDEX(_bitmap, _bit_index) \ + _ucs_bitmap_word_index(_UCS_BITMAP_NUM_WORDS(_bitmap), (_bit_index)) + + +static UCS_F_ALWAYS_INLINE size_t +_ucs_bitmap_word_index(size_t bitmap_words, size_t bit_index) +{ + ucs_assert(bit_index < (bitmap_words * UCS_BITMAP_BITS_IN_WORD)); + return bit_index / UCS_BITMAP_BITS_IN_WORD; +} + + +#define _UCS_BITMAP_BIT_IN_WORD_INDEX(_bit_index) \ + ((_bit_index) % UCS_BITMAP_BITS_IN_WORD) + + +#define _UCS_BITMAP_BITS_TO_WORDS(_length) \ + ((((_length) + (UCS_BITMAP_BITS_IN_WORD - 1)) / UCS_BITMAP_BITS_IN_WORD)) + + +#define _UCS_BITMAP_BIT_INDEX(_bit_in_word_index, _word_index) \ + ((_word_index) * UCS_BITMAP_BITS_IN_WORD + (_bit_in_word_index)) + + +#define _UCS_BITMAP_WORD(_bitmap, _word_index) ((_bitmap).bits[_word_index]) + + +#define _UCS_BITMAP_INDEX_IN_BOUNDS_CONDITION(_bitmap, _bit_index) \ + ((_bit_index) < _UCS_BITMAP_NUM_WORDS(_bitmap) * UCS_BITMAP_BITS_IN_WORD) + + +/** + * Given a bitmap and a bit index, get the whole word that contains it + * + * @param _bitmap Take the word from this bitmap + * @param _bit_index Index of the bit for fetching the word + * + * @return The word which contains the bit + */ +#define _UCS_BITMAP_WORD_BY_BIT(_bitmap, _bit_index) \ + _UCS_BITMAP_WORD((_bitmap), UCS_BITMAP_WORD_INDEX(_bitmap, _bit_index)) + + +#define _UCS_BITMAP_WORD_INDEX0(_bit_index) \ + ((_bit_index) & ~(UCS_BITMAP_BITS_IN_WORD - 1)) + + +#define _UCS_BITMAP_GET_NEXT_BIT(_bit_index) \ + (-2ull << (uint64_t)((_bit_index) & (UCS_BITMAP_BITS_IN_WORD - 1))) + + +#define _UCS_BITMAP_FOR_EACH_WORD(_bitmap, _word_index) \ + for (_word_index = 0; _word_index < _UCS_BITMAP_NUM_WORDS(_bitmap); \ + _word_index++) + + +/** + * Check whether all bits of a given lvalue bitmap are set to 0. 
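/*
 * Editorial aside (not part of the diff): the word/offset arithmetic the
 * helpers above implement, spelled out for 64-bit words. Bit 70 lives in
 * word 1 at in-word offset 6, because 70 = 1 * 64 + 6.
 */
#include <assert.h>
#include <stddef.h>

static void demo_bit_split(void)
{
    size_t bit_index = 70;

    assert(bit_index / 64 == 1);     /* what _ucs_bitmap_word_index() returns */
    assert(bit_index % 64 == 6);     /* _UCS_BITMAP_BIT_IN_WORD_INDEX(70) */
    assert((70 + 64 - 1) / 64 == 2); /* _UCS_BITMAP_BITS_TO_WORDS(70) */
    assert((1 * 64) + 6 == 70);      /* _UCS_BITMAP_BIT_INDEX(6, 1) */
}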
+ * + * @param _bitmap Check bits of this bitmap + * + * @return Whether this bitmap consists only of bits set to 0 + */ +#define UCS_BITMAP_IS_ZERO_INPLACE(_bitmap) \ + ucs_bitmap_is_zero((_bitmap), _UCS_BITMAP_NUM_WORDS(*(_bitmap))) + + +/** + * Perform inplace bitwise NOT of a bitmap + * + * @param _bitmap Negate this bitmap + */ +#define UCS_BITMAP_NOT_INPLACE(_bitmap) \ + { \ + size_t _word_index; \ + _UCS_BITMAP_FOR_EACH_WORD(*(_bitmap), _word_index) { \ + _UCS_BITMAP_WORD(*(_bitmap), _word_index) = \ + ~_UCS_BITMAP_WORD(*(_bitmap), _word_index); \ + } \ + } + + +#define _UCS_BITMAP_OP_INPLACE(_bitmap1, _bitmap2, _op) \ + { \ + typeof(*(_bitmap1)) _bitmap2_copy = (_bitmap2); \ + size_t _word_index; \ + _UCS_BITMAP_FOR_EACH_WORD(*(_bitmap1), _word_index) { \ + _UCS_BITMAP_WORD(*(_bitmap1), _word_index) = \ + _UCS_BITMAP_WORD(*(_bitmap1), _word_index) _op \ + _UCS_BITMAP_WORD(_bitmap2_copy, _word_index); \ + } \ + } + + +/** + * Perform inplace bitwise AND of 2 bitmaps, storing the result in the first one + * + * @param _bitmap1 First operand + * @param _bitmap2 Second operand + */ +#define UCS_BITMAP_AND_INPLACE(_bitmap1, _bitmap2) \ + _UCS_BITMAP_OP_INPLACE(_bitmap1, _bitmap2, &) + + +/** + * Perform inplace bitwise OR of 2 bitmaps, storing the result in the first one + * + * @param _bitmap1 First operand + * @param _bitmap2 Second operand + */ +#define UCS_BITMAP_OR_INPLACE(_bitmap1, _bitmap2) \ + _UCS_BITMAP_OP_INPLACE(_bitmap1, _bitmap2, |) + + +/** + * Perform inplace bitwise XOR of 2 bitmaps, storing the result in the first one + * + * @param _bitmap1 First operand + * @param _bitmap2 Second operand + */ +#define UCS_BITMAP_XOR_INPLACE(_bitmap1, _bitmap2) \ + _UCS_BITMAP_OP_INPLACE(_bitmap1, _bitmap2, ^) + + +/** + * Check whether all bits of a given bitmap are set to 0 + * + * @param _bitmap Check bits of this bitmap + * + * @return Whether this bitmap consists only of bits set to 0 + */ +#define UCS_BITMAP_IS_ZERO(_bitmap, _length) \ + UCS_PP_TOKENPASTE3(_ucs_bitmap_, _length, _is_zero)(_bitmap) + + +/** + * Represents an n-bit bitmap, by using an array + * of 64-bit unsigned long integers. 
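+ * For example, instantiating this macro with length 64 defines ucs_bitmap_t(64) together with helpers such as _ucs_bitmap_64_and(), which back the length-parametrized macros below (an illustrative sketch; variable names are hypothetical): + * + * @code{.c} + * ucs_bitmap_t(64) result = UCS_BITMAP_AND(bitmap1, bitmap2, 64); + * @endcode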
+ * + * @param _length Number of bits in the bitmap + */ +#define _UCS_BITMAP_DECLARE_TYPE(_length) \ + typedef struct { \ + ucs_bitmap_word_t bits[_UCS_BITMAP_BITS_TO_WORDS(_length)]; \ + } ucs_bitmap_t(_length); \ + \ + static inline ucs_bitmap_t(_length) \ + _ucs_bitmap_##_length##_not(ucs_bitmap_t(_length) bitmap) \ + { \ + UCS_BITMAP_NOT_INPLACE(&bitmap); \ + return bitmap; \ + } \ + \ + static inline bool _ucs_bitmap_##_length##_is_zero(ucs_bitmap_t(_length) \ + bitmap) \ + { \ + return ucs_bitmap_is_zero(&bitmap, \ + _UCS_BITMAP_BITS_TO_WORDS(_length)); \ + } \ + \ + static inline ucs_bitmap_t(_length) \ + _ucs_bitmap_##_length##_and(ucs_bitmap_t(_length) bitmap1, \ + ucs_bitmap_t(_length) bitmap2) \ + { \ + UCS_BITMAP_AND_INPLACE(&bitmap1, bitmap2); \ + return bitmap1; \ + } \ + \ + static inline ucs_bitmap_t(_length) \ + _ucs_bitmap_##_length##_or(ucs_bitmap_t(_length) bitmap1, \ + ucs_bitmap_t(_length) bitmap2) \ + { \ + UCS_BITMAP_OR_INPLACE(&bitmap1, bitmap2); \ + return bitmap1; \ + } \ + \ + static inline ucs_bitmap_t(_length) \ + _ucs_bitmap_##_length##_xor(ucs_bitmap_t(_length) bitmap1, \ + ucs_bitmap_t(_length) bitmap2) \ + { \ + UCS_BITMAP_XOR_INPLACE(&bitmap1, bitmap2); \ + return bitmap1; \ + } + + +/** + * Expands to bitmap type definition + * + * @param _length Number of bits (as passed to _UCS_BITMAP_DECLARE_TYPE) + * + * Example: + * + * @code{.c} + * ucs_bitmap_t(64) my_bitmap; + * @endcode + */ +#define ucs_bitmap_t(_length) UCS_PP_TOKENPASTE3(ucs_bitmap_, _length, _t) + + +/** + * Get the value of a bit in the bitmap + * + * @param _bitmap Read value from this bitmap + * @param _bit_index Bit index to read + * + * @return Bit value (0 or 1) + */ +#define UCS_BITMAP_GET(_bitmap, _bit_index) \ + (!!(_UCS_BITMAP_WORD_BY_BIT(_bitmap, _bit_index) & \ + UCS_BIT(_UCS_BITMAP_BIT_IN_WORD_INDEX(_bit_index)))) + + +/** + * Set the value of a bit in the bitmap + * + * @param _bitmap Set value in this bitmap + * @param _bit_index Bit index to set + */ +#define UCS_BITMAP_SET(_bitmap, _bit_index) \ + ({ \ + _UCS_BITMAP_WORD_BY_BIT(_bitmap, _bit_index) |= UCS_BIT( \ + _UCS_BITMAP_BIT_IN_WORD_INDEX(_bit_index)); \ + }) + + +/** + * Unset (clear) the value of a bit in the bitmap + * + * @param _bitmap Unset value in this bitmap + * @param _bit_index Bit index to unset + */ +#define UCS_BITMAP_UNSET(_bitmap, _bit_index) \ + ({ \ + _UCS_BITMAP_WORD_BY_BIT(_bitmap, _bit_index) &= ~( \ + UCS_BIT(_UCS_BITMAP_BIT_IN_WORD_INDEX(_bit_index))); \ + }) + + +/** + * Clear a bitmap by setting all its bits to zero + * + * @param _bitmap Clear all bits in this bitmap + */ +#define UCS_BITMAP_CLEAR(_bitmap) \ + memset((_bitmap)->bits, 0, sizeof((_bitmap)->bits)) + + +/** + * Initialize a bitmap by assigning all its bits to zero. 
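+ * For example (the variable name is illustrative): + * + * @code{.c} + * ucs_bitmap_t(128) map = UCS_BITMAP_ZERO; + * @endcode + *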
+ * Use with an assignment operator + */ +#define UCS_BITMAP_ZERO \ + { \ + .bits = { 0 } \ + } + + +/** + * Find the index of the first bit set to 1 in a given bitmap + * + * @param _bitmap Look for the first bit in this bitmap + */ +#define UCS_BITMAP_FFS(_bitmap) \ + ({ \ + size_t _bit_index = UCS_BITMAP_BITS_IN_WORD * \ + _UCS_BITMAP_NUM_WORDS(_bitmap); \ + size_t _word_index, _temp; \ + _UCS_BITMAP_FOR_EACH_WORD(_bitmap, _word_index) { \ + _temp = _UCS_BITMAP_WORD(_bitmap, _word_index); \ + if (_temp != 0) { \ + _bit_index = ucs_ffs64(_temp) + (_word_index * \ + UCS_BITMAP_BITS_IN_WORD); \ + break; \ + } \ + } \ + _bit_index; \ + }) + + +/** + * Return the number of bits set to 1 in a given bitmap + * + * @param _bitmap Check bits number in this bitmap + * + * @return Number of bits set to 1 + */ +#define UCS_BITMAP_POPCOUNT(_bitmap) \ + ({ \ + size_t _word_index = 0, _popcount = 0; \ + _UCS_BITMAP_FOR_EACH_WORD(_bitmap, _word_index) { \ + _popcount += ucs_popcount(_UCS_BITMAP_WORD(_bitmap, _word_index)); \ + } \ + _popcount; \ + }) + + +/** + * Returns the number of bits set to 1 in a given bitmap, + * up to a particular bit index + * + * @param _bitmap Check bits number in this bitmap + * @param _bit_index Check bits up to this bit + * + * @return Number of bits set to 1 + */ +#define UCS_BITMAP_POPCOUNT_UPTO_INDEX(_bitmap, _bit_index) \ + ({ \ + size_t _word_index = 0, _popcount = 0; \ + _UCS_BITMAP_FOR_EACH_WORD(_bitmap, _word_index) { \ + if ((_bit_index) >= ((_word_index) + 1) * UCS_BITMAP_BITS_IN_WORD) { \ + _popcount += ucs_popcount( \ + _UCS_BITMAP_WORD(_bitmap, _word_index)); \ + } else { \ + _popcount += ucs_popcount( \ + _UCS_BITMAP_WORD(_bitmap, _word_index) & \ + (UCS_MASK((_bit_index) % UCS_BITMAP_BITS_IN_WORD))); \ + break; \ + } \ + } \ + _popcount; \ + }) + + +/** + * Return a word-mask for the word at '_word_index' for all the bits up to + * (and not including) '_mask_index'. + * + * @param _bitmap Mask bits in this bitmap + * @param _word_index Index of the word to be masked + * @param _mask_index Mask bits up to this bit index + */ +#define _UCS_BITMAP_MASK_WORD(_bitmap, _word_index, _mask_index) \ + ((_mask_index) > (_word_index) * UCS_BITMAP_BITS_IN_WORD) ? \ + ((((_mask_index) >= ((_word_index) + 1) * UCS_BITMAP_BITS_IN_WORD) ? 
\ + UCS_BITMAP_WORD_MASK : \ + UCS_MASK((_mask_index) % UCS_BITMAP_BITS_IN_WORD))) : 0; \ + + +/** + * Mask a bitmap by setting all bits up to a given index (excluding it) to 1 + * + * @param _bitmap Mask bits in this bitmap + * @param _mask_index Mask all bits up to this index (excluding it) + */ +#define UCS_BITMAP_MASK(_bitmap, _mask_index) \ + { \ + size_t _word_index = 0; \ + \ + ucs_assert((_mask_index) < \ + _UCS_BITMAP_NUM_WORDS(*_bitmap) * UCS_BITMAP_BITS_IN_WORD); \ + UCS_BITMAP_CLEAR(_bitmap); \ + _UCS_BITMAP_FOR_EACH_WORD(*_bitmap, _word_index) { \ + _UCS_BITMAP_WORD(*_bitmap, _word_index) = \ + _UCS_BITMAP_MASK_WORD(*_bitmap, _word_index, (_mask_index)); \ + } \ + } + + +/** + * Set all bits of a given bitmap to 1 + * + * @param _bitmap Set bits in this bitmap + */ +#define UCS_BITMAP_SET_ALL(_bitmap) \ + { \ + size_t _word_index = 0; \ + _UCS_BITMAP_FOR_EACH_WORD(_bitmap, _word_index) { \ + _UCS_BITMAP_WORD(_bitmap, _word_index) = UCS_BITMAP_WORD_MASK; \ + } \ + } + + +/** + * Iterate over all set (1) bits of a given bitmap + * + * @param _bitmap Iterate over bits of this bitmap + * @param _bit_index Bit index (global offset - relative to the whole bitmap) + */ +#define UCS_BITMAP_FOR_EACH_BIT(_bitmap, _bit_index) \ + for (_bit_index = ucs_bitmap_ffs((_bitmap).bits, \ + _UCS_BITMAP_NUM_WORDS(_bitmap), 0); \ + _bit_index < \ + _UCS_BITMAP_NUM_WORDS(_bitmap) * UCS_BITMAP_BITS_IN_WORD; \ + _bit_index = ucs_bitmap_ffs((_bitmap).bits, \ + _UCS_BITMAP_NUM_WORDS(_bitmap), \ + _bit_index + 1)) + + +/** + * Copy the whole contents of a bitmap + * + * @param _dest_bitmap Copy bits to this bitmap + * @param _src_bitmap Copy bits from this bitmap + */ +#define UCS_BITMAP_COPY(_dest_bitmap, _src_bitmap) \ + memcpy((_dest_bitmap).bits, (_src_bitmap).bits, \ + _UCS_BITMAP_NUM_WORDS(_src_bitmap)); + + +/** + * Perform bitwise NOT of a bitmap + * + * @param _bitmap Negate this bitmap + * @param _length Length of the bitmaps (in bits) + * + * @return A new bitmap, which is the negation of the given one + */ +#define UCS_BITMAP_NOT(_bitmap, _length) \ + UCS_PP_TOKENPASTE3(_ucs_bitmap_, _length, _not)(_bitmap) + + +/** + * Perform bitwise AND of 2 bitmaps and return the result + * + * @param _bitmap1 First operand + * @param _bitmap2 Second operand + * @param _length Length of the bitmaps (in bits) + * + * @return A new bitmap, which is the logical AND of the operands + */ +#define UCS_BITMAP_AND(_bitmap1, _bitmap2, _length) \ + UCS_PP_TOKENPASTE3(_ucs_bitmap_, _length, _and) \ + (_bitmap1, _bitmap2) + + +/** + * Perform bitwise OR of 2 bitmaps and return the result + * + * @param _bitmap1 First operand + * @param _bitmap2 Second operand + * @param _length Length of the bitmaps (in bits) + * + * @return A new bitmap, which is the logical OR of the operands + */ +#define UCS_BITMAP_OR(_bitmap1, _bitmap2, _length) \ + UCS_PP_TOKENPASTE3(_ucs_bitmap_, _length, _or) \ + (_bitmap1, _bitmap2) + + +/** + * Perform bitwise XOR of 2 bitmaps and return the result + * + * @param _bitmap1 First operand + * @param _bitmap2 Second operand + * @param _length Length of the bitmaps (in bits) + * + * @return A new bitmap, which is the logical XOR of the operands + */ +#define UCS_BITMAP_XOR(_bitmap1, _bitmap2, _length) \ + UCS_PP_TOKENPASTE3(_ucs_bitmap_, _length, _xor) \ + (_bitmap1, _bitmap2) + + +static UCS_F_ALWAYS_INLINE bool +ucs_bitmap_is_zero(const void *bitmap, size_t num_words) +{ + size_t i; + + for (i = 0; i < num_words; i++) { + if (((ucs_bitmap_word_t *)bitmap)[i]) { + return 0; + } + } + + return 
1; +} + + +/** + * Find the index of the first bit set to 1 in a given bitmap, starting from + * a particular index (excluding it). If all bits are zero, returns the index + * past the last bit (bitmap size). + * + * @param bitmap_words Look for the first bit in the words of this bitmap + * @param num_words Number of words in the bitmap + * @param start_index The first bit to look from + */ +static UCS_F_ALWAYS_INLINE int +ucs_bitmap_ffs(const ucs_bitmap_word_t *bitmap_words, size_t num_words, + size_t start_index) +{ + size_t word_index = start_index / UCS_BITMAP_BITS_IN_WORD; + size_t mask = ~UCS_MASK(start_index % UCS_BITMAP_BITS_IN_WORD); + size_t first_bit_in_word; + + while (word_index < num_words) { + if (bitmap_words[word_index] & mask) { + first_bit_in_word = ucs_ffs64(bitmap_words[word_index] & mask); + return _UCS_BITMAP_BIT_INDEX(first_bit_in_word, word_index); + } + + mask = UCS_BITMAP_WORD_MASK; + word_index++; + } + + return _UCS_BITMAP_BIT_INDEX(0, word_index); +} + + +_UCS_BITMAP_DECLARE_TYPE(64) +_UCS_BITMAP_DECLARE_TYPE(128) +_UCS_BITMAP_DECLARE_TYPE(256) + + +END_C_DECLS + +#endif /* UCS_BITMAP_H_ */ diff --git a/src/ucs/datastruct/callbackq.c b/src/ucs/datastruct/callbackq.c index 7a9c97d0fc1..4999208b387 100644 --- a/src/ucs/datastruct/callbackq.c +++ b/src/ucs/datastruct/callbackq.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include "callbackq.h" @@ -363,18 +363,20 @@ static unsigned ucs_callbackq_slow_proxy(void *arg) { ucs_callbackq_t *cbq = arg; ucs_callbackq_priv_t *priv = ucs_callbackq_priv(cbq); + unsigned num_slow_elems = priv->num_slow_elems; + unsigned count = 0; ucs_callbackq_elem_t *elem; unsigned UCS_V_UNUSED removed_idx; unsigned slow_idx, fast_idx; ucs_callbackq_elem_t tmp_elem; - unsigned count = 0; ucs_trace_poll("cbq=%p", cbq); ucs_callbackq_enter(cbq); - /* Execute and update slow-path callbacks */ - for (slow_idx = 0; slow_idx < priv->num_slow_elems; ++slow_idx) { + /* Execute and update slow-path callbacks over a copy of num_slow_elems, + * to avoid an infinite loop if a callback adds another callback */ + for (slow_idx = 0; slow_idx < num_slow_elems; ++slow_idx) { elem = &priv->slow_elems[slow_idx]; if (elem->id == UCS_CALLBACKQ_ID_NULL) { continue; diff --git a/src/ucs/datastruct/frag_list.c b/src/ucs/datastruct/frag_list.c index 9590ca54e73..4c8241b32e8 100644 --- a/src/ucs/datastruct/frag_list.c +++ b/src/ucs/datastruct/frag_list.c @@ -292,12 +292,15 @@ ucs_frag_list_insert_slow(ucs_frag_list_t *head, ucs_frag_list_elem_t *elem, } /* todo: mark as likely */ - if (UCS_FRAG_LIST_SN_CMP(h->head.last_sn+1, ==, sn)) { + if (UCS_FRAG_LIST_SN_CMP(h->head.last_sn + 1, ==, sn)) { + ucs_assertv(UCS_FRAG_LIST_SN_CMP(h->head.first_sn, <=, h->head.last_sn), + "h=%p first_sn=%u last_sn=%u", h, h->head.first_sn, + h->head.last_sn); /* add tail, check merge with next list */ frag_list_add_tail(h, elem); nexth = ucs_container_of(h->list.next, ucs_frag_list_elem_t, list); - - if (nexth != NULL && nexth->head.first_sn == sn + 1) { + if (!ucs_queue_is_tail(&head->list, &h->list) && + (nexth->head.first_sn == (sn + 1))) { frag_list_merge_heads(head, h, nexth); head->list_count--; } diff --git a/src/ucs/datastruct/mpool.c b/src/ucs/datastruct/mpool.c index 1b142f8ad23..e8bb703a1ad 100644 --- a/src/ucs/datastruct/mpool.c +++ b/src/ucs/datastruct/mpool.c @@ -38,6 +38,7 @@ static void ucs_mpool_chunk_leak_check(ucs_mpool_t *mp, ucs_mpool_chunk_t *chunk for (i = 0; i < chunk->num_elems; ++i) { elem = ucs_mpool_chunk_elem(mp->data, chunk, i);
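+        /* Make the element header defined for Valgrind before reading + * elem->mpool below, since pool memory may be marked as undefined */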
+        VALGRIND_MAKE_MEM_DEFINED(elem, sizeof *elem); if (elem->mpool != NULL) { ucs_warn("object %p was not returned to mpool %s", elem + 1, ucs_mpool_name(mp)); diff --git a/src/ucs/datastruct/ptr_array.c b/src/ucs/datastruct/ptr_array.c index f7aedaf92f0..a9d1d55d246 100644 --- a/src/ucs/datastruct/ptr_array.c +++ b/src/ucs/datastruct/ptr_array.c @@ -209,7 +209,7 @@ void ucs_ptr_array_set(ucs_ptr_array_t *ptr_array, unsigned element_index, ucs_ptr_array_elem_t *elem; unsigned next, free_iter, free_ahead, new_size; - if (ucs_unlikely(element_index > ptr_array->size)) { + if (ucs_unlikely(element_index >= ptr_array->size)) { new_size = ucs_max(ptr_array->size * 2, element_index + 1); ucs_ptr_array_grow(ptr_array, new_size UCS_MEMTRACK_NAME(ptr_array->name)); } else if (!__ucs_ptr_array_is_free(ptr_array->start[element_index])) { diff --git a/src/ucs/datastruct/ptr_map.h b/src/ucs/datastruct/ptr_map.h index 6db03f19213..758d1906471 100644 --- a/src/ucs/datastruct/ptr_map.h +++ b/src/ucs/datastruct/ptr_map.h @@ -23,6 +23,12 @@ BEGIN_C_DECLS #define UCS_PTR_MAP_KEY_MIN_ALIGN UCS_BIT(1) +/** + * Invalid key. + */ +#define UCS_PTR_MAP_KEY_INVALID ((ucs_ptr_map_key_t)0) + + /** * Key to find pointer in @ref ucs_ptr_map_t. */ diff --git a/src/ucs/datastruct/ptr_map.inl b/src/ucs/datastruct/ptr_map.inl index 0419d62a9b8..a2654d6de67 100644 --- a/src/ucs/datastruct/ptr_map.inl +++ b/src/ucs/datastruct/ptr_map.inl @@ -20,6 +20,16 @@ BEGIN_C_DECLS #define UCS_PTR_MAP_KEY_INDIRECT_FLAG UCS_BIT(0) +/** + * Returns whether the key is indirect or not. + * + * @param [in] key Key to object pointer. + * + * @return 0 if the key is direct, nonzero if it is indirect. + */ +#define ucs_ptr_map_key_indirect(_key) ((_key) & UCS_PTR_MAP_KEY_INDIRECT_FLAG) + + KHASH_IMPL(ucs_ptr_map_impl, ucs_ptr_map_key_t, void*, 1, kh_int64_hash_func, kh_int64_hash_equal); @@ -32,6 +42,7 @@ */ static inline ucs_status_t ucs_ptr_map_init(ucs_ptr_map_t *map) { + UCS_STATIC_ASSERT(!ucs_ptr_map_key_indirect(UCS_PTR_MAP_KEY_INVALID)); map->next_id = 0; kh_init_inplace(ucs_ptr_map_impl, &map->hash); return UCS_OK; @@ -65,8 +76,9 @@ static inline void ucs_ptr_map_destroy(ucs_ptr_map_t *map) * the pointer @ptr. * @param [out] key Key to object pointer @ptr if operation completed * successfully otherwise value is undefined. - * @return UCS_OK on success otherwise error code as defined by - * @ref ucs_status_t. + * @return UCS_OK on success, UCS_ERR_NO_PROGRESS if this key is direct and + * therefore no action was performed, otherwise error code as defined by + * @ref ucs_status_t. * @note @ptr must be aligned on @ref UCS_PTR_MAP_KEY_MIN_ALIGN. */ static UCS_F_ALWAYS_INLINE ucs_status_t @@ -79,7 +91,8 @@ ucs_ptr_map_put(ucs_ptr_map_t *map, void *ptr, int indirect, if (ucs_likely(!indirect)) { *key = (uintptr_t)ptr; ucs_assert(!(*key & UCS_PTR_MAP_KEY_MIN_ALIGN)); - return UCS_OK; + ucs_assert(*key != 0); + return UCS_ERR_NO_PROGRESS; } *key = (map->next_id += UCS_PTR_MAP_KEY_MIN_ALIGN) | @@ -101,47 +114,33 @@ * * @param [in] map Container to get the pointer value from. * @param [in] key Key to look up in the container. - * @return object pointer on success, otherwise NULL.
- */ -static UCS_F_ALWAYS_INLINE void* -ucs_ptr_map_get(const ucs_ptr_map_t *map, ucs_ptr_map_key_t key) -{ - khiter_t iter; - - if (ucs_likely(!(key & UCS_PTR_MAP_KEY_INDIRECT_FLAG))) { - return (void*)key; - } - - iter = kh_get(ucs_ptr_map_impl, &map->hash, key); - return ucs_unlikely(iter == kh_end(&map->hash)) ? NULL : - kh_value(&map->hash, iter); -} - -/** - * Extract a pointer value from the map by its key. + * @param [in] extract Whether to remove the key from the map. + * @param [out] ptr_p If successful, set to the pointer found in the map. * - * @param [in] map Container to get the pointer value from. - * @param [in] key Key to look up in the container. - * @return object pointer on success, otherwise NULL. + * @return UCS_OK if found, UCS_ERR_NO_PROGRESS if this key is direct and + * therefore no action was performed, UCS_ERR_NO_ELEM if not found. */ -static UCS_F_ALWAYS_INLINE void* -ucs_ptr_map_extract(ucs_ptr_map_t *map, ucs_ptr_map_key_t key) +static UCS_F_ALWAYS_INLINE ucs_status_t +ucs_ptr_map_get(ucs_ptr_map_t *map, ucs_ptr_map_key_t key, int extract, + void **ptr_p) { khiter_t iter; - void *value; - if (ucs_likely(!(key & UCS_PTR_MAP_KEY_INDIRECT_FLAG))) { - return (void*)key; + if (ucs_likely(!ucs_ptr_map_key_indirect(key))) { + *ptr_p = (void*)key; + return UCS_ERR_NO_PROGRESS; } iter = kh_get(ucs_ptr_map_impl, &map->hash, key); if (ucs_unlikely(iter == kh_end(&map->hash))) { - return NULL; + return UCS_ERR_NO_ELEM; } - value = kh_value(&map->hash, iter); - kh_del(ucs_ptr_map_impl, &map->hash, iter); - return value; + *ptr_p = kh_value(&map->hash, iter); + if (extract) { + kh_del(ucs_ptr_map_impl, &map->hash, iter); + } + return UCS_OK; } /** @@ -149,15 +148,17 @@ ucs_ptr_map_extract(ucs_ptr_map_t *map, ucs_ptr_map_key_t key) * * @param [in] map Container. * @param [in] key Key to object pointer. - * @return - UCS_OK on success - * - UCS_ERR_NO_ELEM if the key is not found in the internal hash - * table. + * + * @return - UCS_OK on success + * - UCS_ERR_NO_PROGRESS if this key is direct and therefore no action + * was performed + * - UCS_ERR_NO_ELEM if the key is not found in the internal hash table. */ static UCS_F_ALWAYS_INLINE ucs_status_t ucs_ptr_map_del(ucs_ptr_map_t *map, ucs_ptr_map_key_t key) { - return ucs_likely(ucs_ptr_map_extract(map, key) != NULL) ? - UCS_OK : UCS_ERR_NO_ELEM; + void UCS_V_UNUSED *dummy; + return ucs_ptr_map_get(map, key, 1, &dummy); } END_C_DECLS diff --git a/src/ucs/datastruct/queue.h b/src/ucs/datastruct/queue.h index cd434c6baa5..8657a188e5b 100644 --- a/src/ucs/datastruct/queue.h +++ b/src/ucs/datastruct/queue.h @@ -41,6 +41,15 @@ static inline size_t ucs_queue_length(ucs_queue_head_t *queue) return length; } +/** + * @return Whether the given element is the tail element in the queue. + */ +static inline int +ucs_queue_is_tail(ucs_queue_head_t *queue, ucs_queue_elem_t *elem) +{ + return queue->ptail == &elem->next; +} + /** * @return Whether the queue is empty. 
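+ * + * A typical drain loop (an illustrative sketch; the variable names are hypothetical): + * @code{.c} + * while (!ucs_queue_is_empty(queue)) { + *     elem = ucs_queue_pull_non_empty(queue); + *     ... + * } + * @endcode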
*/ @@ -75,7 +84,7 @@ static inline void ucs_queue_push_head(ucs_queue_head_t *queue, { elem->next = queue->head; queue->head = elem; - if (queue->ptail == &queue->head) { + if (ucs_queue_is_empty(queue)) { queue->ptail = &elem->next; } } @@ -92,7 +101,7 @@ static inline ucs_queue_elem_t *ucs_queue_pull_non_empty(ucs_queue_head_t *queue elem = queue->head; queue->head = elem->next; - if (queue->ptail == &elem->next) { + if (ucs_queue_is_tail(queue, elem)) { queue->ptail = &queue->head; } return elem; @@ -107,7 +116,7 @@ static inline void ucs_queue_del_iter(ucs_queue_head_t *queue, ucs_queue_iter_t { ucs_assert((iter != NULL) && (*iter != NULL)); - if (queue->ptail == &(*iter)->next) { + if (ucs_queue_is_tail(queue, *iter)) { queue->ptail = iter; /* deleting the last element */ *iter = NULL; /* make *ptail point to NULL */ } else { @@ -115,14 +124,14 @@ static inline void ucs_queue_del_iter(ucs_queue_head_t *queue, ucs_queue_iter_t } /* Sanity check */ - ucs_assertv((queue->head != NULL) || (queue->ptail == &queue->head), - "head=%p ptail=%p &head=%p iter=%p", queue->head, queue->ptail, - &queue->head, iter); + ucs_assertv((queue->head != NULL) || ucs_queue_is_empty(queue), + "head=%p ptail=%p &head=%p iter=%p", queue->head, queue->ptail, + &queue->head, iter); /* If the queue is empty, head must point to null */ - ucs_assertv((queue->ptail != &queue->head) || (queue->head == NULL), - "head=%p ptail=%p &head=%p iter=%p", queue->head, queue->ptail, - &queue->head, iter); + ucs_assertv(!ucs_queue_is_empty(queue) || (queue->head == NULL), + "head=%p ptail=%p &head=%p iter=%p", queue->head, queue->ptail, + &queue->head, iter); } /** diff --git a/src/ucs/datastruct/string_buffer.c b/src/ucs/datastruct/string_buffer.c index 5edd4cf5579..39d4ecc7129 100644 --- a/src/ucs/datastruct/string_buffer.c +++ b/src/ucs/datastruct/string_buffer.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -27,12 +28,18 @@ UCS_ARRAY_IMPL(string_buffer, size_t, char, static UCS_F_ALWAYS_INLINE) void ucs_string_buffer_init(ucs_string_buffer_t *strb) { - ucs_array_init_dynamic(string_buffer, &strb->str); + ucs_array_init_dynamic(&strb->str); +} + +void ucs_string_buffer_init_fixed(ucs_string_buffer_t *strb, char *buffer, + size_t capacity) +{ + ucs_array_init_fixed(&strb->str, buffer, capacity); } void ucs_string_buffer_cleanup(ucs_string_buffer_t *strb) { - ucs_array_cleanup_dynamic(string_buffer, &strb->str); + ucs_array_cleanup_dynamic(&strb->str); } size_t ucs_string_buffer_length(ucs_string_buffer_t *strb) @@ -87,6 +94,22 @@ void ucs_string_buffer_appendf(ucs_string_buffer_t *strb, const char *fmt, ...) 
ucs_assert(*ucs_array_end(&strb->str) == '\0'); } +void ucs_string_buffer_append_hex(ucs_string_buffer_t *strb, const void *data, + size_t size, size_t per_line) +{ + size_t prev_length = ucs_array_length(&strb->str); + size_t hexdump_length = (size * 2) + (size / 4) + (size / per_line); + size_t new_length; + + ucs_array_reserve(string_buffer, &strb->str, prev_length + hexdump_length); + ucs_str_dump_hex(data, size, ucs_array_end(&strb->str), + ucs_array_available_length(&strb->str), per_line); + + new_length = prev_length + strlen(ucs_array_end(&strb->str)); + ucs_array_set_length(&strb->str, new_length); + ucs_assert(*ucs_array_end(&strb->str) == '\0'); +} + void ucs_string_buffer_rtrim(ucs_string_buffer_t *strb, const char *charset) { char *ptr = ucs_array_end(&strb->str); @@ -118,3 +141,50 @@ const char *ucs_string_buffer_cstr(const ucs_string_buffer_t *strb) ucs_assert(c_str != NULL); return c_str; } + +void ucs_string_buffer_dump(const ucs_string_buffer_t *strb, + const char *line_prefix, FILE *stream) +{ + const char *next_tok, *tok; + size_t size, remaining; + + if (ucs_array_is_empty(&strb->str)) { + return; + } + + tok = ucs_array_begin(&strb->str); + next_tok = strchr(tok, '\n'); + while (next_tok != NULL) { + fputs(line_prefix, stream); + + /* Write the line, handle partial writes */ + remaining = UCS_PTR_BYTE_DIFF(tok, next_tok + 1); + while (remaining > 0) { + size = fwrite(tok, sizeof(*tok), remaining, stream); + tok = UCS_PTR_BYTE_OFFSET(tok, size); + remaining -= size; + } + + next_tok = strchr(tok, '\n'); + } + + /* Write last line */ + if (*tok != '\0') { + fputs(line_prefix, stream); + fputs(tok, stream); + } +} + +char *ucs_string_buffer_extract_mem(ucs_string_buffer_t *strb) +{ + char *c_str; + + if (ucs_array_is_fixed(&strb->str)) { + c_str = ucs_strdup(ucs_array_begin(&strb->str), "ucs_string_buffer"); + } else { + c_str = ucs_array_begin(&strb->str); + ucs_array_init_dynamic(&strb->str); + } + + return c_str; +} diff --git a/src/ucs/datastruct/string_buffer.h b/src/ucs/datastruct/string_buffer.h index 5c1acc7c18f..94f7f38b158 100644 --- a/src/ucs/datastruct/string_buffer.h +++ b/src/ucs/datastruct/string_buffer.h @@ -9,8 +9,9 @@ #include #include -#include #include +#include +#include BEGIN_C_DECLS @@ -18,6 +19,43 @@ BEGIN_C_DECLS UCS_ARRAY_DECLARE_TYPE(string_buffer, size_t, char) +/** + * Dynamic string buffer initializer. The backing storage should be released + * explicitly by calling @ref ucs_string_buffer_cleanup() + */ +#define UCS_STRING_BUFFER_INITIALIZER \ + { \ + UCS_ARRAY_DYNAMIC_INITIALIZER \ + } + + +/** + * Declare a string buffer which uses an existing string as backing store. + * Such a string buffer does not allocate additional memory and does not have + * to be cleaned up; it can also be used to build a string in an existing + * C-string buffer passed as a function argument. + * + * @param _var String buffer variable name + * @param _buffer Buffer to use as backing store. + * @param _capacity Buffer capacity. + * + * Example: + * + * @code{.c} + * char *build_my_string(char *buffer, size_t max_length) + * { + * UCS_STRING_BUFFER_FIXED(strb, buffer, max_length); + * ucs_string_buffer_appendf(&strb, "%x%x", 57005, 48879); + * return buffer; + * } + * @endcode + */ +#define UCS_STRING_BUFFER_FIXED(_var, _buffer, _capacity) \ + ucs_string_buffer_t _var = { \ + UCS_ARRAY_FIXED_INITIALIZER(_buffer, _capacity) \ + } + + /** * Declare a string buffer which is using a static array as backing store.
* Such string buffer does not allocate additional memory and does not have to @@ -35,10 +73,13 @@ UCS_ARRAY_DECLARE_TYPE(string_buffer, size_t, char) * ucs_string_buffer_appendf(&strb, "%x%x", 57005, 48879); * @endcode */ -#define UCS_STRING_BUFFER_FIXED(_var, _buffer) \ - ucs_string_buffer_t _var = { \ - UCS_ARRAY_FIXED_INITIALIZER((_buffer), ucs_static_array_size(_buffer)) \ - } +#define UCS_STRING_BUFFER_STATIC(_var, _buffer) \ + UCS_STRING_BUFFER_FIXED(_var, _buffer, ucs_static_array_size(_buffer)) + + +#define UCS_STRING_BUFFER_ONSTACK(_var, _capacity) \ + UCS_STRING_BUFFER_FIXED(_var, UCS_ARRAY_ALLOC_ONSTACK(string_buffer, _capacity), \ + _capacity) /** @@ -58,6 +99,17 @@ typedef struct ucs_string_buffer { void ucs_string_buffer_init(ucs_string_buffer_t *strb); +/** + * Initialize a string buffer with fixed-size buffer as backing storage. + * + * @param [out] strb String buffer to initialize. + * @param [in] buffer Buffer to use as backing storage. + * @param [in] capacity Buffer size. + */ +void ucs_string_buffer_init_fixed(ucs_string_buffer_t *strb, char *buffer, + size_t capacity); + + /** * Cleanup a string buffer and release any memory associated with it. * @@ -89,6 +141,21 @@ void ucs_string_buffer_appendf(ucs_string_buffer_t *strb, const char *fmt, ...) UCS_F_PRINTF(2, 3); +/** + * Append a hex dump to the string buffer. + * + * @param [inout] strb String buffer to append to. + * @param [in] data Raw data to hex-dump. + * @param [in] size Raw data size. + * @param [in] per_line Add a newline character after this number of bytes. + * + * @note If the string cannot grow to the required length, only some of the + * characters will be appended. + */ +void ucs_string_buffer_append_hex(ucs_string_buffer_t *strb, const void *data, + size_t size, size_t per_line); + + /** * Remove specific characters from the end of the string. * @@ -108,13 +175,36 @@ void ucs_string_buffer_rtrim(ucs_string_buffer_t *strb, const char *charset); * buffer. The returned string is valid only as long as no other operation is * done on the string buffer (including append). * - * @param [in] strb String buffer to convert to a C-style string + * @param [in] strb String buffer to convert to a C-style string. * * @return C-style string representing the data in the buffer. */ const char *ucs_string_buffer_cstr(const ucs_string_buffer_t *strb); +/** + * Print the string buffer to a stream as multi-line text. + * + * @param [in] strb String buffer to print. + * @param [in] line_prefix Prefix to prepend to each output line. + * @param [in] stream Stream to print to. + */ +void ucs_string_buffer_dump(const ucs_string_buffer_t *strb, + const char *line_prefix, FILE *stream); + + +/** + * Return a pointer to a C-style string which represents the string buffer. The + * returned pointer should be freed with a function which deallocates memory, + * e.g. ucs_free(). There is no need to call @ref ucs_string_buffer_cleanup + * after extracting the memory with this function. + * + * @param [inout] strb String buffer to convert to a C-style string. + * + * @return C-style string representing the data in the buffer.
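+ * + * A minimal usage sketch (variable names are illustrative): + * + * @code{.c} + * ucs_string_buffer_t strb = UCS_STRING_BUFFER_INITIALIZER; + * char *str; + * + * ucs_string_buffer_appendf(&strb, "hello %d", 42); + * str = ucs_string_buffer_extract_mem(&strb); + * ... + * ucs_free(str); // release with ucs_free, not ucs_string_buffer_cleanup + * @endcode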
+ */ +char *ucs_string_buffer_extract_mem(ucs_string_buffer_t *strb); + END_C_DECLS #endif diff --git a/src/ucs/debug/assert.c b/src/ucs/debug/assert.c index e22536cdfa4..8902910b39d 100644 --- a/src/ucs/debug/assert.c +++ b/src/ucs/debug/assert.c @@ -11,7 +11,7 @@ #include "assert.h" #include -#include +#include #include #include #include diff --git a/src/ucs/debug/debug.c b/src/ucs/debug/debug.c index c1db45ba330..b879a9a74af 100644 --- a/src/ucs/debug/debug.c +++ b/src/ucs/debug/debug.c @@ -8,7 +8,7 @@ # include "config.h" #endif -#include "debug.h" +#include "debug_int.h" #include "log.h" #include @@ -521,7 +521,7 @@ static void ucs_debug_print_source_file(const char *file, unsigned line, return; } - n = 0; + n = 1; fprintf(stream, "\n"); fprintf(stream, "%s: [ %s() ]\n", file, function); if (line > context) { @@ -1080,12 +1080,20 @@ void ucs_handle_error(const char *message) } } +int ucs_debug_is_handle_errors() +{ + static const unsigned mask = UCS_BIT(UCS_HANDLE_ERROR_BACKTRACE) | + UCS_BIT(UCS_HANDLE_ERROR_FREEZE) | + UCS_BIT(UCS_HANDLE_ERROR_DEBUG); + return ucs_global_opts.handle_errors & mask; +} + static int ucs_debug_is_error_signal(int signum) { khiter_t hash_it; int result; - if (!ucs_global_opts.handle_errors) { + if (!ucs_debug_is_handle_errors()) { return 0; } @@ -1260,6 +1268,7 @@ static int ucs_debug_backtrace_is_excluded(void *address, const char *symbol) !strcmp(symbol, "ucs_debug_handle_error_signal") || !strcmp(symbol, "ucs_debug_backtrace_create") || !strcmp(symbol, "ucs_debug_show_innermost_source_file") || + !strcmp(symbol, "ucs_debug_print_backtrace") || !strcmp(symbol, "ucs_log_default_handler") || !strcmp(symbol, "__ucs_abort") || !strcmp(symbol, "ucs_log_dispatch") || @@ -1314,7 +1323,7 @@ void ucs_debug_init() kh_init_inplace(ucs_signal_orig_action, &ucs_signal_orig_action_map); kh_init_inplace(ucs_debug_symbol, &ucs_debug_symbols_cache); - if (ucs_global_opts.handle_errors) { + if (ucs_debug_is_handle_errors()) { ucs_debug_set_signal_alt_stack(); ucs_set_signal_handler(ucs_error_signal_handler); } diff --git a/src/ucs/debug/debug.h b/src/ucs/debug/debug.h index 66b90dc0a79..539324c120d 100644 --- a/src/ucs/debug/debug.h +++ b/src/ucs/debug/debug.h @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -7,141 +7,17 @@ #ifndef UCS_DEBUG_H_ #define UCS_DEBUG_H_ -#include -#include -#include -#include +#include - -/** - * Information about an address in the code. - */ -typedef struct ucs_debug_address_info { - struct { - char path[512]; /* Binary file path */ - unsigned long base; /* Binary file load base */ - } file; - char function[128]; /* Function name */ - char source_file[512]; /* Source file path */ - unsigned line_number; /* Line number */ -} ucs_debug_address_info_t; - - -typedef struct backtrace *backtrace_h; -typedef struct backtrace_line *backtrace_line_h; - -extern const char *ucs_state_detail_level_names[]; -extern const char *ucs_signal_names[]; - - -/** - * Initialize UCS debugging subsystem. - */ -void ucs_debug_init(); - - -/** - * Cleanup UCS debugging subsystem. - */ -void ucs_debug_cleanup(int on_error); +BEGIN_C_DECLS /** * Disable signal handling in UCS for signal. * Previous signal handler is set. + * @param signum Signal number to disable handling. 
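+ * + * For example, ucs_debug_disable_signal(SIGSEGV) restores the previous + * SIGSEGV handler (an illustrative call; any handled error signal works).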
*/ void ucs_debug_disable_signal(int signum); -/** - * Disable signal handling in UCS for all signals - * that was set in ucs_global_opts.error_signals. - * Previous signal handlers are set. - */ -void ucs_debug_disable_signals(); -/** - * Get information about an address in the code of the current program. - * @param address Address to look up. - * @param info Filled with information about the given address. Source file - * and line number are filled only if the binary file was compiled - * with debug information, and UCS was configured with detailed - * backtrace enabled. - * @return UCS_ERR_NO_ELEM if the address is not found, UCS_OK otherwise. - */ -ucs_status_t ucs_debug_lookup_address(void *address, ucs_debug_address_info_t *info); - - -/** - * @return Full path to current library. - */ -const char *ucs_debug_get_lib_path(); - - -/** - * @return UCS library loading address. - */ -unsigned long ucs_debug_get_lib_base_addr(); - - -/** - * Create a backtrace from the calling location. - * - * @param bckt Backtrace object. - * @param strip How many frames to strip. -*/ -ucs_status_t ucs_debug_backtrace_create(backtrace_h *bckt, int strip); - - -/** - * Destroy a backtrace and free all memory. - * - * @param bckt Backtrace object. - */ -void ucs_debug_backtrace_destroy(backtrace_h bckt); - - -/** - * Walk to the next backtrace line information. - * - * @param bckt Backtrace object. - * @param line Filled with backtrace frame info. - * - * NOTE: the line remains valid as long as the backtrace object is not destroyed. - */ -int ucs_debug_backtrace_next(backtrace_h bckt, backtrace_line_h *line); - - -/** - * Print backtrace line to string buffer. - * - * @param buffer Target buffer to print to. - * @param maxlen Size of target buffer. - * @param frame_num Frame number - * @param line Backtrace line to print - */ -void ucs_debug_print_backtrace_line(char *buffer, size_t maxlen, - int frame_num, - backtrace_line_h line); - -/** - * Print backtrace to an output stream. - * - * @param stream Stream to print to. - * @param strip How many frames to strip. - */ -void ucs_debug_print_backtrace(FILE *stream, int strip); - - -/** - * Called when UCS detects a fatal error and provides means to debug the current - * state of UCS. - */ -void ucs_handle_error(const char *message); - - -/** - * @return Name of a symbol which begins in the given address, or NULL if - * not found. - */ -const char *ucs_debug_get_symbol_name(void *address); - +END_C_DECLS #endif diff --git a/src/ucs/debug/debug_int.h b/src/ucs/debug/debug_int.h new file mode 100644 index 00000000000..8a2f07ff995 --- /dev/null +++ b/src/ucs/debug/debug_int.h @@ -0,0 +1,142 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifndef UCS_DEBUG_INT_H_ +#define UCS_DEBUG_INT_H_ + +#include +#include +#include +#include +#include + + +/** + * Information about an address in the code. + */ +typedef struct ucs_debug_address_info { + struct { + char path[512]; /* Binary file path */ + unsigned long base; /* Binary file load base */ + } file; + char function[128]; /* Function name */ + char source_file[512]; /* Source file path */ + unsigned line_number; /* Line number */ +} ucs_debug_address_info_t; + + +typedef struct backtrace *backtrace_h; +typedef struct backtrace_line *backtrace_line_h; + +extern const char *ucs_state_detail_level_names[]; +extern const char *ucs_signal_names[]; + + +/** + * Initialize UCS debugging subsystem. 
+ */ +void ucs_debug_init(); + + +/** + * Cleanup UCS debugging subsystem. + */ +void ucs_debug_cleanup(int on_error); + +/** + * Disable signal handling in UCS for all signals + * that was set in ucs_global_opts.error_signals. + * Previous signal handlers are set. + */ +void ucs_debug_disable_signals(); +/** + * Get information about an address in the code of the current program. + * @param address Address to look up. + * @param info Filled with information about the given address. Source file + * and line number are filled only if the binary file was compiled + * with debug information, and UCS was configured with detailed + * backtrace enabled. + * @return UCS_ERR_NO_ELEM if the address is not found, UCS_OK otherwise. + */ +ucs_status_t ucs_debug_lookup_address(void *address, ucs_debug_address_info_t *info); + + +/** + * @return Full path to current library. + */ +const char *ucs_debug_get_lib_path(); + + +/** + * @return UCS library loading address. + */ +unsigned long ucs_debug_get_lib_base_addr(); + + +/** + * Create a backtrace from the calling location. + * + * @param bckt Backtrace object. + * @param strip How many frames to strip. +*/ +ucs_status_t ucs_debug_backtrace_create(backtrace_h *bckt, int strip); + + +/** + * Destroy a backtrace and free all memory. + * + * @param bckt Backtrace object. + */ +void ucs_debug_backtrace_destroy(backtrace_h bckt); + + +/** + * Walk to the next backtrace line information. + * + * @param bckt Backtrace object. + * @param line Filled with backtrace frame info. + * + * NOTE: the line remains valid as long as the backtrace object is not destroyed. + */ +int ucs_debug_backtrace_next(backtrace_h bckt, backtrace_line_h *line); + + +/** + * Print backtrace line to string buffer. + * + * @param buffer Target buffer to print to. + * @param maxlen Size of target buffer. + * @param frame_num Frame number + * @param line Backtrace line to print + */ +void ucs_debug_print_backtrace_line(char *buffer, size_t maxlen, + int frame_num, + backtrace_line_h line); + +/** + * Print backtrace to an output stream. + * + * @param stream Stream to print to. + * @param strip How many frames to strip. + */ +void ucs_debug_print_backtrace(FILE *stream, int strip); + + +/** + * Called when UCS detects a fatal error and provides means to debug the current + * state of UCS. + */ +void ucs_handle_error(const char *message); + + +/** + * @return Name of a symbol which begins in the given address, or NULL if + * not found. 
+ */ +const char *ucs_debug_get_symbol_name(void *address); + + +#endif diff --git a/src/ucs/debug/log.c b/src/ucs/debug/log.c index 5ff78c1093d..e426099763b 100644 --- a/src/ucs/debug/log.c +++ b/src/ucs/debug/log.c @@ -10,7 +10,8 @@ #include "log.h" -#include +#include +#include #include #include #include @@ -21,22 +22,31 @@ #define UCS_MAX_LOG_HANDLERS 32 #define UCS_LOG_TIME_FMT "[%lu.%06lu]" -#define UCS_LOG_FILE_FMT "%16s:%-4u" -#define UCS_LOG_METADATA_FMT "%-4s %-5s" -#define UCS_LOG_PROC_DATA_FMT "[%s:%-5d:%d]" -#define UCS_LOG_SHORT_FMT UCS_LOG_TIME_FMT" "UCS_LOG_FILE_FMT" " \ - UCS_LOG_METADATA_FMT" ""%s\n" -#define UCS_LOG_FMT UCS_LOG_TIME_FMT" "UCS_LOG_PROC_DATA_FMT" " \ - UCS_LOG_FILE_FMT" "UCS_LOG_METADATA_FMT" ""%s\n" +#define UCS_LOG_METADATA_FMT "%17s:%-4u %-4s %-5s %*s" +#define UCS_LOG_PROC_DATA_FMT "[%s:%-5d:%s]" + +#define UCS_LOG_SHORT_FMT UCS_LOG_TIME_FMT " [%s] " UCS_LOG_METADATA_FMT "%s\n" +#define UCS_LOG_FMT UCS_LOG_TIME_FMT " " UCS_LOG_PROC_DATA_FMT " " \ + UCS_LOG_METADATA_FMT "%s\n" #define UCS_LOG_TIME_ARG(_tv) (_tv)->tv_sec, (_tv)->tv_usec -#define UCS_LOG_SHORT_ARG(_short_file, _line, _level, _comp_conf, _tv, _message) \ - UCS_LOG_TIME_ARG(_tv), _short_file, _line, (_comp_conf)->name, \ - ucs_log_level_names[_level], _message + +#define UCS_LOG_METADATA_ARG(_short_file, _line, _level, _comp_conf) \ + (_short_file), (_line), (_comp_conf)->name, \ + ucs_log_level_names[_level], (ucs_log_current_indent * 2), "" + +#define UCS_LOG_PROC_DATA_ARG() \ + ucs_log_hostname, ucs_log_pid, ucs_log_get_thread_name() + +#define UCS_LOG_SHORT_ARG(_short_file, _line, _level, _comp_conf, _tv, \ + _message) \ + UCS_LOG_TIME_ARG(_tv), ucs_log_get_thread_name(), \ + UCS_LOG_METADATA_ARG(_short_file, _line, _level, _comp_conf), \ + (_message) + #define UCS_LOG_ARG(_short_file, _line, _level, _comp_conf, _tv, _message) \ - UCS_LOG_TIME_ARG(_tv), ucs_log_hostname, ucs_log_pid, \ - ucs_log_get_thread_num(),_short_file, _line, (_comp_conf)->name, \ - ucs_log_level_names[_level], _message + UCS_LOG_TIME_ARG(_tv), UCS_LOG_PROC_DATA_ARG(), \ + UCS_LOG_METADATA_ARG(_short_file, _line, _level, _comp_conf), (_message) const char *ucs_log_level_names[] = { [UCS_LOG_LEVEL_FATAL] = "FATAL", @@ -55,51 +65,32 @@ const char *ucs_log_level_names[] = { [UCS_LOG_LEVEL_PRINT] = "PRINT" }; -static unsigned ucs_log_handlers_count = 0; -static int ucs_log_initialized = 0; -static char ucs_log_hostname[HOST_NAME_MAX] = {0}; -static int ucs_log_pid = 0; -static FILE *ucs_log_file = NULL; -static char *ucs_log_file_base_name = NULL; -static int ucs_log_file_close = 0; -static int ucs_log_file_last_idx = 0; -static unsigned threads_count = 0; -static pthread_spinlock_t threads_lock = 0; -static pthread_t threads[128] = {0}; +static unsigned ucs_log_handlers_count = 0; +static int ucs_log_initialized = 0; +static int __thread ucs_log_current_indent = 0; +static char ucs_log_hostname[HOST_NAME_MAX] = {0}; +static int ucs_log_pid = 0; +static FILE *ucs_log_file = NULL; +static char *ucs_log_file_base_name = NULL; +static int ucs_log_file_close = 0; +static int ucs_log_file_last_idx = 0; +static uint32_t ucs_log_thread_count = 0; +static char __thread ucs_log_thread_name[32] = {0}; static ucs_log_func_t ucs_log_handlers[UCS_MAX_LOG_HANDLERS]; -static int ucs_log_get_thread_num(void) +static const char *ucs_log_get_thread_name() { - pthread_t self = pthread_self(); - int i; + char *name = ucs_log_thread_name; + uint32_t thread_num; - for (i = 0; i < threads_count; ++i) { - if (threads[i] == self) { - return 
i; - } + if (ucs_unlikely(name[0] == '\0')) { + thread_num = ucs_atomic_fadd32(&ucs_log_thread_count, 1); + ucs_snprintf_safe(ucs_log_thread_name, sizeof(ucs_log_thread_name), + "%u", thread_num); } - pthread_spin_lock(&threads_lock); - - for (i = 0; i < threads_count; ++i) { - if (threads[i] == self) { - goto unlock_and_return_i; - } - } - - if (threads_count >= ucs_static_array_size(threads)) { - i = -1; - goto unlock_and_return_i; - } - - i = threads_count; - ++threads_count; - threads[i] = self; - -unlock_and_return_i: - pthread_spin_unlock(&threads_lock); - return i; + return name; } void ucs_log_flush() @@ -239,8 +230,9 @@ ucs_log_default_handler(const char *file, unsigned line, const char *function, { size_t buffer_size = ucs_log_get_buffer_size(); char *saveptr = ""; - char *log_line; + const char *short_file; struct timeval tv; + char *log_line; char *buf; if (!ucs_log_component_is_enabled(level, comp_conf) && (level != UCS_LOG_LEVEL_PRINT)) { @@ -254,12 +246,13 @@ ucs_log_default_handler(const char *file, unsigned line, const char *function, if (level <= ucs_global_opts.log_level_trigger) { ucs_fatal_error_message(file, line, function, buf); } else { + short_file = ucs_basename(file); gettimeofday(&tv, NULL); log_line = strtok_r(buf, "\n", &saveptr); while (log_line != NULL) { - ucs_log_print(buffer_size, ucs_basename(file), line, level, comp_conf, - &tv, log_line); + ucs_log_print(buffer_size, short_file, line, level, comp_conf, &tv, + log_line); log_line = strtok_r(NULL, "\n", &saveptr); } } @@ -286,6 +279,17 @@ void ucs_log_pop_handler() } } +void ucs_log_indent(int delta) +{ + ucs_log_current_indent += delta; + ucs_assert(ucs_log_current_indent >= 0); +} + +int ucs_log_get_current_indent() +{ + return ucs_log_current_indent; +} + unsigned ucs_log_num_handlers() { return ucs_log_handlers_count; @@ -323,8 +327,8 @@ void ucs_log_fatal_error(const char *format, ...) p = buffer; /* Print hostname:pid */ - snprintf(p, buffer_size, "[%s:%-5d:%d:%d] ", ucs_log_hostname, ucs_log_pid, - ucs_log_get_thread_num(), ucs_get_tid()); + snprintf(p, buffer_size, "[%s:%-5d:%s:%d] ", ucs_log_hostname, ucs_log_pid, + ucs_log_get_thread_name(), ucs_get_tid()); buffer_size -= strlen(p); p += strlen(p); @@ -413,8 +417,7 @@ void ucs_log_early_init() ucs_log_file = NULL; ucs_log_file_last_idx = 0; ucs_log_file_close = 0; - threads_count = 0; - pthread_spin_init(&threads_lock, 0); + ucs_log_thread_count = 0; } void ucs_log_init() @@ -462,7 +465,6 @@ void ucs_log_cleanup() if (ucs_log_file_close) { fclose(ucs_log_file); } - pthread_spin_destroy(&threads_lock); ucs_free(ucs_log_file_base_name); ucs_log_file_base_name = NULL; @@ -494,3 +496,13 @@ void ucs_log_print_backtrace(ucs_log_level_t level) ucs_debug_backtrace_destroy(bckt); } + +void ucs_log_set_thread_name(const char *format, ...) +{ + va_list ap; + + va_start(ap, format); + memset(ucs_log_thread_name, 0, sizeof(ucs_log_thread_name)); + vsnprintf(ucs_log_thread_name, sizeof(ucs_log_thread_name) - 1, format, ap); + va_end(ap); +} diff --git a/src/ucs/debug/log_def.h b/src/ucs/debug/log_def.h index 8aa5795c2ef..95964de488f 100644 --- a/src/ucs/debug/log_def.h +++ b/src/ucs/debug/log_def.h @@ -147,7 +147,7 @@ ucs_log_default_handler(const char *file, unsigned line, const char *function, /** * Show a fatal error */ -void ucs_log_fatal_error(const char *format, ...); +void ucs_log_fatal_error(const char *format, ...) 
UCS_F_PRINTF(1, 2); /** @@ -169,6 +169,22 @@ void ucs_log_pop_handler(); unsigned ucs_log_num_handlers(); +/** + * Add indentation to all subsequent log messages. + * + * @param [in] delta How much indentation to add, on top of the current + * indentation level. + * A negative number will reduce the indentation level. + */ +void ucs_log_indent(int delta); + + +/** + * @return Current log indent level. + */ +int ucs_log_get_current_indent(); + + /** * Log backtrace. * @@ -176,6 +192,14 @@ unsigned ucs_log_num_handlers(); */ void ucs_log_print_backtrace(ucs_log_level_t level); + +/** + * Set the name of the current thread, to appear in log messages + * + * @param format Printf-style format string for the thread name + */ +void ucs_log_set_thread_name(const char *format, ...) UCS_F_PRINTF(1, 2); + END_C_DECLS #endif diff --git a/src/ucs/memory/memtype_cache.c b/src/ucs/memory/memtype_cache.c index a0e105760cb..af30a200d6b 100644 --- a/src/ucs/memory/memtype_cache.c +++ b/src/ucs/memory/memtype_cache.c @@ -27,6 +27,14 @@ typedef enum { UCS_MEMTYPE_CACHE_ACTION_REMOVE } ucs_memtype_cache_action_t; + +static UCS_F_ALWAYS_INLINE void +ucs_memory_info_set_unknown(ucs_memory_info_t *mem_info) +{ + mem_info->type = UCS_MEMORY_TYPE_UNKNOWN; + mem_info->sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; +} + static ucs_pgt_dir_t *ucs_memtype_cache_pgt_dir_alloc(const ucs_pgtable_t *pgtable) { void *ptr; @@ -50,10 +58,11 @@ static void ucs_memtype_cache_pgt_dir_release(const ucs_pgtable_t *pgtable, */ static void ucs_memtype_cache_insert(ucs_memtype_cache_t *memtype_cache, ucs_pgt_addr_t start, ucs_pgt_addr_t end, - ucs_memory_type_t mem_type) + const ucs_memory_info_t *mem_info) { ucs_memtype_cache_region_t *region; ucs_status_t status; + char dev_name[64]; int ret; /* Allocate structure for new region */ @@ -71,7 +80,7 @@ static void ucs_memtype_cache_insert(ucs_memtype_cache_t *memtype_cache, region->super.start = start; region->super.end = end; - region->mem_type = mem_type; + region->mem_info = *mem_info; status = UCS_PROFILE_CALL(ucs_pgtable_insert, &memtype_cache->pgtable, &region->super); @@ -82,9 +91,11 @@ return; } - ucs_trace("memtype_cache: insert " UCS_PGT_REGION_FMT " mem_type %s", + ucs_trace("memtype_cache: insert " UCS_PGT_REGION_FMT " mem_type %s dev %s", UCS_PGT_REGION_ARG(&region->super), - ucs_memory_type_names[mem_type]); + ucs_memory_type_names[mem_info->type], + ucs_topo_sys_device_bdf_name(mem_info->sys_dev, dev_name, + sizeof(dev_name))); } static void ucs_memtype_cache_region_collect_callback(const ucs_pgtable_t *pgtable, @@ -98,15 +109,16 @@ } UCS_PROFILE_FUNC_VOID(ucs_memtype_cache_update_internal, - (memtype_cache, address, size, mem_type, action), + (memtype_cache, address, size, mem_info, action), ucs_memtype_cache_t *memtype_cache, const void *address, - size_t size, ucs_memory_type_t mem_type, + size_t size, const ucs_memory_info_t *mem_info, ucs_memtype_cache_action_t action) { ucs_memtype_cache_region_t *region, *tmp; UCS_LIST_HEAD(region_list); ucs_pgt_addr_t start, end, search_start, search_end; ucs_status_t status; + char dev_name[64]; if (!size) { return; } @@ -115,10 +127,12 @@ start = ucs_align_down_pow2((uintptr_t)address, UCS_PGT_ADDR_ALIGN); end = ucs_align_up_pow2 ((uintptr_t)address + size, UCS_PGT_ADDR_ALIGN); - ucs_trace("%s: [0x%lx..0x%lx] mem_type %s", - ((action ==
UCS_MEMTYPE_CACHE_ACTION_SET_MEMTYPE) ? - "update" : "remove"), - start, end, ucs_memory_type_names[mem_type]); + ucs_trace("%s: [0x%lx..0x%lx] mem_type %s dev %s", + (action == UCS_MEMTYPE_CACHE_ACTION_SET_MEMTYPE) ? "update" : + "remove", + start, end, ucs_memory_type_names[mem_info->type], + ucs_topo_sys_device_bdf_name(mem_info->sys_dev, dev_name, + sizeof(dev_name))); if (action == UCS_MEMTYPE_CACHE_ACTION_SET_MEMTYPE) { /* try to find regions that are contiguous and instersected @@ -139,7 +153,7 @@ &region_list); ucs_list_for_each_safe(region, tmp, &region_list, list) { if (action == UCS_MEMTYPE_CACHE_ACTION_SET_MEMTYPE) { - if (region->mem_type == mem_type) { + if (region->mem_info.type == mem_info->type) { /* merge current region with overlapping or adjacent regions * of same memory type */ start = ucs_min(start, region->super.start); @@ -162,13 +176,15 @@ goto out_unlock; } - ucs_trace("memtype_cache: removed " UCS_PGT_REGION_FMT " %s", + ucs_trace("memtype_cache: removed " UCS_PGT_REGION_FMT " %s dev %s", UCS_PGT_REGION_ARG(&region->super), - ucs_memory_type_names[region->mem_type]); + ucs_memory_type_names[region->mem_info.type], + ucs_topo_sys_device_bdf_name(region->mem_info.sys_dev, + dev_name, sizeof(dev_name))); } if (action == UCS_MEMTYPE_CACHE_ACTION_SET_MEMTYPE) { - ucs_memtype_cache_insert(memtype_cache, start, end, mem_type); + ucs_memtype_cache_insert(memtype_cache, start, end, mem_info); } /* slice old regions by the new region, to preserve the previous memory type @@ -178,12 +194,12 @@ if (start > region->super.start) { /* create previous region */ ucs_memtype_cache_insert(memtype_cache, region->super.start, start, - region->mem_type); + &region->mem_info); } if (end < region->super.end) { /* create next region */ ucs_memtype_cache_insert(memtype_cache, end, region->super.end, - region->mem_type); + &region->mem_info); } ucs_free(region); @@ -195,17 +211,19 @@ void ucs_memtype_cache_update(ucs_memtype_cache_t *memtype_cache, const void *address, size_t size, - ucs_memory_type_t mem_type) + const ucs_memory_info_t *mem_info) { - ucs_memtype_cache_update_internal(memtype_cache, address, size, mem_type, + ucs_memtype_cache_update_internal(memtype_cache, address, size, mem_info, UCS_MEMTYPE_CACHE_ACTION_SET_MEMTYPE); } void ucs_memtype_cache_remove(ucs_memtype_cache_t *memtype_cache, const void *address, size_t size) { - ucs_memtype_cache_update_internal(memtype_cache, address, size, - UCS_MEMORY_TYPE_LAST, + ucs_memory_info_t mem_info; + + ucs_memory_info_set_unknown(&mem_info); + ucs_memtype_cache_update_internal(memtype_cache, address, size, &mem_info, UCS_MEMTYPE_CACHE_ACTION_REMOVE); } @@ -213,6 +231,10 @@ static void ucs_memtype_cache_event_callback(ucm_event_type_t event_type, ucm_event_t *event, void *arg) { ucs_memtype_cache_t *memtype_cache = arg; + ucs_memory_info_t mem_info = { + .type = event->mem_type.mem_type, + .sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN + }; ucs_memtype_cache_action_t action; if (event_type & UCM_EVENT_MEM_TYPE_ALLOC) { @@ -224,8 +246,7 @@ } ucs_memtype_cache_update_internal(memtype_cache, event->mem_type.address, - event->mem_type.size, - event->mem_type.mem_type, action); + event->mem_type.size, &mem_info, action); } static void
ucs_memtype_cache_purge(ucs_memtype_cache_t *memtype_cache) @@ -243,9 +264,9 @@ static void ucs_memtype_cache_purge(ucs_memtype_cache_t *memtype_cache) } UCS_PROFILE_FUNC(ucs_status_t, ucs_memtype_cache_lookup, - (memtype_cache, address, size, mem_type_p), + (memtype_cache, address, size, mem_info), ucs_memtype_cache_t *memtype_cache, const void *address, - size_t size, ucs_memory_type_t *mem_type_p) + size_t size, ucs_memory_info_t *mem_info) { const ucs_pgt_addr_t start = (uintptr_t)address; ucs_memtype_cache_region_t *region; @@ -261,10 +282,13 @@ UCS_PROFILE_FUNC(ucs_status_t, ucs_memtype_cache_lookup, goto out_unlock; } - region = ucs_derived_of(pgt_region, ucs_memtype_cache_region_t); - *mem_type_p = ((pgt_region->end >= (start + size)) ? - region->mem_type : UCS_MEMORY_TYPE_LAST); - status = UCS_OK; + if (ucs_likely((start + size) <= pgt_region->end)) { + region = ucs_derived_of(pgt_region, ucs_memtype_cache_region_t); + *mem_info = region->mem_info; + } else { + ucs_memory_info_set_unknown(mem_info); + } + status = UCS_OK; out_unlock: pthread_rwlock_unlock(&memtype_cache->lock); diff --git a/src/ucs/memory/memtype_cache.h b/src/ucs/memory/memtype_cache.h index 708f6e144b9..2d25c8d0bb4 100644 --- a/src/ucs/memory/memtype_cache.h +++ b/src/ucs/memory/memtype_cache.h @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -22,10 +23,18 @@ typedef struct ucs_memtype_cache ucs_memtype_cache_t; typedef struct ucs_memtype_cache_region ucs_memtype_cache_region_t; +/* Memory information record */ +typedef struct ucs_memory_info { + uint8_t type; /**< Memory type, use uint8 for compact size */ + ucs_sys_device_t sys_dev; /**< System device index */ +} ucs_memory_info_t; + + struct ucs_memtype_cache_region { ucs_pgt_region_t super; /**< Base class - page table region */ ucs_list_link_t list; /**< List element */ - ucs_memory_type_t mem_type; /**< Memory type the address belongs to */ + ucs_memory_info_t mem_info; /**< Memory type and system device the address + belongs to */ }; @@ -59,16 +68,16 @@ void ucs_memtype_cache_destroy(ucs_memtype_cache_t *memtype_cache); * @param [in] memtype_cache Memtype cache to search. * @param [in] address Address to lookup. * @param [in] size Length of the memory. - * @param [out] mem_type_p Set to the memory type of the address range. - * UCS_MEMORY_TYPE_LAST is a special value which + * @param [out] mem_info Set to the memory info of the address range. + * UCS_MEMORY_TYPE_UNKNOWN is a special value which * means the memory type is an unknown non-host * memory, and should be detected in another way. * * @return Error code. */ -ucs_status_t -ucs_memtype_cache_lookup(ucs_memtype_cache_t *memtype_cache, const void *address, - size_t size, ucs_memory_type_t *mem_type_p); +ucs_status_t ucs_memtype_cache_lookup(ucs_memtype_cache_t *memtype_cache, + const void *address, size_t size, + ucs_memory_info_t *mem_info); /** @@ -79,12 +88,12 @@ ucs_memtype_cache_lookup(ucs_memtype_cache_t *memtype_cache, const void *address * @param [in] memtype_cache Memtype cache to update. * @param [in] address Start address to update. * @param [in] size Size of the memory to update. - * @param [out] mem_type Set the memory type of the address range to this + * @param [in] mem_info Set the memory info of the address range to this * value. 
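+ * + * Example (an illustrative sketch; it assumes the range is known to be CUDA + * memory, and UCS_MEMORY_TYPE_CUDA comes from the existing + * ucs_memory_type_t enum, not from this header): + * @code{.c} + * ucs_memory_info_t mem_info; + * + * mem_info.type    = UCS_MEMORY_TYPE_CUDA; + * mem_info.sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; + * ucs_memtype_cache_update(memtype_cache, address, size, &mem_info); + * @endcode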
*/ void ucs_memtype_cache_update(ucs_memtype_cache_t *memtype_cache, const void *address, size_t size, - ucs_memory_type_t mem_type); + const ucs_memory_info_t *mem_info); /** diff --git a/src/ucs/memory/rcache.c b/src/ucs/memory/rcache.c index c61c8407078..c2595368b5e 100644 --- a/src/ucs/memory/rcache.c +++ b/src/ucs/memory/rcache.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "rcache.h" @@ -276,7 +277,8 @@ static void ucs_rcache_region_collect_callback(const ucs_pgtable_t *pgtable, { ucs_rcache_region_t *region = ucs_derived_of(pgt_region, ucs_rcache_region_t); ucs_list_link_t *list = arg; - ucs_list_add_tail(list, &region->list); + + ucs_list_add_tail(list, &region->tmp_list); } /* Lock must be held */ @@ -288,6 +290,52 @@ static void ucs_rcache_find_regions(ucs_rcache_t *rcache, ucs_pgt_addr_t from, ucs_rcache_region_collect_callback, list); } +/* LRU spinlock must be held */ +static inline void +ucs_rcache_region_lru_add(ucs_rcache_t *rcache, ucs_rcache_region_t *region) +{ + if (region->lru_flags & UCS_RCACHE_LRU_FLAG_IN_LRU) { + return; + } + + ucs_rcache_region_trace(rcache, region, "lru add"); + ucs_list_add_tail(&rcache->lru.list, &region->lru_list); + ++rcache->lru.count; + region->lru_flags |= UCS_RCACHE_LRU_FLAG_IN_LRU; +} + +/* LRU spinlock must be held */ +static inline void +ucs_rcache_region_lru_remove(ucs_rcache_t *rcache, ucs_rcache_region_t *region) +{ + if (!(region->lru_flags & UCS_RCACHE_LRU_FLAG_IN_LRU)) { + return; + } + + ucs_rcache_region_trace(rcache, region, "lru remove"); + ucs_list_del(&region->lru_list); + --rcache->lru.count; + region->lru_flags &= ~UCS_RCACHE_LRU_FLAG_IN_LRU; +} + +static void +ucs_rcache_region_lru_get(ucs_rcache_t *rcache, ucs_rcache_region_t *region) +{ + /* A used region cannot be evicted */ + ucs_spin_lock(&rcache->lru.lock); + ucs_rcache_region_lru_remove(rcache, region); + ucs_spin_unlock(&rcache->lru.lock); +} + +static void +ucs_rcache_region_lru_put(ucs_rcache_t *rcache, ucs_rcache_region_t *region) +{ + /* When we finish using a region, it's a candidate for LRU eviction */ + ucs_spin_lock(&rcache->lru.lock); + ucs_rcache_region_lru_add(rcache, region); + ucs_spin_unlock(&rcache->lru.lock); +} + /* Lock must be held in write mode */ static void ucs_mem_region_destroy_internal(ucs_rcache_t *rcache, ucs_rcache_region_t *region) @@ -300,8 +348,11 @@ static void ucs_mem_region_destroy_internal(ucs_rcache_t *rcache, if (region->flags & UCS_RCACHE_REGION_FLAG_REGISTERED) { UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_DEREGS, 1); - UCS_PROFILE_CODE("mem_dereg") { - rcache->params.ops->mem_dereg(rcache->params.context, rcache, region); + { + UCS_PROFILE_CODE("mem_dereg") { + rcache->params.ops->mem_dereg(rcache->params.context, rcache, + region); + } } } @@ -310,6 +361,13 @@ static void ucs_mem_region_destroy_internal(ucs_rcache_t *rcache, ucs_free(ucs_rcache_region_pfn_ptr(region)); } + ucs_spin_lock(&rcache->lru.lock); + ucs_rcache_region_lru_remove(rcache, region); + ucs_spin_unlock(&rcache->lru.lock); + + --rcache->num_regions; + rcache->total_size -= region->super.end - region->super.start; + ucs_free(region); } @@ -329,7 +387,7 @@ static inline void ucs_rcache_region_put_internal(ucs_rcache_t *rcache, /* Put the region on garbage collection list */ ucs_spin_lock(&rcache->lock); ucs_rcache_region_trace(rcache, region, "put on GC list", flags); - ucs_list_add_tail(&rcache->gc_list, &region->list); + ucs_list_add_tail(&rcache->gc_list, &region->tmp_list); ucs_spin_unlock(&rcache->lock); return; } @@ -382,7 +440,7 @@
static void ucs_rcache_invalidate_range(ucs_rcache_t *rcache, ucs_pgt_addr_t sta ucs_trace_func("rcache=%s, start=0x%lx, end=0x%lx", rcache->name, start, end); ucs_rcache_find_regions(rcache, start, end - 1, ®ion_list); - ucs_list_for_each_safe(region, tmp, ®ion_list, list) { + ucs_list_for_each_safe(region, tmp, ®ion_list, tmp_list) { /* all regions on the list are in the page table */ ucs_rcache_region_invalidate(rcache, region, flags | UCS_RCACHE_REGION_PUT_FLAG_IN_PGTABLE); @@ -427,7 +485,7 @@ static void ucs_rcache_check_gc_list(ucs_rcache_t *rcache) ucs_spin_lock(&rcache->lock); while (!ucs_list_is_empty(&rcache->gc_list)) { region = ucs_list_extract_head(&rcache->gc_list, ucs_rcache_region_t, - list); + tmp_list); /* We need to drop the lock since the following code may trigger memory * operations, which could trigger vm_unmapped event which also takes @@ -507,7 +565,7 @@ static void ucs_rcache_purge(ucs_rcache_t *rcache) ucs_list_head_init(®ion_list); ucs_pgtable_purge(&rcache->pgtable, ucs_rcache_region_collect_callback, ®ion_list); - ucs_list_for_each_safe(region, tmp, ®ion_list, list) { + ucs_list_for_each_safe(region, tmp, ®ion_list, tmp_list) { if (region->flags & UCS_RCACHE_REGION_FLAG_PGTABLE) { region->flags &= ~UCS_RCACHE_REGION_FLAG_PGTABLE; ucs_atomic_add32(®ion->refcount, (uint32_t)-1); @@ -519,6 +577,55 @@ static void ucs_rcache_purge(ucs_rcache_t *rcache) } } +/* Lock must be held in write mode */ +static void ucs_rcache_lru_evict(ucs_rcache_t *rcache) +{ + int num_evicted, num_skipped; + ucs_rcache_region_t *region; + + num_evicted = 0; + num_skipped = 0; + + ucs_spin_lock(&rcache->lru.lock); + while (!ucs_list_is_empty(&rcache->lru.list) && + ((rcache->num_regions > rcache->params.max_regions) || + (rcache->total_size > rcache->params.max_size))) { + region = ucs_list_head(&rcache->lru.list, ucs_rcache_region_t, + lru_list); + ucs_assert(region->lru_flags & UCS_RCACHE_LRU_FLAG_IN_LRU); + + if (!(region->flags & UCS_RCACHE_REGION_FLAG_PGTABLE) || + (region->refcount > 1)) { + /* region is in use or not in page table - remove from lru */ + ucs_rcache_region_lru_remove(rcache, region); + ++num_skipped; + continue; + } + + ucs_spin_unlock(&rcache->lru.lock); + + /* The region is expected to have refcount=1 and present in pgt, so it + * would be destroyed immediately by this function + */ + ucs_rcache_region_trace(rcache, region, "evict"); + ucs_rcache_region_invalidate( + rcache, region, + UCS_RCACHE_REGION_PUT_FLAG_MUST_DESTROY | + UCS_RCACHE_REGION_PUT_FLAG_IN_PGTABLE); + ++num_evicted; + + ucs_spin_lock(&rcache->lru.lock); + } + + ucs_spin_unlock(&rcache->lru.lock); + + if (num_evicted > 0) { + ucs_debug("evicted %d regions, skipped %d regions, usage: %lu (%lu)", + num_evicted, num_skipped, rcache->num_regions, + rcache->params.max_regions); + } +} + static inline int ucs_rcache_region_test(ucs_rcache_region_t *region, int prot) { return (region->flags & UCS_RCACHE_REGION_FLAG_REGISTERED) && @@ -545,8 +652,7 @@ ucs_rcache_check_overlap(ucs_rcache_t *rcache, ucs_pgt_addr_t *start, /* TODO check if any of the regions is locked */ - ucs_list_for_each_safe(region, tmp, ®ion_list, list) { - + ucs_list_for_each_safe(region, tmp, ®ion_list, tmp_list) { if ((*start >= region->super.start) && (*end <= region->super.end) && ucs_rcache_region_test(region, *prot)) { @@ -716,9 +822,15 @@ ucs_rcache_create_region(ucs_rcache_t *rcache, void *address, size_t length, */ UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_REGS, 1); - region->prot = prot; - region->flags = 
UCS_RCACHE_REGION_FLAG_PGTABLE; - region->refcount = 1; + region->prot = prot; + region->flags = UCS_RCACHE_REGION_FLAG_PGTABLE; + region->lru_flags = 0; + region->refcount = 1; + region->status = UCS_INPROGRESS; + + ++rcache->num_regions; + rcache->total_size += region->super.end - region->super.start; + region->status = status = UCS_PROFILE_NAMED_CALL("mem_reg", rcache->params.ops->mem_reg, rcache->params.context, rcache, arg, region, @@ -728,7 +840,7 @@ ucs_rcache_create_region(ucs_rcache_t *rcache, void *address, size_t length, /* failure may be due to merge, because memory of the merged * regions has different access permission. * Retry with original address: there will be no merge because - * all merged regions has been invalidated and registration will + * all merged regions have been invalidated and registration will * succeed. */ ucs_debug("failed to register merged region " UCS_PGT_REGION_FMT ": %s, retrying", @@ -754,6 +866,8 @@ ucs_rcache_create_region(ucs_rcache_t *rcache, void *address, size_t length, ucs_free(region); goto out_unlock; } + + ucs_rcache_lru_evict(rcache); } UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_MISSES, 1); @@ -795,6 +909,7 @@ ucs_status_t ucs_rcache_get(ucs_rcache_t *rcache, void *address, size_t length, { ucs_rcache_region_hold(rcache, region); ucs_rcache_region_validate_pfn(rcache, region); + ucs_rcache_region_lru_get(rcache, region); *region_p = region; UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_HITS_FAST, 1); pthread_rwlock_unlock(&rcache->pgt_lock); @@ -815,6 +930,7 @@ ucs_status_t ucs_rcache_get(ucs_rcache_t *rcache, void *address, size_t length, void ucs_rcache_region_put(ucs_rcache_t *rcache, ucs_rcache_region_t *region) { + ucs_rcache_region_lru_put(rcache, region); ucs_rcache_region_put_internal(rcache, region, UCS_RCACHE_REGION_PUT_FLAG_TAKE_PGLOCK); UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_PUTS, 1); @@ -884,6 +1000,52 @@ static void ucs_rcache_global_list_remove(ucs_rcache_t *rcache) { pthread_mutex_unlock(&ucs_rcache_global_list_lock); } +static void ucs_rcache_vfs_show_inv_q_length(void *obj, + ucs_string_buffer_t *strb, + void *arg_ptr, uint64_t arg_u64) +{ + ucs_rcache_t *rcache = obj; + size_t rcache_inv_q_length; + + ucs_spin_lock(&rcache->lock); + rcache_inv_q_length = ucs_queue_length(&rcache->inv_q); + ucs_spin_unlock(&rcache->lock); + + ucs_string_buffer_appendf(strb, "%zu\n", rcache_inv_q_length); +} + +static void ucs_rcache_vfs_show_gc_list_length(void *obj, + ucs_string_buffer_t *strb, + void *arg_ptr, uint64_t arg_u64) +{ + ucs_rcache_t *rcache = obj; + unsigned long rcache_gc_list_length; + + ucs_spin_lock(&rcache->lock); + rcache_gc_list_length = ucs_list_length(&rcache->gc_list); + ucs_spin_unlock(&rcache->lock); + + ucs_string_buffer_appendf(strb, "%lu\n", rcache_gc_list_length); +} + +static void ucs_rcache_vfs_init(ucs_rcache_t *rcache) +{ + ucs_vfs_obj_add_dir(NULL, rcache, "ucs/rcache/%s", rcache->name); + ucs_vfs_obj_add_ro_file(rcache, ucs_vfs_show_primitive, + &rcache->num_regions, UCS_VFS_TYPE_ULONG, + "num_regions"); + ucs_vfs_obj_add_ro_file(rcache, ucs_vfs_show_primitive, &rcache->total_size, + UCS_VFS_TYPE_SIZET, "total_size"); + ucs_vfs_obj_add_ro_file(rcache, ucs_vfs_show_ulunits, + &rcache->params.max_regions, 0, "max_regions"); + ucs_vfs_obj_add_ro_file(rcache, ucs_vfs_show_memunits, + &rcache->params.max_size, 0, "max_size"); + ucs_vfs_obj_add_ro_file(rcache, ucs_rcache_vfs_show_inv_q_length, NULL, 0, + "inv_q/length"); + ucs_vfs_obj_add_ro_file(rcache, 
ucs_rcache_vfs_show_gc_list_length, NULL, 0, + "gc_list/length"); +} + static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params, const char *name, ucs_stats_node_t *stats_parent) { @@ -949,6 +1111,11 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params, ucs_queue_head_init(&self->inv_q); ucs_list_head_init(&self->gc_list); + self->lru.count = 0; + self->num_regions = 0; + self->total_size = 0; + ucs_list_head_init(&self->lru.list); + ucs_spinlock_init(&self->lru.lock, 0); status = ucm_set_event_handler(params->ucm_events, params->ucm_event_priority, ucs_rcache_unmapped_callback, self); @@ -961,6 +1128,8 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params, goto err_unset_event; } + ucs_rcache_vfs_init(self); + return UCS_OK; err_unset_event: @@ -984,6 +1153,7 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params, static UCS_CLASS_CLEANUP_FUNC(ucs_rcache_t) { + ucs_vfs_obj_remove(self); ucs_rcache_global_list_remove(self); ucm_unset_event_handler(self->params.ucm_events, ucs_rcache_unmapped_callback, self); @@ -991,6 +1161,18 @@ static UCS_CLASS_CLEANUP_FUNC(ucs_rcache_t) ucs_rcache_check_gc_list(self); ucs_rcache_purge(self); + if (self->lru.count > 0) { + ucs_assert(!ucs_list_is_empty(&self->lru.list)); + ucs_warn( + "rcache %s: %lu regions remained on lru list, first region: %p", + self->name, self->lru.count, + ucs_list_head(&self->lru.list, ucs_rcache_region_t, lru_list)); + } else { + ucs_assert(ucs_list_is_empty(&self->lru.list)); + } + + ucs_spinlock_destroy(&self->lru.lock); + ucs_mpool_cleanup(&self->mp, 1); ucs_pgtable_cleanup(&self->pgtable); ucs_spinlock_destroy(&self->lock); diff --git a/src/ucs/memory/rcache.h b/src/ucs/memory/rcache.h index 655cab4e675..21472ba44a4 100644 --- a/src/ucs/memory/rcache.h +++ b/src/ucs/memory/rcache.h @@ -36,7 +36,7 @@ typedef struct ucs_rcache_region ucs_rcache_region_t; */ enum { UCS_RCACHE_REGION_FLAG_REGISTERED = UCS_BIT(0), /**< Memory registered */ - UCS_RCACHE_REGION_FLAG_PGTABLE = UCS_BIT(1) /**< In the page table */ + UCS_RCACHE_REGION_FLAG_PGTABLE = UCS_BIT(1), /**< In the page table */ }; /* @@ -54,6 +54,14 @@ enum { UCS_RCACHE_FLAG_PURGE_ON_FORK = UCS_BIT(1), /**< purge rcache on fork */ }; +/* + * Rcache LRU flags. + */ +enum { + UCS_RCACHE_LRU_FLAG_IN_LRU = UCS_BIT(0) /**< In LRU */ +}; + + /* * Registration cache operations. */ @@ -122,23 +130,27 @@ struct ucs_rcache_params { void *context; /**< User-defined context that will be passed to mem_reg/mem_dereg */ int flags; /**< Flags */ + unsigned long max_regions; /**< Maximal number of regions */ + size_t max_size; /**< Maximal total size of regions */ }; struct ucs_rcache_region { - ucs_pgt_region_t super; /**< Base class - page table region */ - ucs_list_link_t list; /**< List element */ - volatile uint32_t refcount; /**< Reference count, including +1 if it's - in the page table */ - ucs_status_t status; /**< Current status code */ - uint8_t prot; /**< Protection bits */ - uint16_t flags; /**< Status flags. Protected by page table lock. */ + ucs_pgt_region_t super; /**< Base class - page table region */ + ucs_list_link_t lru_list; /**< LRU list element */ + ucs_list_link_t tmp_list; /**< Temp list element */ + volatile uint32_t refcount; /**< Reference count, including +1 if it's + in the page table */ + ucs_status_t status; /**< Current status code */ + uint8_t prot; /**< Protection bits */ + uint8_t flags; /**< Status flags. Protected by page table lock. 
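The two new limit fields above drive the eviction path. A hedged sketch of how a caller might fill them; the remaining ucs_rcache_params_t fields follow the existing API, and 'my_ops' is a placeholder ops table:

```c
ucs_rcache_params_t params = {0};

/* ... fill region_struct_size, ucm_events, etc. as before (not shown) ... */
params.ops         = &my_ops;         /* placeholder mem_reg/mem_dereg ops */
params.context     = NULL;
params.flags       = 0;
params.max_regions = 1024;            /* start evicting beyond 1024 regions */
params.max_size    = 256 * UCS_MBYTE; /* or beyond 256MB registered in total */
```

Once either limit is exceeded, ucs_rcache_lru_evict() destroys the least recently used regions that sit in the page table and are not referenced.
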
*/ + uint8_t lru_flags; /**< LRU flags */ union { - uint64_t priv; /**< Used internally */ - unsigned long *pfn; /**< Pointer to PFN array. In case if requested - evaluation more than 1 page - PFN array is - allocated, if 1 page requested - used - in-place priv value. */ + uint64_t priv; /**< Used internally */ + unsigned long *pfn; /**< Pointer to PFN array. In case if requested + evaluation more than 1 page - PFN array is + allocated, if 1 page requested - used + in-place priv value. */ }; }; diff --git a/src/ucs/memory/rcache_int.h b/src/ucs/memory/rcache_int.h index e1ebc26591f..7e19344e04e 100644 --- a/src/ucs/memory/rcache_int.h +++ b/src/ucs/memory/rcache_int.h @@ -7,6 +7,7 @@ #ifndef UCS_REG_CACHE_INT_H_ #define UCS_REG_CACHE_INT_H_ +#include #include @@ -28,34 +29,47 @@ enum { struct ucs_rcache { - ucs_rcache_params_t params; /**< rcache parameters (immutable) */ - - pthread_rwlock_t pgt_lock; /**< Protects the page table and all - regions whose refcount is 0 */ - ucs_pgtable_t pgtable; /**< page table to hold the regions */ - - - ucs_spinlock_t lock; /**< Protects 'mp', 'inv_q' and 'gc_list'. - This is a separate lock because we - may want to invalidate regions - while the page table lock is held by - the calling context. - @note: This lock should always be - taken **after** 'pgt_lock'. */ - ucs_mpool_t mp; /**< Memory pool to allocate entries for - inv_q and page table entries, since - we cannot use regular malloc(). - The backing storage is original mmap() - which does not generate memory events */ - ucs_queue_head_t inv_q; /**< Regions which were invalidated during - memory events */ - ucs_list_link_t gc_list; /**< list for regions to destroy, regions - could not be destroyed from memhook */ - - char *name; /**< Name of the cache, for debug purpose */ + ucs_rcache_params_t params; /**< rcache parameters (immutable) */ + + pthread_rwlock_t pgt_lock; /**< Protects the page table and all + regions whose refcount is 0 */ + ucs_pgtable_t pgtable; /**< page table to hold the regions */ + + + ucs_spinlock_t lock; /**< Protects 'mp', 'inv_q' and 'gc_list'. + This is a separate lock because we + may want to invalidate regions + while the page table lock is held by + the calling context. + @note: This lock should always be + taken **after** 'pgt_lock'. */ + ucs_mpool_t mp; /**< Memory pool to allocate entries for + inv_q and page table entries, since + we cannot use regular malloc(). + The backing storage is original mmap() + which does not generate memory events */ + ucs_queue_head_t inv_q; /**< Regions which were invalidated during + memory events */ + ucs_list_link_t gc_list; /**< list for regions to destroy, regions + could not be destroyed from memhook */ + + unsigned long num_regions; /**< Total number of managed regions */ + size_t total_size; /**< Total size of registered memory */ + + struct { + ucs_spinlock_t lock; /**< Lock for this structure */ + ucs_list_link_t list; /**< List of regions, sorted by usage: + The head of the list is the least + recently used region, and the tail + is the most recently used region. 
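The LRU ordering described above is maintained by the get/put pair: holding a region pulls it off the list, releasing it appends it at the tail. A minimal usage sketch (prot/arg values are illustrative):

```c
ucs_rcache_region_t *region;
ucs_status_t status;

/* ucs_rcache_get() calls ucs_rcache_region_lru_get() internally, so the
 * region cannot be evicted while it is held */
status = ucs_rcache_get(rcache, address, length, PROT_READ | PROT_WRITE,
                        NULL, &region);
if (status != UCS_OK) {
    return status;
}

/* ... use the registered region ... */

/* ucs_rcache_region_put() re-adds the region at the LRU tail, making it the
 * most recently used and last in line for eviction */
ucs_rcache_region_put(rcache, region);
```
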
*/ + unsigned long count; /**< Number of regions on list */ + } lru; + + char *name; /**< Name of the cache, for debug purpose */ + UCS_STATS_NODE_DECLARE(stats) - ucs_list_link_t list; /**< list entry in global ucs_rcache list */ + ucs_list_link_t list; /**< list entry in global ucs_rcache list */ }; #endif diff --git a/src/ucs/profile/profile.c b/src/ucs/profile/profile.c index 80dcc0008cd..d336bd362ae 100644 --- a/src/ucs/profile/profile.c +++ b/src/ucs/profile/profile.c @@ -11,7 +11,7 @@ #include "profile.h" #include -#include +#include #include #include #include diff --git a/src/ucs/profile/profile_on.h b/src/ucs/profile/profile_on.h index d17a14a66fa..c7b32148d89 100644 --- a/src/ucs/profile/profile_on.h +++ b/src/ucs/profile/profile_on.h @@ -115,9 +115,11 @@ BEGIN_C_DECLS #define UCS_PROFILE_FUNC(_ret_type, _name, _arglist, ...) \ static UCS_F_ALWAYS_INLINE _ret_type _name##_inner(__VA_ARGS__); \ \ - _ret_type _name(__VA_ARGS__) { \ + _ret_type _name(__VA_ARGS__) \ + { \ + _ret_type _ret; \ UCS_PROFILE_SCOPE_BEGIN(); \ - _ret_type _ret = _name##_inner _arglist; \ + _ret = _name##_inner _arglist; \ UCS_PROFILE_SCOPE_END(#_name); \ return _ret; \ } \ diff --git a/src/ucs/stats/libstats.c b/src/ucs/stats/libstats.c index fb65cece10d..3d60439de37 100644 --- a/src/ucs/stats/libstats.c +++ b/src/ucs/stats/libstats.c @@ -36,7 +36,7 @@ static ucs_status_t ucs_stats_name_check(const char *name) return UCS_ERR_INVALID_PARAM; } - return UCS_OK;; + return UCS_OK; } ucs_status_t ucs_stats_node_initv(ucs_stats_node_t *node, ucs_stats_class_t *cls, diff --git a/src/ucs/stats/stats.c b/src/ucs/stats/stats.c index 7618c1bc2dc..3491d7f3fa7 100644 --- a/src/ucs/stats/stats.c +++ b/src/ucs/stats/stats.c @@ -501,6 +501,8 @@ static void* ucs_stats_thread_func(void *arg) unsigned flags; long nsec; + ucs_log_set_thread_name("stats"); + if (ucs_stats_context.interval > 0) { nsec = (long)(ucs_stats_context.interval * UCS_NSEC_PER_SEC + 0.5); timeout.tv_sec = nsec / UCS_NSEC_PER_SEC; diff --git a/src/ucs/sys/compiler.h b/src/ucs/sys/compiler.h index 96cc7fc0200..9be16ff7f3a 100644 --- a/src/ucs/sys/compiler.h +++ b/src/ucs/sys/compiler.h @@ -56,8 +56,6 @@ } \ } -#define UCS_ALLOCA_MAX_SIZE 1200 - /** * alloca which makes sure the size is small enough. */ diff --git a/src/ucs/sys/compiler_def.h b/src/ucs/sys/compiler_def.h index 6a8cee67114..9e9b29e12a6 100644 --- a/src/ucs/sys/compiler_def.h +++ b/src/ucs/sys/compiler_def.h @@ -26,6 +26,9 @@ #define UCS_STATIC_ASSERT(_cond) \ switch(0) {case 0:case (_cond):;} +/* Maximal allocation size for on-stack buffers */ +#define UCS_ALLOCA_MAX_SIZE 1200 + /* Aliasing structure */ #define UCS_S_MAY_ALIAS __attribute__((may_alias)) diff --git a/src/ucs/sys/event_set.h b/src/ucs/sys/event_set.h index ca334024ff3..550f4404555 100644 --- a/src/ucs/sys/event_set.h +++ b/src/ucs/sys/event_set.h @@ -11,6 +11,7 @@ #include +BEGIN_C_DECLS /** * ucs_sys_event_set_t structure used in ucs_event_set_XXX functions. 
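The UCS_PROFILE_FUNC rework above hoists the _ret declaration so the expansion stays valid where declarations may not follow statements. For reference, a toy use of the macro (names are illustrative); the body that follows the invocation becomes the always-inline _inner function:

```c
/* Defines a profiled wrapper 'my_sum' around an inlined 'my_sum_inner' */
UCS_PROFILE_FUNC(int, my_sum, (a, b), int a, int b)
{
    return a + b;
}
```
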
@@ -145,4 +146,6 @@ void ucs_event_set_cleanup(ucs_sys_event_set_t *event_set);
 ucs_status_t ucs_event_set_fd_get(ucs_sys_event_set_t *event_set,
                                   int *event_fd_p);
 
+END_C_DECLS
+
 #endif
diff --git a/src/ucs/sys/init.c b/src/ucs/sys/init.c
index 1d228af4b63..64c26ff0065 100644
--- a/src/ucs/sys/init.c
+++ b/src/ucs/sys/init.c
@@ -9,9 +9,10 @@
 #endif
 
 #include
+#include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -78,6 +79,12 @@ static UCS_F_NOOPTIMIZE void ucs_check_cpu_flags(void)
     }
 }
 
+static void ucs_modules_load()
+{
+    UCS_MODULE_FRAMEWORK_DECLARE(ucs);
+    UCS_MODULE_FRAMEWORK_LOAD(ucs, UCS_MODULE_LOAD_FLAG_GLOBAL);
+}
+
 static void UCS_F_CTOR ucs_init()
 {
     ucs_check_cpu_flags();
@@ -97,6 +104,7 @@ static void UCS_F_CTOR ucs_init()
     ucs_debug("%s loaded at 0x%lx", ucs_debug_get_lib_path(),
               ucs_debug_get_lib_base_addr());
     ucs_debug("cmd line: %s", ucs_get_process_cmdline());
+    ucs_modules_load();
 }
 
 static void UCS_F_DTOR ucs_cleanup(void)
diff --git a/src/ucs/sys/math.h b/src/ucs/sys/math.h
index 115ca401619..df31d4bdebb 100644
--- a/src/ucs/sys/math.h
+++ b/src/ucs/sys/math.h
@@ -100,6 +100,12 @@ static inline double ucs_log2(double x)
     return log(x) / log(2.0);
 }
 
+static UCS_F_ALWAYS_INLINE size_t ucs_double_to_sizet(double value, size_t max)
+{
+    double round_value = value + 0.5;
+    return (round_value < (double)max) ? ((size_t)round_value) : max;
+}
+
 /**
  * Convert flags without a branch
  * @return '_newflag' if '_oldflag' is set in '_value', otherwise - 0
@@ -148,11 +154,37 @@ static inline double ucs_log2(double x)
 #define UCS_CIRCULAR_COMPARE32(__a, __op, __b) UCS_CIRCULAR_COMPARE(__a, __op, __b, int32_t)
 #define UCS_CIRCULAR_COMPARE64(__a, __op, __b) UCS_CIRCULAR_COMPARE(__a, __op, __b, int64_t)
 
-#define ucs_for_each_bit(_index, _map) \
+
+/**
+ * Iterate over all the bits that are set in the bitmap '_map'
+ */
+#define ucs_for_each_bit(_index, _map) \
     for ((_index) = ucs_ffs64_safe(_map); (_index) < 64; \
          (_index) = ucs_ffs64_safe((uint64_t)(_map) & (-2ull << (uint64_t)(_index))))
 
+
+/**
+ * Generate all sub-masks of the given mask, from 0 to _mask inclusive.
+ *
+ * @param _submask Variable to iterate over the sub-masks
+ * @param _mask    Generate sub-masks of this value
+ */
+#define ucs_for_each_submask(_submask, _mask) \
+    for (/* start with 0 */ \
+         (_submask) = 0; \
+         /* end when reaching _mask + 1 */ \
+         (_submask) <= (_mask); \
+         /* Increment _submask by 1. If it became larger than _mask, do nothing \
+          * here, and next condition check will exit the loop. Otherwise, add \
+          * ~mask to fast-forward the carry (from ++ operation) to the next \
+          * valid bit in _mask, and then do "& _mask" to remove any bits which \
+          * are not in the mask. \
+          */ \
+         (_submask)++, \
+         ((_submask) <= (_mask)) ?
\ + ((_submask) = ((_submask )+ ~(_mask)) & (_mask)) : 0) + + /* * Generate a large prime number */ diff --git a/src/ucs/sys/sock.c b/src/ucs/sys/sock.c index 2aa937eb0b9..ab96f3a2899 100644 --- a/src/ucs/sys/sock.c +++ b/src/ucs/sys/sock.c @@ -66,7 +66,7 @@ void ucs_close_fd(int *fd_p) int ucs_netif_flags_is_active(unsigned int flags) { - return (flags & IFF_UP) && (flags & IFF_RUNNING) && !(flags & IFF_LOOPBACK); + return (flags & IFF_UP) && (flags & IFF_RUNNING); } ucs_status_t ucs_netif_ioctl(const char *if_name, unsigned long request, @@ -365,7 +365,7 @@ ucs_status_t ucs_socket_set_buffer_size(int fd, size_t sockopt_sndbuf, ucs_status_t ucs_socket_server_init(const struct sockaddr *saddr, socklen_t socklen, int backlog, int silent_err_in_use, - int allow_addr_inuse, int *listen_fd) + int reuse_addr, int *listen_fd) { int so_reuse_optval = 1; char ip_port_str[UCS_SOCKADDR_STRING_LEN]; @@ -374,6 +374,7 @@ ucs_status_t ucs_socket_server_init(const struct sockaddr *saddr, socklen_t sock int ret, fd; /* Create the server socket for accepting incoming connections */ + fd = -1; /* Suppress compiler warning */ status = ucs_socket_create(saddr->sa_family, SOCK_STREAM, &fd); if (status != UCS_OK) { goto err; @@ -385,7 +386,7 @@ ucs_status_t ucs_socket_server_init(const struct sockaddr *saddr, socklen_t sock goto err_close_socket; } - if (allow_addr_inuse) { + if (reuse_addr) { status = ucs_socket_setopt(fd, SOL_SOCKET, SO_REUSEADDR, &so_reuse_optval, sizeof(so_reuse_optval)); if (status != UCS_OK) { @@ -637,6 +638,38 @@ const void *ucs_sockaddr_get_inet_addr(const struct sockaddr *addr) } } +ucs_status_t ucs_sockaddr_set_inet_addr(struct sockaddr *addr, + const void *in_addr) +{ + switch (addr->sa_family) { + case AF_INET: + memcpy(&UCS_SOCKET_INET_ADDR(addr), in_addr, UCS_IPV4_ADDR_LEN); + return UCS_OK; + case AF_INET6: + memcpy(&UCS_SOCKET_INET6_ADDR(addr), in_addr, UCS_IPV6_ADDR_LEN); + return UCS_OK; + default: + ucs_error("unknown address family: %d", addr->sa_family); + return UCS_ERR_INVALID_PARAM; + } +} + +ucs_status_t ucs_sockaddr_inet_addr_sizeof(const struct sockaddr *addr, + size_t *size_p) +{ + switch (addr->sa_family) { + case AF_INET: + *size_p = UCS_IPV4_ADDR_LEN; + return UCS_OK; + case AF_INET6: + *size_p = UCS_IPV6_ADDR_LEN; + return UCS_OK; + default: + ucs_error("unknown address family: %d", addr->sa_family); + return UCS_ERR_INVALID_PARAM; + } +} + int ucs_sockaddr_is_known_af(const struct sockaddr *sa) { return ((sa->sa_family == AF_INET) || @@ -649,14 +682,19 @@ const char* ucs_sockaddr_str(const struct sockaddr *sock_addr, uint16_t port; size_t str_len; + if (sock_addr == NULL) { + ucs_strncpy_zero(str, "", max_size); + return str; + } + if (!ucs_sockaddr_is_known_af(sock_addr)) { ucs_strncpy_zero(str, "", max_size); return str; } - if (!inet_ntop(sock_addr->sa_family, ucs_sockaddr_get_inet_addr(sock_addr), - str, max_size)) { - ucs_strncpy_zero(str, "", max_size); + if (ucs_sockaddr_get_ipstr(sock_addr, str, max_size) != UCS_OK) { + ucs_strncpy_zero(str, "", + max_size); return str; } @@ -672,6 +710,33 @@ const char* ucs_sockaddr_str(const struct sockaddr *sock_addr, return str; } +ucs_status_t ucs_sock_ipstr_to_sockaddr(const char *ip_str, + struct sockaddr_storage *sa_storage) +{ + struct sockaddr_in* sa_in; + struct sockaddr_in6* sa_in6; + int ret; + + /* try IPv4 */ + sa_in = (struct sockaddr_in*)sa_storage; + sa_in->sin_family = AF_INET; + ret = inet_pton(AF_INET, ip_str, &sa_in->sin_addr); + if (ret == 1) { + return UCS_OK; + } + + /* try IPv6 */ + sa_in6 = 
(struct sockaddr_in6*)sa_storage; + sa_in6->sin6_family = AF_INET6; + ret = inet_pton(AF_INET6, ip_str, &sa_in6->sin6_addr); + if (ret == 1) { + return UCS_OK; + } + + ucs_error("invalid address %s", ip_str); + return UCS_ERR_INVALID_ADDR; +} + int ucs_sockaddr_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2, ucs_status_t *status_p) @@ -736,11 +801,11 @@ int ucs_sockaddr_ip_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2) UCS_IPV4_ADDR_LEN : UCS_IPV6_ADDR_LEN); } -int ucs_sockaddr_is_inaddr_any(struct sockaddr *addr) +int ucs_sockaddr_is_inaddr_any(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: - return UCS_SOCKET_INET_ADDR(addr).s_addr == INADDR_ANY; + return UCS_SOCKET_INET_ADDR(addr).s_addr == htonl(INADDR_ANY); case AF_INET6: return !memcmp(&(UCS_SOCKET_INET6_ADDR(addr)), &in6addr_any, sizeof(UCS_SOCKET_INET6_ADDR(addr))); @@ -750,6 +815,20 @@ int ucs_sockaddr_is_inaddr_any(struct sockaddr *addr) } } +int ucs_sockaddr_is_inaddr_loopback(const struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return UCS_SOCKET_INET_ADDR(addr).s_addr == htonl(INADDR_LOOPBACK); + case AF_INET6: + return !memcmp(&(UCS_SOCKET_INET6_ADDR(addr)), &in6addr_loopback, + sizeof(UCS_SOCKET_INET6_ADDR(addr))); + default: + ucs_debug("invalid address family: %d", addr->sa_family); + return 0; + } +} + ucs_status_t ucs_sockaddr_copy(struct sockaddr *dst_addr, const struct sockaddr *src_addr) { @@ -857,3 +936,14 @@ ucs_status_t ucs_sockaddr_get_ip_local_port_range(ucs_range_spec_t *port_range) return UCS_OK; } + +ucs_status_t +ucs_sockaddr_get_ipstr(const struct sockaddr *addr, char *str, size_t max_size) +{ + if (inet_ntop(addr->sa_family, ucs_sockaddr_get_inet_addr(addr), str, + max_size) == NULL) { + return UCS_ERR_INVALID_PARAM; + } + + return UCS_OK; +} diff --git a/src/ucs/sys/sock.h b/src/ucs/sys/sock.h index a9fdab6e39a..224d55d3346 100644 --- a/src/ucs/sys/sock.h +++ b/src/ucs/sys/sock.h @@ -215,7 +215,7 @@ ucs_status_t ucs_socket_set_buffer_size(int fd, size_t sockopt_sndbuf, * for the listen() call. * @param [in] silent_bind Whether or not to print error message on bind * failure with EADDRINUSE. - * @param [in] allow_addr_inuse Whether or not to allow the socket to use an + * @param [in] reuse_addr Whether or not to allow the socket to use an * address that is already in use and was not * released by another socket yet. * @param [out] listen_fd The fd that belongs to the server. @@ -223,7 +223,7 @@ ucs_status_t ucs_socket_set_buffer_size(int fd, size_t sockopt_sndbuf, * @return UCS_OK on success or an error code on failure. */ ucs_status_t ucs_socket_server_init(const struct sockaddr *saddr, socklen_t socklen, - int backlog, int silent_bind, int allow_addr_inuse, + int backlog, int silent_bind, int reuse_addr, int *listen_fd); @@ -322,7 +322,7 @@ ucs_status_t ucs_socket_recv(int fd, void *data, size_t length); * * @param [in] addr Pointer to sockaddr structure. * @param [out] size_p Pointer to variable where size of - * sockaddr_in/sockaddr_in6 structure will be written + * sockaddr_in/sockaddr_in6 structure will be written. * * @return UCS_OK on success or UCS_ERR_INVALID_PARAM on failure. */ @@ -334,7 +334,8 @@ ucs_status_t ucs_sockaddr_sizeof(const struct sockaddr *addr, size_t *size_p); * * @param [in] addr Pointer to sockaddr structure. * @param [out] port_p Pointer to variable where port (host notation) - * of sockaddr_in/sockaddr_in6 structure will be written + * of sockaddr_in/sockaddr_in6 structure will be + * written. 
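A small sketch combining the two new helpers from sock.c above to copy the IP part between two sockaddrs (assumes both already carry the same, known address family):

```c
static ucs_status_t sockaddr_copy_ip(struct sockaddr *dst,
                                     const struct sockaddr *src)
{
    size_t addr_size;
    ucs_status_t status;

    /* Rejects unknown address families up front */
    status = ucs_sockaddr_inet_addr_sizeof(src, &addr_size);
    if (status != UCS_OK) {
        return status;
    }

    ucs_assert((addr_size == UCS_IPV4_ADDR_LEN) ||
               (addr_size == UCS_IPV6_ADDR_LEN));

    /* Copies the address bytes according to dst->sa_family */
    return ucs_sockaddr_set_inet_addr(dst, ucs_sockaddr_get_inet_addr(src));
}
```
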
* * @return UCS_OK on success or UCS_ERR_INVALID_PARAM on failure. */ @@ -345,7 +346,7 @@ ucs_status_t ucs_sockaddr_get_port(const struct sockaddr *addr, uint16_t *port_p * Set port to a given sockaddr structure. * * @param [in] addr Pointer to sockaddr structure. - * @param [in] port Port (host notation) that will be written + * @param [in] port Port (host notation) that will be written. * * @return UCS_OK on success or UCS_ERR_INVALID_PARAM on failure. */ @@ -363,6 +364,31 @@ ucs_status_t ucs_sockaddr_set_port(struct sockaddr *addr, uint16_t port); const void *ucs_sockaddr_get_inet_addr(const struct sockaddr *addr); +/** + * Set IP addr to a given sockaddr structure. + * + * @param [in] addr Pointer to sockaddr structure. + * @param [in] in_addr IP address that will be written. + * + * @return UCS_OK on success or UCS_ERR_INVALID_PARAM on failure. + */ +ucs_status_t ucs_sockaddr_set_inet_addr(struct sockaddr *addr, + const void *in_addr); + + +/** + * Return size of IP address of a given sockaddr structure. + * + * @param [in] addr Pointer to sockaddr structure. + * @param [out] size_p Pointer to variable where size of IP address + * structure will be written. + * + * @return UCS_OK on success or UCS_ERR_INVALID_PARAM on failure. + */ +ucs_status_t ucs_sockaddr_inet_addr_sizeof(const struct sockaddr *addr, + size_t *size_p); + + /** * Extract the IP address from a given sockaddr and return it as a string. * @@ -370,13 +396,27 @@ const void *ucs_sockaddr_get_inet_addr(const struct sockaddr *addr); * @param [out] str A string filled with the IP address. * @param [in] max_size Size of a string (considering '\0'-terminated symbol) * - * @return ip_str if the sock_addr has a valid IP address or 'Invalid address' - * otherwise. + * @return '' if NULL is specified or @a str if the sock_addr has a valid + * IP address or 'Invalid address' otherwise. */ const char* ucs_sockaddr_str(const struct sockaddr *sock_addr, char *str, size_t max_size); +/** + * Extract the IP address from a given string and return it as a sockaddr storage. + * + * @param [in] ip_str A string to take IP address from. + * @param [out] sa_storage sockaddr storage filled with the IP address and + * address family. + * + * @return UCS_OK if @a ip_str has a valid IP address or UCS_ERR_INVALID_ADDR + * otherwise. + */ +ucs_status_t ucs_sock_ipstr_to_sockaddr(const char *ip_str, + struct sockaddr_storage *sa_storage); + + /** * Check if the address family of the given sockaddr is IPv4 or IPv6 * @@ -436,14 +476,26 @@ int ucs_sockaddr_ip_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2); /** - * Indicate if given IP addr is INADDR_ANY (IPV4) or in6addr_any (IPV6) + * Indicate if given IP address is INADDR_ANY (IPV4) or in6addr_any (IPV6) * * @param [in] addr Pointer to sockaddr structure. * * @return 1 if input is INADDR_ANY or in6addr_any * 0 if not */ -int ucs_sockaddr_is_inaddr_any(struct sockaddr *addr); +int ucs_sockaddr_is_inaddr_any(const struct sockaddr *addr); + + +/** + * Indicate if given IP address is INADDR_LOOPBACK (IPV4) or in6addr_loopback + * (IPV6) + * + * @param [in] addr Pointer to sockaddr structure. + * + * @return 1 if input is INADDR_LOOPBACK or in6addr_loopback + * 0 if not + */ +int ucs_sockaddr_is_inaddr_loopback(const struct sockaddr *addr); /** @@ -489,6 +541,20 @@ const char *ucs_sockaddr_address_family_str(sa_family_t af); */ ucs_status_t ucs_sockaddr_get_ip_local_port_range(ucs_range_spec_t *port_range); + +/** + * Get IP address of a given sockaddr structure. 
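The parsing and classification helpers declared above compose naturally; a sketch (the literal address is just an example):

```c
struct sockaddr_storage ss;
char ip_str[UCS_SOCKADDR_STRING_LEN];
struct sockaddr *sa = (struct sockaddr*)&ss;

/* Accepts IPv4 or IPv6 literals and sets the family accordingly */
if ((ucs_sock_ipstr_to_sockaddr("127.0.0.1", &ss) == UCS_OK) &&
    ucs_sockaddr_is_inaddr_loopback(sa) &&
    (ucs_sockaddr_get_ipstr(sa, ip_str, sizeof(ip_str)) == UCS_OK)) {
    ucs_debug("%s is a loopback address", ip_str);
}
```
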
+ * + * @param [in] addr Pointer to the sockaddr structure. + * @param [out] str A string filled with the IP address. + * @param [in] max_size Size of the string including terminating + * null-character. + * + * @return UCS_OK on success or UCS_ERR_INVALID_PARAM on failure. + */ +ucs_status_t +ucs_sockaddr_get_ipstr(const struct sockaddr *addr, char *str, size_t max_size); + END_C_DECLS #endif diff --git a/src/ucs/sys/string.c b/src/ucs/sys/string.c index a0f2c222b1e..1e9826bb615 100644 --- a/src/ucs/sys/string.c +++ b/src/ucs/sys/string.c @@ -14,11 +14,13 @@ #include #include #include +#include #include #include #include #include +#include const char *ucs_memunits_suffixes[] = {"", "K", "M", "G", "T", "P", "E", NULL}; @@ -157,9 +159,16 @@ const char *ucs_memunits_range_str(size_t range_start, size_t range_end, { char buf_start[64], buf_end[64]; - snprintf(buf, max, "%s..%s", - ucs_memunits_to_str(range_start, buf_start, sizeof(buf_start)), - ucs_memunits_to_str(range_end, buf_end, sizeof(buf_end))); + if (range_start == range_end) { + snprintf(buf, max, "%s", + ucs_memunits_to_str(range_start, buf_start, + sizeof(buf_start))); + } else { + snprintf(buf, max, "%s..%s", + ucs_memunits_to_str(range_start, buf_start, sizeof(buf_start)), + ucs_memunits_to_str(range_end, buf_end, sizeof(buf_end))); + } + return buf; } @@ -199,6 +208,17 @@ ucs_status_t ucs_str_to_memunits(const char *buf, void *dest) return UCS_OK; } +char *ucs_dirname(char *path, int num_layers) +{ + while (num_layers-- > 0) { + path = dirname(path); + if (path == NULL) { + return NULL; + } + } + return path; +} + void ucs_snprintf_safe(char *buf, size_t size, const char *fmt, ...) { va_list ap; @@ -306,32 +326,72 @@ const char* ucs_flags_str(char *buf, size_t max, return buf; } +size_t ucs_string_count_char(const char *str, char c) +{ + size_t count = 0; + const char *p; + + for (p = str; *p != '\0'; ++p) { + if (*p == c) { + count++; + } + } + + return count; +} + +size_t ucs_string_common_prefix_len(const char *str1, const char *str2) +{ + const char *p1 = str1; + const char *p2 = str2; + + /* as long as *p1==*p2, if *p1 is not '\0' then neither is *p2 */ + while ((*p1 != '\0') && (*p1 == *p2)) { + p1++; + p2++; + } + + return (p1 - str1); +} + ssize_t ucs_path_calc_distance(const char *path1, const char *path2) { unsigned distance = 0; - int same = 1; - char resolved_path1[PATH_MAX], resolved_path2[PATH_MAX]; - size_t comp_len, i; - size_t rp_len1, rp_len2; + size_t common_length; + char rpath1[PATH_MAX], rpath2[PATH_MAX]; - if ((NULL == realpath(path1, resolved_path1)) || - (NULL == realpath(path2, resolved_path2))) { + if ((NULL == realpath(path1, rpath1)) || + (NULL == realpath(path2, rpath2))) { return UCS_ERR_INVALID_PARAM; } - rp_len1 = strlen(resolved_path1); - rp_len2 = strlen(resolved_path2); - comp_len = ucs_min(rp_len1, rp_len2); + common_length = ucs_string_common_prefix_len(rpath1, rpath2); - for (i = 0; i < comp_len; i++) { - if (resolved_path1[i] != resolved_path2[i]) { - same = 0; - } - - if ((resolved_path1[i] == '/') && !same) { - distance++; - } + if (rpath1[common_length] != rpath2[common_length]) { + ++distance; /* count the differentiating path component */ } + distance += ucs_max(ucs_string_count_char(rpath1 + common_length, '/'), + ucs_string_count_char(rpath2 + common_length, '/')); + return distance; } + +const char* ucs_mask_str(uint64_t mask, ucs_string_buffer_t *strb) +{ + uint8_t bit; + + if (mask == 0) { + ucs_string_buffer_appendf(strb, ""); + goto out; + } + + ucs_for_each_bit(bit, mask) { + 
ucs_string_buffer_appendf(strb, "%u, ", bit);
+    }
+
+    ucs_string_buffer_rtrim(strb, ", ");
+
+out:
+    return ucs_string_buffer_cstr(strb);
+}
diff --git a/src/ucs/sys/string.h b/src/ucs/sys/string.h
index 877cb2ac8f0..b28367922fe 100644
--- a/src/ucs/sys/string.h
+++ b/src/ucs/sys/string.h
@@ -10,6 +10,7 @@
 #include "compiler_def.h"
 #include
 #include
+#include
 #include
 #include
@@ -58,6 +59,17 @@ void ucs_expand_path(const char *path, char *fullpath, size_t max);
 void ucs_fill_filename_template(const char *tmpl, char *buf, size_t max);
 
 
+/**
+ * Strip the specified number of trailing components from a file/dir path
+ *
+ * @param path       Pointer to the file path to be stripped
+ * @param num_layers The number of components to be stripped
+ *
+ * @return Pointer to the stripped dir path.
+ */
+char *ucs_dirname(char *path, int num_layers);
+
+
 /**
  * Format a string to a buffer of given size, and fill the rest of the buffer
  * with '\0'. Also, guarantee that the last char in the buffer is '\0'.
@@ -217,18 +229,53 @@ const char* ucs_flags_str(char *str, size_t max,
 
 
 /**
- * Get estimated number of segments different in the two paths. Segments are
- * separated by `/`.
+ * Find the number of occurrences of a char in the given string.
+ *
+ * @param str String buffer to search.
+ * @param c   Character to search in the string.
+ *
+ * @return a value between 0 and strlen(str).
+ */
+size_t ucs_string_count_char(const char *str, char c);
+
+
+/**
+ * Length of the common prefix of two given strings.
+ *
+ * @param str1 First string buffer.
+ * @param str2 Second string buffer.
+ *
+ * @return a value between 0 and min(strlen(str1), strlen(str2)).
+ */
+size_t ucs_string_common_prefix_len(const char *str1, const char *str2);
+
+
+/**
+ * Get the number of segments that are dissimilar in the two paths. Segments
+ * are separated by `/`. When the number of segments is unequal for the given
+ * paths, the number of segments different in the larger of the paths is
+ * returned. E.g. for /a/b/c/d and /a/x/y, 3 is returned; for /a/b/c/d and
+ * /a/b/c/e, 1 is returned; for /a/b/c and /a/b/c, 0 is returned
 *
 * @param path1 String pointing to first path
 * @param path2 String pointing to second path
 *
- * @return if either of the paths are invalid, UINT_MAX; if paths are the same 0
- *         is returned; otherwise in between
+ * @return if either of the paths is invalid, UINT_MAX is returned.
 */
 ssize_t ucs_path_calc_distance(const char *path1, const char *path2);
 
 
+/**
+ * Convert a bitmask to a string buffer that represents it.
+ *
+ * @param mask Bitmask.
+ * @param strb String buffer.
+ *
+ * @return C-style string representing a bitmask filled in a string buffer.
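A usage sketch of the formatter declared just below, paired with the fixed string buffer wrapper that topo.c further down uses in the same way; for bits 0, 2 and 4 the output would be "0, 2, 4":

```c
char buf[64];
UCS_STRING_BUFFER_FIXED(strb, buf, sizeof(buf));

/* Appends one decimal index per set bit, then trims the trailing ", " */
ucs_debug("mask bits: %s",
          ucs_mask_str(UCS_BIT(0) | UCS_BIT(2) | UCS_BIT(4), &strb));
```
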
+ */ +const char* ucs_mask_str(uint64_t mask, ucs_string_buffer_t *strb); + + /** Quantifier suffixes for memory units ("K", "M", "G", etc) */ extern const char *ucs_memunits_suffixes[]; diff --git a/src/ucs/sys/sys.c b/src/ucs/sys/sys.c index 1b36db71ae0..b2076eda54f 100644 --- a/src/ucs/sys/sys.c +++ b/src/ucs/sys/sys.c @@ -836,6 +836,8 @@ ucs_status_t ucs_sysv_alloc(size_t *size, size_t max_size, void **address_p, ssize_t huge_page_size; #endif size_t alloc_size; + void *shmat_address; + int shmat_flags; int sys_errno; void *ptr; int ret; @@ -888,14 +890,18 @@ ucs_status_t ucs_sysv_alloc(size_t *size, size_t max_size, void **address_p, /* Attach segment */ if (*address_p) { #ifdef SHM_REMAP - ptr = shmat(*shmid, *address_p, SHM_REMAP); + shmat_address = *address_p; + shmat_flags = SHM_REMAP; #else return UCS_ERR_INVALID_PARAM; #endif } else { - ptr = shmat(*shmid, NULL, 0); + shmat_address = NULL; + shmat_flags = 0; } + ptr = shmat(*shmid, shmat_address, shmat_flags); + /* Remove segment, the attachment keeps a reference to the mapping */ /* FIXME having additional attaches to a removed segment is not portable * behavior */ @@ -911,7 +917,9 @@ ucs_status_t ucs_sysv_alloc(size_t *size, size_t max_size, void **address_p, } else if (RUNNING_ON_VALGRIND && (errno == EINVAL)) { return UCS_ERR_NO_MEMORY; } else { - ucs_error("shmat(shmid=%d) returned unexpected error: %m", *shmid); + ucs_error("shmat(shmid=%d, address=%p, flags=0x%x) returned " + "unexpected error: %m", + *shmid, shmat_address, shmat_flags); return UCS_ERR_SHMEM_SEGMENT; } } @@ -1404,6 +1412,20 @@ ucs_status_t ucs_sys_get_boot_id(uint64_t *high, uint64_t *low) return status; } +uint64_t ucs_iface_get_system_id() +{ + uint64_t high; + uint64_t low; + ucs_status_t status; + + status = ucs_sys_get_boot_id(&high, &low); + if (status == UCS_OK) { + return high ^ low; + } + + return ucs_machine_guid(); +} + ucs_status_t ucs_sys_readdir(const char *path, ucs_sys_readdir_cb_t cb, void *ctx) { ucs_status_t res = UCS_OK; diff --git a/src/ucs/sys/sys.h b/src/ucs/sys/sys.h index af6362c2003..37eaae1193d 100644 --- a/src/ucs/sys/sys.h +++ b/src/ucs/sys/sys.h @@ -562,6 +562,14 @@ int ucs_sys_ns_is_default(ucs_sys_namespace_type_t name); ucs_status_t ucs_sys_get_boot_id(uint64_t *high, uint64_t *low); +/** + * Read boot ID value or use machine_guid. + * + * @return 64-bit value representing system ID. 
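Returning to the string helpers above: the rewritten ucs_path_calc_distance() reduces to a common-prefix length plus a count of the remaining '/' separators. Tracing its doc example with the new building blocks (assuming already-resolved inputs, since the function itself runs realpath() first):

```c
/* "/a/b/c/d" vs "/a/x/y": the common prefix is "/a/" */
size_t prefix = ucs_string_common_prefix_len("/a/b/c/d", "/a/x/y"); /* 3 */

/* Remaining "b/c/d" has 2 slashes (more than "x/y" with 1), plus 1 for the
 * differentiating component itself -> total distance 3 */
size_t slashes = ucs_string_count_char("/a/b/c/d" + prefix, '/');   /* 2 */
```
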
+ */ +uint64_t ucs_iface_get_system_id(); + + /** * Read directory * diff --git a/src/ucs/sys/topo.c b/src/ucs/sys/topo.c index cf9852b6e92..123830606f2 100644 --- a/src/ucs/sys/topo.c +++ b/src/ucs/sys/topo.c @@ -38,6 +38,10 @@ typedef struct ucs_topo_global_ctx { } ucs_topo_global_ctx_t; +const ucs_sys_dev_distance_t ucs_topo_default_distance = { + .latency = 0, + .bandwidth = DBL_MAX +}; static ucs_topo_global_ctx_t ucs_topo_ctx; static ucs_bus_id_bit_rep_t ucs_topo_get_bus_id_bit_repr(const ucs_sys_bus_id_t *bus_id) @@ -61,6 +65,11 @@ void ucs_topo_cleanup() ucs_spinlock_destroy(&ucs_topo_ctx.lock); } +unsigned ucs_topo_num_devices() +{ + return ucs_topo_ctx.sys_dev_to_bus_lookup.count; +} + ucs_status_t ucs_topo_find_device_by_bus_id(const ucs_sys_bus_id_t *bus_id, ucs_sys_device_t *sys_dev) { @@ -114,22 +123,23 @@ ucs_status_t ucs_topo_get_distance(ucs_sys_device_t device1, /* If one of the devices is unknown, we assume near topology */ if ((device1 == UCS_SYS_DEVICE_ID_UNKNOWN) || (device2 == UCS_SYS_DEVICE_ID_UNKNOWN) || (device1 == device2)) { - path_distance = 0; - } else { - if ((device1 >= ucs_topo_ctx.sys_dev_to_bus_lookup.count) || - (device2 >= ucs_topo_ctx.sys_dev_to_bus_lookup.count)) { - return UCS_ERR_INVALID_PARAM; - } - - ucs_topo_get_bus_path(&ucs_topo_ctx.sys_dev_to_bus_lookup.bus_arr[device1], - path1, sizeof(path1)); - ucs_topo_get_bus_path(&ucs_topo_ctx.sys_dev_to_bus_lookup.bus_arr[device2], - path2, sizeof(path2)); - - path_distance = ucs_path_calc_distance(path1, path2); - if (path_distance < 0) { - return (ucs_status_t)path_distance; - } + goto default_distance; + } + + if ((device1 >= ucs_topo_num_devices()) || + (device2 >= ucs_topo_num_devices())) { + return UCS_ERR_INVALID_PARAM; + } + + ucs_topo_get_bus_path(&ucs_topo_ctx.sys_dev_to_bus_lookup.bus_arr[device1], + path1, sizeof(path1)); + ucs_topo_get_bus_path(&ucs_topo_ctx.sys_dev_to_bus_lookup.bus_arr[device2], + path2, sizeof(path2)); + + path_distance = ucs_path_calc_distance(path1, path2); + if (path_distance <= 0) { + /* Assume default distance for devices that cannot be found in sysfs */ + goto default_distance; } /* Rough approximation of bandwidth/latency as function of PCI distance in @@ -137,20 +147,30 @@ ucs_status_t ucs_topo_get_distance(ucs_sys_device_t device1, * TODO implement more accurate estimation, based on system type, PCIe * switch, etc. 
*/ - if (path_distance <= 2) { - distance->latency = 0; - distance->bandwidth = DBL_MAX; - } else if (path_distance <= 4) { - distance->latency = 300e-9; - distance->bandwidth = 2000 * UCS_MBYTE; - } else { - distance->latency = 900e-9; - distance->bandwidth = 300 * UCS_MBYTE; - } + distance->latency = 100e-9 * path_distance; + distance->bandwidth = (20000 / path_distance) * UCS_MBYTE; + return UCS_OK; +default_distance: + *distance = ucs_topo_default_distance; return UCS_OK; } +const char *ucs_topo_distance_str(const ucs_sys_dev_distance_t *distance, + char *buffer, size_t max) +{ + UCS_STRING_BUFFER_FIXED(strb, buffer, max); + + if (distance->bandwidth < 1e20) { + /* Print bandwidth only if limited */ + ucs_string_buffer_appendf(&strb, "%.2fMBs/", + distance->bandwidth / UCS_MBYTE); + } + + ucs_string_buffer_appendf(&strb, "%.0fns", distance->latency * 1e9); + return ucs_string_buffer_cstr(&strb); +} + const char * ucs_topo_sys_device_bdf_name(ucs_sys_device_t sys_dev, char *buffer, size_t max) { @@ -160,8 +180,8 @@ ucs_topo_sys_device_bdf_name(ucs_sys_device_t sys_dev, char *buffer, size_t max) return ""; } - if (sys_dev >= ucs_topo_ctx.sys_dev_to_bus_lookup.count) { - return NULL; + if (sys_dev >= ucs_topo_num_devices()) { + return ""; } bus_id = &ucs_topo_ctx.sys_dev_to_bus_lookup.bus_arr[sys_dev]; @@ -170,6 +190,30 @@ ucs_topo_sys_device_bdf_name(ucs_sys_device_t sys_dev, char *buffer, size_t max) return buffer; } +ucs_status_t +ucs_topo_find_device_by_bdf_name(const char *name, ucs_sys_device_t *sys_dev) +{ + ucs_sys_bus_id_t bus_id; + int num_fields; + + /* Try to parse as "::." */ + num_fields = sscanf(name, "%hx:%hhx:%hhx.%hhx", &bus_id.domain, &bus_id.bus, + &bus_id.slot, &bus_id.function); + if (num_fields == 4) { + return ucs_topo_find_device_by_bus_id(&bus_id, sys_dev); + } + + /* Try to parse as ":.", assume domain is 0 */ + bus_id.domain = 0; + num_fields = sscanf(name, "%hhx:%hhx.%hhx", &bus_id.bus, &bus_id.slot, + &bus_id.function); + if (num_fields == 3) { + return ucs_topo_find_device_by_bus_id(&bus_id, sys_dev); + } + + return UCS_ERR_INVALID_PARAM; +} + void ucs_topo_print_info(FILE *stream) { } diff --git a/src/ucs/sys/topo.h b/src/ucs/sys/topo.h index 19aacc29cda..ea19da9d702 100644 --- a/src/ucs/sys/topo.h +++ b/src/ucs/sys/topo.h @@ -14,10 +14,13 @@ BEGIN_C_DECLS -#define UCS_SYS_DEVICE_ID_UNKNOWN UINT8_MAX /* Indicate that the ucs_sys_device_t - * for the device has no real bus_id - * e.g. virtual devices like CMA/knem - */ + +/* Upper limit on system device id */ +#define UCS_SYS_DEVICE_ID_MAX UINT8_MAX + +/* Indicate that the ucs_sys_device_t for the device has no real bus_id + * e.g. virtual devices like CMA/knem */ +#define UCS_SYS_DEVICE_ID_UNKNOWN UINT8_MAX typedef struct ucs_sys_bus_id { @@ -38,8 +41,8 @@ typedef uint8_t ucs_sys_device_t; /* - * Capture the estimated latency, bandwidth between two system devices - * referred by ucs_sys_device_t handle + * Captures the estimated latency and bandwidth between two system devices + * referred by ucs_sys_device_t handle. */ typedef struct ucs_sys_dev_distance { double latency; /**< in seconds */ @@ -47,13 +50,16 @@ typedef struct ucs_sys_dev_distance { } ucs_sys_dev_distance_t; +extern const ucs_sys_dev_distance_t ucs_topo_default_distance; + + /** - * Find system device by pci bus id + * Find system device by pci bus id. 
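The step function above is replaced by a smooth model: 100ns of latency per unit of path distance, with bandwidth inversely proportional to it. A quick illustrative evaluation of the new formula:

```c
/* Illustrative only: evaluate the new model for a few path distances */
ssize_t d;

for (d = 1; d <= 8; d *= 2) {
    double latency   = 100e-9 * d;                /* 1 -> 100ns, 8 -> 800ns */
    double bandwidth = (20000.0 / d) * UCS_MBYTE; /* 1 -> 20000MB/s, 8 -> 2500MB/s */
    printf("distance %zd: %.0fns %.0fMB/s\n", d, latency * 1e9,
           bandwidth / UCS_MBYTE);
}
```
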
*
- * @param [in]  bus_id  pointer to bus id of the device of interest
- * @param [out] sys_dev system device index associated with the bus_id
+ * @param [in]  bus_id  pointer to bus id of the device of interest.
+ * @param [out] sys_dev system device index associated with the bus_id.
 *
- * @return UCS_OK or error in case device cannot be found
+ * @return UCS_OK or error in case device cannot be found.
 */
ucs_status_t ucs_topo_find_device_by_bus_id(const ucs_sys_bus_id_t *bus_id,
                                            ucs_sys_device_t *sys_dev);
@@ -61,14 +67,14 @@ ucs_status_t ucs_topo_find_device_by_bus_id(const ucs_sys_bus_id_t *bus_id,
 
 /**
  * Find the distance between two system devices (in terms of latency,
- * bandwidth, hops, etc)
+ * bandwidth, hops, etc).
 *
- * @param [in]  device1  system device index of the first device
- * @param [in]  device2  system device index of the second device
- * @param [out] distance result populated with distance details between the two
- *                       devices
+ * @param [in]  device1  System device index of the first device.
+ * @param [in]  device2  System device index of the second device.
+ * @param [out] distance Result populated with distance details between the two
+ *                       devices.
 *
- * @return UCS_OK or error in case distance cannot be determined
+ * @return UCS_OK or error in case distance cannot be determined.
 */
ucs_status_t ucs_topo_get_distance(ucs_sys_device_t device1,
                                   ucs_sys_device_t device2,
@@ -76,20 +82,53 @@ ucs_status_t ucs_topo_get_distance(ucs_sys_device_t device1,
 
 /**
- * Return system device name in BFD format: ::.
+ * Convert the distance to a human-readable string.
+ *
+ * @param [in]  distance Distance between two devices.
+ * @param [out] buffer   String buffer to fill with distance string.
+ * @param [in]  max      Maximal size of the string buffer.
+ *
+ * @return Pointer to the distance string.
+ */
+const char *ucs_topo_distance_str(const ucs_sys_dev_distance_t *distance,
+                                  char *buffer, size_t max);
+
+
+/**
+ * Return system device name in BDF format: "::.".
 *
 * @param [in]  sys_dev System device id, as returned from
- *                      @ref ucs_topo_find_device_by_bus_id
- * @param [out] buffer  String buffer, filled the device name
- * @param [in]  max     Maximal size of @a buffer
+ *                      @ref ucs_topo_find_device_by_bus_id.
+ * @param [out] buffer  String buffer, filled with the device name.
+ * @param [in]  max     Maximal size of @a buffer.
 */
const char *
ucs_topo_sys_device_bdf_name(ucs_sys_device_t sys_dev, char *buffer, size_t max);
 
 
/**
- * Print a map indicating the topology information between system
- * devices discovered
+ * Find a system device by its BDF name: "[:]:.".
+ *
+ * @param [in]  name    BDF name to search for.
+ * @param [out] sys_dev Filled with system device id, if found.
+ *
+ * @return UCS_OK if the device was found, error otherwise.
+ */
+ucs_status_t
+ucs_topo_find_device_by_bdf_name(const char *name, ucs_sys_device_t *sys_dev);
+
+
+/**
+ * Get the number of registered system devices.
+ *
+ * @return Number of system devices.
+ */
+unsigned ucs_topo_num_devices();
+
+
+/**
+ * Print a map indicating the topology information between system devices
+ * discovered.
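A sketch of the BDF round trip using the declarations above; the device string is an arbitrary example and must correspond to a registered device for the lookup to succeed:

```c
ucs_sys_device_t sys_dev;
char bdf[32];

/* Both "0000:03:00.0" and the short "03:00.0" form are parsed */
if (ucs_topo_find_device_by_bdf_name("0000:03:00.0", &sys_dev) == UCS_OK) {
    ucs_debug("found device %u (%s)", sys_dev,
              ucs_topo_sys_device_bdf_name(sys_dev, bdf, sizeof(bdf)));
}
```
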
*/
void ucs_topo_print_info(FILE *stream);
diff --git a/src/ucs/time/time.h b/src/ucs/time/time.h
index f49155e9168..775c728ef72 100644
--- a/src/ucs/time/time.h
+++ b/src/ucs/time/time.h
@@ -9,7 +9,6 @@
 #include
 #include
-#include
 #include
 #include
@@ -30,6 +29,7 @@ typedef uint32_t ucs_short_time_t;
 
 #define UCS_TIME_INFINITY ULLONG_MAX
+#define UCS_TIME_AUTO     (UCS_TIME_INFINITY - 1)
 
 #define UCS_MSEC_PER_SEC 1000ull    /* Milli */
 #define UCS_USEC_PER_SEC 1000000ul  /* Micro */
diff --git a/src/ucs/type/float8.h b/src/ucs/type/float8.h
new file mode 100644
index 00000000000..171758c9493
--- /dev/null
+++ b/src/ucs/type/float8.h
@@ -0,0 +1,241 @@
+/**
+* Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED.
+*
+* See file LICENSE for terms.
+*/
+
+#ifndef UCS_TYPE_FLOAT_H
+#define UCS_TYPE_FLOAT_H
+
+#include
+
+#include
+#include
+#include
+
+BEGIN_C_DECLS
+
+
+typedef uint8_t ucs_fp8_t;
+
+
+/**
+ * Number of bits in the exponent part of a packed floating-point number
+ */
+#define _UCS_FP8_EXPONENT_BITS 4
+
+
+/**
+ * Number of bits in the mantissa part of a packed floating-point number
+ */
+#define _UCS_FP8_MANTISSA_BITS 4
+
+
+/**
+ * The ratio of the value obtained after packing and unpacking to
+ * the original number
+ */
+#define UCS_FP8_PRECISION \
+    ((double)UCS_MASK(_UCS_FP8_MANTISSA_BITS) / UCS_BIT(_UCS_FP8_MANTISSA_BITS))
+
+
+/**
+ * Number of bits in the exponent part of an IEEE754 double
+ */
+#define _UCS_FP8_IEEE_EXPONENT_BITS 11
+
+
+/**
+ * Number of bits in the significant mantissa part of an IEEE754 double
+ */
+#define _UCS_FP8_IEEE_MANTISSA_BITS 20
+
+
+/**
+ * Shift of the packed mantissa representation, relative to the IEEE representation
+ */
+#define _UCS_FP_MANTISSA_OFFSET \
+    (_UCS_FP8_IEEE_MANTISSA_BITS - _UCS_FP8_MANTISSA_BITS)
+
+
+/**
+ * A special value of exponent which represents NaN in an IEEE754 double
+ */
+#define _UCS_FP8_IEEE_NAN_EXPONENT UCS_MASK(_UCS_FP8_IEEE_EXPONENT_BITS)
+
+
+/**
+ * A special value of exponent which represents NaN in a packed floating-point number
+ */
+#define _UCS_FP8_NAN UCS_MASK(_UCS_FP8_EXPONENT_BITS)
+
+
+/**
+ * The offset of an IEEE754 exponent representation from a packed exponent representation
+ */
+#define _UCS_FP8_EXPONENT_OFFSET (IEEE754_DOUBLE_BIAS - 1)
+
+
+/**
+ * Internal macro to construct a floating-point type identifier from a name and a suffix
+ */
+#define _UCS_FP8_IDENTIFIER(_name, _suffix) \
+    UCS_PP_TOKENPASTE3(ucs_fp8_, _name, _suffix)
+
+
+/**
+ * Mask the exponent part of a packed floating-point number
+ */
+#define _UCS_FP8_EXPONENT_MASK (UCS_MASK(_UCS_FP8_EXPONENT_BITS))
+
+
+/**
+ * Pack a double-precision floating-point number in a given range to a single byte.
+ * The packing is lossy and the unpacked number is assumed to be
+ * non-negative.
+ *
+ * @param value Pack this number
+ * @param min   Min supported value (assumed to be a power of 2)
+ * @param max   Max supported value (assumed to be a power of 2)
+ *
+ * @return A single byte which represents the given number
+ */
+static UCS_F_ALWAYS_INLINE ucs_fp8_t ucs_fp8_pack(double value, uint64_t min,
+                                                  uint64_t max)
+{
+    union ieee754_double ieee_value = {0};
+    uint8_t exponent;
+    int8_t min_exponent, max_exponent;
+
+    ieee_value.d = value;
+    min_exponent = ucs_ilog2(min);
+    max_exponent = ucs_ilog2(max);
+
+    if (ucs_unlikely(ieee_value.ieee.exponent == _UCS_FP8_IEEE_NAN_EXPONENT)) {
+        /* NaN maps to a special value for NaN */
+        exponent = _UCS_FP8_NAN;
+    } else if (ucs_unlikely(ieee_value.ieee.exponent >
+                            (max_exponent + _UCS_FP8_EXPONENT_OFFSET))) {
+        /* A number beyond the max supported is capped */
+        exponent                  = max_exponent - min_exponent;
+        ieee_value.ieee.mantissa0 = 0;
+        ieee_value.ieee.mantissa1 = 0;
+    } else if (ucs_unlikely(ieee_value.ieee.exponent <
+                            min_exponent + _UCS_FP8_EXPONENT_OFFSET)) {
+        if (ucs_unlikely(value == 0)) {
+            /* 0 maps to a special value for 0 */
+            exponent = 0;
+        } else {
+            /* A number below the min supported is rounded up */
+            exponent                  = 1;
+            ieee_value.ieee.mantissa0 = 0;
+            ieee_value.ieee.mantissa1 = 0;
+        }
+    } else {
+        exponent = ieee_value.ieee.exponent - _UCS_FP8_EXPONENT_OFFSET -
+                   min_exponent;
+    }
+
+    return exponent | ((ieee_value.ieee.mantissa0 >> _UCS_FP_MANTISSA_OFFSET)
+                       << _UCS_FP8_EXPONENT_BITS);
+}
+
+
+/**
+ * Unpack a byte to a double-precision floating-point number in a given range.
+ *
+ * @param value Unpack this number
+ * @param min   Min supported value (assumed to be a power of 2)
+ * @param max   Max supported value (assumed to be a power of 2)
+ *
+ * @return A double-precision floating-point number which approximates the
+ *         original unpacked value
+ */
+static UCS_F_ALWAYS_INLINE double
+ucs_fp8_unpack(ucs_fp8_t value, uint64_t min, uint64_t max)
+{
+    union ieee754_double ieee_value = {0};
+    uint8_t exponent                = value & _UCS_FP8_EXPONENT_MASK;
+
+    ieee_value.ieee.negative = 0;
+    if (ucs_unlikely(exponent == 0)) {
+        ieee_value.ieee.exponent = 0;
+    } else if (ucs_unlikely(exponent == _UCS_FP8_NAN)) {
+        ieee_value.ieee.exponent = _UCS_FP8_IEEE_NAN_EXPONENT;
+    } else {
+        ieee_value.ieee.exponent = exponent + _UCS_FP8_EXPONENT_OFFSET +
+                                   ucs_ilog2(min);
+    }
+    ieee_value.ieee.mantissa0 = value >> _UCS_FP8_EXPONENT_BITS;
+    ieee_value.ieee.mantissa0 = ieee_value.ieee.mantissa0
+                                << _UCS_FP_MANTISSA_OFFSET;
+
+    return ieee_value.d;
+}
+
+
+/**
+ * Declare a packed floating-point type.
+ *
+ * The packed type uses a portable and platform-independent underlying
+ * representation (an 8-bit char), able to perform a (lossy) packing and
+ * unpacking from a double (8-byte) type.
+ *
+ * The packed type is defined by the required min and max values -
+ * the exponent is scaled accordingly, to accommodate the needed range.
+ *
+ * Special values (0 and NaN) are packed and unpacked in a lossless way.
+ *
+ * max/min < 2^14 must hold, as only 4 bits are used for exponent representation.
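A hedged sketch of declaring and using a packed type with the declaration macro whose parameters follow below; the type name and range are arbitrary (the 2^10 range ratio satisfies the assertion inside the macro):

```c
/* File scope: generates ucs_fp8_DEMO_pack()/ucs_fp8_DEMO_unpack() wrappers */
UCS_FP8_DECLARE_TYPE(DEMO, UCS_BIT(0), UCS_BIT(10))

static void demo_roundtrip(void)
{
    ucs_fp8_t packed   = UCS_FP8_PACK(DEMO, 100.0);
    double    restored = UCS_FP8_UNPACK(DEMO, packed);

    /* Lossy within UCS_FP8_PRECISION; 0 and NaN round-trip exactly */
    ucs_assert(restored <= 100.0);
}
```
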
+ * + * @param _name Packed type name + * @param _min Min supported number (assumed to be a power of 2) + * @param _max Max supported number (assumed to be a power of 2) + */ +#define UCS_FP8_DECLARE_TYPE(_name, _min, _max) \ + \ + static UCS_F_ALWAYS_INLINE ucs_fp8_t _UCS_FP8_IDENTIFIER(_name, _pack)( \ + double value) \ + { \ + /* 2 is subtracted because of special values for 0 and NaN */ \ + ucs_assert(ucs_ilog2((_max) / (_min)) < \ + UCS_BIT(_UCS_FP8_EXPONENT_BITS) - 2); \ + return ucs_fp8_pack(value, _min, _max); \ + } \ + \ + static UCS_F_ALWAYS_INLINE double _UCS_FP8_IDENTIFIER(_name, _unpack)( \ + ucs_fp8_t value) \ + { \ + return ucs_fp8_unpack(value, _min, _max); \ + } + + +/** + * Pack a double-precision floating-point number of a given type to a single byte. + * The packing is lossy and the unpacked number is assumed to be + * non-negative. + * + * @param _name Packed type name + * @param _value Pack this number + * + * @return A single byte which represents the given number + */ +#define UCS_FP8_PACK(_name, _value) _UCS_FP8_IDENTIFIER(_name, _pack)(_value) + + +/** + * Unpack a byte to a double-precision floating-point number of a given type. + * + * @param _name Packed type name + * @param _value Unpack this number + * + * @return A double-precision floating-point number which approximates the + * original unpacked value + */ +#define UCS_FP8_UNPACK(_name, _value) \ + _UCS_FP8_IDENTIFIER(_name, _unpack)(_value) + + +END_C_DECLS + +#endif diff --git a/src/ucs/type/init_once.c b/src/ucs/type/init_once.c deleted file mode 100644 index cfb05c9cb8c..00000000000 --- a/src/ucs/type/init_once.c +++ /dev/null @@ -1,20 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. - * - * See file LICENSE for terms. - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include -#include - - -unsigned ucs_init_once_mutex_unlock(pthread_mutex_t *lock) -{ - int ret = pthread_mutex_unlock(lock); - ucs_assert_always(ret == 0); - return 0; -} diff --git a/src/ucs/type/init_once.h b/src/ucs/type/init_once.h index 4b7e967ccbc..fc75104f19f 100644 --- a/src/ucs/type/init_once.h +++ b/src/ucs/type/init_once.h @@ -51,7 +51,7 @@ unsigned ucs_init_once_mutex_unlock(pthread_mutex_t *lock); */ #define UCS_INIT_ONCE(_once) \ for (pthread_mutex_lock(&(_once)->lock); \ - !(_once)->initialized || ucs_init_once_mutex_unlock(&(_once)->lock); \ + !(_once)->initialized || pthread_mutex_unlock(&(_once)->lock); \ (_once)->initialized = 1) #endif diff --git a/src/ucs/type/param.h b/src/ucs/type/param.h new file mode 100644 index 00000000000..b2d6a26f5a2 --- /dev/null +++ b/src/ucs/type/param.h @@ -0,0 +1,36 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifndef UCS_PARAM_H_ +#define UCS_PARAM_H_ + +#include +#include + +BEGIN_C_DECLS + + +/** + * Conditionally return a param value, if a flag in the field mask is set. + * Otherwise, return a default value. + * + * @param _prefix Prefix of each value in the field mask enum + * @param _params Pointer to params struct + * @param _name Return this member of the params struct + * @param _flag Check for flag with this name + * @param _default Return this value if the flag in the field mask is not set + * + * @return Param value (if the field mask flag is set) or the default value + */ +#define UCS_PARAM_VALUE(_prefix, _params, _name, _flag, _default) \ + (((_params)->field_mask & UCS_PP_TOKENPASTE3(_prefix, _, _flag)) ? 
\ + (_params)->_name : \ + (_default)) + + +END_C_DECLS + +#endif diff --git a/src/ucs/type/serialize.h b/src/ucs/type/serialize.h new file mode 100644 index 00000000000..44b66dea091 --- /dev/null +++ b/src/ucs/type/serialize.h @@ -0,0 +1,47 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2021. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#ifndef UCS_SERIALIZE_H +#define UCS_SERIALIZE_H + +#include + + +/* + * Helper macro for serializing/deserializing custom data. + * Advance '_iter' by '_offset', and return a typed pointer of '_iter' before + * it was advanced. + * + * @param _iter Pointer to a pointer, representing the current position. + * @param _type Type of pointer to return (for example, 'const void' or + * 'uint8_t'). Passing the type explicitly helps to avoid + * casting a const pointer to (non-const) void*. + * @param _offset Offset to advance the pointer _iter. + * + * @return '_iter' before it was advanced by '_offset', cast to '_type *'. + */ +#define ucs_serialize_next_raw(_iter, _type, _offset) \ + ({ \ + _type *_result = (_type*)(*(_iter)); \ + *(_iter) = UCS_PTR_BYTE_OFFSET(*(_iter), _offset); \ + _result; \ + }) + + +/* + * Helper macro for serializing/deserializing custom data. + * Advance '_iter' to the next element, and return a typed pointer to the + * current element. + * + * @param _iter Pointer to a pointer, representing the current element. + * @param _type Type of the current element. + * + * @return Typed pointer to the current element. + */ +#define ucs_serialize_next(_iter, _type) \ + ucs_serialize_next_raw(_iter, _type, sizeof(_type)) + +#endif diff --git a/src/ucs/type/spinlock.h b/src/ucs/type/spinlock.h index 81937e31385..d79eb5ccba6 100644 --- a/src/ucs/type/spinlock.h +++ b/src/ucs/type/spinlock.h @@ -9,6 +9,7 @@ #define UCS_SPINLOCK_H #include +#include #include #include @@ -38,8 +39,6 @@ typedef struct ucs_recursive_spinlock { pthread_t owner; } ucs_recursive_spinlock_t; -#define UCS_SPINLOCK_OWNER_NULL ((pthread_t)-1) - static ucs_status_t ucs_spinlock_init(ucs_spinlock_t *lock, int flags) { @@ -63,7 +62,7 @@ static inline ucs_status_t ucs_recursive_spinlock_init(ucs_recursive_spinlock_t* lock, int flags) { lock->count = 0; - lock->owner = UCS_SPINLOCK_OWNER_NULL; + lock->owner = UCS_ASYNC_PTHREAD_ID_NULL; return ucs_spinlock_init(&lock->super, flags); } @@ -73,7 +72,8 @@ void ucs_spinlock_destroy(ucs_spinlock_t *lock); void ucs_recursive_spinlock_destroy(ucs_recursive_spinlock_t *lock); static inline int -ucs_recursive_spin_is_owner(ucs_recursive_spinlock_t *lock, pthread_t self) +ucs_recursive_spin_is_owner(const ucs_recursive_spinlock_t *lock, + pthread_t self) { return lock->owner == self; } @@ -133,7 +133,7 @@ static inline void ucs_recursive_spin_unlock(ucs_recursive_spinlock_t *lock) { --lock->count; if (lock->count == 0) { - lock->owner = UCS_SPINLOCK_OWNER_NULL; + lock->owner = UCS_ASYNC_PTHREAD_ID_NULL; ucs_spin_unlock(&lock->super); } } diff --git a/src/ucs/type/thread_mode.c b/src/ucs/type/thread_mode.c new file mode 100644 index 00000000000..ddb8ab0d9c4 --- /dev/null +++ b/src/ucs/type/thread_mode.c @@ -0,0 +1,18 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "thread_mode.h" + + +const char *ucs_thread_mode_names[] = { + [UCS_THREAD_MODE_SINGLE] = "single", + [UCS_THREAD_MODE_SERIALIZED] = "serialized", + [UCS_THREAD_MODE_MULTI] = "multi" +}; diff --git a/src/ucs/type/thread_mode.h b/src/ucs/type/thread_mode.h index ba6b527ae9b..471cd690dcc 100644 --- a/src/ucs/type/thread_mode.h +++ b/src/ucs/type/thread_mode.h @@ -24,4 +24,7 @@ typedef enum { } ucs_thread_mode_t; +extern const char *ucs_thread_mode_names[]; + + #endif diff --git a/src/ucs/vfs/base/vfs_obj.c b/src/ucs/vfs/base/vfs_obj.c new file mode 100644 index 00000000000..83f29695c44 --- /dev/null +++ b/src/ucs/vfs/base/vfs_obj.c @@ -0,0 +1,561 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "vfs_obj.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +typedef enum { + UCS_VFS_NODE_TYPE_DIR, + UCS_VFS_NODE_TYPE_RO_FILE, + UCS_VFS_NODE_TYPE_SUBDIR, + UCS_VFS_NODE_TYPE_LAST +} ucs_vfs_node_type_t; + + +#define UCS_VFS_FLAGS_DIRTY UCS_BIT(0) + + +typedef struct ucs_vfs_node ucs_vfs_node_t; +struct ucs_vfs_node { + ucs_vfs_node_type_t type; + int refcount; + uint8_t flags; + void *obj; + ucs_vfs_node_t *parent; + ucs_list_link_t children; + ucs_vfs_file_show_cb_t text_cb; + ucs_vfs_refresh_cb_t refresh_cb; + ucs_list_link_t list; + void *arg_ptr; + uint64_t arg_u64; + char path[0]; +}; + +KHASH_MAP_INIT_STR(vfs_path, ucs_vfs_node_t*); +KHASH_MAP_INIT_INT64(vfs_obj, ucs_vfs_node_t*); + +struct { + ucs_spinlock_t lock; + ucs_vfs_node_t root; + khash_t(vfs_path) path_hash; + khash_t(vfs_obj) obj_hash; +} ucs_vfs_obj_context = {}; + +#define ucs_vfs_kh_put(_name, _h, _k, _node) \ + { \ + int khret; \ + khiter_t khiter = kh_put(_name, _h, _k, &khret); \ + ucs_assert((khret == UCS_KH_PUT_BUCKET_EMPTY) || \ + (khret == UCS_KH_PUT_BUCKET_CLEAR)); \ + kh_val(_h, khiter) = _node; \ + } + +#define ucs_vfs_kh_del_key(_name, _h, _k) \ + { \ + khiter_t khiter = kh_get(_name, _h, _k); \ + ucs_assert(khiter != kh_end(_h)); \ + kh_del(_name, _h, khiter); \ + } + +#define ucs_vfs_kh_find(_name, _h, _k, _node) \ + { \ + khiter_t khiter = kh_get(_name, _h, _k); \ + _node = (khiter != kh_end(_h)) ? 
kh_val(_h, khiter) : NULL; \ + } + + +void ucs_vfs_show_memory_address(void *obj, ucs_string_buffer_t *strb, + void *arg_ptr, uint64_t arg_u64) +{ + ucs_string_buffer_appendf(strb, "%p\n", obj); +} + +void ucs_vfs_show_primitive(void *obj, ucs_string_buffer_t *strb, void *arg_ptr, + uint64_t arg_u64) +{ + ucs_vfs_primitive_type_t type = arg_u64; + unsigned long ulvalue; + long lvalue; + + UCS_STATIC_ASSERT(UCS_VFS_TYPE_UNSIGNED >= UCS_VFS_TYPE_LAST); + UCS_STATIC_ASSERT(UCS_VFS_TYPE_HEX >= UCS_VFS_TYPE_LAST); + + if (type == UCS_VFS_TYPE_POINTER) { + ucs_string_buffer_appendf(strb, "%p\n", *(void**)arg_ptr); + } else if (type == UCS_VFS_TYPE_STRING) { + ucs_string_buffer_appendf(strb, "%s\n", (char*)arg_ptr); + } else { + switch (type & ~(UCS_VFS_TYPE_UNSIGNED | UCS_VFS_TYPE_HEX)) { + case UCS_VFS_TYPE_CHAR: + lvalue = *(char*)arg_ptr; + ulvalue = *(unsigned char*)arg_ptr; + break; + case UCS_VFS_TYPE_SHORT: + lvalue = *(short*)arg_ptr; + ulvalue = *(unsigned short*)arg_ptr; + break; + case UCS_VFS_TYPE_INT: + lvalue = *(int*)arg_ptr; + ulvalue = *(unsigned int*)arg_ptr; + break; + case UCS_VFS_TYPE_LONG: + lvalue = *(long*)arg_ptr; + ulvalue = *(unsigned long*)arg_ptr; + break; + default: + return; + } + + if (type & UCS_VFS_TYPE_HEX) { + ucs_string_buffer_appendf(strb, "%lx\n", ulvalue); + } else if (type & UCS_VFS_TYPE_UNSIGNED) { + ucs_string_buffer_appendf(strb, "%lu\n", ulvalue); + } else { + ucs_string_buffer_appendf(strb, "%ld\n", lvalue); + } + } +} + +void ucs_vfs_show_ulunits(void *obj, ucs_string_buffer_t *strb, void *arg_ptr, + uint64_t arg_u64) +{ + char buf[64]; + + ucs_config_sprintf_ulunits(buf, sizeof(buf), arg_ptr, NULL); + ucs_string_buffer_appendf(strb, "%s\n", buf); +} + +void ucs_vfs_show_memunits(void *obj, ucs_string_buffer_t *strb, void *arg_ptr, + uint64_t arg_u64) +{ + char buf[64]; + + ucs_memunits_to_str(*(size_t*)arg_ptr, buf, sizeof(buf)); + ucs_string_buffer_appendf(strb, "%s\n", buf); +} + +/* must be called with lock held */ +static ucs_vfs_node_t *ucs_vfs_node_find_by_path(const char *path) +{ + ucs_vfs_node_t *node; + + ucs_vfs_kh_find(vfs_path, &ucs_vfs_obj_context.path_hash, path, node); + ucs_assert((node == NULL) || !strcmp(node->path, path)); + + return node; +} + +/* must be called with lock held */ +static ucs_vfs_node_t *ucs_vfs_node_find_by_obj(void *obj) +{ + ucs_vfs_node_t *node; + + ucs_vfs_kh_find(vfs_obj, &ucs_vfs_obj_context.obj_hash, (uintptr_t)obj, + node); + ucs_assert((node == NULL) || (node->obj == obj)); + + return node; +} + +/* must be called with lock held */ +static void ucs_vfs_node_init(ucs_vfs_node_t *node, ucs_vfs_node_type_t type, + void *obj, ucs_vfs_node_t *parent_node) +{ + node->type = type; + node->refcount = 1; + node->flags = 0; + node->obj = obj; + node->parent = parent_node; + node->text_cb = NULL; + node->refresh_cb = NULL; + node->arg_ptr = NULL; + node->arg_u64 = 0; + ucs_list_head_init(&node->children); +} + +/* must be called with lock held */ +static ucs_vfs_node_t *ucs_vfs_node_create(ucs_vfs_node_t *parent_node, + const char *name, + ucs_vfs_node_type_t type, void *obj) +{ + char path_buf[PATH_MAX]; + ucs_vfs_node_t *node; + + if (parent_node == &ucs_vfs_obj_context.root) { + ucs_snprintf_safe(path_buf, sizeof(path_buf), "/%s", name); + } else { + ucs_snprintf_safe(path_buf, sizeof(path_buf), "%s/%s", + parent_node->path, name); + } + + node = ucs_vfs_node_find_by_path(path_buf); + if (node != NULL) { + return node; + } + + node = ucs_malloc(sizeof(*node) + strlen(path_buf) + 1, "vfs_node"); + if (node == 
NULL) { + ucs_error("Failed to allocate vfs_node"); + return NULL; + } + + /* initialize node */ + ucs_vfs_node_init(node, type, obj, parent_node); + strcpy(node->path, path_buf); + + /* add to parent */ + ucs_list_add_head(&parent_node->children, &node->list); + + /* add to obj hash */ + if (node->obj != NULL) { + ucs_vfs_kh_put(vfs_obj, &ucs_vfs_obj_context.obj_hash, + (uintptr_t)node->obj, node); + } + + /* add to path hash */ + ucs_vfs_kh_put(vfs_path, &ucs_vfs_obj_context.path_hash, node->path, node); + + return node; +} + +/* must be called with lock held */ +static ucs_vfs_node_t *ucs_vfs_node_add(void *parent_obj, + ucs_vfs_node_type_t type, void *obj, + const char *rel_path, va_list ap) +{ + ucs_vfs_node_t *parent_node; + char rel_path_buf[PATH_MAX]; + char *token, *next_token; + + if (parent_obj == NULL) { + parent_node = &ucs_vfs_obj_context.root; + } else { + parent_node = ucs_vfs_node_find_by_obj(parent_obj); + if (parent_node == NULL) { + return NULL; + } + } + + /* generate the relative path */ + vsnprintf(rel_path_buf, sizeof(rel_path_buf), rel_path, ap); + + /* Build parent nodes along the rel_path, without associated object */ + next_token = rel_path_buf; + token = strsep(&next_token, "/"); + while (next_token != NULL) { + parent_node = ucs_vfs_node_create(parent_node, token, + UCS_VFS_NODE_TYPE_SUBDIR, NULL); + token = strsep(&next_token, "/"); + } + + return ucs_vfs_node_create(parent_node, token, type, obj); +} + +/* must be called with lock held */ +static int ucs_vfs_check_node(ucs_vfs_node_t *node, ucs_vfs_node_type_t type) +{ + return (node != NULL) && (node->type == type); +} + +/* must be called with lock held */ +static void ucs_vfs_node_increase_refcount(ucs_vfs_node_t *node) +{ + ++node->refcount; +} + +/* must be called with lock held */ +static void ucs_vfs_node_decrease_refcount(ucs_vfs_node_t *node) +{ + ucs_vfs_node_t *parent_node = node->parent; + ucs_vfs_node_t *child_node, *tmp_node; + + if (--node->refcount > 0) { + return; + } + + /* If reference count is 0, then remove node. 
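+     * The removal below is recursive: children are released first (their
+     * parent pointer is cleared so they do not try to destroy this node),
+     * then the node is deleted from the object and path hash tables and from
+     * its parent's child list, and finally empty SUBDIR ancestors are pruned
+     * bottom-up.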
*/ + + /* recursively remove children */ + ucs_list_for_each_safe(child_node, tmp_node, &node->children, list) { + child_node->parent = NULL; /* prevent children from destroying me */ + ucs_vfs_node_decrease_refcount(child_node); + } + + /* remove from object hash */ + if (node->obj != NULL) { + ucs_vfs_kh_del_key(vfs_obj, &ucs_vfs_obj_context.obj_hash, + (uintptr_t)node->obj); + } + + /* remove from path hash */ + ucs_vfs_kh_del_key(vfs_path, &ucs_vfs_obj_context.path_hash, node->path); + + /* remove from parent's list */ + ucs_list_del(&node->list); + + ucs_free(node); + + /* recursively remove all empty parent subdirs */ + if ((parent_node != NULL) && ucs_list_is_empty(&parent_node->children) && + (parent_node->type == UCS_VFS_NODE_TYPE_SUBDIR)) { + ucs_vfs_node_decrease_refcount(parent_node); + } +} + +/* must be called with lock held and incremented refcount */ +static void ucs_vfs_refresh_dir(ucs_vfs_node_t *node) +{ + ucs_assert(ucs_vfs_check_node(node, UCS_VFS_NODE_TYPE_DIR) || + ucs_vfs_check_node(node, UCS_VFS_NODE_TYPE_SUBDIR)); + + if (!(node->flags & UCS_VFS_FLAGS_DIRTY)) { + return; + } + + ucs_assert(node->refcount >= 2); + + ucs_spin_unlock(&ucs_vfs_obj_context.lock); + + node->refresh_cb(node->obj); + + ucs_spin_lock(&ucs_vfs_obj_context.lock); + + node->flags &= ~UCS_VFS_FLAGS_DIRTY; +} + +/* must be called with lock held */ +static void +ucs_vfs_read_ro_file(ucs_vfs_node_t *node, ucs_string_buffer_t *strb) +{ + ucs_vfs_node_t *parent_node = node->parent; + + ucs_assert(ucs_vfs_check_node(node, UCS_VFS_NODE_TYPE_RO_FILE) == 1); + + while (ucs_vfs_check_node(parent_node, UCS_VFS_NODE_TYPE_SUBDIR)) { + parent_node = parent_node->parent; + } + + ucs_spin_unlock(&ucs_vfs_obj_context.lock); + + node->text_cb(parent_node->obj, strb, node->arg_ptr, node->arg_u64); + + ucs_spin_lock(&ucs_vfs_obj_context.lock); +} + +/* must be called with lock held */ +static void ucs_vfs_path_list_dir_cb(ucs_vfs_node_t *node, + ucs_vfs_list_dir_cb_t dir_cb, void *arg) +{ + ucs_vfs_node_t *child_node; + + ucs_list_for_each(child_node, &node->children, list) { + dir_cb(ucs_basename(child_node->path), arg); + } +} + +void ucs_vfs_obj_add_dir(void *parent_obj, void *obj, const char *rel_path, ...) +{ + va_list ap; + + ucs_spin_lock(&ucs_vfs_obj_context.lock); + + va_start(ap, rel_path); + ucs_vfs_node_add(parent_obj, UCS_VFS_NODE_TYPE_DIR, obj, rel_path, ap); + va_end(ap); + + ucs_spin_unlock(&ucs_vfs_obj_context.lock); +} + +void ucs_vfs_obj_add_ro_file(void *obj, ucs_vfs_file_show_cb_t text_cb, + void *arg_ptr, uint64_t arg_u64, + const char *rel_path, ...) 
+{ + ucs_vfs_node_t *node; + va_list ap; + + ucs_spin_lock(&ucs_vfs_obj_context.lock); + + va_start(ap, rel_path); + node = ucs_vfs_node_add(obj, UCS_VFS_NODE_TYPE_RO_FILE, NULL, rel_path, ap); + va_end(ap); + + if (node != NULL) { + node->text_cb = text_cb; + node->arg_ptr = arg_ptr; + node->arg_u64 = arg_u64; + } + + ucs_spin_unlock(&ucs_vfs_obj_context.lock); +} + +void ucs_vfs_obj_remove(void *obj) +{ + ucs_vfs_node_t *node; + + ucs_spin_lock(&ucs_vfs_obj_context.lock); + + node = ucs_vfs_node_find_by_obj(obj); + if (node != NULL) { + ucs_vfs_node_decrease_refcount(node); + } + + ucs_spin_unlock(&ucs_vfs_obj_context.lock); +} + +void ucs_vfs_obj_set_dirty(void *obj, ucs_vfs_refresh_cb_t refresh_cb) +{ + ucs_vfs_node_t *node; + + ucs_spin_lock(&ucs_vfs_obj_context.lock); + + node = ucs_vfs_node_find_by_obj(obj); + if (node != NULL) { + node->flags |= UCS_VFS_FLAGS_DIRTY; + node->refresh_cb = refresh_cb; + } + + ucs_spin_unlock(&ucs_vfs_obj_context.lock); +} + +ucs_status_t ucs_vfs_path_get_info(const char *path, ucs_vfs_path_info_t *info) +{ + ucs_string_buffer_t strb; + ucs_vfs_node_t *node; + ucs_status_t status; + + ucs_spin_lock(&ucs_vfs_obj_context.lock); + + node = ucs_vfs_node_find_by_path(path); + if (node == NULL) { + status = UCS_ERR_NO_ELEM; + goto out_unlock; + } + + ucs_vfs_node_increase_refcount(node); + + switch (node->type) { + case UCS_VFS_NODE_TYPE_RO_FILE: + ucs_string_buffer_init(&strb); + ucs_vfs_read_ro_file(node, &strb); + info->mode = S_IFREG | S_IRUSR; + info->size = ucs_string_buffer_length(&strb); + ucs_string_buffer_cleanup(&strb); + status = UCS_OK; + break; + case UCS_VFS_NODE_TYPE_DIR: + case UCS_VFS_NODE_TYPE_SUBDIR: + ucs_vfs_refresh_dir(node); + info->mode = S_IFDIR | S_IRUSR | S_IXUSR; + info->size = ucs_list_length(&node->children); + status = UCS_OK; + break; + default: + status = UCS_ERR_NO_ELEM; + break; + } + + ucs_vfs_node_decrease_refcount(node); + +out_unlock: + ucs_spin_unlock(&ucs_vfs_obj_context.lock); + + return status; +} + +ucs_status_t ucs_vfs_path_read_file(const char *path, ucs_string_buffer_t *strb) +{ + ucs_vfs_node_t *node; + ucs_status_t status; + + ucs_spin_lock(&ucs_vfs_obj_context.lock); + + node = ucs_vfs_node_find_by_path(path); + if (!ucs_vfs_check_node(node, UCS_VFS_NODE_TYPE_RO_FILE)) { + status = UCS_ERR_NO_ELEM; + goto out_unlock; + } + + ucs_vfs_node_increase_refcount(node); + + ucs_vfs_read_ro_file(node, strb); + status = UCS_OK; + + ucs_vfs_node_decrease_refcount(node); + +out_unlock: + ucs_spin_unlock(&ucs_vfs_obj_context.lock); + + return status; +} + +ucs_status_t +ucs_vfs_path_list_dir(const char *path, ucs_vfs_list_dir_cb_t dir_cb, void *arg) +{ + ucs_vfs_node_t *node; + ucs_status_t status; + + ucs_spin_lock(&ucs_vfs_obj_context.lock); + + if (!strcmp(path, "/")) { + ucs_vfs_path_list_dir_cb(&ucs_vfs_obj_context.root, dir_cb, arg); + status = UCS_OK; + goto out_unlock; + } + + node = ucs_vfs_node_find_by_path(path); + + if (!ucs_vfs_check_node(node, UCS_VFS_NODE_TYPE_DIR) && + !ucs_vfs_check_node(node, UCS_VFS_NODE_TYPE_SUBDIR)) { + status = UCS_ERR_NO_ELEM; + goto out_unlock; + } + + ucs_vfs_node_increase_refcount(node); + + ucs_vfs_refresh_dir(node); + ucs_vfs_path_list_dir_cb(node, dir_cb, arg); + status = UCS_OK; + + ucs_vfs_node_decrease_refcount(node); + +out_unlock: + ucs_spin_unlock(&ucs_vfs_obj_context.lock); + + return status; +} + +UCS_STATIC_INIT +{ + ucs_spinlock_init(&ucs_vfs_obj_context.lock, 0); + ucs_spin_lock(&ucs_vfs_obj_context.lock); + ucs_vfs_node_init(&ucs_vfs_obj_context.root, 
UCS_VFS_NODE_TYPE_DIR, NULL,
+                      NULL);
+    ucs_spin_unlock(&ucs_vfs_obj_context.lock);
+    kh_init_inplace(vfs_obj, &ucs_vfs_obj_context.obj_hash);
+    kh_init_inplace(vfs_path, &ucs_vfs_obj_context.path_hash);
+}
+
+UCS_STATIC_CLEANUP
+{
+    kh_destroy_inplace(vfs_path, &ucs_vfs_obj_context.path_hash);
+    kh_destroy_inplace(vfs_obj, &ucs_vfs_obj_context.obj_hash);
+    ucs_spinlock_destroy(&ucs_vfs_obj_context.lock);
+}
diff --git a/src/ucs/vfs/base/vfs_obj.h b/src/ucs/vfs/base/vfs_obj.h
new file mode 100644
index 00000000000..934015666bc
--- /dev/null
+++ b/src/ucs/vfs/base/vfs_obj.h
@@ -0,0 +1,250 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef UCS_VFS_H_
+#define UCS_VFS_H_
+
+#include
+#include
+#include
+
+BEGIN_C_DECLS
+
+/* This header file defines API for manipulating VFS object tree structure */
+
+
+/* Defines type of primitive variables */
+typedef enum {
+    /* Basic type definitions */
+    UCS_VFS_TYPE_POINTER,
+    UCS_VFS_TYPE_STRING,
+    UCS_VFS_TYPE_CHAR,
+    UCS_VFS_TYPE_SHORT,
+    UCS_VFS_TYPE_INT,
+    UCS_VFS_TYPE_LONG,
+    UCS_VFS_TYPE_LAST,
+
+    /* Type modifiers */
+    UCS_VFS_TYPE_UNSIGNED = UCS_BIT(14),
+    UCS_VFS_TYPE_HEX      = UCS_BIT(15),
+
+    /* Convenience flags */
+    UCS_VFS_TYPE_I8      = UCS_VFS_TYPE_CHAR,
+    UCS_VFS_TYPE_U8      = UCS_VFS_TYPE_UNSIGNED | UCS_VFS_TYPE_CHAR,
+    UCS_VFS_TYPE_I16     = UCS_VFS_TYPE_SHORT,
+    UCS_VFS_TYPE_U16     = UCS_VFS_TYPE_UNSIGNED | UCS_VFS_TYPE_SHORT,
+    UCS_VFS_TYPE_I32     = UCS_VFS_TYPE_INT,
+    UCS_VFS_TYPE_U32     = UCS_VFS_TYPE_UNSIGNED | UCS_VFS_TYPE_INT,
+    UCS_VFS_TYPE_U32_HEX = UCS_VFS_TYPE_U32 | UCS_VFS_TYPE_HEX,
+    UCS_VFS_TYPE_ULONG   = UCS_VFS_TYPE_UNSIGNED | UCS_VFS_TYPE_LONG,
+    UCS_VFS_TYPE_SSIZET  = UCS_VFS_TYPE_LONG,
+    UCS_VFS_TYPE_SIZET   = UCS_VFS_TYPE_ULONG
+} ucs_vfs_primitive_type_t;
+
+
+/**
+ * Structure to describe the vfs node.
+ */
+typedef struct {
+    /**
+     * Size of the content in case of a read-only file, and number of child
+     * directories if the node is a directory.
+     */
+    size_t size;
+    /**
+     * File mode can be either regular file (S_IFREG) or directory (S_IFDIR)
+     * depending on the type of the vfs node.
+     */
+    int    mode;
+} ucs_vfs_path_info_t;
+
+
+/**
+ * Function type to fill information about an object to the string buffer.
+ *
+ * @param [in]    obj     Pointer to the object.
+ * @param [inout] strb    String buffer filled with the object's information.
+ * @param [in]    arg_ptr Optional pointer argument passed to the function.
+ * @param [in]    arg_u64 Optional numeric argument passed to the function.
+ */
+typedef void (*ucs_vfs_file_show_cb_t)(void *obj, ucs_string_buffer_t *strb,
+                                       void *arg_ptr, uint64_t arg_u64);
+
+
+/**
+ * Callback function to fill the memory address of an object to the string
+ * buffer.
+ *
+ * @param [in]    obj     Pointer to the object.
+ * @param [inout] strb    String buffer filled with the object's information.
+ * @param [in]    arg_ptr Unused.
+ * @param [in]    arg_u64 Unused.
+ */
+void ucs_vfs_show_memory_address(void *obj, ucs_string_buffer_t *strb,
+                                 void *arg_ptr, uint64_t arg_u64);
+
+
+/**
+ * Callback function to show a variable of a primitive C type.
+ *
+ * @param [in]    obj     Pointer to the object.
+ * @param [inout] strb    String buffer filled with the object's information.
+ * @param [in]    arg_ptr Points to the variable to show.
+ * @param [in]    arg_u64 Specifies type flags for the variable, as defined in
+ *                        @ref ucs_vfs_primitive_type_t.
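+ *
+ * As an illustrative sketch (the object type and its 'count' member are
+ * hypothetical, not part of this API), a read-only file that shows a
+ * uint32_t member in hex could be registered with:
+ *
+ * @code{.c}
+ * ucs_vfs_obj_add_ro_file(obj, ucs_vfs_show_primitive, &obj->count,
+ *                         UCS_VFS_TYPE_U32_HEX, "count");
+ * @endcode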
+ */
+void ucs_vfs_show_primitive(void *obj, ucs_string_buffer_t *strb, void *arg_ptr,
+                            uint64_t arg_u64);
+
+
+/**
+ * Callback function to fill a value of an unsigned long type to the string
+ * buffer. The function handles 'auto' and 'infinity' values.
+ *
+ * @param [in]    obj     Pointer to the object.
+ * @param [inout] strb    String buffer filled with the object's information.
+ * @param [in]    arg_ptr Pointer to the value of an unsigned long type.
+ * @param [in]    arg_u64 Unused.
+ */
+void ucs_vfs_show_ulunits(void *obj, ucs_string_buffer_t *strb, void *arg_ptr,
+                          uint64_t arg_u64);
+
+
+/**
+ * Callback function to fill memory units to the string buffer. The function
+ * handles 'auto' and 'infinity' values.
+ *
+ * @param [in]    obj     Pointer to the object.
+ * @param [inout] strb    String buffer filled with the object's information.
+ * @param [in]    arg_ptr Pointer to the memory unit value.
+ * @param [in]    arg_u64 Unused.
+ */
+void ucs_vfs_show_memunits(void *obj, ucs_string_buffer_t *strb, void *arg_ptr,
+                           uint64_t arg_u64);
+
+
+/**
+ * Function to update representation of object in VFS.
+ *
+ * @param [in] obj Pointer to the object to be updated.
+ */
+typedef void (*ucs_vfs_refresh_cb_t)(void *obj);
+
+
+/**
+ * Function to process VFS nodes during reading of the parent directory.
+ *
+ * @param [in] name Path to directory.
+ * @param [in] arg  Pointer to the arguments.
+ */
+typedef void (*ucs_vfs_list_dir_cb_t)(const char *name, void *arg);
+
+
+/**
+ * Add directory representing object in VFS. If @a parent_obj is NULL, the mount
+ * directory will be used as the base for @a rel_path.
+ *
+ * @param [in] parent_obj Pointer to the parent object. @a rel_path is relative
+ *                        to @a parent_obj directory.
+ * @param [in] obj        Pointer to the object to be represented in VFS.
+ * @param [in] rel_path   Format string which specifies relative path to
+ *                        @a obj directory.
+ */
+void ucs_vfs_obj_add_dir(void *parent_obj, void *obj, const char *rel_path, ...)
+    UCS_F_PRINTF(3, 4);
+
+
+/**
+ * Add read-only file describing object features in VFS. If @a obj is NULL, the
+ * mount directory will be used as the base for @a rel_path.
+ *
+ * @param [in] obj      Pointer to the object. @a rel_path is relative to @a obj
+ *                      directory.
+ * @param [in] text_cb  Callback method that generates the content of the file.
+ * @param [in] arg_ptr  Optional pointer argument that is passed to the callback
+ *                      method.
+ * @param [in] arg_u64  Optional numeric argument that is passed to the callback
+ *                      method.
+ * @param [in] rel_path Format string which specifies relative path to the file.
+ */
+void ucs_vfs_obj_add_ro_file(void *obj, ucs_vfs_file_show_cb_t text_cb,
+                             void *arg_ptr, uint64_t arg_u64,
+                             const char *rel_path, ...) UCS_F_PRINTF(5, 6);
+
+
+/**
+ * Recursively remove directories and files associated with the object and its
+ * children from VFS. The method removes all empty parent sub-directories.
+ *
+ * @param [in] obj Pointer to the object to be deleted with its children from
+ *                 VFS.
+ */
+void ucs_vfs_obj_remove(void *obj);
+
+
+/**
+ * Invalidate VFS node and set method to update the node.
+ *
+ * @param [in] obj        Pointer to the object to be invalidated.
+ * @param [in] refresh_cb Method to update the node associated with the object.
+ */
+void ucs_vfs_obj_set_dirty(void *obj, ucs_vfs_refresh_cb_t refresh_cb);
+
+
+/**
+ * Fill information about VFS node corresponding to the specified path.
+ *
+ * @param [in]  path String which specifies path to find the node in VFS.
+ * @param [out] info VFS object information.
+ *
+ * @return UCS_OK          VFS node corresponding to specified path exists.
+ *         UCS_ERR_NO_ELEM Otherwise.
+ *
+ * @note The content of the file is defined by the ucs_vfs_file_show_cb_t
+ *       callback of the node. The method initiates a refresh of the node, as
+ *       defined by the ucs_vfs_refresh_cb_t callback of the node.
+ */
+ucs_status_t ucs_vfs_path_get_info(const char *path, ucs_vfs_path_info_t *info);
+
+
+/**
+ * Read the content of VFS node corresponding to the specified path. The
+ * content of the file is defined by the ucs_vfs_file_show_cb_t callback of
+ * the node.
+ *
+ * @param [in]    path String which specifies path to find the node in VFS.
+ * @param [inout] strb String buffer to be filled by the content of the
+ *                     file.
+ *
+ * @return UCS_OK          VFS node corresponding to specified path exists and
+ *                         the node is a file.
+ * @return UCS_ERR_NO_ELEM Otherwise.
+ */
+ucs_status_t
+ucs_vfs_path_read_file(const char *path, ucs_string_buffer_t *strb);
+
+
+/**
+ * Invoke callback @a dir_cb for children of VFS node corresponding to the
+ * specified path.
+ *
+ * @param [in] path   String which specifies path to find the node in VFS.
+ * @param [in] dir_cb Callback method to be invoked for each child of the
+ *                    VFS node.
+ * @param [in] arg    Arguments to be passed to the callback method.
+ *
+ * @return UCS_OK          VFS node corresponding to specified path exists and
+ *                         the node is a directory.
+ *         UCS_ERR_NO_ELEM Otherwise.
+ *
+ * @note The method initiates a refresh of the node, as defined by the
+ *       ucs_vfs_refresh_cb_t callback of the node.
+ */
+ucs_status_t ucs_vfs_path_list_dir(const char *path,
+                                   ucs_vfs_list_dir_cb_t dir_cb, void *arg);
+
+END_C_DECLS
+
+#endif
diff --git a/src/ucs/vfs/fuse/Makefile.am b/src/ucs/vfs/fuse/Makefile.am
new file mode 100644
index 00000000000..8ad3511277b
--- /dev/null
+++ b/src/ucs/vfs/fuse/Makefile.am
@@ -0,0 +1,19 @@
+#
+# Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED.
+#
+# See file LICENSE for terms.
+#
+
+if HAVE_FUSE3
+
+module_LTLIBRARIES      = libucs_fuse.la
+libucs_fuse_la_CPPFLAGS = $(BASE_CPPFLAGS) $(FUSE3_CPPFLAGS)
+libucs_fuse_la_CFLAGS   = $(BASE_CFLAGS) $(CUDA_CFLAGS)
+libucs_fuse_la_LIBADD   = $(FUSE3_LIBS) \
+                          $(top_builddir)/src/ucs/vfs/sock/libucs_vfs_sock.la \
+                          $(top_builddir)/src/ucs/libucs.la
+libucs_fuse_la_LDFLAGS  = -version-info $(SOVERSION)
+libucs_fuse_la_SOURCES  = vfs_fuse.c
+include $(top_srcdir)/config/module.am
+
+endif
diff --git a/src/ucs/vfs/fuse/configure.m4 b/src/ucs/vfs/fuse/configure.m4
new file mode 100644
index 00000000000..bf45ea11dcb
--- /dev/null
+++ b/src/ucs/vfs/fuse/configure.m4
@@ -0,0 +1,12 @@
+#
+# Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED.
+#
+
+
+AC_CHECK_DECLS([inotify_init, inotify_add_watch, IN_ATTRIB],
+               [AC_DEFINE([HAVE_INOTIFY], 1, [Enable inotify support])],
+               [],
+               [[#include ]])
+
+AS_IF([test "x$fuse3_happy" = "xyes"], [ucs_modules="${ucs_modules}:fuse"])
+AC_CONFIG_FILES([src/ucs/vfs/fuse/Makefile])
diff --git a/src/ucs/vfs/fuse/vfs_fuse.c b/src/ucs/vfs/fuse/vfs_fuse.c
new file mode 100644
index 00000000000..33468dd08ad
--- /dev/null
+++ b/src/ucs/vfs/fuse/vfs_fuse.c
@@ -0,0 +1,446 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_INOTIFY +#include +#endif + + +#define UCS_VFS_DUMMY_FILE_NAME "dummy" +#define UCS_VFS_DUMMY_FILE_DATA "UCX FUSE\n" + + +typedef struct { + void *buf; + fuse_fill_dir_t filler; +} ucs_vfs_enum_dir_context_t; + + +static struct { + pthread_t thread_id; + pthread_mutex_t mutex; + struct fuse *fuse; + int fuse_fd; + int stop; + int inotify_fd; + int watch_desc; +} ucs_vfs_fuse_context = { + .thread_id = -1, + .mutex = PTHREAD_MUTEX_INITIALIZER, + .fuse = NULL, + .fuse_fd = -1, + .stop = 0, + .inotify_fd = -1, + .watch_desc = -1 +}; + +static void ucs_vfs_enum_dir_cb(const char *name, void *arg) +{ + ucs_vfs_enum_dir_context_t *ctx = arg; + + ctx->filler(ctx->buf, name, NULL, 0, 0); +} + +static int ucs_vfs_fuse_getattr(const char *path, struct stat *stbuf, + struct fuse_file_info *fi) +{ + ucs_vfs_path_info_t info; + ucs_status_t status; + + memset(stbuf, 0, sizeof(struct stat)); + stbuf->st_uid = getuid(); + stbuf->st_gid = getgid(); + + if (strcmp(path, "/") == 0) { + stbuf->st_mode = S_IFDIR | S_IRWXU; + stbuf->st_nlink = 2; + return 0; + } + + status = ucs_vfs_path_get_info(path, &info); + if (status != UCS_OK) { + return -ENOENT; + } + + stbuf->st_mode = info.mode; + stbuf->st_size = info.size; + stbuf->st_nlink = 1; + + return 0; +} + +static int ucs_vfs_fuse_open(const char *path, struct fuse_file_info *fi) +{ + ucs_string_buffer_t strb; + ucs_status_t status; + + ucs_string_buffer_init(&strb); + status = ucs_vfs_path_read_file(path, &strb); + if (status != UCS_OK) { + return -ENOENT; + } + + fi->fh = (uintptr_t)ucs_string_buffer_extract_mem(&strb); + + return 0; +} + +static int ucs_vfs_fuse_read(const char *path, char *buf, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + char *data = (void*)fi->fh; + size_t length = strlen(data); + size_t nread; + + if (offset >= length) { + return 0; + } + + if ((offset + size) <= length) { + nread = size; /* read does not pass end-of-file */ + } else { + nread = length - offset; /* read truncated by end-of-file */ + } + memcpy(buf, data + offset, nread); + + return nread; +} + +static int ucs_vfs_fuse_readdir(const char *path, void *buf, + fuse_fill_dir_t filler, off_t offset, + struct fuse_file_info *fi, + enum fuse_readdir_flags flags) +{ + ucs_vfs_enum_dir_context_t ctx; + ucs_status_t status; + + filler(buf, ".", NULL, 0, 0); + filler(buf, "..", NULL, 0, 0); + + ctx.buf = buf; + ctx.filler = filler; + status = ucs_vfs_path_list_dir(path, ucs_vfs_enum_dir_cb, &ctx); + if (status != UCS_OK) { + return -ENOENT; + } + + return 0; +} + +static int ucs_vfs_fuse_release(const char *path, struct fuse_file_info *fi) +{ + char *data = (void*)fi->fh; + + ucs_free(data); + return 0; +} + +struct fuse_operations ucs_vfs_fuse_operations = { + .getattr = ucs_vfs_fuse_getattr, + .open = ucs_vfs_fuse_open, + .read = ucs_vfs_fuse_read, + .readdir = ucs_vfs_fuse_readdir, + .release = ucs_vfs_fuse_release, +}; + +static void ucs_vfs_fuse_main() +{ + struct fuse_args fargs = FUSE_ARGS_INIT(0, NULL); + char mountpoint_fd[64]; + int ret; + + fuse_opt_add_arg(&fargs, ""); + + pthread_mutex_lock(&ucs_vfs_fuse_context.mutex); + + if (ucs_vfs_fuse_context.stop) { + goto out_unlock; + } + + ucs_vfs_fuse_context.fuse = fuse_new(&fargs, &ucs_vfs_fuse_operations, + sizeof(ucs_vfs_fuse_operations), NULL); + if (ucs_vfs_fuse_context.fuse == NULL) { + 
ucs_error("fuse_new() failed"); + goto out_unlock; + } + + ucs_snprintf_safe(mountpoint_fd, sizeof(mountpoint_fd), "/dev/fd/%d", + ucs_vfs_fuse_context.fuse_fd); + ret = fuse_mount(ucs_vfs_fuse_context.fuse, mountpoint_fd); + if (ret < 0) { + ucs_error("fuse_mount(%s) failed: %d", mountpoint_fd, ret); + goto out_destroy; + } + + /* Drop the lock and execute main loop */ + pthread_mutex_unlock(&ucs_vfs_fuse_context.mutex); + + fuse_loop(ucs_vfs_fuse_context.fuse); + + pthread_mutex_lock(&ucs_vfs_fuse_context.mutex); +out_destroy: + /* destroy when lock is held */ + fuse_destroy(ucs_vfs_fuse_context.fuse); + ucs_vfs_fuse_context.fuse = NULL; +out_unlock: + pthread_mutex_unlock(&ucs_vfs_fuse_context.mutex); +} + +static ucs_status_t ucs_vfs_fuse_wait_for_path(const char *path) +{ +#ifdef HAVE_INOTIFY + char event_buf[sizeof(struct inotify_event) + NAME_MAX]; + const struct inotify_event *event; + char watch_filename[NAME_MAX]; + const char *watch_dirname; + char dir_buf[PATH_MAX]; + ucs_status_t status; + ssize_t nread; + size_t offset; + + pthread_mutex_lock(&ucs_vfs_fuse_context.mutex); + + /* copy path components to 'dir_buf' and 'watch_filename' */ + ucs_strncpy_safe(dir_buf, path, sizeof(dir_buf)); + ucs_strncpy_safe(watch_filename, ucs_basename(path), + sizeof(watch_filename)); + watch_dirname = dirname(dir_buf); + + /* Create inotify channel */ + ucs_vfs_fuse_context.inotify_fd = inotify_init(); + if (ucs_vfs_fuse_context.inotify_fd < 0) { + ucs_error("inotify_init() failed: %m"); + status = UCS_ERR_IO_ERROR; + goto out; + } + + /* Watch for new files in 'watch_dirname' */ + ucs_vfs_fuse_context.watch_desc = inotify_add_watch( + ucs_vfs_fuse_context.inotify_fd, watch_dirname, IN_CREATE); + if (ucs_vfs_fuse_context.watch_desc < 0) { + ucs_error("inotify_add_watch(%s) failed: %m", watch_dirname); + status = UCS_ERR_IO_ERROR; + goto out_close_inotify_fd; + } + + /* Check 'stop' flag before entering the loop. If the main thread sets + * 'stop' flag before this thread created 'inotify_fd' fd, the execution + * of the thread has to be stopped, otherwise - the thread hangs waiting + * for the data on 'inotify_fd' fd. 
+ */ + if (ucs_vfs_fuse_context.stop) { + status = UCS_ERR_CANCELED; + goto out_close_watch_id; + } + + /* Read events from inotify channel and exit when either the main thread set + * 'stop' flag, or the file was created + */ + ucs_debug("waiting for creation of '%s' in '%s'", watch_filename, + watch_dirname); + for (;;) { + pthread_mutex_unlock(&ucs_vfs_fuse_context.mutex); + nread = read(ucs_vfs_fuse_context.inotify_fd, event_buf, + sizeof(event_buf)); + pthread_mutex_lock(&ucs_vfs_fuse_context.mutex); + + if (ucs_vfs_fuse_context.stop) { + status = UCS_ERR_CANCELED; + break; + } + + if (nread < 0) { + ucs_error("inotify read() failed: %m"); + status = UCS_ERR_IO_ERROR; + break; + } + + /* Go over new events in the buffer */ + for (offset = 0; offset < nread; + offset += sizeof(*event) + event->len) { + event = UCS_PTR_BYTE_OFFSET(event_buf, offset); + if (!(event->mask & IN_CREATE)) { + ucs_trace("ignoring inotify event with mask 0x%x", event->mask); + continue; + } + + ucs_debug("file '%s' created", event->name); + if (strcmp(event->name, watch_filename)) { + ucs_trace("ignoring inotify create event of '%s'", event->name); + continue; + } + + status = UCS_OK; + goto out_close_watch_id; + } + } + +out_close_watch_id: + inotify_rm_watch(ucs_vfs_fuse_context.inotify_fd, + ucs_vfs_fuse_context.watch_desc); +out_close_inotify_fd: + close(ucs_vfs_fuse_context.inotify_fd); + ucs_vfs_fuse_context.inotify_fd = -1; +out: + pthread_mutex_unlock(&ucs_vfs_fuse_context.mutex); + return status; +#else + return UCS_ERR_UNSUPPORTED; +#endif +} + +static void *ucs_vfs_fuse_thread_func(void *arg) +{ + ucs_vfs_sock_message_t vfs_msg_in, vfs_msg_out; + struct sockaddr_un un_addr; + ucs_status_t status; + int connfd; + int ret; + + connfd = socket(AF_UNIX, SOCK_STREAM, 0); + if (connfd < 0) { + ucs_error("failed to create VFS socket: %m"); + goto out; + } + +again: + ret = ucs_vfs_sock_get_address(&un_addr); + if (ret < 0) { + ucs_warn("failed to get vfs socket path: %s", strerror(-ret)); + goto out_close; + } + + ucs_debug("connecting vfs socket %d to daemon on '%s'", connfd, + un_addr.sun_path); + ret = connect(connfd, (const struct sockaddr*)&un_addr, sizeof(un_addr)); + if (ret < 0) { + /* VFS daemon is not listening. Set up a file watch on the unix socket + * path, to retry when the daemon is started. 
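+         * The watch is implemented with inotify (IN_CREATE events) in
+         * ucs_vfs_fuse_wait_for_path(); on success we jump back to the
+         * connect attempt above.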
+ */ + if ((errno == ECONNREFUSED) || (errno == ENOENT)) { + ucs_debug("failed to connect to vfs socket '%s': %m", + un_addr.sun_path); + status = ucs_vfs_fuse_wait_for_path(un_addr.sun_path); + if (status == UCS_OK) { + goto again; + } + + ucs_diag("failed to watch on '%s', VFS will be disabled", + un_addr.sun_path); + } else { + ucs_warn("failed to connect to vfs socket '%s': %m", + un_addr.sun_path); + } + goto out_close; + } + + ucs_debug("sending vfs mount request on socket %d", connfd); + vfs_msg_out.action = UCS_VFS_SOCK_ACTION_MOUNT; + ret = ucs_vfs_sock_send(connfd, &vfs_msg_out); + if (ret < 0) { + ucs_warn("failed to send mount action to vfs daemon: %s", + strerror(-ret)); + goto out_close; + } + + ret = ucs_vfs_sock_recv(connfd, &vfs_msg_in); + if (ret < 0) { + ucs_warn("failed to receive mount reply from vfs daemon: %s", + strerror(-ret)); + goto out_close; + } + + ucs_vfs_fuse_context.fuse_fd = vfs_msg_in.fd; + ucs_vfs_fuse_main(); + close(vfs_msg_in.fd); + +out_close: + close(connfd); +out: + return NULL; +} + +static void ucs_fuse_replace_fd_devnull() +{ + int devnull_fd; + + devnull_fd = open("/dev/null", O_RDWR); + if (devnull_fd < 0) { + ucs_warn("failed to open /dev/null: %m"); + return; + } + + /* force exiting from fuse event loop, which reads from fuse_fd */ + ucs_assert(ucs_vfs_fuse_context.fuse_fd != -1); + ucs_debug("dup2(%d, %d)", devnull_fd, ucs_vfs_fuse_context.fuse_fd); + dup2(devnull_fd, ucs_vfs_fuse_context.fuse_fd); + close(devnull_fd); +} + +static void ucs_fuse_thread_stop() +{ + sighandler_t orig_handler; + + orig_handler = signal(SIGUSR1, ucs_empty_function); + + pthread_mutex_lock(&ucs_vfs_fuse_context.mutex); + + ucs_vfs_fuse_context.stop = 1; + + /* If the thread is waiting in inotify loop, wake it */ + if (ucs_vfs_fuse_context.inotify_fd >= 0) { +#ifdef HAVE_INOTIFY + inotify_rm_watch(ucs_vfs_fuse_context.inotify_fd, + ucs_vfs_fuse_context.watch_desc); +#endif + } + + /* If the thread is in fuse loop, terminate it */ + if (ucs_vfs_fuse_context.fuse != NULL) { + fuse_exit(ucs_vfs_fuse_context.fuse); + ucs_fuse_replace_fd_devnull(); + pthread_kill(ucs_vfs_fuse_context.thread_id, SIGUSR1); + } + + pthread_mutex_unlock(&ucs_vfs_fuse_context.mutex); + + pthread_join(ucs_vfs_fuse_context.thread_id, NULL); + signal(SIGUSR1, orig_handler); +} + +UCS_STATIC_INIT +{ + if (ucs_global_opts.vfs_enable) { + pthread_create(&ucs_vfs_fuse_context.thread_id, NULL, + ucs_vfs_fuse_thread_func, NULL); + } +} + +UCS_STATIC_CLEANUP +{ + if (ucs_vfs_fuse_context.thread_id != -1) { + ucs_fuse_thread_stop(); + } +} diff --git a/src/ucs/vfs/sock/Makefile.am b/src/ucs/vfs/sock/Makefile.am new file mode 100644 index 00000000000..8e9b372ab8c --- /dev/null +++ b/src/ucs/vfs/sock/Makefile.am @@ -0,0 +1,11 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. +# +# See file LICENSE for terms. +# + +noinst_LTLIBRARIES = libucs_vfs_sock.la +libucs_vfs_sock_la_SOURCES = vfs_sock.c +noinst_HEADERS = vfs_sock.h +libucs_vfs_sock_la_CPPFLAGS = $(BASE_CPPFLAGS) +libucs_vfs_sock_la_LDFLAGS = $(BASE_LDFLAGS) diff --git a/src/ucs/vfs/sock/configure.m4 b/src/ucs/vfs/sock/configure.m4 new file mode 100644 index 00000000000..6abed32fc8e --- /dev/null +++ b/src/ucs/vfs/sock/configure.m4 @@ -0,0 +1,6 @@ +# +# Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. 
+# + + +AC_CONFIG_FILES([src/ucs/vfs/sock/Makefile]) diff --git a/src/ucs/vfs/sock/vfs_sock.c b/src/ucs/vfs/sock/vfs_sock.c new file mode 100644 index 00000000000..4eccc969f1f --- /dev/null +++ b/src/ucs/vfs/sock/vfs_sock.c @@ -0,0 +1,167 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "vfs_sock.h" + +#include +#include +#include +#include +#include +#include +#include +#include + + +typedef struct { + uint8_t action; +} UCS_S_PACKED ucs_vfs_msg_t; + + +int ucs_vfs_sock_get_address(struct sockaddr_un *un_addr) +{ + struct passwd *pw; + + pw = getpwuid(geteuid()); + if (pw == NULL) { + return -errno; + } + + memset(un_addr, 0, sizeof(*un_addr)); + un_addr->sun_family = AF_UNIX; + snprintf(un_addr->sun_path, sizeof(un_addr->sun_path) - 1, + "/tmp/ucx-vfs-%s.sock", pw->pw_name); + return 0; +} + +int ucs_vfs_sock_setopt_passcred(int sockfd) +{ + int optval, ret; + + optval = 1; + ret = setsockopt(sockfd, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)); + if (ret < 0) { + return -errno; + } + + return 0; +} + +static int ucs_vfs_sock_retval(ssize_t ret, size_t expected) +{ + if (ret == expected) { + return 0; + } else if (ret < 0) { + return -errno; + } else { + return -EIO; + } +} + +int ucs_vfs_sock_send(int sockfd, const ucs_vfs_sock_message_t *vfs_msg) +{ + char cbuf[CMSG_SPACE(sizeof(*vfs_msg))] UCS_V_ALIGNED(sizeof(size_t)); + struct cmsghdr *cmsgp; + struct msghdr msgh; + ucs_vfs_msg_t msg; + struct iovec iov; + ssize_t nsent; + + memset(cbuf, 0, sizeof(cbuf)); + memset(&msgh, 0, sizeof(msgh)); + msg.action = vfs_msg->action; + iov.iov_base = &msg; + iov.iov_len = sizeof(msg); + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (vfs_msg->action == UCS_VFS_SOCK_ACTION_MOUNT_REPLY) { + /* send file descriptor */ + msgh.msg_control = cbuf; + msgh.msg_controllen = sizeof(cbuf); + cmsgp = CMSG_FIRSTHDR(&msgh); + cmsgp->cmsg_level = SOL_SOCKET; + cmsgp->cmsg_len = CMSG_LEN(sizeof(vfs_msg->fd)); + cmsgp->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsgp), &vfs_msg->fd, sizeof(vfs_msg->fd)); + } + + do { + nsent = sendmsg(sockfd, &msgh, 0); + } while ((nsent < 0) && (errno == EINTR)); + return ucs_vfs_sock_retval(nsent, iov.iov_len); +} + +int ucs_vfs_sock_recv(int sockfd, ucs_vfs_sock_message_t *vfs_msg) +{ + char cbuf[CMSG_SPACE(sizeof(*vfs_msg))] UCS_V_ALIGNED(sizeof(size_t)); + const struct ucred *cred; + struct cmsghdr *cmsgp; + struct msghdr msgh; + ucs_vfs_msg_t msg; + struct iovec iov; + ssize_t nrecvd; + + /* initialize to invalid values */ + vfs_msg->action = UCS_VFS_SOCK_ACTION_LAST; + vfs_msg->fd = -1; + vfs_msg->pid = -1; + + memset(cbuf, 0, sizeof(cbuf)); + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = &msg; + iov.iov_len = sizeof(msg); + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = cbuf; + msgh.msg_controllen = sizeof(cbuf); + + do { + nrecvd = recvmsg(sockfd, &msgh, MSG_WAITALL); + } while ((nrecvd < 0) && (errno == EINTR)); + if (nrecvd != iov.iov_len) { + assert(nrecvd < iov.iov_len); + return ucs_vfs_sock_retval(nrecvd, iov.iov_len); + } + + vfs_msg->action = msg.action; + + cmsgp = CMSG_FIRSTHDR(&msgh); + if ((cmsgp == NULL) || (cmsgp->cmsg_level != SOL_SOCKET)) { + return -EINVAL; + } + + if (msg.action == UCS_VFS_SOCK_ACTION_MOUNT_REPLY) { + /* expect file descriptor */ + if ((cmsgp->cmsg_type != SCM_RIGHTS) || + (cmsgp->cmsg_len != CMSG_LEN(sizeof(vfs_msg->fd)))) { + return -EINVAL; + } + + memcpy(&vfs_msg->fd, 
CMSG_DATA(cmsgp), sizeof(vfs_msg->fd)); + } else { + /* expect credentials */ + if ((cmsgp->cmsg_type != SCM_CREDENTIALS) || + (cmsgp->cmsg_len != CMSG_LEN(sizeof(*cred)))) { + return -EINVAL; + } + + cred = (const struct ucred*)CMSG_DATA(cmsgp); + if ((cred->uid != getuid()) || (cred->gid != getgid())) { + return -EPERM; + } + + if (msg.action == UCS_VFS_SOCK_ACTION_MOUNT) { + vfs_msg->pid = cred->pid; + } + } + + return 0; +} diff --git a/src/ucs/vfs/sock/vfs_sock.h b/src/ucs/vfs/sock/vfs_sock.h new file mode 100644 index 00000000000..1d33efb3a1f --- /dev/null +++ b/src/ucs/vfs/sock/vfs_sock.h @@ -0,0 +1,89 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifndef UCS_VFS_SOCK_H_ +#define UCS_VFS_SOCK_H_ + +#include +#include +#include + +/* This header file defines socket operations for communicating between UCS + * library and VFS daemon */ + +/** + * VFS socket message type + */ +typedef enum { + UCS_VFS_SOCK_ACTION_STOP, /* daemon is asked to stop */ + UCS_VFS_SOCK_ACTION_MOUNT, /* daemon is asked to mount a file system */ + UCS_VFS_SOCK_ACTION_MOUNT_REPLY, /* daemon sends back FUSE file descriptor */ + UCS_VFS_SOCK_ACTION_NOP, /* no-operation, used to test connection */ + UCS_VFS_SOCK_ACTION_LAST +} ucs_vfs_sock_action_t; + + +/** + * Parameters structure for sending/receiving a message over VFS socket + */ +typedef struct { + ucs_vfs_sock_action_t action; + + /* If action==MOUNT_REPLY: in/out parameter, holds FUSE file descriptor. + * Otherwise: unused + */ + int fd; + + /* If action==MOUNT: out parameter, holds the pid of sender process. + * Otherwise: unused + */ + pid_t pid; +} ucs_vfs_sock_message_t; + + +/** + * Return the Unix-domain socket address of the VFS daemon. + * + * @param [out] un_addr Filled with socket address. + * + * @return 0 on success, or the negative value of errno in case of failure. + */ +int ucs_vfs_sock_get_address(struct sockaddr_un *un_addr); + + +/** + * Enable receiving credentials of the remote process for every message. + * Typically used by the VFS daemon to verify sender identity. + * + * @param [in] fd Enable SO_PASSCRED on this socket. + * + * @return 0 on success, or the negative value of errno in case of failure. + */ +int ucs_vfs_sock_setopt_passcred(int sockfd); + + +/** + * Send a message on the VFS socket. + * + * @param [in] fd Socket file descriptor to send the message on. + * @param [in] vfs_msg Message to send. + * + * @return 0 on success, or the negative value of errno in case of failure. + */ +int ucs_vfs_sock_send(int sockfd, const ucs_vfs_sock_message_t *vfs_msg); + + +/** + * Receive a message on the VFS socket. + * + * @param [in] fd Socket file descriptor to receive the message on. + * @param [out] vfs_msg Filled with details of the received message. + * + * @return 0 on success, or the negative value of errno in case of failure. + */ +int ucs_vfs_sock_recv(int sockfd, ucs_vfs_sock_message_t *vfs_msg); + +#endif diff --git a/src/uct/Makefile.am b/src/uct/Makefile.am index 6114aca3bf1..9fcddd9ee6e 100644 --- a/src/uct/Makefile.am +++ b/src/uct/Makefile.am @@ -1,5 +1,5 @@ # -# Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +# Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. # Copyright (c) UT-Battelle, LLC. 2014-2017. ALL RIGHTS RESERVED. # Copyright (c) The University of Tennesse and the University of Tennessee # Research Foundation. 2016. ALL RIGHTS RESERVED. 
@@ -42,11 +42,7 @@ noinst_HEADERS = \
 	tcp/tcp.h \
 	tcp/tcp_sockcm.h \
 	tcp/tcp_listener.h \
-	tcp/tcp_sockcm_ep.h \
-	tcp/sockcm/sockcm_def.h \
-	tcp/sockcm/sockcm_iface.h \
-	tcp/sockcm/sockcm_ep.h \
-	tcp/sockcm/sockcm_md.h
+	tcp/tcp_sockcm_ep.h
 
 
 libuct_la_SOURCES = \
@@ -54,6 +50,7 @@ libuct_la_SOURCES = \
 	base/uct_mem.c \
 	base/uct_component.c \
 	base/uct_iface.c \
+	base/uct_iface_vfs.c \
 	base/uct_worker.c \
 	base/uct_cm.c \
 	sm/base/sm_ep.c \
@@ -74,7 +71,4 @@ libuct_la_SOURCES = \
 	tcp/tcp_base.c \
 	tcp/tcp_sockcm.c \
 	tcp/tcp_listener.c \
-	tcp/tcp_sockcm_ep.c \
-	tcp/sockcm/sockcm_iface.c \
-	tcp/sockcm/sockcm_ep.c \
-	tcp/sockcm/sockcm_md.c
+	tcp/tcp_sockcm_ep.c
diff --git a/src/uct/api/tl.h b/src/uct/api/tl.h
index e6f7667e21a..41b85d2b163 100644
--- a/src/uct/api/tl.h
+++ b/src/uct/api/tl.h
@@ -75,6 +75,10 @@ typedef ucs_status_t (*uct_ep_am_short_func_t)(uct_ep_h ep,
                                                const void *payload,
                                                unsigned length);
 
+typedef ucs_status_t (*uct_ep_am_short_iov_func_t)(uct_ep_h ep, uint8_t id,
+                                                   const uct_iov_t *iov,
+                                                   size_t iovcnt);
+
 typedef ssize_t (*uct_ep_am_bcopy_func_t)(uct_ep_h ep,
                                           uint8_t id,
                                           uct_pack_callback_t pack_cb,
@@ -215,6 +219,9 @@ typedef ucs_status_t (*uct_ep_check_func_t)(uct_ep_h ep,
 typedef ucs_status_t (*uct_ep_create_func_t)(const uct_ep_params_t *params,
                                              uct_ep_h *ep_p);
 
+typedef ucs_status_t (*uct_ep_connect_func_t)(
+        uct_ep_h ep, const uct_ep_connect_params_t *params);
+
 typedef ucs_status_t (*uct_ep_disconnect_func_t)(uct_ep_h ep, unsigned flags);
 
 typedef ucs_status_t (*uct_cm_ep_conn_notify_func_t)(uct_ep_h ep);
@@ -299,6 +306,7 @@ typedef struct uct_iface_ops {
 
     /* endpoint - active message */
     uct_ep_am_short_func_t       ep_am_short;
+    uct_ep_am_short_iov_func_t   ep_am_short_iov;
     uct_ep_am_bcopy_func_t       ep_am_bcopy;
     uct_ep_am_zcopy_func_t       ep_am_zcopy;
 
@@ -333,6 +341,7 @@ typedef struct uct_iface_ops {
 
     /* endpoint - connection establishment */
     uct_ep_create_func_t         ep_create;
+    uct_ep_connect_func_t        ep_connect;
     uct_ep_disconnect_func_t     ep_disconnect;
     uct_cm_ep_conn_notify_func_t cm_ep_conn_notify;
     uct_ep_destroy_func_t        ep_destroy;
diff --git a/src/uct/api/uct.h b/src/uct/api/uct.h
index 9da472e6cc9..1dc568c31fc 100644
--- a/src/uct/api/uct.h
+++ b/src/uct/api/uct.h
@@ -134,14 +134,16 @@ BEGIN_C_DECLS
  * @ref uct_ep_create
  *      Connect to the client by creating an endpoint if the request is accepted.
  *      The server creates a new endpoint for every connection request that it accepts.
- * @ref uct_cm_ep_priv_data_pack_callback_t
- *      This callback is invoked by the UCT transport to fill auxiliary data in
- *      the connection acknowledgement or reject notification back to the client.
- *      Send the client a connection acknowledgement or reject notification.
- *      Wait for an acknowledgment from the client, indicating that it is connected.
  * @ref uct_cm_ep_server_conn_notify_callback_t
  *      This callback is invoked by the UCT transport to handle the connection
  *      notification from the client.
+ * @note The private data which the server should send to the client can be
+ *       either provided directly to @ref uct_ep_create, or filled by
+ *       @ref uct_cm_ep_priv_data_pack_callback_t provided to
+ *       @ref uct_ep_create.
+ * @note In order to reject a connection request, the server can either call
+ *       @ref uct_listener_reject or return a failure status as defined by
+ *       @ref ucs_status_t from @ref uct_cm_ep_priv_data_pack_callback_t.
  *
  * Disconnecting:
  * @ref uct_ep_disconnect
@@ -169,12 +171,14 @@ BEGIN_C_DECLS
  *      Open a connection manager.
* @ref uct_ep_create * Create an endpoint for establishing a connection to the server. - * @ref uct_cm_ep_priv_data_pack_callback_t - * This callback is invoked by the UCT transport to fill the user's private data - * in the connection request to be sent to the server. This connection request - * should be created by the transport. - * Send the connection request to the server. - * Wait for an acknowledgment from the server, indicating that it is connected. + * @ref uct_cm_ep_resolve_callback_t + * This callback is invoked on the client side of the connection manager, + * after the remote server address was resolved to the local device to be + * used for connection establishment. + * @ref uct_ep_connect + * This function should be called on the client side, in order to send + * private data and resume connection establishment, following an + * address-resolved notification via @ref uct_cm_ep_resolve_callback_t. * @ref uct_cm_ep_client_connect_callback_t * This callback is invoked by the UCT transport to handle a connection response * from the server. @@ -412,6 +416,13 @@ typedef enum uct_atomic_op { and it may also be invoked when uct_worker_progress() is called. */ + /* Keepalive */ +#define UCT_IFACE_FLAG_EP_KEEPALIVE UCS_BIT(46) /**< Transport endpoint has built-in keepalive feature, + which guarantees the error callback on the transport + interface will be called if the communication + channel with remote peer is broken, even if there + are no outstanding send operations */ + /* Tag matching operations */ #define UCT_IFACE_FLAG_TAG_EAGER_SHORT UCS_BIT(50) /**< Hardware tag matching short eager support */ #define UCT_IFACE_FLAG_TAG_EAGER_BCOPY UCS_BIT(51) /**< Hardware tag matching bcopy eager support */ @@ -590,51 +601,60 @@ enum uct_iface_open_mode { */ enum uct_iface_params_field { /** Enables @ref uct_iface_params_t::cpu_mask */ - UCT_IFACE_PARAM_FIELD_CPU_MASK = UCS_BIT(0), + UCT_IFACE_PARAM_FIELD_CPU_MASK = UCS_BIT(0), /** Enables @ref uct_iface_params_t::open_mode */ - UCT_IFACE_PARAM_FIELD_OPEN_MODE = UCS_BIT(1), + UCT_IFACE_PARAM_FIELD_OPEN_MODE = UCS_BIT(1), /** Enables @ref uct_iface_params_t_mode_device * "uct_iface_params_t::mode::device" */ - UCT_IFACE_PARAM_FIELD_DEVICE = UCS_BIT(2), + UCT_IFACE_PARAM_FIELD_DEVICE = UCS_BIT(2), /** Enables @ref uct_iface_params_t_mode_sockaddr * "uct_iface_params_t::mode::sockaddr" */ - UCT_IFACE_PARAM_FIELD_SOCKADDR = UCS_BIT(3), + UCT_IFACE_PARAM_FIELD_SOCKADDR = UCS_BIT(3), /** Enables @ref uct_iface_params_t::stats_root */ - UCT_IFACE_PARAM_FIELD_STATS_ROOT = UCS_BIT(4), + UCT_IFACE_PARAM_FIELD_STATS_ROOT = UCS_BIT(4), /** Enables @ref uct_iface_params_t::rx_headroom */ - UCT_IFACE_PARAM_FIELD_RX_HEADROOM = UCS_BIT(5), + UCT_IFACE_PARAM_FIELD_RX_HEADROOM = UCS_BIT(5), /** Enables @ref uct_iface_params_t::err_handler_arg */ - UCT_IFACE_PARAM_FIELD_ERR_HANDLER_ARG = UCS_BIT(6), + UCT_IFACE_PARAM_FIELD_ERR_HANDLER_ARG = UCS_BIT(6), /** Enables @ref uct_iface_params_t::err_handler */ - UCT_IFACE_PARAM_FIELD_ERR_HANDLER = UCS_BIT(7), + UCT_IFACE_PARAM_FIELD_ERR_HANDLER = UCS_BIT(7), /** Enables @ref uct_iface_params_t::err_handler_flags */ - UCT_IFACE_PARAM_FIELD_ERR_HANDLER_FLAGS = UCS_BIT(8), + UCT_IFACE_PARAM_FIELD_ERR_HANDLER_FLAGS = UCS_BIT(8), /** Enables @ref uct_iface_params_t::eager_arg */ - UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_ARG = UCS_BIT(9), + UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_ARG = UCS_BIT(9), /** Enables @ref uct_iface_params_t::eager_cb */ - UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_CB = UCS_BIT(10), + 
UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_CB     = UCS_BIT(10),
+    UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_CB     = UCS_BIT(10),
 
     /** Enables @ref uct_iface_params_t::rndv_arg */
-    UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_ARG  = UCS_BIT(11),
+    UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_ARG     = UCS_BIT(11),
 
     /** Enables @ref uct_iface_params_t::rndv_cb */
-    UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB   = UCS_BIT(12),
+    UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_CB      = UCS_BIT(12),
 
     /** Enables @ref uct_iface_params_t::async_event_arg */
-    UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_ARG = UCS_BIT(13),
+    UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_ARG    = UCS_BIT(13),
 
     /** Enables @ref uct_iface_params_t::async_event_cb */
-    UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_CB  = UCS_BIT(14)
+    UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_CB     = UCS_BIT(14),
+
+    /** Enables @ref uct_iface_params_t::keepalive_interval */
+    UCT_IFACE_PARAM_FIELD_KEEPALIVE_INTERVAL = UCS_BIT(15),
+
+    /** Enables @ref uct_iface_params_t::am_alignment */
+    UCT_IFACE_PARAM_FIELD_AM_ALIGNMENT       = UCS_BIT(16),
+
+    /** Enables @ref uct_iface_params_t::am_align_offset */
+    UCT_IFACE_PARAM_FIELD_AM_ALIGN_OFFSET    = UCS_BIT(17)
 };
@@ -820,7 +840,32 @@ enum uct_ep_params_field {
     UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB = UCS_BIT(11),
 
     /** Enables @ref uct_ep_params::path_index */
-    UCT_EP_PARAM_FIELD_PATH_INDEX             = UCS_BIT(12)
+    UCT_EP_PARAM_FIELD_PATH_INDEX             = UCS_BIT(12),
+
+    /** Enables @ref uct_ep_params::cm_resolve_cb */
+    UCT_EP_PARAM_FIELD_CM_RESOLVE_CB          = UCS_BIT(13),
+
+    /** Enables @ref uct_ep_params::private_data */
+    UCT_EP_PARAM_FIELD_PRIV_DATA              = UCS_BIT(14),
+
+    /** Enables @ref uct_ep_params::private_data_length */
+    UCT_EP_PARAM_FIELD_PRIV_DATA_LENGTH       = UCS_BIT(15)
+};
+
+
+/**
+ * @ingroup UCT_CLIENT_SERVER
+ * @brief UCT endpoint connected by @ref uct_ep_connect parameters field mask.
+ *
+ * The enumeration allows specifying which fields in
+ * @ref uct_ep_connect_params_t are present, for backward compatibility support.
+ */
+enum uct_ep_connect_params_field {
+    /** Enables @ref uct_ep_connect_params::private_data */
+    UCT_EP_CONNECT_PARAM_FIELD_PRIVATE_DATA        = UCS_BIT(0),
+
+    /** Enables @ref uct_ep_connect_params::private_data_length */
+    UCT_EP_CONNECT_PARAM_FIELD_PRIVATE_DATA_LENGTH = UCS_BIT(1)
 };
@@ -879,8 +924,9 @@ struct uct_iface_attr {
         } get;                       /**< Attributes for GET operations */
 
         struct {
-            size_t max_short;  /**< Total max. size (incl. the header) */
-            size_t max_bcopy;  /**< Total max. size (incl. the header) */
+            size_t max_short;  /**< Total maximum size (incl. the header)
+                                    @anchor uct_iface_attr_cap_am_max_short */
+            size_t max_bcopy;  /**< Total maximum size (incl. the header) */
             size_t min_zcopy;  /**< Minimal size for am_zcopy (incl. the header
                                     and total of @ref uct_iov_t::length of the
                                     @a iov parameter) */
@@ -1045,6 +1091,36 @@ struct uct_iface_params {
                                          * read by user if the iface has @ref
                                          * UCT_IFACE_FLAG_EVENT_ASYNC_CB
                                          * capability */
    uct_async_event_cb_t async_event_cb;
+
+    /* Time period between keepalive rounds */
+    ucs_time_t           keepalive_interval;
+
+    /**
+     * Desired alignment for Active Messages on the receiver. Note that only
+     * data received in the UCT descriptor can be aligned (i.e.
+     * @a UCT_CB_PARAM_FLAG_DESC flag is provided in the Active Message
+     * handler callback). The provided value must be a power of 2. The default
+     * value is 1.
+     */
+    size_t               am_alignment;
+
+    /**
+     * Offset in the Active Message receive buffer, which should be aligned to
+     * the @a am_alignment boundary. Note this parameter has no effect without
+     * setting the @a am_alignment parameter. The provided value must be less
+     * than the given @a am_alignment value. The default value is 0.
+     *
+     *     +-+ pointer to @a data in @ref uct_am_callback_t
+     *     |
+     *     |    + alignment boundary
+     *     |    |
+     *     v    v
+     *     +-------------------+
+     *     | align  |          |
+     *     | offset |          |
+     *     +-------------------+
+     */
+    size_t               am_align_offset;
 };
@@ -1095,9 +1171,12 @@ struct uct_ep_params {
     const ucs_sock_addr_t               *sockaddr;
 
     /**
-     * @ref uct_cb_flags to indicate @ref uct_ep_params_t::sockaddr_pack_cb
-     * behavior. If @ref uct_ep_params_t::sockaddr_pack_cb is not set, this
-     * field will be ignored.
+     * @ref uct_cb_flags to indicate @ref uct_ep_params_t::sockaddr_pack_cb,
+     * @ref uct_ep_params_t::sockaddr_cb_client,
+     * @ref uct_ep_params_t::sockaddr_cb_server,
+     * @ref uct_ep_params_t::disconnect_cb and
+     * @ref uct_ep_params_t::cm_resolve_cb behavior.
+     * If none of these is set, this field will be ignored.
      */
     uint32_t                            sockaddr_cb_flags;
 
@@ -1105,9 +1184,11 @@ struct uct_ep_params {
      * Callback that will be used for filling the user's private data to be
      * delivered to the remote peer by the callback on the server or client side.
      * This field is only valid if @ref uct_ep_params_t::sockaddr is set.
-     * @note It is never guaranteed that the callaback will be called. If, for
+     * @note It is never guaranteed that the callback will be called. If, for
      *       example, the endpoint goes into error state before issuing the
      *       connection request, the callback will not be invoked.
+     * @note Cannot be set together with @ref uct_ep_params_t::private_data or
+     *       @ref uct_ep_params_t::cm_resolve_cb.
      */
     uct_cm_ep_priv_data_pack_callback_t sockaddr_pack_cb;
 
@@ -1148,9 +1229,59 @@ struct uct_ep_params {
      * 0..(@ref uct_iface_attr_t.dev_num_paths - 1).
      */
     unsigned                            path_index;
+
+    /**
+     * This callback is invoked when the remote server address provided in the
+     * field @ref uct_ep_params_t::sockaddr is resolved to the local device to
+     * be used for connection establishment.
+     * @note In the event of a connection error, this callback will not be
+     *       invoked; @ref uct_ep_params_t::sockaddr_cb_client, indicating the
+     *       error code, will be invoked instead.
+     * @note This field is mutually exclusive with
+     *       @ref uct_ep_params::sockaddr_pack_cb.
+     */
+    uct_cm_ep_resolve_callback_t        cm_resolve_cb;
+
+    /**
+     * Private data to be passed from server to client. Can be used only along
+     * with @ref uct_ep_params::conn_request.
+     * @note This field is mutually exclusive with
+     *       @ref uct_ep_params::sockaddr_pack_cb.
+     */
+    const void                          *private_data;
+
+    /**
+     * Length of @ref uct_ep_params::private_data, the maximal allowed value is
+     * indicated by the @ref uct_cm_attr::max_conn_priv.
+     */
+    size_t                              private_data_length;
 };
 
+
+/**
+ * @ingroup UCT_CLIENT_SERVER
+ * @brief Parameters for connecting a UCT endpoint by @ref uct_ep_connect.
+ */
+struct uct_ep_connect_params {
+    /**
+     * Mask of valid fields in this structure, using bits from
+     * @ref uct_ep_connect_params_field. Fields not specified by this mask
+     * will be ignored.
+     */
+    uint64_t    field_mask;
+
+    /**
+     * User's private data to be passed from client to server.
+     */
+    const void *private_data;
+
+    /**
+     * Length of @ref uct_ep_connect_params::private_data, the maximal allowed
+     * value is indicated by the @ref uct_cm_attr::max_conn_priv.
+     */
+    size_t      private_data_length;
+};
+
 /**
  * @ingroup UCT_CLIENT_SERVER
  * @brief Connection manager attributes, capabilities and limitations.
@@ -1259,14 +1390,20 @@ struct uct_md_attr {
  * The enumeration allows specifying which fields in @ref uct_md_mem_attr_t
  * are present.
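+ * For example (an illustrative sketch), a caller interested only in the
+ * memory type and base address of an allocation would set field_mask to
+ * (UCT_MD_MEM_ATTR_FIELD_MEM_TYPE | UCT_MD_MEM_ATTR_FIELD_BASE_ADDRESS)
+ * before calling @ref uct_md_mem_query.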
*/ -enum uct_md_mem_attr_field { - UCT_MD_MEM_ATTR_FIELD_MEM_TYPE = UCS_BIT(0), /**< Indicate if memory type is populated. E.g. CPU/GPU */ - UCT_MD_MEM_ATTR_FIELD_SYS_DEV = UCS_BIT(1) /**< Indicate if details of system device backing the pointer are populated. E.g. NUMA/GPU */ -}; +typedef enum uct_md_mem_attr_field { + UCT_MD_MEM_ATTR_FIELD_MEM_TYPE = UCS_BIT(0), /**< Indicate if memory type is populated. E.g. CPU/GPU */ + UCT_MD_MEM_ATTR_FIELD_SYS_DEV = UCS_BIT(1), /**< Indicate if details of system device backing the pointer are populated. E.g. NUMA/GPU */ + UCT_MD_MEM_ATTR_FIELD_BASE_ADDRESS = UCS_BIT(2), /**< Request base address of the allocation to which the buffer belongs. */ + UCT_MD_MEM_ATTR_FIELD_ALLOC_LENGTH = UCS_BIT(3) /**< Request the whole length of the allocation to which the buffer belongs. */ +} uct_md_mem_attr_field_t; /** @@ -1280,22 +1417,37 @@ enum uct_md_mem_attr_field { typedef struct uct_md_mem_attr { /** * Mask of valid fields in this structure, using bits from - * @ref uct_md_mem_attr_t. Note that the field mask is - * populated upon return from uct_md_mem_query and not set by user. - * Subsequent use of members of the structure are valid after ensuring that - * relevant bits in the field_mask are set. + * @ref uct_md_mem_attr_field_t. */ uint64_t field_mask; /** - * The type of memory. E.g. CPU/GPU memory or some other valid type + * The type of memory. E.g. CPU/GPU memory or some other valid type. + * If the md does not support the mem_type query, then + * UCS_MEMORY_TYPE_UNKNOWN is returned. */ ucs_memory_type_t mem_type; /** * Index of the system device on which the buffer resides. eg: NUMA/GPU + * If the md does not support the sys_dev query, then + * UCS_SYS_DEVICE_ID_UNKNOWN is returned. */ ucs_sys_device_t sys_dev; + + /** + * Base address of the allocation to which the provided buffer belongs. + * If the md does not support the base address query, then the pointer + * passed to uct_md_mem_query is returned as is. + */ + void *base_address; + + /** + * Length of the whole allocation to which the provided buffer belongs. + * If the md does not support querying the allocation length, then the + * length passed to uct_md_mem_query is returned as is. + */ + size_t alloc_length; } uct_md_mem_attr_t; @@ -1303,8 +1455,8 @@ typedef struct uct_md_mem_attr { * @ingroup UCT_MD * @brief Query attributes of a given pointer * - * Return attributes such as memory type, and system device for the - * given pointer of specific length. + * Return attributes such as memory type, base address, allocation length, + * and system device for the given pointer of specific length. * * @param [in] md Memory domain to run the query on. This function * returns an error if the md does not recognize the * @param [in] length Length of the memory region to examine. * Must be nonzero else UCS_ERR_INVALID_PARAM error * is returned. - * @param [out] mem_attr If successful, filled with ptr attributes. + * @param [inout] mem_attr If successful, filled with ptr attributes. * - * @return Error code. + * @return UCS_OK if at least one attribute is successfully queried, otherwise + * an error code as defined by @ref ucs_status_t is returned. */ -ucs_status_t uct_md_mem_query(uct_md_h md, const void *address, const size_t length, +ucs_status_t uct_md_mem_query(uct_md_h md, const void *address, size_t length, uct_md_mem_attr_t *mem_attr); @@ -1413,11 +1566,16 @@ struct uct_tag_context { /** * Tag processing is completed by the transport.
* - * @param [in] self Pointer to relevant context structure, which was - * initially passed to @ref uct_iface_tag_recv_zcopy. - * @param [in] stag Tag from sender. - * @param [in] imm Immediate data from sender. For rendezvous, it's always 0. - * @param [in] length Completed length. + * @param [in] self Pointer to relevant context structure, which was + * initially passed to @ref uct_iface_tag_recv_zcopy. + * @param [in] stag Tag from sender. + * @param [in] imm Immediate data from sender. For rendezvous, it's always 0. + * @param [in] length Completed length. + * @param [in] inline_data If non-null, points to a temporary buffer which contains the received data. In this case the received data was not placed directly in the receive buffer. This callback routine is responsible for copying out the inline data; otherwise the data is released. * @param [in] status Completion status: * (a) UCS_OK - Success, data placed in provided buffer. * (b) UCS_ERR_TRUNCATED - Sender's length exceed posted @@ -1425,7 +1583,7 @@ * (c) UCS_ERR_CANCELED - Canceled by user. */ void (*completed_cb)(uct_tag_context_t *self, uct_tag_t stag, uint64_t imm, - size_t length, ucs_status_t status); + size_t length, void *inline_data, ucs_status_t status); /** * Tag was matched by a rendezvous request, which should be completed by @@ -1437,15 +1595,26 @@ * @param [in] header User defined header. * @param [in] header_length User defined header length in bytes. * @param [in] status Completion status. + * @param [in] flags Flags defined by UCT_TAG_RECV_CB_xx. */ void (*rndv_cb)(uct_tag_context_t *self, uct_tag_t stag, const void *header, - unsigned header_length, ucs_status_t status); + unsigned header_length, ucs_status_t status, unsigned flags); /** A placeholder for the private data used by the transport */ char priv[UCT_TAG_PRIV_LEN]; }; +/** + * @ingroup UCT_RESOURCE + * @brief Flags of @ref uct_tag_context. + */ +enum { + /* If set, header points to inline data; otherwise it is a user buffer. */ + UCT_TAG_RECV_CB_INLINE_DATA = UCS_BIT(0) +}; + + extern const char *uct_alloc_method_names[]; @@ -1992,6 +2161,23 @@ ucs_status_t uct_iface_reject(uct_iface_h iface, ucs_status_t uct_ep_create(const uct_ep_params_t *params, uct_ep_h *ep_p); +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Connect a client-side endpoint after it is bound to a local network + * device, i.e. @ref uct_ep_params_t::cm_resolve_cb was invoked. + * + * This non-blocking routine establishes the connection of the client-side + * endpoint and sends private data to the peer. + * + * @param [in] ep Endpoint to connect. + * @param [in] params Parameters as defined in @ref uct_ep_connect_params_t. + * + * @return UCS_OK Operation has been initiated successfully. + * Other error codes as defined by @ref ucs_status_t. + */ +ucs_status_t uct_ep_connect(uct_ep_h ep, const uct_ep_connect_params_t *params); + + /** * @ingroup UCT_CLIENT_SERVER * @brief Initiate a disconnection of an endpoint connected to a @@ -2610,6 +2796,43 @@ UCT_INLINE_API ucs_status_t uct_ep_am_short(uct_ep_h ep, uint8_t id, uint64_t he } +/** + * @ingroup UCT_AM + * @brief Short io-vector send operation. + * + * This routine sends a message using @ref uct_short_protocol_desc "short" protocol. + * The input data in the @a iov array of @ref ::uct_iov_t structures is sent to the + * remote side as a contiguous buffer, preserving the order of the data in the array. + * + * @param [in] ep Destination endpoint handle. + * @param [in] id Active message id.
Must be in range 0..UCT_AM_ID_MAX-1. + * @param [in] iov Points to an array of @ref ::uct_iov_t structures. + * The @a iov pointer must be a valid address of an array + * of @ref ::uct_iov_t structures. A particular structure + * pointer must be a valid address. A NULL terminated + * array is not required. @a stride and @a count fields in + * @ref ::uct_iov_t structure are ignored in current + * implementation. The total size of the data buffers in + * the array is limited by + * @ref uct_iface_attr_cap_am_max_short + * "uct_iface_attr::cap::am::max_short". + * @param [in] iovcnt Size of the @a iov data @ref ::uct_iov_t structures + * array. If @a iovcnt is zero, the data is considered empty. + * @a iovcnt is limited by @ref uct_iface_attr_cap_am_max_iov + * "uct_iface_attr::cap::am::max_iov". + * + * @return UCS_OK Operation completed successfully. + * @return UCS_ERR_NO_RESOURCE Could not start the operation due to lack of + * send resources. + * @return otherwise Error code. + */ +UCT_INLINE_API ucs_status_t uct_ep_am_short_iov(uct_ep_h ep, uint8_t id, + const uct_iov_t *iov, size_t iovcnt) +{ + return ep->iface->ops.ep_am_short_iov(ep, id, iov, iovcnt); +} + + /** * @ingroup UCT_AM * @brief diff --git a/src/uct/api/uct_def.h b/src/uct/api/uct_def.h index 7c53c2af50c..d1eb76354cd 100644 --- a/src/uct/api/uct_def.h +++ b/src/uct/api/uct_def.h @@ -73,39 +73,40 @@ enum uct_cb_param_flags { * @addtogroup UCT_RESOURCE * @{ */ -typedef struct uct_component *uct_component_h; -typedef struct uct_iface *uct_iface_h; -typedef struct uct_iface_config uct_iface_config_t; -typedef struct uct_md_config uct_md_config_t; -typedef struct uct_cm_config uct_cm_config_t; -typedef struct uct_ep *uct_ep_h; -typedef void * uct_mem_h; -typedef uintptr_t uct_rkey_t; -typedef struct uct_md *uct_md_h; /**< @brief Memory domain handler */ -typedef struct uct_md_ops uct_md_ops_t; -typedef void *uct_rkey_ctx_h; -typedef struct uct_iface_attr uct_iface_attr_t; -typedef struct uct_iface_params uct_iface_params_t; -typedef struct uct_md_attr uct_md_attr_t; -typedef struct uct_completion uct_completion_t; -typedef struct uct_pending_req uct_pending_req_t; -typedef struct uct_worker *uct_worker_h; -typedef struct uct_md uct_md_t; -typedef enum uct_am_trace_type uct_am_trace_type_t; -typedef struct uct_device_addr uct_device_addr_t; -typedef struct uct_iface_addr uct_iface_addr_t; -typedef struct uct_ep_addr uct_ep_addr_t; -typedef struct uct_ep_params uct_ep_params_t; -typedef struct uct_cm_attr uct_cm_attr_t; -typedef struct uct_cm uct_cm_t; -typedef uct_cm_t *uct_cm_h; -typedef struct uct_listener_attr uct_listener_attr_t; -typedef struct uct_listener *uct_listener_h; -typedef struct uct_listener_params uct_listener_params_t; -typedef struct uct_tag_context uct_tag_context_t; -typedef uint64_t uct_tag_t; /* tag type - 64 bit */ -typedef int uct_worker_cb_id_t; -typedef void* uct_conn_request_h; +typedef struct uct_component *uct_component_h; +typedef struct uct_iface *uct_iface_h; +typedef struct uct_iface_config uct_iface_config_t; +typedef struct uct_md_config uct_md_config_t; +typedef struct uct_cm_config uct_cm_config_t; +typedef struct uct_ep *uct_ep_h; +typedef void * uct_mem_h; +typedef uintptr_t uct_rkey_t; +typedef struct uct_md *uct_md_h; /**< @brief Memory domain handler */ +typedef struct uct_md_ops uct_md_ops_t; +typedef void *uct_rkey_ctx_h; +typedef struct uct_iface_attr uct_iface_attr_t; +typedef struct uct_iface_params uct_iface_params_t; +typedef struct uct_md_attr uct_md_attr_t; +typedef struct 
uct_completion uct_completion_t; +typedef struct uct_pending_req uct_pending_req_t; +typedef struct uct_worker *uct_worker_h; +typedef struct uct_md uct_md_t; +typedef enum uct_am_trace_type uct_am_trace_type_t; +typedef struct uct_device_addr uct_device_addr_t; +typedef struct uct_iface_addr uct_iface_addr_t; +typedef struct uct_ep_addr uct_ep_addr_t; +typedef struct uct_ep_params uct_ep_params_t; +typedef struct uct_ep_connect_params uct_ep_connect_params_t; +typedef struct uct_cm_attr uct_cm_attr_t; +typedef struct uct_cm uct_cm_t; +typedef uct_cm_t *uct_cm_h; +typedef struct uct_listener_attr uct_listener_attr_t; +typedef struct uct_listener *uct_listener_h; +typedef struct uct_listener_params uct_listener_params_t; +typedef struct uct_tag_context uct_tag_context_t; +typedef uint64_t uct_tag_t; /* tag type - 64 bit */ +typedef int uct_worker_cb_id_t; +typedef void* uct_conn_request_h; /** * @} @@ -155,16 +156,33 @@ typedef struct uct_iov { * @brief Client-Server private data pack callback arguments field mask. * * The enumeration allows specifying which fields in - * @ref uct_cm_ep_priv_data_pack_args are present, for backward compatibility support. + * @ref uct_cm_ep_priv_data_pack_args are present, for backward compatibility + * support. */ enum uct_cm_ep_priv_data_pack_args_field { /** Enables @ref uct_cm_ep_priv_data_pack_args::dev_name - * Indicates that dev_name field in uct_cm_ep_priv_data_pack_args_t is valid. + * Indicates that dev_name field in uct_cm_ep_priv_data_pack_args_t is + * valid. */ UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME = UCS_BIT(0) }; +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Client-Server resolve callback arguments field mask. + * + * The enumeration allows specifying which fields in + * @ref uct_cm_ep_resolve_args are present, for backward compatibility support. + */ +enum uct_cm_ep_resolve_args_field { + /** + * Indicates that @ref uct_cm_ep_resolve_args::dev_name is valid. + */ + UCT_CM_EP_RESOLVE_ARGS_FIELD_DEV_NAME = UCS_BIT(0) +}; + + /** * @ingroup UCT_CLIENT_SERVER * @brief Arguments to the client-server private data pack callback. @@ -177,7 +195,7 @@ typedef struct uct_cm_ep_priv_data_pack_args { * @ref uct_cm_ep_priv_data_pack_args_field. * Fields not specified by this mask should not be accessed by the callback. */ - uint64_t field_mask; + uint64_t field_mask; /** * Device name. This routine may fill the user's private data according to @@ -185,10 +203,34 @@ typedef struct uct_cm_ep_priv_data_pack_args { * corresponds to @ref uct_tl_resource_desc_t::dev_name as returned from * @ref uct_md_query_tl_resources. */ - char dev_name[UCT_DEVICE_NAME_MAX]; + char dev_name[UCT_DEVICE_NAME_MAX]; } uct_cm_ep_priv_data_pack_args_t; +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Arguments to the client-server resolved callback. + * + * Used with the client-server API on a connection manager. + */ +typedef struct uct_cm_ep_resolve_args { + /** + * Mask of valid fields in this structure, using bits from + * @ref uct_cm_ep_resolve_args_field. + * Fields not specified by this mask should not be accessed by the callback. + */ + uint64_t field_mask; + + /** + * Device name indicates the device that the endpoint was bound to during + * address and route resolution. The device name that is passed to this + * callback, corresponds to @ref uct_tl_resource_desc_t::dev_name as + * returned from @ref uct_md_query_tl_resources. 
+ */ + char dev_name[UCT_DEVICE_NAME_MAX]; +} uct_cm_ep_resolve_args_t; + + /** * @ingroup UCT_CLIENT_SERVER * @brief Remote data attributes field mask. @@ -687,7 +729,7 @@ typedef void (*uct_ep_disconnect_cb_t)(uct_ep_h ep, void *arg); * This callback routine will be invoked on the client side, before sending the * transport's connection request to the server, or on the server side before * sending a connection response to the client. - * The callback routine must be set when creating an endpoint. + * This callback routine can be set when creating an endpoint. * The user's private data should be placed inside the priv_data buffer to be * sent to the remote side. * The maximal allowed length of the private data is indicated by the field @@ -710,6 +752,29 @@ typedef ssize_t *pack_args, void *priv_data); +/** + * @ingroup UCT_CLIENT_SERVER + * @brief Callback to notify that the client-side endpoint is bound to a + * local device. + * + * This callback routine will be invoked when the client-side endpoint is bound + * to a local device. + * The callback routine can be set when creating an endpoint. + * Communication progress routines should not be called from this callback; + * it is allowed to call other UCT communication routines from it. + * + * @param [in] user_data User argument as defined in + * @ref uct_ep_params_t::user_data. + * @param [in] resolve_args Handle for the extra arguments provided by the + * transport. + * + * @return UCS_OK on success or error as defined in @ref ucs_status_t. + */ +typedef ucs_status_t +(*uct_cm_ep_resolve_callback_t)(void *user_data, + const uct_cm_ep_resolve_args_t *resolve_args); + + /** * @ingroup UCT_TAG * @brief Callback to process unexpected eager tagged message. diff --git a/src/uct/api/v2/uct_v2.h b/src/uct/api/v2/uct_v2.h new file mode 100644 index 00000000000..5c52f9437f4 --- /dev/null +++ b/src/uct/api/v2/uct_v2.h @@ -0,0 +1,139 @@ +/** + * @file uct_v2.h + * @date 2021 + * @copyright Mellanox Technologies Ltd. All rights reserved. + * @brief Unified Communication Transport + */ + +#ifndef UCT_V2_H_ +#define UCT_V2_H_ + +#include +#include +#include + +#include + +BEGIN_C_DECLS + +/** @file uct_v2.h */ + +/** +* @defgroup UCT_RESOURCE UCT Communication Resource +* @ingroup UCT_API +* @{ +* This section describes a concept of the Communication Resource and routines +* associated with the concept. +* @} +*/ + +/** + * @brief All existing UCT operations + * + * This enumeration defines all available UCT operations. + */ +typedef enum uct_ep_operation { + UCT_OP_AM_SHORT, /**< Short active message */ + UCT_OP_AM_BCOPY, /**< Buffered active message */ + UCT_OP_AM_ZCOPY, /**< Zero-copy active message */ + UCT_OP_PUT_SHORT, /**< Short put */ + UCT_OP_PUT_BCOPY, /**< Buffered put */ + UCT_OP_PUT_ZCOPY, /**< Zero-copy put */ + UCT_OP_GET_SHORT, /**< Short get */ + UCT_OP_GET_BCOPY, /**< Buffered get */ + UCT_OP_GET_ZCOPY, /**< Zero-copy get */ + UCT_OP_EAGER_SHORT, /**< Tag matching short eager */ + UCT_OP_EAGER_BCOPY, /**< Tag matching bcopy eager */ + UCT_OP_EAGER_ZCOPY, /**< Tag matching zcopy eager */ + UCT_OP_RNDV_ZCOPY, /**< Tag matching rendezvous zcopy */ + UCT_OP_ATOMIC_POST, /**< Atomic post */ + UCT_OP_ATOMIC_FETCH /**< Atomic fetch */ +} uct_ep_operation_t; + + +/** + * @ingroup UCT_RESOURCE + * @brief UCT interface query by @ref uct_iface_estimate_perf parameters field mask. + * + * The enumeration allows specifying which fields in @ref uct_perf_attr_t are + * present, for backward compatibility support.
+ */ +enum uct_perf_attr_field { + /** Enables @ref uct_perf_attr_t::operation */ + UCT_PERF_ATTR_FIELD_OPERATION = UCS_BIT(0), + + /** Enables @ref uct_perf_attr_t::local_memory_type */ + UCT_PERF_ATTR_FIELD_LOCAL_MEMORY_TYPE = UCS_BIT(1), + + /** Enables @ref uct_perf_attr_t::remote_memory_type */ + UCT_PERF_ATTR_FIELD_REMOTE_MEMORY_TYPE = UCS_BIT(2), + + /** Enables @ref uct_perf_attr_t::overhead */ + UCT_PERF_ATTR_FIELD_OVERHEAD = UCS_BIT(3), + + /** Enables @ref uct_perf_attr_t::bandwidth */ + UCT_PERF_ATTR_FIELD_BANDWIDTH = UCS_BIT(4) +}; + + +/** + * @ingroup UCT_RESOURCE + * @brief Parameters for querying a UCT interface by @ref uct_iface_estimate_perf. + * + * This structure must be allocated and initialized by the user. + */ +typedef struct { + /** + * Mask of valid fields in this structure, using bits from + * @ref uct_perf_attr_field. Fields not specified by this mask will be + * ignored. This field must be initialized by the caller. + */ + uint64_t field_mask; + + /** + * Operation to report performance for. + * This field must be initialized by the caller. + */ + uct_ep_operation_t operation; + + /** + * Local memory type to use for determining performance. + * This field must be initialized by the caller. + */ + ucs_memory_type_t local_memory_type; + + /** + * Remote memory type to use for determining performance. + * Relevant only for operations that have remote memory access. + * This field must be initialized by the caller. + */ + ucs_memory_type_t remote_memory_type; + + /** + * Message overhead time, in seconds. This field is set by the UCT layer. + */ + double overhead; + + /** + * Bandwidth model. This field is set by the UCT layer. + */ + uct_ppn_bandwidth_t bandwidth; +} uct_perf_attr_t; + + +/** + * @ingroup UCT_RESOURCE + * @brief Get interface performance attributes, by memory types and operation. + * A pointer to uct_perf_attr_t struct must be passed, with the memory + * types and operation members initialized. Overhead and bandwidth + * for the operation on the given memory types will be reported. + * + * @param [in] tl_iface Interface to query. + * @param [inout] perf_attr Filled with performance attributes.
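+ * + * A minimal usage sketch (illustrative; @a iface and @a status are assumed + * to be declared by the caller), estimating zero-copy PUT performance for + * host memory on both sides: + * @code + * uct_perf_attr_t perf_attr; + * perf_attr.field_mask = UCT_PERF_ATTR_FIELD_OPERATION | + * UCT_PERF_ATTR_FIELD_LOCAL_MEMORY_TYPE | + * UCT_PERF_ATTR_FIELD_REMOTE_MEMORY_TYPE | + * UCT_PERF_ATTR_FIELD_OVERHEAD | + * UCT_PERF_ATTR_FIELD_BANDWIDTH; + * perf_attr.operation = UCT_OP_PUT_ZCOPY; + * perf_attr.local_memory_type = UCS_MEMORY_TYPE_HOST; + * perf_attr.remote_memory_type = UCS_MEMORY_TYPE_HOST; + * status = uct_iface_estimate_perf(iface, &perf_attr); + * @endcode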
+ */ +ucs_status_t +uct_iface_estimate_perf(uct_iface_h tl_iface, uct_perf_attr_t *perf_attr); + +END_C_DECLS + +#endif diff --git a/src/uct/base/uct_cm.c b/src/uct/base/uct_cm.c index faa9579abd0..811c757c856 100644 --- a/src/uct/base/uct_cm.c +++ b/src/uct/base/uct_cm.c @@ -20,6 +20,10 @@ ucs_config_field_t uct_cm_config_table[] = { "Log level of network errors for the connection manager", ucs_offsetof(uct_cm_config_t, failure), UCS_CONFIG_TYPE_ENUM(ucs_log_level_names)}, + {"REUSEADDR", "no", + "Allow using an address that is already in use.", + ucs_offsetof(uct_cm_config_t, reuse_addr), UCS_CONFIG_TYPE_BOOL}, + {NULL} }; @@ -83,6 +87,19 @@ ucs_status_t uct_cm_ep_pack_cb(uct_cm_base_ep_t *cep, void *arg, *priv_data_ret = ret; out: + return status; + } + +ucs_status_t uct_cm_ep_resolve_cb(uct_cm_base_ep_t *cep, + const uct_cm_ep_resolve_args_t *args) +{ + ucs_status_t status = cep->resolve_cb(cep->user_data, args); + + if (status != UCS_OK) { + ucs_diag("resolve callback failed with error: %s", + ucs_status_string(status)); + } + return status; } @@ -135,8 +152,8 @@ static ucs_status_t uct_cm_check_ep_params(const uct_ep_params_t *params) return UCS_OK; } -ucs_status_t uct_cm_set_common_data(uct_cm_base_ep_t *ep, - const uct_ep_params_t *params) +ucs_status_t uct_cm_ep_set_common_data(uct_cm_base_ep_t *ep, + const uct_ep_params_t *params) { ucs_status_t status; @@ -147,8 +164,14 @@ ucs_status_t uct_cm_set_common_data(uct_cm_base_ep_t *ep, status = UCT_CM_SET_CB(params, UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB, ep->priv_pack_cb, params->sockaddr_pack_cb, - uct_cm_ep_priv_data_pack_callback_t, - ucs_empty_function_return_invalid_param); + uct_cm_ep_priv_data_pack_callback_t, NULL); + if (status != UCS_OK) { + return status; + } + + status = UCT_CM_SET_CB(params, UCT_EP_PARAM_FIELD_CM_RESOLVE_CB, + ep->resolve_cb, params->cm_resolve_cb, + uct_cm_ep_resolve_callback_t, NULL); if (status != UCS_OK) { return status; } @@ -172,7 +195,7 @@ UCS_CLASS_INIT_FUNC(uct_cm_base_ep_t, const uct_ep_params_t *params) UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, ¶ms->cm->iface); - status = uct_cm_set_common_data(self, params); + status = uct_cm_ep_set_common_data(self, params); if (status != UCS_OK) { return status; } @@ -281,10 +304,11 @@ UCS_CLASS_INIT_FUNC(uct_cm_t, uct_cm_ops_t* ops, uct_iface_ops_t* iface_ops, self->iface.progress_flags = 0; self->config.failure_level = config->failure; + self->config.reuse_addr = config->reuse_addr; return UCS_STATS_NODE_ALLOC(&self->iface.stats, &uct_cm_stats_class, ucs_stats_get_root(), "%s-%p", "iface", - self->iface); + &self->iface); } UCS_CLASS_CLEANUP_FUNC(uct_cm_t) @@ -296,3 +320,16 @@ UCS_CLASS_DEFINE(uct_cm_t, void); UCS_CLASS_DEFINE_NEW_FUNC(uct_cm_t, void, uct_cm_ops_t*, uct_iface_ops_t*, uct_worker_h, uct_component_h, const uct_cm_config_t*); UCS_CLASS_DEFINE_DELETE_FUNC(uct_cm_t, void); + +void uct_ep_connect_params_get(const uct_ep_connect_params_t *params, + const void **priv_data_p, + size_t *priv_data_length_p) +{ + *priv_data_p = (params->field_mask & + UCT_EP_CONNECT_PARAM_FIELD_PRIVATE_DATA) ? + params->private_data : NULL; + *priv_data_length_p = (params->field_mask & + UCT_EP_CONNECT_PARAM_FIELD_PRIVATE_DATA_LENGTH) ? + params->private_data_length : 0; +} + diff --git a/src/uct/base/uct_cm.h b/src/uct/base/uct_cm.h index d2fe8f15de0..5c3befce4ed 100644 --- a/src/uct/base/uct_cm.h +++ b/src/uct/base/uct_cm.h @@ -34,18 +34,27 @@ UCS_CLASS_DECLARE(uct_listener_t, uct_cm_h); }) +#define uct_cm_peer_error(_cm, _fmt, ...) 
\ + { \ + ucs_log((_cm)->config.failure_level, _fmt, ## __VA_ARGS__); \ + } + + #define uct_cm_ep_peer_error(_cep, _fmt, ...) \ { \ - uct_cm_t *_cm_base = ucs_container_of((_cep)->super.super.iface, uct_cm_t, iface); \ - ucs_log((_cm_base)->config.failure_level, _fmt, ## __VA_ARGS__); \ + uct_cm_t *_cm_base = ucs_container_of((_cep)->super.super.iface, \ + uct_cm_t, iface); \ + uct_cm_peer_error(_cm_base, _fmt, ## __VA_ARGS__); \ } + /** * "Base" structure which defines CM configuration options. * Specific CMs extend this structure. */ struct uct_cm_config { int failure; /* Level of failure reports */ + int reuse_addr; }; /** @@ -74,6 +83,7 @@ struct uct_cm { struct { ucs_log_level_t failure_level; + int reuse_addr; } config; }; @@ -93,6 +103,9 @@ typedef struct uct_cm_base_ep { /* Callback to fill the user's private data */ uct_cm_ep_priv_data_pack_callback_t priv_pack_cb; + /* Callback to notify bound device */ + uct_cm_ep_resolve_callback_t resolve_cb; + union { struct { /* On the client side - callback to process an incoming @@ -121,14 +134,17 @@ UCS_CLASS_DECLARE(uct_cm_t, uct_cm_ops_t*, uct_iface_ops_t*, uct_worker_h, ucs_status_t uct_listener_backlog_adjust(const uct_listener_params_t *params, int max_value, int *backlog); -ucs_status_t uct_cm_set_common_data(uct_cm_base_ep_t *ep, - const uct_ep_params_t *params); +ucs_status_t uct_cm_ep_set_common_data(uct_cm_base_ep_t *ep, + const uct_ep_params_t *params); ucs_status_t uct_cm_ep_pack_cb(uct_cm_base_ep_t *cep, void *arg, const uct_cm_ep_priv_data_pack_args_t *pack_args, void *priv_data, size_t priv_data_max, size_t *priv_data_ret); +ucs_status_t uct_cm_ep_resolve_cb(uct_cm_base_ep_t *cep, + const uct_cm_ep_resolve_args_t *args); + void uct_cm_ep_disconnect_cb(uct_cm_base_ep_t *cep); void uct_cm_ep_client_connect_cb(uct_cm_base_ep_t *cep, @@ -137,4 +153,8 @@ void uct_cm_ep_client_connect_cb(uct_cm_base_ep_t *cep, void uct_cm_ep_server_conn_notify_cb(uct_cm_base_ep_t *cep, ucs_status_t status); +void uct_ep_connect_params_get(const uct_ep_connect_params_t *params, + const void **priv_data_p, + size_t *priv_data_length_p); + #endif /* UCT_CM_H_ */ diff --git a/src/uct/base/uct_component.c b/src/uct/base/uct_component.c index 549b18fbde0..ab7d67d1c2f 100644 --- a/src/uct/base/uct_component.c +++ b/src/uct/base/uct_component.c @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -105,6 +105,10 @@ ucs_status_t uct_config_read(uct_config_bundle_t **bundle, uct_config_bundle_t *config_bundle; ucs_status_t status; + if (config_table == NULL) { + return UCS_ERR_INVALID_PARAM; + } + config_bundle = ucs_calloc(1, sizeof(*config_bundle) + config_size, "uct_config"); if (config_bundle == NULL) { status = UCS_ERR_NO_MEMORY; diff --git a/src/uct/base/uct_iface.c b/src/uct/base/uct_iface.c index 7a1a4a50633..1d999e54ec3 100644 --- a/src/uct/base/uct_iface.c +++ b/src/uct/base/uct_iface.c @@ -11,12 +11,15 @@ #include "uct_iface.h" #include "uct_cm.h" +#include "uct_iov.inl" #include +#include #include #include #include -#include +#include +#include #ifdef ENABLE_STATS @@ -158,17 +161,10 @@ void uct_iface_set_async_event_params(const uct_iface_params_t *params, uct_async_event_cb_t *event_cb, void **event_arg) { - if (params->field_mask & UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_CB) { - *event_cb = params->async_event_cb; - } else { - *event_cb = NULL; - } - - if (params->field_mask & UCT_IFACE_PARAM_FIELD_ASYNC_EVENT_ARG) { - *event_arg = params->async_event_arg; - } else { - *event_arg = NULL; - } + *event_cb = UCT_IFACE_PARAM_VALUE(params, async_event_cb, ASYNC_EVENT_CB, + NULL); + *event_arg = UCT_IFACE_PARAM_VALUE(params, async_event_arg, ASYNC_EVENT_ARG, + NULL); } @@ -177,6 +173,14 @@ ucs_status_t uct_iface_query(uct_iface_h iface, uct_iface_attr_t *iface_attr) return iface->ops.iface_query(iface, iface_attr); } +ucs_status_t +uct_iface_estimate_perf(uct_iface_h tl_iface, uct_perf_attr_t *perf_attr) +{ + uct_base_iface_t *iface = ucs_derived_of(tl_iface, uct_base_iface_t); + + return iface->internal_ops->iface_estimate_perf(tl_iface, perf_attr); +} + ucs_status_t uct_iface_get_device_address(uct_iface_h iface, uct_device_addr_t *addr) { return iface->ops.iface_get_device_address(iface, addr); @@ -211,6 +215,7 @@ ucs_status_t uct_iface_event_arm(uct_iface_h iface, unsigned events) void uct_iface_close(uct_iface_h iface) { + ucs_vfs_obj_remove(iface); iface->ops.iface_close(iface); } @@ -304,119 +309,64 @@ ucs_status_t uct_base_ep_fence(uct_ep_h tl_ep, unsigned flags) return UCS_OK; } -static void uct_ep_failed_purge_cb(uct_pending_req_t *self, void *arg) -{ - uct_pending_req_queue_push((ucs_queue_head_t*)arg, self); -} - -static void uct_ep_failed_purge(uct_ep_h tl_ep, uct_pending_purge_callback_t cb, - void *arg) +ucs_status_t uct_iface_handle_ep_err(uct_iface_h iface, uct_ep_h ep, + ucs_status_t status) { - uct_failed_iface_t *iface = ucs_derived_of(tl_ep->iface, - uct_failed_iface_t); - uct_pending_req_t *req; + uct_base_iface_t *base_iface = ucs_derived_of(iface, uct_base_iface_t); - ucs_queue_for_each_extract(req, &iface->pend_q, priv, 1) { - if (cb != NULL) { - cb(req, arg); - } else { - ucs_warn("ep=%p cancelling user pending request %p", tl_ep, req); - } + if (base_iface->err_handler) { + return base_iface->err_handler(base_iface->err_handler_arg, ep, status); } + + ucs_assert(status != UCS_ERR_CANCELED); + ucs_debug("error %s was not handled for ep %p", ucs_status_string(status), ep); + return status; } -static void uct_ep_failed_destroy(uct_ep_h tl_ep) +void uct_base_iface_query(uct_base_iface_t *iface, uct_iface_attr_t *iface_attr) { - /* Warn user if some pending reqs left*/ - uct_ep_failed_purge (tl_ep, NULL, NULL); + memset(iface_attr, 0, sizeof(*iface_attr)); - ucs_free(tl_ep->iface); - ucs_free(tl_ep); + iface_attr->max_num_eps = iface->config.max_num_eps; + iface_attr->dev_num_paths = 1; } -ucs_status_t uct_set_ep_failed(ucs_class_t *cls, uct_ep_h tl_ep, - 
uct_iface_h tl_iface, ucs_status_t status) +ucs_status_t +uct_iface_param_am_alignment(const uct_iface_params_t *params, size_t elem_size, + size_t base_offset, size_t payload_offset, + size_t *align, size_t *align_offset) { - uct_failed_iface_t *f_iface; - uct_iface_ops_t *ops; - uct_base_iface_t *iface = ucs_derived_of(tl_iface, uct_base_iface_t); + if (!(params->field_mask & UCT_IFACE_PARAM_FIELD_AM_ALIGNMENT)) { + if (params->field_mask & UCT_IFACE_PARAM_FIELD_AM_ALIGN_OFFSET) { + ucs_error("alignment offset has no effect without alignment"); + return UCS_ERR_INVALID_PARAM; + } - ucs_debug("set ep %p to failed state", tl_ep); + *align = UCS_SYS_CACHE_LINE_SIZE; + *align_offset = base_offset; - /* TBD: consider allocating one instance per interface - * rather than for each endpoint */ - f_iface = ucs_malloc(sizeof(*f_iface), "failed iface"); - if (f_iface == NULL) { - ucs_error("Could not create failed iface (nomem)"); - return status; + return UCS_OK; } - ucs_queue_head_init(&f_iface->pend_q); - ops = &f_iface->super.ops; - - /* Move all pending requests to the queue. - * Failed ep will use that queue for purge. */ - uct_ep_pending_purge(tl_ep, uct_ep_failed_purge_cb, &f_iface->pend_q); - - ops->ep_put_short = (uct_ep_put_short_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_put_bcopy = (uct_ep_put_bcopy_func_t)ucs_empty_function_return_bc_ep_timeout; - ops->ep_put_zcopy = (uct_ep_put_zcopy_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_get_short = (uct_ep_get_short_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_get_bcopy = (uct_ep_get_bcopy_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_get_zcopy = (uct_ep_get_zcopy_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_am_short = (uct_ep_am_short_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_am_bcopy = (uct_ep_am_bcopy_func_t)ucs_empty_function_return_bc_ep_timeout; - ops->ep_am_zcopy = (uct_ep_am_zcopy_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_atomic_cswap64 = (uct_ep_atomic_cswap64_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_atomic_cswap32 = (uct_ep_atomic_cswap32_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_atomic64_post = (uct_ep_atomic64_post_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_atomic32_post = (uct_ep_atomic32_post_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_atomic64_fetch = (uct_ep_atomic64_fetch_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_atomic32_fetch = (uct_ep_atomic32_fetch_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_tag_eager_short = (uct_ep_tag_eager_short_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_tag_eager_bcopy = (uct_ep_tag_eager_bcopy_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_tag_eager_zcopy = (uct_ep_tag_eager_zcopy_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_tag_rndv_zcopy = (uct_ep_tag_rndv_zcopy_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_tag_rndv_cancel = (uct_ep_tag_rndv_cancel_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_tag_rndv_request = (uct_ep_tag_rndv_request_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_pending_add = (uct_ep_pending_add_func_t)ucs_empty_function_return_busy; - ops->ep_pending_purge = uct_ep_failed_purge; - ops->ep_flush = (uct_ep_flush_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_fence = (uct_ep_fence_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_check = (uct_ep_check_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_connect_to_ep = 
(uct_ep_connect_to_ep_func_t)ucs_empty_function_return_ep_timeout; - ops->ep_destroy = uct_ep_failed_destroy; - ops->ep_get_address = (uct_ep_get_address_func_t)ucs_empty_function_return_ep_timeout; - - ucs_class_call_cleanup_chain(cls, tl_ep, -1); - - tl_ep->iface = &f_iface->super; - - if (iface->err_handler) { - return iface->err_handler(iface->err_handler_arg, tl_ep, status); - } else if (status == UCS_ERR_CANCELED) { - ucs_debug("error %s was suppressed for ep %p", - ucs_status_string(UCS_ERR_CANCELED), tl_ep); - /* Suppress this since the cancellation is initiated by user. */ - status = UCS_OK; - } else { - ucs_debug("error %s was not handled for ep %p", - ucs_status_string(status), tl_ep); - } + *align = params->am_alignment; + *align_offset = UCT_IFACE_PARAM_VALUE(params, am_align_offset, + AM_ALIGN_OFFSET, 0ul); - return status; -} + if (*align_offset >= elem_size) { + ucs_diag("invalid AM alignment offset %zu, must be less than %zu", + *align_offset, elem_size); -void uct_base_iface_query(uct_base_iface_t *iface, uct_iface_attr_t *iface_attr) -{ - memset(iface_attr, 0, sizeof(*iface_attr)); + *align_offset = 0ul; + } - iface_attr->max_num_eps = iface->config.max_num_eps; - iface_attr->dev_num_paths = 1; + *align_offset += payload_offset; + + return UCS_OK; } ucs_status_t uct_single_device_resource(uct_md_h md, const char *dev_name, uct_device_type_t dev_type, + ucs_sys_device_t sys_device, uct_tl_device_resource_t **tl_devices_p, unsigned *num_tl_devices_p) { @@ -430,13 +380,40 @@ ucs_status_t uct_single_device_resource(uct_md_h md, const char *dev_name, ucs_snprintf_zero(device->name, sizeof(device->name), "%s", dev_name); device->type = dev_type; - device->sys_device = UCS_SYS_DEVICE_ID_UNKNOWN; + device->sys_device = sys_device; *num_tl_devices_p = 1; *tl_devices_p = device; return UCS_OK; } +ucs_status_t +uct_base_iface_estimate_perf(uct_iface_h iface, uct_perf_attr_t *perf_attr) +{ + ucs_status_t status; + uct_iface_attr_t iface_attr; + + status = uct_iface_query(iface, &iface_attr); + if (status != UCS_OK) { + return status; + } + + /* By default, the performance is assumed to be the same for all operations */ + if (perf_attr->field_mask & UCT_PERF_ATTR_FIELD_BANDWIDTH) { + perf_attr->bandwidth = iface_attr.bandwidth; + } + + if (perf_attr->field_mask & UCT_PERF_ATTR_FIELD_OVERHEAD) { + perf_attr->overhead = iface_attr.overhead; + } + + return UCS_OK; +} + +uct_iface_internal_ops_t uct_base_iface_internal_ops = { + .iface_estimate_perf = uct_base_iface_estimate_perf +}; + UCS_CLASS_INIT_FUNC(uct_iface_t, uct_iface_ops_t *ops) { ucs_assert_always(ops->ep_flush != NULL); @@ -463,7 +440,8 @@ UCS_CLASS_CLEANUP_FUNC(uct_iface_t) UCS_CLASS_DEFINE(uct_iface_t, void); -UCS_CLASS_INIT_FUNC(uct_base_iface_t, uct_iface_ops_t *ops, uct_md_h md, +UCS_CLASS_INIT_FUNC(uct_base_iface_t, uct_iface_ops_t *ops, + uct_iface_internal_ops_t *internal_ops, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *config UCS_STATS_ARG(ucs_stats_node_t *stats_parent) @@ -481,18 +459,16 @@ UCS_CLASS_INIT_FUNC(uct_base_iface_t, uct_iface_ops_t *ops, uct_md_h md, params->err_handler_flags : 0); self->md = md; + self->internal_ops = internal_ops; self->worker = ucs_derived_of(worker, uct_priv_worker_t); self->am_tracer = NULL; self->am_tracer_arg = NULL; - self->err_handler = (params->field_mask & - UCT_IFACE_PARAM_FIELD_ERR_HANDLER) ? - params->err_handler : NULL; - self->err_handler_flags = (params->field_mask & - UCT_IFACE_PARAM_FIELD_ERR_HANDLER_FLAGS) ? 
- params->err_handler_flags : 0; - self->err_handler_arg = (params->field_mask & - UCT_IFACE_PARAM_FIELD_ERR_HANDLER_ARG) ? - params->err_handler_arg : NULL; + self->err_handler = UCT_IFACE_PARAM_VALUE(params, err_handler, ERR_HANDLER, + NULL); + self->err_handler_flags = UCT_IFACE_PARAM_VALUE(params, err_handler_flags, + ERR_HANDLER_FLAGS, 0); + self->err_handler_arg = UCT_IFACE_PARAM_VALUE(params, err_handler_arg, + ERR_HANDLER_ARG, NULL); self->progress_flags = 0; uct_worker_progress_init(&self->prog); @@ -555,6 +531,11 @@ ucs_status_t uct_ep_create(const uct_ep_params_t *params, uct_ep_h *ep_p) return UCS_ERR_INVALID_PARAM; } +ucs_status_t uct_ep_connect(uct_ep_h ep, const uct_ep_connect_params_t *params) +{ + return ep->iface->ops.ep_connect(ep, params); +} + ucs_status_t uct_ep_disconnect(uct_ep_h ep, unsigned flags) { return ep->iface->ops.ep_disconnect(ep, flags); @@ -581,9 +562,14 @@ ucs_status_t uct_cm_client_ep_conn_notify(uct_ep_h ep) return ep->iface->ops.cm_ep_conn_notify(ep); } +void uct_ep_set_iface(uct_ep_h ep, uct_iface_t *iface) +{ + ep->iface = iface; +} + UCS_CLASS_INIT_FUNC(uct_ep_t, uct_iface_t *iface) { - self->iface = iface; + uct_ep_set_iface(self, iface); return UCS_OK; } @@ -637,3 +623,177 @@ ucs_config_field_t uct_iface_config_table[] = { {NULL} }; + +ucs_status_t uct_base_ep_stats_reset(uct_base_ep_t *ep, uct_base_iface_t *iface) +{ + ucs_status_t status; + + UCS_STATS_NODE_FREE(ep->stats); + + status = UCS_STATS_NODE_ALLOC(&ep->stats, &uct_ep_stats_class, iface->stats, + "-%p", ep); +#ifdef ENABLE_STATS + if (status != UCS_OK) { + /* set the stats to NULL so that the UCS_STATS_NODE_FREE call on the + * base_ep's cleanup flow won't fail */ + ep->stats = NULL; + } +#endif + + return status; +} + +ucs_status_t uct_base_ep_am_short_iov(uct_ep_h ep, uint8_t id, const uct_iov_t *iov, + size_t iovcnt) +{ + uint64_t header = 0; + size_t length; + void *buffer; + ucs_iov_iter_t iov_iter; + ucs_status_t status; + + length = uct_iov_total_length(iov, iovcnt); + + /* Copy first sizeof(header) bytes of iov to header. If the total length of + * iov is less than sizeof(header), the remainder of the header is filled + * with zeros. */ + ucs_iov_iter_init(&iov_iter); + uct_iov_to_buffer(iov, iovcnt, &iov_iter, &header, sizeof(header)); + + /* If the total size of iov is greater than sizeof(header), then allocate + buffer and copy the remainder of iov to the buffer. 
*/ + if (length > sizeof(header)) { + length -= sizeof(header); + + if (length > UCS_ALLOCA_MAX_SIZE) { + buffer = ucs_malloc(length, "uct_base_ep_am_short_iov buffer"); + } else { + buffer = ucs_alloca(length); + } + + uct_iov_to_buffer(iov, iovcnt, &iov_iter, buffer, SIZE_MAX); + } else { + buffer = NULL; + length = 0; + } + + status = uct_ep_am_short(ep, id, header, buffer, length); + + if (length > UCS_ALLOCA_MAX_SIZE) { + ucs_free(buffer); + } + + return status; +} + +int uct_ep_get_process_proc_dir(char *buffer, size_t max_len, pid_t pid) +{ + ucs_assert((buffer != NULL) || (max_len == 0)); + /* cppcheck-suppress nullPointer */ + /* cppcheck-suppress ctunullpointer */ + return snprintf(buffer, max_len, "/proc/%d", (int)pid); +} + +ucs_status_t uct_ep_keepalive_create(pid_t pid, uct_keepalive_info_t **ka_p) +{ + uct_keepalive_info_t *ka; + ucs_time_t start_time; + ucs_status_t status; + int proc_len; + + proc_len = uct_ep_get_process_proc_dir(NULL, 0, pid); + if (proc_len <= 0) { + ucs_error("failed to get length to hold path to a process directory"); + status = UCS_ERR_NO_MEMORY; + goto err; + } + + ka = ucs_malloc(sizeof(*ka) + proc_len + 1, "keepalive"); + if (ka == NULL) { + ucs_error("failed to allocate keepalive info"); + status = UCS_ERR_NO_MEMORY; + goto err; + } + + uct_ep_get_process_proc_dir(ka->proc, proc_len + 1, pid); + + status = ucs_sys_get_file_time(ka->proc, UCS_SYS_FILE_TIME_CTIME, + &start_time); + if (status != UCS_OK) { + ucs_error("failed to get process start time"); + goto err_free_ka; + } + + ka->start_time = start_time; + *ka_p = ka; + + return UCS_OK; + +err_free_ka: + ucs_free(ka); +err: + return status; +} + +ucs_status_t +uct_ep_keepalive_check(uct_ep_h tl_ep, uct_keepalive_info_t **ka, pid_t pid, + unsigned flags, uct_completion_t *comp) +{ + ucs_status_t status; + ucs_time_t create_time; + + UCT_EP_KEEPALIVE_CHECK_PARAM(flags, comp); + + if (ucs_unlikely(*ka == NULL)) { + status = uct_ep_keepalive_create(pid, ka); + if (status != UCS_OK) { + return uct_iface_handle_ep_err(tl_ep->iface, tl_ep, status); + } + } else { + status = ucs_sys_get_file_time((*ka)->proc, UCS_SYS_FILE_TIME_CTIME, + &create_time); + if (ucs_unlikely((status != UCS_OK) || + ((*ka)->start_time != create_time))) { + return uct_iface_handle_ep_err(tl_ep->iface, tl_ep, + UCS_ERR_ENDPOINT_TIMEOUT); + } + } + + return UCS_OK; +} + +void uct_iface_get_local_address(uct_iface_local_addr_ns_t *addr_ns, + ucs_sys_namespace_type_t sys_ns_type) +{ + addr_ns->super.id = ucs_iface_get_system_id() & + ~UCT_IFACE_LOCAL_ADDR_FLAG_NS; + + if (!ucs_sys_ns_is_default(sys_ns_type)) { + addr_ns->super.id |= UCT_IFACE_LOCAL_ADDR_FLAG_NS; + addr_ns->sys_ns = ucs_sys_get_ns(sys_ns_type); + } +} + +int uct_iface_local_is_reachable(uct_iface_local_addr_ns_t *addr_ns, + ucs_sys_namespace_type_t sys_ns_type) +{ + uct_iface_local_addr_ns_t my_addr = {}; + + uct_iface_get_local_address(&my_addr, sys_ns_type); + + /* Do not merge these evaluations into single 'if' due to Clang compilation + * warning */ + /* Check if both processes are on same host and both of them are in root (or + * non-root) pid namespace */ + if (addr_ns->super.id != my_addr.super.id) { + return 0; + } + + if (!(addr_ns->super.id & UCT_IFACE_LOCAL_ADDR_FLAG_NS)) { + return 1; /* Both processes are in root namespace */ + } + + /* We are in non-root PID namespace - return 1 if ID of namespaces are the + * same */ + return addr_ns->sys_ns == my_addr.sys_ns; +} diff --git a/src/uct/base/uct_iface.h b/src/uct/base/uct_iface.h index 
534a60efeb3..02515b41a1c 100644 --- a/src/uct/base/uct_iface.h +++ b/src/uct/base/uct_iface.h @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -16,14 +16,22 @@ #include #include #include +#include #include #include #include #include +#include +#include #include +/* UCT IFACE local address flag which is packed into the ID and indicates if + * an address is extended by system namespace information */ +#define UCT_IFACE_LOCAL_ADDR_FLAG_NS UCS_BIT(63) + + enum { UCT_EP_STAT_AM, UCT_EP_STAT_PUT, @@ -122,9 +130,17 @@ enum { "UCT_EP_PARAM_FIELD_DEV_ADDR and UCT_EP_PARAM_FIELD_IFACE_ADDR are not defined") +#define UCT_EP_PARAM_VALUE(_params, _name, _flag, _default) \ + UCS_PARAM_VALUE(UCT_EP_PARAM_FIELD, _params, _name, _flag, _default) + + +#define UCT_IFACE_PARAM_VALUE(_params, _name, _flag, _default) \ + UCS_PARAM_VALUE(UCT_IFACE_PARAM_FIELD, _params, _name, _flag, _default) + + #define UCT_EP_PARAMS_GET_PATH_INDEX(_params) \ - (((_params)->field_mask & UCT_EP_PARAM_FIELD_PATH_INDEX) ? \ - (_params)->path_index : 0) + UCT_EP_PARAM_VALUE(_params, path_index, PATH_INDEX, 0) + /** * Check the condition and return status as a pointer if not true. @@ -181,6 +197,18 @@ enum { (int)UCT_AM_ID_MAX - 1) +/** + * In debug mode, check that keepalive params are valid + */ +#define UCT_EP_KEEPALIVE_CHECK_PARAM(_flags, _comp) \ + UCT_CHECK_PARAM((_comp) == NULL, "Unsupported completion on ep_check"); \ + UCT_CHECK_PARAM((_flags) == 0, "Unsupported flags: %x", (_flags)); + + /** * Declare classes for structures defined in api/tl.h */ @@ -198,37 +226,55 @@ typedef struct uct_am_handler { } uct_am_handler_t; +/* Performance estimation operation */ +typedef ucs_status_t (*uct_iface_estimate_perf_func_t)( + uct_iface_h iface, uct_perf_attr_t *perf_attr); + + +/* Refresh the VFS representation of the interface */ +typedef void (*uct_iface_vfs_refresh_func_t)(uct_iface_h iface); + + +/* Internal operations, not exposed by the external API */ +typedef struct uct_iface_internal_ops { + uct_iface_estimate_perf_func_t iface_estimate_perf; + uct_iface_vfs_refresh_func_t iface_vfs_refresh; +} uct_iface_internal_ops_t; + + /** * Base structure of all interfaces. * Includes the AM table which we don't want to expose.
*/ typedef struct uct_base_iface { - uct_iface_t super; - uct_md_h md; /* MD this interface is using */ - uct_priv_worker_t *worker; /* Worker this interface is on */ - uct_am_handler_t am[UCT_AM_ID_MAX];/* Active message table */ - uct_am_tracer_t am_tracer; /* Active message tracer */ - void *am_tracer_arg; /* Tracer argument */ - uct_error_handler_t err_handler; /* Error handler */ - void *err_handler_arg; /* Error handler argument */ - uint32_t err_handler_flags; /* Error handler callback flags */ - uct_worker_progress_t prog; /* Will be removed once all transports - support progress control */ - unsigned progress_flags; /* Which progress is currently enabled */ + uct_iface_t super; + uct_iface_internal_ops_t *internal_ops; /* Internal operations */ + uct_md_h md; /* MD this interface is using */ + uct_priv_worker_t *worker; /* Worker this interface is on */ + uct_am_handler_t am[UCT_AM_ID_MAX];/* Active message table */ + uct_am_tracer_t am_tracer; /* Active message tracer */ + void *am_tracer_arg; /* Tracer argument */ + uct_error_handler_t err_handler; /* Error handler */ + void *err_handler_arg; /* Error handler argument */ + uint32_t err_handler_flags; /* Error handler callback flags */ + uct_worker_progress_t prog; /* Will be removed once all transports + support progress control */ + unsigned progress_flags; /* Which progress is currently enabled */ struct { - unsigned num_alloc_methods; - uct_alloc_method_t alloc_methods[UCT_ALLOC_METHOD_LAST]; - ucs_log_level_t failure_level; - size_t max_num_eps; + unsigned num_alloc_methods; + uct_alloc_method_t alloc_methods[UCT_ALLOC_METHOD_LAST]; + ucs_log_level_t failure_level; + size_t max_num_eps; } config; UCS_STATS_NODE_DECLARE(stats) /* Statistics */ } uct_base_iface_t; -UCS_CLASS_DECLARE(uct_base_iface_t, uct_iface_ops_t*, uct_md_h, uct_worker_h, - const uct_iface_params_t*, const uct_iface_config_t* - UCS_STATS_ARG(ucs_stats_node_t*) UCS_STATS_ARG(const char*)); +UCS_CLASS_DECLARE(uct_base_iface_t, uct_iface_ops_t*, uct_iface_internal_ops_t*, + uct_md_h, uct_worker_h, const uct_iface_params_t*, + const uct_iface_config_t *UCS_STATS_ARG(ucs_stats_node_t*) + UCS_STATS_ARG(const char*)); /** @@ -240,6 +286,15 @@ typedef struct uct_failed_iface { } uct_failed_iface_t; +/** + * Keepalive info used by EP + */ +typedef struct uct_keepalive_info { + ucs_time_t start_time; /* Process start time */ + char proc[]; /* Process owner proc dir */ +} uct_keepalive_info_t; + + /** * Base structure of all endpoints. 
*/ @@ -283,6 +338,24 @@ typedef struct uct_tl { } uct_tl_t; +/** + * Base UCT IFACE local address + */ +typedef struct uct_iface_local_addr_base { + uint64_t id; /* System ID + @ref UCT_IFACE_LOCAL_ADDR_FLAG_NS if a local + address is extended by a system namespace information */ +} UCS_S_PACKED uct_iface_local_addr_base_t; + + +/** + * Extended UCT IFACE local address + */ +typedef struct uct_iface_local_addr_ns { + uct_iface_local_addr_base_t super; /* Base UCT IFACE local address */ + ucs_sys_ns_t sys_ns; /* System namespace (IPC or network) */ +} UCS_S_PACKED uct_iface_local_addr_ns_t; + + /** * Define a transport * @@ -290,6 +363,9 @@ typedef struct uct_tl { * @param _name Name of the transport (should be a token, not a string) * @param _query_devices Function to query the list of available devices * @param _iface_class Struct type defining the uct_iface class + * @param _cfg_prefix Prefix for configuration variables + * @param _cfg_table Transport configuration table + * @param _cfg_struct Struct type defining transport configuration */ #define UCT_TL_DEFINE(_component, _name, _query_devices, _iface_class, \ _cfg_prefix, _cfg_table, _cfg_struct) \ @@ -431,7 +507,7 @@ uct_pending_req_priv_arb_elem(uct_pending_req_t *req) /** * Add a pending request to the head of group in arbiter. */ -#define uct_pending_req_arb_group_push_head(_arbiter, _arbiter_group, _req) \ +#define uct_pending_req_arb_group_push_head(_arbiter_group, _req) \ do { \ ucs_arbiter_elem_init(uct_pending_req_priv_arb_elem(_req)); \ ucs_arbiter_group_push_head_elem_always(_arbiter_group, \ @@ -569,6 +645,9 @@ typedef struct { extern ucs_config_field_t uct_iface_config_table[]; +extern uct_iface_internal_ops_t uct_base_iface_internal_ops; + + /** * Initialize a memory pool for buffers used by TL interface. * @@ -601,13 +680,37 @@ void uct_iface_set_async_event_params(const uct_iface_params_t *params, uct_async_event_cb_t *event_cb, void **event_arg); -ucs_status_t uct_set_ep_failed(ucs_class_t* cls, uct_ep_h tl_ep, uct_iface_h - tl_iface, ucs_status_t status); +ucs_status_t uct_iface_handle_ep_err(uct_iface_h iface, uct_ep_h ep, + ucs_status_t status); + +/** + * Initialize AM data alignment and its offset based on the user configuration + * provided in interface parameters. + * + * @param [in] params User defined interface parameters. + * @param [in] elem_size Transport receive buffer size. + * @param [in] base_offset Default offset in the transport receive buffer, + * which should be aligned to the certain boundary. + * @param [in] payload_offset Offset to the payload in the transport receive + * buffer. + * @param [out] align Alignment of the Active Message data on the + * receiver. + * @param [out] align_offset Offset in the incoming Active Message which + * should be aligned to the @a align boundary. + * + * @return UCS_OK on success or UCS_ERR_INVALID_PARAM if user specified invalid + * combination of @a am_alignment and @a am_align_offset in @a params. 
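+ * + * A minimal usage sketch (illustrative; a transport would pass its own + * receive element size and offsets): + * @code + * size_t align, align_offset; + * status = uct_iface_param_am_alignment(params, elem_size, base_offset, + * payload_offset, &align, + * &align_offset); + * @endcode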
+ */ +ucs_status_t +uct_iface_param_am_alignment(const uct_iface_params_t *params, size_t elem_size, + size_t base_offset, size_t payload_offset, + size_t *align, size_t *align_offset); void uct_base_iface_query(uct_base_iface_t *iface, uct_iface_attr_t *iface_attr); ucs_status_t uct_single_device_resource(uct_md_h md, const char *dev_name, uct_device_type_t dev_type, + ucs_sys_device_t sys_device, uct_tl_device_resource_t **tl_devices_p, unsigned *num_tl_devices_p); @@ -623,11 +726,20 @@ void uct_base_iface_progress_enable_cb(uct_base_iface_t *iface, void uct_base_iface_progress_disable(uct_iface_h tl_iface, unsigned flags); +ucs_status_t +uct_base_iface_estimate_perf(uct_iface_h iface, uct_perf_attr_t *perf_attr); + ucs_status_t uct_base_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp); ucs_status_t uct_base_ep_fence(uct_ep_h tl_ep, unsigned flags); +void uct_iface_get_local_address(uct_iface_local_addr_ns_t *addr_ns, + ucs_sys_namespace_type_t sys_ns_type); + +int uct_iface_local_is_reachable(uct_iface_local_addr_ns_t *addr_ns, + ucs_sys_namespace_type_t sys_ns_type); + /* * Invoke active message handler. * @@ -667,8 +779,12 @@ uct_iface_invoke_am(uct_base_iface_t *iface, uint8_t id, void *data, static UCS_F_ALWAYS_INLINE void uct_invoke_completion(uct_completion_t *comp, ucs_status_t status) { - ucs_trace_func("comp=%p, count=%d, status=%d", comp, comp->count, status); - ucs_assertv(comp->count > 0, "comp=%p count=%d", comp, comp->count); + ucs_trace_func("comp=%p (%s) count=%d status=%d", comp, + ucs_debug_get_symbol_name((void*)comp->func), comp->count, + status); + ucs_assertv(comp->count > 0, "comp=%p (%s) count=%d status=%d", comp, + ucs_debug_get_symbol_name((void*)comp->func), comp->count, + status); uct_completion_update_status(comp, status); if (--comp->count == 0) { @@ -699,4 +815,38 @@ void uct_am_short_fill_data(void *buffer, uint64_t header, const void *payload, memcpy(packet->payload, payload, length); } + +static UCS_F_ALWAYS_INLINE +ucs_log_level_t uct_base_iface_failure_log_level(uct_base_iface_t *iface, + ucs_status_t err_handler_status, + ucs_status_t status) +{ + if (err_handler_status != UCS_OK) { + return UCS_LOG_LEVEL_FATAL; + } else if ((status == UCS_ERR_ENDPOINT_TIMEOUT) || + (status == UCS_ERR_CONNECTION_RESET)) { + return iface->config.failure_level; + } else { + return UCS_LOG_LEVEL_ERROR; + } +} + + +ucs_status_t uct_base_ep_am_short_iov(uct_ep_h ep, uint8_t id, const uct_iov_t *iov, + size_t iovcnt); + +int uct_ep_get_process_proc_dir(char *buffer, size_t max_len, pid_t pid); + +ucs_status_t uct_ep_keepalive_create(pid_t pid, uct_keepalive_info_t **ka_p); + +ucs_status_t +uct_ep_keepalive_check(uct_ep_h tl_ep, uct_keepalive_info_t **ka, pid_t pid, + unsigned flags, uct_completion_t *comp); + +void uct_ep_set_iface(uct_ep_h ep, uct_iface_t *iface); + +ucs_status_t uct_base_ep_stats_reset(uct_base_ep_t *ep, uct_base_iface_t *iface); + +void uct_iface_vfs_refresh(void *obj); + #endif diff --git a/src/uct/base/uct_iface_vfs.c b/src/uct/base/uct_iface_vfs.c new file mode 100644 index 00000000000..1c3c913fca1 --- /dev/null +++ b/src/uct/base/uct_iface_vfs.c @@ -0,0 +1,174 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. 
+*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "uct_iface.h" + +#include +#include +#include +#include +#include + + +typedef struct { + uint64_t flag; + const char *name; +} uct_iface_vfs_cap_info_t; + +static const uct_iface_vfs_cap_info_t uct_iface_vfs_cap_infos[] = { + {UCT_IFACE_FLAG_AM_SHORT, "am_short"}, + {UCT_IFACE_FLAG_AM_BCOPY, "am_bcopy"}, + {UCT_IFACE_FLAG_AM_ZCOPY, "am_zcopy"}, + {UCT_IFACE_FLAG_PENDING, "pending"}, + {UCT_IFACE_FLAG_PUT_SHORT, "put_short"}, + {UCT_IFACE_FLAG_PUT_BCOPY, "put_bcopy"}, + {UCT_IFACE_FLAG_PUT_ZCOPY, "put_zcopy"}, + {UCT_IFACE_FLAG_GET_SHORT, "get_short"}, + {UCT_IFACE_FLAG_GET_BCOPY, "get_bcopy"}, + {UCT_IFACE_FLAG_GET_ZCOPY, "get_zcopy"}, + {UCT_IFACE_FLAG_ATOMIC_CPU, "atomic_cpu"}, + {UCT_IFACE_FLAG_ATOMIC_DEVICE, "atomic_device"}, + {UCT_IFACE_FLAG_ERRHANDLE_SHORT_BUF, "errhandle_short_buf"}, + {UCT_IFACE_FLAG_ERRHANDLE_BCOPY_BUF, "errhandle_bcopy_buf"}, + {UCT_IFACE_FLAG_ERRHANDLE_ZCOPY_BUF, "errhandle_zcopy_buf"}, + {UCT_IFACE_FLAG_ERRHANDLE_AM_ID, "errhandle_am_id"}, + {UCT_IFACE_FLAG_ERRHANDLE_REMOTE_MEM, "errhandle_remote_mem"}, + {UCT_IFACE_FLAG_ERRHANDLE_BCOPY_LEN, "errhandle_bcopy_len"}, + {UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE, "errhandle_peer_failure"}, + {UCT_IFACE_FLAG_EP_CHECK, "ep_check"}, + {UCT_IFACE_FLAG_CONNECT_TO_IFACE, "connect_to_iface"}, + {UCT_IFACE_FLAG_CONNECT_TO_EP, "connect_to_ep"}, + {UCT_IFACE_FLAG_CONNECT_TO_SOCKADDR, "connect_to_sockaddr"}, + {UCT_IFACE_FLAG_AM_DUP, "am_dup"}, + {UCT_IFACE_FLAG_CB_SYNC, "cb_sync"}, + {UCT_IFACE_FLAG_CB_ASYNC, "cb_async"}, + {UCT_IFACE_FLAG_EP_KEEPALIVE, "ep_keepalive"}, + {UCT_IFACE_FLAG_TAG_EAGER_SHORT, "tag_eager_short"}, + {UCT_IFACE_FLAG_TAG_EAGER_BCOPY, "tag_eager_bcopy"}, + {UCT_IFACE_FLAG_TAG_EAGER_ZCOPY, "tag_eager_zcopy"}, + {UCT_IFACE_FLAG_TAG_RNDV_ZCOPY, "tag_rndv_zcopy"}, +}; + +typedef struct { + uint64_t flag; + const char *op_name; + const char *limit_name; + size_t offset; +} uct_iface_vfs_cap_limit_info_t; + +#define uct_iface_vfs_cap_limit_info(_flag, _op, _attr) \ + { \ + _flag, #_op, #_attr, ucs_offsetof(uct_iface_attr_t, cap._op._attr) \ + } + +static const uct_iface_vfs_cap_limit_info_t uct_iface_vfs_cap_limit_infos[] = { + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_PUT_SHORT, put, max_short), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_PUT_BCOPY, put, max_bcopy), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_PUT_ZCOPY, put, min_zcopy), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_PUT_ZCOPY, put, max_zcopy), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_PUT_ZCOPY, put, + opt_zcopy_align), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_PUT_SHORT | + UCT_IFACE_FLAG_PUT_BCOPY | + UCT_IFACE_FLAG_PUT_ZCOPY, + put, align_mtu), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_PUT_ZCOPY, put, max_iov), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_GET_SHORT, get, max_short), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_GET_BCOPY, get, max_bcopy), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_GET_ZCOPY, get, min_zcopy), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_GET_ZCOPY, get, max_zcopy), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_GET_ZCOPY, get, + opt_zcopy_align), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_GET_SHORT | + UCT_IFACE_FLAG_GET_BCOPY | + UCT_IFACE_FLAG_GET_ZCOPY, + get, align_mtu), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_GET_ZCOPY, get, max_iov), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_AM_SHORT, am, max_short), + uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_AM_BCOPY, am, max_bcopy), + 
uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_AM_ZCOPY, am, min_zcopy),
+ uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_AM_ZCOPY, am, max_zcopy),
+ uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_AM_ZCOPY, am, opt_zcopy_align),
+ uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_AM_SHORT |
+ UCT_IFACE_FLAG_AM_BCOPY |
+ UCT_IFACE_FLAG_AM_ZCOPY,
+ am, align_mtu),
+ uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_AM_ZCOPY, am, max_hdr),
+ uct_iface_vfs_cap_limit_info(UCT_IFACE_FLAG_AM_ZCOPY, am, max_iov),
+};
+
+
+static void uct_iface_vfs_show_cap(void *obj, ucs_string_buffer_t *strb,
+ void *arg_ptr, uint64_t arg_u64)
+{
+ ucs_string_buffer_appendf(strb, "1\n");
+}
+
+static void uct_iface_vfs_show_cap_limit(void *obj, ucs_string_buffer_t *strb,
+ void *arg_ptr, uint64_t arg_u64)
+{
+ uct_iface_h iface = obj;
+ uct_iface_attr_t iface_attr;
+ size_t attr;
+ char buf[64];
+
+ if (uct_iface_query(iface, &iface_attr) != UCS_OK) {
+ ucs_string_buffer_appendf(strb, "\n");
+ return;
+ }
+
+ attr = *(size_t*)UCS_PTR_BYTE_OFFSET(&iface_attr, arg_u64);
+ ucs_string_buffer_appendf(strb, "%s\n",
+ ucs_memunits_to_str(attr, buf, sizeof(buf)));
+}
+
+static void uct_iface_vfs_init_caps(uct_iface_h iface, uint64_t iface_cap_flags)
+{
+ size_t i;
+
+ for (i = 0; i < ucs_static_array_size(uct_iface_vfs_cap_infos); ++i) {
+ if (iface_cap_flags & uct_iface_vfs_cap_infos[i].flag) {
+ ucs_vfs_obj_add_ro_file(iface, uct_iface_vfs_show_cap, NULL, 0,
+ "attribute/capability/%s",
+ uct_iface_vfs_cap_infos[i].name);
+ }
+ }
+}
+
+static void
+uct_iface_vfs_init_cap_limits(uct_iface_h iface, uint64_t iface_cap_flags)
+{
+ size_t i;
+
+ for (i = 0; i < ucs_static_array_size(uct_iface_vfs_cap_limit_infos); ++i) {
+ if (iface_cap_flags & uct_iface_vfs_cap_limit_infos[i].flag) {
+ ucs_vfs_obj_add_ro_file(iface, uct_iface_vfs_show_cap_limit, NULL,
+ uct_iface_vfs_cap_limit_infos[i].offset,
+ "attribute/%s/%s",
+ uct_iface_vfs_cap_limit_infos[i].op_name,
+ uct_iface_vfs_cap_limit_infos[i].limit_name);
+ }
+ }
+}
+
+void uct_iface_vfs_refresh(void *obj)
+{
+ uct_base_iface_t *iface = obj;
+ uct_iface_attr_t iface_attr;
+
+ if (uct_iface_query(&iface->super, &iface_attr) == UCS_OK) {
+ uct_iface_vfs_init_caps(&iface->super, iface_attr.cap.flags);
+ uct_iface_vfs_init_cap_limits(&iface->super, iface_attr.cap.flags);
+ } else {
+ ucs_debug("failed to query iface attributes");
+ }
+
+ iface->internal_ops->iface_vfs_refresh(&iface->super);
+}
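The exporter above is fully table-driven: each limit entry records the field's byte offset inside uct_iface_attr_t, and uct_iface_vfs_show_cap_limit() recovers the value by adding that offset to a freshly queried attribute struct. A standalone sketch of the offset-based lookup using plain offsetof() and a hypothetical struct:

#include <stdio.h>
#include <stddef.h>

typedef struct demo_attr {
    size_t max_short;
    size_t max_bcopy;
} demo_attr_t;

typedef struct {
    const char *name;   /* printable field name */
    size_t      offset; /* byte offset inside demo_attr_t */
} demo_field_info_t;

static const demo_field_info_t demo_infos[] = {
    {"max_short", offsetof(demo_attr_t, max_short)},
    {"max_bcopy", offsetof(demo_attr_t, max_bcopy)},
};

int main(void)
{
    demo_attr_t attr = {92, 8192};
    size_t i, value;

    for (i = 0; i < sizeof(demo_infos) / sizeof(demo_infos[0]); ++i) {
        /* same arithmetic as UCS_PTR_BYTE_OFFSET(): base plus byte offset */
        value = *(size_t*)((char*)&attr + demo_infos[i].offset);
        printf("%s: %zu\n", demo_infos[i].name, value);
    }
    return 0;
}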
diff --git a/src/uct/base/uct_iov.inl b/src/uct/base/uct_iov.inl
index b6728111367..7894747e9da 100644
--- a/src/uct/base/uct_iov.inl
+++ b/src/uct/base/uct_iov.inl
@@ -104,4 +104,48 @@ size_t uct_iov_to_iovec(struct iovec *io_vec, size_t *io_vec_cnt_p,
 max_length, uct_iov_iter_p);
 }
+/**
+ * Copy data from a uct_iov_t array to a buffer.
+ *
+ * @param [in] iov Pointer to the array of uct_iov_t elements.
+ * @param [in] iov_cnt Number of elements in the array.
+ * @param [inout] iov_iter Pointer to the iterator of the array.
+ * @param [in] buf Buffer to copy the data to.
+ * @param [in] copy_limit Maximum amount of data that should be copied.
+ *
+ * @return The number of bytes copied.
+ */
+static UCS_F_ALWAYS_INLINE
+size_t uct_iov_to_buffer(const uct_iov_t *iov, size_t iovcnt,
+ ucs_iov_iter_t *iov_iter, void *buf, size_t copy_limit)
+{
+ size_t offset = iov_iter->buffer_offset;
+ size_t copied = 0;
+ size_t limit_reached = 0;
+ size_t to_copy;
+
+ for (; iov_iter->iov_index < iovcnt; ++iov_iter->iov_index) {
+ to_copy = iov[iov_iter->iov_index].length - offset;
+ if (copied + to_copy > copy_limit) {
+ to_copy = copy_limit - copied;
+ limit_reached = 1;
+ }
+ memcpy(UCS_PTR_BYTE_OFFSET(buf, copied),
+ UCS_PTR_BYTE_OFFSET(iov[iov_iter->iov_index].buffer, offset),
+ to_copy);
+ copied += to_copy;
+
+ if (limit_reached) {
+ offset += to_copy;
+ break;
+ }
+
+ offset = 0;
+ }
+
+ iov_iter->buffer_offset = offset;
+
+ return copied;
+}
+
 #endif
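What makes uct_iov_to_buffer() useful is the persistent iterator: iov_index and buffer_offset survive across calls, so a caller can drain a large IOV list into a sequence of fixed-size packets. A small self-contained sketch of the same resumable gather, with generic types in place of the UCS ones:

#include <stdio.h>
#include <string.h>

typedef struct { const void *buffer; size_t length; } demo_iov_t;
typedef struct { size_t iov_index; size_t buffer_offset; } demo_iov_iter_t;

/* copy up to 'limit' bytes from 'iov' into 'buf', resuming at 'iter' */
static size_t demo_iov_to_buffer(const demo_iov_t *iov, size_t iovcnt,
                                 demo_iov_iter_t *iter, void *buf, size_t limit)
{
    size_t offset = iter->buffer_offset;
    size_t copied = 0;
    size_t to_copy;

    for (; iter->iov_index < iovcnt; ++iter->iov_index) {
        to_copy = iov[iter->iov_index].length - offset;
        if (to_copy > limit - copied) {
            to_copy = limit - copied; /* packet is full, stop mid-element */
        }
        memcpy((char*)buf + copied,
               (const char*)iov[iter->iov_index].buffer + offset, to_copy);
        copied += to_copy;
        offset += to_copy;
        if (copied == limit) {
            break;
        }
        offset = 0; /* next element starts from its beginning */
    }

    iter->buffer_offset = offset;
    return copied;
}

int main(void)
{
    demo_iov_t iov[] = {{"hello ", 6}, {"world", 5}};
    demo_iov_iter_t iter = {0, 0};
    char pkt[4];
    size_t n;

    /* drains the 11 payload bytes as packets of at most 4 bytes */
    while ((n = demo_iov_to_buffer(iov, 2, &iter, pkt, sizeof(pkt))) > 0) {
        printf("packet of %zu bytes: %.*s\n", n, (int)n, pkt);
    }
    return 0;
}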
diff --git a/src/uct/base/uct_md.c b/src/uct/base/uct_md.c
index c8ec53de6d6..6ef0302ae10 100644
--- a/src/uct/base/uct_md.c
+++ b/src/uct/base/uct_md.c
@@ -16,10 +16,12 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
+#include
 ucs_config_field_t uct_md_config_table[] = {
@@ -36,9 +38,18 @@ ucs_config_field_t uct_md_config_rcache_table[] = {
 {"RCACHE_ADDR_ALIGN", UCS_PP_MAKE_STRING(UCS_SYS_CACHE_LINE_SIZE),
 "Registration cache address alignment, must be power of 2\n"
- "between "UCS_PP_MAKE_STRING(UCS_PGT_ADDR_ALIGN)"and system page size",
+ "between " UCS_PP_MAKE_STRING(UCS_PGT_ADDR_ALIGN) " and system page size",
 ucs_offsetof(uct_md_rcache_config_t, alignment), UCS_CONFIG_TYPE_UINT},
+ {"RCACHE_MAX_REGIONS", "inf",
+ "Maximal number of regions in the registration cache",
+ ucs_offsetof(uct_md_rcache_config_t, max_regions),
+ UCS_CONFIG_TYPE_ULUNITS},
+
+ {"RCACHE_MAX_SIZE", "inf",
+ "Maximal total size of registration cache regions",
+ ucs_offsetof(uct_md_rcache_config_t, max_size), UCS_CONFIG_TYPE_MEMUNITS},
+
 {NULL}
 };
@@ -253,7 +264,15 @@ ucs_status_t uct_iface_open(uct_md_h md, uct_worker_h worker,
 return UCS_ERR_NO_DEVICE;
 }
- return tl->iface_open(md, worker, params, config, iface_p);
+ status = tl->iface_open(md, worker, params, config, iface_p);
+ if (status != UCS_OK) {
+ return status;
+ }
+
+ ucs_vfs_obj_add_dir(worker, *iface_p, "iface/%p", *iface_p);
+ ucs_vfs_obj_set_dirty(*iface_p, uct_iface_vfs_refresh);
+
+ return UCS_OK;
 }
 ucs_status_t uct_md_config_read(uct_component_h component,
@@ -373,7 +392,6 @@ ucs_status_t uct_mem_alloc_check_params(size_t length,
 unsigned num_methods,
 const uct_mem_alloc_params_t *params)
 {
- const uct_alloc_method_t *method;
 ucs_status_t status;
 if (params->field_mask & UCT_MEM_ALLOC_PARAM_FIELD_FLAGS) {
@@ -404,19 +422,6 @@ ucs_status_t uct_mem_alloc_check_params(size_t length,
 return UCS_ERR_INVALID_PARAM;
 }
- for (method = methods;
- method < methods + num_methods; ++method) {
- if (*method == UCT_ALLOC_METHOD_MD) {
- if (!(params->field_mask & UCT_MEM_ALLOC_PARAM_FIELD_MDS) ||
- (params->mds.count < 1)) {
- ucs_debug("methods include UCT_ALLOC_METHOD but params->mds"
- " not populated correctly: %s",
- ucs_status_string(UCS_ERR_INVALID_PARAM));
- return UCS_ERR_INVALID_PARAM;
- }
- }
- }
-
 return UCS_OK;
 }
@@ -450,11 +455,17 @@ ucs_status_t uct_md_mem_reg(uct_md_h md, void *address, size_t length,
 ucs_status_t status;
 if ((length == 0) || (address == NULL)) {
+ uct_md_log_mem_reg_error(flags,
+ "uct_md_mem_reg(address=%p length=%zu): "
+ "invalid parameters", address, length);
 return UCS_ERR_INVALID_PARAM;
 }
 status = uct_mem_check_flags(flags);
 if (status != UCS_OK) {
+ uct_md_log_mem_reg_error(flags,
+ "uct_md_mem_reg(flags=0x%x): invalid flags",
+ flags);
 return status;
 }
@@ -466,10 +477,10 @@ ucs_status_t uct_md_mem_dereg(uct_md_h md, uct_mem_h memh)
 return md->ops->mem_dereg(md, memh);
 }
-ucs_status_t uct_md_mem_query(uct_md_h md, const void *addr, const size_t length,
- uct_md_mem_attr_t *mem_attr_p)
+ucs_status_t uct_md_mem_query(uct_md_h md, const void *address, size_t length,
+ uct_md_mem_attr_t *mem_attr)
 {
- return md->ops->mem_query(md, addr, length, mem_attr_p);
+ return md->ops->mem_query(md, address, length, mem_attr);
 }
 int uct_md_is_sockaddr_accessible(uct_md_h md, const ucs_sock_addr_t *sockaddr,
@@ -483,3 +494,12 @@ ucs_status_t uct_md_detect_memory_type(uct_md_h md, const void *addr, size_t len
 {
 return md->ops->detect_memory_type(md, addr, length, mem_type_p);
 }
+
+void uct_md_set_rcache_params(ucs_rcache_params_t *rcache_params,
+ const uct_md_rcache_config_t *rcache_config)
+{
+ rcache_params->alignment = rcache_config->alignment;
+ rcache_params->ucm_event_priority = rcache_config->event_prio;
+ rcache_params->max_regions = rcache_config->max_regions;
+ rcache_params->max_size = rcache_config->max_size;
+}
diff --git a/src/uct/base/uct_md.h b/src/uct/base/uct_md.h
index 1803ac3471d..5bfefa76d09 100644
--- a/src/uct/base/uct_md.h
+++ b/src/uct/base/uct_md.h
@@ -15,13 +15,20 @@
 #include
 #include
+#include
 #include
+#define uct_md_log_mem_reg_error(_flags, _fmt, ...) \
+ ucs_log(uct_md_reg_log_lvl(_flags), _fmt, ## __VA_ARGS__)
+
+
 typedef struct uct_md_rcache_config {
 size_t alignment; /**< Force address alignment */
 unsigned event_prio; /**< Memory events priority */
 double overhead; /**< Lookup overhead estimation */
+ unsigned long max_regions; /**< Maximal number of rcache regions */
+ size_t max_size; /**< Maximal size of mapped memory */
 } uct_md_rcache_config_t;
@@ -66,9 +73,9 @@ typedef ucs_status_t (*uct_md_mem_reg_func_t)(uct_md_h md, void *address,
 typedef ucs_status_t (*uct_md_mem_dereg_func_t)(uct_md_h md, uct_mem_h memh);
 typedef ucs_status_t (*uct_md_mem_query_func_t)(uct_md_h md,
- const void *addr,
- const size_t length,
- uct_md_mem_attr_t *mem_attr_p);
+ const void *address,
+ size_t length,
+ uct_md_mem_attr_t *mem_attr);
 typedef ucs_status_t (*uct_md_mkey_pack_func_t)(uct_md_h md, uct_mem_h memh,
 void *rkey_buffer);
@@ -194,6 +201,17 @@ ucs_status_t uct_mem_alloc_check_params(size_t length,
 unsigned num_methods,
 const uct_mem_alloc_params_t *params);
+
+void uct_md_set_rcache_params(ucs_rcache_params_t *rcache_params,
+ const uct_md_rcache_config_t *rcache_config);
+
+
 extern ucs_config_field_t uct_md_config_table[];
+static inline ucs_log_level_t uct_md_reg_log_lvl(unsigned flags)
+{
+ return (flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ?
UCS_LOG_LEVEL_DIAG : + UCS_LOG_LEVEL_ERROR; +} + #endif diff --git a/src/uct/base/uct_mem.c b/src/uct/base/uct_mem.c index 3b723980bf1..e3cd5d861e6 100644 --- a/src/uct/base/uct_mem.c +++ b/src/uct/base/uct_mem.c @@ -102,11 +102,19 @@ ucs_status_t uct_mem_alloc(size_t length, const uct_alloc_method_t *methods, params->mem_type : UCS_MEMORY_TYPE_HOST; alloc_length = length; + ucs_trace("allocating %s: %s memory length %zu flags 0x%x", alloc_name, + ucs_memory_type_names[mem_type], alloc_length, flags); + ucs_log_indent(1); + for (method = methods; method < methods + num_methods; ++method) { ucs_trace("trying allocation method %s", uct_alloc_method_names[*method]); switch (*method) { case UCT_ALLOC_METHOD_MD: + if (!(params->field_mask & UCT_MEM_ALLOC_PARAM_FIELD_MDS)) { + break; + } + /* Allocate with one of the specified memory domains */ for (md_index = 0; md_index < params->mds.count; ++md_index) { alloc_length = length; @@ -115,7 +123,7 @@ ucs_status_t uct_mem_alloc(size_t length, const uct_alloc_method_t *methods, status = uct_md_query(md, &md_attr); if (status != UCS_OK) { ucs_error("Failed to query MD"); - return status; + goto out; } /* Check if MD supports allocation */ @@ -147,7 +155,7 @@ ucs_status_t uct_mem_alloc(size_t length, const uct_alloc_method_t *methods, ucs_error("failed to allocate %zu bytes using md %s for %s: %s", alloc_length, md->component->name, alloc_name, ucs_status_string(status)); - return status; + goto out; } ucs_assert(memh != UCT_MEM_HANDLE_NULL); @@ -163,7 +171,8 @@ ucs_status_t uct_mem_alloc(size_t length, const uct_alloc_method_t *methods, * memory */ ucs_error("unable to allocated requested memory type"); - return UCS_ERR_UNSUPPORTED; + status = UCS_ERR_UNSUPPORTED; + goto out; } break; @@ -277,12 +286,14 @@ ucs_status_t uct_mem_alloc(size_t length, const uct_alloc_method_t *methods, default: ucs_error("Invalid allocation method %d", *method); - return UCS_ERR_INVALID_PARAM; + status = UCS_ERR_INVALID_PARAM; + goto out; } } - ucs_debug("Could not allocate memory with any of the provided methods"); - return UCS_ERR_NO_MEMORY; + ucs_debug("could not allocate memory with any of the provided methods"); + status = UCS_ERR_NO_MEMORY; + goto out; allocated_without_md: mem->md = NULL; @@ -295,7 +306,10 @@ ucs_status_t uct_mem_alloc(size_t length, const uct_alloc_method_t *methods, mem->address = address; mem->length = alloc_length; mem->method = *method; - return UCS_OK; + status = UCS_OK; +out: + ucs_log_indent(-1); + return status; } ucs_status_t uct_mem_free(const uct_allocated_memory_t *mem) diff --git a/src/uct/base/uct_worker.c b/src/uct/base/uct_worker.c index 5f7ec093e87..a868ccb6da5 100644 --- a/src/uct/base/uct_worker.c +++ b/src/uct/base/uct_worker.c @@ -15,16 +15,20 @@ #include #include #include +#include static UCS_CLASS_INIT_FUNC(uct_worker_t) { ucs_callbackq_init(&self->progress_q); + ucs_vfs_obj_add_dir(NULL, self, "uct/worker/%p", self); + return UCS_OK; } static UCS_CLASS_CLEANUP_FUNC(uct_worker_t) { + ucs_vfs_obj_remove(self); ucs_callbackq_cleanup(&self->progress_q); } diff --git a/src/uct/configure.m4 b/src/uct/configure.m4 index 338b257e2ec..bb1b5a1906e 100644 --- a/src/uct/configure.m4 +++ b/src/uct/configure.m4 @@ -13,3 +13,15 @@ m4_include([src/uct/ugni/configure.m4]) AC_DEFINE_UNQUOTED([uct_MODULES], ["${uct_modules}"], [UCT loadable modules]) AC_CONFIG_FILES([src/uct/Makefile]) + +# +# TCP flags +# +AC_CHECK_DECLS([IPPROTO_TCP, SOL_SOCKET, SO_KEEPALIVE, + TCP_KEEPCNT, TCP_KEEPIDLE, TCP_KEEPINTVL], + [], + [tcp_keepalive_happy=no], + 
[[#include ]
 [#include ]])
+AS_IF([test "x$tcp_keepalive_happy" != "xno"],
+ [AC_DEFINE([UCT_TCP_EP_KEEPALIVE], 1, [Enable TCP keepalive configuration])]);
diff --git a/src/uct/cuda/Makefile.am b/src/uct/cuda/Makefile.am
index 0992bb4d180..647092eda46 100644
--- a/src/uct/cuda/Makefile.am
+++ b/src/uct/cuda/Makefile.am
@@ -10,9 +10,10 @@ SUBDIRS = . gdr_copy
 module_LTLIBRARIES = libuct_cuda.la
 libuct_cuda_la_CPPFLAGS = $(BASE_CPPFLAGS) $(CUDA_CPPFLAGS)
 libuct_cuda_la_CFLAGS = $(BASE_CFLAGS) $(CUDA_CFLAGS)
-libuct_cuda_la_LIBADD = $(top_builddir)/src/ucs/libucs.la \
- $(top_builddir)/src/uct/libuct.la
 libuct_cuda_la_LDFLAGS = $(CUDA_LDFLAGS) -version-info $(SOVERSION)
+libuct_cuda_la_LIBADD = $(top_builddir)/src/ucs/libucs.la \
+ $(top_builddir)/src/uct/libuct.la \
+ $(CUDA_LIBS)
 noinst_HEADERS = \
 base/cuda_md.h \
diff --git a/src/uct/cuda/base/cuda_iface.c b/src/uct/cuda/base/cuda_iface.c
index 3babb7ef423..6fd7b25c18c 100644
--- a/src/uct/cuda/base/cuda_iface.c
+++ b/src/uct/cuda/base/cuda_iface.c
@@ -12,10 +12,27 @@
 ucs_status_t
-uct_cuda_base_query_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p,
- unsigned *num_tl_devices_p)
+uct_cuda_base_query_devices_common(
+ uct_md_h md, uct_device_type_t dev_type,
+ uct_tl_device_resource_t **tl_devices_p, unsigned *num_tl_devices_p)
 {
- return uct_single_device_resource(md, UCT_CUDA_DEV_NAME, UCT_DEVICE_TYPE_ACC,
- tl_devices_p, num_tl_devices_p);
+ ucs_sys_device_t sys_device = UCS_SYS_DEVICE_ID_UNKNOWN;
+ CUdevice cuda_device;
+
+ if (cuCtxGetDevice(&cuda_device) == CUDA_SUCCESS) {
+ uct_cuda_base_get_sys_dev(cuda_device, &sys_device);
+ }
+
+ return uct_single_device_resource(md, UCT_CUDA_DEV_NAME, dev_type,
+ sys_device, tl_devices_p,
+ num_tl_devices_p);
 }
+ucs_status_t
+uct_cuda_base_query_devices(
+ uct_md_h md, uct_tl_device_resource_t **tl_devices_p,
+ unsigned *num_tl_devices_p)
+{
+ return uct_cuda_base_query_devices_common(md, UCT_DEVICE_TYPE_ACC,
+ tl_devices_p, num_tl_devices_p);
+}
diff --git a/src/uct/cuda/base/cuda_iface.h b/src/uct/cuda/base/cuda_iface.h
index b612205954d..119cfdac398 100644
--- a/src/uct/cuda/base/cuda_iface.h
+++ b/src/uct/cuda/base/cuda_iface.h
@@ -58,20 +58,23 @@
 UCT_CUDADRV_FUNC(_func, UCS_LOG_LEVEL_ERROR)
-#define UCT_CUDADRV_CTX_ACTIVE(_state) \
- { \
- CUcontext cur_ctx; \
- CUdevice dev; \
- unsigned flags; \
- \
- _state = 0; \
- /* avoid active state check if no cuda activity */ \
- if ((CUDA_SUCCESS == cuCtxGetCurrent(&cur_ctx)) && \
- (NULL != cur_ctx)) { \
- UCT_CUDADRV_FUNC_LOG_ERR(cuCtxGetDevice(&dev)); \
- UCT_CUDADRV_FUNC_LOG_ERR(cuDevicePrimaryCtxGetState(dev, &flags, \
- &_state)); \
- } \
+#define UCT_CUDADRV_CTX_ACTIVE(_state) \
+ { \
+ CUdevice _dev; \
+ CUcontext _ctx; \
+ int _flags; \
+ if (CUDA_SUCCESS == cuCtxGetDevice(&_dev)) { \
+ cuDevicePrimaryCtxGetState(_dev, &_flags, &_state); \
+ if (_state == 0) { \
+ /* need to retain for malloc purposes */ \
+ if (CUDA_SUCCESS != cuDevicePrimaryCtxRetain(&_ctx, _dev)) { \
+ ucs_fatal("unable to retain ctx after detecting device"); \
+ } \
+ } \
+ _state = 1; \
+ } else { \
+ _state = 0; \
+ } \
 }
@@ -83,7 +86,16 @@ typedef enum uct_cuda_base_gen {
 ucs_status_t
-uct_cuda_base_query_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p,
- unsigned *num_tl_devices_p);
+uct_cuda_base_query_devices_common(
+ uct_md_h md, uct_device_type_t dev_type,
+ uct_tl_device_resource_t **tl_devices_p, unsigned *num_tl_devices_p);
+
+ucs_status_t
+uct_cuda_base_query_devices(
+ uct_md_h md, uct_tl_device_resource_t **tl_devices_p,
+ unsigned *num_tl_devices_p);
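The AC_CHECK_DECLS probe above only verifies that the standard keepalive socket options are declared; a transport built with UCT_TCP_EP_KEEPALIVE can then configure them per socket. A hedged sketch of such a configuration (generic code, not the uct_tcp implementation; the numeric settings are arbitrary examples):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* enable keepalive on a connected TCP socket: first probe after 10s idle,
 * then up to 3 probes, 2s apart, before the peer is declared dead */
static int demo_enable_keepalive(int fd)
{
#ifdef UCT_TCP_EP_KEEPALIVE
    int on = 1, idle = 10, intvl = 2, cnt = 3;

    if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) ||
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) ||
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)) ||
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt))) {
        return -1;
    }
#endif
    return 0;
}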
+
+ucs_status_t
+uct_cuda_base_get_sys_dev(CUdevice cuda_device, ucs_sys_device_t *sys_dev_p);
 #endif
diff --git a/src/uct/cuda/base/cuda_md.c b/src/uct/cuda/base/cuda_md.c
index c239b48bcc8..e38467d77cc 100644
--- a/src/uct/cuda/base/cuda_md.c
+++ b/src/uct/cuda/base/cuda_md.c
@@ -9,6 +9,7 @@
 #endif
 #include "cuda_md.h"
+#include "cuda_iface.h"
 #include
 #include
@@ -17,8 +18,8 @@
 #include
-static ucs_status_t uct_cuda_base_get_sys_dev(CUdevice cuda_device,
- ucs_sys_device_t *sys_dev_p)
+ucs_status_t uct_cuda_base_get_sys_dev(CUdevice cuda_device,
+ ucs_sys_device_t *sys_dev_p)
 {
 ucs_sys_bus_id_t bus_id;
 CUresult cu_err;
@@ -56,15 +57,16 @@ static ucs_status_t uct_cuda_base_get_sys_dev(CUdevice cuda_device,
 UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_base_detect_memory_type,
- (md, addr, length, mem_type_p),
- uct_md_h md, const void *addr, size_t length,
+ (md, address, length, mem_type_p),
+ uct_md_h md, const void *address, size_t length,
 ucs_memory_type_t *mem_type_p)
 {
 uct_md_mem_attr_t mem_attr;
 ucs_status_t status;
 mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE;
- status = uct_cuda_base_mem_query(md, addr, length, &mem_attr);
+ status = uct_cuda_base_mem_query(md, address, length,
+ &mem_attr);
 if (status != UCS_OK) {
 return status;
 }
@@ -83,6 +85,9 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_base_mem_query,
 uint32_t is_managed = 0;
 unsigned value = 1;
 CUdevice cuda_device = -1;
+ void *base_address = (void*)address;
+ size_t alloc_length = length;
+ ucs_sys_device_t sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN;
 CUpointer_attribute attr_type[UCT_CUDA_MEM_QUERY_NUM_ATTRS];
 void *attr_data[UCT_CUDA_MEM_QUERY_NUM_ATTRS];
 ucs_memory_type_t mem_type;
@@ -90,16 +95,15 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_base_mem_query,
 ucs_status_t status;
 CUresult cu_err;
- if (!(mem_attr->field_mask & (UCT_MD_MEM_ATTR_FIELD_MEM_TYPE |
- UCT_MD_MEM_ATTR_FIELD_SYS_DEV))) {
+ if (!(mem_attr->field_mask & (UCT_MD_MEM_ATTR_FIELD_MEM_TYPE |
+ UCT_MD_MEM_ATTR_FIELD_SYS_DEV |
+ UCT_MD_MEM_ATTR_FIELD_BASE_ADDRESS |
+ UCT_MD_MEM_ATTR_FIELD_ALLOC_LENGTH))) {
 return UCS_OK;
 }
 if (address == NULL) {
 mem_type = UCS_MEMORY_TYPE_HOST;
- if (mem_attr->field_mask & UCT_MD_MEM_ATTR_FIELD_SYS_DEV) {
- mem_attr->sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN;
- }
 } else {
 attr_type[0] = CU_POINTER_ATTRIBUTE_MEMORY_TYPE;
 attr_data[0] = &cuda_mem_mype;
@@ -133,17 +137,41 @@
 }
 if (mem_attr->field_mask & UCT_MD_MEM_ATTR_FIELD_SYS_DEV) {
- status = uct_cuda_base_get_sys_dev(cuda_device, &mem_attr->sys_dev);
+ status = uct_cuda_base_get_sys_dev(cuda_device, &sys_dev);
 if (status != UCS_OK) {
 return status;
 }
 }
+
+ if (mem_attr->field_mask & (UCT_MD_MEM_ATTR_FIELD_ALLOC_LENGTH |
+ UCT_MD_MEM_ATTR_FIELD_BASE_ADDRESS)) {
+ cu_err = cuMemGetAddressRange((CUdeviceptr*)&base_address,
+ &alloc_length, (CUdeviceptr)address);
+ if (cu_err != CUDA_SUCCESS) {
+ cuGetErrorString(cu_err, &cu_err_str);
+ ucs_error("cuMemGetAddressRange(%p) error: %s", address,
+ cu_err_str);
+ return UCS_ERR_INVALID_ADDR;
+ }
+ }
 }
 if (mem_attr->field_mask & UCT_MD_MEM_ATTR_FIELD_MEM_TYPE) {
 mem_attr->mem_type = mem_type;
 }
+ if (mem_attr->field_mask & UCT_MD_MEM_ATTR_FIELD_SYS_DEV) {
+ mem_attr->sys_dev = sys_dev;
+ }
+
+ if (mem_attr->field_mask & UCT_MD_MEM_ATTR_FIELD_BASE_ADDRESS) {
+ mem_attr->base_address = base_address;
+ }
+
+ if (mem_attr->field_mask & UCT_MD_MEM_ATTR_FIELD_ALLOC_LENGTH) {
+ mem_attr->alloc_length = alloc_length;
+ }
+
 return UCS_OK;
 }
diff --git
a/src/uct/cuda/base/cuda_md.h b/src/uct/cuda/base/cuda_md.h index 97e838e980c..917bac8194f 100644 --- a/src/uct/cuda/base/cuda_md.h +++ b/src/uct/cuda/base/cuda_md.h @@ -8,7 +8,7 @@ #include -ucs_status_t uct_cuda_base_detect_memory_type(uct_md_h md, const void *addr, +ucs_status_t uct_cuda_base_detect_memory_type(uct_md_h md, const void *address, size_t length, ucs_memory_type_t *mem_type_p); diff --git a/src/uct/cuda/cuda_copy/cuda_copy_ep.c b/src/uct/cuda/cuda_copy/cuda_copy_ep.c index 65573b8ddec..de0e028afd9 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_ep.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_ep.c @@ -37,9 +37,9 @@ UCS_CLASS_DEFINE(uct_cuda_copy_ep_t, uct_base_ep_t) UCS_CLASS_DEFINE_NEW_FUNC(uct_cuda_copy_ep_t, uct_ep_t, const uct_ep_params_t *); UCS_CLASS_DEFINE_DELETE_FUNC(uct_cuda_copy_ep_t, uct_ep_t); -#define uct_cuda_copy_trace_data(_remote_addr, _rkey, _fmt, ...) \ - ucs_trace_data(_fmt " to %"PRIx64"(%+ld)", ## __VA_ARGS__, (_remote_addr), \ - (_rkey)) +#define uct_cuda_copy_trace_data(_name, _remote_addr, _iov, _iovcnt) \ + ucs_trace_data("%s [ptr %p len %zu] to 0x%" PRIx64, _name, (_iov)->buffer, \ + (_iov)->length, (_remote_addr)) #define UCT_CUDA_COPY_CHECK_AND_CREATE_STREAM(_iface, _id) \ if ((_iface)->stream[_id] == 0) { \ @@ -87,8 +87,8 @@ uct_cuda_copy_post_cuda_async_copy(uct_ep_h tl_ep, void *dst, void *src, size_t ucs_queue_push(&iface->outstanding_event_q[id], &cuda_event->queue); cuda_event->comp = comp; - ucs_trace("cuda async issued :%p dst:%p, src:%p len:%ld", - cuda_event, dst, src, length); + ucs_trace_data("cuda async issued :%p dst:%p, src:%p len:%ld", cuda_event, + dst, src, length); return UCS_INPROGRESS; } @@ -110,8 +110,7 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_copy_ep_get_zcopy, UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, ZCOPY, uct_iov_total_length(iov, iovcnt)); - uct_cuda_copy_trace_data(remote_addr, rkey, "GET_ZCOPY [length %zu]", - uct_iov_total_length(iov, iovcnt)); + uct_cuda_copy_trace_data("GET_ZCOPY", remote_addr, iov, iovcnt); return status; } @@ -130,8 +129,7 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_copy_ep_put_zcopy, UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, ZCOPY, uct_iov_total_length(iov, iovcnt)); - uct_cuda_copy_trace_data(remote_addr, rkey, "PUT_ZCOPY [length %zu]", - uct_iov_total_length(iov, iovcnt)); + uct_cuda_copy_trace_data("PUT_ZCOPY", remote_addr, iov, iovcnt); return status; } diff --git a/src/uct/cuda/cuda_copy/cuda_copy_iface.c b/src/uct/cuda/cuda_copy/cuda_copy_iface.c index 410e323115f..cce32f323a9 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_iface.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_iface.c @@ -105,10 +105,10 @@ static ucs_status_t uct_cuda_copy_iface_query(uct_iface_h tl_iface, iface_attr->cap.am.max_hdr = 0; iface_attr->cap.am.max_iov = 1; - iface_attr->latency = ucs_linear_func_make(10e-6, 0); + iface_attr->latency = ucs_linear_func_make(8e-6, 0); iface_attr->bandwidth.dedicated = 0; - iface_attr->bandwidth.shared = 6911.0 * UCS_MBYTE; - iface_attr->overhead = 0; + iface_attr->bandwidth.shared = UCT_CUDA_COPY_IFACE_DEFAULT_BANDWIDTH; + iface_attr->overhead = UCT_CUDA_COPY_IFACE_OVERHEAD; iface_attr->priority = 0; return UCS_OK; @@ -161,6 +161,7 @@ uct_cuda_copy_progress_event_queue(uct_cuda_copy_iface_t *iface, cudaEventQuery(cuda_event->event) == cudaSuccess) { ucs_queue_remove(queue_head, &cuda_event->queue); if (cuda_event->comp != NULL) { + ucs_trace_data("cuda_copy event %p completed", cuda_event); uct_invoke_completion(cuda_event->comp, UCS_OK); } 
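The cuda_copy progress path above never blocks on the GPU: each async copy is bracketed by a recorded event, and the progress function polls the event queue. The underlying CUDA pattern, reduced to plain runtime calls (illustrative only, not the UCT code):

#include <cuda_runtime.h>

/* issue one async copy and poll for completion without blocking the CPU */
static int demo_async_copy(void *dst, const void *src, size_t len,
                           cudaStream_t stream)
{
    cudaEvent_t ev;

    if (cudaEventCreateWithFlags(&ev, cudaEventDisableTiming) != cudaSuccess) {
        return -1;
    }

    cudaMemcpyAsync(dst, src, len, cudaMemcpyDefault, stream);
    cudaEventRecord(ev, stream); /* marks the copy's position in the stream */

    /* cudaErrorNotReady means "keep polling"; a real progress loop would do
     * other work between queries instead of spinning like this */
    while (cudaEventQuery(ev) == cudaErrorNotReady) {
        /* progress other queues / callbacks here */
    }

    cudaEventDestroy(ev);
    return 0;
}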
ucs_trace_poll("CUDA Event Done :%p", cuda_event); @@ -284,6 +285,42 @@ static void uct_cuda_copy_event_desc_cleanup(ucs_mpool_t *mp, void *obj) } } +static ucs_status_t +uct_cuda_copy_estimate_perf(uct_iface_h iface, uct_perf_attr_t *perf_attr) +{ + if (perf_attr->field_mask & UCT_PERF_ATTR_FIELD_BANDWIDTH) { + perf_attr->bandwidth.dedicated = 0; + if (!(perf_attr->field_mask & UCT_PERF_ATTR_FIELD_OPERATION)) { + perf_attr->bandwidth.shared = UCT_CUDA_COPY_IFACE_DEFAULT_BANDWIDTH; + } else { + switch (perf_attr->operation) { + case UCT_OP_GET_SHORT: + perf_attr->bandwidth.shared = 9320.0 * UCS_MBYTE; + break; + case UCT_OP_GET_ZCOPY: + perf_attr->bandwidth.shared = 11660.0 * UCS_MBYTE; + break; + case UCT_OP_PUT_SHORT: + perf_attr->bandwidth.shared = 8110.0 * UCS_MBYTE; + break; + case UCT_OP_PUT_ZCOPY: + perf_attr->bandwidth.shared = 9980.0 * UCS_MBYTE; + break; + default: + perf_attr->bandwidth.shared = + UCT_CUDA_COPY_IFACE_DEFAULT_BANDWIDTH; + break; + } + } + } + + if (perf_attr->field_mask & UCT_PERF_ATTR_FIELD_OVERHEAD) { + perf_attr->overhead = UCT_CUDA_COPY_IFACE_OVERHEAD; + } + + return UCS_OK; +} + static ucs_mpool_ops_t uct_cuda_copy_event_desc_mpool_ops = { .chunk_alloc = ucs_mpool_chunk_malloc, .chunk_release = ucs_mpool_chunk_free, @@ -291,6 +328,10 @@ static ucs_mpool_ops_t uct_cuda_copy_event_desc_mpool_ops = { .obj_cleanup = uct_cuda_copy_event_desc_cleanup, }; +static uct_iface_internal_ops_t uct_cuda_copy_iface_internal_ops = { + .iface_estimate_perf = uct_cuda_copy_estimate_perf +}; + static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *tl_config) @@ -300,8 +341,10 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_copy_iface_t, uct_md_h md, uct_worker_h work int i; ucs_status_t status; - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_cuda_copy_iface_ops, md, worker, - params, tl_config UCS_STATS_ARG(params->stats_root) + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_cuda_copy_iface_ops, + &uct_cuda_copy_iface_internal_ops, md, worker, + params, + tl_config UCS_STATS_ARG(params->stats_root) UCS_STATS_ARG("cuda_copy")); if (strncmp(params->mode.device.dev_name, diff --git a/src/uct/cuda/cuda_copy/cuda_copy_iface.h b/src/uct/cuda/cuda_copy/cuda_copy_iface.h index 24729078a07..a5e50913c5a 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_iface.h +++ b/src/uct/cuda/cuda_copy/cuda_copy_iface.h @@ -11,6 +11,12 @@ #include +#define UCT_CUDA_COPY_IFACE_DEFAULT_BANDWIDTH (10000.0 * UCS_MBYTE) + + +#define UCT_CUDA_COPY_IFACE_OVERHEAD (0) + + typedef uint64_t uct_cuda_copy_iface_addr_t; diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index 3582a680cfd..a02da8e00c8 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -30,12 +30,17 @@ static ucs_config_field_t uct_cuda_copy_md_config_table[] = { static ucs_status_t uct_cuda_copy_md_query(uct_md_h md, uct_md_attr_t *md_attr) { - md_attr->cap.flags = UCT_MD_FLAG_REG; - md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); + md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_ALLOC; + md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST) | + UCS_BIT(UCS_MEMORY_TYPE_CUDA) | + UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); + md_attr->cap.alloc_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA) | + UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); + md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA) | + 
UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); md_attr->cap.detect_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA) | UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); - md_attr->cap.max_alloc = 0; + md_attr->cap.max_alloc = SIZE_MAX; md_attr->cap.max_reg = ULONG_MAX; md_attr->rkey_packed_size = 0; md_attr->reg_cost = ucs_linear_func_make(0, 0); @@ -82,9 +87,13 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_copy_mem_reg, result = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)(address)); - if ((result == CUDA_SUCCESS) && (memType == CU_MEMORYTYPE_HOST)) { - /* memory is allocated with cudaMallocHost which is already registered */ - *memh_p = NULL; + if ((result == CUDA_SUCCESS) && ((memType == CU_MEMORYTYPE_HOST) || + (memType == CU_MEMORYTYPE_UNIFIED) || + (memType == CU_MEMORYTYPE_DEVICE))) { + /* only host memory not allocated by cuda needs to be registered */ + /* using deadbeef as VA to avoid gtest error */ + UCS_STATIC_ASSERT((uint64_t)0xdeadbeef != (uint64_t)UCT_MEM_HANDLE_NULL); + *memh_p = (void *)0xdeadbeef; return UCS_OK; } @@ -107,7 +116,7 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_copy_mem_dereg, void *address = (void *)memh; ucs_status_t status; - if (address == NULL) { + if (address == (void*)0xdeadbeef) { return UCS_OK; } @@ -119,6 +128,50 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_copy_mem_dereg, return UCS_OK; } +static ucs_status_t uct_cuda_copy_mem_alloc(uct_md_h md, size_t *length_p, + void **address_p, + ucs_memory_type_t mem_type, + unsigned flags, + const char *alloc_name, + uct_mem_h *memh_p) +{ + ucs_status_t status; + int active; + + if ((mem_type != UCS_MEMORY_TYPE_CUDA_MANAGED) && + (mem_type != UCS_MEMORY_TYPE_CUDA)) { + return UCS_ERR_UNSUPPORTED; + } + + UCT_CUDADRV_CTX_ACTIVE(active); + if (!active) { + return UCS_ERR_NO_DEVICE; + } + + if (mem_type == UCS_MEMORY_TYPE_CUDA) { + status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemAlloc((CUdeviceptr*)address_p, + *length_p)); + } else { + status = + UCT_CUDADRV_FUNC_LOG_ERR(cuMemAllocManaged((CUdeviceptr*)address_p, + *length_p, + CU_MEM_ATTACH_GLOBAL)); + } + + if (status != UCS_OK) { + return status; + } + + *memh_p = *address_p; + return UCS_OK; +} + +static ucs_status_t uct_cuda_copy_mem_free(uct_md_h md, uct_mem_h memh) +{ + return UCT_CUDADRV_FUNC_LOG_ERR(cuMemFree((CUdeviceptr)memh)); +} + + static void uct_cuda_copy_md_close(uct_md_h uct_md) { uct_cuda_copy_md_t *md = ucs_derived_of(uct_md, uct_cuda_copy_md_t); @@ -126,13 +179,16 @@ static void uct_cuda_copy_md_close(uct_md_h uct_md) { } static uct_md_ops_t md_ops = { - .close = uct_cuda_copy_md_close, - .query = uct_cuda_copy_md_query, - .mkey_pack = uct_cuda_copy_mkey_pack, - .mem_reg = uct_cuda_copy_mem_reg, - .mem_dereg = uct_cuda_copy_mem_dereg, - .mem_query = uct_cuda_base_mem_query, - .detect_memory_type = uct_cuda_base_detect_memory_type, + .close = uct_cuda_copy_md_close, + .query = uct_cuda_copy_md_query, + .mem_alloc = uct_cuda_copy_mem_alloc, + .mem_free = uct_cuda_copy_mem_free, + .mkey_pack = uct_cuda_copy_mkey_pack, + .mem_reg = uct_cuda_copy_mem_reg, + .mem_dereg = uct_cuda_copy_mem_dereg, + .mem_query = uct_cuda_base_mem_query, + .is_sockaddr_accessible = ucs_empty_function_return_zero_int, + .detect_memory_type = uct_cuda_base_detect_memory_type }; static ucs_status_t diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c index d6151d52edc..a2afd15eabb 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c @@ -9,11 +9,44 @@ #endif #include "cuda_ipc_cache.h" 
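The cuda_copy mem_reg change above hands back 0xdeadbeef as the handle for memory that needs no actual registration: NULL is no longer usable because it equals UCT_MEM_HANDLE_NULL, yet dereg must still be able to tell skipped registrations from real ones. The sentinel-handle pattern in isolation (hypothetical names):

#include <stdint.h>
#include <stdlib.h>

/* any value a real registration can never return */
#define DEMO_MEMH_SENTINEL ((void*)(uintptr_t)0xdeadbeef)

static void *demo_mem_reg(void *address, int needs_registration)
{
    if (!needs_registration) {
        /* nothing to pin, but the caller still expects a non-NULL handle */
        return DEMO_MEMH_SENTINEL;
    }
    return malloc(1); /* stands in for a driver registration handle */
}

static void demo_mem_dereg(void *memh)
{
    if (memh == DEMO_MEMH_SENTINEL) {
        return; /* registration was skipped; nothing to release */
    }
    free(memh);
}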
+#include "cuda_ipc_iface.h" #include #include #include #include +#include #include +#include + + +typedef struct uct_cuda_ipc_cache_hash_key { + pid_t pid; + CUdevice cu_device; +} uct_cuda_ipc_cache_hash_key_t; + +static UCS_F_ALWAYS_INLINE int +uct_cuda_ipc_cache_hash_equal(uct_cuda_ipc_cache_hash_key_t key1, + uct_cuda_ipc_cache_hash_key_t key2) +{ + return (key1.pid == key2.pid) && (key1.cu_device == key2.cu_device); +} + +static UCS_F_ALWAYS_INLINE khint32_t +uct_cuda_ipc_cache_hash_func(uct_cuda_ipc_cache_hash_key_t key) +{ + return kh_int_hash_func((key.pid << 8) | key.cu_device); +} + +KHASH_INIT(cuda_ipc_rem_cache, uct_cuda_ipc_cache_hash_key_t, + uct_cuda_ipc_cache_t*, 1, uct_cuda_ipc_cache_hash_func, + uct_cuda_ipc_cache_hash_equal); + +typedef struct uct_cuda_ipc_remote_cache { + khash_t(cuda_ipc_rem_cache) hash; + ucs_recursive_spinlock_t lock; +} uct_cuda_ipc_remote_cache_t; + +uct_cuda_ipc_remote_cache_t uct_cuda_ipc_remote_cache; static ucs_pgt_dir_t *uct_cuda_ipc_cache_pgt_dir_alloc(const ucs_pgtable_t *pgtable) { @@ -48,38 +81,42 @@ static void uct_cuda_ipc_cache_purge(uct_cuda_ipc_cache_t *cache) { uct_cuda_ipc_cache_region_t *region, *tmp; ucs_list_link_t region_list; + int active; + + UCT_CUDADRV_CTX_ACTIVE(active); ucs_list_head_init(®ion_list); ucs_pgtable_purge(&cache->pgtable, uct_cuda_ipc_cache_region_collect_callback, ®ion_list); ucs_list_for_each_safe(region, tmp, ®ion_list, list) { - UCT_CUDADRV_FUNC_LOG_ERR( - cuIpcCloseMemHandle((CUdeviceptr)region->mapped_addr)); + if (active) { + UCT_CUDADRV_FUNC_LOG_ERR( + cuIpcCloseMemHandle((CUdeviceptr)region->mapped_addr)); + } ucs_free(region); } ucs_trace("%s: cuda ipc cache purged", cache->name); } -static ucs_status_t uct_cuda_ipc_open_memhandle(CUipcMemHandle memh, +static ucs_status_t uct_cuda_ipc_open_memhandle(const uct_cuda_ipc_key_t *key, CUdeviceptr *mapped_addr) { const char *cu_err_str; CUresult cuerr; + ucs_status_t status; - cuerr = cuIpcOpenMemHandle(mapped_addr, memh, + cuerr = cuIpcOpenMemHandle(mapped_addr, key->ph, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); - if (cuerr != CUDA_SUCCESS) { - if (cuerr == CUDA_ERROR_ALREADY_MAPPED) { - return UCS_ERR_ALREADY_EXISTS; - } - + if (cuerr == CUDA_SUCCESS) { + status = UCS_OK; + } else { cuGetErrorString(cuerr, &cu_err_str); - ucs_error("cuIpcOpenMemHandle() failed: %s", cu_err_str); - - return UCS_ERR_INVALID_PARAM; + ucs_debug("cuIpcOpenMemHandle() failed: %s", cu_err_str); + status = (cuerr == CUDA_ERROR_ALREADY_MAPPED) ? 
UCS_ERR_ALREADY_EXISTS :
+ UCS_ERR_INVALID_PARAM;
 }
- return UCS_OK;
+ return status;
 }
 static void uct_cuda_ipc_cache_invalidate_regions(uct_cuda_ipc_cache_t *cache,
@@ -108,14 +145,59 @@ static void uct_cuda_ipc_cache_invalidate_regions(uct_cuda_ipc_cache_t *cache,
 cache->name, from, to);
 }
-ucs_status_t uct_cuda_ipc_unmap_memhandle(void *rem_cache, uintptr_t d_bptr,
+static ucs_status_t
+uct_cuda_ipc_get_remote_cache(pid_t pid, uct_cuda_ipc_cache_t **cache)
+{
+ ucs_status_t status = UCS_OK;
+ char target_name[64];
+ uct_cuda_ipc_cache_hash_key_t key;
+ khiter_t khiter;
+ int khret;
+
+ ucs_recursive_spin_lock(&uct_cuda_ipc_remote_cache.lock);
+
+ key.pid = pid;
+ UCT_CUDADRV_FUNC_LOG_ERR(cuCtxGetDevice(&key.cu_device));
+
+ khiter = kh_put(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash, key,
+ &khret);
+ if ((khret == UCS_KH_PUT_BUCKET_EMPTY) ||
+ (khret == UCS_KH_PUT_BUCKET_CLEAR)) {
+ ucs_snprintf_safe(target_name, sizeof(target_name), "dest:%d:%d",
+ key.pid, key.cu_device);
+ status = uct_cuda_ipc_create_cache(cache, target_name);
+ if (status != UCS_OK) {
+ kh_del(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash, khiter);
+ ucs_error("could not create cuda ipc cache: %s",
+ ucs_status_string(status));
+ goto err_unlock;
+ }
+
+ kh_val(&uct_cuda_ipc_remote_cache.hash, khiter) = *cache;
+ } else if (khret == UCS_KH_PUT_KEY_PRESENT) {
+ *cache = kh_val(&uct_cuda_ipc_remote_cache.hash, khiter);
+ } else {
+ ucs_error("unable to use cuda_ipc remote_cache hash");
+ status = UCS_ERR_NO_RESOURCE;
+ }
+err_unlock:
+ ucs_recursive_spin_unlock(&uct_cuda_ipc_remote_cache.lock);
+ return status;
+}
+
+ucs_status_t uct_cuda_ipc_unmap_memhandle(pid_t pid, uintptr_t d_bptr,
 void *mapped_addr, int cache_enabled)
 {
- uct_cuda_ipc_cache_t *cache = (uct_cuda_ipc_cache_t *) rem_cache;
- ucs_status_t status = UCS_OK;
+ ucs_status_t status = UCS_OK;
+ uct_cuda_ipc_cache_t *cache;
 ucs_pgt_region_t *pgt_region;
 uct_cuda_ipc_cache_region_t *region;
+ status = uct_cuda_ipc_get_remote_cache(pid, &cache);
+ if (status != UCS_OK) {
+ return status;
+ }
+
 /* use write lock because cache maybe modified */
 pthread_rwlock_wrlock(&cache->lock);
 pgt_region = UCS_PROFILE_CALL(ucs_pgtable_lookup, &cache->pgtable, d_bptr);
@@ -144,16 +226,20 @@ ucs_status_t uct_cuda_ipc_unmap_memhandle(void *rem_cache, uintptr_t d_bptr,
 return status;
 }
-UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_map_memhandle,
- (arg, key, mapped_addr),
- void *arg, uct_cuda_ipc_key_t *key, void **mapped_addr)
+UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_map_memhandle, (key, mapped_addr),
+ const uct_cuda_ipc_key_t *key, void **mapped_addr)
 {
- uct_cuda_ipc_cache_t *cache = (uct_cuda_ipc_cache_t *)arg;
+ uct_cuda_ipc_cache_t *cache;
 ucs_status_t status;
 ucs_pgt_region_t *pgt_region;
 uct_cuda_ipc_cache_region_t *region;
 int ret;
+ status = uct_cuda_ipc_get_remote_cache(key->pid, &cache);
+ if (status != UCS_OK) {
+ return status;
+ }
+
 pthread_rwlock_wrlock(&cache->lock);
 pgt_region = UCS_PROFILE_CALL(ucs_pgtable_lookup,
 &cache->pgtable, key->d_bptr);
@@ -191,19 +277,22 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_map_memhandle,
 }
 }
- status = uct_cuda_ipc_open_memhandle(key->ph, (CUdeviceptr *)mapped_addr);
+ status = uct_cuda_ipc_open_memhandle(key, (CUdeviceptr*)mapped_addr);
 if (ucs_unlikely(status != UCS_OK)) {
 if (ucs_likely(status == UCS_ERR_ALREADY_EXISTS)) {
 /* unmap all overlapping regions and retry*/
 uct_cuda_ipc_cache_invalidate_regions(cache, (void *)key->d_bptr,
 UCS_PTR_BYTE_OFFSET(key->d_bptr,
 key->b_len));
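uct_cuda_ipc_get_remote_cache() above keys a process-global table by (pid, CUdevice). With khash, the hash function only picks a bucket, so it may fold the two fields together; the equality function is what must compare every field exactly. A hedged sketch of the same composite-key setup (klib khash, which UCX vendors; names and include path are illustrative):

#include <sys/types.h>
#include "khash.h"

typedef struct {
    pid_t pid;
    int   device;
} demo_key_t;

/* hash collisions are harmless; demo_equal below is the real test */
#define demo_hash(k)     kh_int_hash_func((khint32_t)(((k).pid << 8) ^ (k).device))
#define demo_equal(a, b) (((a).pid == (b).pid) && ((a).device == (b).device))

KHASH_INIT(demo_cache, demo_key_t, void*, 1, demo_hash, demo_equal)

/* insert-or-lookup: khret > 0 means a fresh bucket, 0 means key was present */
static void **demo_cache_entry(khash_t(demo_cache) *h, demo_key_t key)
{
    int khret;
    khiter_t it = kh_put(demo_cache, h, key, &khret);

    if (khret > 0) {
        kh_val(h, it) = NULL; /* new entry: caller populates it once */
    }
    return &kh_val(h, it);
}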
- status = uct_cuda_ipc_open_memhandle(key->ph, (CUdeviceptr *)mapped_addr);
+ status = uct_cuda_ipc_open_memhandle(key,
+ (CUdeviceptr*)mapped_addr);
 if (ucs_unlikely(status != UCS_OK)) {
 if (ucs_likely(status == UCS_ERR_ALREADY_EXISTS)) {
 /* unmap all cache entries and retry */
 uct_cuda_ipc_cache_purge(cache);
- status = uct_cuda_ipc_open_memhandle(key->ph, (CUdeviceptr *)mapped_addr);
+ status =
+ uct_cuda_ipc_open_memhandle(key,
+ (CUdeviceptr*)mapped_addr);
 if (status != UCS_OK) {
 ucs_fatal("%s: failed to open ipc mem handle. addr:%p "
 "len:%lu (%s)", cache->name,
@@ -216,8 +305,9 @@
 }
 }
 } else {
- ucs_fatal("%s: failed to open ipc mem handle. addr:%p len:%lu",
+ ucs_debug("%s: failed to open ipc mem handle. addr:%p len:%lu",
 cache->name, (void *)key->d_bptr, key->b_len);
+ goto err;
 }
 }
@@ -262,8 +352,8 @@
 ucs_trace("%s: cuda_ipc cache new region:"UCS_PGT_REGION_FMT" size:%lu",
 cache->name, UCS_PGT_REGION_ARG(&region->super), key->b_len);
- pthread_rwlock_unlock(&cache->lock);
- return UCS_OK;
+ status = UCS_OK;
+
err:
 pthread_rwlock_unlock(&cache->lock);
 return status;
@@ -320,3 +410,18 @@ void uct_cuda_ipc_destroy_cache(uct_cuda_ipc_cache_t *cache)
 free(cache->name);
 ucs_free(cache);
 }
+
+UCS_STATIC_INIT {
+ ucs_recursive_spinlock_init(&uct_cuda_ipc_remote_cache.lock, 0);
+ kh_init_inplace(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash);
+}
+
+UCS_STATIC_CLEANUP {
+ uct_cuda_ipc_cache_t *rem_cache;
+
+ kh_foreach_value(&uct_cuda_ipc_remote_cache.hash, rem_cache, {
+ uct_cuda_ipc_destroy_cache(rem_cache);
+ })
+ kh_destroy_inplace(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash);
+ ucs_recursive_spinlock_destroy(&uct_cuda_ipc_remote_cache.lock);
+}
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.h b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.h
index 588f5c97c11..d3a948f6288 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.h
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.h
@@ -9,16 +9,16 @@
 #include
 #include
+#include
+#include
 #include "cuda_ipc_md.h"
 #include
 #include
-typedef struct uct_cuda_ipc_cache uct_cuda_ipc_cache_t;
-typedef struct uct_cuda_ipc_cache_region uct_cuda_ipc_cache_region_t;
-
-
-typedef struct uct_cuda_ipc_rem_memh uct_cuda_ipc_rem_memh_t;
+typedef struct uct_cuda_ipc_cache uct_cuda_ipc_cache_t;
+typedef struct uct_cuda_ipc_cache_region uct_cuda_ipc_cache_region_t;
+typedef struct uct_cuda_ipc_rem_memh uct_cuda_ipc_rem_memh_t;
 struct uct_cuda_ipc_cache_region {
@@ -44,8 +44,8 @@ ucs_status_t uct_cuda_ipc_create_cache(uct_cuda_ipc_cache_t **cache,
 void uct_cuda_ipc_destroy_cache(uct_cuda_ipc_cache_t *cache);
-ucs_status_t uct_cuda_ipc_map_memhandle(void *arg, uct_cuda_ipc_key_t *key,
- void **mapped_addr);
-ucs_status_t uct_cuda_ipc_unmap_memhandle(void *rem_cache, uintptr_t d_bptr,
+ucs_status_t
+uct_cuda_ipc_map_memhandle(const uct_cuda_ipc_key_t *key, void **mapped_addr);
+ucs_status_t uct_cuda_ipc_unmap_memhandle(pid_t pid, uintptr_t d_bptr,
 void *mapped_addr, int cache_enabled);
 #endif
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c b/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c
index 886ffbcb4ca..a72435a39e5 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_ep.c
@@ -22,36 +22,24 @@
 #define UCT_CUDA_IPC_PUT 0
 #define UCT_CUDA_IPC_GET 1
+
 static UCS_CLASS_INIT_FUNC(uct_cuda_ipc_ep_t, const uct_ep_params_t *params)
 {
 uct_cuda_ipc_iface_t *iface = ucs_derived_of(params->iface,
 uct_cuda_ipc_iface_t);
- ucs_status_t status;
- char
target_name[64]; UCT_EP_PARAMS_CHECK_DEV_IFACE_ADDRS(params); UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super); - self->remote_memh_cache = NULL; - /* create a cache by default; disabling implies remove mapping immediately - * after use */ - snprintf(target_name, sizeof(target_name), "dest:%d", - *(pid_t*)params->iface_addr); - status = uct_cuda_ipc_create_cache(&self->remote_memh_cache, target_name); - if (status != UCS_OK) { - ucs_error("could not create create cuda ipc cache: %s", - ucs_status_string(status)); - return status; - } + self->remote_pid = *(const pid_t*)params->iface_addr; + self->keepalive = NULL; return UCS_OK; } static UCS_CLASS_CLEANUP_FUNC(uct_cuda_ipc_ep_t) { - if (self->remote_memh_cache) { - uct_cuda_ipc_destroy_cache(self->remote_memh_cache); - } + ucs_free(self->keepalive); } UCS_CLASS_DEFINE(uct_cuda_ipc_ep_t, uct_base_ep_t) @@ -67,7 +55,6 @@ uct_cuda_ipc_post_cuda_async_copy(uct_ep_h tl_ep, uint64_t remote_addr, uct_completion_t *comp, int direction) { uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cuda_ipc_iface_t); - uct_cuda_ipc_ep_t *ep = ucs_derived_of(tl_ep, uct_cuda_ipc_ep_t); uct_cuda_ipc_key_t *key = (uct_cuda_ipc_key_t *) rkey; void *mapped_rem_addr; void *mapped_addr; @@ -83,7 +70,7 @@ uct_cuda_ipc_post_cuda_async_copy(uct_ep_h tl_ep, uint64_t remote_addr, return UCS_OK; } - status = iface->map_memhandle((void *)ep->remote_memh_cache, key, &mapped_addr); + status = uct_cuda_ipc_map_memhandle(key, &mapped_addr); if (status != UCS_OK) { return UCS_ERR_IO_ERROR; } @@ -135,8 +122,8 @@ uct_cuda_ipc_post_cuda_async_copy(uct_ep_h tl_ep, uint64_t remote_addr, ucs_queue_push(outstanding_queue, &cuda_ipc_event->queue); cuda_ipc_event->comp = comp; cuda_ipc_event->mapped_addr = mapped_addr; - cuda_ipc_event->cache = ep->remote_memh_cache; cuda_ipc_event->d_bptr = (uintptr_t)key->d_bptr; + cuda_ipc_event->pid = key->pid; ucs_trace("cuMemcpyDtoDAsync issued :%p dst:%p, src:%p len:%ld", cuda_ipc_event, (void *) dst, (void *) src, iov[0].length); return UCS_INPROGRESS; @@ -183,3 +170,12 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_ep_put_zcopy, uct_iov_total_length(iov, iovcnt)); return status; } + +ucs_status_t uct_cuda_ipc_ep_check(const uct_ep_h tl_ep, unsigned flags, + uct_completion_t *comp) +{ + uct_cuda_ipc_ep_t *ep = ucs_derived_of(tl_ep, uct_cuda_ipc_ep_t); + + return uct_ep_keepalive_check(tl_ep, &ep->keepalive, ep->remote_pid, flags, + comp); +} diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_ep.h b/src/uct/cuda/cuda_ipc/cuda_ipc_ep.h index 4be71d28f73..6826b160d97 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_ep.h +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_ep.h @@ -9,18 +9,15 @@ #include #include #include -#include "cuda_ipc_md.h" -#include "cuda_ipc_cache.h" -typedef struct uct_cuda_ipc_ep_addr { - int ep_id; -} uct_cuda_ipc_ep_addr_t; typedef struct uct_cuda_ipc_ep { - uct_base_ep_t super; - uct_cuda_ipc_cache_t *remote_memh_cache; + uct_base_ep_t super; + pid_t remote_pid; + uct_keepalive_info_t *keepalive; /* keepalive metadata */ } uct_cuda_ipc_ep_t; + UCS_CLASS_DECLARE_NEW_FUNC(uct_cuda_ipc_ep_t, uct_ep_t, const uct_ep_params_t *); UCS_CLASS_DECLARE_DELETE_FUNC(uct_cuda_ipc_ep_t, uct_ep_t); @@ -33,4 +30,8 @@ ucs_status_t uct_cuda_ipc_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iovcnt, uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp); + +ucs_status_t uct_cuda_ipc_ep_check(const uct_ep_h tl_ep, unsigned flags, + uct_completion_t *comp); + #endif diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c 
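With the per-endpoint cache gone, cuda_ipc failure detection rests on uct_ep_keepalive_check() and the peer's pid recorded at connect time. The cheapest liveness probe is kill() with signal 0; the UCT helper is more careful (the /proc helper declared earlier lets it detect pid reuse via the process start time), which this sketch deliberately omits:

#include <errno.h>
#include <signal.h>
#include <sys/types.h>

/* returns 1 if a process with this pid currently exists, 0 otherwise;
 * signal 0 delivers nothing and only performs the existence check */
static int demo_peer_alive(pid_t pid)
{
    if (kill(pid, 0) == 0) {
        return 1;
    }
    return (errno == EPERM); /* exists, but owned by another user */
}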
b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c index b56a8dab5cc..5adc0a3f226 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c @@ -17,6 +17,7 @@ #include #include #include +#include static ucs_config_field_t uct_cuda_ipc_iface_config_table[] = { @@ -78,7 +79,7 @@ static int uct_cuda_ipc_iface_is_reachable(const uct_iface_h tl_iface, uct_cuda_ipc_iface_t *iface = ucs_derived_of(tl_iface, uct_cuda_ipc_iface_t); return ((uct_cuda_ipc_iface_node_guid(&iface->super) == - *((const uint64_t *)dev_addr)) && ((getpid() != *(pid_t *)iface_addr))); + *((const uint64_t *)dev_addr)) && ((getpid() != *(pid_t *)iface_addr))); } static double uct_cuda_ipc_iface_get_bw() @@ -127,6 +128,7 @@ static ucs_status_t uct_cuda_ipc_iface_query(uct_iface_h tl_iface, iface_attr->ep_addr_len = 0; iface_attr->max_conn_priv = 0; iface_attr->cap.flags = UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE | + UCT_IFACE_FLAG_EP_CHECK | UCT_IFACE_FLAG_CONNECT_TO_IFACE | UCT_IFACE_FLAG_PENDING | UCT_IFACE_FLAG_GET_ZCOPY | @@ -251,10 +253,10 @@ uct_cuda_ipc_progress_event_q(uct_cuda_ipc_iface_t *iface, uct_invoke_completion(cuda_ipc_event->comp, UCS_OK); } - status = iface->unmap_memhandle(cuda_ipc_event->cache, - cuda_ipc_event->d_bptr, - cuda_ipc_event->mapped_addr, - iface->config.enable_cache); + status = uct_cuda_ipc_unmap_memhandle(cuda_ipc_event->pid, + cuda_ipc_event->d_bptr, + cuda_ipc_event->mapped_addr, + iface->config.enable_cache); if (status != UCS_OK) { ucs_fatal("failed to unmap addr:%p", cuda_ipc_event->mapped_addr); } @@ -342,6 +344,7 @@ static uct_iface_ops_t uct_cuda_ipc_iface_ops = { .ep_pending_purge = ucs_empty_function, .ep_flush = uct_base_ep_flush, .ep_fence = uct_base_ep_fence, + .ep_check = uct_cuda_ipc_ep_check, .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_cuda_ipc_ep_t), .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_cuda_ipc_ep_t), .iface_flush = uct_cuda_ipc_iface_flush, @@ -413,8 +416,9 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_ipc_iface_t, uct_md_h md, uct_worker_h worke ucs_status_t status; config = ucs_derived_of(tl_config, uct_cuda_ipc_iface_config_t); - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_cuda_ipc_iface_ops, md, worker, - params, tl_config UCS_STATS_ARG(params->stats_root) + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_cuda_ipc_iface_ops, NULL, + md, worker, params, + tl_config UCS_STATS_ARG(params->stats_root) UCS_STATS_ARG("cuda_ipc")); if (strncmp(params->mode.device.dev_name, @@ -428,9 +432,6 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_ipc_iface_t, uct_md_h md, uct_worker_h worke self->config.enable_cache = config->enable_cache; self->config.max_cuda_ipc_events = config->max_cuda_ipc_events; - self->map_memhandle = uct_cuda_ipc_map_memhandle; - self->unmap_memhandle = uct_cuda_ipc_unmap_memhandle; - status = ucs_mpool_init(&self->event_desc, 0, sizeof(uct_cuda_ipc_event_desc_t), @@ -480,11 +481,20 @@ static UCS_CLASS_CLEANUP_FUNC(uct_cuda_ipc_iface_t) } } +ucs_status_t +uct_cuda_ipc_query_devices( + uct_md_h md, uct_tl_device_resource_t **tl_devices_p, + unsigned *num_tl_devices_p) +{ + return uct_cuda_base_query_devices_common(md, UCT_DEVICE_TYPE_SHM, + tl_devices_p, num_tl_devices_p); +} + UCS_CLASS_DEFINE(uct_cuda_ipc_iface_t, uct_base_iface_t); UCS_CLASS_DEFINE_NEW_FUNC(uct_cuda_ipc_iface_t, uct_iface_t, uct_md_h, uct_worker_h, const uct_iface_params_t*, const uct_iface_config_t*); static UCS_CLASS_DEFINE_DELETE_FUNC(uct_cuda_ipc_iface_t, uct_iface_t); -UCT_TL_DEFINE(&uct_cuda_ipc_component.super, cuda_ipc, uct_cuda_base_query_devices, - 
uct_cuda_ipc_iface_t, "CUDA_IPC_", uct_cuda_ipc_iface_config_table, - uct_cuda_ipc_iface_config_t); +UCT_TL_DEFINE(&uct_cuda_ipc_component.super, cuda_ipc, + uct_cuda_ipc_query_devices, uct_cuda_ipc_iface_t, "CUDA_IPC_", + uct_cuda_ipc_iface_config_table, uct_cuda_ipc_iface_config_t); diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h index 42e135a92e1..ed3efac8ef9 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h @@ -14,11 +14,11 @@ #include "cuda_ipc_md.h" #include "cuda_ipc_ep.h" +#include "cuda_ipc_cache.h" #define UCT_CUDA_IPC_MAX_PEERS 16 - typedef struct uct_cuda_ipc_iface { uct_base_iface_t super; ucs_mpool_t event_desc; /* cuda event desc */ @@ -35,10 +35,6 @@ typedef struct uct_cuda_ipc_iface { unsigned max_cuda_ipc_events; /* max mpool entries */ int enable_cache; /* enable/disable ipc handle cache */ } config; - ucs_status_t (*map_memhandle)(void *context, uct_cuda_ipc_key_t *key, - void **map_addr); - ucs_status_t (*unmap_memhandle)(void *rem_cache, uintptr_t d_bptr, - void *mapped_addr, int cache_enabled); } uct_cuda_ipc_iface_t; @@ -58,8 +54,8 @@ typedef struct uct_cuda_ipc_event_desc { uct_completion_t *comp; ucs_queue_elem_t queue; uct_cuda_ipc_ep_t *ep; - void *cache; uintptr_t d_bptr; + pid_t pid; } uct_cuda_ipc_event_desc_t; diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c index ed9037a1e9e..a44799478b0 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c @@ -9,6 +9,7 @@ #endif #include "cuda_ipc_md.h" +#include "cuda_ipc_cache.h" #include #include @@ -32,6 +33,7 @@ static ucs_status_t uct_cuda_ipc_md_query(uct_md_h md, uct_md_attr_t *md_attr) md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); + md_attr->cap.alloc_mem_types = 0; md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); md_attr->cap.detect_mem_types = 0; md_attr->cap.max_alloc = 0; @@ -74,6 +76,9 @@ ucs_status_t uct_cuda_ipc_get_unique_index_for_uuid(int* idx, uct_cuda_ipc_key_t *rkey) { int i; + int num_devices; + int original_capacity, new_capacity; + int original_count, new_count; for (i = 0; i < md->uuid_map_size; i++) { if (uct_cuda_ipc_uuid_equals(&rkey->uuid, &md->uuid_map[i])) { @@ -84,13 +89,12 @@ ucs_status_t uct_cuda_ipc_get_unique_index_for_uuid(int* idx, if (ucs_unlikely(md->uuid_map_size == md->uuid_map_capacity)) { /* reallocate on demand */ - int num_devices; - int original_cache_size, new_cache_size; - int new_capacity = md->uuid_map_capacity * 2; - UCT_CUDA_IPC_DEVICE_GET_COUNT(num_devices); - original_cache_size = md->uuid_map_capacity * num_devices; - new_cache_size = new_capacity * num_devices; + original_capacity = md->uuid_map_capacity; + new_capacity = md->uuid_map_capacity ? 
+ (md->uuid_map_capacity * 2) : 16;
+ original_count = original_capacity * num_devices;
+ new_count = new_capacity * num_devices;
 md->uuid_map_capacity = new_capacity;
 md->uuid_map = ucs_realloc(md->uuid_map,
 new_capacity * sizeof(CUuuid),
@@ -100,14 +104,16 @@ ucs_status_t uct_cuda_ipc_get_unique_index_for_uuid(int* idx,
 }
 md->peer_accessible_cache = ucs_realloc(md->peer_accessible_cache,
- new_cache_size,
+ new_count *
+ sizeof(ucs_ternary_auto_value_t),
 "uct_cuda_ipc_peer_accessible_cache");
 if (md->peer_accessible_cache == NULL) {
 return UCS_ERR_NO_MEMORY;
 }
- memset(md->peer_accessible_cache + original_cache_size, 0xFF,
- new_cache_size - original_cache_size);
+ for (i = original_count; i < new_count; i++) {
+ md->peer_accessible_cache[i] = UCS_TRY;
+ }
 }
 /* Add new mapping */
@@ -124,12 +130,12 @@ static ucs_status_t uct_cuda_ipc_is_peer_accessible(uct_cuda_ipc_component_t *md
 ucs_status_t status;
 int peer_idx;
 int num_devices;
- char* accessible;
- CUdeviceptr d_mapped;
+ ucs_ternary_auto_value_t *accessible;
+ void *d_mapped;
 status = uct_cuda_ipc_get_unique_index_for_uuid(&peer_idx, mdc->md, rkey);
 if (ucs_unlikely(status != UCS_OK)) {
- return status;
+ goto err;
 }
 /* overwrite dev_num with a unique ID; this means that relative remote
 * stream sequentialization */
 rkey->dev_num = peer_idx;
- UCT_CUDA_IPC_GET_DEVICE(this_device);
- UCT_CUDA_IPC_DEVICE_GET_COUNT(num_devices);
+ if ((CUDA_SUCCESS != cuCtxGetDevice(&this_device)) ||
+ (CUDA_SUCCESS != cuDeviceGetCount(&num_devices))) {
+ goto err;
+ }
 accessible = &mdc->md->peer_accessible_cache[peer_idx * num_devices +
 this_device];
- if (*accessible == (char)0xFF) { /* unchecked, add to cache */
- CUresult result = cuIpcOpenMemHandle(&d_mapped,
- rkey->ph,
- CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
- *accessible = ((result != CUDA_SUCCESS) && (result != CUDA_ERROR_ALREADY_MAPPED))
- ? 0 : 1;
- if (result == CUDA_SUCCESS) {
- result = cuIpcCloseMemHandle(d_mapped);
- if (result != CUDA_SUCCESS) ucs_fatal("Unable to close memhandle");
- }
+ if (*accessible == UCS_TRY) { /* unchecked, add to cache */
+
+ /* Check if peer is reachable by trying to open memory handle. This is
+ * necessary when the device is not visible through CUDA_VISIBLE_DEVICES
+ * and checking peer accessibility through CUDA driver API is not
+ * possible.
+ * Previously, reachability was checked by opening a memory handle
+ * and immediately closing it as the handle to memory handle cache
+ * was not globally visible. Doing this with multiple threads is an
+ * issue as a thread may first check reachability, and later open the
+ * handle, and save mapped pointer in cache as part of a put/get
+ * operation. At this point another thread can then close the same
+ * memory handle as part of reachability check. This leads to a
+ * cuMemcpyAsync error when accessing the mapped pointer as part of
+ * put/get operation.
+ * Now, we immediately insert into cache to save on calling
+ * OpenMemHandle for the same handle because the cache is globally
+ * accessible using rkey->pid. */
+ status = uct_cuda_ipc_map_memhandle(rkey, &d_mapped);
+
+ *accessible = ((status == UCS_OK) || (status == UCS_ERR_ALREADY_EXISTS))
+ ? UCS_YES : UCS_NO;
 }
- return (*accessible == 1) ? UCS_OK : UCS_ERR_UNREACHABLE;
+ return (*accessible == UCS_YES) ? UCS_OK : UCS_ERR_UNREACHABLE;
+
+err:
+ return status;
 }
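The peer-accessibility cache above is a three-state memo: UCS_TRY marks "never probed", after which a slot settles permanently on UCS_YES or UCS_NO. The shape of such a cached probe, detached from CUDA (illustrative enum; any expensive boolean check fits):

typedef enum { DEMO_TRY, DEMO_NO, DEMO_YES } demo_ternary_t;

/* probe() runs at most once per slot; every later call is a cache hit */
static int demo_is_accessible(demo_ternary_t *slot, int (*probe)(void))
{
    if (*slot == DEMO_TRY) {
        *slot = probe() ? DEMO_YES : DEMO_NO;
    }
    return (*slot == DEMO_YES);
}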
 UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_rkey_unpack,
@@ -206,6 +229,7 @@ uct_cuda_ipc_mem_reg_internal(uct_md_h uct_md, void *addr, size_t length,
 log_level = (flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? UCS_LOG_LEVEL_DEBUG :
 UCS_LOG_LEVEL_ERROR;
+
 status = UCT_CUDADRV_FUNC(cuIpcGetMemHandle(&key->ph, (CUdeviceptr)addr),
 log_level);
 if (UCS_OK != status) {
@@ -219,6 +243,7 @@ uct_cuda_ipc_mem_reg_internal(uct_md_h uct_md, void *addr, size_t length,
 log_level);
 key->dev_num = (int) cu_device;
+ key->pid = getpid();
 ucs_trace("registered memory:%p..%p length:%lu dev_num:%d",
 addr, UCS_PTR_BYTE_OFFSET(addr, length), length, (int) cu_device);
 return UCS_OK;
@@ -267,19 +292,19 @@ uct_cuda_ipc_md_open(uct_component_t *component, const char *md_name,
 const uct_md_config_t *config, uct_md_h *md_p)
 {
 static uct_md_ops_t md_ops = {
- .close = uct_cuda_ipc_md_close,
- .query = uct_cuda_ipc_md_query,
- .mkey_pack = uct_cuda_ipc_mkey_pack,
- .mem_reg = uct_cuda_ipc_mem_reg,
- .mem_dereg = uct_cuda_ipc_mem_dereg,
- .detect_memory_type = ucs_empty_function_return_unsupported,
+ .close = uct_cuda_ipc_md_close,
+ .query = uct_cuda_ipc_md_query,
+ .mkey_pack = uct_cuda_ipc_mkey_pack,
+ .mem_reg = uct_cuda_ipc_mem_reg,
+ .mem_dereg = uct_cuda_ipc_mem_dereg,
+ .is_sockaddr_accessible = ucs_empty_function_return_zero_int,
+ .detect_memory_type = ucs_empty_function_return_unsupported
 };
 int num_devices;
 uct_cuda_ipc_md_t* md;
 uct_cuda_ipc_component_t* com;
- UCS_STATIC_ASSERT(sizeof(md->peer_accessible_cache[0]) == sizeof(char));
 UCT_CUDA_IPC_DEVICE_GET_COUNT(num_devices);
 md = ucs_calloc(1, sizeof(uct_cuda_ipc_md_t), "uct_cuda_ipc_md");
@@ -291,26 +316,10 @@ uct_cuda_ipc_md_open(uct_component_t *component, const char *md_name,
 md->super.component = &uct_cuda_ipc_component.super;
 /* allocate uuid map and peer accessible cache */
- md->uuid_map_size = 0;
- md->uuid_map_capacity = 16;
- md->uuid_map = ucs_malloc(md->uuid_map_capacity * sizeof(CUuuid),
- "uct_cuda_ipc_uuid_map");
- if (md->uuid_map == NULL) {
- free(md);
- return UCS_ERR_NO_MEMORY;
- }
-
- /* Initially support caching accessibility of up to 16 other peers */
- md->peer_accessible_cache = ucs_malloc(num_devices * md->uuid_map_capacity,
- "uct_cuda_ipc_peer_accessible_cache");
- if (md->peer_accessible_cache == NULL) {
- free(md->uuid_map);
- free(md);
- return UCS_ERR_NO_MEMORY;
- }
-
- /* 0xFF = !cached, 1 = accessible, 0 = !accessible */
- memset(md->peer_accessible_cache, 0xFF, num_devices * md->uuid_map_capacity);
+ md->uuid_map_size = 0;
+ md->uuid_map_capacity = 0;
+ md->uuid_map = NULL;
+ md->peer_accessible_cache = NULL;
 com = ucs_derived_of(md->super.component, uct_cuda_ipc_component_t);
 com->md = md;
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_md.h b/src/uct/cuda/cuda_ipc/cuda_ipc_md.h
index 5e0ef493867..ab2ea3b02b6 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_md.h
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_md.h
@@ -10,17 +10,19 @@
 #include
 #include
 #include
+#include
+#include
 /**
 * @brief cuda ipc MD descriptor
 */
 typedef struct uct_cuda_ipc_md {
- struct uct_md super; /**< Domain info */
- CUuuid* uuid_map;
- char* peer_accessible_cache;
- int uuid_map_size;
- int uuid_map_capacity;
+ struct uct_md super; /**< Domain info */
+ CUuuid* uuid_map;
+ ucs_ternary_auto_value_t *peer_accessible_cache;
+ int uuid_map_size;
+ int uuid_map_capacity;
 } uct_cuda_ipc_md_t;
 /**
@@ -45,11 +47,12 @@ typedef struct uct_cuda_ipc_md_config {
 * @brief cuda_ipc packed and remote key for put/get
 */
 typedef
struct uct_cuda_ipc_key { - CUipcMemHandle ph; /* Memory handle of GPU memory */ - CUdeviceptr d_bptr; /* Allocation base address */ - size_t b_len; /* Allocation size */ - int dev_num; /* GPU Device number */ - CUuuid uuid; /* GPU Device UUID */ + CUipcMemHandle ph; /* Memory handle of GPU memory */ + pid_t pid; /* PID as key to resolve peer_map hash */ + CUdeviceptr d_bptr; /* Allocation base address */ + size_t b_len; /* Allocation size */ + int dev_num; /* GPU Device number */ + CUuuid uuid; /* GPU Device UUID */ } uct_cuda_ipc_key_t; diff --git a/src/uct/cuda/gdr_copy/Makefile.am b/src/uct/cuda/gdr_copy/Makefile.am index e551d5f5862..752fc984523 100644 --- a/src/uct/cuda/gdr_copy/Makefile.am +++ b/src/uct/cuda/gdr_copy/Makefile.am @@ -8,9 +8,11 @@ if HAVE_GDR_COPY module_LTLIBRARIES = libuct_cuda_gdrcopy.la libuct_cuda_gdrcopy_la_CPPFLAGS = $(BASE_CPPFLAGS) $(CUDA_CPPFLAGS) $(GDR_COPY_CPPFLAGS) libuct_cuda_gdrcopy_la_CFLAGS = $(BASE_CFLAGS) +libuct_cuda_gdrcopy_la_LDFLAGS = $(CUDA_LDFLAGS) $(GDR_COPY_LDFLAGS) \ + -version-info $(SOVERSION) libuct_cuda_gdrcopy_la_LIBADD = $(top_builddir)/src/ucs/libucs.la \ - $(top_builddir)/src/uct/cuda/libuct_cuda.la -libuct_cuda_gdrcopy_la_LDFLAGS = $(CUDA_LDFLAGS) $(GDR_COPY_LDFLAGS) -version-info $(SOVERSION) + $(top_builddir)/src/uct/cuda/libuct_cuda.la \ + $(CUDA_LIBS) noinst_HEADERS = \ gdr_copy_md.h \ diff --git a/src/uct/cuda/gdr_copy/gdr_copy_iface.c b/src/uct/cuda/gdr_copy/gdr_copy_iface.c index c46aa28b295..5a3a4d4966b 100644 --- a/src/uct/cuda/gdr_copy/gdr_copy_iface.c +++ b/src/uct/cuda/gdr_copy/gdr_copy_iface.c @@ -47,8 +47,8 @@ static int uct_gdr_copy_iface_is_reachable(const uct_iface_h tl_iface, return (addr != NULL) && (iface->id == *addr); } -static ucs_status_t uct_gdr_copy_iface_query(uct_iface_h tl_iface, - uct_iface_attr_t *iface_attr) +static ucs_status_t +uct_gdr_copy_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { uct_gdr_copy_iface_t *iface = ucs_derived_of(tl_iface, uct_gdr_copy_iface_t); @@ -88,13 +88,43 @@ static ucs_status_t uct_gdr_copy_iface_query(uct_iface_h tl_iface, iface_attr->latency = ucs_linear_func_make(1e-6, 0); iface_attr->bandwidth.dedicated = 0; - iface_attr->bandwidth.shared = 6911.0 * UCS_MBYTE; - iface_attr->overhead = 0; + iface_attr->bandwidth.shared = UCT_GDR_COPY_IFACE_DEFAULT_BANDWIDTH; + iface_attr->overhead = UCT_GDR_COPY_IFACE_OVERHEAD; iface_attr->priority = 0; return UCS_OK; } +static ucs_status_t +uct_gdr_copy_estimate_perf(uct_iface_h iface, uct_perf_attr_t *perf_attr) +{ + if (perf_attr->field_mask & UCT_PERF_ATTR_FIELD_BANDWIDTH) { + perf_attr->bandwidth.dedicated = 0; + if (perf_attr->field_mask & UCT_PERF_ATTR_FIELD_OPERATION) { + switch (perf_attr->operation) { + case UCT_OP_GET_SHORT: + case UCT_OP_GET_ZCOPY: + perf_attr->bandwidth.shared = 440.0 * UCS_MBYTE; + break; + case UCT_OP_PUT_SHORT: + perf_attr->bandwidth.shared = 10200.0 * UCS_MBYTE; + break; + default: + perf_attr->bandwidth.shared = + UCT_GDR_COPY_IFACE_DEFAULT_BANDWIDTH; + } + } else { + perf_attr->bandwidth.shared = UCT_GDR_COPY_IFACE_DEFAULT_BANDWIDTH; + } + } + + if (perf_attr->field_mask & UCT_PERF_ATTR_FIELD_OVERHEAD) { + perf_attr->overhead = UCT_GDR_COPY_IFACE_OVERHEAD; + } + + return UCS_OK; +} + static uct_iface_ops_t uct_gdr_copy_iface_ops = { .ep_put_short = uct_gdr_copy_ep_put_short, .ep_get_short = uct_gdr_copy_ep_get_short, @@ -116,12 +146,18 @@ static uct_iface_ops_t uct_gdr_copy_iface_ops = { .iface_is_reachable = uct_gdr_copy_iface_is_reachable, }; +static uct_iface_internal_ops_t 
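A sketch of the operation-dependent estimate added in uct_gdr_copy_estimate_perf() above, assuming only that gdrcopy maps GPU memory into the CPU address space, so CPU stores (PUT) stream quickly while CPU loads (GET) across the BAR are far slower. The constants mirror the ones in the diff; the enum is illustrative.

```c
#include <stdio.h>

typedef enum { OP_GET, OP_PUT, OP_OTHER } op_t;

#define MBPS (1024.0 * 1024.0)

static double estimate_bandwidth(op_t op)
{
    switch (op) {
    case OP_GET:
        return 440.0 * MBPS;   /* CPU reads from mapped GPU memory */
    case OP_PUT:
        return 10200.0 * MBPS; /* CPU writes to mapped GPU memory */
    default:
        return 6911.0 * MBPS;  /* interface-wide default */
    }
}

int main(void)
{
    printf("GET: %.0f MB/s\n", estimate_bandwidth(OP_GET) / MBPS);
    printf("PUT: %.0f MB/s\n", estimate_bandwidth(OP_PUT) / MBPS);
    return 0;
}
```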
uct_gdr_copy_iface_internal_ops = { + .iface_estimate_perf = uct_gdr_copy_estimate_perf +}; + static UCS_CLASS_INIT_FUNC(uct_gdr_copy_iface_t, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *tl_config) { - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_gdr_copy_iface_ops, md, worker, - params, tl_config UCS_STATS_ARG(params->stats_root) + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_gdr_copy_iface_ops, + &uct_gdr_copy_iface_internal_ops, md, worker, + params, + tl_config UCS_STATS_ARG(params->stats_root) UCS_STATS_ARG("gdr_copy")); if (strncmp(params->mode.device.dev_name, diff --git a/src/uct/cuda/gdr_copy/gdr_copy_iface.h b/src/uct/cuda/gdr_copy/gdr_copy_iface.h index 1d4875e8cbe..9d2b8c00909 100644 --- a/src/uct/cuda/gdr_copy/gdr_copy_iface.h +++ b/src/uct/cuda/gdr_copy/gdr_copy_iface.h @@ -9,6 +9,12 @@ #include +#define UCT_GDR_COPY_IFACE_DEFAULT_BANDWIDTH (6911.0 * UCS_MBYTE) + + +#define UCT_GDR_COPY_IFACE_OVERHEAD (0) + + typedef uint64_t uct_gdr_copy_iface_addr_t; diff --git a/src/uct/cuda/gdr_copy/gdr_copy_md.c b/src/uct/cuda/gdr_copy/gdr_copy_md.c index c2f749a46fc..722690a5af2 100644 --- a/src/uct/cuda/gdr_copy/gdr_copy_md.c +++ b/src/uct/cuda/gdr_copy/gdr_copy_md.c @@ -48,6 +48,7 @@ static ucs_status_t uct_gdr_copy_md_query(uct_md_h md, uct_md_attr_t *md_attr) md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); + md_attr->cap.alloc_mem_types = 0; md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); md_attr->cap.detect_mem_types = 0; md_attr->cap.max_alloc = 0; @@ -259,12 +260,13 @@ static void uct_gdr_copy_md_close(uct_md_h uct_md) } static uct_md_ops_t md_ops = { - .close = uct_gdr_copy_md_close, - .query = uct_gdr_copy_md_query, - .mkey_pack = uct_gdr_copy_mkey_pack, - .mem_reg = uct_gdr_copy_mem_reg, - .mem_dereg = uct_gdr_copy_mem_dereg, - .detect_memory_type = ucs_empty_function_return_unsupported, + .close = uct_gdr_copy_md_close, + .query = uct_gdr_copy_md_query, + .mkey_pack = uct_gdr_copy_mkey_pack, + .mem_reg = uct_gdr_copy_mem_reg, + .mem_dereg = uct_gdr_copy_mem_dereg, + .is_sockaddr_accessible = ucs_empty_function_return_zero_int, + .detect_memory_type = ucs_empty_function_return_unsupported }; static inline uct_gdr_copy_rcache_region_t* @@ -384,11 +386,10 @@ uct_gdr_copy_md_open(uct_component_t *component, const char *md_name, } if (md_config->enable_rcache != UCS_NO) { + uct_md_set_rcache_params(&rcache_params, &md_config->rcache); rcache_params.region_struct_size = sizeof(uct_gdr_copy_rcache_region_t); - rcache_params.alignment = md_config->rcache.alignment; rcache_params.max_alignment = UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN; rcache_params.ucm_events = UCM_EVENT_MEM_TYPE_FREE; - rcache_params.ucm_event_priority = md_config->rcache.event_prio; rcache_params.context = md; rcache_params.ops = &uct_gdr_copy_rcache_ops; rcache_params.flags = 0; diff --git a/src/uct/ib/Makefile.am b/src/uct/ib/Makefile.am index 61f2cdd08e6..7880c088063 100644 --- a/src/uct/ib/Makefile.am +++ b/src/uct/ib/Makefile.am @@ -5,7 +5,7 @@ if HAVE_IB -SUBDIRS = . cm rdmacm +SUBDIRS = . 
rdmacm module_LTLIBRARIES = libuct_ib.la libuct_ib_la_CPPFLAGS = $(BASE_CPPFLAGS) $(IBVERBS_CPPFLAGS) @@ -101,6 +101,7 @@ endif # HAVE_TL_RC if HAVE_TL_DC noinst_HEADERS += \ dc/dc_mlx5_ep.h \ + dc/dc_mlx5.inl \ dc/dc_mlx5.h libuct_ib_la_SOURCES += \ diff --git a/src/uct/ib/base/ib_device.c b/src/uct/ib/base/ib_device.c index 5a8041a1335..cd861600c7d 100644 --- a/src/uct/ib/base/ib_device.c +++ b/src/uct/ib/base/ib_device.c @@ -195,15 +195,6 @@ static void uct_ib_device_get_locality(const char *dev_name, *numa_node = (status == UCS_OK) ? n : -1; } -static unsigned uct_ib_device_async_event_proxy(void *arg) -{ - uct_ib_async_event_wait_t *wait_ctx = arg; - - wait_ctx->cb_id = UCS_CALLBACKQ_ID_NULL; - wait_ctx->cb(wait_ctx); - return 1; -} - static void uct_ib_device_async_event_dispatch(uct_ib_device_t *dev, const uct_ib_async_event_t *event) @@ -218,9 +209,10 @@ uct_ib_device_async_event_dispatch(uct_ib_device_t *dev, entry->flag = 1; if (entry->wait_ctx != NULL) { /* someone is waiting */ + ucs_assert(entry->wait_ctx->cb_id == UCS_CALLBACKQ_ID_NULL); entry->wait_ctx->cb_id = ucs_callbackq_add_safe( - entry->wait_ctx->cbq, uct_ib_device_async_event_proxy, - entry->wait_ctx, UCS_CALLBACKQ_FLAG_ONESHOT); + entry->wait_ctx->cbq, entry->wait_ctx->cb, + entry->wait_ctx, 0); } } ucs_spin_unlock(&dev->async_event_lock); @@ -482,24 +474,95 @@ void uct_ib_handle_async_event(uct_ib_device_t *dev, uct_ib_async_event_t *event ucs_log(level, "IB Async event on %s: %s", uct_ib_device_name(dev), event_info); } +static ucs_status_t uct_ib_device_get_path_buffer(uct_ib_device_t *dev, + char *path_buffer) +{ + char *resolved_path; + + resolved_path = realpath(dev->ibv_context->device->ibdev_path, path_buffer); + if (resolved_path == NULL) { + return UCS_ERR_IO_ERROR; + } + + /* Make sure there is "/infiniband/" substring in path_buffer */ + if (strstr(path_buffer, "/infiniband/") == NULL) { + return UCS_ERR_IO_ERROR; + } + + return UCS_OK; +} + +static ucs_status_t uct_ib_device_get_ids_from_path(const char *path, + uint16_t *vendor_id, + uint16_t *device_id) +{ + ucs_status_t status; + long value; + + status = ucs_read_file_number(&value, 1, "%s/%s", path, "vendor"); + if (status != UCS_OK) { + return status; + } + *vendor_id = value; + + status = ucs_read_file_number(&value, 1, "%s/%s", path, "device"); + if (status != UCS_OK) { + return status; + } + *device_id = value; + + return UCS_OK; +} + static void uct_ib_device_get_ids(uct_ib_device_t *dev) { - long vendor_id, device_id; - - if ((ucs_read_file_number(&vendor_id, 1, UCT_IB_DEVICE_SYSFS_FMT, - uct_ib_device_name(dev), "vendor") == UCS_OK) && - (ucs_read_file_number(&device_id, 1, UCT_IB_DEVICE_SYSFS_FMT, - uct_ib_device_name(dev), "device") == UCS_OK)) { - dev->pci_id.vendor = vendor_id; - dev->pci_id.device = device_id; - ucs_debug("%s vendor_id: 0x%x device_id: %d", uct_ib_device_name(dev), + char *ids_path; + char path_buffer[PATH_MAX]; + ucs_status_t status; + + /* PF: realpath name is of form /sys/devices/.../0000:03:00.0/infiniband/mlx5_0 */ + /* SF: realpath name is of form /sys/devices/.../0000:03:00.0//infiniband/mlx5_0 */ + + status = uct_ib_device_get_path_buffer(dev, path_buffer); + if (status != UCS_OK) { + goto not_found; + } + + /* PF: strip 2 layers. 
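A standalone sketch of the PF lookup described above, assuming the usual sysfs layout where /sys/class/infiniband/&lt;dev&gt; resolves to a path under the PCI function directory; read_hex_file() and the hard-coded mlx5_0 name are hypothetical stand-ins for the UCS file helpers.

```c
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <libgen.h>

static int read_hex_file(const char *dir, const char *name, long *value)
{
    char path[PATH_MAX];
    FILE *f;
    int ret;

    snprintf(path, sizeof(path), "%s/%s", dir, name);
    f = fopen(path, "r");
    if (f == NULL) {
        return -1;
    }
    ret = (fscanf(f, "%li", value) == 1) ? 0 : -1; /* sysfs uses 0x prefix */
    fclose(f);
    return ret;
}

int main(void)
{
    char path[PATH_MAX];
    long vendor, device;
    char *dir;

    if (realpath("/sys/class/infiniband/mlx5_0", path) == NULL) {
        return 1;
    }
    /* PF layout: .../0000:03:00.0/infiniband/mlx5_0 -> strip two layers */
    dir = dirname(dirname(path));
    if ((read_hex_file(dir, "vendor", &vendor) != 0) ||
        (read_hex_file(dir, "device", &device) != 0)) {
        return 1; /* an SF would need one extra dirname() here */
    }
    printf("vendor 0x%lx device 0x%lx\n", vendor, device);
    return 0;
}
```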
*/ + ids_path = ucs_dirname(path_buffer, 2); + if (ids_path == NULL) { + goto not_found; + } + + status = uct_ib_device_get_ids_from_path(ids_path, + &dev->pci_id.vendor, + &dev->pci_id.device); + if (status == UCS_OK) { + ucs_debug("PF: %s vendor_id: 0x%x device_id: %d", uct_ib_device_name(dev), dev->pci_id.vendor, dev->pci_id.device); - } else { - dev->pci_id.vendor = 0; - dev->pci_id.device = 0; - ucs_warn("%s: could not read device/vendor id from sysfs, " - "performance may be affected", uct_ib_device_name(dev)); + return; + } + + /* SF: strip 3 layers (1 more layer than PF). */ + ids_path = ucs_dirname(path_buffer, 1); + if (ids_path == NULL) { + goto not_found; + } + + status = uct_ib_device_get_ids_from_path(ids_path, + &dev->pci_id.vendor, + &dev->pci_id.device); + if (status == UCS_OK) { + ucs_debug("SF: %s vendor_id: 0x%x device_id: %d", uct_ib_device_name(dev), + dev->pci_id.vendor, dev->pci_id.device); + return; } + +not_found: + dev->pci_id.vendor = 0; + dev->pci_id.device = 0; + ucs_warn("%s: could not read device/vendor id from sysfs, " + "performance may be affected", uct_ib_device_name(dev)); } ucs_status_t uct_ib_device_query(uct_ib_device_t *dev, @@ -528,10 +591,10 @@ ucs_status_t uct_ib_device_query(uct_ib_device_t *dev, } if (dev->num_ports > UCT_IB_DEV_MAX_PORTS) { - ucs_error("%s has %d ports, but only up to %d are supported", + ucs_debug("%s has %d ports, but only up to %d are supported", ibv_get_device_name(ibv_device), dev->num_ports, UCT_IB_DEV_MAX_PORTS); - return UCS_ERR_UNSUPPORTED; + dev->num_ports = UCT_IB_DEV_MAX_PORTS; } /* Query all ports */ @@ -676,11 +739,18 @@ ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num, uint8_t required_dev_flags; ucs_status_t status; union ibv_gid gid; + int gid_index; if (port_num < dev->first_port || port_num >= dev->first_port + dev->num_ports) { return UCS_ERR_NO_DEVICE; } + if (uct_ib_device_port_attr(dev, port_num)->gid_tbl_len == 0) { + ucs_debug("%s:%d has no gid", uct_ib_device_name(dev), + port_num); + return UCS_ERR_UNSUPPORTED; + } + if (uct_ib_device_port_attr(dev, port_num)->state != IBV_PORT_ACTIVE) { ucs_trace("%s:%d is not active (state: %d)", uct_ib_device_name(dev), port_num, uct_ib_device_port_attr(dev, port_num)->state); @@ -716,18 +786,18 @@ ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num, return UCS_ERR_UNSUPPORTED; } - if (md->check_subnet_filter && uct_ib_device_is_port_ib(dev, port_num)) { - status = uct_ib_device_query_gid(dev, port_num, - uct_ib_device_get_ib_gid_index(md), &gid); - if (status != UCS_OK) { - return status; - } + gid_index = uct_ib_device_get_ib_gid_index(md); + status = uct_ib_device_query_gid(dev, port_num, gid_index, &gid, + UCS_LOG_LEVEL_DIAG); + if (status != UCS_OK) { + return status; + } - if (md->subnet_filter != gid.global.subnet_prefix) { - ucs_trace("%s:%d subnet_prefix does not match", - uct_ib_device_name(dev), port_num); - return UCS_ERR_UNSUPPORTED; - } + if (md->check_subnet_filter && uct_ib_device_is_port_ib(dev, port_num) && + (md->subnet_filter != gid.global.subnet_prefix)) { + ucs_trace("%s:%d subnet_prefix does not match", uct_ib_device_name(dev), + port_num); + return UCS_ERR_UNSUPPORTED; } return UCS_OK; @@ -767,8 +837,8 @@ static sa_family_t uct_ib_device_get_addr_family(union ibv_gid *gid, int gid_ind const uint32_t addr_last_bits = raw->s6_addr32[2] ^ htonl(0x0000ffff); char p[128]; - ucs_debug("testing addr_family on gid index %d: %s", - gid_index, uct_ib_gid_str(gid, p, sizeof(p))); + 
ucs_trace_func("testing addr_family on gid index %d: %s", + gid_index, uct_ib_gid_str(gid, p, sizeof(p))); if (!((raw->s6_addr32[0] | raw->s6_addr32[1]) | addr_last_bits) || uct_ib_device_is_addr_ipv4_mcast(raw, addr_last_bits)) { @@ -1015,7 +1085,7 @@ ucs_status_t uct_ib_modify_qp(struct ibv_qp *qp, enum ibv_qp_state state) static ucs_sys_device_t uct_ib_device_get_sys_dev(uct_ib_device_t *dev) { - char path_buffer[PATH_MAX], *resolved_path; + char path_buffer[PATH_MAX]; ucs_sys_device_t sys_dev; ucs_sys_bus_id_t bus_id; ucs_status_t status; @@ -1025,17 +1095,20 @@ static ucs_sys_device_t uct_ib_device_get_sys_dev(uct_ib_device_t *dev) /* realpath name is of form /sys/devices/.../0000:05:00.0/infiniband/mlx5_0 * and bus_id is constructed from 0000:05:00.0 */ - resolved_path = realpath(dev->ibv_context->device->ibdev_path, path_buffer); - if (resolved_path == NULL) { + status = uct_ib_device_get_path_buffer(dev, path_buffer); + if (status != UCS_OK) { return UCS_SYS_DEVICE_ID_UNKNOWN; } - /* Make sure there is "/infiniband/" substring in path_buffer*/ - if (strstr(path_buffer, "/infiniband/") == NULL) { + pcie_bus = ucs_dirname(path_buffer, 2); + if (pcie_bus == NULL) { + return UCS_SYS_DEVICE_ID_UNKNOWN; + } + pcie_bus = basename(pcie_bus); + if (pcie_bus == NULL) { return UCS_SYS_DEVICE_ID_UNKNOWN; } - pcie_bus = basename(dirname(dirname(path_buffer))); num_fields = sscanf(pcie_bus, "%hx:%hhx:%hhx.%hhx", &bus_id.domain, &bus_id.bus, &bus_id.slot, &bus_id.function); if (num_fields != 4) { @@ -1168,7 +1241,8 @@ int uct_ib_device_is_gid_raw_empty(uint8_t *gid_raw) } ucs_status_t uct_ib_device_query_gid(uct_ib_device_t *dev, uint8_t port_num, - unsigned gid_index, union ibv_gid *gid) + unsigned gid_index, union ibv_gid *gid, + ucs_log_level_t error_level) { uct_ib_device_gid_info_t gid_info; ucs_status_t status; @@ -1180,8 +1254,8 @@ ucs_status_t uct_ib_device_query_gid(uct_ib_device_t *dev, uint8_t port_num, } if (uct_ib_device_is_gid_raw_empty(gid_info.gid.raw)) { - ucs_error("Invalid gid[%d] on %s:%d", gid_index, - uct_ib_device_name(dev), port_num); + ucs_log(error_level, "invalid gid[%d] on %s:%d", gid_index, + uct_ib_device_name(dev), port_num); return UCS_ERR_INVALID_ADDR; } @@ -1329,7 +1403,7 @@ int uct_ib_get_cqe_size(int cqe_size_min) static ucs_status_t uct_ib_device_get_roce_ndev_name(uct_ib_device_t *dev, uint8_t port_num, - char *ndev_name, size_t max) + uint8_t gid_index, char *ndev_name, size_t max) { ssize_t nread; @@ -1338,7 +1412,7 @@ uct_ib_device_get_roce_ndev_name(uct_ib_device_t *dev, uint8_t port_num, /* get the network device name which corresponds to a RoCE port */ nread = ucs_read_file_str(ndev_name, max, 1, UCT_IB_DEVICE_SYSFS_GID_NDEV_FMT, - uct_ib_device_name(dev), port_num, 0); + uct_ib_device_name(dev), port_num, gid_index); if (nread < 0) { ucs_diag("failed to read " UCT_IB_DEVICE_SYSFS_GID_NDEV_FMT": %m", uct_ib_device_name(dev), port_num, 0); @@ -1349,14 +1423,15 @@ uct_ib_device_get_roce_ndev_name(uct_ib_device_t *dev, uint8_t port_num, return UCS_OK; } -unsigned uct_ib_device_get_roce_lag_level(uct_ib_device_t *dev, uint8_t port_num) +unsigned uct_ib_device_get_roce_lag_level(uct_ib_device_t *dev, uint8_t port_num, + uint8_t gid_index) { char ndev_name[IFNAMSIZ]; unsigned roce_lag_level; ucs_status_t status; - status = uct_ib_device_get_roce_ndev_name(dev, port_num, ndev_name, - sizeof(ndev_name)); + status = uct_ib_device_get_roce_ndev_name(dev, port_num, gid_index, + ndev_name, sizeof(ndev_name)); if (status != UCS_OK) { return 1; } diff --git 
a/src/uct/ib/base/ib_device.h b/src/uct/ib/base/ib_device.h index d11b358cb5e..274852ffe46 100644 --- a/src/uct/ib/base/ib_device.h +++ b/src/uct/ib/base/ib_device.h @@ -23,6 +23,7 @@ #define UCT_IB_QPN_ORDER 24 /* How many bits can be an IB QP number */ +#define UCT_IB_UIDX_SHIFT 8 /* BE uidx shift */ #define UCT_IB_LRH_LEN 8 /* IB Local routing header */ #define UCT_IB_GRH_LEN 40 /* IB GLobal routing header */ #define UCT_IB_BTH_LEN 12 /* IB base transport header */ @@ -50,6 +51,9 @@ #define UCT_IB_SITE_LOCAL_MASK be64toh(0xffffffffffff0000ul) /* IBTA 4.1.1 12b */ #define UCT_IB_DEFAULT_ROCEV2_DSCP 106 /* Default DSCP for RoCE v2 */ #define UCT_IB_ROCE_UDP_SRC_PORT_BASE 0xC000 +#define UCT_IB_CQE_SL_PKTYPE_MASK 0x7 /* SL for IB or packet type + (GRH/IPv4/IPv6) for RoCE in the + CQE */ #define UCT_IB_DEVICE_SYSFS_PFX "/sys/class/infiniband/%s" #define UCT_IB_DEVICE_SYSFS_FMT UCT_IB_DEVICE_SYSFS_PFX "/device/%s" #define UCT_IB_DEVICE_SYSFS_GID_ATTR_PFX UCT_IB_DEVICE_SYSFS_PFX "/ports/%d/gid_attrs" @@ -176,7 +180,7 @@ typedef struct uct_ib_async_event { * IB async event waiting context. */ typedef struct uct_ib_async_event_wait { - void (*cb)(struct uct_ib_async_event_wait*); /* Callback */ + ucs_callback_t cb; /* Callback */ ucs_callbackq_t *cbq; /* Async queue for callback */ int cb_id; /* Scheduled callback ID */ } uct_ib_async_event_wait_t; @@ -217,6 +221,7 @@ typedef struct uct_ib_device { uint8_t pci_fadd_arg_sizes; uint8_t pci_cswap_arg_sizes; uint8_t atomic_align; + uint8_t lag_level; /* AH hash */ khash_t(uct_ib_ah) ah_hash; ucs_recursive_spinlock_t ah_lock; @@ -367,7 +372,8 @@ ucs_status_t uct_ib_device_create_ah_cached(uct_ib_device_t *dev, void uct_ib_device_cleanup_ah_cached(uct_ib_device_t *dev); unsigned uct_ib_device_get_roce_lag_level(uct_ib_device_t *dev, - uint8_t port_num); + uint8_t port_num, + uint8_t gid_index); static inline struct ibv_port_attr* @@ -387,7 +393,8 @@ const char *uct_ib_roce_version_str(uct_ib_roce_version_t roce_ver); const char *uct_ib_gid_str(const union ibv_gid *gid, char *str, size_t max_size); ucs_status_t uct_ib_device_query_gid(uct_ib_device_t *dev, uint8_t port_num, - unsigned gid_index, union ibv_gid *gid); + unsigned gid_index, union ibv_gid *gid, + ucs_log_level_t error_level); ucs_status_t uct_ib_device_query_gid_info(struct ibv_context *ctx, const char *dev_name, uint8_t port_num, unsigned gid_index, diff --git a/src/uct/ib/base/ib_iface.c b/src/uct/ib/base/ib_iface.c index 078adeb4542..287d7742cbe 100644 --- a/src/uct/ib/base/ib_iface.c +++ b/src/uct/ib/base/ib_iface.c @@ -1,5 +1,7 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. +* Copyright (C) 2021 Broadcom. ALL RIGHTS RESERVED. The term “Broadcom” +* refers to Broadcom Inc. and/or its subsidiaries. * * See file LICENSE for terms. 
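A toy illustration of the proxy removal behind the uct_ib_async_event_wait_t change above: once the waiter stores its handler in the queue's own callback type, it can be registered directly instead of going through a one-shot trampoline that re-dispatches it. The single-slot queue here is a deliberately minimal stand-in, not the ucs_callbackq API.

```c
#include <stdio.h>

typedef unsigned (*callback_t)(void *arg);

typedef struct {
    callback_t cb;
    void      *arg;
} cbq_elem_t;

static cbq_elem_t queue; /* toy "queue" holding a single element */

static void cbq_add(callback_t cb, void *arg)
{
    queue.cb  = cb;
    queue.arg = arg;
}

typedef struct {
    callback_t cb;    /* waiter callback, same signature as the queue's */
    int        cb_id;
} event_wait_t;

static unsigned waiter_handler(void *arg)
{
    event_wait_t *wait_ctx = arg;
    printf("event dispatched, cb_id=%d\n", wait_ctx->cb_id);
    return 1;
}

int main(void)
{
    event_wait_t wait = { .cb = waiter_handler, .cb_id = 42 };

    cbq_add(wait.cb, &wait); /* no proxy: register the handler directly */
    queue.cb(queue.arg);
    return 0;
}
```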
*/ @@ -16,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -89,7 +92,7 @@ ucs_config_field_t uct_ib_iface_config_table[] = { "enough, such as of atomic operations and small reads, will be received inline.", ucs_offsetof(uct_ib_iface_config_t, inl[UCT_IB_DIR_TX]), UCS_CONFIG_TYPE_MEMUNITS}, - {"TX_MIN_SGE", "3", + {"TX_MIN_SGE", "4", "Number of SG entries to reserve in the send WQE.", ucs_offsetof(uct_ib_iface_config_t, tx.min_sge), UCS_CONFIG_TYPE_UINT}, @@ -148,9 +151,10 @@ ucs_config_field_t uct_ib_iface_config_table[] = { "Force interface to use global routing.", ucs_offsetof(uct_ib_iface_config_t, is_global), UCS_CONFIG_TYPE_BOOL}, - {"SL", "0", - "IB Service Level / RoCEv2 Ethernet Priority.\n", - ucs_offsetof(uct_ib_iface_config_t, sl), UCS_CONFIG_TYPE_UINT}, + {"SL", "auto", + "InfiniBand: Service level. 'auto' will select a value matching UCX_IB_AR configuration.\n" + "RoCEv2: Ethernet Priority. 'auto' will select 0 by default.", + ucs_offsetof(uct_ib_iface_config_t, sl), UCS_CONFIG_TYPE_ULUNITS}, {"TRAFFIC_CLASS", "auto", "IB Traffic Class / RoCEv2 Differentiated Services Code Point (DSCP).\n" @@ -221,8 +225,11 @@ static void uct_ib_iface_recv_desc_init(uct_iface_h tl_iface, void *obj, uct_mem ucs_status_t uct_ib_iface_recv_mpool_init(uct_ib_iface_t *iface, const uct_ib_iface_config_t *config, + const uct_iface_params_t *params, const char *name, ucs_mpool_t *mp) { + size_t align_offset, alignment; + ucs_status_t status; unsigned grow; if (config->rx.queue_len < 1024) { @@ -233,13 +240,24 @@ ucs_status_t uct_ib_iface_recv_mpool_init(uct_ib_iface_t *iface, config->rx.mp.max_bufs); } + /* Preserve the default alignment by UCT header if user does not request + * specific alignment. + * TODO: Analyze how to keep UCT header aligned by cache line even when + * user requested specific alignment for payload. 
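A sketch of the alignment bookkeeping the new receive-pool code delegates to uct_iface_param_am_alignment(): when the user requests nothing, the UCT header keeps its historic cache-line alignment; when a payload alignment is requested, the payload offset becomes the aligned point instead. The helper below is hypothetical and only shows the arithmetic.

```c
#include <stdio.h>

#define CACHE_LINE 64

static void select_alignment(size_t user_alignment, /* 0 = not requested */
                             size_t hdr_offset,     /* start of UCT header */
                             size_t payload_offset, /* start of payload */
                             size_t *alignment, size_t *align_offset)
{
    if (user_alignment == 0) {
        /* default: align the UCT header on a cache line */
        *alignment    = CACHE_LINE;
        *align_offset = hdr_offset;
    } else {
        /* user request: align the payload instead */
        *alignment    = user_alignment;
        *align_offset = payload_offset;
    }
}

int main(void)
{
    size_t alignment, align_offset;

    select_alignment(0, 40, 104, &alignment, &align_offset);
    printf("default: align %zu at offset %zu\n", alignment, align_offset);

    select_alignment(128, 40, 104, &alignment, &align_offset);
    printf("payload: align %zu at offset %zu\n", alignment, align_offset);
    return 0;
}
```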
+ */ + status = uct_iface_param_am_alignment(params, iface->config.seg_size, + iface->config.rx_hdr_offset, + iface->config.rx_payload_offset, + &alignment, &align_offset); + if (status != UCS_OK) { + return status; + } + return uct_iface_mpool_init(&iface->super, mp, - iface->config.rx_payload_offset + iface->config.seg_size, - iface->config.rx_hdr_offset, - UCS_SYS_CACHE_LINE_SIZE, - &config->rx.mp, grow, - uct_ib_iface_recv_desc_init, - name); + iface->config.rx_payload_offset + + iface->config.seg_size, + align_offset, alignment, &config->rx.mp, grow, + uct_ib_iface_recv_desc_init, name); } void uct_ib_iface_release_desc(uct_recv_desc_t *self, void *desc) @@ -317,6 +335,7 @@ void uct_ib_address_pack(const uct_ib_address_pack_params_t *params, uct_ib_address_t *ib_addr) { void *ptr = ib_addr + 1; + union ibv_gid *gid; if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) { /* RoCE, in this case we don't use the lid, we pack the gid, the RoCE @@ -330,19 +349,18 @@ void uct_ib_address_pack(const uct_ib_address_pack_params_t *params, } /* uint8_t raw[16]; */ - memcpy(ptr, params->gid.raw, sizeof(params->gid.raw)); - ptr = UCS_PTR_TYPE_OFFSET(ptr, params->gid.raw); + gid = ucs_serialize_next(&ptr, union ibv_gid); + memcpy(gid->raw, params->gid.raw, sizeof(params->gid.raw)); } else { /* IB, LID */ - ib_addr->flags = 0; - *(uint16_t*)ptr = params->lid; - ptr = UCS_PTR_TYPE_OFFSET(ptr, uint16_t); + ib_addr->flags = 0; + *ucs_serialize_next(&ptr, uint16_t) = params->lid; if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID) { /* Pack GUID */ - ib_addr->flags |= UCT_IB_ADDRESS_FLAG_IF_ID; - *(uint64_t*) ptr = params->gid.global.interface_id; - ptr = UCS_PTR_TYPE_OFFSET(ptr, uint64_t); + ib_addr->flags |= UCT_IB_ADDRESS_FLAG_IF_ID; + *ucs_serialize_next(&ptr, + uint64_t) = params->gid.global.interface_id; } if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX) { @@ -350,13 +368,13 @@ void uct_ib_address_pack(const uct_ib_address_pack_params_t *params, UCT_IB_SITE_LOCAL_PREFIX) { /* Site-local */ ib_addr->flags |= UCT_IB_ADDRESS_FLAG_SUBNET16; - *(uint16_t*)ptr = params->gid.global.subnet_prefix >> 48; - ptr = UCS_PTR_TYPE_OFFSET(ptr, uint16_t); + *ucs_serialize_next(&ptr, uint16_t) = + params->gid.global.subnet_prefix >> 48; } else if (params->gid.global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX) { /* Global */ ib_addr->flags |= UCT_IB_ADDRESS_FLAG_SUBNET64; - *(uint64_t*)ptr = params->gid.global.subnet_prefix; - ptr = UCS_PTR_TYPE_OFFSET(ptr, uint64_t); + *ucs_serialize_next(&ptr, uint64_t) = + params->gid.global.subnet_prefix; } } } @@ -364,19 +382,18 @@ void uct_ib_address_pack(const uct_ib_address_pack_params_t *params, if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) { ucs_assert((int)params->path_mtu < UINT8_MAX); ib_addr->flags |= UCT_IB_ADDRESS_FLAG_PATH_MTU; - *(uint8_t*)ptr = (uint8_t)params->path_mtu; - ptr = UCS_PTR_TYPE_OFFSET(ptr, uint8_t); + *ucs_serialize_next(&ptr, uint8_t) = params->path_mtu; } if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX) { ib_addr->flags |= UCT_IB_ADDRESS_FLAG_GID_INDEX; - *(uint8_t*)ptr = params->gid_index; + *ucs_serialize_next(&ptr, uint8_t) = params->gid_index; } if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) { ucs_assert(params->pkey != UCT_IB_ADDRESS_DEFAULT_PKEY); ib_addr->flags |= UCT_IB_ADDRESS_FLAG_PKEY; - *(uint16_t*)ptr = params->pkey; + *ucs_serialize_next(&ptr, uint16_t) = params->pkey; } } @@ -400,6 +417,10 @@ unsigned uct_ib_iface_address_pack_flags(uct_ib_iface_t *iface) pack_flags |= 
UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX; } + if (iface->config.path_mtu != IBV_MTU_4096) { + pack_flags |= UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU; + } + return pack_flags; } @@ -421,8 +442,8 @@ void uct_ib_iface_address_pack(uct_ib_iface_t *iface, uct_ib_address_t *ib_addr) params.gid = iface->gid_info.gid; params.lid = uct_ib_iface_port_attr(iface)->lid; params.roce_info = iface->gid_info.roce_info; + params.path_mtu = iface->config.path_mtu; /* to suppress gcc 4.3.4 warning */ - params.path_mtu = UCT_IB_ADDRESS_INVALID_PATH_MTU; params.gid_index = UCT_IB_ADDRESS_INVALID_GID_INDEX; params.pkey = iface->pkey; uct_ib_address_pack(¶ms, ib_addr); @@ -434,6 +455,8 @@ void uct_ib_address_unpack(const uct_ib_address_t *ib_addr, const void *ptr = ib_addr + 1; /* silence cppcheck warning */ uct_ib_address_pack_params_t params = {0}; + uint64_t site_local_subnet; + const union ibv_gid *gid; params.gid_index = UCT_IB_ADDRESS_INVALID_GID_INDEX; params.path_mtu = UCT_IB_ADDRESS_INVALID_PATH_MTU; @@ -441,8 +464,8 @@ void uct_ib_address_unpack(const uct_ib_address_t *ib_addr, if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH) { /* uint8_t raw[16]; */ - memcpy(params.gid.raw, ptr, sizeof(params.gid.raw)); - ptr = UCS_PTR_BYTE_OFFSET(ptr, sizeof(params.gid.raw)); + gid = ucs_serialize_next(&ptr, const union ibv_gid); + memcpy(params.gid.raw, gid->raw, sizeof(params.gid.raw)); params.flags |= UCT_IB_ADDRESS_PACK_FLAG_ETH; params.roce_info.addr_family = @@ -458,42 +481,40 @@ void uct_ib_address_unpack(const uct_ib_address_t *ib_addr, /* If the link layer is not ETHERNET, then it is IB and a lid * must be present */ - params.lid = *(const uint16_t*)ptr; - ptr = UCS_PTR_TYPE_OFFSET(ptr, uint16_t); + params.lid = *ucs_serialize_next(&ptr, const uint16_t); if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_IF_ID) { - params.gid.global.interface_id = *(uint64_t*)ptr; - ptr = UCS_PTR_TYPE_OFFSET(ptr, uint64_t); + params.gid.global.interface_id = + *ucs_serialize_next(&ptr, const uint64_t); } if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET16) { + site_local_subnet = *ucs_serialize_next(&ptr, const uint16_t); params.gid.global.subnet_prefix = UCT_IB_SITE_LOCAL_PREFIX | - ((uint64_t)*(uint16_t*)ptr << 48); - ptr = UCS_PTR_TYPE_OFFSET(ptr, uint16_t); + (site_local_subnet << 48); ucs_assert(!(ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64)); } if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64) { - params.gid.global.subnet_prefix = *(uint64_t*)ptr; - ptr = UCS_PTR_TYPE_OFFSET(ptr, uint64_t); - params.flags |= UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX; + params.gid.global.subnet_prefix = + *ucs_serialize_next(&ptr, const uint64_t); + params.flags |= UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX; } } if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_PATH_MTU) { - params.path_mtu = (enum ibv_mtu)*(const uint8_t*)ptr; - ptr = UCS_PTR_TYPE_OFFSET(ptr, const uint8_t); + params.path_mtu = (enum ibv_mtu) * + ucs_serialize_next(&ptr, const uint8_t); params.flags |= UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU; } if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_GID_INDEX) { - params.gid_index = *(const uint8_t*)ptr; - ptr = UCS_PTR_TYPE_OFFSET(ptr, const uint16_t); + params.gid_index = *ucs_serialize_next(&ptr, const uint8_t); params.flags |= UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX; } if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_PKEY) { - params.pkey = *(const uint16_t*)ptr; + params.pkey = *ucs_serialize_next(&ptr, const uint16_t); } /* PKEY is always in params */ params.flags |= UCT_IB_ADDRESS_PACK_FLAG_PKEY; @@ -646,6 +667,8 @@ void 
uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid, memset(ah_attr, 0, sizeof(*ah_attr)); + ucs_assert(iface->config.sl < UCT_IB_SL_NUM); + ah_attr->sl = iface->config.sl; ah_attr->port_num = iface->config.port_num; ah_attr->grh.traffic_class = iface->config.traffic_class; @@ -653,9 +676,8 @@ void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid, if (uct_ib_iface_is_roce(iface)) { ah_attr->dlid = UCT_IB_ROCE_UDP_SRC_PORT_BASE | (iface->config.roce_path_factor * path_index); - /* Workaround rdma-core issue of calling rand() which affects global - * random state in glibc */ - ah_attr->grh.flow_label = 1; + /* Workaround rdma-core flow label to udp sport conversion */ + ah_attr->grh.flow_label = ~(iface->config.roce_path_factor * path_index); } else { /* TODO iface->path_bits should be removed and replaced by path_index */ path_bits = iface->path_bits[path_index % @@ -749,7 +771,7 @@ static ucs_status_t uct_ib_iface_init_pkey(uct_ib_iface_t *iface, /* take only the lower 15 bits for the comparison */ ((pkey & UCT_IB_PKEY_PARTITION_MASK) == config->pkey)) { if (!(pkey & UCT_IB_PKEY_MEMBERSHIP_MASK) && - /* limited PKEY has not yet been found */ + /* limited PKEY has not yet been found */ (lim_pkey == UCT_IB_ADDRESS_INVALID_PKEY)) { lim_pkey_index = pkey_index; lim_pkey = pkey; @@ -936,11 +958,12 @@ ucs_status_t uct_ib_verbs_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir, int preferred_cpu, size_t inl) { uct_ib_device_t *dev = uct_ib_iface_device(iface); + unsigned cq_size = uct_ib_cq_size(iface, init_attr, dir); struct ibv_cq *cq; #if HAVE_DECL_IBV_CREATE_CQ_ATTR_IGNORE_OVERRUN struct ibv_cq_init_attr_ex cq_attr = {}; - cq_attr.cqe = init_attr->cq_len[dir]; + cq_attr.cqe = cq_size; cq_attr.channel = iface->comp_channel; cq_attr.comp_vector = preferred_cpu; if (init_attr->flags & UCT_IB_CQ_IGNORE_OVERRUN) { @@ -949,16 +972,16 @@ ucs_status_t uct_ib_verbs_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir, } cq = ibv_cq_ex_to_cq(ibv_create_cq_ex(dev->ibv_context, &cq_attr)); - if (!cq && (errno == ENOSYS)) + if (!cq && ((errno == EOPNOTSUPP) || (errno == ENOSYS))) #endif { iface->config.max_inl_cqe[dir] = 0; - cq = ibv_create_cq(dev->ibv_context, init_attr->cq_len[dir], NULL, - iface->comp_channel, preferred_cpu); + cq = ibv_create_cq(dev->ibv_context, cq_size, NULL, iface->comp_channel, + preferred_cpu); } if (!cq) { - ucs_error("ibv_create_cq(cqe=%d) failed: %m", init_attr->cq_len[dir]); + ucs_error("ibv_create_cq(cqe=%d) failed: %m", cq_size); return UCS_ERR_IO_ERROR; } @@ -1019,7 +1042,7 @@ uct_ib_iface_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir, out_unsetenv: #if HAVE_DECL_IBV_EXP_SETENV && !HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE - iface->config.max_inl_cqe[dir] = cqe_size / 2; + iface->config.max_inl_cqe[dir] = (inl > 0) ? 
(cqe_size / 2) : 0; if (env_var_added) { /* if we created a new environment variable, remove it */ ret = ibv_exp_unsetenv(dev->ibv_context, cqe_size_env_var); @@ -1081,8 +1104,12 @@ static void uct_ib_iface_set_num_paths(uct_ib_iface_t *iface, if (config->num_paths == UCS_ULUNITS_AUTO) { if (uct_ib_iface_is_roce(iface)) { /* RoCE - number of paths is RoCE LAG level */ - iface->num_paths = - uct_ib_device_get_roce_lag_level(dev, iface->config.port_num); + if (dev->lag_level == 0) { + iface->num_paths = uct_ib_device_get_roce_lag_level( + dev, iface->config.port_num, iface->gid_info.gid_index); + } else { + iface->num_paths = dev->lag_level; + } } else { /* IB - number of paths is LMC level */ ucs_assert(iface->path_bits_count > 0); @@ -1140,7 +1167,8 @@ static ucs_status_t uct_ib_iface_init_gid_info(uct_ib_iface_t *iface, /* Fill the gid */ status = uct_ib_device_query_gid(uct_ib_iface_device(iface), iface->config.port_num, - gid_info->gid_index, &gid_info->gid); + gid_info->gid_index, &gid_info->gid, + UCS_LOG_LEVEL_ERROR); if (status != UCS_OK) { goto out; } @@ -1173,16 +1201,26 @@ static void uct_ib_iface_set_path_mtu(uct_ib_iface_t *iface, } } -UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, - uct_worker_h worker, const uct_iface_params_t *params, +uint8_t uct_ib_iface_config_select_sl(const uct_ib_iface_config_t *ib_config) +{ + if (ib_config->sl == UCS_ULUNITS_AUTO) { + return 0; + } + + ucs_assert(ib_config->sl < UCT_IB_SL_NUM); + return (uint8_t)ib_config->sl; +} + +UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, + uct_iface_ops_t *tl_ops, uct_md_h md, uct_worker_h worker, + const uct_iface_params_t *params, const uct_ib_iface_config_t *config, const uct_ib_iface_init_attr_t *init_attr) { uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t); uct_ib_device_t *dev = &ib_md->dev; - size_t rx_headroom = (params->field_mask & - UCT_IFACE_PARAM_FIELD_RX_HEADROOM) ? - params->rx_headroom : 0; + size_t rx_headroom = UCT_IFACE_PARAM_VALUE(params, rx_headroom, + RX_HEADROOM, 0); ucs_cpu_set_t cpu_mask; int preferred_cpu; ucs_status_t status; @@ -1200,16 +1238,17 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, preferred_cpu = ucs_cpu_set_find_lcs(&cpu_mask); - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &ops->super, md, worker, - params, &config->super - UCS_STATS_ARG(((params->field_mask & - UCT_IFACE_PARAM_FIELD_STATS_ROOT) && - (params->stats_root != NULL)) ? - params->stats_root : - dev->stats) - UCS_STATS_ARG(params->mode.device.dev_name)); - - status = uct_ib_device_find_port(dev, params->mode.device.dev_name, &port_num); + UCS_CLASS_CALL_SUPER_INIT( + uct_base_iface_t, tl_ops, &uct_base_iface_internal_ops, md, worker, params, + &config->super UCS_STATS_ARG( + ((params->field_mask & UCT_IFACE_PARAM_FIELD_STATS_ROOT) && + (params->stats_root != NULL)) ? 
+ params->stats_root : + dev->stats) + UCS_STATS_ARG(params->mode.device.dev_name)); + + status = uct_ib_device_find_port(dev, params->mode.device.dev_name, + &port_num); if (status != UCS_OK) { goto err; } @@ -1232,7 +1271,8 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, self->config.rx_max_batch = ucs_min(config->rx.max_batch, config->rx.queue_len / 4); self->config.port_num = port_num; - self->config.sl = config->sl; + /* initialize to invalid value */ + self->config.sl = UCT_IB_SL_NUM; self->config.hop_limit = config->hop_limit; self->release_desc.cb = uct_ib_iface_release_desc; self->config.enable_res_domain = config->enable_res_domain; @@ -1446,35 +1486,34 @@ static ucs_status_t uct_ib_iface_get_numa_latency(uct_ib_iface_t *iface, ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len, uct_iface_attr_t *iface_attr) { - uct_ib_device_t *dev = uct_ib_iface_device(iface); - uct_ib_md_t *md = uct_ib_iface_md(iface); - static const unsigned ib_port_widths[] = { - [0] = 1, - [1] = 4, - [2] = 8, - [3] = 12, - [4] = 16 - }; - uint8_t active_width, active_speed, active_mtu, width_idx; + static const uint8_t ib_port_widths[] = + {[1] = 1, [2] = 4, [4] = 8, [8] = 12, [16] = 2}; + uct_ib_device_t *dev = uct_ib_iface_device(iface); + uct_ib_md_t *md = uct_ib_iface_md(iface); + uint8_t active_width, active_speed, active_mtu, width; double encoding, signal_rate, wire_speed; - size_t mtu, width, extra_pkt_len; + size_t mtu, extra_pkt_len; ucs_status_t status; double numa_latency; uct_base_iface_query(&iface->super, iface_attr); - + active_width = uct_ib_iface_port_attr(iface)->active_width; active_speed = uct_ib_iface_port_attr(iface)->active_speed; active_mtu = uct_ib_iface_port_attr(iface)->active_mtu; - /* Get active width */ - width_idx = ucs_ilog2(active_width); - if (!ucs_is_pow2(active_width) || - (active_width < 1) || (width_idx > 4)) - { - ucs_error("Invalid active_width on %s:%d: %d", - UCT_IB_IFACE_ARG(iface), active_width); - return UCS_ERR_IO_ERROR; + /* + * Parse active width. 
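The new ib_port_widths table above decodes the active_width bitfield (1x=0x1, 4x=0x2, 8x=0x4, 12x=0x8, 2x=0x10) as a sparse, bit-indexed lookup. A self-contained version of that decode, including the 1x fallback that replaces the old hard error, might look like this (values per the table in the diff):

```c
#include <stdio.h>
#include <stdint.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static unsigned ib_width_to_lanes(uint8_t active_width)
{
    static const uint8_t widths[] = {[1] = 1, [2] = 4, [4] = 8,
                                     [8] = 12, [16] = 2};

    if ((active_width >= ARRAY_SIZE(widths)) || (widths[active_width] == 0)) {
        return 1; /* unknown encoding: assume 1x, as the warning path does */
    }
    return widths[active_width];
}

int main(void)
{
    printf("0x2  -> %ux\n", ib_width_to_lanes(0x2));  /* 4x */
    printf("0x10 -> %ux\n", ib_width_to_lanes(0x10)); /* 2x */
    printf("0x20 -> %ux\n", ib_width_to_lanes(0x20)); /* fallback: 1x */
    return 0;
}
```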
+ * See IBTA section 14.2.5.6 "PortInfo", Table 164, field "LinkWidthEnabled" + */ + if ((active_width >= ucs_static_array_size(ib_port_widths)) || + (ib_port_widths[active_width] == 0)) { + ucs_warn("invalid active width on " UCT_IB_IFACE_FMT ": %d, " + "assuming 1x", + UCT_IB_IFACE_ARG(iface), active_width); + width = 1; + } else { + width = ib_port_widths[active_width]; } iface_attr->device_addr_len = iface->addr_size; @@ -1518,7 +1557,7 @@ ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len, signal_rate = 25.78125e9; encoding = 64.0/66.0; break; - case 64: /* 50g Eth */ + case 64: /* HDR / HDR100 / 50g Eth */ iface_attr->latency.c = 600e-9; signal_rate = 25.78125e9 * 2; encoding = 64.0/66.0; @@ -1538,12 +1577,11 @@ ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len, iface_attr->latency.m = 0; /* Wire speed calculation: Width * SignalRate * Encoding */ - width = ib_port_widths[width_idx]; - wire_speed = (width * signal_rate * encoding) / 8.0; + wire_speed = (width * signal_rate * encoding) / 8.0; /* Calculate packet overhead */ - mtu = ucs_min(uct_ib_mtu_value((enum ibv_mtu)active_mtu), - iface->config.seg_size); + mtu = ucs_min(uct_ib_mtu_value((enum ibv_mtu)active_mtu), + iface->config.seg_size); extra_pkt_len = UCT_IB_BTH_LEN + xport_hdr_len + UCT_IB_ICRC_LEN + UCT_IB_VCRC_LEN + UCT_IB_DELIM_LEN; diff --git a/src/uct/ib/base/ib_iface.h b/src/uct/ib/base/ib_iface.h index 359f45947dc..a0fc213fd61 100644 --- a/src/uct/ib/base/ib_iface.h +++ b/src/uct/ib/base/ib_iface.h @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -16,6 +16,8 @@ #include #include #include +#include + #define UCT_IB_MAX_IOV 8UL #define UCT_IB_IFACE_NULL_RES_DOMAIN_KEY 0u @@ -24,6 +26,7 @@ #define UCT_IB_ADDRESS_INVALID_PATH_MTU ((enum ibv_mtu)0) #define UCT_IB_ADDRESS_INVALID_PKEY 0 #define UCT_IB_ADDRESS_DEFAULT_PKEY 0xffff +#define UCT_IB_SL_NUM 16 /* Forward declarations */ typedef struct uct_ib_iface_config uct_ib_iface_config_t; @@ -59,6 +62,8 @@ enum { UCT_IB_QPT_DCI = IBV_EXP_QPT_DC_INI, #elif HAVE_DC_DV UCT_IB_QPT_DCI = IBV_QPT_DRIVER, +#else + UCT_IB_QPT_DCI = UCT_IB_QPT_UNKNOWN, #endif }; @@ -136,8 +141,8 @@ struct uct_ib_iface_config { /* Force global routing */ int is_global; - /* IB SL to use */ - unsigned sl; + /* IB SL to use (default: AUTO) */ + unsigned long sl; /* IB Traffic Class to use */ unsigned long traffic_class; @@ -167,7 +172,12 @@ struct uct_ib_iface_config { enum { UCT_IB_CQ_IGNORE_OVERRUN = UCS_BIT(0), - UCT_IB_TM_SUPPORTED = UCS_BIT(1) + UCT_IB_TM_SUPPORTED = UCS_BIT(1), + + /* Indicates that TX cq len in uct_ib_iface_init_attr_t is specified per + * each IB path. Therefore IB interface constructor would need to multiply + * TX CQ len by the number of IB paths (when it is properly initialized). 
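A worked instance of the wire-speed formula used in this query path, bytes/sec = lanes * signal_rate * encoding_efficiency / 8; the 25.78125 GBd rate and the 64/66 encoding factor come from the switch above, with HDR doubling the signal rate.

```c
#include <stdio.h>

int main(void)
{
    double encoding = 64.0 / 66.0;
    double edr      = 4 * 25.78125e9 * encoding / 8.0;       /* EDR 4x */
    double hdr      = 4 * (2 * 25.78125e9) * encoding / 8.0; /* HDR 4x */

    printf("EDR 4x: %.2f GB/s\n", edr / 1e9); /* ~12.5 GB/s */
    printf("HDR 4x: %.2f GB/s\n", hdr / 1e9); /* ~25.0 GB/s */
    return 0;
}
```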
*/ + UCT_IB_TX_OPS_PER_PATH = UCS_BIT(2) }; @@ -221,12 +231,11 @@ typedef ucs_status_t (*uct_ib_iface_set_ep_failed_func_t)(uct_ib_iface_t *iface, struct uct_ib_iface_ops { - uct_iface_ops_t super; + uct_iface_internal_ops_t super; uct_ib_iface_create_cq_func_t create_cq; uct_ib_iface_arm_cq_func_t arm_cq; uct_ib_iface_event_cq_func_t event_cq; uct_ib_iface_handle_failure_func_t handle_failure; - uct_ib_iface_set_ep_failed_func_t set_ep_failed; }; @@ -276,8 +285,9 @@ typedef struct uct_ib_fence_info { } uct_ib_fence_info_t; -UCS_CLASS_DECLARE(uct_ib_iface_t, uct_ib_iface_ops_t*, uct_md_h, uct_worker_h, - const uct_iface_params_t*, const uct_ib_iface_config_t*, +UCS_CLASS_DECLARE(uct_ib_iface_t, uct_ib_iface_ops_t*, uct_iface_ops_t*, + uct_md_h, uct_worker_h, const uct_iface_params_t*, + const uct_ib_iface_config_t*, const uct_ib_iface_init_attr_t*); /* @@ -334,6 +344,7 @@ extern const char *uct_ib_mtu_values[]; */ ucs_status_t uct_ib_iface_recv_mpool_init(uct_ib_iface_t *iface, const uct_ib_iface_config_t *config, + const uct_iface_params_t *params, const char *name, ucs_mpool_t *mp); void uct_ib_iface_release_desc(uct_recv_desc_t *self, void *desc); @@ -529,6 +540,8 @@ ucs_status_t uct_ib_iface_create_qp(uct_ib_iface_t *iface, void uct_ib_iface_fill_attr(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr); +uint8_t uct_ib_iface_config_select_sl(const uct_ib_iface_config_t *ib_config); + #define UCT_IB_IFACE_FMT \ "%s:%d" @@ -575,7 +588,7 @@ size_t uct_ib_verbs_sge_fill_iov(struct ibv_sge *sge, const uct_iov_t *iov, continue; /* to avoid zero length elements in sge */ } - if (iov[sge_it].memh == UCT_MEM_HANDLE_NULL) { + if (iov[iov_it].memh == UCT_MEM_HANDLE_NULL) { sge[sge_it].lkey = 0; } else { sge[sge_it].lkey = uct_ib_memh_get_lkey(iov[iov_it].memh); @@ -598,18 +611,16 @@ uct_ib_fence_info_init(uct_ib_fence_info_t* fence) fence->fence_beat = 0; } -static UCS_F_ALWAYS_INLINE -ucs_log_level_t uct_ib_iface_failure_log_level(uct_ib_iface_t *ib_iface, - ucs_status_t err_handler_status, - ucs_status_t status) +static UCS_F_ALWAYS_INLINE unsigned +uct_ib_cq_size(uct_ib_iface_t *iface, const uct_ib_iface_init_attr_t *init_attr, + uct_ib_dir_t dir) { - if (err_handler_status != UCS_OK) { - return UCS_LOG_LEVEL_FATAL; - } else if ((status == UCS_ERR_ENDPOINT_TIMEOUT) || - (status == UCS_ERR_CONNECTION_RESET)) { - return ib_iface->super.config.failure_level; + if (dir == UCT_IB_DIR_RX) { + return init_attr->cq_len[UCT_IB_DIR_RX]; + } else if (init_attr->flags & UCT_IB_TX_OPS_PER_PATH) { + return init_attr->cq_len[UCT_IB_DIR_TX] * iface->num_paths; } else { - return UCS_LOG_LEVEL_ERROR; + return init_attr->cq_len[UCT_IB_DIR_TX]; } } diff --git a/src/uct/ib/base/ib_log.c b/src/uct/ib/base/ib_log.c index d79c6ebb1b7..d9620f04951 100644 --- a/src/uct/ib/base/ib_log.c +++ b/src/uct/ib/base/ib_log.c @@ -43,11 +43,11 @@ void uct_ib_log_dump_sg_list(uct_ib_iface_t *iface, uct_am_trace_type_t type, struct ibv_sge *sg_list, int num_sge, uint64_t inline_bitmap, uct_log_data_dump_func_t data_dump, - char *buf, size_t max) + int data_dump_sge, char *buf, size_t max) { char data[256]; size_t total_len = 0; - size_t total_valid_len = 0;; + size_t total_valid_len = 0; char *s = buf; char *ends = buf + max; void *md = data; @@ -64,7 +64,7 @@ void uct_ib_log_dump_sg_list(uct_ib_iface_t *iface, uct_am_trace_type_t type, s += strlen(s); - if (data_dump) { + if ((i < data_dump_sge) && data_dump) { len = ucs_min(sg_list[i].length, UCS_PTR_BYTE_DIFF(md, data) + sizeof(data)); memcpy(md, (void*)sg_list[i].addr, len); @@ 
-213,9 +213,9 @@ static void uct_ib_dump_send_wr(uct_ib_iface_t *iface, struct ibv_qp *qp, s += strlen(s); uct_ib_log_dump_sg_list(iface, UCT_AM_TRACE_TYPE_SEND, wr->sg_list, - ucs_min(wr->num_sge, max_sge), + wr->num_sge, (wr->send_flags & IBV_SEND_INLINE) ? -1 : 0, - data_dump, s, ends - s); + data_dump, max_sge, s, ends - s); } void __uct_ib_log_post_send(const char *file, int line, const char *function, @@ -325,9 +325,9 @@ static void uct_ib_dump_exp_send_wr(uct_ib_iface_t *iface, struct ibv_qp *qp, #endif uct_ib_log_dump_sg_list(iface, UCT_AM_TRACE_TYPE_SEND, wr->sg_list, - ucs_min(wr->num_sge, max_sge), + wr->num_sge, (wr->exp_send_flags & IBV_EXP_SEND_INLINE) ? -1 : 0, - data_dump_cb, s, ends - s); + data_dump_cb, max_sge, s, ends - s); } void __uct_ib_log_exp_post_send(const char *file, int line, const char *function, diff --git a/src/uct/ib/base/ib_log.h b/src/uct/ib/base/ib_log.h index bf7e466be3e..5cc5b46c047 100644 --- a/src/uct/ib/base/ib_log.h +++ b/src/uct/ib/base/ib_log.h @@ -37,7 +37,7 @@ void uct_ib_log_dump_sg_list(uct_ib_iface_t *iface, uct_am_trace_type_t type, struct ibv_sge *sg_list, int num_sge, uint64_t inline_bitmap, uct_log_data_dump_func_t data_dump, - char *buf, size_t max); + int data_dump_sge, char *buf, size_t max); void uct_ib_log_dump_remote_addr(uint64_t remote_addr, uint32_t rkey, char *buf, size_t max); diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index e33119fa9a4..6ee37a1342e 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -32,14 +32,14 @@ #define UCT_IB_MD_RCACHE_DEFAULT_ALIGN 16 typedef struct uct_ib_md_pci_info { - double bw; /* bandwidth */ - uint16_t payload; /* payload used to data transfer */ - uint16_t overhead; /* PHY + data link layer + header + *CRC* */ - uint16_t nack; /* number of TLC before ACK */ - uint16_t ctrl; /* length of control TLP */ - uint16_t encoding; /* number of bits in symbol encoded, 8 - gen 1/2, 128 - gen 3 */ - uint16_t decoding; /* number of bits in symbol decoded, 10 - gen 1/2, 130 - gen 3 */ - const char *name; /* name of PCI generation */ + double bw_gbps; /* link speed */ + uint16_t payload; /* payload used to data transfer */ + uint16_t tlp_overhead; /* PHY + data link layer + header + *CRC* */ + uint16_t ctrl_ratio; /* number of TLC before ACK */ + uint16_t ctrl_overhead; /* length of control TLP */ + uint16_t encoding; /* number of encoded symbol bits */ + uint16_t decoding; /* number of decoded symbol bits */ + const char *name; /* name of PCI generation */ } uct_ib_md_pci_info_t; static UCS_CONFIG_DEFINE_ARRAY(pci_bw, @@ -51,6 +51,7 @@ static const char *uct_ib_devx_objs[] = { [UCT_IB_DEVX_OBJ_RCSRQ] = "rcsrq", [UCT_IB_DEVX_OBJ_DCT] = "dct", [UCT_IB_DEVX_OBJ_DCSRQ] = "dcsrq", + [UCT_IB_DEVX_OBJ_DCI] = "dci", NULL }; @@ -166,7 +167,7 @@ static ucs_config_field_t uct_ib_md_config_table[] = { "DEVX support\n", ucs_offsetof(uct_ib_md_config_t, devx), UCS_CONFIG_TYPE_TERNARY}, - {"MLX5_DEVX_OBJECTS", "rcqp,rcsrq,dct,dcsrq", + {"MLX5_DEVX_OBJECTS", "rcqp,rcsrq,dct,dcsrq,dci", "Objects to be created by DevX\n", ucs_offsetof(uct_ib_md_config_t, devx_objs), UCS_CONFIG_TYPE_BITMAP(uct_ib_devx_objs)}, @@ -204,36 +205,65 @@ static ucs_stats_class_t uct_ib_md_stats_class = { }; #endif +/* + * - TLP (Transaction Layer Packet) overhead calculations (no ECRC): + * Gen1/2: + * Start SeqNum Hdr_64bit LCRC End + * 1 + 2 + 16 + 4 + 1 = 24 + * + * Gen3/4: + * Start SeqNum Hdr_64bit LCRC + * 4 + 2 + 16 + 4 = 26 + * + * - DLLP (Data Link Layer Packet) overhead calculations: + * - Control packet 
8b ACK + 8b flow control + * - ACK/FC ratio: 1 per 4 TLPs + * + * References: + * [1] https://www.xilinx.com/support/documentation/white_papers/wp350.pdf + * [2] https://xdevs.com/doc/Standards/PCI/PCI_Express_Base_4.0_Rev0.3_February19-2014.pdf + * [3] https://www.nxp.com/docs/en/application-note/AN3935.pdf + */ static const uct_ib_md_pci_info_t uct_ib_md_pci_info[] = { - { /* GEN 1 */ - .bw = 2.5 * UCS_GBYTE / 8, - .payload = 512, - .overhead = 28, - .nack = 5, - .ctrl = 256, - .encoding = 8, - .decoding = 10, - .name = "gen1" + { + .name = "gen1", + .bw_gbps = 2.5, + .payload = 256, + .tlp_overhead = 24, + .ctrl_ratio = 4, + .ctrl_overhead = 16, + .encoding = 8, + .decoding = 10 }, - { /* GEN 2 */ - .bw = 5.0 * UCS_GBYTE / 8, - .payload = 512, - .overhead = 28, - .nack = 5, - .ctrl = 256, - .encoding = 8, - .decoding = 10, - .name = "gen2" + { + .name = "gen2", + .bw_gbps = 5, + .payload = 256, + .tlp_overhead = 24, + .ctrl_ratio = 4, + .ctrl_overhead = 16, + .encoding = 8, + .decoding = 10 + }, + { + .name = "gen3", + .bw_gbps = 8, + .payload = 256, + .tlp_overhead = 26, + .ctrl_ratio = 4, + .ctrl_overhead = 16, + .encoding = 128, + .decoding = 130 }, - { /* GEN 3 */ - .bw = 8.0 * UCS_GBYTE / 8, - .payload = 512, - .overhead = 30, - .nack = 5, - .ctrl = 256, - .encoding = 128, - .decoding = 130, - .name = "gen3" + { + .name = "gen4", + .bw_gbps = 16, + .payload = 256, + .tlp_overhead = 26, + .ctrl_ratio = 4, + .ctrl_overhead = 16, + .encoding = 128, + .decoding = 130 }, }; @@ -279,7 +309,8 @@ static ucs_status_t uct_ib_md_query(uct_md_h uct_md, uct_md_attr_t *md_attr) UCT_MD_FLAG_NEED_MEMH | UCT_MD_FLAG_NEED_RKEY | UCT_MD_FLAG_ADVISE; - md_attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); + md_attr->cap.alloc_mem_types = 0; md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); md_attr->cap.detect_mem_types = 0; @@ -314,13 +345,11 @@ static void uct_ib_md_print_mem_reg_err_msg(void *address, size_t length, int silent) { ucs_log_level_t level = silent ? 
UCS_LOG_LEVEL_DEBUG : UCS_LOG_LEVEL_ERROR; - ucs_string_buffer_t msg; + UCS_STRING_BUFFER_ONSTACK(msg, 256); struct rlimit limit_info; size_t page_size; size_t unused; - ucs_string_buffer_init(&msg); - ucs_string_buffer_appendf(&msg, "%s(address=%p, length=%zu, access=0x%lx) failed: %m", ibv_reg_mr_func_name, address, length, access_flags); @@ -347,7 +376,6 @@ static void uct_ib_md_print_mem_reg_err_msg(void *address, size_t length, } ucs_log(level, "%s", ucs_string_buffer_cstr(&msg)); - ucs_string_buffer_cleanup(&msg); } void *uct_ib_md_mem_handle_thread_func(void *arg) @@ -509,7 +537,7 @@ static ucs_status_t uct_ib_md_reg_mr(uct_ib_md_t *md, void *address, } return status; - } /* if unsuported - fallback to regular registration */ + } /* if unsupported - fallback to regular registration */ } return md->ops->reg_key(md, address, length, access_flags, memh, mr_type, @@ -520,6 +548,7 @@ ucs_status_t uct_ib_reg_mr(struct ibv_pd *pd, void *addr, size_t length, uint64_t access_flags, struct ibv_mr **mr_p, int silent) { + ucs_time_t start_time = ucs_get_time(); struct ibv_mr *mr; #if HAVE_DECL_IBV_EXP_REG_MR struct ibv_exp_reg_mr_in in = {}; @@ -539,6 +568,11 @@ ucs_status_t uct_ib_reg_mr(struct ibv_pd *pd, void *addr, size_t length, } *mr_p = mr; + + /* to prevent clang dead code */ + (void)start_time; + ucs_trace("ibv_reg_mr(%p, %p, %zu) took %.3f msec", pd, addr, length, + ucs_time_to_msec(ucs_get_time() - start_time)); return UCS_OK; } @@ -803,6 +837,8 @@ static ucs_status_t uct_ib_mem_reg(uct_md_h uct_md, void *address, size_t length memh = uct_ib_memh_alloc(md); if (memh == NULL) { + uct_md_log_mem_reg_error(flags, + "md %p: failed to allocate memory handle", md); return UCS_ERR_NO_MEMORY; } @@ -1000,13 +1036,14 @@ static ucs_status_t uct_ib_mem_rcache_dereg(uct_md_h uct_md, uct_mem_h memh) } static uct_md_ops_t uct_ib_md_rcache_ops = { - .close = uct_ib_md_close, - .query = uct_ib_md_query, - .mem_reg = uct_ib_mem_rcache_reg, - .mem_dereg = uct_ib_mem_rcache_dereg, - .mem_advise = uct_ib_mem_advise, - .mkey_pack = uct_ib_mkey_pack, - .detect_memory_type = ucs_empty_function_return_unsupported, + .close = uct_ib_md_close, + .query = uct_ib_md_query, + .mem_reg = uct_ib_mem_rcache_reg, + .mem_dereg = uct_ib_mem_rcache_dereg, + .mem_advise = uct_ib_mem_advise, + .mkey_pack = uct_ib_mkey_pack, + .is_sockaddr_accessible = ucs_empty_function_return_zero_int, + .detect_memory_type = ucs_empty_function_return_unsupported, }; static ucs_status_t uct_ib_rcache_mem_reg_cb(void *context, ucs_rcache_t *rcache, @@ -1229,15 +1266,14 @@ uct_ib_md_parse_reg_methods(uct_ib_md_t *md, uct_md_attr_t *md_attr, for (i = 0; i < md_config->reg_methods.count; ++i) { if (!strcasecmp(md_config->reg_methods.rmtd[i], "rcache")) { + uct_md_set_rcache_params(&rcache_params, &md_config->rcache); rcache_params.region_struct_size = sizeof(ucs_rcache_region_t) + md->memh_struct_size; - rcache_params.alignment = md_config->rcache.alignment; rcache_params.max_alignment = ucs_get_page_size(); rcache_params.ucm_events = UCM_EVENT_VM_UNMAPPED; if (md_attr->cap.reg_mem_types & ~UCS_BIT(UCS_MEMORY_TYPE_HOST)) { rcache_params.ucm_events |= UCM_EVENT_MEM_TYPE_FREE; } - rcache_params.ucm_event_priority = md_config->rcache.event_prio; rcache_params.context = md; rcache_params.ops = &uct_ib_rcache_ops; rcache_params.flags = UCS_RCACHE_FLAG_PURGE_ON_FORK; @@ -1396,11 +1432,11 @@ static double uct_ib_md_read_pci_bw(struct ibv_device *ib_device) { const char *pci_width_file_name = "current_link_width"; const char *pci_speed_file_name = 
"current_link_speed"; + double bw_gbps, effective_bw, link_utilization; char pci_width_str[16]; char pci_speed_str[16]; char gts[16]; const uct_ib_md_pci_info_t *p; - double bw, effective_bw; unsigned width; ssize_t len; size_t i; @@ -1431,28 +1467,29 @@ static double uct_ib_md_read_pci_bw(struct ibv_device *ib_device) return DBL_MAX; } - if ((sscanf(pci_speed_str, "%lf%s", &bw, gts) < 2) || + if ((sscanf(pci_speed_str, "%lf%s", &bw_gbps, gts) < 2) || strcasecmp("GT/s", ucs_strtrim(gts))) { ucs_debug("incorrect format of %s file: expected: GT/s, actual: %s\n", pci_speed_file_name, pci_speed_str); return DBL_MAX; } - bw *= UCS_GBYTE / 8; /* gigabit -> gigabyte */ - for (i = 0; i < ucs_static_array_size(uct_ib_md_pci_info); i++) { - if (bw < (uct_ib_md_pci_info[i].bw * 1.2)) { /* use 1.2 multiplex to avoid round issues */ - p = &uct_ib_md_pci_info[i]; /* use pointer to make equation shorter */ - /* coverity[overflow] */ - effective_bw = bw * width * - (p->payload * p->nack) / - (((p->payload + p->overhead) * p->nack) + p->ctrl) * - p->encoding / p->decoding; - ucs_trace("%s: pcie %ux %s, effective throughput %.3lfMB/s (%.3lfGb/s)", - ib_device->name, width, p->name, - (effective_bw / UCS_MBYTE), (effective_bw * 8 / UCS_GBYTE)); - return effective_bw; + p = &uct_ib_md_pci_info[i]; + if ((bw_gbps / p->bw_gbps) > 1.01) { /* floating-point compare */ + continue; } + + link_utilization = (double)(p->payload * p->ctrl_ratio) / + (((p->payload + p->tlp_overhead) * p->ctrl_ratio) + + p->ctrl_overhead); + /* coverity[overflow] */ + effective_bw = (p->bw_gbps * 1e9 / 8.0) * width * + ((double)p->encoding / p->decoding) * link_utilization; + ucs_trace("%s: PCIe %s %ux, effective throughput %.3f MB/s %.3f Gb/s", + ib_device->name, p->name, width, effective_bw / UCS_MBYTE, + effective_bw * 8e-9); + return effective_bw; } return DBL_MAX; @@ -1696,7 +1733,8 @@ static ucs_status_t uct_ib_verbs_md_open(struct ibv_device *ibv_device, dev = &md->dev; dev->ibv_context = ibv_open_device(ibv_device); if (dev->ibv_context == NULL) { - ucs_error("ibv_open_device(%s) failed: %m", ibv_get_device_name(ibv_device)); + ucs_diag("ibv_open_device(%s) failed: %m", + ibv_get_device_name(ibv_device)); status = UCS_ERR_IO_ERROR; goto err; } diff --git a/src/uct/ib/base/ib_md.h b/src/uct/ib/base/ib_md.h index 87182791ffa..98d062620a2 100644 --- a/src/uct/ib/base/ib_md.h +++ b/src/uct/ib/base/ib_md.h @@ -60,7 +60,8 @@ enum { UCT_IB_DEVX_OBJ_RCQP, UCT_IB_DEVX_OBJ_RCSRQ, UCT_IB_DEVX_OBJ_DCT, - UCT_IB_DEVX_OBJ_DCSRQ + UCT_IB_DEVX_OBJ_DCSRQ, + UCT_IB_DEVX_OBJ_DCI }; typedef struct uct_ib_md_ext_config { diff --git a/src/uct/ib/base/ib_verbs.h b/src/uct/ib/base/ib_verbs.h index 09f30c2b61e..83727ddf5df 100644 --- a/src/uct/ib/base/ib_verbs.h +++ b/src/uct/ib/base/ib_verbs.h @@ -1,6 +1,9 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. * Copyright (C) UT-Battelle, LLC. 2014. ALL RIGHTS RESERVED. +* Copyright (C) 2021 Broadcom. ALL RIGHTS RESERVED. The term “Broadcom” +* refers to Broadcom Inc. and/or its subsidiaries. +* * See file LICENSE for terms. 
*/ @@ -234,7 +237,7 @@ static inline int ibv_exp_cq_ignore_overrun(struct ibv_cq *cq) { return 0; } #else static inline int ibv_exp_cq_ignore_overrun(struct ibv_cq *cq) { - errno = ENOSYS; + errno = EOPNOTSUPP; return -1; } #endif /* HAVE_IBV_EXP_CQ_IGNORE_OVERRUN */ @@ -323,8 +326,4 @@ static inline ucs_status_t uct_ib_qp_max_send_sge(struct ibv_qp *qp, return UCS_OK; } -typedef struct uct_ib_qpnum { - uct_ib_uint24_t qp_num; -} uct_ib_qpnum_t; - #endif /* UCT_IB_VERBS_H */ diff --git a/src/uct/ib/cm/Makefile.am b/src/uct/ib/cm/Makefile.am deleted file mode 100644 index bd0c26e986c..00000000000 --- a/src/uct/ib/cm/Makefile.am +++ /dev/null @@ -1,24 +0,0 @@ -# -# Copyright (C) Mellanox Technologies Ltd. 2001-2018. ALL RIGHTS RESERVED. -# See file LICENSE for terms. -# - -if HAVE_TL_CM - -module_LTLIBRARIES = libuct_ib_cm.la -libuct_ib_cm_la_CPPFLAGS = $(BASE_CPPFLAGS) $(IBVERBS_CPPFLAGS) -libuct_ib_cm_la_CFLAGS = $(BASE_CFLAGS) -libuct_ib_cm_la_LIBADD = $(top_builddir)/src/ucs/libucs.la \ - $(top_builddir)/src/uct/ib/libuct_ib.la -libuct_ib_cm_la_LDFLAGS = $(IBVERBS_LDFLAGS) $(IBCM_LIBS) -version-info $(SOVERSION) - -noinst_HEADERS = \ - cm.h - -libuct_ib_cm_la_SOURCES = \ - cm_iface.c \ - cm_ep.c - -include $(top_srcdir)/config/module.am - -endif diff --git a/src/uct/ib/cm/cm.h b/src/uct/ib/cm/cm.h deleted file mode 100644 index 019af5abd71..00000000000 --- a/src/uct/ib/cm/cm.h +++ /dev/null @@ -1,144 +0,0 @@ -/** -* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. -* -* See file LICENSE for terms. -*/ - -#ifndef UCT_IB_CM_H_ -#define UCT_IB_CM_H_ - -#include -#include -#include -#include -#include -#include - - -/** - * IB CM configuration - */ -typedef struct uct_cm_iface_config { - uct_ib_iface_config_t super; - double timeout; - unsigned retry_count; - unsigned max_outstanding; -} uct_cm_iface_config_t; - - -/** - * Outstanding operation - can be either a send or flush request. - */ -typedef struct uct_cm_iface_op { - ucs_queue_elem_t queue; /* queue element */ - int is_id; /* 1: id field is valid. 
0: comp field is valid */ - union { - struct ib_cm_id *id; /* send operation: cm id */ - uct_completion_t *comp; /* flush request: user completion */ - }; -} uct_cm_iface_op_t; - - -/** - * IB CM interface/ - */ -typedef struct uct_cm_iface { - uct_ib_iface_t super; - uint32_t service_id; /* Service ID we're listening to */ - struct ib_cm_device *cmdev; /* CM device */ - struct ib_cm_id *listen_id; /* Listening "socket" */ - ucs_queue_head_t notify_q; /* Notification queue */ - uint32_t num_outstanding; /* Number of outstanding sends */ - uint32_t num_completions; /* Number of completed sends */ - ucs_queue_head_t outstanding_q; /* Outstanding operations queue */ - uct_worker_cb_id_t slow_prog_id; /* Callback id for slowpath progress */ - - struct { - int timeout_ms; - uint32_t max_outstanding; - uint8_t retry_count; - } config; -} uct_cm_iface_t; - - -/** - * CM endpoint - container for destination address - */ -typedef struct uct_cm_ep { - uct_base_ep_t super; - uint16_t dlid; - uint32_t dest_service_id; - union ibv_gid dgid; -} uct_cm_ep_t; - - -/** - * CM network header - */ -typedef struct uct_cm_hdr { - uint8_t am_id; /* Active message ID */ - uint8_t length; /* Payload length */ -} UCS_S_PACKED uct_cm_hdr_t; - - -/** - * CM pending request private data - */ -typedef struct { - uct_pending_req_priv_queue_t base; - uct_cm_ep_t *ep; -} uct_cm_pending_req_priv_t; - - -UCS_CLASS_DECLARE_NEW_FUNC(uct_cm_ep_t, uct_ep_t, const uct_ep_params_t *); -UCS_CLASS_DECLARE_DELETE_FUNC(uct_cm_ep_t, uct_ep_t); - -ucs_status_t uct_cm_ep_connect_to_iface(uct_ep_h ep, const uct_iface_addr_t *iface_addr); -ucs_status_t uct_cm_iface_flush(uct_iface_h tl_iface, unsigned flags, - uct_completion_t *comp); - -ucs_status_t uct_cm_iface_flush_do(uct_cm_iface_t *iface, uct_completion_t *comp); - -ssize_t uct_cm_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_cb, - void *arg, unsigned flags); - -ucs_status_t uct_cm_ep_pending_add(uct_ep_h ep, uct_pending_req_t *req, - unsigned flags); -void uct_cm_ep_pending_purge(uct_ep_h ep, uct_pending_purge_callback_t cb, - void *arg); - -ucs_status_t uct_cm_ep_flush(uct_ep_h tl_ep, unsigned flags, - uct_completion_t *comp); - -static inline int uct_cm_iface_has_tx_resources(uct_cm_iface_t *iface) -{ - return iface->num_outstanding < iface->config.max_outstanding; -} - - -static UCS_F_ALWAYS_INLINE uct_cm_pending_req_priv_t * -uct_cm_pending_req_priv(uct_pending_req_t *req) -{ - return (uct_cm_pending_req_priv_t *)&req->priv; -} - - -#define uct_cm_iface_trace_data(_iface, _type, _hdr, _fmt, ...) \ - uct_iface_trace_am(&(_iface)->super.super, _type, (_hdr)->am_id, \ - (_hdr) + 1, (_hdr)->length, _fmt, ## __VA_ARGS__) - - -#define uct_cm_iface_worker(_iface) \ - ((_iface)->super.super.worker) - - -#define uct_cm_enter(_iface) \ - UCS_ASYNC_BLOCK(uct_cm_iface_worker(_iface)->async); - - -#define uct_cm_leave(_iface) \ - UCS_ASYNC_UNBLOCK(uct_cm_iface_worker(_iface)->async); \ - ucs_async_check_miss(uct_cm_iface_worker(_iface)->async); - - -#endif diff --git a/src/uct/ib/cm/cm_ep.c b/src/uct/ib/cm/cm_ep.c deleted file mode 100644 index 9f2435fd2a5..00000000000 --- a/src/uct/ib/cm/cm_ep.c +++ /dev/null @@ -1,253 +0,0 @@ -/** -* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. -* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. -* Copyright (c) 2009 IBM Corporation. All rights reserved. -* -* See file LICENSE for terms. 
-*/ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "cm.h" - -#include -#include -#include - - -typedef struct uct_cm_iov { - uct_pack_callback_t pack; - const void *arg; - size_t length; -} uct_cm_iov_t; - - -static UCS_CLASS_INIT_FUNC(uct_cm_ep_t, const uct_ep_params_t *params) - -{ - uct_cm_iface_t *iface = ucs_derived_of(params->iface, uct_cm_iface_t); - - UCT_EP_PARAMS_CHECK_DEV_IFACE_ADDRS(params); - UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super.super); - - self->dest_service_id = *(const uint32_t*)params->iface_addr; - return UCS_OK; -} - -static UCS_CLASS_CLEANUP_FUNC(uct_cm_ep_t) -{ - ucs_trace_func(""); -} - -UCS_CLASS_DEFINE(uct_cm_ep_t, uct_base_ep_t); -UCS_CLASS_DEFINE_NEW_FUNC(uct_cm_ep_t, uct_ep_t, const uct_ep_params_t *); -UCS_CLASS_DEFINE_DELETE_FUNC(uct_cm_ep_t, uct_ep_t); - - -static ucs_status_t uct_cm_ep_fill_path_rec(uct_cm_ep_t *ep, - struct ibv_sa_path_rec *path) -{ - uct_cm_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_cm_iface_t); - - path->sgid = iface->super.gid_info.gid; - path->dlid = htons(ep->dlid); - path->slid = htons(uct_ib_iface_port_attr(&iface->super)->lid); - if (iface->super.config.force_global_addr) { - ucs_assert_always(ep->dgid.global.interface_id != 0); - path->dgid = ep->dgid; - path->hop_limit = iface->super.config.hop_limit; - } else { - memset(&path->dgid, 0, sizeof(path->dgid)); - path->hop_limit = 0; - } - path->raw_traffic = 0; /* IB traffic */ - path->flow_label = 0; - path->traffic_class = iface->super.config.traffic_class; - path->reversible = htonl(1); /* IBCM currently only supports reversible paths */ - path->numb_path = 0; - path->pkey = ntohs(iface->super.pkey); - path->sl = iface->super.config.sl; - path->mtu_selector = 2; /* EQ */ - path->mtu = uct_ib_iface_port_attr(&iface->super)->active_mtu; - path->rate_selector = 2; /* EQ */ - path->rate = IBV_RATE_MAX; - path->packet_life_time_selector = 2; /* EQ */ - path->packet_life_time = 0; - path->preference = 0; /* Use first path */ - return UCS_OK; -} - -static void uct_cm_dump_path(struct ibv_sa_path_rec *path) -{ - char sgid_buf[256]; - char dgid_buf[256]; - - uct_ib_gid_str(&path->dgid, dgid_buf, sizeof(dgid_buf)); - uct_ib_gid_str(&path->sgid, sgid_buf, sizeof(sgid_buf)); - - ucs_trace_data("slid %d sgid %s dlid %d dgid %s", - ntohs(path->slid), sgid_buf, ntohs(path->dlid), dgid_buf); - ucs_trace_data("traffic %d flow_label %d hop %d class %d revers. 
0x%x " - "numb %d pkey 0x%x sl %d", - path->raw_traffic, path->flow_label, path->hop_limit, - path->traffic_class, path->reversible, path->numb_path, - path->pkey, path->sl); - ucs_trace_data("mtu %d(%d) rate %d(%d) lifetime %d(%d) pref %d", - path->mtu, path->mtu_selector, path->rate, path->rate_selector, - path->packet_life_time, path->packet_life_time_selector, - path->preference); -} - -ssize_t uct_cm_ep_am_bcopy(uct_ep_h tl_ep, uint8_t am_id, uct_pack_callback_t pack_cb, - void *arg, unsigned flags) -{ - uct_cm_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cm_iface_t); - uct_cm_ep_t *ep = ucs_derived_of(tl_ep, uct_cm_ep_t); - struct ib_cm_sidr_req_param req; - struct ibv_sa_path_rec path; - uct_cm_iface_op_t *op; - ucs_status_t status; - uct_cm_hdr_t *hdr; - size_t payload_len; - size_t total_len; - int ret; - - UCT_CHECK_AM_ID(am_id); - - uct_cm_enter(iface); - - if (!uct_cm_iface_has_tx_resources(iface)) { - status = UCS_ERR_NO_RESOURCE; - goto err; - } - - /* Allocate temporary contiguous buffer */ - hdr = ucs_malloc(IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE, "cm_send_buf"); - if (hdr == NULL) { - status = UCS_ERR_NO_MEMORY; - goto err; - } - - payload_len = pack_cb(hdr + 1, arg); - hdr->am_id = am_id; - hdr->length = payload_len; - total_len = sizeof(*hdr) + payload_len; - - status = uct_cm_ep_fill_path_rec(ep, &path); - if (status != UCS_OK) { - goto err_free_hdr; - } - - /* Fill SIDR request */ - memset(&req, 0, sizeof req); - req.path = &path; - req.service_id = ep->dest_service_id; - req.timeout_ms = iface->config.timeout_ms; - req.private_data = hdr; - req.private_data_len = total_len; - req.max_cm_retries = iface->config.retry_count; - - op = ucs_malloc(sizeof *op, "cm_op"); - if (op == NULL) { - status = UCS_ERR_NO_MEMORY; - goto err_free_hdr; - } - - op->is_id = 1; - - /* Create temporary ID for this message. Will be released when getting REP. 
*/ - ret = ib_cm_create_id(iface->cmdev, &op->id, NULL); - if (ret) { - ucs_error("ib_cm_create_id() failed: %m"); - status = UCS_ERR_IO_ERROR; - goto err_free_op; - } - - uct_cm_dump_path(&path); - - ret = ib_cm_send_sidr_req(op->id, &req); - if (ret) { - ucs_error("ib_cm_send_sidr_req() failed: %m"); - status = UCS_ERR_IO_ERROR; - goto err_destroy_id; - } - - ucs_queue_push(&iface->outstanding_q, &op->queue); - ++iface->num_outstanding; - ucs_trace("outs=%d", iface->num_outstanding); - UCT_TL_EP_STAT_OP(&ep->super, AM, BCOPY, payload_len); - - uct_cm_iface_trace_data(iface, UCT_AM_TRACE_TYPE_SEND, hdr, - "TX: SIDR_REQ [id %p{%u} dlid %d svc 0x%"PRIx64"]", - op->id, op->id->handle, ntohs(path.dlid), - (uint64_t)req.service_id); - uct_cm_leave(iface); - ucs_free(hdr); - /* coverity[missing_unlock] */ - return payload_len; - -err_destroy_id: - ib_cm_destroy_id(op->id); -err_free_op: - ucs_free(op); -err_free_hdr: - ucs_free(hdr); -err: - uct_cm_leave(iface); - return status; -} - -ucs_status_t uct_cm_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *req, - unsigned flags) -{ - uct_cm_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cm_iface_t); - ucs_status_t status; - - uct_cm_enter(iface); - if (iface->num_outstanding < iface->config.max_outstanding) { - status = UCS_ERR_BUSY; - } else { - uct_cm_pending_req_priv(req)->ep = ucs_derived_of(tl_ep, uct_cm_ep_t); - uct_pending_req_queue_push(&iface->notify_q, req); - status = UCS_OK; - UCT_TL_EP_STAT_PEND(ucs_derived_of(tl_ep, uct_base_ep_t)); - } - uct_cm_leave(iface); - return status; -} - -void uct_cm_ep_pending_purge(uct_ep_h tl_ep, uct_pending_purge_callback_t cb, - void *arg) -{ - uct_cm_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cm_iface_t); - uct_cm_ep_t *ep = ucs_derived_of(tl_ep, uct_cm_ep_t); - uct_cm_pending_req_priv_t *priv; - - uct_cm_enter(iface); - uct_pending_queue_purge(priv, &iface->notify_q, priv->ep == ep, cb, arg); - uct_cm_leave(iface); -} - -ucs_status_t uct_cm_ep_flush(uct_ep_h tl_ep, unsigned flags, - uct_completion_t *comp) -{ - uct_cm_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_cm_iface_t); - ucs_status_t status; - - uct_cm_enter(iface); - if (!uct_cm_iface_has_tx_resources(iface)) { - status = UCS_ERR_NO_RESOURCE; - } else { - status = uct_cm_iface_flush_do(iface, comp); - if (status == UCS_OK) { - UCT_TL_EP_STAT_FLUSH(ucs_derived_of(tl_ep, uct_base_ep_t)); - } else if (status == UCS_INPROGRESS) { - UCT_TL_EP_STAT_FLUSH_WAIT(ucs_derived_of(tl_ep, uct_base_ep_t)); - } - } - uct_cm_leave(iface); - - return status; -} diff --git a/src/uct/ib/cm/cm_iface.c b/src/uct/ib/cm/cm_iface.c deleted file mode 100644 index 67fc225fe63..00000000000 --- a/src/uct/ib/cm/cm_iface.c +++ /dev/null @@ -1,495 +0,0 @@ -/** -* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. -* -* See file LICENSE for terms. 
-*/ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "cm.h" - -#include -#include -#include -#include -#include -#include -#include - - -static ucs_config_field_t uct_cm_iface_config_table[] = { - {UCT_IB_CONFIG_PREFIX, "RX_INLINE=0", NULL, - ucs_offsetof(uct_cm_iface_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_ib_iface_config_table)}, - - {"TIMEOUT", "300ms", "Timeout for MAD layer", - ucs_offsetof(uct_cm_iface_config_t, timeout), UCS_CONFIG_TYPE_TIME}, - - {"RETRY_COUNT", "100", "Number of retries for MAD layer", - ucs_offsetof(uct_cm_iface_config_t, retry_count), UCS_CONFIG_TYPE_UINT}, - - {"MAX_OP", "1024", "Maximal number of outstanding SIDR operations", - ucs_offsetof(uct_cm_iface_config_t, max_outstanding), UCS_CONFIG_TYPE_UINT}, - - {NULL} -}; - -static uct_ib_iface_ops_t uct_cm_iface_ops; - - -static unsigned uct_cm_iface_progress(void *arg) -{ - uct_cm_iface_t *iface = arg; - uct_cm_pending_req_priv_t *priv; - uct_cm_iface_op_t *op; - unsigned count; - - uct_cm_enter(iface); - - /* Invoke flush completions at the head of the queue - the sends which - * started before them were already completed. - */ - count = 0; - ucs_queue_for_each_extract(op, &iface->outstanding_q, queue, !op->is_id) { - uct_invoke_completion(op->comp, UCS_OK); - ucs_free(op); - ++count; - } - - /* we are in the progress() context. Now it is safe to release resources. */ - iface->num_outstanding -= iface->num_completions; - iface->num_completions = 0; - - /* Dispatch pending operations */ - uct_pending_queue_dispatch(priv, &iface->notify_q, - iface->num_outstanding < iface->config.max_outstanding); - - /* Remove the progress callback only if there is no user completion at the - * head of the queue. It could be added by the progress callback. - */ - if (ucs_queue_is_empty(&iface->outstanding_q) || - ucs_queue_head_elem_non_empty(&iface->outstanding_q, uct_cm_iface_op_t, queue)->is_id) - { - uct_worker_progress_unregister_safe(&uct_cm_iface_worker(iface)->super, - &iface->slow_prog_id); - } - - uct_cm_leave(iface); - - return count; -} - -ucs_status_t uct_cm_iface_flush_do(uct_cm_iface_t *iface, uct_completion_t *comp) -{ - uct_cm_iface_op_t *op; - - if (iface->num_outstanding == 0) { - return UCS_OK; - } - - /* If user request a completion callback, allocate a new operation and put - * it in the tail of the queue. It will be called when all operations which - * were sent before are completed. 
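
/*
 * A minimal, self-contained sketch (invented names) of the flush pattern used
 * by the CM transport being removed here: flush() appends a marker node to
 * the same queue as in-flight sends, and the progress loop fires a marker's
 * callback only once it reaches the head, i.e. after every send queued before
 * it has completed and been popped.
 */
#include <stdlib.h>

typedef struct node {
    struct node *next;
    int          is_send;          /* 1: in-flight send, 0: flush marker */
    void        (*flush_cb)(void); /* valid when is_send == 0 */
} node_t;

typedef struct {
    node_t *head, *tail;
} queue_t;

static void queue_push(queue_t *q, node_t *n)
{
    n->next = NULL;
    if (q->tail != NULL) {
        q->tail->next = n;
    } else {
        q->head = n;
    }
    q->tail = n;
}

/* called from the progress loop: flush markers at the head are ready to fire */
static void fire_ready_flushes(queue_t *q)
{
    node_t *n;

    while ((q->head != NULL) && !q->head->is_send) {
        n       = q->head;
        q->head = n->next;
        if (q->head == NULL) {
            q->tail = NULL;
        }
        n->flush_cb();
        free(n);
    }
}
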
- */ - if (comp != NULL) { - op = ucs_malloc(sizeof *op, "cm_op"); - if (op == NULL) { - return UCS_ERR_NO_MEMORY; - } - - op->is_id = 0; - op->comp = comp; - ucs_queue_push(&iface->outstanding_q, &op->queue); - } - - sched_yield(); - return UCS_INPROGRESS; -} - -ucs_status_t uct_cm_iface_flush(uct_iface_h tl_iface, unsigned flags, - uct_completion_t *comp) -{ - uct_cm_iface_t *iface = ucs_derived_of(tl_iface, uct_cm_iface_t); - ucs_status_t status; - - uct_cm_enter(iface); - status = uct_cm_iface_flush_do(iface, comp); - if (status == UCS_OK) { - UCT_TL_IFACE_STAT_FLUSH(ucs_derived_of(tl_iface, uct_base_iface_t)); - } else if (status == UCS_INPROGRESS){ - UCT_TL_IFACE_STAT_FLUSH_WAIT(ucs_derived_of(tl_iface, uct_base_iface_t)); - } - uct_cm_leave(iface); - - return status; -} - -static void uct_cm_iface_handle_sidr_req(uct_cm_iface_t *iface, - struct ib_cm_event *event) -{ - uct_cm_hdr_t *hdr = event->private_data; - struct ib_cm_sidr_rep_param rep; - int ret; - - VALGRIND_MAKE_MEM_DEFINED(hdr, sizeof(hdr)); - VALGRIND_MAKE_MEM_DEFINED(hdr + 1, hdr->length); - - uct_cm_iface_trace_data(iface, UCT_AM_TRACE_TYPE_RECV, hdr, "RX: SIDR_REQ"); - - /* Send reply */ - ucs_trace_data("TX: SIDR_REP [id %p{%u}]", event->cm_id, - event->cm_id->handle); - memset(&rep, 0, sizeof rep); - rep.status = IB_SIDR_SUCCESS; - ret = ib_cm_send_sidr_rep(event->cm_id, &rep); - if (ret) { - ucs_error("ib_cm_send_sidr_rep() failed: %m"); - } - - uct_iface_invoke_am(&iface->super.super, hdr->am_id, hdr + 1, hdr->length, 0); -} - -static void uct_cm_iface_outstanding_remove(uct_cm_iface_t* iface, - struct ib_cm_id* id) -{ - uct_cm_iface_op_t *op; - ucs_queue_iter_t iter; - - ucs_queue_for_each_safe(op, iter, &iface->outstanding_q, queue) { - if (op->is_id && (op->id == id)) { - ucs_queue_del_iter(&iface->outstanding_q, iter); - /* Must not release resources from the async context - * because it will break pending op ordering. - * For example bcopy() may succeed while there are queued - * pending ops: - * bcopy() -> no resources - * pending_add() -> ok - * <-- async event: resources available - * bcopy() --> ok. oops this is out of order send - * - * save the number and do actual release in the - * progress() context. 
- */ - ++iface->num_completions; - ucs_free(op); - return; - } - } - - ucs_fatal("outstanding cm id %p not found", id); -} - -static void uct_cm_iface_outstanding_purge(uct_cm_iface_t *iface) -{ - uct_cm_iface_op_t *op; - - ucs_queue_for_each_extract(op, &iface->outstanding_q, queue, 1) { - if (op->is_id) { - ib_cm_destroy_id(op->id); - } else { - uct_invoke_completion(op->comp, UCS_ERR_CANCELED); - } - ucs_free(op); - } - iface->num_outstanding = 0; -} - -static void uct_cm_iface_event_handler(int fd, ucs_event_set_types_t events, - void *arg) -{ - uct_cm_iface_t *iface = arg; - struct ib_cm_event *event; - struct ib_cm_id *id; - int destroy_id; - int ret; - - ucs_trace_func(""); - - for (;;) { - /* Fetch all events */ - ret = ib_cm_get_event(iface->cmdev, &event); - if (ret) { - if (errno != EAGAIN) { - ucs_warn("ib_cm_get_event() failed: %m"); - } - return; - } - - id = event->cm_id; - - /* Handle the event */ - switch (event->event) { - case IB_CM_SIDR_REQ_ERROR: - ucs_error("SIDR request error, status: %s", - ibv_wc_status_str(event->param.send_status)); - destroy_id = 1; - break; - case IB_CM_SIDR_REQ_RECEIVED: - uct_cm_iface_handle_sidr_req(iface, event); - destroy_id = 1; /* Destroy the ID created by the driver */ - break; - case IB_CM_SIDR_REP_RECEIVED: - ucs_trace_data("RX: SIDR_REP [id %p{%u}]", id, id->handle); - uct_cm_iface_outstanding_remove(iface, id); - destroy_id = 1; /* Destroy the ID which was used for sending */ - break; - default: - ucs_warn("Unexpected CM event: %d", event->event); - destroy_id = 0; - break; - } - - /* Acknowledge CM event, remember the id, in case we would destroy it */ - ret = ib_cm_ack_event(event); - if (ret) { - ucs_warn("ib_cm_ack_event() failed: %m"); - } - - /* If there is an id which should be destroyed, do it now, after - * acknowledging all events. 
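
/*
 * A tiny sketch (invented names) of the deferred-release rule described in
 * the comment above: the async handler only counts completed operations, and
 * the resources are returned in the progress() context, right before pending
 * requests are dispatched, so a request queued while resources were exhausted
 * cannot be overtaken by a later send. The removed code relies on async
 * blocking rather than C11 atomics; the atomics here only keep the sketch
 * self-contained.
 */
#include <stdatomic.h>

static atomic_int pending_completions; /* incremented from async context */
static int        num_outstanding;     /* owned by the progress context */

static void async_completion_handler(void)
{
    /* async context: must not release resources here */
    atomic_fetch_add(&pending_completions, 1);
}

static void progress(void)
{
    /* progress context: safe to release resources and dispatch pending ops */
    num_outstanding -= atomic_exchange(&pending_completions, 0);
    /* ... dispatch pending requests while num_outstanding < limit ... */
}
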
- */ - if (destroy_id) { - ret = ib_cm_destroy_id(id); - if (ret) { - ucs_error("ib_cm_destroy_id() failed: %m"); - } - } - - uct_worker_progress_register_safe(&uct_cm_iface_worker(iface)->super, - uct_cm_iface_progress, iface, 0, - &iface->slow_prog_id); - } -} - -static void uct_cm_iface_release_desc(uct_recv_desc_t *self, void *desc) -{ - uct_ib_iface_t *iface = ucs_container_of(self, uct_ib_iface_t, release_desc); - /* Don't use UCS_PTR_BYTE_OFFSET here due to coverity false positive report */ - ucs_free((char*)desc - iface->config.rx_headroom_offset); -} - -static UCS_CLASS_INIT_FUNC(uct_cm_iface_t, uct_md_h md, uct_worker_h worker, - const uct_iface_params_t *params, - const uct_iface_config_t *tl_config) -{ - uct_cm_iface_config_t *config = ucs_derived_of(tl_config, uct_cm_iface_config_t); - uct_ib_iface_init_attr_t init_attr = {}; - ucs_status_t status; - int ret; - - ucs_trace_func(""); - - init_attr.cq_len[UCT_IB_DIR_TX] = 1; - init_attr.cq_len[UCT_IB_DIR_RX] = config->super.rx.queue_len; - init_attr.seg_size = ucs_min(IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE, - config->super.seg_size); - - UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, &uct_cm_iface_ops, md, worker, - params, &config->super, &init_attr); - - if (self->super.super.worker->async == NULL) { - ucs_error("cm must have async!=NULL"); - return UCS_ERR_INVALID_PARAM; - } - - self->num_outstanding = 0; - self->num_completions = 0; - self->service_id = 0; - self->config.timeout_ms = (int)(config->timeout * 1e3 + 0.5); - self->config.max_outstanding = config->max_outstanding; - self->config.retry_count = ucs_min(config->retry_count, UINT8_MAX); - self->notify_q.head = NULL; - self->slow_prog_id = UCS_CALLBACKQ_ID_NULL; - ucs_queue_head_init(&self->notify_q); - ucs_queue_head_init(&self->outstanding_q); - - /* Redefine receive desc release callback */ - self->super.release_desc.cb = uct_cm_iface_release_desc; - - self->cmdev = ib_cm_open_device(uct_ib_iface_device(&self->super)->ibv_context); - if (self->cmdev == NULL) { - ucs_error("ib_cm_open_device() failed: %m. Check if ib_ucm.ko module is loaded."); - status = UCS_ERR_NO_DEVICE; - goto err; - } - - status = ucs_sys_fcntl_modfl(self->cmdev->fd, O_NONBLOCK, 0); - if (status != UCS_OK) { - goto err_close_device; - } - - ret = ib_cm_create_id(self->cmdev, &self->listen_id, self); - if (ret) { - ucs_error("ib_cm_create_id() failed: %m"); - status = UCS_ERR_NO_DEVICE; - goto err_close_device; - } - - do { - self->service_id = (uint32_t)(ucs_generate_uuid((uintptr_t)self) & - (~IB_CM_ASSIGN_SERVICE_ID_MASK)); - ret = ib_cm_listen(self->listen_id, self->service_id, 0); - if (ret) { - if (errno == EBUSY) { - /* The generated service id is already in use - try to - * generate another one. 
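
/*
 * The generic shape of the retry loop above, with a hypothetical try_listen()
 * helper standing in for ib_cm_listen(): pick a random id, try to bind it,
 * and retry only while the failure is EBUSY, i.e. the id happened to collide
 * with an existing listener.
 */
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

/* hypothetical helper: returns 0 on success, -1 with errno set on failure */
extern int try_listen(uint32_t service_id);

static int listen_on_random_id(uint32_t *id_p)
{
    uint32_t id;
    int ret;

    do {
        id  = (uint32_t)rand(); /* stand-in for ucs_generate_uuid() */
        ret = try_listen(id);
    } while ((ret != 0) && (errno == EBUSY));

    if (ret == 0) {
        *id_p = id;
    }
    return ret;
}
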
- */ - ucs_debug("ib_cm service id 0x%x already in use, " - "trying another one", self->service_id); - continue; - } else { - ucs_error("ib_cm_listen(service_id=0x%x) failed: %m", - self->service_id); - status = UCS_ERR_INVALID_ADDR; - goto err_destroy_id; - } - } - } while (ret); - - if (self->super.super.worker->async->mode == UCS_ASYNC_MODE_SIGNAL) { - ucs_warn("ib_cm fd does not support SIGIO"); - } - - status = ucs_async_set_event_handler(self->super.super.worker->async->mode, - self->cmdev->fd, UCS_EVENT_SET_EVREAD, - uct_cm_iface_event_handler, self, - self->super.super.worker->async); - if (status != UCS_OK) { - ucs_error("failed to set event handler"); - goto err_destroy_id; - } - - ucs_debug("listening for SIDR service_id 0x%x on fd %d", self->service_id, - self->cmdev->fd); - return UCS_OK; - -err_destroy_id: - ib_cm_destroy_id(self->listen_id); -err_close_device: - ib_cm_close_device(self->cmdev); -err: - return status; -} - -static UCS_CLASS_CLEANUP_FUNC(uct_cm_iface_t) -{ - - ucs_trace_func(""); - - ucs_async_remove_handler(self->cmdev->fd, 1); - - uct_cm_enter(self); - uct_cm_iface_outstanding_purge(self); - ib_cm_destroy_id(self->listen_id); - ib_cm_close_device(self->cmdev); - uct_worker_progress_unregister_safe(&uct_cm_iface_worker(self)->super, - &self->slow_prog_id); - uct_cm_leave(self); - - /* At this point all outstanding have been removed, and no further events - * can be added. - */ -} - -UCS_CLASS_DEFINE(uct_cm_iface_t, uct_ib_iface_t); -static UCS_CLASS_DEFINE_NEW_FUNC(uct_cm_iface_t, uct_iface_t, uct_md_h, uct_worker_h, - const uct_iface_params_t*, const uct_iface_config_t*); -static UCS_CLASS_DEFINE_DELETE_FUNC(uct_cm_iface_t, uct_iface_t); - -static ucs_status_t uct_cm_iface_query(uct_iface_h tl_iface, - uct_iface_attr_t *iface_attr) -{ - uct_cm_iface_t *iface = ucs_derived_of(tl_iface, uct_cm_iface_t); - ucs_status_t status; - size_t mtu; - - status = uct_ib_iface_query(&iface->super, 32 /* TODO */, iface_attr); - if (status != UCS_OK) { - return status; - } - - iface_attr->overhead = 1200e-9; - - mtu = ucs_min(IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - sizeof(uct_cm_hdr_t), - UINT8_MAX); - - iface_attr->cap.am.max_bcopy = mtu; - iface_attr->iface_addr_len = sizeof(uint32_t); - iface_attr->ep_addr_len = 0; - iface_attr->max_conn_priv = 0; - iface_attr->cap.flags = UCT_IFACE_FLAG_AM_BCOPY | - UCT_IFACE_FLAG_AM_DUP | - UCT_IFACE_FLAG_PENDING | - UCT_IFACE_FLAG_CB_ASYNC | - UCT_IFACE_FLAG_CONNECT_TO_IFACE; - return UCS_OK; -} - -static ucs_status_t uct_cm_iface_get_address(uct_iface_h tl_iface, - uct_iface_addr_t *iface_addr) -{ - uct_cm_iface_t *iface = ucs_derived_of(tl_iface, uct_cm_iface_t); - *(uint32_t*)iface_addr = iface->service_id; - return UCS_OK; -} - - -static uct_ib_iface_ops_t uct_cm_iface_ops = { - { - .ep_am_bcopy = uct_cm_ep_am_bcopy, - .ep_pending_add = uct_cm_ep_pending_add, - .ep_pending_purge = uct_cm_ep_pending_purge, - .ep_flush = uct_cm_ep_flush, - .ep_fence = uct_base_ep_fence, - .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_cm_ep_t), - .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_cm_ep_t), - .iface_flush = uct_cm_iface_flush, - .iface_fence = uct_base_iface_fence, - .iface_progress_enable = ucs_empty_function, - .iface_progress_disable = ucs_empty_function, - .iface_progress = ucs_empty_function_return_zero, - .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_cm_iface_t), - .iface_query = uct_cm_iface_query, - .iface_get_device_address = uct_ib_iface_get_device_address, - .iface_get_address = uct_cm_iface_get_address, - .iface_is_reachable = 
uct_ib_iface_is_reachable - }, - .create_cq = uct_ib_verbs_create_cq, - .arm_cq = ucs_empty_function_return_success, -}; - -static int uct_cm_is_module_loaded(uct_ib_md_t *ib_md) -{ - struct ib_cm_device *cmdev = NULL; - - cmdev = ib_cm_open_device(ib_md->dev.ibv_context); - if (cmdev == NULL) { - ucs_debug("ib_cm_open_device() for %s failed: %m. " - "Check if ib_ucm.ko module is loaded.", - uct_ib_device_name(&ib_md->dev)); - return 0; - } - - ib_cm_close_device(cmdev); - return 1; -} - -static ucs_status_t -uct_cm_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, - unsigned *num_tl_devices_p) -{ - uct_ib_md_t *ib_md = ucs_derived_of(md, uct_ib_md_t); - - if (!uct_cm_is_module_loaded(ib_md)) { - *num_tl_devices_p = 0; - *tl_devices_p = NULL; - return UCS_OK; - } - - return uct_ib_device_query_ports(&ib_md->dev, UCT_IB_DEVICE_FLAG_LINK_IB, - tl_devices_p, num_tl_devices_p); -} - -UCT_TL_DEFINE(&uct_ib_component, cm, uct_cm_query_tl_devices, uct_cm_iface_t, - "CM_", uct_cm_iface_config_table, uct_cm_iface_config_t); diff --git a/src/uct/ib/cm/configure.m4 b/src/uct/ib/cm/configure.m4 deleted file mode 100644 index b7bc0416a05..00000000000 --- a/src/uct/ib/cm/configure.m4 +++ /dev/null @@ -1,33 +0,0 @@ -# -# Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -# Copyright (C) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. -# Copyright (C) The University of Tennessee and the University of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED. -# -# See file LICENSE for terms. -# - -# -# CM (IB connection manager) Support -# -cm_happy="no" - -AC_ARG_WITH([cm], - [AC_HELP_STRING([--with-cm], [Compile with IB Connection Manager support])], - [], - [with_cm=guess]) - -AS_IF([test "x$with_cm" != xno], - [save_LIBS="$LIBS" - AC_CHECK_LIB([ibcm], [ib_cm_send_req], - [AC_SUBST(IBCM_LIBS, [-libcm]) - uct_ib_modules="${uct_ib_modules}:cm" - cm_happy="yes"], - [AS_IF([test "x$with_cm" = xyes], - [AC_MSG_ERROR([CM requested but lib ibcm not found])], - [AC_MSG_WARN([CM support not found, skipping])] - ) - ]) - LIBS="$save_LIBS"]) - -AM_CONDITIONAL([HAVE_TL_CM], [test "x$cm_happy" != xno]) -AC_CONFIG_FILES([src/uct/ib/cm/Makefile]) diff --git a/src/uct/ib/configure.m4 b/src/uct/ib/configure.m4 index 21902b5a726..d4d77da5ff8 100644 --- a/src/uct/ib/configure.m4 +++ b/src/uct/ib/configure.m4 @@ -205,7 +205,9 @@ AS_IF([test "x$with_ib" = "xyes"], mlx5dv_is_supported, mlx5dv_devx_subscribe_devx_event, MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE, - MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE], + MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE, + MLX5DV_UAR_ALLOC_TYPE_BF, + MLX5DV_UAR_ALLOC_TYPE_NC], [], [], [[#include ]]) AC_CHECK_MEMBERS([struct mlx5dv_cq.cq_uar], [], [], [[#include ]]) @@ -446,7 +448,6 @@ AM_CONDITIONAL([HAVE_EXP], [test "x$verbs_exp" != xno]) AM_CONDITIONAL([HAVE_MLX5_HW_UD], [test "x$with_mlx5_hw" != xno -a "x$has_get_av" != xno]) uct_ib_modules="" -m4_include([src/uct/ib/cm/configure.m4]) m4_include([src/uct/ib/rdmacm/configure.m4]) AC_DEFINE_UNQUOTED([uct_ib_MODULES], ["${uct_ib_modules}"], [IB loadable modules]) AC_CONFIG_FILES([src/uct/ib/Makefile]) diff --git a/src/uct/ib/dc/dc_mlx5.c b/src/uct/ib/dc/dc_mlx5.c index 81cf1203b0b..08a226e239e 100644 --- a/src/uct/ib/dc/dc_mlx5.c +++ b/src/uct/ib/dc/dc_mlx5.c @@ -8,14 +8,15 @@ # include "config.h" #endif +#include "dc_mlx5.inl" #include "dc_mlx5.h" #include "dc_mlx5_ep.h" #include -#include #include #include #include +#include #include #include #include @@ -40,7 +41,9 @@ ucs_config_field_t 
uct_dc_mlx5_iface_config_sub_table[] = { ucs_offsetof(uct_dc_mlx5_iface_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_rc_iface_common_config_table)}, - {"RC_", "", NULL, + /* Since a long timeout would block the SRQ in case of a network failure on + * a single peer, default the SRQ to list topology; this may incur some performance degradation. */ + {"RC_", "SRQ_TOPO=list", NULL, ucs_offsetof(uct_dc_mlx5_iface_config_t, rc_mlx5_common), UCS_CONFIG_TYPE_TABLE(uct_rc_mlx5_common_config_table)}, @@ -50,7 +53,7 @@ ucs_config_field_t uct_dc_mlx5_iface_config_sub_table[] = { {"NUM_DCI", "8", "Number of DC initiator QPs (DCI) used by the interface " - "(up to " UCS_PP_MAKE_STRING(UCT_DC_MLX5_IFACE_MAX_DCIS) ").", + "(up to " UCS_PP_MAKE_STRING(UCT_DC_MLX5_IFACE_MAX_USER_DCIS) ").", ucs_offsetof(uct_dc_mlx5_iface_config_t, ndci), UCS_CONFIG_TYPE_UINT}, {"TX_POLICY", "dcs_quota", @@ -69,6 +72,21 @@ ucs_config_field_t uct_dc_mlx5_iface_config_sub_table[] = { ucs_offsetof(uct_dc_mlx5_iface_config_t, tx_policy), UCS_CONFIG_TYPE_ENUM(uct_dc_tx_policy_names)}, + {"DCI_FULL_HANDSHAKE", "n", + "Force full-handshake protocol for DC initiator. Enabling this mode\n" + "increases network latency, but is more resilient to packet drops.", + ucs_offsetof(uct_dc_mlx5_iface_config_t, dci_full_handshake), + UCS_CONFIG_TYPE_BOOL}, + + {"DCI_KA_FULL_HANDSHAKE", "n", + "Force full-handshake protocol for DC keepalive initiator.", + ucs_offsetof(uct_dc_mlx5_iface_config_t, dci_ka_full_handshake), + UCS_CONFIG_TYPE_BOOL}, + + {"DCT_FULL_HANDSHAKE", "n", "Force full-handshake protocol for DC target.", + ucs_offsetof(uct_dc_mlx5_iface_config_t, dct_full_handshake), + UCS_CONFIG_TYPE_BOOL}, + {"RAND_DCI_SEED", "0", "Seed for DCI allocation when \"rand\" dci policy is used (0 - use default).", ucs_offsetof(uct_dc_mlx5_iface_config_t, rand_seed), UCS_CONFIG_TYPE_UINT}, @@ -93,14 +111,11 @@ ucs_config_field_t uct_dc_mlx5_iface_config_table[] = { {NULL} }; -static void uct_dc_mlx5_dci_handle_failure(uct_dc_mlx5_iface_t *iface, - struct mlx5_cqe64 *cqe, - uint8_t dci, - ucs_status_t status); - -static uct_dc_dci_ops_t uct_dc_mlx5_dci_ops = { - .handle_failure = uct_dc_mlx5_dci_handle_failure -}; +static void +uct_dc_mlx5_dci_keepalive_handle_failure(uct_dc_mlx5_iface_t *iface, + struct mlx5_cqe64 *cqe, + uint8_t dci_index, + ucs_status_t ep_status); static ucs_status_t @@ -131,17 +146,14 @@ uct_dc_mlx5_ep_create_connected(const uct_ep_params_t *params, uct_ep_h* ep_p) } if (is_global) { - return UCS_CLASS_NEW(uct_dc_mlx5_grh_ep_t, ep_p, iface, if_addr, &av, &grh_av); + return UCS_CLASS_NEW(uct_dc_mlx5_grh_ep_t, ep_p, iface, if_addr, &av, + path_index, &grh_av); } else { - return UCS_CLASS_NEW(uct_dc_mlx5_ep_t, ep_p, iface, if_addr, &av); + return UCS_CLASS_NEW(uct_dc_mlx5_ep_t, ep_p, iface, if_addr, &av, + path_index); } } -static void uct_dc_mlx5_ep_destroy(uct_ep_h tl_ep) -{ - uct_dc_mlx5_ep_cleanup(tl_ep, &UCS_CLASS_NAME(uct_dc_mlx5_ep_t)); -} - static ucs_status_t uct_dc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_dc_mlx5_iface_t); @@ -187,6 +199,8 @@ static ucs_status_t uct_dc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr iface_attr->cap.flags &= ~(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE | UCT_IFACE_FLAG_ERRHANDLE_ZCOPY_BUF | UCT_IFACE_FLAG_ERRHANDLE_REMOTE_MEM); + } else { + iface_attr->cap.flags |= UCT_IFACE_FLAG_EP_CHECK; } return UCS_OK; @@ -199,19 +213,11 @@ static void uct_dc_mlx5_iface_progress_enable(uct_iface_h tl_iface, unsigned fla
uct_base_iface_progress_enable_cb(&iface->super.super, iface->progress, flags); } -static ucs_status_t uct_dc_mlx5_ep_set_failed(uct_ib_iface_t *ib_iface, - uct_ep_h ep, ucs_status_t status) -{ - return uct_set_ep_failed(&UCS_CLASS_NAME(uct_dc_mlx5_ep_t), ep, - &ib_iface->super.super, status); -} - static UCS_F_ALWAYS_INLINE unsigned uct_dc_mlx5_poll_tx(uct_dc_mlx5_iface_t *iface) { - uint8_t dci; + uint8_t dci_index; struct mlx5_cqe64 *cqe; - uint32_t qp_num; uint16_t hw_ci; UCT_DC_MLX5_TXQP_DECL(txqp, txwq); @@ -224,101 +230,97 @@ uct_dc_mlx5_poll_tx(uct_dc_mlx5_iface_t *iface) ucs_memory_cpu_load_fence(); - qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER); - dci = uct_dc_mlx5_iface_dci_find(iface, qp_num); - txqp = &iface->tx.dcis[dci].txqp; - txwq = &iface->tx.dcis[dci].txwq; - hw_ci = ntohs(cqe->wqe_counter); + dci_index = uct_dc_mlx5_iface_dci_find(iface, cqe); + txqp = &iface->tx.dcis[dci_index].txqp; + txwq = &iface->tx.dcis[dci_index].txwq; + hw_ci = ntohs(cqe->wqe_counter); - ucs_trace_poll("dc iface %p tx_cqe: dci[%d] qpn 0x%x txqp %p hw_ci %d", - iface, dci, qp_num, txqp, hw_ci); + ucs_trace_poll("dc iface %p tx_cqe: dci[%d] txqp %p hw_ci %d", + iface, dci_index, txqp, hw_ci); uct_rc_mlx5_txqp_process_tx_cqe(txqp, cqe, hw_ci); - - uct_rc_txqp_available_set(txqp, uct_ib_mlx5_txwq_update_bb(txwq, hw_ci)); - ucs_assert(uct_rc_txqp_available(txqp) <= txwq->bb_max); - - uct_rc_iface_update_reads(&iface->super.super); + uct_dc_mlx5_update_tx_res(iface, txwq, txqp, hw_ci); /** * Note: DCI is released after handling completion callbacks, * to avoid OOO sends when this is the only missing resource. */ - uct_dc_mlx5_iface_dci_put(iface, dci); - uct_dc_mlx5_iface_progress_pending(iface); + uct_dc_mlx5_iface_dci_put(iface, dci_index); + uct_dc_mlx5_iface_progress_pending(iface, + iface->tx.dcis[dci_index].pool_index); + return 1; } -static unsigned uct_dc_mlx5_iface_progress(void *arg) +static UCS_F_ALWAYS_INLINE unsigned +uct_dc_mlx5_iface_progress(void *arg, int flags) { uct_dc_mlx5_iface_t *iface = arg; unsigned count; - count = uct_rc_mlx5_iface_common_poll_rx(&iface->super, 0); - if (count > 0) { + count = uct_rc_mlx5_iface_common_poll_rx(&iface->super, flags); + if (!uct_rc_iface_poll_tx(&iface->super.super, count)) { return count; } - return uct_dc_mlx5_poll_tx(iface); + + return count + uct_dc_mlx5_poll_tx(iface); } -static unsigned uct_dc_mlx5_iface_progress_tm(void *arg) +static unsigned uct_dc_mlx5_iface_progress_cyclic(void *arg) { - uct_dc_mlx5_iface_t *iface = arg; - unsigned count; - - count = uct_rc_mlx5_iface_common_poll_rx(&iface->super, - UCT_RC_MLX5_POLL_FLAG_TM); - if (count > 0) { - return count; - } - return uct_dc_mlx5_poll_tx(iface); + return uct_dc_mlx5_iface_progress(arg, 0); } -static void UCS_CLASS_DELETE_FUNC_NAME(uct_dc_mlx5_iface_t)(uct_iface_t*); - -ucs_status_t uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface, - uct_dc_dci_t *dci) +static unsigned uct_dc_mlx5_iface_progress_ll(void *arg) { - uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md, - uct_ib_mlx5_md_t); - ucs_status_t status; - - ucs_assert(dci->txwq.super.type == UCT_IB_MLX5_OBJ_TYPE_VERBS); - ucs_debug("iface %p reset dci[%p]", iface, dci); - - /* Synchronize CQ index with the driver, since it would remove pending - * completions for this QP (both send and receive) during ibv_destroy_qp(). 
- */ - uct_rc_mlx5_iface_common_update_cqs_ci(&iface->super, - &iface->super.super.super); - status = uct_ib_mlx5_modify_qp_state(md, &dci->txwq.super, IBV_QPS_RESET); - uct_rc_mlx5_iface_common_sync_cqs_ci(&iface->super, - &iface->super.super.super); - - uct_rc_mlx5_iface_commom_clean(&iface->super.cq[UCT_IB_DIR_TX], NULL, - dci->txwq.super.qp_num); - - /* Resume posting from to the beginning of the QP */ - uct_ib_mlx5_txwq_reset(&dci->txwq); + return uct_dc_mlx5_iface_progress(arg, UCT_RC_MLX5_POLL_FLAG_LINKED_LIST); +} - return status; +static unsigned uct_dc_mlx5_iface_progress_tm(void *arg) +{ + return uct_dc_mlx5_iface_progress(arg, UCT_RC_MLX5_POLL_FLAG_TM); } -static ucs_status_t uct_dc_mlx5_iface_create_qp(uct_dc_mlx5_iface_t *iface, - struct ibv_qp_cap *cap, - uct_dc_dci_t *dci) +static void UCS_CLASS_DELETE_FUNC_NAME(uct_dc_mlx5_iface_t)(uct_iface_t*); + +static ucs_status_t uct_dc_mlx5_iface_create_dci(uct_dc_mlx5_iface_t *iface, + int pool_index, int dci_index, + uint8_t path_index, + int full_handshake) { uct_ib_iface_t *ib_iface = &iface->super.super.super; uct_ib_mlx5_qp_attr_t attr = {}; ucs_status_t status; + uct_ib_mlx5_md_t *md = ucs_derived_of(ib_iface->super.md, + uct_ib_mlx5_md_t); + uct_dc_dci_t *dci = &iface->tx.dcis[dci_index]; #if HAVE_DC_DV uct_ib_device_t *dev = uct_ib_iface_device(ib_iface); struct mlx5dv_qp_init_attr dv_attr = {}; struct ibv_qp *qp; + ucs_assert(iface->super.super.super.config.qp_type == UCT_IB_QPT_DCI); + + dci->pool_index = pool_index; + dci->path_index = path_index; + uct_rc_mlx5_iface_fill_attr(&iface->super, &attr, iface->super.super.config.tx_qp_len, &iface->super.rx.srq); + + if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_DCI) { + attr.super.max_inl_cqe[UCT_IB_DIR_RX] = 0; + attr.uidx = htonl(dci_index) >> UCT_IB_UIDX_SHIFT; + attr.full_handshake = full_handshake; + status = uct_ib_mlx5_devx_create_qp(ib_iface, &dci->txwq.super, + &dci->txwq, &attr); + if (status != UCS_OK) { + return status; + } + + goto init_qp; + } + status = uct_ib_mlx5_iface_fill_attr(ib_iface, &dci->txwq.super, &attr); if (status != UCS_OK) { return status; @@ -341,6 +343,8 @@ static ucs_status_t uct_dc_mlx5_iface_create_qp(uct_dc_mlx5_iface_t *iface, dci->txwq.super.verbs.qp = qp; dci->txwq.super.qp_num = dci->txwq.super.verbs.qp->qp_num; + +init_qp: #else uct_rc_mlx5_iface_fill_attr(&iface->super, &attr, iface->super.super.config.tx_qp_len, @@ -350,7 +354,6 @@ static ucs_status_t uct_dc_mlx5_iface_create_qp(uct_dc_mlx5_iface_t *iface, return status; } #endif - status = uct_rc_txqp_init(&dci->txqp, &iface->super.super, dci->txwq.super.qp_num UCS_STATS_ARG(iface->super.super.stats)); @@ -373,21 +376,22 @@ static ucs_status_t uct_dc_mlx5_iface_create_qp(uct_dc_mlx5_iface_t *iface, dci->flags = 0; #endif - status = uct_ib_mlx5_txwq_init(iface->super.super.super.super.worker, - iface->super.tx.mmio_mode, &dci->txwq, - dci->txwq.super.verbs.qp); - if (status != UCS_OK) { - goto err; + if (dci->txwq.super.type == UCT_IB_MLX5_OBJ_TYPE_VERBS) { + status = uct_ib_mlx5_txwq_init(iface->super.super.super.super.worker, + iface->super.tx.mmio_mode, &dci->txwq, + dci->txwq.super.verbs.qp); + if (status != UCS_OK) { + goto err; + } } uct_rc_txqp_available_set(&dci->txqp, dci->txwq.bb_max); - *cap = attr.super.ibv.cap; return UCS_OK; err: uct_rc_txqp_cleanup(&iface->super.super, &dci->txqp); err_qp: - ibv_destroy_qp(dci->txwq.super.verbs.qp); + uct_ib_mlx5_destroy_qp(md, &dci->txwq.super); return status; } @@ -401,7 +405,8 @@ ucs_status_t 
uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface, long attr_mask; if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX) { - return uct_dc_mlx5_iface_devx_dci_connect(iface, &dci->txwq.super); + return uct_dc_mlx5_iface_devx_dci_connect(iface, &dci->txwq.super, + dci->path_index); } ucs_assert(dci->txwq.super.type == UCT_IB_MLX5_OBJ_TYPE_VERBS); @@ -457,7 +462,9 @@ ucs_status_t uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface, return UCS_OK; } -ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface) +ucs_status_t +uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface, + const uct_dc_mlx5_iface_config_t *config) { uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md, uct_ib_mlx5_md_t); @@ -468,7 +475,8 @@ ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface) int ret; if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_DCT) { - return uct_dc_mlx5_iface_devx_create_dct(iface); + return uct_dc_mlx5_iface_devx_create_dct(iface, + config->dct_full_handshake); } init_attr.comp_mask = IBV_QP_INIT_ATTR_PD; @@ -556,9 +564,10 @@ void uct_dc_mlx5_destroy_dct(uct_dc_mlx5_iface_t *iface) static void uct_dc_mlx5_iface_cleanup_dcis(uct_dc_mlx5_iface_t *iface) { + int num_dcis = uct_dc_mlx5_iface_total_ndci(iface); int i; - for (i = 0; i < iface->tx.ndci; i++) { + for (i = 0; i < num_dcis; i++) { if (uct_dc_mlx5_iface_is_dci_rand(iface)) { ucs_arbiter_group_cleanup(&iface->tx.dcis[i].arb_group); } @@ -653,7 +662,11 @@ uct_dc_mlx5_init_rx(uct_rc_iface_t *rc_iface, goto err; } - iface->super.super.progress = uct_dc_mlx5_iface_progress; + if (iface->super.config.srq_topo == UCT_RC_MLX5_SRQ_TOPO_LIST) { + iface->super.super.progress = uct_dc_mlx5_iface_progress_ll; + } else { + iface->super.super.progress = uct_dc_mlx5_iface_progress_cyclic; + } return UCS_OK; err_free_srq: @@ -672,7 +685,9 @@ void uct_dc_mlx5_cleanup_rx(uct_rc_iface_t *rc_iface) } #ifdef HAVE_DC_EXP -ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface) +ucs_status_t +uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface, + const uct_dc_mlx5_iface_config_t *config) { struct ibv_exp_dct_init_attr init_attr; @@ -778,56 +793,89 @@ void uct_dc_mlx5_destroy_dct(uct_dc_mlx5_iface_t *iface) void uct_dc_mlx5_iface_dcis_destroy(uct_dc_mlx5_iface_t *iface, int max) { + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md, + uct_ib_mlx5_md_t); int i; + for (i = 0; i < max; i++) { uct_rc_txqp_cleanup(&iface->super.super, &iface->tx.dcis[i].txqp); - ucs_assert(iface->tx.dcis[i].txwq.super.type == UCT_IB_MLX5_OBJ_TYPE_VERBS); - uct_ib_destroy_qp(iface->tx.dcis[i].txwq.super.verbs.qp); + uct_ib_mlx5_destroy_qp(md, &iface->tx.dcis[i].txwq.super); } } -static ucs_status_t uct_dc_mlx5_iface_create_dci(uct_dc_mlx5_iface_t *iface, - uct_dc_dci_ops_t *ops, - uct_dc_dci_t *dci) -{ - struct ibv_qp_cap cap = {}; - - dci->ops = ops; - return uct_dc_mlx5_iface_create_qp(iface, &cap, dci); -} - -static ucs_status_t uct_dc_mlx5_iface_create_dcis(uct_dc_mlx5_iface_t *iface) +ucs_status_t +uct_dc_mlx5_iface_create_dcis(uct_dc_mlx5_iface_t *iface, + const uct_dc_mlx5_iface_config_t *config) { + uint8_t num_paths = iface->super.super.super.num_paths; + uct_dc_mlx5_dci_pool_t *dci_pool; + int i, pool_index, dci_index; ucs_status_t status; - int i; - ucs_debug("creating %d dci(s)", iface->tx.ndci); - ucs_assert(iface->super.super.super.config.qp_type == UCT_IB_QPT_DCI); + dci_index = 0; + for (pool_index = 0; pool_index < iface->tx.num_dci_pools; pool_index++) { + dci_pool = 
&iface->tx.dci_pool[pool_index]; + ucs_debug("creating dci pool %d with %d QPs", pool_index, + iface->tx.ndci); + dci_pool->stack_top = 0; + ucs_arbiter_init(&dci_pool->arbiter); + + for (i = 0; i < iface->tx.ndci; ++i) { + status = uct_dc_mlx5_iface_create_dci(iface, pool_index, dci_index, + pool_index % num_paths, + config->dci_full_handshake); + if (status != UCS_OK) { + goto err; + } - iface->tx.stack_top = 0; - for (i = 0; i < iface->tx.ndci; i++) { - status = uct_dc_mlx5_iface_create_dci(iface, &uct_dc_mlx5_dci_ops, - &iface->tx.dcis[i]); - if (status != UCS_OK) { - goto err; + dci_pool->stack[i] = dci_index; + ++dci_index; } - - iface->tx.dcis_stack[i] = i; } - iface->super.super.config.tx_qp_len = iface->tx.dcis[0].txwq.bb_max; - + iface->tx.bb_max = iface->tx.dcis[0].txwq.bb_max; return UCS_OK; err: - uct_dc_mlx5_iface_dcis_destroy(iface, i); + uct_dc_mlx5_iface_dcis_destroy(iface, dci_index); return status; } void uct_dc_mlx5_iface_set_quota(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_iface_config_t *config) { - iface->tx.available_quota = iface->super.super.config.tx_qp_len - - ucs_min(iface->super.super.config.tx_qp_len, config->quota); + iface->tx.available_quota = iface->tx.bb_max - ucs_min(iface->tx.bb_max, + config->quota); +} + +static void uct_dc_mlx5_iface_vfs_refresh(uct_iface_h tl_iface) +{ + uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_dc_mlx5_iface_t); + uct_dc_mlx5_dci_pool_t *dci_pool; + int i, pool_index, dci_index; + uct_dc_dci_t *dci; + + /* Add iface resources */ + uct_rc_iface_vfs_populate(&iface->super.super); + + /* Add objects for DCIs */ + dci_index = 0; + for (pool_index = 0; pool_index < iface->tx.num_dci_pools; pool_index++) { + dci_pool = &iface->tx.dci_pool[pool_index]; + ucs_vfs_obj_add_dir(iface, dci_pool, "dci_pool/%d", pool_index); + for (i = 0; i < iface->tx.ndci; ++i) { + dci = &iface->tx.dcis[dci_index]; + ucs_vfs_obj_add_dir(dci_pool, dci, "%d", dci_index); + uct_ib_mlx5_txwq_vfs_populate(&dci->txwq, dci); + uct_rc_txqp_vfs_populate(&dci->txqp, dci); + ++dci_index; + } + } + + /* Add objects for DCT */ + ucs_vfs_obj_add_dir(iface, &iface->rx.dct, "dct"); + ucs_vfs_obj_add_ro_file(&iface->rx.dct, ucs_vfs_show_primitive, + &iface->rx.dct.qp_num, UCS_VFS_TYPE_U32_HEX, + "qp_num"); } void uct_dc_mlx5_iface_init_version(uct_dc_mlx5_iface_t *iface, uct_md_h md) @@ -888,13 +936,13 @@ static inline ucs_status_t uct_dc_mlx5_iface_flush_dcis(uct_dc_mlx5_iface_t *ifa { int i; - if (iface->tx.fc_grants) { + if (kh_size(&iface->tx.fc_hash) != 0) { /* If some ep is waiting for grant it may have some pending * operations, while all QP resources are available. 
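
/*
 * This patch replaces the per-endpoint "waiting for grant" flag with a hash
 * (tx.fc_hash) that maps an endpoint to the grant sequence number it expects;
 * flush checks that the hash is empty, and the grant handler below consults
 * and removes entries. A standalone sketch of that bookkeeping with klib's
 * khash.h (assumed available; the "fc" map name and the values are
 * illustrative):
 */
#include <stdint.h>
#include <stdio.h>
#include "khash.h"

KHASH_MAP_INIT_INT64(fc, uint64_t) /* key: ep pointer, value: expected seq */

int main(void)
{
    khash_t(fc) *h = kh_init(fc);
    uint64_t ep    = 0x1000; /* stand-in for an endpoint pointer */
    uint64_t seq   = 42;     /* sequence number sent with the grant request */
    khiter_t it;
    int ret;

    /* on sending a grant request: remember the expected sequence number */
    it = kh_put(fc, h, ep, &ret);
    kh_value(h, it) = seq;

    /* on PURE_GRANT arrival: accept only if ep is known and seq matches */
    it = kh_get(fc, h, ep);
    if ((it != kh_end(h)) && (kh_value(h, it) == seq)) {
        kh_del(fc, h, it); /* grant consumed; a pending flush may complete */
    }

    /* flush completes only when no grants are outstanding */
    printf("pending grants: %u\n", (unsigned)kh_size(h));
    kh_destroy(fc, h);
    return 0;
}
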
*/ return UCS_INPROGRESS; } - for (i = 0; i < iface->tx.ndci; i++) { + for (i = 0; i < iface->tx.ndci * iface->tx.num_dci_pools; i++) { if (uct_dc_mlx5_iface_flush_dci(iface, i) != UCS_OK) { return UCS_INPROGRESS; } @@ -948,7 +996,8 @@ ucs_status_t uct_dc_mlx5_iface_init_fc_ep(uct_dc_mlx5_iface_t *iface) goto err_free; } - status = uct_dc_mlx5_ep_basic_init(iface, ep); + ep->flags = 0; + status = uct_dc_mlx5_ep_basic_init(iface, ep); if (status != UCS_OK) { ucs_error("FC ep init failed %s", ucs_status_string(status)); goto err_cleanup; @@ -997,11 +1046,16 @@ ucs_status_t uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface, unsigned qp_ uint32_t imm_data, uint16_t lid, unsigned flags) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(rc_iface, uct_dc_mlx5_iface_t); - uint8_t fc_hdr = uct_rc_fc_get_fc_hdr(hdr->am_id); + uint8_t fc_hdr = uct_rc_fc_get_fc_hdr(hdr->am_id); + uct_dc_fc_sender_data_t *sender; uct_dc_fc_request_t *dc_req; - int16_t cur_wnd; - ucs_status_t status; - uct_dc_mlx5_ep_t *ep; + int16_t cur_wnd; + ucs_status_t status; + uct_dc_mlx5_ep_t *ep; + khiter_t it; + ucs_arbiter_t *waitq; + ucs_arbiter_group_t *group; + uint8_t pool_index; ucs_assert(rc_iface->config.fc_enabled); @@ -1031,22 +1085,25 @@ ucs_status_t uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface, unsigned qp_ ucs_status_string(status)); } } else if (fc_hdr == UCT_RC_EP_FC_PURE_GRANT) { - ep = *((uct_dc_mlx5_ep_t**)(hdr + 1)); - - if (!(ep->flags & UCT_DC_MLX5_EP_FLAG_VALID)) { - /* Just remove ep now, no need to clear waiting for grant state - * (it was done in destroy_ep func) */ - uct_dc_mlx5_ep_release(ep); + sender = (uct_dc_fc_sender_data_t *)(hdr + 1); + + it = kh_get(uct_dc_mlx5_fc_hash, &iface->tx.fc_hash, sender->ep); + if ((it == kh_end(&iface->tx.fc_hash)) || + (kh_value(&iface->tx.fc_hash, it) != sender->payload.seq)) { + /* Just ignore: either the ep was removed and we are not expecting + * a grant on this EP, or this is not the grant sequence number we + * are expecting. */ return UCS_OK; } + ep = (uct_dc_mlx5_ep_t *)sender->ep; cur_wnd = ep->fc.fc_wnd; /* Peer granted resources, so update wnd */ ep->fc.fc_wnd = rc_iface->config.fc_wnd_size; - /* Clear the flag for flush to complete */ - uct_dc_mlx5_ep_clear_fc_grant_flag(iface, ep); + /* Remove entry for flush to complete */ + kh_del(uct_dc_mlx5_fc_hash, &iface->tx.fc_hash, it); UCS_STATS_UPDATE_COUNTER(ep->fc.stats, UCT_RC_FC_STAT_RX_PURE_GRANT, 1); UCS_STATS_SET_COUNTER(ep->fc.stats, UCT_RC_FC_STAT_FC_WND, ep->fc.fc_wnd); @@ -1054,17 +1111,10 @@ ucs_status_t uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface, unsigned qp_ /* To preserve ordering we have to dispatch all pending * operations if current fc_wnd is <= 0 */ if (cur_wnd <= 0) { - if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) { - ucs_arbiter_group_schedule(uct_dc_mlx5_iface_dci_waitq(iface), - &ep->arb_group); - } else { - /* Need to schedule fake ep in TX arbiter, because it - * might have been descheduled due to lack of FC window.
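
/*
 * A generic sketch (invented names) of the window rule the grant handler
 * above implements: each send consumes one credit; when a pure grant refills
 * the window and the window had been exhausted (cur_wnd <= 0), the endpoint's
 * pending queue must be rescheduled so queued operations are dispatched in
 * order rather than starved.
 */
typedef struct {
    int fc_wnd; /* remaining send credits */
} fc_ep_t;

static int try_send(fc_ep_t *ep)
{
    if (ep->fc_wnd <= 0) {
        return -1; /* caller queues the operation on the pending list */
    }
    ep->fc_wnd--;
    /* ... post the operation to the wire ... */
    return 0;
}

static void on_pure_grant(fc_ep_t *ep, int wnd_size)
{
    int cur_wnd = ep->fc_wnd;

    ep->fc_wnd = wnd_size;
    if (cur_wnd <= 0) {
        /* the window was closed: reschedule the ep's pending queue, e.g.
         * dispatch_pending(ep), before returning to the progress loop */
    }
}
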
*/ - ucs_arbiter_group_schedule(uct_dc_mlx5_iface_tx_waitq(iface), - uct_dc_mlx5_ep_arb_group(iface, ep)); - } - - uct_dc_mlx5_iface_progress_pending(iface); + uct_dc_mlx5_get_arbiter_params(iface, ep, &waitq, &group, + &pool_index); + ucs_arbiter_group_schedule(waitq, group); + uct_dc_mlx5_iface_progress_pending(iface, pool_index); } } @@ -1073,7 +1123,7 @@ ucs_status_t uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface, unsigned qp_ static void uct_dc_mlx5_dci_handle_failure(uct_dc_mlx5_iface_t *iface, struct mlx5_cqe64 *cqe, - uint8_t dci, + uint8_t dci_index, ucs_status_t status) { uct_dc_mlx5_ep_t *ep; @@ -1083,18 +1133,18 @@ static void uct_dc_mlx5_dci_handle_failure(uct_dc_mlx5_iface_t *iface, ep = NULL; level = UCS_LOG_LEVEL_FATAL; /* error handling is not supported with rand dci */ } else { - ep = uct_dc_mlx5_ep_from_dci(iface, dci); + ep = uct_dc_mlx5_ep_from_dci(iface, dci_index); level = iface->super.super.super.super.config.failure_level; } if (ep == NULL) { uct_ib_mlx5_completion_with_err(&iface->super.super.super, (uct_ib_mlx5_err_cqe_t*)cqe, - &iface->tx.dcis[dci].txwq, level); + &iface->tx.dcis[dci_index].txwq, level); return; } - ep = uct_dc_mlx5_ep_from_dci(iface, dci); + ep = uct_dc_mlx5_ep_from_dci(iface, dci_index); uct_dc_mlx5_ep_handle_failure(ep, cqe, status); } @@ -1102,23 +1152,41 @@ static void uct_dc_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface, void *arg, ucs_status_t status) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_dc_mlx5_iface_t); - struct mlx5_cqe64 *cqe = arg; - uint32_t qp_num = ntohl(cqe->sop_drop_qpn) & - UCS_MASK(UCT_IB_QPN_ORDER); - uint8_t dci = uct_dc_mlx5_iface_dci_find(iface, qp_num); + struct mlx5_cqe64 *cqe = arg; + uint8_t dci_index = uct_dc_mlx5_iface_dci_find(iface, cqe); - iface->tx.dcis[dci].ops->handle_failure(iface, cqe, dci, status); + if (uct_dc_mlx5_iface_is_dci_keepalive(iface, dci_index)) { + uct_dc_mlx5_dci_keepalive_handle_failure(iface, cqe, dci_index, status); + } else { + uct_dc_mlx5_dci_handle_failure(iface, cqe, dci_index, status); + } } static uct_rc_iface_ops_t uct_dc_mlx5_iface_ops = { - { - { + .super = { + .super = { + .iface_estimate_perf = uct_base_iface_estimate_perf, + .iface_vfs_refresh = uct_dc_mlx5_iface_vfs_refresh, + }, + .create_cq = uct_ib_mlx5_create_cq, + .arm_cq = uct_rc_mlx5_iface_common_arm_cq, + .event_cq = uct_rc_mlx5_iface_common_event_cq, + .handle_failure = uct_dc_mlx5_iface_handle_failure, + }, + .init_rx = uct_dc_mlx5_init_rx, + .cleanup_rx = uct_dc_mlx5_cleanup_rx, + .fc_ctrl = uct_dc_mlx5_ep_fc_ctrl, + .fc_handler = uct_dc_mlx5_iface_fc_handler, +}; + +static uct_iface_ops_t uct_dc_mlx5_iface_tl_ops = { .ep_put_short = uct_dc_mlx5_ep_put_short, .ep_put_bcopy = uct_dc_mlx5_ep_put_bcopy, .ep_put_zcopy = uct_dc_mlx5_ep_put_zcopy, .ep_get_bcopy = uct_dc_mlx5_ep_get_bcopy, .ep_get_zcopy = uct_dc_mlx5_ep_get_zcopy, .ep_am_short = uct_dc_mlx5_ep_am_short, + .ep_am_short_iov = uct_dc_mlx5_ep_am_short_iov, .ep_am_bcopy = uct_dc_mlx5_ep_am_bcopy, .ep_am_zcopy = uct_dc_mlx5_ep_am_zcopy, .ep_atomic_cswap64 = uct_dc_mlx5_ep_atomic_cswap64, @@ -1131,6 +1199,7 @@ static uct_rc_iface_ops_t uct_dc_mlx5_iface_ops = { .ep_pending_purge = uct_dc_mlx5_ep_pending_purge, .ep_flush = uct_dc_mlx5_ep_flush, .ep_fence = uct_dc_mlx5_ep_fence, + .ep_check = uct_dc_mlx5_ep_check, #if IBV_HW_TM .ep_tag_eager_short = uct_dc_mlx5_ep_tag_eager_short, .ep_tag_eager_bcopy = uct_dc_mlx5_ep_tag_eager_bcopy, @@ -1149,23 +1218,12 @@ static uct_rc_iface_ops_t uct_dc_mlx5_iface_ops = { .iface_event_fd_get = 
uct_ib_iface_event_fd_get, .iface_event_arm = uct_rc_iface_event_arm, .ep_create = uct_dc_mlx5_ep_create_connected, - .ep_destroy = uct_dc_mlx5_ep_destroy, + .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_dc_mlx5_ep_t), .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_dc_mlx5_iface_t), .iface_query = uct_dc_mlx5_iface_query, .iface_get_device_address = uct_ib_iface_get_device_address, .iface_is_reachable = uct_dc_mlx5_iface_is_reachable, .iface_get_address = uct_dc_mlx5_iface_get_address, - }, - .create_cq = uct_ib_mlx5_create_cq, - .arm_cq = uct_rc_mlx5_iface_common_arm_cq, - .event_cq = uct_rc_mlx5_iface_common_event_cq, - .handle_failure = uct_dc_mlx5_iface_handle_failure, - .set_ep_failed = uct_dc_mlx5_ep_set_failed, - }, - .init_rx = uct_dc_mlx5_init_rx, - .cleanup_rx = uct_dc_mlx5_cleanup_rx, - .fc_ctrl = uct_dc_mlx5_ep_fc_ctrl, - .fc_handler = uct_dc_mlx5_iface_fc_handler, }; static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h tl_md, uct_worker_h worker, @@ -1177,6 +1235,8 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h tl_md, uct_worker_h wor uct_ib_mlx5_md_t *md = ucs_derived_of(tl_md, uct_ib_mlx5_md_t); uct_ib_iface_init_attr_t init_attr = {}; ucs_status_t status; + unsigned tx_cq_size; + ucs_trace_func(""); if (config->ndci < 1) { @@ -1185,72 +1245,87 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h tl_md, uct_worker_h wor return UCS_ERR_INVALID_PARAM; } - if (config->ndci > UCT_DC_MLX5_IFACE_MAX_DCIS) { + if (config->ndci > UCT_DC_MLX5_IFACE_MAX_USER_DCIS) { ucs_error("dc interface can have at most %d dcis (requested: %d)", - UCT_DC_MLX5_IFACE_MAX_DCIS, config->ndci); + UCT_DC_MLX5_IFACE_MAX_USER_DCIS, config->ndci); return UCS_ERR_INVALID_PARAM; } init_attr.qp_type = UCT_IB_QPT_DCI; - init_attr.flags = UCT_IB_CQ_IGNORE_OVERRUN; + init_attr.flags = UCT_IB_CQ_IGNORE_OVERRUN | + UCT_IB_TX_OPS_PER_PATH; init_attr.fc_req_size = sizeof(uct_dc_fc_request_t); - init_attr.rx_hdr_len = sizeof(uct_rc_mlx5_hdr_t); if (md->flags & UCT_IB_MLX5_MD_FLAG_DC_TM) { init_attr.flags |= UCT_IB_TM_SUPPORTED; } - /* driver will round up to pow of 2 if needed */ init_attr.cq_len[UCT_IB_DIR_TX] = config->super.super.tx.queue_len * - UCT_IB_MLX5_MAX_BB * config->ndci; + UCT_IB_MLX5_MAX_BB * + (config->ndci + UCT_DC_MLX5_KEEPALIVE_NUM_DCIS); /* TODO check caps instead */ - if (ucs_roundup_pow2(init_attr.cq_len[UCT_IB_DIR_TX]) > UCT_DC_MLX5_MAX_TX_CQ_LEN) { + UCS_CLASS_CALL_SUPER_INIT(uct_rc_mlx5_iface_common_t, + &uct_dc_mlx5_iface_ops, &uct_dc_mlx5_iface_tl_ops, + tl_md, worker, params, &config->super, + &config->rc_mlx5_common, &init_attr); + + tx_cq_size = uct_ib_cq_size(&self->super.super.super, &init_attr, + UCT_IB_DIR_TX); + + /* driver will round up num cqes to pow of 2 if needed */ + if (ucs_roundup_pow2(tx_cq_size) > UCT_DC_MLX5_MAX_TX_CQ_LEN) { ucs_error("Can't allocate TX resources, try to decrease dcis number (%d)" " or tx qp length (%d)", config->ndci, config->super.super.tx.queue_len); return UCS_ERR_INVALID_PARAM; } - UCS_CLASS_CALL_SUPER_INIT(uct_rc_mlx5_iface_common_t, - &uct_dc_mlx5_iface_ops, - tl_md, worker, params, &config->super, - &config->rc_mlx5_common, &init_attr); - uct_dc_mlx5_iface_init_version(self, tl_md); self->tx.ndci = config->ndci; self->tx.policy = (uct_dc_tx_policy_t)config->tx_policy; - self->tx.fc_grants = 0; + self->tx.fc_seq = 0; + self->keepalive_dci = -1; + self->tx.num_dci_pools = 1; self->super.super.config.tx_moderation = 0; /* disable tx moderation for dcs */ - ucs_list_head_init(&self->tx.gc_list); + self->flags = 0; + 
kh_init_inplace(uct_dc_mlx5_fc_hash, &self->tx.fc_hash); self->tx.rand_seed = config->rand_seed ? config->rand_seed : time(NULL); self->tx.pend_cb = uct_dc_mlx5_iface_is_dci_rand(self) ? uct_dc_mlx5_iface_dci_do_rand_pending_tx : uct_dc_mlx5_iface_dci_do_dcs_pending_tx; + if (ucs_test_all_flags(md->flags, UCT_IB_MLX5_MD_FLAG_DEVX_DCI | + UCT_IB_MLX5_MD_FLAG_CQE_V1)) { + self->flags |= UCT_DC_MLX5_IFACE_FLAG_UIDX; + self->tx.num_dci_pools = self->super.super.super.num_paths; + } + if (config->dci_ka_full_handshake) { + self->flags |= UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE_FULL_HANDSHAKE; + } + ucs_assert(self->tx.num_dci_pools <= UCT_DC_MLX5_IFACE_MAX_DCI_POOLS); + /* create DC target */ - status = uct_dc_mlx5_iface_create_dct(self); + status = uct_dc_mlx5_iface_create_dct(self, config); if (status != UCS_OK) { goto err; } /* create DC initiators */ - status = uct_dc_mlx5_iface_create_dcis(self); + status = uct_dc_mlx5_iface_create_dcis(self, config); if (status != UCS_OK) { goto err_destroy_dct; } ucs_debug("dc iface %p: using '%s' policy with %d dcis and %d cqes, dct 0x%x", self, uct_dc_tx_policy_names[self->tx.policy], self->tx.ndci, - init_attr.cq_len[UCT_IB_DIR_TX], UCT_RC_MLX5_TM_ENABLED(&self->super) ? + tx_cq_size, UCT_RC_MLX5_TM_ENABLED(&self->super) ? 0 : self->rx.dct.qp_num); /* Create fake endpoint which will be used for sending FC grants */ uct_dc_mlx5_iface_init_fc_ep(self); - ucs_arbiter_init(&self->tx.dci_arbiter); - /* mlx5 init part */ status = uct_ud_mlx5_iface_common_init(&self->super.super.super, &self->ud_common, &config->mlx5_ud); @@ -1258,8 +1333,7 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h tl_md, uct_worker_h wor goto err_destroy_dct; } - self->tx.available_quota = self->super.super.config.tx_qp_len - - ucs_min(self->super.super.config.tx_qp_len, config->quota); + uct_dc_mlx5_iface_set_quota(self, config); uct_rc_mlx5_iface_common_prepost_recvs(&self->super); @@ -1275,7 +1349,7 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h tl_md, uct_worker_h wor static UCS_CLASS_CLEANUP_FUNC(uct_dc_mlx5_iface_t) { - uct_dc_mlx5_ep_t *ep, *tmp; + int pool_index; ucs_trace_func(""); uct_base_iface_progress_disable(&self->super.super.super.super.super, @@ -1283,13 +1357,12 @@ static UCS_CLASS_CLEANUP_FUNC(uct_dc_mlx5_iface_t) uct_dc_mlx5_iface_cleanup_dcis(self); uct_dc_mlx5_destroy_dct(self); - - ucs_list_for_each_safe(ep, tmp, &self->tx.gc_list, list) { - uct_dc_mlx5_ep_release(ep); - } - uct_dc_mlx5_iface_dcis_destroy(self, self->tx.ndci); + kh_destroy_inplace(uct_dc_mlx5_fc_hash, &self->tx.fc_hash); + uct_dc_mlx5_iface_dcis_destroy(self, uct_dc_mlx5_iface_total_ndci(self)); uct_dc_mlx5_iface_cleanup_fc_ep(self); - ucs_arbiter_cleanup(&self->tx.dci_arbiter); + for (pool_index = 0; pool_index < self->tx.num_dci_pools; pool_index++) { + ucs_arbiter_cleanup(&self->tx.dci_pool[pool_index].arbiter); + } } UCS_CLASS_DEFINE(uct_dc_mlx5_iface_t, uct_rc_mlx5_iface_common_t); @@ -1316,3 +1389,151 @@ uct_dc_mlx5_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_ UCT_TL_DEFINE(&uct_ib_component, dc_mlx5, uct_dc_mlx5_query_tl_devices, uct_dc_mlx5_iface_t, "DC_MLX5_", uct_dc_mlx5_iface_config_table, uct_dc_mlx5_iface_config_t); + +static void +uct_dc_mlx5_dci_keepalive_handle_failure(uct_dc_mlx5_iface_t *iface, + struct mlx5_cqe64 *cqe, + uint8_t dci_index, + ucs_status_t ep_status) +{ + uint16_t hw_ci = ntohs(cqe->wqe_counter); + uct_rc_txqp_t *txqp; + uct_ib_mlx5_txwq_t *txwq; + uct_dc_mlx5_ep_t *ep; + uct_rc_iface_send_op_t *op; + ucs_queue_elem_t 
*elem; + + ucs_assert(dci_index == iface->keepalive_dci); + UCT_DC_MLX5_IFACE_TXQP_DCI_GET(iface, dci_index, txqp, txwq); + + elem = ucs_queue_pull(&txqp->outstanding); + if (elem == NULL) { + /* outstanding list is empty, just exit */ + goto reset_dci; + } + + op = ucs_container_of(elem, uct_rc_iface_send_op_t, queue); + if (hw_ci != op->sn) { + goto put_op; + } + + ep = ucs_derived_of(op->ep, uct_dc_mlx5_ep_t); + + if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) { + ucs_assert(ep != iface->tx.fc_ep); + uct_dc_mlx5_iface_set_ep_failed(iface, ep, cqe, txwq, ep_status); + } else { + /* ep has another dci assigned to post operations, which should be + * restarted too */ + uct_dc_mlx5_ep_handle_failure(ep, cqe, ep_status); + } + +put_op: + ucs_mpool_put(op); + +reset_dci: + uct_rc_txqp_available_set(txqp, iface->tx.bb_max); + uct_rc_txqp_purge_outstanding(&iface->super.super, txqp, ep_status, + txwq->sw_pi, 0); + uct_dc_mlx5_iface_reset_dci(iface, dci_index); +} + +ucs_status_t uct_dc_mlx5_iface_keepalive_init(uct_dc_mlx5_iface_t *iface) +{ + int full_handshake = iface->flags & + UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE_FULL_HANDSHAKE; + ucs_status_t status; + uint8_t dci_index; + + if (ucs_likely(iface->flags & UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE)) { + return UCS_OK; + } + + dci_index = uct_dc_mlx5_iface_total_ndci(iface); + status = uct_dc_mlx5_iface_create_dci(iface, 0, dci_index, 0, + full_handshake); + if (status != UCS_OK) { + return status; + } + + iface->flags |= UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE; + iface->keepalive_dci = dci_index; + return UCS_OK; +} + +void uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface, uint8_t dci_index) +{ + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md, + uct_ib_mlx5_md_t); + uct_ib_mlx5_txwq_t *txwq = &iface->tx.dcis[dci_index].txwq; + ucs_status_t status; + + ucs_debug("iface %p reset dci[%d] qpn 0x%x", iface, dci_index, + txwq->super.qp_num); + + ucs_assert(!uct_dc_mlx5_iface_dci_has_outstanding(iface, dci_index)); + + /* Synchronize CQ index with the driver, since it would remove pending + * completions for this QP (both send and receive) during ibv_destroy_qp().
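+ * After this sync, completions still queued for this QP number are scrubbed + * from the TX CQ below, then the work queue is reset and the DCI reconnected.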
+ */ + uct_rc_mlx5_iface_common_update_cqs_ci(&iface->super, + &iface->super.super.super); + status = uct_ib_mlx5_modify_qp_state(md, &txwq->super, IBV_QPS_RESET); + uct_rc_mlx5_iface_common_sync_cqs_ci(&iface->super, + &iface->super.super.super); + + uct_rc_mlx5_iface_commom_clean(&iface->super.cq[UCT_IB_DIR_TX], NULL, + txwq->super.qp_num); + + /* Resume posting from the beginning of the QP */ + uct_ib_mlx5_txwq_reset(txwq); + if (status != UCS_OK) { + ucs_fatal("iface %p failed to reset dci[%d] qpn 0x%x: %s", + iface, dci_index, txwq->super.qp_num, + ucs_status_string(status)); + } + + status = uct_dc_mlx5_iface_dci_connect(iface, &iface->tx.dcis[dci_index]); + if (status != UCS_OK) { + ucs_fatal("iface %p failed to connect dci[%d] qpn 0x%x: %s", + iface, dci_index, txwq->super.qp_num, + ucs_status_string(status)); + } +} + +void uct_dc_mlx5_iface_set_ep_failed(uct_dc_mlx5_iface_t *iface, + uct_dc_mlx5_ep_t *ep, + struct mlx5_cqe64 *cqe, + uct_ib_mlx5_txwq_t *txwq, + ucs_status_t ep_status) +{ + uct_ib_iface_t *ib_iface = &iface->super.super.super; + ucs_status_t status; + ucs_log_level_t log_lvl; + + if (ep->flags & (UCT_DC_MLX5_EP_FLAG_ERR_HANDLER_INVOKED | + UCT_DC_MLX5_EP_FLAG_FLUSH_CANCEL)) { + return; + } + + if (ep_status == UCS_ERR_CANCELED) { + return; + } + + if (ep == iface->tx.fc_ep) { + /* Do not report errors on flow control endpoint */ + ucs_debug("got error on DC flow-control endpoint, iface %p: %s", iface, + ucs_status_string(ep_status)); + return; + } + + status = uct_iface_handle_ep_err(&ib_iface->super.super, + &ep->super.super, ep_status); + log_lvl = uct_base_iface_failure_log_level(&ib_iface->super, status, + ep_status); + uct_ib_mlx5_completion_with_err(ib_iface, (uct_ib_mlx5_err_cqe_t*)cqe, + txwq, log_lvl); + + ep->flags |= UCT_DC_MLX5_EP_FLAG_ERR_HANDLER_INVOKED; +} + diff --git a/src/uct/ib/dc/dc_mlx5.h b/src/uct/ib/dc/dc_mlx5.h index 490439d8c26..291e84b6349 100644 --- a/src/uct/ib/dc/dc_mlx5.h +++ b/src/uct/ib/dc/dc_mlx5.h @@ -36,11 +36,22 @@ struct ibv_ravh { # define UCT_DC_RNDV_HDR_LEN 0 #endif -#define UCT_DC_MLX5_IFACE_MAX_DCIS 16 +#define UCT_DC_MLX5_IFACE_MAX_USER_DCIS 15 +#define UCT_DC_MLX5_KEEPALIVE_NUM_DCIS 1 +#define UCT_DC_MLX5_IFACE_MAX_DCI_POOLS 8 +#define UCT_DC_MLX5_IFACE_MAX_DCIS ((UCT_DC_MLX5_IFACE_MAX_USER_DCIS * \ + UCT_DC_MLX5_IFACE_MAX_DCI_POOLS) + \ + UCT_DC_MLX5_KEEPALIVE_NUM_DCIS) #define UCT_DC_MLX5_IFACE_ADDR_TM_ENABLED(_addr) \ (!!((_addr)->flags & UCT_DC_MLX5_IFACE_ADDR_HW_TM)) +#define UCT_DC_MLX5_IFACE_TXQP_DCI_GET(_iface, _dci, _txqp, _txwq) \ + { \ + _txqp = &(_iface)->tx.dcis[_dci].txqp; \ + _txwq = &(_iface)->tx.dcis[_dci].txwq; \ + } + typedef struct uct_dc_mlx5_ep uct_dc_mlx5_ep_t; typedef struct uct_dc_mlx5_iface uct_dc_mlx5_iface_t; @@ -54,6 +65,18 @@ typedef enum { } uct_dc_mlx5_iface_addr_flags_t; +typedef enum { + /** Keepalive dci is created */ + UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE = UCS_BIT(0), + + /** Enable full handshake for keepalive DCI */ + UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE_FULL_HANDSHAKE = UCS_BIT(1), + + /** uidx is set to dci idx */ + UCT_DC_MLX5_IFACE_FLAG_UIDX = UCS_BIT(2) +} uct_dc_mlx5_iface_flags_t; + + typedef struct uct_dc_mlx5_iface_addr { uct_ib_uint24_t qp_num; uint8_t atomic_mr_id; @@ -107,6 +130,9 @@ typedef struct uct_dc_mlx5_iface_config { uct_ud_iface_common_config_t ud_common; int ndci; int tx_policy; + int dci_full_handshake; + int dci_ka_full_handshake; + int dct_full_handshake; unsigned quota; unsigned rand_seed; uct_ud_mlx5_iface_common_config_t mlx5_ud; @@ -115,22 +141,11 @@ typedef
struct uct_dc_mlx5_iface_config { typedef void (*uct_dc_dci_handle_failure_func_t)(uct_dc_mlx5_iface_t *iface, struct mlx5_cqe64 *cqe, - uint8_t dci, + uint8_t dci_index, ucs_status_t status); -/** - * DCI QP-specific operations. - */ -typedef struct uct_dc_dci_ops { - uct_dc_dci_handle_failure_func_t handle_failure; /* callback for handling - * completion with error - * on the DCI */ -} uct_dc_dci_ops_t; - - typedef struct uct_dc_dci { - uct_dc_dci_ops_t *ops; /* DCI operations */ uct_rc_txqp_t txqp; /* DCI qp */ uct_ib_mlx5_txwq_t txwq; /* DCI mlx5 wq */ union { @@ -143,6 +158,8 @@ typedef struct uct_dc_dci { processed. Better have dci num groups scheduled than ep num. */ }; + uint8_t pool_index; /* DCI pool index. */ + uint8_t path_index; /* Path index */ #if UCS_ENABLE_ASSERT uint8_t flags; /* debug state, @ref uct_dc_dci_state_t */ #endif @@ -152,9 +169,10 @@ typedef struct uct_dc_dci { typedef struct uct_dc_fc_sender_data { uint64_t ep; struct { + uint64_t seq; int is_global; union ibv_gid gid; - } UCS_S_PACKED global; + } UCS_S_PACKED payload; } UCS_S_PACKED uct_dc_fc_sender_data_t; typedef struct uct_dc_fc_request { @@ -168,6 +186,17 @@ typedef struct uct_dc_fc_request { } uct_dc_fc_request_t; +KHASH_MAP_INIT_INT64(uct_dc_mlx5_fc_hash, uint64_t); + + +typedef struct { + uint8_t stack_top; /* dci stack top */ + uint8_t stack[UCT_DC_MLX5_IFACE_MAX_USER_DCIS]; /* LIFO of indexes of available dcis */ + ucs_arbiter_t arbiter; /* queue of requests + waiting for DCI */ +} uct_dc_mlx5_dci_pool_t; + + struct uct_dc_mlx5_iface { uct_rc_mlx5_iface_common_t super; struct { @@ -175,24 +204,25 @@ struct uct_dc_mlx5_iface { uct_dc_dci_t dcis[UCT_DC_MLX5_IFACE_MAX_DCIS]; uint8_t ndci; /* Number of DCIs */ - uct_dc_tx_policy_t policy; /* dci selection algorithm */ - int16_t available_quota; /* if available tx is lower, let - another endpoint use the dci */ /* LIFO is only relevant for dcs allocation policy */ - uint8_t stack_top; /* dci stack top */ - uint8_t dcis_stack[UCT_DC_MLX5_IFACE_MAX_DCIS]; /* LIFO of indexes of available dcis */ + uct_dc_mlx5_dci_pool_t dci_pool[UCT_DC_MLX5_IFACE_MAX_DCI_POOLS]; + uint8_t num_dci_pools; - ucs_arbiter_t dci_arbiter; + uint8_t policy; /* dci selection algorithm */ + int16_t available_quota; /* if available tx is lower, let + another endpoint use the dci */ + /* DCI max elements */ + unsigned bb_max; /* Used to send grant messages for all peers */ uct_dc_mlx5_ep_t *fc_ep; - /* List of destroyed endpoints waiting for credit grant */ - ucs_list_link_t gc_list; + /* Hash of expected FC grants */ + khash_t(uct_dc_mlx5_fc_hash) fc_hash; - /* Number of expected FC grants */ - unsigned fc_grants; + /* Sequence number of expected FC grants */ + uint64_t fc_seq; /* Seed used for random dci allocation */ unsigned rand_seed; @@ -206,13 +236,20 @@ struct uct_dc_mlx5_iface { uint8_t version_flag; + /* iface flags, see uct_dc_mlx5_iface_flags_t */ + uint8_t flags; + + uint8_t keepalive_dci; + uct_ud_mlx5_iface_common_t ud_common; }; extern ucs_config_field_t uct_dc_mlx5_iface_config_table[]; -ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface); +ucs_status_t +uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface, + const uct_dc_mlx5_iface_config_t *config); int uct_dc_mlx5_iface_is_reachable(const uct_iface_h tl_iface, const uct_device_addr_t *dev_addr, @@ -238,22 +275,35 @@ void uct_dc_mlx5_destroy_dct(uct_dc_mlx5_iface_t *iface); void uct_dc_mlx5_iface_init_version(uct_dc_mlx5_iface_t *iface, uct_md_h md); -ucs_status_t 
uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface, - uct_dc_dci_t *dci); - ucs_status_t uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface, uct_dc_dci_t *dci); void uct_dc_mlx5_iface_dcis_destroy(uct_dc_mlx5_iface_t *iface, int max); +ucs_status_t uct_dc_mlx5_iface_keepalive_init(uct_dc_mlx5_iface_t *iface); + +void uct_dc_mlx5_iface_set_ep_failed(uct_dc_mlx5_iface_t *iface, + uct_dc_mlx5_ep_t *ep, + struct mlx5_cqe64 *cqe, + uct_ib_mlx5_txwq_t *txwq, + ucs_status_t ep_status); + +ucs_status_t +uct_dc_mlx5_iface_create_dcis(uct_dc_mlx5_iface_t *iface, + const uct_dc_mlx5_iface_config_t *config); + +void uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface, uint8_t dci_index); + #if HAVE_DEVX -ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface); +ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface, + int full_handshake); ucs_status_t uct_dc_mlx5_iface_devx_set_srq_dc_params(uct_dc_mlx5_iface_t *iface); ucs_status_t uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface, - uct_ib_mlx5_qp_t *qp); + uct_ib_mlx5_qp_t *qp, + uint8_t path_index); #else @@ -269,9 +319,8 @@ uct_dc_mlx5_iface_devx_set_srq_dc_params(uct_dc_mlx5_iface_t *iface) { return UCS_ERR_UNSUPPORTED; } -static UCS_F_MAYBE_UNUSED ucs_status_t -uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface, - uct_ib_mlx5_qp_t *qp) +static UCS_F_MAYBE_UNUSED ucs_status_t uct_dc_mlx5_iface_devx_dci_connect( + uct_dc_mlx5_iface_t *iface, uct_ib_mlx5_qp_t *qp, uint8_t path_index) { return UCS_ERR_UNSUPPORTED; } @@ -288,22 +337,38 @@ uct_dc_mlx5_iface_fill_ravh(struct ibv_ravh *ravh, uint32_t dct_num) } #endif +static UCS_F_ALWAYS_INLINE uint8_t +uct_dc_mlx5_iface_total_ndci(uct_dc_mlx5_iface_t *iface) +{ + return (iface->tx.ndci * iface->tx.num_dci_pools) + + ((iface->flags & UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE) ? + UCT_DC_MLX5_KEEPALIVE_NUM_DCIS : 0); +} + /* TODO: * use a better search algorithm (perfect hash, bsearch, hash) ???
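* (when the UIDX flag is set, the dci index is taken directly from the CQE user index and no search is needed; see the fast path below)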
* * linear search is most probably the best way to go * because the number of dcis is usually small */ -static inline uint8_t uct_dc_mlx5_iface_dci_find(uct_dc_mlx5_iface_t *iface, uint32_t qp_num) +static UCS_F_ALWAYS_INLINE uint8_t +uct_dc_mlx5_iface_dci_find(uct_dc_mlx5_iface_t *iface, struct mlx5_cqe64 *cqe) { - uct_dc_dci_t *dcis = iface->tx.dcis; - int i, ndci = iface->tx.ndci; + uint32_t qp_num; + int i, ndci; + + if (ucs_likely(iface->flags & UCT_DC_MLX5_IFACE_FLAG_UIDX)) { + return cqe->srqn_uidx >> UCT_IB_UIDX_SHIFT; + } + qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER); + ndci = uct_dc_mlx5_iface_total_ndci(iface); for (i = 0; i < ndci; i++) { - if (dcis[i].txwq.super.qp_num == qp_num) { + if (iface->tx.dcis[i].txwq.super.qp_num == qp_num) { return i; } } + ucs_fatal("DCI (qpnum=%d) does not exist", qp_num); } @@ -314,44 +379,55 @@ uct_dc_mlx5_iface_has_tx_resources(uct_dc_mlx5_iface_t *iface) (iface->super.super.tx.reads_available > 0); } -static inline int uct_dc_mlx5_iface_dci_has_tx_resources(uct_dc_mlx5_iface_t *iface, uint8_t dci) +static UCS_F_ALWAYS_INLINE int +uct_dc_mlx5_iface_dci_has_tx_resources(uct_dc_mlx5_iface_t *iface, + uint8_t dci_index) { - return uct_rc_txqp_available(&iface->tx.dcis[dci].txqp) > 0; + return uct_rc_txqp_available(&iface->tx.dcis[dci_index].txqp) > 0; } /* returns pending queue of eps waiting for tx resources */ -static inline ucs_arbiter_t *uct_dc_mlx5_iface_tx_waitq(uct_dc_mlx5_iface_t *iface) +static UCS_F_ALWAYS_INLINE ucs_arbiter_t * +uct_dc_mlx5_iface_tx_waitq(uct_dc_mlx5_iface_t *iface) { - return &iface->tx.dci_arbiter; + return &iface->super.super.tx.arbiter; } /* returns pending queue of eps waiting for the dci allocation */ -static inline ucs_arbiter_t *uct_dc_mlx5_iface_dci_waitq(uct_dc_mlx5_iface_t *iface) +static UCS_F_ALWAYS_INLINE ucs_arbiter_t * +uct_dc_mlx5_iface_dci_waitq(uct_dc_mlx5_iface_t *iface, uint8_t pool_index) { - return &iface->super.super.tx.arbiter; + return &iface->tx.dci_pool[pool_index].arbiter; } -static inline int -uct_dc_mlx5_iface_dci_has_outstanding(uct_dc_mlx5_iface_t *iface, int dci) +static UCS_F_ALWAYS_INLINE int +uct_dc_mlx5_iface_dci_has_outstanding(uct_dc_mlx5_iface_t *iface, int dci_index) { uct_rc_txqp_t *txqp; - txqp = &iface->tx.dcis[dci].txqp; - return uct_rc_txqp_available(txqp) < (int16_t)iface->super.super.config.tx_qp_len; + txqp = &iface->tx.dcis[dci_index].txqp; + return uct_rc_txqp_available(txqp) < (int16_t)iface->tx.bb_max; } -static inline ucs_status_t uct_dc_mlx5_iface_flush_dci(uct_dc_mlx5_iface_t *iface, int dci) +static UCS_F_ALWAYS_INLINE ucs_status_t +uct_dc_mlx5_iface_flush_dci(uct_dc_mlx5_iface_t *iface, int dci_index) { - if (!uct_dc_mlx5_iface_dci_has_outstanding(iface, dci)) { + if (!uct_dc_mlx5_iface_dci_has_outstanding(iface, dci_index)) { return UCS_OK; } - ucs_trace_poll("dci %d is not flushed %d/%d", dci, - iface->tx.dcis[dci].txqp.available, - iface->super.super.config.tx_qp_len); - ucs_assertv(uct_rc_txqp_unsignaled(&iface->tx.dcis[dci].txqp) == 0, + + ucs_trace_poll("dci %d is not flushed %d/%d", dci_index, + iface->tx.dcis[dci_index].txqp.available, iface->tx.bb_max); + ucs_assertv(uct_rc_txqp_unsignaled(&iface->tx.dcis[dci_index].txqp) == 0, "unsignalled send is not supported!!!"); return UCS_INPROGRESS; } +static UCS_F_ALWAYS_INLINE int +uct_dc_mlx5_iface_is_dci_keepalive(uct_dc_mlx5_iface_t *iface, int dci_index) +{ + return dci_index == iface->keepalive_dci; +} + #endif diff --git a/src/uct/ib/dc/dc_mlx5.inl b/src/uct/ib/dc/dc_mlx5.inl new file 
mode 100644 index 00000000000..5eb6decc883 --- /dev/null +++ b/src/uct/ib/dc/dc_mlx5.inl @@ -0,0 +1,41 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. +* +* See file LICENSE for terms. +*/ + +#include "dc_mlx5.h" +#include "dc_mlx5_ep.h" + + +#include +#include "uct/ib/rc/base/rc_iface.h" +#include "uct/ib/rc/base/rc_ep.h" + + +static UCS_F_ALWAYS_INLINE void +uct_dc_mlx5_update_tx_res(uct_dc_mlx5_iface_t *iface, uct_ib_mlx5_txwq_t *txwq, + uct_rc_txqp_t *txqp, uint16_t hw_ci) +{ + uct_rc_txqp_available_set(txqp, uct_ib_mlx5_txwq_update_bb(txwq, hw_ci)); + ucs_assert(uct_rc_txqp_available(txqp) <= txwq->bb_max); + + uct_rc_iface_update_reads(&iface->super.super); +} + +static UCS_F_ALWAYS_INLINE void +uct_dc_mlx5_get_arbiter_params(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep, + ucs_arbiter_t **waitq_p, + ucs_arbiter_group_t **group_p, + uint8_t *pool_index_p) +{ + *pool_index_p = uct_dc_mlx5_ep_pool_index(ep); + + if (ep->dci != UCT_DC_MLX5_EP_NO_DCI) { + *waitq_p = uct_dc_mlx5_iface_tx_waitq(iface); + *group_p = uct_dc_mlx5_ep_arb_group(iface, ep); + } else { + *waitq_p = uct_dc_mlx5_iface_dci_waitq(iface, *pool_index_p); + *group_p = &ep->arb_group; + } +} diff --git a/src/uct/ib/dc/dc_mlx5_devx.c b/src/uct/ib/dc/dc_mlx5_devx.c index ad65f222cfb..9f8fe766858 100644 --- a/src/uct/ib/dc/dc_mlx5_devx.c +++ b/src/uct/ib/dc/dc_mlx5_devx.c @@ -15,7 +15,8 @@ #include -ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface) +ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface, + int full_handshake) { uct_ib_device_t *dev = uct_ib_iface_device(&iface->super.super.super); struct mlx5dv_pd dvpd = {}; @@ -47,6 +48,7 @@ ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface) UCT_IB_MLX5DV_SET(dctc, dctc, rre, true); UCT_IB_MLX5DV_SET(dctc, dctc, rwe, true); UCT_IB_MLX5DV_SET(dctc, dctc, rae, true); + UCT_IB_MLX5DV_SET(dctc, dctc, force_full_handshake, !!full_handshake); UCT_IB_MLX5DV_SET(dctc, dctc, cs_res, uct_ib_mlx5_qpc_cs_res( iface->super.super.super.config.max_inl_cqe[UCT_IB_DIR_RX], 1)); UCT_IB_MLX5DV_SET(dctc, dctc, atomic_mode, UCT_IB_MLX5_ATOMIC_MODE); @@ -72,16 +74,19 @@ ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface) return UCS_OK; } -ucs_status_t -uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface, - uct_ib_mlx5_qp_t *qp) +ucs_status_t uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface, + uct_ib_mlx5_qp_t *qp, + uint8_t path_index) { + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md, + uct_ib_mlx5_md_t); char in_2init[UCT_IB_MLX5DV_ST_SZ_BYTES(rst2init_qp_in)] = {}; char out_2init[UCT_IB_MLX5DV_ST_SZ_BYTES(rst2init_qp_out)] = {}; char in_2rtr[UCT_IB_MLX5DV_ST_SZ_BYTES(init2rtr_qp_in)] = {}; char out_2rtr[UCT_IB_MLX5DV_ST_SZ_BYTES(init2rtr_qp_out)] = {}; char in_2rts[UCT_IB_MLX5DV_ST_SZ_BYTES(rtr2rts_qp_in)] = {}; char out_2rts[UCT_IB_MLX5DV_ST_SZ_BYTES(rtr2rts_qp_out)] = {}; + uint32_t opt_param_mask = UCT_IB_MLX5_QP_OPTPAR_RAE; ucs_status_t status; void *qpc; @@ -101,7 +106,6 @@ uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface, UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, opcode, UCT_IB_MLX5_CMD_OP_INIT2RTR_QP); UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, qpn, qp->qp_num); - UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, opt_param_mask, 4); qpc = UCT_IB_MLX5DV_ADDR_OF(init2rtr_qp_in, in_2rtr, qpc); UCT_IB_MLX5DV_SET(qpc, qpc, pm_state, UCT_IB_MLX5_QPC_PM_STATE_MIGRATED); @@ -112,11 +116,14 @@ 
uct_dc_mlx5_iface_devx_dci_connect(uct_dc_mlx5_iface_t *iface, if (uct_ib_iface_is_roce(&iface->super.super.super)) { UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.eth_prio, iface->super.super.super.config.sl); + uct_ib_mlx5_devx_set_qpc_port_affinity(md, path_index, qpc, + &opt_param_mask); } else { UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.sl, iface->super.super.super.config.sl); } + UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, opt_param_mask, opt_param_mask); status = uct_ib_mlx5_devx_modify_qp(qp, in_2rtr, sizeof(in_2rtr), out_2rtr, sizeof(out_2rtr)); if (status != UCS_OK) { diff --git a/src/uct/ib/dc/dc_mlx5_ep.c b/src/uct/ib/dc/dc_mlx5_ep.c index 7c7beeaebee..6d6af0d8e5e 100644 --- a/src/uct/ib/dc/dc_mlx5_ep.c +++ b/src/uct/ib/dc/dc_mlx5_ep.c @@ -8,19 +8,14 @@ # include "config.h" #endif +#include "dc_mlx5.inl" #include "dc_mlx5_ep.h" #include "dc_mlx5.h" -#include #include #define UCT_DC_MLX5_IFACE_TXQP_GET(_iface, _ep, _txqp, _txwq) \ -{ \ - uint8_t dci; \ - dci = (_ep)->dci; \ - _txqp = &(_iface)->tx.dcis[dci].txqp; \ - _txwq = &(_iface)->tx.dcis[dci].txwq; \ -} + UCT_DC_MLX5_IFACE_TXQP_DCI_GET(_iface, (_ep)->dci, _txqp, _txwq) static UCS_F_ALWAYS_INLINE void uct_dc_mlx5_iface_bcopy_post(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep, @@ -33,18 +28,15 @@ uct_dc_mlx5_iface_bcopy_post(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep, UCT_DC_MLX5_TXQP_DECL(txqp, txwq); UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); - desc->super.sn = txwq->sw_pi; - uct_rc_mlx5_txqp_dptr_post(&iface->super, UCT_IB_QPT_DCI, txqp, txwq, - opcode, buffer, length, &desc->lkey, - rdma_raddr, rdma_rkey, 0, 0, 0, 0, - &ep->av, uct_dc_mlx5_ep_get_grh(ep), - uct_ib_mlx5_wqe_av_size(&ep->av), - MLX5_WQE_CTRL_CQ_UPDATE | send_flags, imm_val_be, INT_MAX, - log_sge); - uct_rc_txqp_add_send_op(txqp, &desc->super); + uct_rc_mlx5_common_txqp_bcopy_post(&iface->super, UCT_IB_QPT_DCI, txqp, + txwq, opcode, length, rdma_raddr, + rdma_rkey, &ep->av, + uct_dc_mlx5_ep_get_grh(ep), + uct_ib_mlx5_wqe_av_size(&ep->av), + MLX5_WQE_CTRL_CQ_UPDATE | send_flags, + imm_val_be, desc, buffer, log_sge); } - static UCS_F_ALWAYS_INLINE void uct_dc_mlx5_iface_zcopy_post(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep, unsigned opcode, const uct_iov_t *iov, @@ -245,7 +237,8 @@ uct_dc_mlx5_ep_am_short_inline(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); UCT_DC_MLX5_TXQP_DECL(txqp, txwq); - UCT_RC_MLX5_CHECK_AM_SHORT(id, length, UCT_IB_MLX5_AV_FULL_SIZE); + UCT_RC_MLX5_CHECK_AM_SHORT(id, uct_rc_mlx5_am_short_hdr_t, length, + UCT_IB_MLX5_AV_FULL_SIZE); UCT_DC_CHECK_RES_AND_FC(iface, ep); UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); @@ -264,34 +257,28 @@ uct_dc_mlx5_ep_am_short_inline(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, return UCS_OK; } -#if HAVE_IBV_DM -static ucs_status_t UCS_F_ALWAYS_INLINE -uct_dc_mlx5_ep_short_dm(uct_dc_mlx5_ep_t *ep, uct_rc_mlx5_dm_copy_data_t *cache, - size_t hdr_len, const void *payload, unsigned length, - unsigned opcode, uint8_t fm_ce_se, - uint64_t rdma_raddr, uct_rkey_t rdma_rkey) +static ucs_status_t UCS_F_ALWAYS_INLINE uct_dc_mlx5_ep_am_short_iov_inline( + uct_ep_h tl_ep, uint8_t id, const uct_iov_t *iov, size_t iovcnt, + size_t iov_length) { - uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_dc_mlx5_iface_t); - uct_rc_iface_send_desc_t *desc = NULL; - void *buffer; - ucs_status_t status; - uct_ib_log_sge_t log_sge; + uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, + uct_dc_mlx5_iface_t); + 
uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + UCT_DC_MLX5_TXQP_DECL(txqp, txwq); - status = uct_rc_mlx5_common_dm_make_data(&iface->super, cache, hdr_len, - payload, length, &desc, - &buffer, &log_sge); - if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) { - return status; - } + UCT_RC_MLX5_CHECK_AM_SHORT(id, uct_rc_mlx5_hdr_t, iov_length, + UCT_IB_MLX5_AV_FULL_SIZE); + UCT_DC_CHECK_RES_AND_FC(iface, ep); + UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); + uct_rc_mlx5_txqp_inline_iov_post(&iface->super, UCT_IB_QPT_DCI, txqp, txwq, + iov, iovcnt, iov_length, id, &ep->av, + uct_dc_mlx5_ep_get_grh(ep), + uct_ib_mlx5_wqe_av_size(&ep->av)); + UCT_RC_UPDATE_FC_WND(&iface->super.super, &ep->fc); + UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, iov_length); - uct_dc_mlx5_iface_bcopy_post(iface, ep, opcode, - hdr_len + length, - rdma_raddr, rdma_rkey, - desc, fm_ce_se, 0, buffer, - log_sge.num_sge ? &log_sge : NULL); return UCS_OK; } -#endif ucs_status_t uct_dc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, const void *buffer, unsigned length) @@ -301,6 +288,7 @@ ucs_status_t uct_dc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); ucs_status_t status; uct_rc_mlx5_dm_copy_data_t cache; + UCT_DC_MLX5_TXQP_DECL(txqp, txwq); if (ucs_likely((sizeof(uct_rc_mlx5_am_short_hdr_t) + length <= UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE)) || @@ -318,10 +306,13 @@ ucs_status_t uct_dc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, uct_rc_mlx5_am_hdr_fill(&cache.am_hdr.rc_hdr, id); cache.am_hdr.am_hdr = hdr; - status = uct_dc_mlx5_ep_short_dm(ep, &cache, sizeof(cache.am_hdr), buffer, length, - MLX5_OPCODE_SEND, - MLX5_WQE_CTRL_SOLICITED | MLX5_WQE_CTRL_CQ_UPDATE, - 0, 0); + UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); + status = uct_rc_mlx5_common_ep_short_dm( + &iface->super, UCT_IB_QPT_DCI, &cache, sizeof(cache.am_hdr), buffer, + length, MLX5_OPCODE_SEND, + MLX5_WQE_CTRL_SOLICITED | MLX5_WQE_CTRL_CQ_UPDATE, 0, 0, txqp, txwq, + &ep->av, uct_dc_mlx5_ep_get_grh(ep), + uct_ib_mlx5_wqe_av_size(&ep->av)); if (UCS_STATUS_IS_ERR(status)) { return status; } @@ -331,6 +322,44 @@ ucs_status_t uct_dc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, #endif } +ucs_status_t uct_dc_mlx5_ep_am_short_iov(uct_ep_h tl_ep, uint8_t id, + const uct_iov_t *iov, size_t iovcnt) +{ + size_t iov_length = uct_iov_total_length(iov, iovcnt); +#if HAVE_IBV_DM + uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, + uct_dc_mlx5_iface_t); + uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + ucs_status_t status; + UCT_DC_MLX5_TXQP_DECL(txqp, txwq); + + if (ucs_likely((sizeof(uct_rc_mlx5_hdr_t) + iov_length <= + UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE)) || + !iface->super.dm.dm)) { +#endif + return uct_dc_mlx5_ep_am_short_iov_inline(tl_ep, id, iov, iovcnt, + iov_length); +#if HAVE_IBV_DM + } + + UCT_CHECK_AM_ID(id); + UCT_DC_CHECK_RES_AND_FC(iface, ep); + UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); + + status = uct_rc_mlx5_common_ep_am_short_iov_dm( + &ep->super, id, &iface->super, iov, iovcnt, iov_length, + UCT_IB_QPT_DCI, txqp, txwq, &ep->av, uct_dc_mlx5_ep_get_grh(ep), + uct_ib_mlx5_wqe_av_size(&ep->av)); + if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) { + return status; + } + + UCT_RC_UPDATE_FC_WND(&iface->super.super, &ep->fc); + + return UCS_OK; +#endif +} + ssize_t uct_dc_mlx5_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_cb, void *arg, unsigned flags) @@ 
-415,8 +444,8 @@ ucs_status_t uct_dc_mlx5_ep_put_short(uct_ep_h tl_ep, const void *payload, #if HAVE_IBV_DM uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); - UCT_DC_MLX5_TXQP_DECL(txqp, txwq); ucs_status_t status; + UCT_DC_MLX5_TXQP_DECL(txqp, txwq); if (ucs_likely((length <= UCT_IB_MLX5_PUT_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE)) || !iface->super.dm.dm)) { @@ -430,10 +459,13 @@ ucs_status_t uct_dc_mlx5_ep_put_short(uct_ep_h tl_ep, const void *payload, UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); uct_rc_mlx5_ep_fence_put(&iface->super, txwq, &rkey, &remote_addr, ep->atomic_mr_offset); - status = uct_dc_mlx5_ep_short_dm(ep, NULL, 0, payload, length, - MLX5_OPCODE_RDMA_WRITE, - MLX5_WQE_CTRL_CQ_UPDATE, - remote_addr, rkey); + status = uct_rc_mlx5_common_ep_short_dm(&iface->super, UCT_IB_QPT_DCI, NULL, + 0, payload, length, + MLX5_OPCODE_RDMA_WRITE, + MLX5_WQE_CTRL_CQ_UPDATE, + remote_addr, rkey, txqp, txwq, + &ep->av, uct_dc_mlx5_ep_get_grh(ep), + uct_ib_mlx5_wqe_av_size(&ep->av)); if (UCS_STATUS_IS_ERR(status)) { return status; } @@ -447,9 +479,9 @@ ssize_t uct_dc_mlx5_ep_put_bcopy(uct_ep_h tl_ep, uct_pack_callback_t pack_cb, { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); - UCT_DC_MLX5_TXQP_DECL(txqp, txwq); uct_rc_iface_send_desc_t *desc; size_t length; + UCT_DC_MLX5_TXQP_DECL(txqp, txwq); UCT_DC_MLX5_CHECK_RES(iface, ep); UCT_RC_IFACE_GET_TX_PUT_BCOPY_DESC(&iface->super.super, &iface->super.super.tx.mp, @@ -499,8 +531,8 @@ ucs_status_t uct_dc_mlx5_ep_get_bcopy(uct_ep_h tl_ep, uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); uint8_t fm_ce_se = 0; - UCT_DC_MLX5_TXQP_DECL(txqp, txwq); uct_rc_iface_send_desc_t *desc; + UCT_DC_MLX5_TXQP_DECL(txqp, txwq); UCT_CHECK_LENGTH(length, 0, iface->super.super.super.config.seg_size, "get_bcopy"); @@ -559,30 +591,19 @@ ucs_status_t uct_dc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags, { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md, + uct_ib_mlx5_md_t); + uint8_t pool_index = uct_dc_mlx5_ep_pool_index(ep); ucs_status_t status; + uint16_t sn; UCT_DC_MLX5_TXQP_DECL(txqp, txwq); - if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) { - if (uct_dc_mlx5_iface_is_dci_rand(iface)) { - return UCS_ERR_UNSUPPORTED; - } - - uct_ep_pending_purge(tl_ep, NULL, 0); - if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) { - /* No dci -> no WQEs -> HW is clean, nothing to cancel */ - return UCS_OK; - } - - uct_dc_mlx5_ep_handle_failure(ep, NULL, UCS_ERR_CANCELED); - return UCS_OK; - } - if (!uct_dc_mlx5_iface_has_tx_resources(iface)) { return UCS_ERR_NO_RESOURCE; } if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) { - if (!uct_dc_mlx5_iface_dci_can_alloc(iface)) { + if (!uct_dc_mlx5_iface_dci_can_alloc(iface, pool_index)) { return UCS_ERR_NO_RESOURCE; /* waiting for dci */ } else { UCT_TL_EP_STAT_FLUSH(&ep->super); /* no sends */ @@ -604,9 +625,41 @@ ucs_status_t uct_dc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags, ucs_assert(ep->dci != UCT_DC_MLX5_EP_NO_DCI); UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); + sn = txwq->sig_pi; + if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) { + UCT_DC_MLX5_CHECK_RES(iface, ep); + + if 
(uct_dc_mlx5_iface_is_dci_rand(iface)) { + return UCS_ERR_UNSUPPORTED; + } + + if (ep->flags & UCT_DC_MLX5_EP_FLAG_FLUSH_CANCEL) { + goto out; + } + + status = uct_ib_mlx5_modify_qp_state(md, &txwq->super, IBV_QPS_ERR); + if (status != UCS_OK) { + return status; + } + + ep->flags |= UCT_DC_MLX5_EP_FLAG_FLUSH_CANCEL; + sn = txwq->sw_pi; + /* post NOP operation which will complete with error, to trigger DCI + * reset. Otherwise, the DCI could be returned to the pool while still in + * error state */ + uct_rc_mlx5_txqp_inline_post(&iface->super, UCT_IB_QPT_DCI, + txqp, txwq, + MLX5_OPCODE_NOP, NULL, 0, + 0, 0, 0, + 0, 0, + &ep->av, uct_dc_mlx5_ep_get_grh(ep), + uct_ib_mlx5_wqe_av_size(&ep->av), + 0, INT_MAX); + } + +out: return uct_rc_txqp_add_flush_comp(&iface->super.super, &ep->super, txqp, - comp, txwq->sig_pi); + comp, sn); } #if IBV_HW_TM @@ -645,6 +698,7 @@ ucs_status_t uct_dc_mlx5_ep_tag_eager_short(uct_ep_h tl_ep, uct_tag_t tag, uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); uct_rc_mlx5_dm_copy_data_t cache; ucs_status_t status; + UCT_DC_MLX5_TXQP_DECL(txqp, txwq); if (ucs_likely((sizeof(struct ibv_tmh) + length <= UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE)) || @@ -660,10 +714,14 @@ ucs_status_t uct_dc_mlx5_ep_tag_eager_short(uct_ep_h tl_ep, uct_tag_t tag, uct_rc_mlx5_fill_tmh(ucs_unaligned_ptr(&cache.tm_hdr), tag, 0, IBV_TMH_EAGER); - status = uct_dc_mlx5_ep_short_dm(ep, &cache, sizeof(cache.tm_hdr), data, - length, MLX5_OPCODE_SEND, - MLX5_WQE_CTRL_SOLICITED | MLX5_WQE_CTRL_CQ_UPDATE, - 0, 0); + UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq); + + status = uct_rc_mlx5_common_ep_short_dm( + &iface->super, UCT_IB_QPT_DCI, &cache, sizeof(cache.tm_hdr), data, + length, MLX5_OPCODE_SEND, + MLX5_WQE_CTRL_SOLICITED | MLX5_WQE_CTRL_CQ_UPDATE, 0, 0, txqp, txwq, + &ep->av, uct_dc_mlx5_ep_get_grh(ep), + uct_ib_mlx5_wqe_av_size(&ep->av)); if (!UCS_STATUS_IS_ERR(status)) { UCT_TL_EP_STAT_OP(&ep->super, TAG, SHORT, length); } @@ -838,6 +896,8 @@ ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, ucs_status_t status; uintptr_t sender_ep; struct ibv_ah *ah; + khiter_t it; + int ret; UCT_DC_MLX5_TXQP_DECL(txqp, txwq); @@ -855,9 +915,9 @@ ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, sender_ep = (uintptr_t)dc_req->sender.ep; /* TODO: look at common code with uct_ud_mlx5_iface_get_av */ - if (dc_req->sender.global.is_global) { + if (dc_req->sender.payload.is_global) { uct_ib_iface_fill_ah_attr_from_gid_lid(ib_iface, dc_req->lid, - ucs_unaligned_ptr(&dc_req->sender.global.gid), + ucs_unaligned_ptr(&dc_req->sender.payload.gid), iface->super.super.super.gid_info.gid_index, 0, &ah_attr); @@ -887,25 +947,36 @@ ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, uct_rc_mlx5_txqp_inline_post(&iface->super, UCT_IB_QPT_DCI, txqp, txwq, MLX5_OPCODE_SEND, - &av /*dummy*/, 0, op, sender_ep, 0, - 0, 0, + &dc_req->sender.payload.seq, + sizeof(sender.payload.seq), + op, sender_ep, 0, 0, 0, &av, ah_attr.is_global ?
mlx5_av_grh(&mlx5_av) : NULL, uct_ib_mlx5_wqe_av_size(&av), 0, INT_MAX); } else { ucs_assert(op == UCT_RC_EP_FLAG_FC_HARD_REQ); - sender.ep = (uint64_t)dc_ep; - sender.global.gid = ib_iface->gid_info.gid; - sender.global.is_global = dc_ep->flags & UCT_DC_MLX5_EP_FLAG_GRH; + + it = kh_put(uct_dc_mlx5_fc_hash, &iface->tx.fc_hash, (uint64_t)dc_ep, &ret); + if (ret == UCS_KH_PUT_KEY_PRESENT) { + return UCS_OK; + } else if (ret == UCS_KH_PUT_FAILED) { + ucs_error("failed to create hash entry for fc hard req"); + return UCS_ERR_NO_MEMORY; + } + + sender.ep = (uint64_t)dc_ep; + sender.payload.seq = iface->tx.fc_seq++; + sender.payload.gid = ib_iface->gid_info.gid; + sender.payload.is_global = dc_ep->flags & UCT_DC_MLX5_EP_FLAG_GRH; + kh_value(&iface->tx.fc_hash, it) = sender.payload.seq; UCS_STATS_UPDATE_COUNTER(dc_ep->fc.stats, UCT_RC_FC_STAT_TX_HARD_REQ, 1); uct_rc_mlx5_txqp_inline_post(&iface->super, UCT_IB_QPT_DCI, txqp, txwq, MLX5_OPCODE_SEND_IMM, - &sender.global, sizeof(sender.global), op, sender.ep, - iface->rx.dct.qp_num, - 0, 0, - &dc_ep->av, + &sender.payload, sizeof(sender.payload), + op, sender.ep, iface->rx.dct.qp_num, + 0, 0, &dc_ep->av, uct_dc_mlx5_ep_get_grh(dc_ep), uct_ib_mlx5_wqe_av_size(&dc_ep->av), MLX5_WQE_CTRL_SOLICITED, INT_MAX); @@ -914,10 +985,32 @@ ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, return UCS_OK; } +static void uct_dc_mlx5_ep_keepalive_cleanup(uct_dc_mlx5_ep_t *ep) +{ + uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_dc_mlx5_iface_t); + uct_rc_iface_send_op_t *op; + ucs_queue_iter_t iter; + uct_rc_txqp_t *txqp; + + if (!(ep->flags & UCT_DC_MLX5_EP_FLAG_KEEPALIVE_POSTED)) { + return; + } + + /* clean keepalive requests */ + txqp = &iface->tx.dcis[iface->keepalive_dci].txqp; + ucs_queue_for_each_safe(op, iter, &txqp->outstanding, queue) { + if (op->ep == &ep->super.super) { + ucs_queue_del_iter(&txqp->outstanding, iter); + ucs_mpool_put(op); + break; + } + } +} UCS_CLASS_INIT_FUNC(uct_dc_mlx5_ep_t, uct_dc_mlx5_iface_t *iface, const uct_dc_mlx5_iface_addr_t *if_addr, - uct_ib_mlx5_base_av_t *av) + uct_ib_mlx5_base_av_t *av, uint8_t path_index) { uint32_t remote_dctn; @@ -930,19 +1023,26 @@ UCS_CLASS_INIT_FUNC(uct_dc_mlx5_ep_t, uct_dc_mlx5_iface_t *iface, memcpy(&self->av, av, sizeof(*av)); self->av.dqp_dct |= htonl(remote_dctn); + self->flags = path_index % iface->tx.num_dci_pools; return uct_dc_mlx5_ep_basic_init(iface, self); } -static UCS_CLASS_CLEANUP_FUNC(uct_dc_mlx5_ep_t) +UCS_CLASS_CLEANUP_FUNC(uct_dc_mlx5_ep_t) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(self->super.super.iface, uct_dc_mlx5_iface_t); + khiter_t it; uct_dc_mlx5_ep_pending_purge(&self->super.super, NULL, NULL); uct_rc_fc_cleanup(&self->fc); - ucs_assert_always(self->flags & UCT_DC_MLX5_EP_FLAG_VALID); + uct_dc_mlx5_ep_keepalive_cleanup(self); + + it = kh_get(uct_dc_mlx5_fc_hash, &iface->tx.fc_hash, (uint64_t)self); + if (it != kh_end(&iface->tx.fc_hash)) { + kh_del(uct_dc_mlx5_fc_hash, &iface->tx.fc_hash, it); + } if ((self->dci == UCT_DC_MLX5_EP_NO_DCI) || uct_dc_mlx5_iface_is_dci_rand(iface)) { @@ -968,16 +1068,17 @@ static UCS_CLASS_CLEANUP_FUNC(uct_dc_mlx5_ep_t) UCS_CLASS_DEFINE(uct_dc_mlx5_ep_t, uct_base_ep_t); UCS_CLASS_DEFINE_NEW_FUNC(uct_dc_mlx5_ep_t, uct_ep_t, uct_dc_mlx5_iface_t *, const uct_dc_mlx5_iface_addr_t *, - uct_ib_mlx5_base_av_t *); + uct_ib_mlx5_base_av_t *, uint8_t); +UCS_CLASS_DEFINE_DELETE_FUNC(uct_dc_mlx5_ep_t, uct_ep_t); UCS_CLASS_INIT_FUNC(uct_dc_mlx5_grh_ep_t, uct_dc_mlx5_iface_t *iface, const 
uct_dc_mlx5_iface_addr_t *if_addr, - uct_ib_mlx5_base_av_t *av, + uct_ib_mlx5_base_av_t *av, uint8_t path_index, struct mlx5_grh_av *grh_av) { ucs_trace_func(""); - UCS_CLASS_CALL_SUPER_INIT(uct_dc_mlx5_ep_t, iface, if_addr, av); + UCS_CLASS_CALL_SUPER_INIT(uct_dc_mlx5_ep_t, iface, if_addr, av, path_index); self->super.flags |= UCT_DC_MLX5_EP_FLAG_GRH; memcpy(&self->grh_av, grh_av, sizeof(*grh_av)); @@ -992,33 +1093,8 @@ UCS_CLASS_CLEANUP_FUNC(uct_dc_mlx5_grh_ep_t) UCS_CLASS_DEFINE(uct_dc_mlx5_grh_ep_t, uct_dc_mlx5_ep_t); UCS_CLASS_DEFINE_NEW_FUNC(uct_dc_mlx5_grh_ep_t, uct_ep_t, uct_dc_mlx5_iface_t *, const uct_dc_mlx5_iface_addr_t *, - uct_ib_mlx5_base_av_t *, struct mlx5_grh_av *); - -void uct_dc_mlx5_ep_cleanup(uct_ep_h tl_ep, ucs_class_t *cls) -{ - uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); - uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_dc_mlx5_iface_t); - - UCS_CLASS_CLEANUP_CALL(cls, ep); - - if (uct_dc_mlx5_ep_fc_wait_for_grant(ep)) { - ucs_trace("not releasing dc_mlx5_ep %p - waiting for grant", ep); - ep->flags &= ~UCT_DC_MLX5_EP_FLAG_VALID; - /* No need to wait for grant on this ep anymore */ - uct_dc_mlx5_ep_clear_fc_grant_flag(iface, ep); - ucs_list_add_tail(&iface->tx.gc_list, &ep->list); - } else { - ucs_free(ep); - } -} - -void uct_dc_mlx5_ep_release(uct_dc_mlx5_ep_t *ep) -{ - ucs_assert_always(!(ep->flags & UCT_DC_MLX5_EP_FLAG_VALID)); - ucs_debug("release dc_mlx5_ep %p", ep); - ucs_list_del(&ep->list); - ucs_free(ep); -} + uct_ib_mlx5_base_av_t *, uint8_t, + struct mlx5_grh_av *); void uct_dc_mlx5_ep_pending_common(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep, uct_pending_req_t *r, @@ -1038,10 +1114,7 @@ void uct_dc_mlx5_ep_pending_common(uct_dc_mlx5_iface_t *iface, } if (push_to_head) { - uct_pending_req_arb_group_push_head(no_dci ? 
- uct_dc_mlx5_iface_dci_waitq(iface) : - uct_dc_mlx5_iface_tx_waitq(iface), - group, r); + uct_pending_req_arb_group_push_head(group, r); } else { uct_pending_req_arb_group_push(group, r); } @@ -1070,6 +1143,7 @@ ucs_status_t uct_dc_mlx5_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *r, { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + uint8_t pool_index = uct_dc_mlx5_ep_pool_index(ep); /* ep can tx iff * - iface has resources: cqe and tx skb * - dci is either assigned or can be assigned * - dci has resources */ if (uct_dc_mlx5_iface_has_tx_resources(iface)) { if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) { - if (uct_dc_mlx5_iface_dci_can_alloc(iface) && (ep->fc.fc_wnd > 0)) { + if (uct_dc_mlx5_iface_dci_can_alloc(iface, pool_index) && + (ep->fc.fc_wnd > 0)) { return UCS_ERR_BUSY; } } else { @@ -1105,6 +1180,7 @@ uct_dc_mlx5_iface_dci_do_pending_wait(ucs_arbiter_t *arbiter, { uct_dc_mlx5_ep_t *ep = ucs_container_of(group, uct_dc_mlx5_ep_t, arb_group); uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_dc_mlx5_iface_t); + uint8_t pool_index = uct_dc_mlx5_ep_pool_index(ep); ucs_assert(!uct_dc_mlx5_iface_is_dci_rand(iface)); @@ -1112,7 +1188,7 @@ uct_dc_mlx5_iface_dci_do_pending_wait(ucs_arbiter_t *arbiter, return UCS_ARBITER_CB_RESULT_DESCHED_GROUP; } - if (!uct_dc_mlx5_iface_dci_can_alloc(iface)) { + if (!uct_dc_mlx5_iface_dci_can_alloc(iface, pool_index)) { return UCS_ARBITER_CB_RESULT_STOP; } uct_dc_mlx5_iface_dci_alloc(iface, ep); @@ -1130,24 +1206,24 @@ uct_dc_mlx5_iface_dci_do_common_pending_tx(uct_dc_mlx5_ep_t *ep, uct_dc_mlx5_iface_t); ucs_status_t status; - if (!uct_dc_mlx5_iface_has_tx_resources(iface)) { - return UCS_ARBITER_CB_RESULT_STOP; - } - status = uct_rc_iface_invoke_pending_cb(&iface->super.super, req); if (status == UCS_OK) { return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; } else if (status == UCS_INPROGRESS) { return UCS_ARBITER_CB_RESULT_NEXT_GROUP; + } else if (!uct_dc_mlx5_iface_has_tx_resources(iface)) { + return UCS_ARBITER_CB_RESULT_STOP; } - if (!uct_dc_mlx5_iface_dci_ep_can_send(ep)) { - return UCS_ARBITER_CB_RESULT_DESCHED_GROUP; - } + /* No pending operations (except no-op, flush(CANCEL), and others + * that don't consume TX resources) are allowed to remain scheduled on + * an arbiter group for which flush(CANCEL) was done */ + ucs_assert(!(ep->flags & UCT_DC_MLX5_EP_FLAG_FLUSH_CANCEL)); - ucs_assertv(!uct_dc_mlx5_iface_has_tx_resources(iface), - "pending callback returned error but send resources are available"); - return UCS_ARBITER_CB_RESULT_STOP; + ucs_assertv(!uct_dc_mlx5_iface_dci_ep_can_send(ep), + "pending callback returned error, but send resources are" + " available"); + return UCS_ARBITER_CB_RESULT_DESCHED_GROUP; } /** @@ -1247,6 +1323,9 @@ void uct_dc_mlx5_ep_pending_purge(uct_ep_h tl_ep, uct_pending_purge_callback_t c uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); void *priv_args[2] = {ep, arg}; uct_purge_cb_args_t args = {cb, priv_args}; + ucs_arbiter_t *waitq; + ucs_arbiter_group_t *group; + uint8_t pool_index; if (uct_dc_mlx5_iface_is_dci_rand(iface)) { ucs_arbiter_group_purge(uct_dc_mlx5_iface_tx_waitq(iface), @@ -1255,12 +1334,11 @@ void uct_dc_mlx5_ep_pending_purge(uct_ep_h tl_ep, uct_pending_purge_callback_t c return; } - if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) { - ucs_arbiter_group_purge(uct_dc_mlx5_iface_dci_waitq(iface), &ep->arb_group, - uct_dc_mlx5_ep_arbiter_purge_cb, &args); - } else
{ - ucs_arbiter_group_purge(uct_dc_mlx5_iface_tx_waitq(iface), &ep->arb_group, - uct_dc_mlx5_ep_arbiter_purge_cb, &args); + uct_dc_mlx5_get_arbiter_params(iface, ep, &waitq, &group, &pool_index); + ucs_arbiter_group_purge(waitq, group, uct_dc_mlx5_ep_arbiter_purge_cb, + &args); + + if (ep->dci != UCT_DC_MLX5_EP_NO_DCI) { uct_dc_mlx5_iface_dci_free(iface, ep); } } @@ -1271,16 +1349,13 @@ ucs_status_t uct_dc_mlx5_ep_check_fc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_ if (iface->super.super.config.fc_enabled) { UCT_RC_CHECK_FC_WND(&ep->fc, ep->super.stats); - if ((ep->fc.fc_wnd == iface->super.super.config.fc_hard_thresh) && - !uct_dc_mlx5_ep_fc_wait_for_grant(ep)) { + if (ep->fc.fc_wnd == iface->super.super.config.fc_hard_thresh) { status = uct_rc_fc_ctrl(&ep->super.super, UCT_RC_EP_FLAG_FC_HARD_REQ, NULL); if (status != UCS_OK) { return status; } - ep->flags |= UCT_DC_MLX5_EP_FLAG_FC_WAIT_FOR_GRANT; - ++iface->tx.fc_grants; } } else { /* Set fc_wnd to max, to send as much as possible without checks */ @@ -1292,65 +1367,98 @@ ucs_status_t uct_dc_mlx5_ep_check_fc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_ void uct_dc_mlx5_ep_handle_failure(uct_dc_mlx5_ep_t *ep, void *arg, ucs_status_t ep_status) { + struct mlx5_cqe64 *cqe = arg; uct_iface_h tl_iface = ep->super.super.iface; - uint8_t dci = ep->dci; - uct_ib_iface_t *ib_iface = ucs_derived_of(tl_iface, uct_ib_iface_t); + uint8_t dci_index = ep->dci; uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_dc_mlx5_iface_t); - uct_rc_txqp_t *txqp = &iface->tx.dcis[dci].txqp; - uct_ib_mlx5_txwq_t *txwq = &iface->tx.dcis[dci].txwq; - int16_t outstanding; - ucs_status_t status; - ucs_log_level_t log_lvl; + uct_rc_txqp_t *txqp = &iface->tx.dcis[dci_index].txqp; + uct_ib_mlx5_txwq_t *txwq = &iface->tx.dcis[dci_index].txwq; + uint16_t pi = ntohs(cqe->wqe_counter); + ucs_arbiter_t *waitq; + ucs_arbiter_group_t *group; + uint8_t pool_index; - ucs_assert(!uct_dc_mlx5_iface_is_dci_rand(iface)); + ucs_debug("handle failure iface: %p, dci[%d] qpn 0x%x, status: %s", iface, + dci_index, txwq->super.qp_num, ucs_status_string(ep_status)); - uct_rc_txqp_purge_outstanding(&iface->super.super, txqp, ep_status, - txwq->sw_pi, 0); + ucs_assert(!uct_dc_mlx5_iface_is_dci_rand(iface)); - /* poll_cqe for mlx5 returns NULL in case of failure and the cq_avaialble - is not updated for the error cqe and all outstanding wqes*/ - outstanding = (int16_t)iface->super.super.config.tx_qp_len - - uct_rc_txqp_available(txqp); - iface->super.super.tx.cq_available += outstanding; - uct_rc_txqp_available_set(txqp, (int16_t)iface->super.super.config.tx_qp_len); + uct_dc_mlx5_update_tx_res(iface, txwq, txqp, pi); + uct_rc_txqp_purge_outstanding(&iface->super.super, txqp, ep_status, pi, 0); - /* since we removed all outstanding ops on the dci, it should be released */ ucs_assert(ep->dci != UCT_DC_MLX5_EP_NO_DCI); - uct_dc_mlx5_iface_dci_put(iface, dci); - ucs_assert_always(ep->dci == UCT_DC_MLX5_EP_NO_DCI); + /* Try to return DCI into iface stack */ + uct_dc_mlx5_iface_dci_put(iface, dci_index); + uct_dc_mlx5_get_arbiter_params(iface, ep, &waitq, &group, &pool_index); + + /* Do not invoke pending requests on a failed endpoint. 
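+ * Descheduling only removes the group from the arbiter; its queued + * requests are not purged.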
+ * This call should be done after uct_dc_mlx5_iface_dci_put, because it + * may schedule the group on the arbiter */ + ucs_arbiter_group_desched(waitq, group); + uct_dc_mlx5_iface_set_ep_failed(iface, ep, cqe, txwq, ep_status); - if (uct_dc_mlx5_ep_fc_wait_for_grant(ep)) { - /* No need to wait for grant on this ep anymore */ - uct_dc_mlx5_ep_clear_fc_grant_flag(iface, ep); + if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) { + /* No more operations scheduled on DCI, reset it. + * This operation should be done prior to the + * uct_dc_mlx5_iface_progress_pending call to avoid resetting a + * working DCI */ + uct_dc_mlx5_iface_reset_dci(iface, dci_index); } - if (ep == iface->tx.fc_ep) { - ucs_assert(ep_status != UCS_ERR_CANCELED); - /* Cannot handle errors on flow-control endpoint. - * Or shall we ignore them? - */ - ucs_debug("got error on DC flow-control endpoint, iface %p: %s", iface, - ucs_status_string(ep_status)); - } else { - status = ib_iface->ops->set_ep_failed(ib_iface, &ep->super.super, - ep_status); - log_lvl = uct_ib_iface_failure_log_level(ib_iface, status, ep_status); + uct_dc_mlx5_iface_progress_pending(iface, pool_index); +} - if (ep_status != UCS_ERR_CANCELED) { - uct_ib_mlx5_completion_with_err(ib_iface, arg, - &iface->tx.dcis[dci].txwq, log_lvl); - } +static void +uct_dc_mlx5_ep_check_send_completion(uct_rc_iface_send_op_t *op, const void *resp) +{ + uct_dc_mlx5_ep_t *ep = ucs_derived_of(op->ep, uct_dc_mlx5_ep_t); + + ucs_assert(ep->flags & UCT_DC_MLX5_EP_FLAG_KEEPALIVE_POSTED); + ep->flags &= ~UCT_DC_MLX5_EP_FLAG_KEEPALIVE_POSTED; + ucs_mpool_put(op); +} + +ucs_status_t +uct_dc_mlx5_ep_check(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp) +{ + uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t); + uct_dc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t); + uint64_t dummy = 0; + ucs_status_t status; + uct_rc_iface_send_op_t *op; + UCT_DC_MLX5_TXQP_DECL(txqp, txwq); + + UCT_EP_KEEPALIVE_CHECK_PARAM(flags, comp); + + if ((ep->dci != UCT_DC_MLX5_EP_NO_DCI) || + (ep->flags & UCT_DC_MLX5_EP_FLAG_KEEPALIVE_POSTED)) { + /* if the EP has a DCI or a keepalive is already posted, some TX + * resources are still in flight, so no keepalive operation is + * needed */ + return UCS_OK; } - status = uct_dc_mlx5_iface_reset_dci(iface, &iface->tx.dcis[dci]); + status = uct_dc_mlx5_iface_keepalive_init(iface); if (status != UCS_OK) { - ucs_fatal("iface %p failed to reset dci[%d] qpn 0x%x: %s", - iface, dci, txwq->super.qp_num, ucs_status_string(status)); + ucs_error("failed to initialize keepalive dci: %s", + ucs_status_string(status)); + return status; } - status = uct_dc_mlx5_iface_dci_connect(iface, &iface->tx.dcis[dci]); - if (status != UCS_OK) { - ucs_fatal("iface %p failed to connect dci[%d] qpn 0x%x: %s", - iface, dci, txwq->super.qp_num, ucs_status_string(status)); + op = ucs_mpool_get(&iface->super.super.tx.send_op_mp); + if (ucs_unlikely(op == NULL)) { + ucs_error("failed to allocate keepalive op"); + return UCS_ERR_NO_MEMORY; } + + uct_rc_ep_init_send_op(op, 0, NULL, uct_dc_mlx5_ep_check_send_completion); + op->ep = tl_ep; + UCT_DC_MLX5_IFACE_TXQP_DCI_GET(iface, iface->keepalive_dci, txqp, txwq); + uct_rc_mlx5_txqp_inline_post(&iface->super, UCT_IB_QPT_DCI, + txqp, txwq, MLX5_OPCODE_RDMA_WRITE, + &dummy, 0, 0, 0, 0, 0, 0, + &ep->av, uct_dc_mlx5_ep_get_grh(ep), + uct_ib_mlx5_wqe_av_size(&ep->av), 0, INT_MAX); + uct_rc_txqp_add_send_op_sn(txqp, op, txwq->sig_pi); + ep->flags |= UCT_DC_MLX5_EP_FLAG_KEEPALIVE_POSTED; + return UCS_OK; } diff --git
a/src/uct/ib/dc/dc_mlx5_ep.h b/src/uct/ib/dc/dc_mlx5_ep.h index 224c1f7f935..97fb6f2065e 100644 --- a/src/uct/ib/dc/dc_mlx5_ep.h +++ b/src/uct/ib/dc/dc_mlx5_ep.h @@ -17,18 +17,26 @@ enum uct_dc_mlx5_ep_flags { - UCT_DC_MLX5_EP_FLAG_TX_WAIT = UCS_BIT(0), /* ep is in the tx_wait state. See - description of the dcs+quota dci - selection policy above */ - UCT_DC_MLX5_EP_FLAG_GRH = UCS_BIT(1), /* ep has GRH address. Used by - dc_mlx5 endpoint */ - UCT_DC_MLX5_EP_FLAG_VALID = UCS_BIT(2), /* ep is a valid endpoint */ - /* Indicates that FC grant has been requested, but is not received yet. - * Flush will not complete until an outgoing grant request is acked. - * It is needed to avoid the following cases: - * 1) Grant arrives for the recently deleted ep. - * 2) QP resources are available, but there are some pending requests. */ - UCT_DC_MLX5_EP_FLAG_FC_WAIT_FOR_GRANT = UCS_BIT(3) + /* Mask for the DCI pool index the EP is assigned to, according to its + * LAG port */ + UCT_DC_MLX5_EP_FLAG_POOL_INDEX_MASK = UCS_MASK(3), + + /* EP is in the tx_wait state. See description of the dcs+quota dci + selection policy above */ + UCT_DC_MLX5_EP_FLAG_TX_WAIT = UCS_BIT(3), + + /* EP has GRH address. Used by dc_mlx5 endpoint */ + UCT_DC_MLX5_EP_FLAG_GRH = UCS_BIT(4), + + /* Keepalive request scheduled: indicates that a keepalive request + * is scheduled in the outstanding queue and no more keepalive actions + * are needed */ + UCT_DC_MLX5_EP_FLAG_KEEPALIVE_POSTED = UCS_BIT(5), + + /* Flush cancel was executed on EP */ + UCT_DC_MLX5_EP_FLAG_FLUSH_CANCEL = UCS_BIT(6), + + /* Error handler already called or flush(CANCEL) disabled it */ + UCT_DC_MLX5_EP_FLAG_ERR_HANDLER_INVOKED = UCS_BIT(7), }; @@ -65,12 +73,13 @@ typedef struct { UCS_CLASS_DECLARE(uct_dc_mlx5_ep_t, uct_dc_mlx5_iface_t *, const uct_dc_mlx5_iface_addr_t *, - uct_ib_mlx5_base_av_t *); + uct_ib_mlx5_base_av_t *, uint8_t); UCS_CLASS_DECLARE(uct_dc_mlx5_grh_ep_t, uct_dc_mlx5_iface_t *, const uct_dc_mlx5_iface_addr_t *, - uct_ib_mlx5_base_av_t *, struct mlx5_grh_av *); + uct_ib_mlx5_base_av_t *, uint8_t, struct mlx5_grh_av *); +UCS_CLASS_DECLARE_DELETE_FUNC(uct_dc_mlx5_ep_t, uct_ep_t); ucs_status_t uct_dc_mlx5_ep_put_short(uct_ep_h tl_ep, const void *payload, unsigned length, uint64_t remote_addr, @@ -96,6 +105,9 @@ ucs_status_t uct_dc_mlx5_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size ucs_status_t uct_dc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, const void *buffer, unsigned length); +ucs_status_t uct_dc_mlx5_ep_am_short_iov(uct_ep_h tl_ep, uint8_t id, + const uct_iov_t *iov, size_t iovcnt); + ssize_t uct_dc_mlx5_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_cb, void *arg, unsigned flags); @@ -200,9 +212,14 @@ void uct_dc_mlx5_ep_pending_common(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep, uct_pending_req_t *r, unsigned flags, int push_to_head); -void uct_dc_mlx5_ep_cleanup(uct_ep_h tl_ep, ucs_class_t *cls); +ucs_status_t +uct_dc_mlx5_ep_check(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp); -void uct_dc_mlx5_ep_release(uct_dc_mlx5_ep_t *ep); +static UCS_F_ALWAYS_INLINE uint8_t +uct_dc_mlx5_ep_pool_index(uct_dc_mlx5_ep_t *ep) +{ + return ep->flags & UCT_DC_MLX5_EP_FLAG_POOL_INDEX_MASK; +} static UCS_F_ALWAYS_INLINE uct_dc_mlx5_pending_req_priv_t * uct_dc_mlx5_pending_req_priv(uct_pending_req_t *req) @@ -244,22 +261,12 @@ uct_dc_mlx5_iface_dci_sched_tx(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) } static UCS_F_ALWAYS_INLINE uct_dc_mlx5_ep_t *
+uct_dc_mlx5_ep_from_dci(uct_dc_mlx5_iface_t *iface, uint8_t dci_index) { /* Can be used with dcs* policies only, with rand policy every dci may * be used by many eps */ ucs_assert(!uct_dc_mlx5_iface_is_dci_rand(iface)); - return iface->tx.dcis[dci].ep; -} - -static UCS_F_ALWAYS_INLINE void -uct_dc_mlx5_ep_clear_fc_grant_flag(uct_dc_mlx5_iface_t *iface, - uct_dc_mlx5_ep_t *ep) -{ - ucs_assert((ep->flags & UCT_DC_MLX5_EP_FLAG_FC_WAIT_FOR_GRANT) && - iface->tx.fc_grants); - ep->flags &= ~UCT_DC_MLX5_EP_FLAG_FC_WAIT_FOR_GRANT; - --iface->tx.fc_grants; + return iface->tx.dcis[dci_index].ep; } void uct_dc_mlx5_ep_handle_failure(uct_dc_mlx5_ep_t *ep, void *arg, @@ -277,22 +284,22 @@ uct_dc_mlx5_ep_basic_init(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) ep->dci = UCT_DC_MLX5_EP_NO_DCI; } - /* valid = 1, global = 0, tx_wait = 0 */ - ep->flags = UCT_DC_MLX5_EP_FLAG_VALID; - return uct_rc_fc_init(&ep->fc, iface->super.super.config.fc_wnd_size UCS_STATS_ARG(ep->super.stats)); } static UCS_F_ALWAYS_INLINE int -uct_dc_mlx5_iface_dci_can_alloc(uct_dc_mlx5_iface_t *iface) +uct_dc_mlx5_iface_dci_can_alloc(uct_dc_mlx5_iface_t *iface, uint8_t pool_index) { - return iface->tx.stack_top < iface->tx.ndci; + return iface->tx.dci_pool[pool_index].stack_top < iface->tx.ndci; } static UCS_F_ALWAYS_INLINE void -uct_dc_mlx5_iface_progress_pending(uct_dc_mlx5_iface_t *iface) +uct_dc_mlx5_iface_progress_pending(uct_dc_mlx5_iface_t *iface, + uint8_t pool_index) { + ucs_arbiter_t *dci_waitq = uct_dc_mlx5_iface_dci_waitq(iface, pool_index); + do { /** * Pending op on the tx_waitq can complete with the UCS_OK @@ -305,16 +312,16 @@ uct_dc_mlx5_iface_progress_pending(uct_dc_mlx5_iface_t *iface) * NOTE: in case of rand dci allocation policy, dci_waitq is always * empty. */ - if (uct_dc_mlx5_iface_dci_can_alloc(iface) && + if (uct_dc_mlx5_iface_dci_can_alloc(iface, pool_index) && !uct_dc_mlx5_iface_is_dci_rand(iface)) { - ucs_arbiter_dispatch(uct_dc_mlx5_iface_dci_waitq(iface), 1, + ucs_arbiter_dispatch(dci_waitq, 1, uct_dc_mlx5_iface_dci_do_pending_wait, NULL); } ucs_arbiter_dispatch(uct_dc_mlx5_iface_tx_waitq(iface), 1, iface->tx.pend_cb, NULL); - } while (ucs_unlikely(!ucs_arbiter_is_empty(uct_dc_mlx5_iface_dci_waitq(iface)) && - uct_dc_mlx5_iface_dci_can_alloc(iface))); + } while (ucs_unlikely(!ucs_arbiter_is_empty(dci_waitq) && + uct_dc_mlx5_iface_dci_can_alloc(iface, pool_index))); } static inline int uct_dc_mlx5_iface_dci_ep_can_send(uct_dc_mlx5_ep_t *ep) @@ -328,51 +335,71 @@ static inline int uct_dc_mlx5_iface_dci_ep_can_send(uct_dc_mlx5_ep_t *ep) static UCS_F_ALWAYS_INLINE void uct_dc_mlx5_iface_schedule_dci_alloc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) { + ucs_arbiter_t *waitq; + /* If FC window is empty the group will be scheduled when * grant is received */ if (uct_rc_fc_has_resources(&iface->super.super, &ep->fc)) { - ucs_arbiter_group_schedule(uct_dc_mlx5_iface_dci_waitq(iface), &ep->arb_group); + waitq = uct_dc_mlx5_iface_dci_waitq(iface, uct_dc_mlx5_ep_pool_index(ep)); + ucs_arbiter_group_schedule(waitq, &ep->arb_group); } } +static UCS_F_ALWAYS_INLINE uint8_t +uct_dc_mlx5_iface_dci_pool_index(uct_dc_mlx5_iface_t *iface, uint8_t dci_index) +{ + ucs_assert(iface->tx.dcis[dci_index].pool_index < + UCT_DC_MLX5_IFACE_MAX_DCI_POOLS); + return iface->tx.dcis[dci_index].pool_index; +} + static UCS_F_ALWAYS_INLINE void - uct_dc_mlx5_iface_dci_release(uct_dc_mlx5_iface_t *iface, uint8_t dci) +uct_dc_mlx5_iface_dci_release(uct_dc_mlx5_iface_t *iface, uint8_t dci_index) { - iface->tx.stack_top--; - 
iface->tx.dcis_stack[iface->tx.stack_top] = dci; + uint8_t pool_index = uct_dc_mlx5_iface_dci_pool_index(iface, + dci_index); + uct_dc_mlx5_dci_pool_t *pool = &iface->tx.dci_pool[pool_index]; + + pool->stack_top--; + pool->stack[pool->stack_top] = dci_index; #if UCS_ENABLE_ASSERT - iface->tx.dcis[dci].flags = 0; + iface->tx.dcis[dci_index].flags = 0; #endif } static UCS_F_ALWAYS_INLINE void - uct_dc_mlx5_iface_dci_put(uct_dc_mlx5_iface_t *iface, uint8_t dci) +uct_dc_mlx5_iface_dci_put(uct_dc_mlx5_iface_t *iface, uint8_t dci_index) { uct_dc_mlx5_ep_t *ep; + ucs_arbiter_t *waitq; + uint8_t pool_index; - if (uct_dc_mlx5_iface_is_dci_rand(iface)) { + if (uct_dc_mlx5_iface_is_dci_rand(iface) || + uct_dc_mlx5_iface_is_dci_keepalive(iface, dci_index)) { return; } - ep = uct_dc_mlx5_ep_from_dci(iface, dci); - - ucs_assert(iface->tx.stack_top > 0); + ep = uct_dc_mlx5_ep_from_dci(iface, dci_index); if (ucs_unlikely(ep == NULL)) { - if (!uct_dc_mlx5_iface_dci_has_outstanding(iface, dci)) { - uct_dc_mlx5_iface_dci_release(iface, dci); + if (!uct_dc_mlx5_iface_dci_has_outstanding(iface, dci_index)) { + uct_dc_mlx5_iface_dci_release(iface, dci_index); } return; } - if (uct_dc_mlx5_iface_dci_has_outstanding(iface, dci)) { + pool_index = uct_dc_mlx5_ep_pool_index(ep); + ucs_assert(iface->tx.dci_pool[pool_index].stack_top > 0); + + if (uct_dc_mlx5_iface_dci_has_outstanding(iface, dci_index)) { if (iface->tx.policy == UCT_DC_TX_POLICY_DCS_QUOTA) { /* in tx_wait state: * - if no eps are waiting for dci allocation, * the ep goes back to the normal state */ if (ep->flags & UCT_DC_MLX5_EP_FLAG_TX_WAIT) { - if (!ucs_arbiter_is_empty(uct_dc_mlx5_iface_dci_waitq(iface))) { + waitq = uct_dc_mlx5_iface_dci_waitq(iface, pool_index); + if (!ucs_arbiter_is_empty(waitq)) { return; } ep->flags &= ~UCT_DC_MLX5_EP_FLAG_TX_WAIT; @@ -382,12 +409,13 @@ static UCS_F_ALWAYS_INLINE void return; } - uct_dc_mlx5_iface_dci_release(iface, dci); + uct_dc_mlx5_iface_dci_release(iface, dci_index); - ucs_assert(uct_dc_mlx5_ep_from_dci(iface, dci)->dci != UCT_DC_MLX5_EP_NO_DCI); + ucs_assert(uct_dc_mlx5_ep_from_dci(iface, dci_index)->dci != + UCT_DC_MLX5_EP_NO_DCI); ep->dci = UCT_DC_MLX5_EP_NO_DCI; ep->flags &= ~UCT_DC_MLX5_EP_FLAG_TX_WAIT; - iface->tx.dcis[dci].ep = NULL; + iface->tx.dcis[dci_index].ep = NULL; /* it is possible that dci is released while ep still has scheduled pending ops. * move the group to the 'wait for dci alloc' state @@ -402,42 +430,49 @@ static inline void uct_dc_mlx5_iface_dci_alloc(uct_dc_mlx5_iface_t *iface, uct_d * There is no need to check txqp because * dci must have resources to transmit.
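* With per-pool LIFO stacks, the dci is popped from the stack of the ep's * pool, so an allocated dci always belongs to the same pool as the ep; the * asserts below verify this.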
*/ + uint8_t pool_index = uct_dc_mlx5_ep_pool_index(ep); + uct_dc_mlx5_dci_pool_t *pool = &iface->tx.dci_pool[pool_index]; + ucs_assert(!uct_dc_mlx5_iface_is_dci_rand(iface)); - ep->dci = iface->tx.dcis_stack[iface->tx.stack_top]; - ucs_assert(ep->dci < iface->tx.ndci); + ep->dci = pool->stack[pool->stack_top]; + ucs_assert(ep->dci >= (iface->tx.ndci * pool_index)); + ucs_assert(ep->dci < (iface->tx.ndci * (pool_index + 1))); ucs_assert(uct_dc_mlx5_ep_from_dci(iface, ep->dci) == NULL); ucs_assert(iface->tx.dcis[ep->dci].flags == 0); iface->tx.dcis[ep->dci].ep = ep; - iface->tx.stack_top++; + pool->stack_top++; } -static inline void uct_dc_mlx5_iface_dci_free(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) +static inline void uct_dc_mlx5_iface_dci_free(uct_dc_mlx5_iface_t *iface, + uct_dc_mlx5_ep_t *ep) { - uint8_t dci; + uint8_t dci_index; if (uct_dc_mlx5_iface_is_dci_rand(iface)) { return; } - dci = ep->dci; + dci_index = ep->dci; - ucs_assert(dci != UCT_DC_MLX5_EP_NO_DCI); - ucs_assert(iface->tx.stack_top > 0); + ucs_assert(dci_index != UCT_DC_MLX5_EP_NO_DCI); + ucs_assert(iface->tx.dci_pool[uct_dc_mlx5_ep_pool_index(ep)].stack_top > 0); - if (uct_dc_mlx5_iface_dci_has_outstanding(iface, dci)) { + if (uct_dc_mlx5_iface_dci_has_outstanding(iface, dci_index)) { return; } - uct_dc_mlx5_iface_dci_release(iface, dci); + uct_dc_mlx5_iface_dci_release(iface, dci_index); - iface->tx.dcis[dci].ep = NULL; - ep->dci = UCT_DC_MLX5_EP_NO_DCI; - ep->flags &= ~UCT_DC_MLX5_EP_FLAG_TX_WAIT; + iface->tx.dcis[dci_index].ep = NULL; + ep->dci = UCT_DC_MLX5_EP_NO_DCI; + ep->flags &= ~UCT_DC_MLX5_EP_FLAG_TX_WAIT; } static UCS_F_ALWAYS_INLINE ucs_status_t uct_dc_mlx5_iface_dci_get(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) { + uint8_t pool_index = uct_dc_mlx5_ep_pool_index(ep); + ucs_arbiter_t *waitq; uct_rc_txqp_t *txqp; int16_t available; @@ -465,9 +500,10 @@ uct_dc_mlx5_iface_dci_get(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) */ txqp = &iface->tx.dcis[ep->dci].txqp; available = uct_rc_txqp_available(txqp); + waitq = uct_dc_mlx5_iface_dci_waitq(iface, pool_index); if ((iface->tx.policy == UCT_DC_TX_POLICY_DCS_QUOTA) && (available <= iface->tx.available_quota) && - !ucs_arbiter_is_empty(uct_dc_mlx5_iface_dci_waitq(iface))) + !ucs_arbiter_is_empty(waitq)) { ep->flags |= UCT_DC_MLX5_EP_FLAG_TX_WAIT; goto out_no_res; @@ -483,7 +519,7 @@ uct_dc_mlx5_iface_dci_get(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) /* Do not alloc dci if no TX desc resources, * otherwise this dci may never be released. 
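
/*
 * A minimal, self-contained model of the per-pool DCI stack used above
 * (names are hypothetical, assuming pool p owns the contiguous index range
 * [p * NDCI, (p + 1) * NDCI), as the asserts in
 * uct_dc_mlx5_iface_dci_alloc() imply):
 */
#include <assert.h>
#include <stdint.h>

#define NDCI 8 /* DCIs per pool */

typedef struct {
    uint8_t stack[NDCI]; /* free DCI indices (absolute) */
    uint8_t stack_top;   /* number of DCIs currently handed out */
} dci_pool_t;

static void dci_pool_init(dci_pool_t *pool, uint8_t pool_index)
{
    uint8_t i;

    for (i = 0; i < NDCI; i++) {
        pool->stack[i] = (uint8_t)((pool_index * NDCI) + i);
    }
    pool->stack_top = 0;
}

static int dci_can_alloc(const dci_pool_t *pool)
{
    return pool->stack_top < NDCI;
}

static uint8_t dci_alloc(dci_pool_t *pool)
{
    assert(dci_can_alloc(pool));
    return pool->stack[pool->stack_top++]; /* pop the next free index */
}

static void dci_release(dci_pool_t *pool, uint8_t dci_index)
{
    assert(pool->stack_top > 0);
    pool->stack[--pool->stack_top] = dci_index; /* push it back */
}
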
*/ - if (uct_dc_mlx5_iface_dci_can_alloc(iface) && + if (uct_dc_mlx5_iface_dci_can_alloc(iface, pool_index) && uct_dc_mlx5_iface_has_tx_resources(iface)) { uct_dc_mlx5_iface_dci_alloc(iface, ep); return UCS_OK; @@ -495,11 +531,6 @@ uct_dc_mlx5_iface_dci_get(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) return UCS_ERR_NO_RESOURCE; } -static UCS_F_ALWAYS_INLINE int uct_dc_mlx5_ep_fc_wait_for_grant(uct_dc_mlx5_ep_t *ep) -{ - return ep->flags & UCT_DC_MLX5_EP_FLAG_FC_WAIT_FOR_GRANT; -} - ucs_status_t uct_dc_mlx5_ep_check_fc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep); static inline struct mlx5_grh_av *uct_dc_mlx5_ep_get_grh(uct_dc_mlx5_ep_t *ep) @@ -527,9 +558,12 @@ static inline struct mlx5_grh_av *uct_dc_mlx5_ep_get_grh(uct_dc_mlx5_ep_t *ep) { \ UCT_RC_CHECK_NUM_RDMA_READ_RET(&(_iface)->super.super, \ UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE)) \ - ucs_status_t status = uct_dc_mlx5_iface_dci_get(_iface, _ep); \ - if (ucs_unlikely(status != UCS_OK)) { \ - return UCS_STATUS_PTR(status); \ + { \ + ucs_status_t status = \ + uct_dc_mlx5_iface_dci_get(_iface, _ep); \ + if (ucs_unlikely(status != UCS_OK)) { \ + return UCS_STATUS_PTR(status); \ + } \ } \ } diff --git a/src/uct/ib/mlx5/dv/ib_mlx5_dv.c b/src/uct/ib/mlx5/dv/ib_mlx5_dv.c index 92cd87a495f..fd03bb39463 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5_dv.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5_dv.c @@ -117,7 +117,16 @@ ucs_status_t uct_ib_mlx5_devx_create_qp(uct_ib_iface_t *iface, UCT_IB_MLX5DV_SET(create_qp_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_QP); qpc = UCT_IB_MLX5DV_ADDR_OF(create_qp_in, in, qpc); - UCT_IB_MLX5DV_SET(qpc, qpc, st, UCT_IB_MLX5_QPC_ST_RC); + if (attr->super.qp_type == UCT_IB_QPT_DCI) { + UCT_IB_MLX5DV_SET(qpc, qpc, st, UCT_IB_MLX5_QPC_ST_DCI); + UCT_IB_MLX5DV_SET(qpc, qpc, full_handshake, !!attr->full_handshake); + } else if (attr->super.qp_type == IBV_QPT_RC) { + UCT_IB_MLX5DV_SET(qpc, qpc, st, UCT_IB_MLX5_QPC_ST_RC); + } else { + ucs_error("create qp failed: unknown type %d", attr->super.qp_type); + status = UCS_ERR_UNSUPPORTED; + goto err_free_db; + } UCT_IB_MLX5DV_SET(qpc, qpc, pm_state, UCT_IB_MLX5_QPC_PM_STATE_MIGRATED); UCT_IB_MLX5DV_SET(qpc, qpc, pd, dvpd.pdn); UCT_IB_MLX5DV_SET(qpc, qpc, uar_page, uar->uar->page_id); @@ -135,6 +144,7 @@ ucs_status_t uct_ib_mlx5_devx_create_qp(uct_ib_iface_t *iface, uct_ib_mlx5_qpc_cs_res(attr->super.max_inl_cqe[UCT_IB_DIR_RX], 0)); UCT_IB_MLX5DV_SET64(qpc, qpc, dbr_addr, qp->devx.dbrec->offset); UCT_IB_MLX5DV_SET(qpc, qpc, dbr_umem_id, qp->devx.dbrec->mem_id); + UCT_IB_MLX5DV_SET(qpc, qpc, user_index, attr->uidx); if (qp->devx.wq_buf == NULL) { UCT_IB_MLX5DV_SET(qpc, qpc, no_sq, true); @@ -155,20 +165,22 @@ ucs_status_t uct_ib_mlx5_devx_create_qp(uct_ib_iface_t *iface, qp->qp_num = UCT_IB_MLX5DV_GET(create_qp_out, out, qpn); - qpc = UCT_IB_MLX5DV_ADDR_OF(rst2init_qp_in, in_2init, qpc); - UCT_IB_MLX5DV_SET(rst2init_qp_in, in_2init, opcode, UCT_IB_MLX5_CMD_OP_RST2INIT_QP); - UCT_IB_MLX5DV_SET(rst2init_qp_in, in_2init, qpn, qp->qp_num); - UCT_IB_MLX5DV_SET(qpc, qpc, pm_state, UCT_IB_MLX5_QPC_PM_STATE_MIGRATED); - UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->super.port); - UCT_IB_MLX5DV_SET(qpc, qpc, rwe, true); + if (attr->super.qp_type == IBV_QPT_RC) { + qpc = UCT_IB_MLX5DV_ADDR_OF(rst2init_qp_in, in_2init, qpc); + UCT_IB_MLX5DV_SET(rst2init_qp_in, in_2init, opcode, UCT_IB_MLX5_CMD_OP_RST2INIT_QP); + UCT_IB_MLX5DV_SET(rst2init_qp_in, in_2init, qpn, qp->qp_num); + UCT_IB_MLX5DV_SET(qpc, qpc, pm_state, UCT_IB_MLX5_QPC_PM_STATE_MIGRATED); + UCT_IB_MLX5DV_SET(qpc, 
qpc, primary_address_path.vhca_port_num, attr->super.port); + UCT_IB_MLX5DV_SET(qpc, qpc, rwe, true); - ret = mlx5dv_devx_obj_modify(qp->devx.obj, in_2init, sizeof(in_2init), - out_2init, sizeof(out_2init)); - if (ret) { - ucs_error("mlx5dv_devx_obj_modify(2INIT_QP) failed, syndrome %x: %m", - UCT_IB_MLX5DV_GET(rst2init_qp_out, out_2init, syndrome)); - status = UCS_ERR_IO_ERROR; - goto err_free; + ret = mlx5dv_devx_obj_modify(qp->devx.obj, in_2init, sizeof(in_2init), + out_2init, sizeof(out_2init)); + if (ret) { + ucs_error("mlx5dv_devx_obj_modify(2INIT_QP) failed, syndrome %x: %m", + UCT_IB_MLX5DV_GET(rst2init_qp_out, out_2init, syndrome)); + status = UCS_ERR_IO_ERROR; + goto err_free; + } } qp->type = UCT_IB_MLX5_OBJ_TYPE_DEVX; @@ -264,6 +276,59 @@ void uct_ib_mlx5_devx_destroy_qp(uct_ib_mlx5_md_t *md, uct_ib_mlx5_qp_t *qp) uct_ib_mlx5_put_dbrec(qp->devx.dbrec); uct_ib_mlx5_md_buf_free(md, qp->devx.wq_buf, &qp->devx.mem); } + +ucs_status_t uct_ib_mlx5_devx_query_ooo_sl_mask(uct_ib_mlx5_md_t *md, + uint8_t port_num, + uint16_t *ooo_sl_mask_p) +{ + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_vport_context_in)] = {}; + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_vport_context_out)] = {}; + void *ctx; + int ret; + + if (!(md->flags & UCT_IB_MLX5_MD_FLAG_OOO_SL_MASK)) { + return UCS_ERR_UNSUPPORTED; + } + + UCT_IB_MLX5DV_SET(query_hca_vport_context_in, in, opcode, + UCT_IB_MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT); + UCT_IB_MLX5DV_SET(query_hca_vport_context_in, in, port_num, port_num); + + ret = mlx5dv_devx_general_cmd(md->super.dev.ibv_context, in, sizeof(in), + out, sizeof(out)); + if (ret != 0) { + ucs_error("mlx5dv_devx_general_cmd(QUERY_HCA_VPORT_CONTEXT) failed," + " syndrome %x: %m", + UCT_IB_MLX5DV_GET(query_hca_vport_context_out, out, + syndrome)); + return UCS_ERR_IO_ERROR; + } + + ctx = UCT_IB_MLX5DV_ADDR_OF(query_hca_vport_context_out, out, + hca_vport_context); + + *ooo_sl_mask_p = UCT_IB_MLX5DV_GET(hca_vport_context, ctx, ooo_sl_mask); + + return UCS_OK; +} + +void uct_ib_mlx5_devx_set_qpc_port_affinity(uct_ib_mlx5_md_t *md, + uint8_t path_index, void *qpc, + uint32_t *opt_param_mask) +{ + uct_ib_device_t *dev = &md->super.dev; + uint8_t tx_port = dev->first_port; + + if (!(md->flags & UCT_IB_MLX5_MD_FLAG_LAG)) { + return; + } + + *opt_param_mask |= UCT_IB_MLX5_QP_OPTPAR_LAG_TX_AFF; + if (dev->lag_level > 0) { + tx_port += path_index % dev->lag_level; + } + UCT_IB_MLX5DV_SET(qpc, qpc, lag_tx_port_affinity, tx_port); +} #endif ucs_status_t uct_ib_mlx5dv_arm_cq(uct_ib_mlx5_cq_t *cq, int solicited) diff --git a/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h b/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h index 9835ba1e377..cdd66bf52d1 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h +++ b/src/uct/ib/mlx5/dv/ib_mlx5_ifc.h @@ -74,7 +74,9 @@ enum { UCT_IB_MLX5_CMD_OP_CREATE_DCT = 0x710, UCT_IB_MLX5_CMD_OP_DRAIN_DCT = 0x712, UCT_IB_MLX5_CMD_OP_CREATE_XRQ = 0x717, - UCT_IB_MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY = 0x726 + UCT_IB_MLX5_CMD_OP_SET_XRQ_DC_PARAMS_ENTRY = 0x726, + UCT_IB_MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT = 0x762, + UCT_IB_MLX5_CMD_OP_QUERY_LAG = 0x842 }; enum { @@ -129,12 +131,16 @@ struct uct_ib_mlx5_cmd_hca_cap_bits { uint8_t reserved_at_120[0xa]; uint8_t log_max_ra_req_dc[0x6]; - uint8_t reserved_at_130[0xa]; + uint8_t reserved_at_130[0x8]; + uint8_t ooo_sl_mask[0x1]; + uint8_t reserved_at_139[0x1]; uint8_t log_max_ra_res_dc[0x6]; uint8_t reserved_at_140[0xa]; uint8_t log_max_ra_req_qp[0x6]; - uint8_t reserved_at_150[0xa]; + uint8_t reserved_at_150[0x2]; + uint8_t rts2rts_lag_tx_port_affinity[0x1]; + 
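
/*
 * Editorial sketch: in the PRM-style layouts above, each uint8_t array
 * declares a field width in *bits* (e.g. uint8_t ooo_sl_mask[0x1] is a
 * one-bit field), so a field is addressed by its bit offset from the start
 * of the structure. A simplified, hypothetical setter for such big-endian
 * layouts, assuming the field does not straddle a 32-bit boundary (the real
 * UCT_IB_MLX5DV_SET machinery is macro-generated from these layouts):
 */
#include <arpa/inet.h>
#include <stdint.h>

static void prm_set(void *buf, unsigned bit_off, unsigned width,
                    uint32_t value)
{
    uint32_t *dw   = (uint32_t*)buf + (bit_off / 32); /* containing dword */
    unsigned shift = 32 - (bit_off % 32) - width;     /* MSB-first layout */
    uint32_t mask  = (width == 32) ? 0xffffffffu : ((1u << width) - 1);
    uint32_t host  = ntohl(*dw);

    host &= ~(mask << shift);        /* clear the field */
    host |= (value & mask) << shift; /* write the new value */
    *dw   = htonl(host);             /* store back in network order */
}
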
uint8_t reserved_at_153[0x7]; uint8_t log_max_ra_res_qp[0x6]; uint8_t end_pad[0x1]; @@ -199,7 +205,9 @@ struct uct_ib_mlx5_cmd_hca_cap_bits { uint8_t wol_p[0x1]; uint8_t stat_rate_support[0x10]; - uint8_t reserved_at_1f0[0xc]; + uint8_t reserved_at_1f0[0x8]; + uint8_t init2_lag_tx_port_affinity[0x1]; + uint8_t reserved_at_1f9[0x3]; uint8_t cqe_version[0x4]; uint8_t compact_address_vector[0x1]; @@ -234,7 +242,8 @@ struct uct_ib_mlx5_cmd_hca_cap_bits { uint8_t cq_oi[0x1]; uint8_t cq_resize[0x1]; uint8_t cq_moderation[0x1]; - uint8_t reserved_at_223[0x3]; + uint8_t reserved_at_223[0x2]; + uint8_t ib_striding_wq_cq_first_indication[0x1]; uint8_t cq_eq_remap[0x1]; uint8_t pg[0x1]; uint8_t block_lb_mc[0x1]; @@ -268,8 +277,10 @@ struct uct_ib_mlx5_cmd_hca_cap_bits { uint8_t pad_tx_eth_packet[0x1]; uint8_t reserved_at_263[0x8]; uint8_t log_bf_reg_size[0x5]; - - uint8_t reserved_at_270[0xb]; + uint8_t reserved_at_270[0x6]; + uint8_t lag_dct[0x2]; + uint8_t lag_tx_port_affinity[0x1]; + uint8_t reserved_at_279[0x2]; uint8_t lag_master[0x1]; uint8_t num_lag_ports[0x4]; @@ -490,6 +501,110 @@ struct uct_ib_mlx5_query_hca_cap_in_bits { uint8_t reserved_at_40[0x40]; }; +struct uct_ib_mlx5_lag_context_bits { + uint8_t reserved_at_0[0x1d]; + uint8_t lag_state[0x3]; + uint8_t reserved_at_20[0x20]; +}; + +struct uct_ib_mlx5_query_lag_out_bits { + uint8_t status[0x8]; + uint8_t reserved_at_8[0x18]; + + uint8_t syndrome[0x20]; + + struct uct_ib_mlx5_lag_context_bits lag_context; +}; + +struct uct_ib_mlx5_query_lag_in_bits { + uint8_t opcode[0x10]; + uint8_t uid[0x10]; + + uint8_t reserved_at_20[0x10]; + uint8_t op_mod[0x10]; + + uint8_t reserved_at_40[0x40]; +}; + +struct uct_ib_mlx5_hca_vport_context_bits { + uint8_t field_select[0x20]; + + uint8_t reserved_at_20[0xe0]; + + uint8_t sm_virt_aware[0x1]; + uint8_t has_smi[0x1]; + uint8_t has_raw[0x1]; + uint8_t grh_required[0x1]; + uint8_t reserved_at_104[0xc]; + uint8_t port_physical_state[0x4]; + uint8_t vport_state_policy[0x4]; + uint8_t port_state[0x4]; + uint8_t vport_state[0x4]; + + uint8_t reserved_at_120[0x20]; + + uint8_t system_image_guid[0x40]; + + uint8_t port_guid[0x40]; + + uint8_t node_guid[0x40]; + + uint8_t cap_mask1[0x20]; + + uint8_t cap_mask1_field_select[0x20]; + + uint8_t cap_mask2[0x20]; + + uint8_t cap_mask2_field_select[0x20]; + + uint8_t reserved_at_280[0x10]; + + uint8_t ooo_sl_mask[0x10]; + + uint8_t reserved_at_296[0x40]; + + uint8_t lid[0x10]; + uint8_t reserved_at_310[0x4]; + uint8_t init_type_reply[0x4]; + uint8_t lmc[0x3]; + uint8_t subnet_timeout[0x5]; + + uint8_t sm_lid[0x10]; + uint8_t sm_sl[0x4]; + uint8_t reserved_at_334[0xc]; + + uint8_t qkey_violation_counter[0x10]; + uint8_t pkey_violation_counter[0x10]; + + uint8_t reserved_at_360[0xca0]; +}; + +struct uct_ib_mlx5_query_hca_vport_context_out_bits { + uint8_t status[0x8]; + uint8_t reserved_at_8[0x18]; + + uint8_t syndrome[0x20]; + + uint8_t reserved_at_40[0x40]; + + struct uct_ib_mlx5_hca_vport_context_bits hca_vport_context; +}; + +struct uct_ib_mlx5_query_hca_vport_context_in_bits { + uint8_t opcode[0x10]; + uint8_t reserved_at_10[0x10]; + + uint8_t reserved_at_20[0x10]; + uint8_t op_mod[0x10]; + + uint8_t other_vport[0x1]; + uint8_t reserved_at_41[0xb]; + uint8_t port_num[0x4]; + uint8_t vport_number[0x10]; + + uint8_t reserved_at_60[0x20]; +}; + enum { UCT_IB_MLX5_MKC_ACCESS_MODE_PA = 0x0, UCT_IB_MLX5_MKC_ACCESS_MODE_MTT = 0x1, @@ -677,8 +792,9 @@ struct uct_ib_mlx5_dctc_bits { uint8_t atomic_like_write_en[0x1]; uint8_t latency_sensitive[0x1]; uint8_t rlky[0x1]; - 
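
/*
 * Editorial note on ooo_sl_mask above: it is a 16-bit mask in which bit i
 * set means SL i supports out-of-order (adaptive routing) delivery. A
 * trivial helper to test one SL:
 */
#include <stdint.h>

static int sl_supports_ar(uint16_t ooo_sl_mask, uint8_t sl)
{
    return (ooo_sl_mask >> sl) & 1;
}
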
uint8_t free_ar[0x1]; - uint8_t reserved_at_73[0xd]; + uint8_t force_full_handshake[0x1]; + uint8_t multi_path[0x1]; + uint8_t reserved_at_74[0xc]; uint8_t reserved_at_80[0x8]; uint8_t cs_res[0x8]; @@ -1087,6 +1203,13 @@ enum { UCT_IB_MLX5_QPC_CS_RES_UP_TO_64B = 0x2 }; +enum { + UCT_IB_MLX5_QP_OPTPAR_RRE = 1 << 1, + UCT_IB_MLX5_QP_OPTPAR_RAE = 1 << 2, + UCT_IB_MLX5_QP_OPTPAR_RWE = 1 << 3, + UCT_IB_MLX5_QP_OPTPAR_LAG_TX_AFF = 1 << 15 +}; + static inline unsigned uct_ib_mlx5_qpc_cs_res(unsigned size, int dc) { return (size > 32) ? UCT_IB_MLX5_QPC_CS_RES_UP_TO_64B : @@ -1129,7 +1252,9 @@ struct uct_ib_mlx5_qpc_bits { uint8_t counter_set_id[0x8]; uint8_t uar_page[0x18]; - uint8_t reserved_at_80[0x8]; + uint8_t reserved_at_80[0x3]; + uint8_t full_handshake[0x1]; + uint8_t cnak_reverse_sl[0x4]; uint8_t user_index[0x18]; uint8_t reserved_at_a0[0x3]; diff --git a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c index 7a983bf834c..e60cd40e9af 100644 --- a/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c +++ b/src/uct/ib/mlx5/dv/ib_mlx5dv_md.c @@ -546,6 +546,27 @@ uct_ib_mlx5_devx_check_odp(uct_ib_mlx5_md_t *md, return UCS_OK; } +static ucs_status_t +uct_ib_mlx5_devx_query_lag(uct_ib_mlx5_md_t *md, uint8_t *state) +{ + char out[UCT_IB_MLX5DV_ST_SZ_BYTES(query_lag_out)] = {}; + char in[UCT_IB_MLX5DV_ST_SZ_BYTES(query_lag_out)] = {}; + void *lag; + int ret; + + lag = UCT_IB_MLX5DV_ADDR_OF(query_lag_out, out, lag_context); + UCT_IB_MLX5DV_SET(query_lag_in, in, opcode, UCT_IB_MLX5_CMD_OP_QUERY_LAG); + ret = mlx5dv_devx_general_cmd(md->super.dev.ibv_context, in, sizeof(in), + out, sizeof(out)); + if (ret != 0) { + ucs_debug("mlx5dv_devx_general_cmd(QUERY_LAG) failed: %m"); + return UCS_ERR_IO_ERROR; + } + + *state = UCT_IB_MLX5DV_GET(lag_context, lag, lag_state); + return UCS_OK; +} + static struct ibv_context * uct_ib_mlx5_devx_open_device(struct ibv_device *ibv_device, struct mlx5dv_context_attr *dv_attr) @@ -576,8 +597,9 @@ static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, { char out[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_out)] = {}; char in[UCT_IB_MLX5DV_ST_SZ_BYTES(query_hca_cap_in)] = {}; - struct mlx5dv_context_attr dv_attr = {}; - ucs_status_t status = UCS_OK; + struct mlx5dv_context_attr dv_attr = {}; + ucs_status_t status = UCS_OK; + uint8_t lag_state = 0; struct ibv_context *ctx; uct_ib_device_t *dev; uct_ib_mlx5_md_t *md; @@ -649,8 +671,19 @@ static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, goto err_free; } - if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, dct)) { - dev->flags |= UCT_IB_DEVICE_FLAG_DC; + status = uct_ib_mlx5_devx_query_lag(md, &lag_state); + if (status != UCS_OK) { + dev->lag_level = 0; + } else if (lag_state == 0) { + dev->lag_level = 1; + } else { + dev->lag_level = UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, num_lag_ports); + } + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, dct) && + (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, lag_dct) || (lag_state == 0))) { + /* Either DCT supports LAG, or LAG is off */ + dev->flags |= UCT_IB_DEVICE_FLAG_DC; } if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, rndv_offload_dc)) { @@ -677,6 +710,23 @@ static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, md->flags |= UCT_IB_MLX5_MD_FLAG_RMP; } + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, ooo_sl_mask)) { + md->flags |= UCT_IB_MLX5_MD_FLAG_OOO_SL_MASK; + } + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, init2_lag_tx_port_affinity)) { + md->flags |= UCT_IB_MLX5_MD_FLAG_LAG; + } + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, cqe_version)) { + md->flags |= 
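
/*
 * Editorial sketch of the LAG handling above: lag_level is derived from the
 * QUERY_LAG result (0 when unknown, 1 when LAG is off, otherwise the number
 * of bonded ports), and paths are then spread round-robin across the ports.
 * Hypothetical standalone model:
 */
#include <stdint.h>

static uint8_t derive_lag_level(int query_ok, uint8_t lag_state,
                                uint8_t num_lag_ports)
{
    if (!query_ok) {
        return 0;          /* no LAG information available */
    }
    return (lag_state == 0) ? 1 : num_lag_ports;
}

static uint8_t lag_tx_port(uint8_t first_port, uint8_t lag_level,
                           uint8_t path_index)
{
    if (lag_level == 0) {
        return first_port; /* keep the default port */
    }
    return first_port + (path_index % lag_level);
}
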
UCT_IB_MLX5_MD_FLAG_CQE_V1; + } + + if (UCT_IB_MLX5DV_GET(cmd_hca_cap, cap, + ib_striding_wq_cq_first_indication)) { + md->flags |= UCT_IB_MLX5_MD_FLAG_MP_XRQ_FIRST_MSG; + } + status = uct_ib_mlx5_devx_check_odp(md, md_config, cap); if (status != UCS_OK) { goto err_free; @@ -768,7 +818,7 @@ static ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device, return status; } -void uct_ib_mlx5_devx_md_cleanup(uct_ib_md_t *ibmd) +static void uct_ib_mlx5_devx_md_cleanup(uct_ib_md_t *ibmd) { uct_ib_mlx5_md_t *md = ucs_derived_of(ibmd, uct_ib_mlx5_md_t); @@ -921,7 +971,8 @@ static ucs_status_t uct_ib_mlx5dv_md_open(struct ibv_device *ibv_device, ctx = ibv_open_device(ibv_device); if (ctx == NULL) { - ucs_debug("ibv_open_device(%s) failed: %m", ibv_get_device_name(ibv_device)); + ucs_diag("ibv_open_device(%s) failed: %m", + ibv_get_device_name(ibv_device)); status = UCS_ERR_UNSUPPORTED; goto err; } diff --git a/src/uct/ib/mlx5/exp/ib_exp_md.c b/src/uct/ib/mlx5/exp/ib_exp_md.c index 8218ac2a899..4c3cf8b2d5e 100644 --- a/src/uct/ib/mlx5/exp/ib_exp_md.c +++ b/src/uct/ib/mlx5/exp/ib_exp_md.c @@ -171,7 +171,8 @@ static ucs_status_t uct_ib_mlx5_exp_md_umr_qp_create(uct_ib_mlx5_md_t *md) qp_attr.ah_attr.dlid = port_attr->lid; qp_attr.ah_attr.is_global = 1; if (uct_ib_device_query_gid(ibdev, port_num, UCT_IB_MD_DEFAULT_GID_INDEX, - &qp_attr.ah_attr.grh.dgid) != UCS_OK) { + &qp_attr.ah_attr.grh.dgid, + UCS_LOG_LEVEL_ERROR) != UCS_OK) { goto err_destroy_qp; } @@ -489,9 +490,16 @@ static ucs_status_t uct_ib_mlx5_exp_dereg_atomic_key(uct_ib_md_t *ibmd, { #ifdef HAVE_EXP_UMR uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t); + struct ibv_mr *atomic_mr; int ret; - ret = UCS_PROFILE_CALL(ibv_dereg_mr, memh->atomic_mr); + if (memh->super.flags & UCT_IB_MEM_MULTITHREADED) { + atomic_mr = memh->ksm_data->atomic_mr; + } else { + atomic_mr = memh->atomic_mr; + } + + ret = UCS_PROFILE_CALL(ibv_dereg_mr, atomic_mr); if (ret != 0) { ucs_error("ibv_dereg_mr() failed: %m"); return UCS_ERR_IO_ERROR; @@ -576,13 +584,6 @@ static ucs_status_t uct_ib_mlx5_exp_dereg_multithreaded(uct_ib_md_t *ibmd, size_t chunk = ibmd->config.mt_reg_chunk; ucs_status_t s, status = UCS_OK; - if (memh->super.flags & UCT_IB_MEM_FLAG_ATOMIC_MR) { - s = uct_ib_dereg_mr(memh->ksm_data->atomic_mr); - if (s != UCS_OK) { - status = s; - } - } - s = uct_ib_md_handle_mr_list_multithreaded(ibmd, memh->mr->addr, memh->mr->length, UCT_IB_MEM_DEREG, chunk, @@ -622,7 +623,8 @@ static ucs_status_t uct_ib_mlx5_exp_md_open(struct ibv_device *ibv_device, ctx = ibv_open_device(ibv_device); if (ctx == NULL) { - ucs_debug("ibv_open_device(%s) failed: %m", ibv_get_device_name(ibv_device)); + ucs_diag("ibv_open_device(%s) failed: %m", + ibv_get_device_name(ibv_device)); status = UCS_ERR_UNSUPPORTED; goto err; } diff --git a/src/uct/ib/mlx5/ib_mlx5.c b/src/uct/ib/mlx5/ib_mlx5.c index 67e8bce07d3..afd57bd4738 100644 --- a/src/uct/ib/mlx5/ib_mlx5.c +++ b/src/uct/ib/mlx5/ib_mlx5.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -50,6 +51,21 @@ ucs_config_field_t uct_ib_mlx5_iface_config_table[] = { ucs_offsetof(uct_ib_mlx5_iface_config_t, mmio_mode), UCS_CONFIG_TYPE_ENUM(uct_ib_mlx5_mmio_modes)}, + {"AR_ENABLE", "auto", + "Enable Adaptive Routing (out of order) feature on SL that supports it.\n" + "SLs are selected as follows:\n" + "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n" + "+ + UCX_IB_AR_ENABLE=yes + UCX_IB_AR_ENABLE=no + 
UCX_IB_AR_ENABLE=try + UCX_IB_AR_ENABLE=auto +\n" + "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n" + "+ UCX_IB_SL=auto + AR enabled on some SLs + Use 1st SL with AR + Use 1st SL without AR + Use 1st SL with AR + Use SL=0 +\n" + "+ + AR enabled on all SLs + Use SL=0 + Failure + Use SL=0 + Use SL=0 +\n" + "+ + AR disabled on all SLs + Failure + Use SL=0 + Use SL=0 + Use SL=0 +\n" + "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n" + "+ UCX_IB_SL=<sl> + AR enabled on <sl> + Use SL=<sl> + Failure + Use SL=<sl> + Use SL=<sl> +\n" + "+ + AR disabled on <sl> + Failure + Use SL=<sl> + Use SL=<sl> + Use SL=<sl> +\n" + "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n", + ucs_offsetof(uct_ib_mlx5_iface_config_t, ar_enable), UCS_CONFIG_TYPE_TERNARY_AUTO}, + {NULL} }; @@ -58,12 +74,12 @@ ucs_status_t uct_ib_mlx5_create_cq(uct_ib_iface_t *iface, uct_ib_dir_t dir, int preferred_cpu, size_t inl) { #if HAVE_DECL_MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE - uct_ib_device_t *dev = uct_ib_iface_device(iface); - struct ibv_cq *cq; + uct_ib_device_t *dev = uct_ib_iface_device(iface); struct ibv_cq_init_attr_ex cq_attr = {}; struct mlx5dv_cq_init_attr dv_attr = {}; + struct ibv_cq *cq; - cq_attr.cqe = init_attr->cq_len[dir]; + cq_attr.cqe = uct_ib_cq_size(iface, init_attr, dir); cq_attr.channel = iface->comp_channel; cq_attr.comp_vector = preferred_cpu; if (init_attr->flags & UCT_IB_CQ_IGNORE_OVERRUN) { @@ -385,15 +401,56 @@ int uct_ib_mlx5_devx_uar_cmp(uct_ib_mlx5_devx_uar_t *uar, return uar->ctx == md->super.dev.ibv_context; } +#if HAVE_DEVX +static ucs_status_t +uct_ib_mlx5_devx_alloc_uar(uct_ib_mlx5_md_t *md, unsigned flags, int log_level, + char *title, char *fallback, + struct mlx5dv_devx_uar **uar_p) +{ + struct mlx5dv_devx_uar *uar; + char buf[512]; + + uar = mlx5dv_devx_alloc_uar(md->super.dev.ibv_context, flags); + if (uar == NULL) { + sprintf(buf, "mlx5dv_devx_alloc_uar(device=%s, flags=0x%x(%s)) " + "failed: %m", uct_ib_device_name(&md->super.dev), flags, title); + if (fallback == NULL) { + ucs_log(log_level, "%s", buf); + } else { + ucs_log(log_level, "%s, fallback to %s", buf, fallback); + } + + return UCS_ERR_NO_MEMORY; + } + + *uar_p = uar; + return UCS_OK; +} +#endif + ucs_status_t uct_ib_mlx5_devx_uar_init(uct_ib_mlx5_devx_uar_t *uar, uct_ib_mlx5_md_t *md, uct_ib_mlx5_mmio_mode_t mmio_mode) { #if HAVE_DEVX - uar->uar = mlx5dv_devx_alloc_uar(md->super.dev.ibv_context, 0); - if (uar->uar == NULL) { - ucs_error("mlx5dv_devx_alloc_uar() failed: %m"); - return UCS_ERR_NO_MEMORY; + ucs_status_t status; + +#if HAVE_DECL_MLX5DV_UAR_ALLOC_TYPE_NC + status = uct_ib_mlx5_devx_alloc_uar(md, UCT_IB_MLX5_UAR_ALLOC_TYPE_WC, + UCS_LOG_LEVEL_DEBUG, "WC", "NC", + &uar->uar); + if (status != UCS_OK) { + status = uct_ib_mlx5_devx_alloc_uar(md, UCT_IB_MLX5_UAR_ALLOC_TYPE_NC, + UCS_LOG_LEVEL_ERROR, "NC", NULL, + &uar->uar); + } +#else + status = uct_ib_mlx5_devx_alloc_uar(md, UCT_IB_MLX5_UAR_ALLOC_TYPE_WC, + UCS_LOG_LEVEL_ERROR, "WC", NULL, + &uar->uar); +#endif + if (status != UCS_OK) { + return status; } uar->super.addr.ptr = uar->uar->reg_addr; @@ -425,6 +482,29 @@ void uct_ib_mlx5_txwq_reset(uct_ib_mlx5_txwq_t *txwq) { memset(txwq->qstart, 0, UCS_PTR_BYTE_DIFF(txwq->qstart, txwq->qend)); } +void uct_ib_mlx5_txwq_vfs_populate(uct_ib_mlx5_txwq_t *txwq, void *parent_obj) +{ + 
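
/*
 * Editorial sketch of the UAR allocation fallback above: try a
 * write-combining (WC) doorbell mapping first and quietly fall back to a
 * non-cached (NC) mapping when the driver refuses WC. The allocator
 * callback is a hypothetical stand-in for mlx5dv_devx_alloc_uar():
 */
#include <stddef.h>

typedef void *(*uar_alloc_fn)(unsigned flags); /* returns NULL on failure */

static void *alloc_uar_wc_or_nc(uar_alloc_fn alloc, unsigned wc_flag,
                                unsigned nc_flag)
{
    void *uar = alloc(wc_flag); /* fast path: write-combining */

    if (uar == NULL) {
        uar = alloc(nc_flag);   /* fallback: non-cached mapping */
    }
    return uar;
}
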
ucs_vfs_obj_add_ro_file(parent_obj, ucs_vfs_show_primitive, + &txwq->super.qp_num, UCS_VFS_TYPE_U32_HEX, + "qp_num"); + ucs_vfs_obj_add_ro_file(parent_obj, ucs_vfs_show_primitive, &txwq->sw_pi, + UCS_VFS_TYPE_U16, "sw_pi"); + ucs_vfs_obj_add_ro_file(parent_obj, ucs_vfs_show_primitive, + &txwq->prev_sw_pi, UCS_VFS_TYPE_U16, "prev_sw_pi"); + ucs_vfs_obj_add_ro_file(parent_obj, ucs_vfs_show_primitive, &txwq->qstart, + UCS_VFS_TYPE_POINTER, "qstart"); + ucs_vfs_obj_add_ro_file(parent_obj, ucs_vfs_show_primitive, &txwq->qend, + UCS_VFS_TYPE_POINTER, "qend"); + ucs_vfs_obj_add_ro_file(parent_obj, ucs_vfs_show_primitive, &txwq->bb_max, + UCS_VFS_TYPE_U16, "bb_max"); + ucs_vfs_obj_add_ro_file(parent_obj, ucs_vfs_show_primitive, &txwq->sig_pi, + UCS_VFS_TYPE_U16, "sig_pi"); +#if UCS_ENABLE_ASSERT + ucs_vfs_obj_add_ro_file(parent_obj, ucs_vfs_show_primitive, &txwq->hw_ci, + UCS_VFS_TYPE_U16, "hw_ci"); +#endif +} + ucs_status_t uct_ib_mlx5_get_mmio_mode(uct_priv_worker_t *worker, uct_ib_mlx5_mmio_mode_t cfg_mmio_mode, @@ -632,7 +712,6 @@ void uct_ib_mlx5_srq_buff_init(uct_ib_mlx5_srq_t *srq, uint32_t head, srq->ready_idx = UINT16_MAX; srq->sw_pi = UINT16_MAX; srq->mask = tail; - srq->tail = tail; srq->stride = uct_ib_mlx5_srq_stride(sge_num); for (i = head; i <= tail; ++i) { @@ -700,3 +779,129 @@ void uct_ib_mlx5_destroy_qp(uct_ib_mlx5_md_t *md, uct_ib_mlx5_qp_t *qp) break; } } + +/* Keep this as a separate function to allow testing the SL selection logic */ +ucs_status_t +uct_ib_mlx5_select_sl(const uct_ib_iface_config_t *ib_config, + ucs_ternary_auto_value_t ar_enable, + uint16_t hw_sl_mask, int have_sl_mask_cap, + const char *dev_name, uint8_t port_num, + uint8_t *sl_p) +{ + ucs_status_t status = UCS_OK; + const char UCS_V_UNUSED *sl_ar_support_str; + uint16_t sl_allow_mask, sls_with_ar, sls_without_ar; + ucs_string_buffer_t sls_with_ar_str, sls_without_ar_str; + char sl_str[8]; + char ar_enable_str[8]; + uint8_t sl; + + ucs_assert(have_sl_mask_cap || (hw_sl_mask == 0)); + + /* which SLs are allowed by user config */ + sl_allow_mask = (ib_config->sl == UCS_ULUNITS_AUTO) ? + UCS_MASK(UCT_IB_SL_NUM) : UCS_BIT(ib_config->sl); + + if (have_sl_mask_cap) { + sls_with_ar = sl_allow_mask & hw_sl_mask; + sls_without_ar = sl_allow_mask & ~hw_sl_mask; + } else { + sls_with_ar = + sls_without_ar = 0; + } + + ucs_string_buffer_init(&sls_with_ar_str); + ucs_string_buffer_init(&sls_without_ar_str); + + if (ar_enable == UCS_AUTO) { + /* select the SL requested by the user */ + sl = ucs_ffs64(sl_allow_mask); + if (have_sl_mask_cap) { + sl_ar_support_str = (UCS_BIT(sl) & sls_with_ar) ? "yes" : "no"; + } else { + sl_ar_support_str = "unknown"; + } + } else if (((ar_enable == UCS_YES) || (ar_enable == UCS_TRY)) && + (sls_with_ar != 0)) { + /* have SLs with AR, and AR is YES/TRY */ + sl = ucs_ffs64(sls_with_ar); + sl_ar_support_str = "yes"; + } else if (((ar_enable == UCS_NO) || (ar_enable == UCS_TRY)) && + (sls_without_ar != 0)) { + /* have SLs without AR, and AR is NO/TRY */ + sl = ucs_ffs64(sls_without_ar); + sl_ar_support_str = "no"; + } else if (ar_enable == UCS_TRY) { + ucs_assert(!have_sl_mask_cap); + sl = ucs_ffs64(sl_allow_mask); + sl_ar_support_str = "unknown"; /* we don't know which SLs support AR */ + } else { + sl_ar_support_str = (ar_enable == UCS_YES) ? 
"with" : "without"; + goto err; + } + + *sl_p = sl; + ucs_debug("SL=%u (AR support - %s) was selected on %s:%u," + " SLs with AR support = { %s }, SLs without AR support = { %s }", + sl, sl_ar_support_str, dev_name, port_num, + ucs_mask_str(sls_with_ar, &sls_with_ar_str), + ucs_mask_str(sls_without_ar, &sls_without_ar_str)); +out_str_buf_clean: + ucs_string_buffer_cleanup(&sls_with_ar_str); + ucs_string_buffer_cleanup(&sls_without_ar_str); + return status; + +err: + ucs_assert(ar_enable != UCS_TRY); + ucs_config_sprintf_ulunits(sl_str, sizeof(sl_str), &ib_config->sl, NULL); + ucs_config_sprintf_ternary_auto(ar_enable_str, sizeof(ar_enable_str), + &ar_enable, NULL); + ucs_error("AR=%s was requested for SL=%s, but %s %s AR on %s:%u," + " SLs with AR support = { %s }, SLs without AR support = { %s }", + ar_enable_str, sl_str, + have_sl_mask_cap ? "could not select SL" : + "could not detect AR mask for SLs. Please set SL manually", + sl_ar_support_str, dev_name, port_num, + ucs_mask_str(sls_with_ar, &sls_with_ar_str), + ucs_mask_str(sls_without_ar, &sls_without_ar_str)); + status = UCS_ERR_UNSUPPORTED; + goto out_str_buf_clean; +} + +ucs_status_t +uct_ib_mlx5_iface_select_sl(uct_ib_iface_t *iface, + const uct_ib_mlx5_iface_config_t *ib_mlx5_config, + const uct_ib_iface_config_t *ib_config) +{ +#if HAVE_DEVX + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.md, uct_ib_mlx5_md_t); +#endif + uint16_t ooo_sl_mask = 0; + ucs_status_t status; + + ucs_assert(iface->config.sl == UCT_IB_SL_NUM); + + if (uct_ib_device_is_port_roce(uct_ib_iface_device(iface), + iface->config.port_num)) { + /* Ethernet priority for RoCE devices can't be selected regardless of + * the AR support requested by the user, so pass an empty ooo_sl_mask */ + return uct_ib_mlx5_select_sl(ib_config, UCS_NO, 0, 1, + UCT_IB_IFACE_ARG(iface), + &iface->config.sl); + } + +#if HAVE_DEVX + status = uct_ib_mlx5_devx_query_ooo_sl_mask(md, iface->config.port_num, + &ooo_sl_mask); + if ((status != UCS_OK) && (status != UCS_ERR_UNSUPPORTED)) { + return status; + } +#else + status = UCS_ERR_UNSUPPORTED; +#endif + + return uct_ib_mlx5_select_sl(ib_config, ib_mlx5_config->ar_enable, + ooo_sl_mask, status == UCS_OK, + UCT_IB_IFACE_ARG(iface), + &iface->config.sl); +} diff --git a/src/uct/ib/mlx5/ib_mlx5.h b/src/uct/ib/mlx5/ib_mlx5.h index ff50f21a3cc..e8c5182f443 100644 --- a/src/uct/ib/mlx5/ib_mlx5.h +++ b/src/uct/ib/mlx5/ib_mlx5.h @@ -45,33 +45,43 @@ #include -#define UCT_IB_MLX5_WQE_SEG_SIZE 16 /* Size of a segment in a WQE */ -#define UCT_IB_MLX5_CQE64_MAX_INL 32 /* Inline scatter size in 64-byte CQE */ -#define UCT_IB_MLX5_CQE128_MAX_INL 64 /* Inline scatter size in 128-byte CQE */ -#define UCT_IB_MLX5_CQE64_SIZE_LOG 6 -#define UCT_IB_MLX5_CQE128_SIZE_LOG 7 -#define UCT_IB_MLX5_MAX_BB 4 -#define UCT_IB_MLX5_WORKER_BF_KEY 0x00c1b7e8u -#define UCT_IB_MLX5_DEVX_UAR_KEY 0xdea1ab1eU -#define UCT_IB_MLX5_RES_DOMAIN_KEY 0x1b1bda7aU -#define UCT_IB_MLX5_WORKER_DM_KEY 0xacdf1245u -#define UCT_IB_MLX5_EXTENDED_UD_AV 0x80 /* htonl(0x80000000) */ -#define UCT_IB_MLX5_AV_GRH_PRESENT 0x40 /* htonl(UCS_BIT(30)) */ -#define UCT_IB_MLX5_BF_REG_SIZE 256 -#define UCT_IB_MLX5_CQE_VENDOR_SYND_ODP 0x93 -#define UCT_IB_MLX5_CQE_VENDOR_SYND_PSN 0x99 -#define UCT_IB_MLX5_CQE_OP_OWN_ERR_MASK 0x80 -#define UCT_IB_MLX5_MAX_SEND_WQE_SIZE (UCT_IB_MLX5_MAX_BB * MLX5_SEND_WQE_BB) -#define UCT_IB_MLX5_CQ_SET_CI 0 -#define UCT_IB_MLX5_CQ_ARM_DB 1 -#define UCT_IB_MLX5_LOG_MAX_MSG_SIZE 30 -#define UCT_IB_MLX5_ATOMIC_MODE 3 -#define UCT_IB_MLX5_CQE_FLAG_L3_IN_DATA UCS_BIT(28) /* GRH/IP in 
the receive buffer */ -#define UCT_IB_MLX5_CQE_FLAG_L3_IN_CQE UCS_BIT(29) /* GRH/IP in the CQE */ -#define UCT_IB_MLX5_MP_RQ_BYTE_CNT_MASK 0x0000FFFF /* Byte count mask for multi-packet RQs */ -#define UCT_IB_MLX5_MP_RQ_LAST_MSG_FLAG UCS_BIT(30) /* MP last packet indication */ -#define UCT_IB_MLX5_MP_RQ_FILLER_FLAG UCS_BIT(31) /* Filler CQE indicator */ +#define UCT_IB_MLX5_WQE_SEG_SIZE 16 /* Size of a segment in a WQE */ +#define UCT_IB_MLX5_CQE64_MAX_INL 32 /* Inline scatter size in 64-byte CQE */ +#define UCT_IB_MLX5_CQE128_MAX_INL 64 /* Inline scatter size in 128-byte CQE */ +#define UCT_IB_MLX5_CQE64_SIZE_LOG 6 +#define UCT_IB_MLX5_CQE128_SIZE_LOG 7 +#define UCT_IB_MLX5_MAX_BB 4 +#define UCT_IB_MLX5_WORKER_BF_KEY 0x00c1b7e8u +#define UCT_IB_MLX5_DEVX_UAR_KEY 0xdea1ab1eU +#define UCT_IB_MLX5_RES_DOMAIN_KEY 0x1b1bda7aU +#define UCT_IB_MLX5_WORKER_DM_KEY 0xacdf1245u +#define UCT_IB_MLX5_EXTENDED_UD_AV 0x80 /* htonl(0x80000000) */ +#define UCT_IB_MLX5_AV_GRH_PRESENT 0x40 /* htonl(UCS_BIT(30)) */ +#define UCT_IB_MLX5_BF_REG_SIZE 256 +#define UCT_IB_MLX5_CQE_VENDOR_SYND_ODP 0x93 +#define UCT_IB_MLX5_CQE_VENDOR_SYND_PSN 0x99 +#define UCT_IB_MLX5_CQE_OP_OWN_ERR_MASK 0x80 +#define UCT_IB_MLX5_MAX_SEND_WQE_SIZE (UCT_IB_MLX5_MAX_BB * MLX5_SEND_WQE_BB) +#define UCT_IB_MLX5_CQ_SET_CI 0 +#define UCT_IB_MLX5_CQ_ARM_DB 1 +#define UCT_IB_MLX5_LOG_MAX_MSG_SIZE 30 +#define UCT_IB_MLX5_ATOMIC_MODE 3 +#define UCT_IB_MLX5_CQE_FLAG_L3_IN_DATA UCS_BIT(28) /* GRH/IP in the receive buffer */ +#define UCT_IB_MLX5_CQE_FLAG_L3_IN_CQE UCS_BIT(29) /* GRH/IP in the CQE */ +#define UCT_IB_MLX5_MP_RQ_BYTE_CNT_MASK 0x0000FFFF /* Byte count mask for multi-packet RQs */ +#define UCT_IB_MLX5_MP_RQ_FIRST_MSG_FLAG UCS_BIT(29) /* MP first packet indication */ +#define UCT_IB_MLX5_MP_RQ_LAST_MSG_FLAG UCS_BIT(30) /* MP last packet indication */ +#define UCT_IB_MLX5_MP_RQ_FILLER_FLAG UCS_BIT(31) /* Filler CQE indicator */ + +#if HAVE_DECL_MLX5DV_UAR_ALLOC_TYPE_BF +# define UCT_IB_MLX5_UAR_ALLOC_TYPE_WC MLX5DV_UAR_ALLOC_TYPE_BF +#else +# define UCT_IB_MLX5_UAR_ALLOC_TYPE_WC 0 +#endif +#if HAVE_DECL_MLX5DV_UAR_ALLOC_TYPE_NC +# define UCT_IB_MLX5_UAR_ALLOC_TYPE_NC MLX5DV_UAR_ALLOC_TYPE_NC +#endif #define UCT_IB_MLX5_OPMOD_EXT_ATOMIC(_log_arg_size) \ ((8) | ((_log_arg_size) - 2)) @@ -161,13 +171,22 @@ enum { UCT_IB_MLX5_MD_FLAG_INDIRECT_ATOMICS = UCS_BIT(4), /* Device supports RMP to create SRQ for AM */ UCT_IB_MLX5_MD_FLAG_RMP = UCS_BIT(5), + /* Device supports querying bitmask of OOO (AR) states per SL */ + UCT_IB_MLX5_MD_FLAG_OOO_SL_MASK = UCS_BIT(6), + /* Device has LAG */ + UCT_IB_MLX5_MD_FLAG_LAG = UCS_BIT(7), + /* Device supports CQE V1 */ + UCT_IB_MLX5_MD_FLAG_CQE_V1 = UCS_BIT(8), + /* Device supports first fragment indication for MP XRQ */ + UCT_IB_MLX5_MD_FLAG_MP_XRQ_FIRST_MSG = UCS_BIT(9), /* Object to be created by DevX */ - UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT = 6, + UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT = 10, UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(RCQP), UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(RCSRQ), UCT_IB_MLX5_MD_FLAG_DEVX_DCT = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(DCT), UCT_IB_MLX5_MD_FLAG_DEVX_DC_SRQ = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(DCSRQ), + UCT_IB_MLX5_MD_FLAG_DEVX_DCI = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(DCI), }; @@ -225,6 +244,7 @@ typedef struct uct_ib_mlx5_iface_config { } dm; #endif uct_ib_mlx5_mmio_mode_t mmio_mode; + ucs_ternary_auto_value_t ar_enable; } uct_ib_mlx5_iface_config_t; @@ -256,7 +276,6 @@ typedef struct uct_ib_mlx5_srq { uint16_t ready_idx; /* what is ready to 
be posted to hw */ uint16_t sw_pi; /* what is posted to hw */ uint16_t mask; - uint16_t tail; /* tail in the driver */ uint16_t stride; union { struct { @@ -321,6 +340,8 @@ typedef struct uct_ib_mlx5_res_domain { typedef struct uct_ib_mlx5_qp_attr { uct_ib_qp_attr_t super; uct_ib_mlx5_mmio_mode_t mmio_mode; + uint32_t uidx; + int full_handshake; } uct_ib_mlx5_qp_attr_t; @@ -527,6 +548,11 @@ void uct_ib_mlx5_qp_mmio_cleanup(uct_ib_mlx5_qp_t *qp, */ void uct_ib_mlx5_txwq_reset(uct_ib_mlx5_txwq_t *txwq); +/** + * Add txwq attributes to a VFS object + */ +void uct_ib_mlx5_txwq_vfs_populate(uct_ib_mlx5_txwq_t *txwq, void *parent_obj); + /** * Initialize rxwq structure. */ @@ -577,6 +603,14 @@ ucs_status_t uct_ib_mlx5_devx_modify_qp_state(uct_ib_mlx5_qp_t *qp, void uct_ib_mlx5_devx_destroy_qp(uct_ib_mlx5_md_t *md, uct_ib_mlx5_qp_t *qp); +ucs_status_t uct_ib_mlx5_devx_query_ooo_sl_mask(uct_ib_mlx5_md_t *md, + uint8_t port_num, + uint16_t *ooo_sl_mask_p); + +void uct_ib_mlx5_devx_set_qpc_port_affinity(uct_ib_mlx5_md_t *md, + uint8_t path_index, void *qpc, + uint32_t *opt_param_mask); + static inline ucs_status_t uct_ib_mlx5_md_buf_alloc(uct_ib_mlx5_md_t *md, size_t size, int silent, void **buf_p, uct_ib_mlx5_devx_umem_t *mem, @@ -670,6 +704,18 @@ static inline void uct_ib_mlx5_devx_destroy_qp(uct_ib_mlx5_md_t *md, uct_ib_mlx5 #endif +ucs_status_t +uct_ib_mlx5_select_sl(const uct_ib_iface_config_t *ib_config, + ucs_ternary_auto_value_t ar_enable, + uint16_t hw_sl_mask, int have_sl_mask_cap, + const char *dev_name, uint8_t port_num, + uint8_t *sl_p); + +ucs_status_t +uct_ib_mlx5_iface_select_sl(uct_ib_iface_t *iface, + const uct_ib_mlx5_iface_config_t *ib_mlx5_config, + const uct_ib_iface_config_t *ib_config); + static inline uct_ib_mlx5_dbrec_t *uct_ib_mlx5_get_dbrec(uct_ib_mlx5_md_t *md) { uct_ib_mlx5_dbrec_t *dbrec; diff --git a/src/uct/ib/mlx5/ib_mlx5.inl b/src/uct/ib/mlx5/ib_mlx5.inl index bc12d093dd1..97e8c18c68a 100644 --- a/src/uct/ib/mlx5/ib_mlx5.inl +++ b/src/uct/ib/mlx5/ib_mlx5.inl @@ -175,6 +175,36 @@ uct_ib_mlx5_inline_copy(void *restrict dest, const void *restrict src, unsigned } +/** + * Copy uct_iov_t array to inline segment, taking into account QP wrap-around. + * + * @param dest Inline data in the WQE to copy to. + * @param iov A pointer to an array of uct_iov_t elements. + * @param iovcnt Number of elements in the iov array. + * @param length Total size of the data in the iov array. + * @param wq Send work-queue. 
+ */ +static UCS_F_ALWAYS_INLINE void +uct_ib_mlx5_inline_iov_copy(void *restrict dest, const uct_iov_t *iov, + size_t iovcnt, size_t length, + uct_ib_mlx5_txwq_t *wq) +{ + ptrdiff_t remainder; + ucs_iov_iter_t iov_iter; + + ucs_assert(dest != NULL); + + ucs_iov_iter_init(&iov_iter); + remainder = UCS_PTR_BYTE_DIFF(dest, wq->qend); + if (ucs_likely(length <= remainder)) { + uct_iov_to_buffer(iov, iovcnt, &iov_iter, dest, SIZE_MAX); + } else { + uct_iov_to_buffer(iov, iovcnt, &iov_iter, dest, remainder); + uct_iov_to_buffer(iov, iovcnt, &iov_iter, wq->qstart, SIZE_MAX); + } +} + + /* wrapping of 'seg' should not happen */ static UCS_F_ALWAYS_INLINE void* uct_ib_mlx5_txwq_wrap_none(uct_ib_mlx5_txwq_t *txwq, void *seg) @@ -285,10 +315,22 @@ uct_ib_mlx5_set_ctrl_seg(struct mlx5_wqe_ctrl_seg* ctrl, uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, unsigned wqe_size) { - uint8_t ds; + uint8_t ds = ucs_div_round_up(wqe_size, UCT_IB_MLX5_WQE_SEG_SIZE); +#if defined(__ARM_NEON) + uint8x16_t table = {1, /* opmod */ + 5, 4, /* sw_pi in BE */ + 2, /* opcode */ + 14, 13, 12, /* QP num */ + 8, /* data size */ + 16, /* signature (set 0) */ + 16, 16, /* reserved (set 0) */ + 0, /* signal/fence_mode */ + 16, 16, 16, 16}; /* immediate (set to 0)*/ + uint32x4_t data = {(opcode << 16) | (opmod << 8) | (uint32_t)fm_ce_se, + pi, ds, qp_num}; +#endif ucs_assert(((unsigned long)ctrl % UCT_IB_MLX5_WQE_SEG_SIZE) == 0); - ds = ucs_div_round_up(wqe_size, UCT_IB_MLX5_WQE_SEG_SIZE); #if defined(__SSE4_2__) *(__m128i *) ctrl = _mm_shuffle_epi8( _mm_set_epi32(qp_num, ds, pi, @@ -304,17 +346,6 @@ uct_ib_mlx5_set_ctrl_seg(struct mlx5_wqe_ctrl_seg* ctrl, uint16_t pi, 1 /* opmod */ )); #elif defined(__ARM_NEON) - uint8x16_t table = {1, /* opmod */ - 5, 4, /* sw_pi in BE */ - 2, /* opcode */ - 14, 13, 12, /* QP num */ - 8, /* data size */ - 16, /* signature (set 0) */ - 16, 16, /* reserved (set 0) */ - 0, /* signal/fence_mode */ - 16, 16, 16, 16}; /* immediate (set to 0)*/ - uint32x4_t data = {(opcode << 16) | (opmod << 8) | (uint32_t)fm_ce_se, - pi, ds, qp_num}; *(uint8x16_t *)ctrl = vqtbl1q_u8((uint8x16_t)data, table); #else ctrl->opmod_idx_opcode = (opcode << 24) | (htons(pi) << 8) | opmod; @@ -329,10 +360,23 @@ uct_ib_mlx5_set_ctrl_seg_with_imm(struct mlx5_wqe_ctrl_seg* ctrl, uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, unsigned wqe_size, uint32_t imm) { - uint8_t ds; + uint8_t ds = ucs_div_round_up(wqe_size, UCT_IB_MLX5_WQE_SEG_SIZE); +#if defined(__ARM_NEON) + uint8x16_t table = {1, /* opmod */ + 5, 4, /* sw_pi in BE */ + 2, /* opcode */ + 14, 13, 12, /* QP num */ + 6, /* data size */ + 16, /* signature (set 0) */ + 16, 16, /* reserved (set 0) */ + 0, /* signal/fence_mode */ + 8, 9, 10, 11}; /* immediate (set to 0)*/ + uint32x4_t data = {(opcode << 16) | (opmod << 8) | (uint32_t)fm_ce_se, + (ds << 16) | pi, imm, qp_num}; +#endif ucs_assert(((unsigned long)ctrl % UCT_IB_MLX5_WQE_SEG_SIZE) == 0); - ds = ucs_div_round_up(wqe_size, UCT_IB_MLX5_WQE_SEG_SIZE); + #if defined(__SSE4_2__) *(__m128i *) ctrl = _mm_shuffle_epi8( _mm_set_epi32(qp_num, imm, (ds << 16) | pi, @@ -348,17 +392,6 @@ uct_ib_mlx5_set_ctrl_seg_with_imm(struct mlx5_wqe_ctrl_seg* ctrl, uint16_t pi, 1 /* opmod */ )); #elif defined(__ARM_NEON) - uint8x16_t table = {1, /* opmod */ - 5, 4, /* sw_pi in BE */ - 2, /* opcode */ - 14, 13, 12, /* QP num */ - 6, /* data size */ - 16, /* signature (set 0) */ - 16, 16, /* reserved (set 0) */ - 0, /* signal/fence_mode */ - 8, 9, 10, 11}; /* immediate (set to 
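
/*
 * Editorial sketch of the wrap-around handling in
 * uct_ib_mlx5_inline_iov_copy() above: the send queue is a ring, so an
 * inline payload that would run past qend continues at qstart. A
 * simplified flat-buffer model (plain memcpy instead of the iov iterator):
 */
#include <stddef.h>
#include <string.h>

static void ring_copy(void *dest, const void *src, size_t length,
                      void *qstart, void *qend)
{
    size_t room = (size_t)((char*)qend - (char*)dest);

    if (length <= room) {
        memcpy(dest, src, length);             /* fits before qend */
    } else {
        memcpy(dest, src, room);               /* first chunk up to qend */
        memcpy(qstart, (const char*)src + room,
               length - room);                 /* remainder from qstart */
    }
}
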
0)*/ - uint32x4_t data = {(opcode << 16) | (opmod << 8) | (uint32_t)fm_ce_se, - (ds << 16) | pi, imm, qp_num}; *(uint8x16_t *)ctrl = vqtbl1q_u8((uint8x16_t)data, table); #else ctrl->opmod_idx_opcode = (opcode << 24) | (htons(pi) << 8) | opmod; diff --git a/src/uct/ib/mlx5/ib_mlx5_log.c b/src/uct/ib/mlx5/ib_mlx5_log.c index 78232c1b084..cb9f375f196 100644 --- a/src/uct/ib/mlx5/ib_mlx5_log.c +++ b/src/uct/ib/mlx5/ib_mlx5_log.c @@ -71,11 +71,6 @@ ucs_status_t uct_ib_mlx5_completion_with_err(uct_ib_iface_t *iface, wqe_index %= UCS_PTR_BYTE_DIFF(txwq->qstart, txwq->qend) / MLX5_SEND_WQE_BB; } - if (ecqe->syndrome == MLX5_CQE_SYNDROME_WR_FLUSH_ERR) { - ucs_trace("QP 0x%x wqe[%d] is flushed", qp_num, wqe_index); - return status; - } - switch (ecqe->syndrome) { case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR: snprintf(err_info, sizeof(err_info), "Local length"); @@ -87,7 +82,10 @@ ucs_status_t uct_ib_mlx5_completion_with_err(uct_ib_iface_t *iface, snprintf(err_info, sizeof(err_info), "Local protection"); break; case MLX5_CQE_SYNDROME_WR_FLUSH_ERR: - snprintf(err_info, sizeof(err_info), "WR flushed because QP in error state"); + snprintf(err_info, sizeof(err_info), + "WR flushed because QP in error state"); + log_level = UCS_LOG_LEVEL_TRACE; + status = UCS_ERR_CANCELED; break; case MLX5_CQE_SYNDROME_MW_BIND_ERR: snprintf(err_info, sizeof(err_info), "Memory window bind"); @@ -106,7 +104,7 @@ ucs_status_t uct_ib_mlx5_completion_with_err(uct_ib_iface_t *iface, status = UCS_ERR_CONNECTION_RESET; break; case MLX5_CQE_SYNDROME_REMOTE_OP_ERR: - snprintf(err_info, sizeof(err_info), "Remote QP"); + snprintf(err_info, sizeof(err_info), "Remote OP"); status = UCS_ERR_CONNECTION_RESET; break; case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR: @@ -126,6 +124,10 @@ ucs_status_t uct_ib_mlx5_completion_with_err(uct_ib_iface_t *iface, break; } + if (!ucs_log_is_enabled(log_level)) { + goto out; + } + if ((txwq != NULL) && ((ecqe->op_own >> 4) == MLX5_CQE_REQ_ERR)) { wqe = UCS_PTR_BYTE_OFFSET(txwq->qstart, MLX5_SEND_WQE_BB * wqe_index); uct_ib_mlx5_wqe_dump(iface, wqe, txwq->qstart, txwq->qend, INT_MAX, 0, @@ -143,6 +145,8 @@ ucs_status_t uct_ib_mlx5_completion_with_err(uct_ib_iface_t *iface, ecqe->syndrome, ecqe->vendor_err_synd, ecqe->hw_synd_type >> 4, ecqe->hw_err_synd, uct_ib_qp_type_str(iface->config.qp_type), qp_num, wqe_index, wqe_info); + +out: return status; } @@ -384,15 +388,15 @@ static void uct_ib_mlx5_wqe_dump(uct_ib_iface_t *iface, void *wqe, void *qstart, if (is_inline) { inline_bitmap |= UCS_BIT(i-1); } - s += strlen(s); } + uct_ib_log_dump_sg_list(iface, UCT_AM_TRACE_TYPE_SEND, sg_list, i, + inline_bitmap, packet_dump_cb, max_sge, s, + ends - s); + } else { + uct_ib_log_dump_sg_list(iface, UCT_AM_TRACE_TYPE_SEND, log_sge->sg_list, + log_sge->num_sge, log_sge->inline_bitmap, + packet_dump_cb, log_sge->num_sge, s, ends - s); } - - uct_ib_log_dump_sg_list(iface, UCT_AM_TRACE_TYPE_SEND, - log_sge ? log_sge->sg_list : sg_list, - log_sge ? log_sge->num_sge : ucs_min(i, max_sge), - log_sge ? 
log_sge->inline_bitmap : inline_bitmap, - packet_dump_cb, s, ends - s); } void __uct_ib_mlx5_log_tx(const char *file, int line, const char *function, diff --git a/src/uct/ib/rc/accel/rc_mlx5.h b/src/uct/ib/rc/accel/rc_mlx5.h index 5f3f8821208..5c352ca970c 100644 --- a/src/uct/ib/rc/accel/rc_mlx5.h +++ b/src/uct/ib/rc/accel/rc_mlx5.h @@ -51,7 +51,8 @@ UCS_CLASS_DECLARE_NEW_FUNC(uct_rc_mlx5_ep_t, uct_ep_t, const uct_ep_params_t *); UCS_CLASS_DECLARE_DELETE_FUNC(uct_rc_mlx5_ep_t, uct_ep_t); void uct_rc_mlx5_iface_check_rx_completion(uct_rc_mlx5_iface_common_t *iface, - struct mlx5_cqe64 *cqe); + struct mlx5_cqe64 *cqe, + int poll_flags); ucs_status_t uct_rc_mlx5_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length, uint64_t remote_addr, uct_rkey_t rkey); @@ -76,6 +77,9 @@ ucs_status_t uct_rc_mlx5_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size ucs_status_t uct_rc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t header, const void *payload, unsigned length); +ucs_status_t uct_rc_mlx5_ep_am_short_iov(uct_ep_h tl_ep, uint8_t id, + const uct_iov_t *iov, size_t iovcnt); + ssize_t uct_rc_mlx5_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_cb, void *arg, unsigned flags); @@ -111,7 +115,7 @@ ucs_status_t uct_rc_mlx5_ep_atomic32_fetch(uct_ep_h ep, uct_atomic_op_t opcode, ucs_status_t uct_rc_mlx5_ep_fence(uct_ep_h tl_ep, unsigned flags); -ucs_status_t uct_rc_mlx5_ep_check(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp); +void uct_rc_mlx5_ep_post_check(uct_ep_h tl_ep); ucs_status_t uct_rc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp); @@ -126,14 +130,13 @@ ucs_status_t uct_rc_mlx5_iface_create_qp(uct_rc_mlx5_iface_common_t *iface, ucs_status_t uct_rc_mlx5_ep_connect_qp(uct_rc_mlx5_iface_common_t *iface, uct_ib_mlx5_qp_t *qp, uint32_t qp_num, - struct ibv_ah_attr *ah_attr, enum ibv_mtu path_mtu); + struct ibv_ah_attr *ah_attr, enum ibv_mtu path_mtu, + uint8_t path_index); ucs_status_t uct_rc_mlx5_ep_connect_to_ep(uct_ep_h tl_ep, const uct_device_addr_t *dev_addr, const uct_ep_addr_t *ep_addr); -unsigned uct_rc_mlx5_iface_progress(void *arg); - ucs_status_t uct_rc_mlx5_ep_tag_eager_short(uct_ep_h tl_ep, uct_tag_t tag, const void *data, size_t length); @@ -161,16 +164,6 @@ ucs_status_t uct_rc_mlx5_ep_tag_rndv_request(uct_ep_h tl_ep, uct_tag_t tag, ucs_status_t uct_rc_mlx5_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr); -ucs_status_t uct_rc_mlx5_ep_handle_failure(uct_rc_mlx5_ep_t *ep, - ucs_status_t status, uint16_t sn); - -ucs_status_t uct_rc_mlx5_ep_set_failed(uct_ib_iface_t *iface, uct_ep_h ep, - ucs_status_t status); - -void uct_rc_mlx5_ep_pending_purge(uct_ep_h tl_ep, - uct_pending_purge_callback_t cb, - void *arg); - -void uct_rc_mlx5_ep_cleanup_qp(uct_ib_async_event_wait_t *wait_ctx); +unsigned uct_rc_mlx5_ep_cleanup_qp(void *arg); #endif diff --git a/src/uct/ib/rc/accel/rc_mlx5.inl b/src/uct/ib/rc/accel/rc_mlx5.inl index 07d20d78841..b4a14b3a32c 100644 --- a/src/uct/ib/rc/accel/rc_mlx5.inl +++ b/src/uct/ib/rc/accel/rc_mlx5.inl @@ -32,33 +32,6 @@ uct_rc_mlx5_ep_fence_get(uct_rc_mlx5_iface_common_t *iface, uct_ib_mlx5_txwq_t * *fm_ce_se |= uct_rc_ep_fm(&iface->super, &txwq->fi, iface->config.atomic_fence_flag); } -static UCS_F_ALWAYS_INLINE void -uct_rc_mlx5_common_update_tx_res(uct_rc_iface_t *rc_iface, uct_ib_mlx5_txwq_t *txwq, - uct_rc_txqp_t *txqp, uint16_t hw_ci) -{ - uint16_t bb_num; - - bb_num = uct_ib_mlx5_txwq_update_bb(txwq, hw_ci) - uct_rc_txqp_available(txqp); - - /* Must always have positive number of released 
resources. The first completion - * will report bb_num=1 (because prev_sw_pi is initialized to -1) and all the rest - * report the amount of BBs the previous WQE has consumed. - */ - ucs_assertv(bb_num > 0, "hw_ci=%d prev_sw_pi=%d available=%d bb_num=%d", - hw_ci, txwq->prev_sw_pi, txqp->available, bb_num); - - uct_rc_txqp_available_add(txqp, bb_num); - ucs_assert(uct_rc_txqp_available(txqp) <= txwq->bb_max); - - uct_rc_iface_update_reads(rc_iface); - - rc_iface->tx.cq_available += bb_num; - ucs_assertv(rc_iface->tx.cq_available <= rc_iface->config.tx_cq_len, - "cq_available=%d tx_cq_len=%d bb_num=%d txwq=%p txqp=%p", - rc_iface->tx.cq_available, rc_iface->config.tx_cq_len, bb_num, - txwq, txqp); -} - static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_txqp_process_tx_cqe(uct_rc_txqp_t *txqp, struct mlx5_cqe64 *cqe, uint16_t hw_ci) @@ -132,15 +105,16 @@ uct_rc_mlx5_iface_release_srq_seg(uct_rc_mlx5_iface_common_t *iface, uct_ib_mlx5_srq_seg_t *seg, struct mlx5_cqe64 *cqe, uint16_t wqe_ctr, ucs_status_t status, unsigned offset, - uct_recv_desc_t *release_desc) + uct_recv_desc_t *release_desc, int poll_flags) { + uct_ib_mlx5_srq_t *srq = &iface->rx.srq; uint16_t wqe_index; int seg_free; /* Need to wrap wqe_ctr, because in case of cyclic srq topology * it is wrapped around 0xFFFF regardless of real SRQ size. * But it respects srq size when srq topology is a linked-list. */ - wqe_index = wqe_ctr & iface->rx.srq.mask; + wqe_index = wqe_ctr & srq->mask; if (ucs_unlikely(status != UCS_OK)) { uct_rc_mlx5_iface_hold_srq_desc(iface, seg, cqe, wqe_ctr, status, @@ -155,28 +129,35 @@ uct_rc_mlx5_iface_release_srq_seg(uct_rc_mlx5_iface_common_t *iface, seg->srq.strides = iface->tm.mp.num_strides; } + ++iface->super.rx.srq.available; + + if (poll_flags & UCT_RC_MLX5_POLL_FLAG_LINKED_LIST) { + seg = uct_ib_mlx5_srq_get_wqe(srq, srq->free_idx); + seg->srq.next_wqe_index = htons(wqe_index); + srq->free_idx = wqe_index; + return; + } + seg_free = (seg->srq.ptr_mask == UCS_MASK(iface->tm.mp.num_strides)); - if (ucs_likely(seg_free && (wqe_index == ((iface->rx.srq.ready_idx + 1) & - iface->rx.srq.mask)))) { + if (ucs_likely(seg_free && (wqe_ctr == (srq->ready_idx + 1)))) { /* If the descriptor was not used - if there are no "holes", we can just * reuse it on the receive queue. Otherwise, ready pointer will stay behind * until post_recv allocated more descriptors from the memory pool, fills * the holes, and moves it forward. 
*/ - ucs_assert(wqe_index == ((iface->rx.srq.free_idx + 1) & iface->rx.srq.mask)); - ++iface->rx.srq.ready_idx; - ++iface->rx.srq.free_idx; - } else { - if (wqe_index == ((iface->rx.srq.free_idx + 1) & iface->rx.srq.mask)) { - ++iface->rx.srq.free_idx; - } else { - /* Mark the segment as out-of-order, post_recv will advance free */ - seg->srq.free = 1; - } + ucs_assert(wqe_ctr == (srq->free_idx + 1)); + ++srq->ready_idx; + ++srq->free_idx; + return; } - ++iface->super.rx.srq.available; + if (wqe_ctr == (srq->free_idx + 1)) { + ++srq->free_idx; + } else { + /* Mark the segment as out-of-order, post_recv will advance free */ + seg->srq.free = 1; + } } #define uct_rc_mlx5_iface_mp_hash_lookup(_h_name, _h_ptr, _key, _last, _flags, \ @@ -237,12 +218,24 @@ uct_rc_mlx5_iface_rx_mp_context_from_hash(uct_rc_mlx5_iface_common_t *iface, struct mlx5_cqe64 *cqe, unsigned *flags) { +#if UCS_ENABLE_ASSERT + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.md, + uct_ib_mlx5_md_t); +#endif uct_rc_mlx5_mp_context_t *mp_ctx; uct_rc_mlx5_mp_hash_key_t key_gid; uint64_t key_lid; void *gid; int last; + if (ucs_likely(ucs_test_all_flags(cqe->byte_cnt, + htonl(UCT_IB_MLX5_MP_RQ_LAST_MSG_FLAG | + UCT_IB_MLX5_MP_RQ_FIRST_MSG_FLAG)))) { + ucs_assert(md->flags & UCT_IB_MLX5_MD_FLAG_MP_XRQ_FIRST_MSG); + *flags |= UCT_CB_PARAM_FLAG_FIRST; + return &iface->tm.mp.last_frag_ctx; + } + last = cqe->byte_cnt & htonl(UCT_IB_MLX5_MP_RQ_LAST_MSG_FLAG); if (uct_ib_mlx5_cqe_is_grh_present(cqe)) { @@ -270,7 +263,7 @@ uct_rc_mlx5_iface_rx_mp_context_from_hash(uct_rc_mlx5_iface_common_t *iface, } static UCS_F_ALWAYS_INLINE struct mlx5_cqe64* -uct_rc_mlx5_iface_poll_rx_cq(uct_rc_mlx5_iface_common_t *iface) +uct_rc_mlx5_iface_poll_rx_cq(uct_rc_mlx5_iface_common_t *iface, int poll_flags) { uct_ib_mlx5_cq_t *cq = &iface->cq[UCT_IB_DIR_RX]; struct mlx5_cqe64 *cqe; @@ -287,7 +280,7 @@ uct_rc_mlx5_iface_poll_rx_cq(uct_rc_mlx5_iface_common_t *iface) if (ucs_unlikely(uct_ib_mlx5_cqe_is_hw_owned(op_own, idx, cq->cq_length))) { return NULL; } else if (ucs_unlikely(op_own & UCT_IB_MLX5_CQE_OP_OWN_ERR_MASK)) { - uct_rc_mlx5_iface_check_rx_completion(iface, cqe); + uct_rc_mlx5_iface_check_rx_completion(iface, cqe, poll_flags); return NULL; } @@ -370,7 +363,7 @@ uct_rc_mlx5_iface_tm_common_data(uct_rc_mlx5_iface_common_t *iface, *context_p = uct_rc_mlx5_iface_rx_mp_context_from_hash(iface, cqe, flags); } else { /* Non-tagged messages (AM, RNDV Fin) should always arrive in - * a single frgament */ + * a single fragment */ *context_p = uct_rc_mlx5_iface_single_frag_context(iface, flags); } @@ -401,8 +394,8 @@ uct_rc_mlx5_iface_tm_common_data(uct_rc_mlx5_iface_common_t *iface, static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_iface_common_am_handler(uct_rc_mlx5_iface_common_t *iface, struct mlx5_cqe64 *cqe, - uct_rc_mlx5_hdr_t *hdr, - unsigned flags, unsigned byte_len) + uct_rc_mlx5_hdr_t *hdr, unsigned flags, + unsigned byte_len, int poll_flags) { uint16_t wqe_ctr; uct_rc_iface_ops_t *rc_ops; @@ -432,7 +425,7 @@ uct_rc_mlx5_iface_common_am_handler(uct_rc_mlx5_iface_common_t *iface, uct_rc_mlx5_iface_release_srq_seg(iface, seg, cqe, wqe_ctr, status, iface->tm.am_desc.offset, - &iface->tm.am_desc.super); + &iface->tm.am_desc.super, poll_flags); } static UCS_F_ALWAYS_INLINE uint8_t @@ -496,6 +489,33 @@ uct_rc_mlx5_common_post_send(uct_rc_mlx5_iface_common_t *iface, int qp_type, uct_rc_txqp_posted(txqp, &iface->super, res_count, fm_ce_se & MLX5_WQE_CTRL_CQ_UPDATE); } +static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_txqp_inline_iov_post( + 
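
/*
 * Editorial sketch of the linked-list SRQ release path above: a freed
 * segment is appended to the free list by pointing the current tail's
 * next_wqe_index at it (byte-order conversion elided; types are
 * hypothetical and indices are assumed to be below SRQ_LEN):
 */
#include <stdint.h>

#define SRQ_LEN 256

typedef struct {
    uint16_t next_wqe_index;
} srq_seg_t;

typedef struct {
    srq_seg_t segs[SRQ_LEN];
    uint16_t  free_idx; /* tail of the free list */
} srq_t;

static void srq_release_ll(srq_t *srq, uint16_t wqe_index)
{
    srq->segs[srq->free_idx].next_wqe_index = wqe_index; /* link after tail */
    srq->free_idx = wqe_index;                           /* new tail */
}
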
uct_rc_mlx5_iface_common_t *iface, int qp_type, uct_rc_txqp_t *txqp, + uct_ib_mlx5_txwq_t *txwq, const uct_iov_t *iov, size_t iovcnt, + size_t iov_length, uint8_t am_id, uct_ib_mlx5_base_av_t *av, + struct mlx5_grh_av *grh_av, size_t av_size) +{ + struct mlx5_wqe_ctrl_seg *ctrl = txwq->curr; + struct mlx5_wqe_inl_data_seg *inl; + uct_rc_mlx5_hdr_t *rch; + size_t wqe_size, ctrl_av_size; + unsigned fm_ce_se; + + ctrl_av_size = sizeof(*ctrl) + av_size; + wqe_size = ctrl_av_size + sizeof(*inl) + sizeof(*rch) + iov_length; + inl = UCS_PTR_BYTE_OFFSET(ctrl, ctrl_av_size); + inl = uct_ib_mlx5_txwq_wrap_exact(txwq, inl); + inl->byte_count = htonl((iov_length + sizeof(*rch)) | MLX5_INLINE_SEG); + rch = (uct_rc_mlx5_hdr_t*)(inl + 1); + fm_ce_se = MLX5_WQE_CTRL_SOLICITED | uct_rc_iface_tx_moderation( + &iface->super, txqp, MLX5_WQE_CTRL_CQ_UPDATE); + + uct_rc_mlx5_am_hdr_fill(rch, am_id); + uct_ib_mlx5_inline_iov_copy(rch + 1, iov, iovcnt, iov_length, txwq); + uct_rc_mlx5_common_post_send(iface, qp_type, txqp, txwq, MLX5_OPCODE_SEND, + 0, fm_ce_se, wqe_size, av, grh_av, 0, INT_MAX, + NULL); +} /* * Generic function that setups and posts WQE with inline segment @@ -874,6 +894,26 @@ void uct_rc_mlx5_txqp_dptr_post_iov(uct_rc_mlx5_iface_common_t *iface, int qp_ty max_log_sge, NULL); } +/* + * Helper function for buffer-copy post. + * Adds the descriptor to the callback queue. + */ +static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_common_txqp_bcopy_post( + uct_rc_mlx5_iface_common_t *iface, int qp_type, uct_rc_txqp_t *txqp, + uct_ib_mlx5_txwq_t *txwq, unsigned opcode, unsigned length, + uint64_t rdma_raddr, uct_rkey_t rdma_rkey, uct_ib_mlx5_base_av_t *av, + struct mlx5_grh_av *grh_av, size_t av_size, uint8_t fm_ce_se, + uint32_t imm_val_be, uct_rc_iface_send_desc_t *desc, const void *buffer, + uct_ib_log_sge_t *log_sge) +{ + desc->super.sn = txwq->sw_pi; + uct_rc_mlx5_txqp_dptr_post(iface, qp_type, txqp, txwq, opcode, buffer, + length, &desc->lkey, rdma_raddr, rdma_rkey, 0, 0, + 0, 0, av, grh_av, av_size, fm_ce_se, imm_val_be, + INT_MAX, log_sge); + uct_rc_txqp_add_send_op(txqp, &desc->super); +} + #if IBV_HW_TM static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_set_tm_seg(uct_ib_mlx5_txwq_t *txwq, @@ -1143,7 +1183,7 @@ uct_rc_mlx5_iface_handle_tm_list_op(uct_rc_mlx5_iface_common_t *iface, int opcod ctx = op->tag->ctx; priv = uct_rc_mlx5_ctx_priv(ctx); uct_rc_mlx5_iface_tag_del_from_hash(iface, priv->buffer); - ctx->completed_cb(ctx, priv->tag, 0, priv->length, UCS_ERR_CANCELED); + ctx->completed_cb(ctx, priv->tag, 0, priv->length, NULL, UCS_ERR_CANCELED); } } @@ -1174,6 +1214,7 @@ static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_iface_handle_expected(uct_rc_mlx5_iface_common_t *iface, struct mlx5_cqe64 *cqe, uint64_t tag, uint32_t app_ctx) { + int is_inline = cqe->op_own & MLX5_INLINE_SCATTER_64; uint64_t imm_data; uct_rc_mlx5_tag_entry_t *tag_entry; uct_tag_context_t *ctx; @@ -1190,9 +1231,8 @@ uct_rc_mlx5_iface_handle_expected(uct_rc_mlx5_iface_common_t *iface, struct mlx5 uct_rc_mlx5_release_tag_entry(iface, tag_entry); uct_rc_mlx5_iface_tag_del_from_hash(iface, priv->buffer); - if (cqe->op_own & MLX5_INLINE_SCATTER_64) { + if (is_inline) { ucs_assert(byte_len <= priv->length); - memcpy(priv->buffer, cqe - 1, byte_len); } else { VALGRIND_MAKE_MEM_DEFINED(priv->buffer, byte_len); } @@ -1202,10 +1242,12 @@ uct_rc_mlx5_iface_handle_expected(uct_rc_mlx5_iface_common_t *iface, struct mlx5 MLX5_CQE_RESP_SEND_IMM); if (UCT_RC_MLX5_TM_IS_SW_RNDV(cqe, imm_data)) { - ctx->rndv_cb(ctx, tag, priv->buffer, byte_len, UCS_OK); + 
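
/*
 * Editorial note on the multi-packet RQ fast path above: a CQE carrying
 * both the FIRST and the LAST fragment bits describes a complete
 * single-fragment message, so the per-sender hash lookup can be skipped.
 * Minimal model of the flag test (network byte order elided):
 */
#include <stdint.h>

#define MP_RQ_FIRST_MSG_FLAG (1u << 29)
#define MP_RQ_LAST_MSG_FLAG  (1u << 30)

static int is_single_fragment(uint32_t byte_cnt)
{
    uint32_t both = MP_RQ_FIRST_MSG_FLAG | MP_RQ_LAST_MSG_FLAG;

    return (byte_cnt & both) == both;
}
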
ctx->rndv_cb(ctx, tag, is_inline ? (cqe - 1) : priv->buffer, byte_len, + UCS_OK, is_inline ? UCT_TAG_RECV_CB_INLINE_DATA : 0); UCT_RC_MLX5_TM_STAT(iface, RX_RNDV_REQ_EXP); } else { - ctx->completed_cb(ctx, tag, imm_data, byte_len, UCS_OK); + ctx->completed_cb(ctx, tag, imm_data, byte_len, + (is_inline) ? (cqe - 1) : NULL, UCS_OK); UCT_RC_MLX5_TM_STAT(iface, RX_EXP); } } @@ -1214,14 +1256,14 @@ static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_iface_unexp_consumed(uct_rc_mlx5_iface_common_t *iface, unsigned offset, uct_recv_desc_t *release_desc, struct mlx5_cqe64 *cqe, ucs_status_t status, - uint16_t wqe_ctr) + uint16_t wqe_ctr, int poll_flags) { uct_ib_mlx5_srq_seg_t *seg; seg = uct_ib_mlx5_srq_get_wqe(&iface->rx.srq, wqe_ctr); uct_rc_mlx5_iface_release_srq_seg(iface, seg, cqe, wqe_ctr, - status, offset, release_desc); + status, offset, release_desc, poll_flags); if (ucs_unlikely(!(iface->tm.unexpected_cnt % IBV_DEVICE_MAX_UNEXP_COUNT))) { uct_rc_mlx5_iface_common_post_srq_op(&iface->tm.cmd_wq, 0, @@ -1260,7 +1302,8 @@ uct_rc_mlx5_iface_tag_handle_unexp(uct_rc_mlx5_iface_common_t *iface, ++iface->tm.unexpected_cnt; uct_rc_mlx5_iface_unexp_consumed(iface, iface->tm.eager_desc.offset, &iface->tm.eager_desc.super, cqe, - status, ntohs(cqe->wqe_counter)); + status, ntohs(cqe->wqe_counter), + poll_flags); UCT_RC_MLX5_TM_STAT(iface, RX_EAGER_UNEXP); return; @@ -1288,14 +1331,16 @@ uct_rc_mlx5_iface_tag_handle_unexp(uct_rc_mlx5_iface_common_t *iface, uct_rc_mlx5_iface_unexp_consumed(iface, iface->super.super.config.rx_headroom_offset, &iface->super.super.release_desc, - cqe, status, ntohs(cqe->wqe_counter)); + cqe, status, ntohs(cqe->wqe_counter), + poll_flags); return; } ++iface->tm.unexpected_cnt; if (ucs_unlikely(tmh->opcode == IBV_TMH_RNDV)) { - uct_rc_mlx5_handle_unexp_rndv(iface, tmh, tmh->tag, cqe, flags, byte_len); + uct_rc_mlx5_handle_unexp_rndv(iface, tmh, tmh->tag, cqe, flags, + byte_len, poll_flags); return; } @@ -1335,12 +1380,13 @@ uct_rc_mlx5_iface_tag_handle_unexp(uct_rc_mlx5_iface_common_t *iface, uct_rc_mlx5_iface_unexp_consumed(iface, iface->tm.eager_desc.offset, &iface->tm.eager_desc.super, cqe, - status, ntohs(cqe->wqe_counter)); + status, ntohs(cqe->wqe_counter), + poll_flags); } static UCS_F_NOINLINE void uct_rc_mlx5_iface_handle_filler_cqe(uct_rc_mlx5_iface_common_t *iface, - struct mlx5_cqe64 *cqe) + struct mlx5_cqe64 *cqe, int poll_flags) { uct_ib_mlx5_srq_seg_t *seg; @@ -1352,7 +1398,7 @@ uct_rc_mlx5_iface_handle_filler_cqe(uct_rc_mlx5_iface_common_t *iface, /* at least one stride should be in HW ownership when filler CQE arrives */ ucs_assert(seg->srq.strides); uct_rc_mlx5_iface_release_srq_seg(iface, seg, cqe, ntohs(cqe->wqe_counter), - UCS_OK, 0, NULL); + UCS_OK, 0, NULL, poll_flags); } #endif /* IBV_HW_TM */ @@ -1375,10 +1421,11 @@ uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, uct_rc_mlx5_mp_context_t UCS_V_UNUSED *dummy_ctx; #endif - ucs_assert(uct_ib_mlx5_srq_get_wqe(&iface->rx.srq, - iface->rx.srq.mask)->srq.next_wqe_index == 0); + ucs_assert((poll_flags & UCT_RC_MLX5_POLL_FLAG_LINKED_LIST) || + (uct_ib_mlx5_srq_get_wqe(&iface->rx.srq, + iface->rx.srq.mask)->srq.next_wqe_index == 0)); - cqe = uct_rc_mlx5_iface_poll_rx_cq(iface); + cqe = uct_rc_mlx5_iface_poll_rx_cq(iface, poll_flags); if (cqe == NULL) { /* If no CQE - post receives */ count = 0; @@ -1393,7 +1440,8 @@ uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, if (!(poll_flags & UCT_RC_MLX5_POLL_FLAG_TM)) { rc_hdr = uct_rc_mlx5_iface_common_data(iface, cqe, byte_len, 
&flags); - uct_rc_mlx5_iface_common_am_handler(iface, cqe, rc_hdr, flags, byte_len); + uct_rc_mlx5_iface_common_am_handler(iface, cqe, rc_hdr, flags, + byte_len, poll_flags); goto done; } @@ -1403,7 +1451,7 @@ uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, if (ucs_unlikely(byte_len & UCT_IB_MLX5_MP_RQ_FILLER_FLAG)) { /* TODO: Check if cqe->app_op is valid for filler CQE. Then this check * could be done for specific CQE types only. */ - uct_rc_mlx5_iface_handle_filler_cqe(iface, cqe); + uct_rc_mlx5_iface_handle_filler_cqe(iface, cqe, poll_flags); count = 0; goto done; } @@ -1437,7 +1485,7 @@ uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, if (tmh->opcode == IBV_TMH_NO_TAG) { uct_rc_mlx5_iface_common_am_handler(iface, cqe, (uct_rc_mlx5_hdr_t*)tmh, - flags, byte_len); + flags, byte_len, poll_flags); } else { ucs_assert(tmh->opcode == IBV_TMH_FIN); uct_rc_mlx5_handle_rndv_fin(iface, tmh->app_ctx); @@ -1446,7 +1494,7 @@ uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, uct_rc_mlx5_iface_release_srq_seg(iface, seg, cqe, ntohs(cqe->wqe_counter), UCS_OK, - 0, NULL); + 0, NULL, poll_flags); UCT_RC_MLX5_TM_STAT(iface, RX_RNDV_FIN); } @@ -1483,7 +1531,11 @@ uct_rc_mlx5_iface_common_poll_rx(uct_rc_mlx5_iface_common_t *iface, done: max_batch = iface->super.super.config.rx_max_batch; if (ucs_unlikely(iface->super.rx.srq.available >= max_batch)) { - uct_rc_mlx5_iface_srq_post_recv(iface); + if (poll_flags & UCT_RC_MLX5_POLL_FLAG_LINKED_LIST) { + uct_rc_mlx5_iface_srq_post_recv_ll(iface); + } else { + uct_rc_mlx5_iface_srq_post_recv(iface); + } } return count; } @@ -1493,70 +1545,83 @@ done: * processor cache issues. To make this used uct_rc_mlx5_dm_copy_data_t * datatype where first hdr_len bytes are filled by message header * and tail is filled by head of message. */ -static void UCS_F_ALWAYS_INLINE -uct_rc_mlx5_iface_common_copy_to_dm(uct_rc_mlx5_dm_copy_data_t *cache, size_t hdr_len, - const void *payload, size_t length, void *dm, - uct_ib_log_sge_t *log_sge) +static void UCS_F_ALWAYS_INLINE uct_rc_mlx5_iface_common_copy_to_dm( + uct_rc_mlx5_dm_copy_data_t *cache, size_t hdr_len, const uct_iov_t *iov, + size_t iovcnt, void *dm, uct_ib_log_sge_t *log_sge) { typedef uint64_t misaligned_t UCS_V_ALIGNED(1); - uint64_t padding = 0; /* init by 0 to suppress valgrind error */ - size_t head = (cache && hdr_len) ? ucs_min(length, sizeof(*cache) - hdr_len) : 0; - size_t body = ucs_align_down(length - head, sizeof(padding)); - size_t tail = length - (head + body); - char *dst = dm; - int i = 0; + size_t head = ((cache != NULL) && (hdr_len > 0)) ? 
+ sizeof(cache->bytes) - hdr_len : 0; + int i = 0; + ucs_iov_iter_t iov_iter; + uint64_t word; + size_t to_copy; + void *src; - ucs_assert(sizeof(*cache) >= hdr_len); - ucs_assert(head + body + tail == length); - ucs_assert(tail < sizeof(padding)); + ucs_assert(sizeof(cache->bytes) >= hdr_len); + ucs_iov_iter_init(&iov_iter); - /* copy head of payload to tail of cache */ - memcpy(cache->bytes + hdr_len, payload, head); + /* copy head of iov to tail of cache */ + uct_iov_to_buffer(iov, iovcnt, &iov_iter, + UCS_PTR_BYTE_OFFSET(cache->bytes, hdr_len), head); - UCS_STATIC_ASSERT(sizeof(*cache) == sizeof(cache->bytes)); - UCS_STATIC_ASSERT(sizeof(log_sge->sg_list) / sizeof(log_sge->sg_list[0]) >= 2); + UCS_STATIC_ASSERT(ucs_static_array_size(log_sge->sg_list) >= 2); /* condition is static-evaluated */ - if (cache && hdr_len) { + if ((cache != NULL) && (hdr_len > 0)) { /* atomically by 8 bytes copy data to DM */ /* cache buffer must be aligned, so, source data type is aligned */ - UCS_WORD_COPY(volatile uint64_t, dst, uint64_t, cache->bytes, sizeof(cache->bytes)); - dst += sizeof(cache->bytes); + UCS_WORD_COPY(volatile uint64_t, dm, uint64_t, cache->bytes, + sizeof(cache->bytes)); + dm = UCS_PTR_BYTE_OFFSET(dm, sizeof(cache->bytes)); if (ucs_log_is_enabled(UCS_LOG_LEVEL_TRACE_DATA)) { - log_sge->sg_list[0].addr = (uint64_t)cache; - log_sge->sg_list[0].length = (uint32_t)hdr_len; - i++; + log_sge->sg_list[i].addr = (uint64_t)cache; + log_sge->sg_list[i].length = (uint32_t)hdr_len; + ++i; } } if (ucs_log_is_enabled(UCS_LOG_LEVEL_TRACE_DATA)) { - log_sge->sg_list[i].addr = (uint64_t)payload; - log_sge->sg_list[i].length = (uint32_t)length; - i++; + log_sge->sg_list[i].addr = (uint64_t)iov->buffer; + log_sge->sg_list[i].length = (uint32_t)iov->length; + ++i; } log_sge->num_sge = i; - /* copy payload to DM */ - UCS_WORD_COPY(volatile uint64_t, dst, misaligned_t, - UCS_PTR_BYTE_OFFSET(payload, head), body); - if (tail) { - dst += body; - memcpy(&padding, UCS_PTR_BYTE_OFFSET(payload, head + body), tail); - /* use uint64_t for source datatype because it is aligned buffer on stack */ - UCS_WORD_COPY(volatile uint64_t, dst, uint64_t, &padding, sizeof(padding)); + while (iov_iter.iov_index < iovcnt) { + to_copy = ucs_align_down( + iov[iov_iter.iov_index].length - iov_iter.buffer_offset, + sizeof(word)); + src = UCS_PTR_BYTE_OFFSET(iov[iov_iter.iov_index].buffer, + iov_iter.buffer_offset); + UCS_WORD_COPY(volatile uint64_t, dm, misaligned_t, src, to_copy); + + dm = UCS_PTR_BYTE_OFFSET(dm, to_copy); + iov_iter.buffer_offset += to_copy; + + if (iov_iter.buffer_offset < iov[iov_iter.iov_index].length) { + /* copy remainder of the current iov buffer, and partially next + * buffer(s) to fill word if there is smth left to copy */ + word = 0; + uct_iov_to_buffer(iov, iovcnt, &iov_iter, &word, sizeof(word)); + UCS_WORD_COPY(volatile uint64_t, dm, uint64_t, &word, sizeof(word)); + dm = UCS_PTR_BYTE_OFFSET(dm, sizeof(word)); + } else { + ++iov_iter.iov_index; + iov_iter.buffer_offset = 0; + } } } -static ucs_status_t UCS_F_ALWAYS_INLINE -uct_rc_mlx5_common_dm_make_data(uct_rc_mlx5_iface_common_t *iface, - uct_rc_mlx5_dm_copy_data_t *cache, - size_t hdr_len, const void *payload, - unsigned length, - uct_rc_iface_send_desc_t **desc_p, - void **buffer_p, uct_ib_log_sge_t *log_sge) +static ucs_status_t UCS_F_ALWAYS_INLINE uct_rc_mlx5_common_dm_make_data( + uct_rc_mlx5_iface_common_t *iface, uct_rc_mlx5_dm_copy_data_t *cache, + size_t hdr_len, const uct_iov_t *iov, size_t iovcnt, + uct_rc_iface_send_desc_t **desc_p, void 
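The rewritten copy routine above streams an IOV list into device memory in whole 64-bit words, zero-padding across buffer boundaries so every store stays word-sized and aligned. A minimal byte-granular model of the same invariant (the real code bulk-copies aligned runs via UCS_WORD_COPY); names here are illustrative:

```c
#include <stdint.h>
#include <stddef.h>
#include <sys/uio.h>

/* Minimal model of the word-granular DM copy: device memory must be
 * written in full aligned 64-bit words, so iov bytes are accumulated into
 * a stack word and any tail that ends mid-word is flushed zero-padded.
 * Assumes 'dm' is 8-byte aligned. */
static void copy_iov_to_dm(volatile uint64_t *dm, const struct iovec *iov,
                           size_t iovcnt)
{
    uint64_t word = 0;
    size_t   fill = 0; /* bytes accumulated in 'word' */
    size_t   i, j;

    for (i = 0; i < iovcnt; ++i) {
        const uint8_t *src = iov[i].iov_base;
        for (j = 0; j < iov[i].iov_len; ++j) {
            ((uint8_t*)&word)[fill++] = src[j];
            if (fill == sizeof(word)) {
                *dm++ = word; /* one aligned 64-bit store to device memory */
                word  = 0;
                fill  = 0;
            }
        }
    }

    if (fill > 0) {
        *dm = word; /* flush the zero-padded tail word */
    }
}
```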
**buffer_p, + uct_ib_log_sge_t *log_sge) { uct_rc_iface_send_desc_t *desc; void *buffer; + ucs_iov_iter_t iov_iter; ucs_assert(iface->dm.dm != NULL); ucs_assert(log_sge != NULL); @@ -1572,17 +1637,21 @@ uct_rc_mlx5_common_dm_make_data(uct_rc_mlx5_iface_common_t *iface, if (cache && hdr_len) { memcpy(buffer, cache->bytes, hdr_len); } - memcpy(UCS_PTR_BYTE_OFFSET(buffer, hdr_len), payload, length); + ucs_iov_iter_init(&iov_iter); + uct_iov_to_buffer(iov, iovcnt, &iov_iter, + UCS_PTR_BYTE_OFFSET(buffer, hdr_len), SIZE_MAX); log_sge->num_sge = 0; } else { /* desc must be partially initialized by mpool. * hint to valgrind to make it defined */ VALGRIND_MAKE_MEM_DEFINED(desc, sizeof(*desc)); ucs_assert(desc->super.buffer != NULL); - buffer = (void*)UCS_PTR_BYTE_DIFF(iface->dm.dm->start_va, desc->super.buffer); + buffer = (void*)UCS_PTR_BYTE_DIFF(iface->dm.dm->start_va, + desc->super.buffer); + + uct_rc_mlx5_iface_common_copy_to_dm(cache, hdr_len, iov, iovcnt, + desc->super.buffer, log_sge); - uct_rc_mlx5_iface_common_copy_to_dm(cache, hdr_len, payload, - length, desc->super.buffer, log_sge); if (ucs_log_is_enabled(UCS_LOG_LEVEL_TRACE_DATA)) { log_sge->sg_list[0].lkey = log_sge->sg_list[1].lkey = desc->lkey; log_sge->inline_bitmap = 0; @@ -1593,6 +1662,80 @@ uct_rc_mlx5_common_dm_make_data(uct_rc_mlx5_iface_common_t *iface, *buffer_p = buffer; return UCS_OK; } + +static ucs_status_t UCS_F_ALWAYS_INLINE uct_rc_mlx5_common_ep_short_iov_dm( + uct_rc_mlx5_iface_common_t *iface, int qp_type, + uct_rc_mlx5_dm_copy_data_t *cache, size_t hdr_len, const uct_iov_t *iov, + size_t iovcnt, size_t iov_length, unsigned opcode, uint8_t fm_ce_se, + uint64_t rdma_raddr, uct_rkey_t rdma_rkey, uct_rc_txqp_t *txqp, + uct_ib_mlx5_txwq_t *txwq, uct_ib_mlx5_base_av_t *av, + struct mlx5_grh_av *grh_av, size_t av_size) +{ + uct_rc_iface_send_desc_t *desc = NULL; + void *buffer; + ucs_status_t status; + uct_ib_log_sge_t log_sge; + + status = uct_rc_mlx5_common_dm_make_data(iface, cache, hdr_len, iov, iovcnt, + &desc, &buffer, &log_sge); + if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) { + return status; + } + + uct_rc_mlx5_common_txqp_bcopy_post(iface, qp_type, txqp, txwq, opcode, + hdr_len + iov_length, rdma_raddr, + rdma_rkey, av, grh_av, av_size, fm_ce_se, + 0, desc, buffer, + (log_sge.num_sge > 0) ? 
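One subtlety in uct_rc_mlx5_common_dm_make_data() just below: for the device-memory path the WQE does not carry a host virtual address but the offset of the staging buffer inside the DM region, computed as the descriptor buffer minus the region start. A hedged one-liner illustrating the pointer-diff trick:

```c
#include <stdint.h>

/* Mirrors buffer = (void*)UCS_PTR_BYTE_DIFF(dm->start_va, desc->buffer):
 * the "address" posted for a DM send is really an offset into the device
 * memory window. Illustrative, not the UCX helper itself. */
static void *dm_offset(void *dm_start_va, void *desc_buffer)
{
    return (void*)(uintptr_t)((char*)desc_buffer - (char*)dm_start_va);
}
```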
&log_sge : NULL); + + return UCS_OK; +} + +static ucs_status_t UCS_F_ALWAYS_INLINE uct_rc_mlx5_common_ep_short_dm( + uct_rc_mlx5_iface_common_t *iface, int qp_type, + uct_rc_mlx5_dm_copy_data_t *cache, size_t hdr_len, const void *payload, + unsigned length, unsigned opcode, uint8_t fm_ce_se, uint64_t rdma_raddr, + uct_rkey_t rdma_rkey, uct_rc_txqp_t *txqp, uct_ib_mlx5_txwq_t *txwq, + uct_ib_mlx5_base_av_t *av, struct mlx5_grh_av *grh_av, size_t av_size) +{ + uct_iov_t iov; + + iov.buffer = (void*)payload; + iov.count = 1; + iov.length = length; + + return uct_rc_mlx5_common_ep_short_iov_dm(iface, qp_type, cache, hdr_len, + &iov, 1, length, opcode, fm_ce_se, + rdma_raddr, rdma_rkey, txqp, txwq, + av, grh_av, av_size); +} + +static ucs_status_t UCS_F_ALWAYS_INLINE uct_rc_mlx5_common_ep_am_short_iov_dm( + uct_base_ep_t *ep, uint8_t am_id, uct_rc_mlx5_iface_common_t *iface, + const uct_iov_t *iov, size_t iovcnt, size_t iov_length, int qp_type, + uct_rc_txqp_t *txqp, uct_ib_mlx5_txwq_t *txwq, + uct_ib_mlx5_base_av_t *av, struct mlx5_grh_av *grh_av, size_t av_size) +{ + uct_rc_mlx5_dm_copy_data_t cache; + ucs_status_t status; + + UCT_CHECK_LENGTH(sizeof(cache.am_hdr.rc_hdr) + iov_length, 0, + iface->dm.seg_len, "am_short_iov"); + + uct_rc_mlx5_am_hdr_fill(&cache.am_hdr.rc_hdr, am_id); + status = uct_rc_mlx5_common_ep_short_iov_dm( + iface, qp_type, &cache, sizeof(cache.am_hdr.rc_hdr), iov, iovcnt, + iov_length, MLX5_OPCODE_SEND, + MLX5_WQE_CTRL_SOLICITED | MLX5_WQE_CTRL_CQ_UPDATE, 0, 0, txqp, txwq, + av, grh_av, av_size); + if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) { + return status; + } + + UCT_TL_EP_STAT_OP(ep, AM, SHORT, sizeof(cache.am_hdr.rc_hdr) + iov_length); + + return UCS_OK; +} #endif static ucs_status_t UCS_F_ALWAYS_INLINE diff --git a/src/uct/ib/rc/accel/rc_mlx5_common.c b/src/uct/ib/rc/accel/rc_mlx5_common.c index 9bdbc6005a2..2bf2882d755 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_common.c +++ b/src/uct/ib/rc/accel/rc_mlx5_common.c @@ -62,27 +62,73 @@ ucs_config_field_t uct_rc_mlx5_common_config_table[] = { ucs_offsetof(uct_rc_mlx5_iface_common_config_t, exp_backoff), UCS_CONFIG_TYPE_UINT}, - {"CYCLIC_SRQ_ENABLE", "try", - "Enable using the \"cyclic\" SRQ type (SRQ is organized as a continuous \n" - "array of WQEs), otherwise - using the \"list\" SRQ type (SRQ is organized \n" - "as a buffer containing linked list of WQEs.", - ucs_offsetof(uct_rc_mlx5_iface_common_config_t, cyclic_srq_enable), - UCS_CONFIG_TYPE_TERNARY}, + {"SRQ_TOPO", "cyclic,cyclic_emulated", + "List of SRQ topology types in order of preference. Supported types are:\n" + "\n" + "list SRQ is organized as a buffer containing a linked list of WQEs.\n" + "\n" + "cyclic SRQ is organized as a continuous array of WQEs. Requires DEVX.\n" + "\n" + "cyclic_emulated SRQ is organized as a continuous array of WQEs, but HW\n" + " treats it as a linked list. Doesn't require DEVX.", + ucs_offsetof(uct_rc_mlx5_iface_common_config_t, srq_topo), + UCS_CONFIG_TYPE_STRING_ARRAY}, {NULL} }; +static UCS_F_ALWAYS_INLINE ucs_status_t +uct_rc_mlx5_iface_srq_set_seg(uct_rc_mlx5_iface_common_t *iface, + uct_ib_mlx5_srq_seg_t *seg) +{ + uct_ib_iface_recv_desc_t *desc; + uint64_t desc_map; + void *hdr; + int i; + + desc_map = ~seg->srq.ptr_mask & UCS_MASK(iface->tm.mp.num_strides); + ucs_for_each_bit(i, desc_map) { + UCT_TL_IFACE_GET_RX_DESC(&iface->super.super.super, &iface->super.rx.mp, + desc, return UCS_ERR_NO_MEMORY); + + /* Set receive data segment pointer. Length is pre-initialized.
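The factored-out uct_rc_mlx5_iface_srq_set_seg() above tracks, per segment, which strides already own a receive descriptor. A small sketch of that bitmap bookkeeping; names are illustrative:

```c
#include <stdint.h>

/* Equivalent of ~seg->srq.ptr_mask & UCS_MASK(num_strides): only the
 * strides whose bit is clear need a fresh rx descriptor, and posting
 * fails cleanly when the memory pool is empty. */
static uint64_t missing_strides(uint64_t ptr_mask, unsigned num_strides)
{
    uint64_t mask = (num_strides < 64) ? (((uint64_t)1 << num_strides) - 1)
                                       : ~(uint64_t)0;
    return ~ptr_mask & mask;
}
```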
*/ + hdr = uct_ib_iface_recv_desc_hdr(&iface->super.super, + desc); + seg->srq.ptr_mask |= UCS_BIT(i); + seg->srq.desc = desc; /* Optimization for non-MP case (1 stride) */ + seg->dptr[i].lkey = htonl(desc->lkey); + seg->dptr[i].addr = htobe64((uintptr_t)hdr); + VALGRIND_MAKE_MEM_NOACCESS(hdr, iface->super.super.config.seg_size); + } + + return UCS_OK; +} + +/* Update resources and write doorbell record */ +static UCS_F_ALWAYS_INLINE void +uct_rc_mlx5_iface_update_srq_res(uct_rc_iface_t *iface, uct_ib_mlx5_srq_t *srq, + uint16_t wqe_index, uint16_t count) +{ + ucs_assert(iface->rx.srq.available >= count); + + if (count == 0) { + return; + } + + srq->ready_idx = wqe_index; + srq->sw_pi += count; + iface->rx.srq.available -= count; + ucs_memory_cpu_store_fence(); + *srq->db = htonl(srq->sw_pi); +} + unsigned uct_rc_mlx5_iface_srq_post_recv(uct_rc_mlx5_iface_common_t *iface) { uct_ib_mlx5_srq_t *srq = &iface->rx.srq; uct_rc_iface_t *rc_iface = &iface->super; uct_ib_mlx5_srq_seg_t *seg; - uct_ib_iface_recv_desc_t *desc; uint16_t count, wqe_index, next_index; - uint64_t desc_map; - void *hdr; - int i; /* Make sure the union is right */ UCS_STATIC_ASSERT(ucs_offsetof(uct_ib_mlx5_srq_seg_t, mlx5_srq.next_wqe_index) == @@ -91,6 +137,7 @@ unsigned uct_rc_mlx5_iface_srq_post_recv(uct_rc_mlx5_iface_common_t *iface) sizeof(struct mlx5_wqe_srq_next_seg)); ucs_assert(UCS_CIRCULAR_COMPARE16(srq->ready_idx, <=, srq->free_idx)); + ucs_assert(rc_iface->rx.srq.available > 0); wqe_index = srq->ready_idx; for (;;) { @@ -106,40 +153,59 @@ unsigned uct_rc_mlx5_iface_srq_post_recv(uct_rc_mlx5_iface_common_t *iface) srq->free_idx = next_index; } - desc_map = ~seg->srq.ptr_mask & UCS_MASK(iface->tm.mp.num_strides); - ucs_for_each_bit(i, desc_map) { - UCT_TL_IFACE_GET_RX_DESC(&rc_iface->super.super, &rc_iface->rx.mp, - desc, goto out); - - /* Set receive data segment pointer. Length is pre-initialized. 
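uct_rc_mlx5_iface_update_srq_res() above captures the post-then-doorbell protocol shared by both repost paths: every WQE store must be globally visible before the doorbell record publishes the new producer index, since hardware may fetch WQEs as soon as it observes the update. A minimal sketch, with the release fence standing in for ucs_memory_cpu_store_fence():

```c
#include <stdint.h>
#include <arpa/inet.h>

/* Illustrative model of the SRQ doorbell update. */
typedef struct {
    volatile uint32_t *db;    /* doorbell record, big-endian producer index */
    uint16_t           sw_pi; /* software producer index */
} srq_model_t;

static void srq_ring_doorbell(srq_model_t *srq, uint16_t count)
{
    if (count == 0) {
        return; /* nothing was posted, leave the doorbell alone */
    }

    srq->sw_pi += count;
    __atomic_thread_fence(__ATOMIC_RELEASE); /* order WQE stores before db */
    *srq->db = htonl(srq->sw_pi);
}
```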
*/ - hdr = uct_ib_iface_recv_desc_hdr(&rc_iface->super, desc); - seg->srq.ptr_mask |= UCS_BIT(i); - seg->srq.desc = desc; /* Optimization for non-MP case (1 stride) */ - seg->dptr[i].lkey = htonl(desc->lkey); - seg->dptr[i].addr = htobe64((uintptr_t)hdr); - VALGRIND_MAKE_MEM_NOACCESS(hdr, rc_iface->super.config.seg_size); + if (uct_rc_mlx5_iface_srq_set_seg(iface, seg) != UCS_OK) { + break; } wqe_index = next_index; } -out: count = wqe_index - srq->sw_pi; - ucs_assert(rc_iface->rx.srq.available >= count); - - if (count > 0) { - srq->ready_idx = wqe_index; - srq->sw_pi = wqe_index; - rc_iface->rx.srq.available -= count; - ucs_memory_cpu_store_fence(); - *srq->db = htonl(srq->sw_pi); - ucs_assert(uct_ib_mlx5_srq_get_wqe(srq, srq->mask)->srq.next_wqe_index == 0); + uct_rc_mlx5_iface_update_srq_res(rc_iface, srq, wqe_index, count); + ucs_assert(uct_ib_mlx5_srq_get_wqe(srq, srq->mask)->srq.next_wqe_index == 0); + return count; +} + +unsigned uct_rc_mlx5_iface_srq_post_recv_ll(uct_rc_mlx5_iface_common_t *iface) +{ + uct_ib_mlx5_srq_t *srq = &iface->rx.srq; + uct_rc_iface_t *rc_iface = &iface->super; + uct_ib_mlx5_srq_seg_t *seg = NULL; + uint16_t count = 0; + uint16_t wqe_index, next_index; + + ucs_assert(rc_iface->rx.srq.available > 0); + + wqe_index = srq->ready_idx; + seg = uct_ib_mlx5_srq_get_wqe(srq, wqe_index); + + for (;;) { + next_index = ntohs(seg->srq.next_wqe_index); + if (next_index == (srq->free_idx & srq->mask)) { + break; + } + seg = uct_ib_mlx5_srq_get_wqe(srq, next_index); + + if (uct_rc_mlx5_iface_srq_set_seg(iface, seg) != UCS_OK) { + break; + } + + wqe_index = next_index; + count++; } + + uct_rc_mlx5_iface_update_srq_res(rc_iface, srq, wqe_index, count); return count; } void uct_rc_mlx5_iface_common_prepost_recvs(uct_rc_mlx5_iface_common_t *iface) { + /* prepost recvs only if quota available (recvs were not preposted + * before) */ + if (iface->super.rx.srq.quota == 0) { + return; + } + iface->super.rx.srq.available = iface->super.rx.srq.quota; iface->super.rx.srq.quota = 0; uct_rc_mlx5_iface_srq_post_recv(iface); @@ -202,6 +268,7 @@ uct_rc_mlx5_devx_create_cmd_qp(uct_rc_mlx5_iface_common_t *iface) ucs_assert(iface->tm.cmd_wq.super.super.type == UCT_IB_MLX5_OBJ_TYPE_LAST); + attr.super.qp_type = IBV_QPT_RC; attr.super.cap.max_send_wr = iface->tm.cmd_qp_len; attr.super.cap.max_send_sge = 1; attr.super.ibv.pd = md->super.pd; @@ -225,7 +292,7 @@ uct_rc_mlx5_devx_create_cmd_qp(uct_rc_mlx5_iface_common_t *iface) status = uct_rc_mlx5_iface_common_devx_connect_qp( iface, &iface->tm.cmd_wq.super.super, iface->tm.cmd_wq.super.super.qp_num, &ah_attr, - iface->super.super.config.path_mtu); + iface->super.super.config.path_mtu, 0); if (status != UCS_OK) { goto err_destroy_qp; } @@ -459,6 +526,7 @@ void uct_rc_mlx5_iface_fill_attr(uct_rc_mlx5_iface_common_t *iface, break; case UCT_IB_MLX5_OBJ_TYPE_DEVX: uct_rc_iface_fill_attr(&iface->super, &qp_attr->super, max_send_wr, NULL); + qp_attr->mmio_mode = iface->tx.mmio_mode; break; case UCT_IB_MLX5_OBJ_TYPE_LAST: break; @@ -467,18 +535,6 @@ void uct_rc_mlx5_iface_fill_attr(uct_rc_mlx5_iface_common_t *iface, qp_attr->super.srq_num = srq->srq_num; } -static ucs_status_t -uct_rc_mlx5_iface_check_no_devx_rx(uct_rc_mlx5_iface_common_t *iface) -{ - if (iface->config.cyclic_srq_enable == UCS_YES) { - ucs_error(UCT_IB_IFACE_FMT ": cyclic SRQ type is not supported", - UCT_IB_IFACE_ARG(&iface->super.super)); - return UCS_ERR_UNSUPPORTED; - } - - return UCS_OK; -} - ucs_status_t uct_rc_mlx5_common_iface_init_rx(uct_rc_mlx5_iface_common_t *iface, const 
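The new uct_rc_mlx5_iface_srq_post_recv_ll() above handles the linked-list topology, where segments are chained through a hardware-written next_wqe_index rather than addressed cyclically. A toy model of the walk, under the assumption that reposting stops at the free index or when descriptors run out; all types and the set_seg() stub are illustrative:

```c
#include <stdint.h>
#include <arpa/inet.h>

typedef struct {
    uint16_t next_wqe_index; /* big-endian link to the next segment */
} seg_model_t;

static int set_seg(seg_model_t *seg)
{
    (void)seg;
    return 0; /* attach an rx descriptor here; nonzero on mpool exhaustion */
}

static unsigned repost_linked_list(seg_model_t *wqes, uint16_t ready_idx,
                                   uint16_t free_idx, uint16_t mask)
{
    seg_model_t *seg   = &wqes[ready_idx & mask];
    unsigned     count = 0;
    uint16_t     next;

    for (;;) {
        next = ntohs(seg->next_wqe_index);
        if (next == (free_idx & mask)) {
            break; /* caught up with the consumer side */
        }
        seg = &wqes[next & mask];
        if (set_seg(seg) != 0) {
            break; /* out of rx descriptors */
        }
        count++;
    }
    return count; /* caller advances sw_pi and rings the doorbell */
}
```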
uct_rc_iface_common_config_t *rc_config) @@ -486,10 +542,7 @@ uct_rc_mlx5_common_iface_init_rx(uct_rc_mlx5_iface_common_t *iface, uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.md, uct_ib_mlx5_md_t); ucs_status_t status; - status = uct_rc_mlx5_iface_check_no_devx_rx(iface); - if (status != UCS_OK) { - return status; - } + ucs_assert(iface->config.srq_topo != UCT_RC_MLX5_SRQ_TOPO_CYCLIC); status = uct_rc_iface_init_rx(&iface->super, rc_config, &iface->rx.srq.verbs.srq); @@ -549,7 +602,7 @@ void uct_rc_mlx5_release_desc(uct_recv_desc_t *self, void *desc) void uct_rc_mlx5_handle_unexp_rndv(uct_rc_mlx5_iface_common_t *iface, struct ibv_tmh *tmh, uct_tag_t tag, struct mlx5_cqe64 *cqe, unsigned flags, - unsigned byte_len) + unsigned byte_len, int poll_flags) { uct_rc_mlx5_tmh_priv_data_t *priv = (uct_rc_mlx5_tmh_priv_data_t*)tmh->reserved; uct_ib_md_t *ib_md = uct_ib_iface_md(&iface->super.super); @@ -596,7 +649,8 @@ void uct_rc_mlx5_handle_unexp_rndv(uct_rc_mlx5_iface_common_t *iface, uct_rc_mlx5_iface_unexp_consumed(iface, iface->tm.rndv_desc.offset, &iface->tm.rndv_desc.super, cqe, - status, ntohs(cqe->wqe_counter)); + status, ntohs(cqe->wqe_counter), + poll_flags); UCT_RC_MLX5_TM_STAT(iface, RX_RNDV_UNEXP); } @@ -797,10 +851,7 @@ ucs_status_t uct_rc_mlx5_init_rx_tm(uct_rc_mlx5_iface_common_t *iface, uct_ib_md_t *md = uct_ib_iface_md(&iface->super.super); ucs_status_t status; - status = uct_rc_mlx5_iface_check_no_devx_rx(iface); - if (status != UCS_OK) { - return status; - } + ucs_assert(iface->config.srq_topo != UCT_RC_MLX5_SRQ_TOPO_CYCLIC); uct_rc_mlx5_init_rx_tm_common(iface, config, rndv_hdr_len); @@ -1090,15 +1141,6 @@ void uct_rc_mlx5_iface_common_sync_cqs_ci(uct_rc_mlx5_iface_common_t *iface, #endif } -void uct_rc_mlx5_iface_common_check_cqs_ci(uct_rc_mlx5_iface_common_t *iface, - uct_ib_iface_t *ib_iface) -{ -#if !HAVE_DECL_MLX5DV_INIT_OBJ - ucs_assert(iface->cq[UCT_IB_DIR_TX].cq_ci == uct_ib_mlx5_get_cq_ci(ib_iface->cq[UCT_IB_DIR_TX])); - ucs_assert(iface->cq[UCT_IB_DIR_RX].cq_ci == uct_ib_mlx5_get_cq_ci(ib_iface->cq[UCT_IB_DIR_RX])); -#endif -} - ucs_status_t uct_rc_mlx5_iface_common_arm_cq(uct_ib_iface_t *ib_iface, uct_ib_dir_t dir, int solicited_only) diff --git a/src/uct/ib/rc/accel/rc_mlx5_common.h b/src/uct/ib/rc/accel/rc_mlx5_common.h index f17889aa6f4..e1fc8b48d5d 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_common.h +++ b/src/uct/ib/rc/accel/rc_mlx5_common.h @@ -65,10 +65,10 @@ UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(_av_size), "am_zcopy header"); -#define UCT_RC_MLX5_CHECK_AM_SHORT(_id, _length, _av_size) \ +#define UCT_RC_MLX5_CHECK_AM_SHORT(_id, _header_t, _length, _av_size) \ UCT_CHECK_AM_ID(_id); \ - UCT_CHECK_LENGTH(sizeof(uct_rc_mlx5_am_short_hdr_t) + _length, 0, \ - UCT_IB_MLX5_AM_MAX_SHORT(_av_size), "am_short"); + UCT_CHECK_LENGTH(sizeof(_header_t) + _length, 0, \ + UCT_IB_MLX5_AM_MAX_SHORT(_av_size), "am_short"); /* there is no need to do a special check for length == 0 because in that @@ -103,6 +103,15 @@ UCT_RC_MLX5_DECLARE_ATOMIC_LE_HANDLER(32) UCT_RC_MLX5_DECLARE_ATOMIC_LE_HANDLER(64) + +typedef enum { + UCT_RC_MLX5_SRQ_TOPO_LIST, + UCT_RC_MLX5_SRQ_TOPO_CYCLIC, + UCT_RC_MLX5_SRQ_TOPO_CYCLIC_EMULATED, + UCT_RC_MLX5_SRQ_TOPO_LAST +} uct_rc_mlx5_srq_topo_t; + + enum { UCT_RC_MLX5_IFACE_STAT_RX_INL_32, UCT_RC_MLX5_IFACE_STAT_RX_INL_64, @@ -137,7 +146,8 @@ enum { enum { UCT_RC_MLX5_POLL_FLAG_TM = UCS_BIT(0), UCT_RC_MLX5_POLL_FLAG_HAS_EP = UCS_BIT(1), - UCT_RC_MLX5_POLL_FLAG_TAG_CQE = UCS_BIT(2) + UCT_RC_MLX5_POLL_FLAG_TAG_CQE = UCS_BIT(2), + 
UCT_RC_MLX5_POLL_FLAG_LINKED_LIST = UCS_BIT(3) }; @@ -414,7 +424,7 @@ typedef struct uct_rc_mlx5_iface_common { #endif struct { uint8_t atomic_fence_flag; - ucs_ternary_value_t cyclic_srq_enable; + uct_rc_mlx5_srq_topo_t srq_topo; } config; UCS_STATS_NODE_DECLARE(stats) } uct_rc_mlx5_iface_common_t; @@ -423,25 +433,23 @@ typedef struct uct_rc_mlx5_iface_common { * Common RC/DC mlx5 interface configuration */ typedef struct uct_rc_mlx5_iface_common_config { - uct_ib_mlx5_iface_config_t super; - unsigned tx_max_bb; + uct_ib_mlx5_iface_config_t super; + unsigned tx_max_bb; struct { - int enable; - unsigned list_size; - size_t seg_size; - ucs_ternary_value_t mp_enable; - size_t mp_num_strides; + int enable; + unsigned list_size; + size_t seg_size; + ucs_ternary_auto_value_t mp_enable; + size_t mp_num_strides; } tm; - unsigned exp_backoff; - ucs_ternary_value_t cyclic_srq_enable; + unsigned exp_backoff; + UCS_CONFIG_STRING_ARRAY_FIELD(types) srq_topo; } uct_rc_mlx5_iface_common_config_t; -UCS_CLASS_DECLARE(uct_rc_mlx5_iface_common_t, - uct_rc_iface_ops_t*, - uct_md_h, uct_worker_h, - const uct_iface_params_t*, - uct_rc_iface_common_config_t*, +UCS_CLASS_DECLARE(uct_rc_mlx5_iface_common_t, uct_rc_iface_ops_t*, + uct_iface_ops_t*, uct_md_h, uct_worker_h, + const uct_iface_params_t*, uct_rc_iface_common_config_t*, uct_rc_mlx5_iface_common_config_t*, uct_ib_iface_init_attr_t*); @@ -500,7 +508,7 @@ UCS_CLASS_DECLARE(uct_rc_mlx5_iface_common_t, void uct_rc_mlx5_handle_unexp_rndv(uct_rc_mlx5_iface_common_t *iface, struct ibv_tmh *tmh, uct_tag_t tag, struct mlx5_cqe64 *cqe, unsigned flags, - unsigned byte_len); + unsigned byte_len, int poll_flags); static UCS_F_ALWAYS_INLINE void @@ -583,6 +591,7 @@ uct_rc_mlx5_handle_rndv_fin(uct_rc_mlx5_iface_common_t *iface, uint32_t app_ctx) extern ucs_config_field_t uct_rc_mlx5_common_config_table[]; unsigned uct_rc_mlx5_iface_srq_post_recv(uct_rc_mlx5_iface_common_t *iface); +unsigned uct_rc_mlx5_iface_srq_post_recv_ll(uct_rc_mlx5_iface_common_t *iface); void uct_rc_mlx5_iface_common_prepost_recvs(uct_rc_mlx5_iface_common_t *iface); @@ -609,9 +618,6 @@ void uct_rc_mlx5_iface_common_update_cqs_ci(uct_rc_mlx5_iface_common_t *iface, void uct_rc_mlx5_iface_common_sync_cqs_ci(uct_rc_mlx5_iface_common_t *iface, uct_ib_iface_t *ib_iface); -void uct_rc_mlx5_iface_common_check_cqs_ci(uct_rc_mlx5_iface_common_t *iface, - uct_ib_iface_t *ib_iface); - ucs_status_t uct_rc_mlx5_iface_common_arm_cq(uct_ib_iface_t *ib_iface, uct_ib_dir_t dir, int solicited_only); @@ -717,7 +723,8 @@ uct_rc_mlx5_iface_common_devx_connect_qp(uct_rc_mlx5_iface_common_t *iface, uct_ib_mlx5_qp_t *qp, uint32_t dest_qp_num, struct ibv_ah_attr *ah_attr, - enum ibv_mtu path_mtu); + enum ibv_mtu path_mtu, + uint8_t path_index); #else static UCS_F_MAYBE_UNUSED ucs_status_t @@ -725,7 +732,8 @@ uct_rc_mlx5_iface_common_devx_connect_qp(uct_rc_mlx5_iface_common_t *iface, uct_ib_mlx5_qp_t *qp, uint32_t dest_qp_num, struct ibv_ah_attr *ah_attr, - enum ibv_mtu path_mtu) + enum ibv_mtu path_mtu, + uint8_t path_index) { return UCS_ERR_UNSUPPORTED; } diff --git a/src/uct/ib/rc/accel/rc_mlx5_devx.c b/src/uct/ib/rc/accel/rc_mlx5_devx.c index 172b43ddf34..73460a8f2ac 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_devx.c +++ b/src/uct/ib/rc/accel/rc_mlx5_devx.c @@ -166,14 +166,14 @@ uct_rc_mlx5_devx_init_rx_common(uct_rc_mlx5_iface_common_t *iface, iface->rx.srq.db = &iface->rx.srq.devx.dbrec->db[MLX5_RCV_DBR]; - if (iface->config.cyclic_srq_enable == UCS_NO) { - wq_type = UCT_RC_MLX5_MP_ENABLED(iface) ? 
- UCT_IB_MLX5_SRQ_TOPO_LIST_MP_RQ : - UCT_IB_MLX5_SRQ_TOPO_LIST; - } else { + if (iface->config.srq_topo == UCT_RC_MLX5_SRQ_TOPO_CYCLIC) { wq_type = UCT_RC_MLX5_MP_ENABLED(iface) ? UCT_IB_MLX5_SRQ_TOPO_CYCLIC_MP_RQ : UCT_IB_MLX5_SRQ_TOPO_CYCLIC; + } else { + wq_type = UCT_RC_MLX5_MP_ENABLED(iface) ? + UCT_IB_MLX5_SRQ_TOPO_LIST_MP_RQ : + UCT_IB_MLX5_SRQ_TOPO_LIST; } UCT_IB_MLX5DV_SET (wq, wq, wq_type, wq_type); @@ -208,6 +208,7 @@ uct_rc_mlx5_devx_init_rx_common(uct_rc_mlx5_iface_common_t *iface, return status; } +#if IBV_HW_TM ucs_status_t uct_rc_mlx5_devx_init_rx_tm(uct_rc_mlx5_iface_common_t *iface, const uct_rc_iface_common_config_t *config, @@ -267,6 +268,7 @@ uct_rc_mlx5_devx_init_rx_tm(uct_rc_mlx5_iface_common_t *iface, uct_rc_mlx5_devx_cleanup_srq(md, &iface->rx.srq); return status; } +#endif ucs_status_t uct_rc_mlx5_devx_init_rx(uct_rc_mlx5_iface_common_t *iface, const uct_rc_iface_common_config_t *config) @@ -326,13 +328,19 @@ uct_rc_mlx5_iface_common_devx_connect_qp(uct_rc_mlx5_iface_common_t *iface, uct_ib_mlx5_qp_t *qp, uint32_t dest_qp_num, struct ibv_ah_attr *ah_attr, - enum ibv_mtu path_mtu) + enum ibv_mtu path_mtu, + uint8_t path_index) { + uct_ib_mlx5_md_t *md = ucs_derived_of(uct_ib_iface_md(&iface->super.super), + uct_ib_mlx5_md_t); char in_2rtr[UCT_IB_MLX5DV_ST_SZ_BYTES(init2rtr_qp_in)] = {}; char out_2rtr[UCT_IB_MLX5DV_ST_SZ_BYTES(init2rtr_qp_out)] = {}; char in_2rts[UCT_IB_MLX5DV_ST_SZ_BYTES(rtr2rts_qp_in)] = {}; char out_2rts[UCT_IB_MLX5DV_ST_SZ_BYTES(rtr2rts_qp_out)] = {}; uct_ib_device_t *dev = uct_ib_iface_device(&iface->super.super); + uint32_t opt_param_mask = UCT_IB_MLX5_QP_OPTPAR_RRE | + UCT_IB_MLX5_QP_OPTPAR_RAE | + UCT_IB_MLX5_QP_OPTPAR_RWE; struct mlx5_wqe_av mlx5_av; ucs_status_t status; struct ibv_ah *ah; @@ -341,7 +349,6 @@ uct_rc_mlx5_iface_common_devx_connect_qp(uct_rc_mlx5_iface_common_t *iface, UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, opcode, UCT_IB_MLX5_CMD_OP_INIT2RTR_QP); UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, qpn, qp->qp_num); - UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, opt_param_mask, 14); ucs_assert(path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU); qpc = UCT_IB_MLX5DV_ADDR_OF(init2rtr_qp_in, in_2rtr, qpc); @@ -372,21 +379,27 @@ uct_rc_mlx5_iface_common_devx_connect_qp(uct_rc_mlx5_iface_common_t *iface, UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.dscp, iface->super.super.config.traffic_class >> 2); } + + uct_ib_mlx5_devx_set_qpc_port_affinity(md, path_index, qpc, + &opt_param_mask); } else { UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.grh, ah_attr->is_global); UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.rlid, ah_attr->dlid); UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.mlid, ah_attr->src_path_bits & 0x7f); - UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.hop_limit, - ah_attr->grh.hop_limit); - memcpy(UCT_IB_MLX5DV_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip), - &ah_attr->grh.dgid, - UCT_IB_MLX5DV_FLD_SZ_BYTES(qpc, primary_address_path.rgid_rip)); UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.sl, iface->super.super.config.sl); - /* TODO add flow_label support */ - UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.tclass, - iface->super.super.config.traffic_class); + + if (ah_attr->is_global) { + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.hop_limit, + ah_attr->grh.hop_limit); + memcpy(UCT_IB_MLX5DV_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip), + &ah_attr->grh.dgid, + UCT_IB_MLX5DV_FLD_SZ_BYTES(qpc, primary_address_path.rgid_rip)); + /* TODO add flow_label support */ + UCT_IB_MLX5DV_SET(qpc, qpc, 
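The connect hunk above replaces the magic opt_param_mask constant 14 with named RRE/RAE/RWE bits, letting the port-affinity setup OR in additional bits before the mask is written. A sketch showing that the named bits reproduce the old constant; the bit positions follow the mlx5 driver's MLX5_QP_OPTPAR_* layout but are stated here as assumptions:

```c
#include <stdint.h>

#define QP_OPTPAR_RRE (1u << 1) /* remote read enable */
#define QP_OPTPAR_RAE (1u << 2) /* remote atomic enable */
#define QP_OPTPAR_RWE (1u << 3) /* remote write enable */

static uint32_t build_opt_param_mask(int set_port_affinity,
                                     uint32_t affinity_bit)
{
    /* 2 | 4 | 8 == 14, the constant the patch removes */
    uint32_t mask = QP_OPTPAR_RRE | QP_OPTPAR_RAE | QP_OPTPAR_RWE;

    if (set_port_affinity) {
        mask |= affinity_bit; /* e.g. a LAG tx-affinity optpar bit */
    }
    return mask;
}
```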
primary_address_path.tclass, + iface->super.super.config.traffic_class); + } } UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.vhca_port_num, ah_attr->port_num); @@ -398,6 +411,8 @@ uct_rc_mlx5_iface_common_devx_connect_qp(uct_rc_mlx5_iface_common_t *iface, UCT_IB_MLX5DV_SET(qpc, qpc, rae, true); UCT_IB_MLX5DV_SET(qpc, qpc, min_rnr_nak, iface->super.config.min_rnr_timer); + UCT_IB_MLX5DV_SET(init2rtr_qp_in, in_2rtr, opt_param_mask, opt_param_mask); + status = uct_ib_mlx5_devx_modify_qp(qp, in_2rtr, sizeof(in_2rtr), out_2rtr, sizeof(out_2rtr)); if (status != UCS_OK) { diff --git a/src/uct/ib/rc/accel/rc_mlx5_ep.c b/src/uct/ib/rc/accel/rc_mlx5_ep.c index b44df6ef034..9f9eab4f69b 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_ep.c +++ b/src/uct/ib/rc/accel/rc_mlx5_ep.c @@ -33,28 +33,6 @@ typedef struct { } uct_rc_mlx5_ep_cleanup_ctx_t; -/* - * - * Helper function for buffer-copy post. - * Adds the descriptor to the callback queue. - */ -static UCS_F_ALWAYS_INLINE void -uct_rc_mlx5_txqp_bcopy_post(uct_rc_mlx5_iface_common_t *iface, - uct_rc_txqp_t *txqp, uct_ib_mlx5_txwq_t *txwq, - unsigned opcode, unsigned length, - /* RDMA */ uint64_t rdma_raddr, uct_rkey_t rdma_rkey, - uint8_t fm_ce_se, uint32_t imm_val_be, - uct_rc_iface_send_desc_t *desc, const void *buffer, - uct_ib_log_sge_t *log_sge) -{ - desc->super.sn = txwq->sw_pi; - uct_rc_mlx5_txqp_dptr_post(iface, IBV_QPT_RC, txqp, txwq, - opcode, buffer, length, &desc->lkey, - rdma_raddr, rdma_rkey, 0, 0, 0, 0, - NULL, NULL, 0, fm_ce_se, imm_val_be, INT_MAX, log_sge); - uct_rc_txqp_add_send_op(txqp, &desc->super); -} - /* * Helper function for zero-copy post. * Adds user completion to the callback queue. @@ -114,7 +92,7 @@ uct_rc_mlx5_ep_am_short_inline(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, const void *payload, unsigned length) { UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); - UCT_RC_MLX5_CHECK_AM_SHORT(id, length, 0); + UCT_RC_MLX5_CHECK_AM_SHORT(id, uct_rc_mlx5_am_short_hdr_t, length, 0); UCT_RC_CHECK_RES_AND_FC(&iface->super, &ep->super, id); uct_rc_mlx5_txqp_inline_post(iface, IBV_QPT_RC, @@ -131,43 +109,29 @@ uct_rc_mlx5_ep_am_short_inline(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, return UCS_OK; } -#if HAVE_IBV_DM -static ucs_status_t UCS_F_ALWAYS_INLINE -uct_rc_mlx5_ep_short_dm(uct_rc_mlx5_ep_t *ep, uct_rc_mlx5_dm_copy_data_t *cache, - size_t hdr_len, const void *payload, unsigned length, - unsigned opcode, uint8_t fm_ce_se, - uint64_t rdma_raddr, uct_rkey_t rdma_rkey) +static ucs_status_t UCS_F_ALWAYS_INLINE uct_rc_mlx5_ep_am_short_iov_inline( + uct_ep_h tl_ep, uint8_t id, const uct_iov_t *iov, size_t iovcnt, + size_t iov_length) { - uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ep->super.super.super.iface, - uct_rc_mlx5_iface_common_t); - uct_rc_iface_send_desc_t *desc = NULL; - void *buffer; - ucs_status_t status; - uct_ib_log_sge_t log_sge; - - status = uct_rc_mlx5_common_dm_make_data(iface, cache, hdr_len, payload, - length, &desc, &buffer, &log_sge); - if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) { - return status; - } + UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); + UCT_RC_MLX5_CHECK_AM_SHORT(id, uct_rc_mlx5_hdr_t, iov_length, 0); + UCT_RC_CHECK_RES_AND_FC(&iface->super, &ep->super, id); + uct_rc_mlx5_txqp_inline_iov_post(iface, IBV_QPT_RC, &ep->super.txqp, + &ep->tx.wq, iov, iovcnt, iov_length, id, + NULL, NULL, 0); + UCT_TL_EP_STAT_OP(&ep->super.super, AM, SHORT, iov_length); + UCT_RC_UPDATE_FC(&iface->super, &ep->super, id); - uct_rc_mlx5_txqp_bcopy_post(iface, &ep->super.txqp, &ep->tx.wq, - opcode, hdr_len + length, - rdma_raddr, 
rdma_rkey, fm_ce_se, - 0, desc, buffer, - log_sge.num_sge ? &log_sge : NULL); return UCS_OK; } -#endif ucs_status_t uct_rc_mlx5_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length, uint64_t remote_addr, uct_rkey_t rkey) { #if HAVE_IBV_DM - uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_mlx5_iface_common_t); - uct_rc_iface_t *rc_iface = &iface->super; - uct_rc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_mlx5_ep_t); + UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); + uct_rc_iface_t *rc_iface = &iface->super; ucs_status_t status; if (ucs_likely((length <= UCT_IB_MLX5_PUT_MAX_SHORT(0)) || !iface->dm.dm)) { @@ -180,10 +144,11 @@ uct_rc_mlx5_ep_put_short(uct_ep_h tl_ep, const void *buffer, unsigned length, UCT_RC_CHECK_RES(rc_iface, &ep->super); uct_rc_mlx5_ep_fence_put(iface, &ep->tx.wq, &rkey, &remote_addr, ep->super.atomic_mr_offset); - status = uct_rc_mlx5_ep_short_dm(ep, NULL, 0, buffer, length, - MLX5_OPCODE_RDMA_WRITE, - MLX5_WQE_CTRL_CQ_UPDATE, - remote_addr, rkey); + status = uct_rc_mlx5_common_ep_short_dm(iface, IBV_QPT_RC, NULL, 0, buffer, + length, MLX5_OPCODE_RDMA_WRITE, + MLX5_WQE_CTRL_CQ_UPDATE, + remote_addr, rkey, &ep->super.txqp, + &ep->tx.wq, NULL, NULL, 0); if (UCS_STATUS_IS_ERR(status)) { return status; } @@ -205,12 +170,13 @@ ssize_t uct_rc_mlx5_ep_put_bcopy(uct_ep_h tl_ep, uct_pack_callback_t pack_cb, desc, pack_cb, arg, length); uct_rc_mlx5_ep_fence_put(iface, &ep->tx.wq, &rkey, &remote_addr, ep->super.atomic_mr_offset); - - uct_rc_mlx5_txqp_bcopy_post(iface, &ep->super.txqp, &ep->tx.wq, - MLX5_OPCODE_RDMA_WRITE, length, remote_addr, - rkey, MLX5_WQE_CTRL_CQ_UPDATE, 0, desc, desc + 1, - NULL); + uct_rc_mlx5_common_txqp_bcopy_post(iface, IBV_QPT_RC, &ep->super.txqp, + &ep->tx.wq, MLX5_OPCODE_RDMA_WRITE, + length, remote_addr, rkey, NULL, NULL, 0, + MLX5_WQE_CTRL_CQ_UPDATE, 0, desc, + desc + 1, NULL); UCT_TL_EP_STAT_OP(&ep->super.super, PUT, BCOPY, length); + return length; } @@ -256,11 +222,13 @@ ucs_status_t uct_rc_mlx5_ep_get_bcopy(uct_ep_h tl_ep, unpack_cb, comp, arg, length); uct_rc_mlx5_ep_fence_get(iface, &ep->tx.wq, &rkey, &fm_ce_se); - uct_rc_mlx5_txqp_bcopy_post(iface, &ep->super.txqp, &ep->tx.wq, - MLX5_OPCODE_RDMA_READ, length, remote_addr, - rkey, fm_ce_se, 0, desc, desc + 1, NULL); + uct_rc_mlx5_common_txqp_bcopy_post(iface, IBV_QPT_RC, &ep->super.txqp, + &ep->tx.wq, MLX5_OPCODE_RDMA_READ, + length, remote_addr, rkey, NULL, NULL, 0, + fm_ce_se, 0, desc, desc + 1, NULL); UCT_TL_EP_STAT_OP(&ep->super.super, GET, BCOPY, length); UCT_RC_RDMA_READ_POSTED(&iface->super, length); + return UCS_INPROGRESS; } @@ -298,9 +266,8 @@ uct_rc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, const void *payload, unsigned length) { #if HAVE_IBV_DM - uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_mlx5_iface_common_t); - uct_rc_iface_t *rc_iface = &iface->super; - uct_rc_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_mlx5_ep_t); + UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); + uct_rc_iface_t *rc_iface = &iface->super; ucs_status_t status; uct_rc_mlx5_dm_copy_data_t cache; @@ -319,10 +286,10 @@ uct_rc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, uct_rc_mlx5_am_hdr_fill(&cache.am_hdr.rc_hdr, id); cache.am_hdr.am_hdr = hdr; - status = uct_rc_mlx5_ep_short_dm(ep, &cache, sizeof(cache.am_hdr), payload, length, - MLX5_OPCODE_SEND, - MLX5_WQE_CTRL_SOLICITED | MLX5_WQE_CTRL_CQ_UPDATE, - 0, 0); + status = uct_rc_mlx5_common_ep_short_dm( + iface, IBV_QPT_RC, &cache, sizeof(cache.am_hdr), payload, length, + 
MLX5_OPCODE_SEND, MLX5_WQE_CTRL_SOLICITED | MLX5_WQE_CTRL_CQ_UPDATE, + 0, 0, &ep->super.txqp, &ep->tx.wq, NULL, NULL, 0); if (UCS_STATUS_IS_ERR(status)) { return status; } @@ -333,6 +300,40 @@ uct_rc_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, #endif } +ucs_status_t uct_rc_mlx5_ep_am_short_iov(uct_ep_h tl_ep, uint8_t id, + const uct_iov_t *iov, size_t iovcnt) +{ + size_t iov_length = uct_iov_total_length(iov, iovcnt); +#if HAVE_IBV_DM + UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); + ucs_status_t status; + + if (ucs_likely((sizeof(uct_rc_mlx5_hdr_t) + iov_length <= + UCT_IB_MLX5_AM_MAX_SHORT(0)) || + !iface->dm.dm)) { +#endif + return uct_rc_mlx5_ep_am_short_iov_inline(tl_ep, id, iov, iovcnt, + iov_length); +#if HAVE_IBV_DM + } + + UCT_CHECK_AM_ID(id); + UCT_RC_CHECK_RES_AND_FC(&iface->super, &ep->super, id); + + status = uct_rc_mlx5_common_ep_am_short_iov_dm(&ep->super.super, id, iface, + iov, iovcnt, iov_length, + IBV_QPT_RC, &ep->super.txqp, + &ep->tx.wq, NULL, NULL, 0); + if (ucs_unlikely(UCS_STATUS_IS_ERR(status))) { + return status; + } + + UCT_RC_UPDATE_FC(&iface->super, &ep->super, id); + + return UCS_OK; +#endif +} + ssize_t uct_rc_mlx5_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_cb, void *arg, unsigned flags) @@ -347,12 +348,14 @@ ssize_t uct_rc_mlx5_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, id, uct_rc_mlx5_am_hdr_fill, uct_rc_mlx5_hdr_t, pack_cb, arg, &length); - uct_rc_mlx5_txqp_bcopy_post(iface, &ep->super.txqp, &ep->tx.wq, - MLX5_OPCODE_SEND, sizeof(uct_rc_mlx5_hdr_t) + length, - 0, 0, MLX5_WQE_CTRL_SOLICITED, 0, desc, desc + 1, - NULL); + uct_rc_mlx5_common_txqp_bcopy_post(iface, IBV_QPT_RC, &ep->super.txqp, + &ep->tx.wq, MLX5_OPCODE_SEND, + sizeof(uct_rc_mlx5_hdr_t) + length, 0, 0, + NULL, NULL, 0, MLX5_WQE_CTRL_SOLICITED, + 0, desc, desc + 1, NULL); UCT_TL_EP_STAT_OP(&ep->super.super, AM, BCOPY, length); UCT_RC_UPDATE_FC(&iface->super, &ep->super, id); + return length; } @@ -542,18 +545,12 @@ ucs_status_t uct_rc_mlx5_ep_fence(uct_ep_h tl_ep, unsigned flags) return uct_rc_ep_fence(tl_ep, &ep->tx.wq.fi, 1); } -static ucs_status_t uct_rc_mlx5_ep_check_internal(uct_ep_h tl_ep) +void uct_rc_mlx5_ep_post_check(uct_ep_h tl_ep) { UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); /* use this variable as dummy buffer to suppress compiler warning */ uint64_t dummy = 0; - /* in case if no TX resources are available then there is at least - * one signaled operation which provides actual peer status, in this case - * just return without any actions */ - UCT_RC_CHECK_TXQP_RET(&iface->super, &ep->super, UCS_OK); - UCT_RC_CHECK_CQE_RET(&iface->super, &ep->super, UCS_ERR_NO_RESOURCE); - uct_rc_mlx5_txqp_inline_post(iface, IBV_QPT_RC, &ep->super.txqp, &ep->tx.wq, MLX5_OPCODE_RDMA_WRITE, &dummy, 0, @@ -561,81 +558,18 @@ static ucs_status_t uct_rc_mlx5_ep_check_internal(uct_ep_h tl_ep) 0, 0, NULL, NULL, 0, 0, INT_MAX); - return UCS_OK; -} - -static ucs_status_t uct_rc_mlx5_ep_check_progress(uct_pending_req_t *self) -{ - uct_rc_pending_req_t *req = ucs_derived_of(self, uct_rc_pending_req_t); - uct_rc_mlx5_ep_t *ep = ucs_derived_of(req->ep, uct_rc_mlx5_ep_t); - ucs_status_t status; - - ucs_assert(ep->super.flags & UCT_RC_EP_FLAG_KEEPALIVE_PENDING); - - status = uct_rc_mlx5_ep_check_internal(req->ep); - if (status == UCS_OK) { - ep->super.flags &= ~UCT_RC_EP_FLAG_KEEPALIVE_PENDING; - ucs_mpool_put(req); - } else { - ucs_assert(status == UCS_ERR_NO_RESOURCE); - } - - return status; -} - -ucs_status_t -uct_rc_mlx5_ep_check(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp) -{ - 
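uct_rc_mlx5_ep_am_short_iov() just below picks its path from the total IOV length: small messages go inline into the WQE, larger ones are staged through device memory when it is available. A minimal sketch of that decision, with struct iovec and max_inline standing in for uct_iov_t and UCT_IB_MLX5_AM_MAX_SHORT(0):

```c
#include <stddef.h>
#include <sys/uio.h>

static size_t iov_total_length(const struct iovec *iov, size_t iovcnt)
{
    size_t total = 0, i;

    for (i = 0; i < iovcnt; ++i) {
        total += iov[i].iov_len;
    }
    return total;
}

static int use_inline_path(size_t hdr_size, const struct iovec *iov,
                           size_t iovcnt, size_t max_inline, int have_dm)
{
    /* inline if header + payload fit the WQE, or if there is no DM */
    return (hdr_size + iov_total_length(iov, iovcnt) <= max_inline) ||
           !have_dm;
}
```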
UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); - uct_rc_pending_req_t *req; - ucs_status_t status; - - UCT_CHECK_PARAM(comp == NULL, "Unsupported completion on ep_check"); - UCT_CHECK_PARAM(flags == 0, "Unsupported flags: %x", flags); - - ucs_assert(ep->super.flags & UCT_RC_EP_FLAG_CONNECTED); - - if (ep->super.flags & UCT_RC_EP_FLAG_KEEPALIVE_PENDING) { - /* keepalive request is in pending queue and will be - * processed when resources are available */ - return UCS_OK; - } - - status = uct_rc_mlx5_ep_check_internal(tl_ep); - if (status != UCS_ERR_NO_RESOURCE) { - ucs_assert(status == UCS_OK); - return status; - } - - /* there are no iface resources, add pending request */ - - req = ucs_mpool_get(&iface->super.tx.pending_mp); - if (req == NULL) { - return UCS_ERR_NO_MEMORY; - } - - req->ep = &ep->super.super.super; - req->super.func = uct_rc_mlx5_ep_check_progress; - status = uct_ep_pending_add(tl_ep, &req->super, 0); - ep->super.flags |= UCT_RC_EP_FLAG_KEEPALIVE_PENDING; - ucs_assert_always(status == UCS_OK); - - return UCS_OK; } ucs_status_t uct_rc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp) { UCT_RC_MLX5_EP_DECL(tl_ep, iface, ep); + uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.md, + uct_ib_mlx5_md_t); + int already_canceled = ep->super.flags & UCT_RC_EP_FLAG_FLUSH_CANCEL; ucs_status_t status; uint16_t sn; - if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) { - uct_ep_pending_purge(&ep->super.super.super, NULL, 0); - uct_rc_mlx5_ep_handle_failure(ep, UCS_ERR_CANCELED, ep->tx.wq.sw_pi); - return UCS_OK; - } - status = uct_rc_ep_flush(&ep->super, ep->tx.wq.bb_max, flags); if (status != UCS_INPROGRESS) { return status; @@ -655,6 +589,13 @@ ucs_status_t uct_rc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags, sn = ep->tx.wq.sig_pi; } + if (ucs_unlikely((flags & UCT_FLUSH_FLAG_CANCEL) && !already_canceled)) { + status = uct_ib_mlx5_modify_qp_state(md, &ep->tx.wq.super, IBV_QPS_ERR); + if (status != UCS_OK) { + return status; + } + } + return uct_rc_txqp_add_flush_comp(&iface->super, &ep->super.super, &ep->super.txqp, comp, sn); } @@ -739,14 +680,16 @@ void uct_rc_mlx5_common_packet_dump(uct_base_iface_t *iface, uct_am_trace_type_t ucs_status_t uct_rc_mlx5_ep_connect_qp(uct_rc_mlx5_iface_common_t *iface, uct_ib_mlx5_qp_t *qp, uint32_t qp_num, - struct ibv_ah_attr *ah_attr, enum ibv_mtu path_mtu) + struct ibv_ah_attr *ah_attr, enum ibv_mtu path_mtu, + uint8_t path_index) { uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.md, uct_ib_mlx5_md_t); ucs_assert(path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU); if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX) { return uct_rc_mlx5_iface_common_devx_connect_qp(iface, qp, qp_num, - ah_attr, path_mtu); + ah_attr, path_mtu, + path_index); } else { return uct_rc_iface_qp_connect(&iface->super, qp->verbs.qp, qp_num, ah_attr, path_mtu); @@ -776,7 +719,8 @@ ucs_status_t uct_rc_mlx5_ep_connect_to_ep(uct_ep_h tl_ep, * should be posted to the send side of the QP which is owned by device. 
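The reworked flush above changes the CANCEL semantics: instead of locally purging the send queue, the QP is moved to the ERR state so hardware completes every outstanding WQE with a flush error, and the FLUSH_CANCEL flag keeps the transition one-shot. A plain libibverbs sketch of the state change, as a stand-in for uct_ib_mlx5_modify_qp_state():

```c
#include <infiniband/verbs.h>

/* Transition a QP to the error state; only the state attribute is needed
 * for this move, after which HW flushes all posted work requests. */
static int modify_to_err(struct ibv_qp *qp)
{
    struct ibv_qp_attr attr = {
        .qp_state = IBV_QPS_ERR,
    };

    return ibv_modify_qp(qp, &attr, IBV_QP_STATE);
}
```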
*/ status = uct_rc_mlx5_ep_connect_qp(iface, &ep->tm_qp, uct_ib_unpack_uint24(rc_addr->qp_num), - &ah_attr, path_mtu); + &ah_attr, path_mtu, + ep->super.path_index); if (status != UCS_OK) { return status; } @@ -789,7 +733,8 @@ ucs_status_t uct_rc_mlx5_ep_connect_to_ep(uct_ep_h tl_ep, } status = uct_rc_mlx5_ep_connect_qp(iface, &ep->tx.wq.super, qp_num, - &ah_attr, path_mtu); + &ah_attr, path_mtu, + ep->super.path_index); if (status != UCS_OK) { return status; } @@ -852,10 +797,10 @@ ucs_status_t uct_rc_mlx5_ep_tag_eager_short(uct_ep_h tl_ep, uct_tag_t tag, uct_rc_mlx5_fill_tmh(ucs_unaligned_ptr(&cache.tm_hdr), tag, 0, IBV_TMH_EAGER); - status = uct_rc_mlx5_ep_short_dm(ep, &cache, sizeof(cache.tm_hdr), data, length, - MLX5_OPCODE_SEND, - MLX5_WQE_CTRL_SOLICITED | MLX5_WQE_CTRL_CQ_UPDATE, - 0, 0); + status = uct_rc_mlx5_common_ep_short_dm( + iface, IBV_QPT_RC, &cache, sizeof(cache.tm_hdr), data, length, + MLX5_OPCODE_SEND, MLX5_WQE_CTRL_SOLICITED | MLX5_WQE_CTRL_CQ_UPDATE, + 0, 0, &ep->super.txqp, &ep->tx.wq, NULL, NULL, 0); if (!UCS_STATUS_IS_ERR(status)) { UCT_TL_EP_STAT_OP(&ep->super.super, TAG, SHORT, length); } @@ -883,10 +828,11 @@ ssize_t uct_rc_mlx5_ep_tag_eager_bcopy(uct_ep_h tl_ep, uct_tag_t tag, UCT_RC_MLX5_IFACE_GET_TM_BCOPY_DESC(&iface->super, iface->tm.bcopy_mp, desc, tag, app_ctx, pack_cb, arg, length); - uct_rc_mlx5_txqp_bcopy_post(iface, &ep->super.txqp, &ep->tx.wq, - opcode, sizeof(struct ibv_tmh) + length, - 0, 0, MLX5_WQE_CTRL_SOLICITED, ib_imm, - desc, desc + 1, NULL); + uct_rc_mlx5_common_txqp_bcopy_post(iface, IBV_QPT_RC, &ep->super.txqp, + &ep->tx.wq, opcode, + sizeof(struct ibv_tmh) + length, 0, 0, + NULL, NULL, 0, MLX5_WQE_CTRL_SOLICITED, + ib_imm, desc, desc + 1, NULL); UCT_TL_EP_STAT_OP(&ep->super.super, TAG, BCOPY, length); @@ -1038,17 +984,23 @@ UCS_CLASS_INIT_FUNC(uct_rc_mlx5_ep_t, const uct_ep_params_t *params) return status; } -void uct_rc_mlx5_ep_cleanup_qp(uct_ib_async_event_wait_t *wait_ctx) +unsigned uct_rc_mlx5_ep_cleanup_qp(void *arg) { uct_rc_mlx5_ep_cleanup_ctx_t *ep_cleanup_ctx - = ucs_derived_of(wait_ctx, - uct_rc_mlx5_ep_cleanup_ctx_t); + = arg; uct_rc_mlx5_iface_common_t *iface = ucs_derived_of(ep_cleanup_ctx->super.iface, uct_rc_mlx5_iface_common_t); uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.md, uct_ib_mlx5_md_t); - - uct_rc_mlx5_iface_common_check_cqs_ci(iface, &iface->super.super); +#if !HAVE_DECL_MLX5DV_INIT_OBJ + int count; + + count = uct_rc_mlx5_iface_commom_clean(&iface->cq[UCT_IB_DIR_RX], + &iface->rx.srq, + ep_cleanup_ctx->qp.qp_num); + iface->super.rx.srq.available += count; + uct_rc_mlx5_iface_common_update_cqs_ci(iface, &iface->super.super); +#endif #if IBV_HW_TM if (UCT_RC_MLX5_TM_ENABLED(iface)) { @@ -1063,6 +1015,7 @@ void uct_rc_mlx5_ep_cleanup_qp(uct_ib_async_event_wait_t *wait_ctx) uct_ib_mlx5_qp_mmio_cleanup(&ep_cleanup_ctx->qp, ep_cleanup_ctx->reg); uct_ib_mlx5_destroy_qp(md, &ep_cleanup_ctx->qp); uct_rc_ep_cleanup_qp_done(&ep_cleanup_ctx->super, ep_cleanup_ctx->qp.qp_num); + return 1; } UCS_CLASS_CLEANUP_FUNC(uct_rc_mlx5_ep_t) @@ -1079,7 +1032,6 @@ UCS_CLASS_CLEANUP_FUNC(uct_rc_mlx5_ep_t) ep_cleanup_ctx->qp = self->tx.wq.super; ep_cleanup_ctx->reg = self->tx.wq.reg; - /* TODO should be removed by flush */ uct_rc_txqp_purge_outstanding(&iface->super, &self->super.txqp, UCS_ERR_CANCELED, self->tx.wq.sw_pi, 1); #if IBV_HW_TM @@ -1094,58 +1046,6 @@ UCS_CLASS_CLEANUP_FUNC(uct_rc_mlx5_ep_t) self->tx.wq.super.qp_num); } -ucs_status_t uct_rc_mlx5_ep_handle_failure(uct_rc_mlx5_ep_t *ep, - ucs_status_t status, 
uint16_t pi) -{ - uct_rc_iface_t *rc_iface = ucs_derived_of(ep->super.super.super.iface, - uct_rc_iface_t); - - uct_rc_txqp_purge_outstanding(rc_iface, &ep->super.txqp, status, pi, 0); - /* poll_cqe for mlx5 returns NULL in case of failure and the cq_avaialble - is not updated for the error cqe and all outstanding wqes*/ - rc_iface->tx.cq_available += ep->tx.wq.bb_max - - uct_rc_txqp_available(&ep->super.txqp); - return rc_iface->super.ops->set_ep_failed(&rc_iface->super, - &ep->super.super.super, - status); -} - -ucs_status_t uct_rc_mlx5_ep_set_failed(uct_ib_iface_t *iface, uct_ep_h ep, - ucs_status_t status) -{ - return uct_set_ep_failed(&UCS_CLASS_NAME(uct_rc_mlx5_ep_t), ep, - &iface->super.super, status); -} - -static ucs_arbiter_cb_result_t -uct_rc_mlx5_ep_arbiter_purge_cb(ucs_arbiter_t *arbiter, - ucs_arbiter_group_t *group, - ucs_arbiter_elem_t *elem, - void *arg) -{ - uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, priv); - - /* process internal EP_CHECK message */ - if (req->func == uct_rc_mlx5_ep_check_progress) { - ucs_mpool_put(req); - return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; - } - - return uct_rc_ep_arbiter_purge_cb(arbiter, group, elem, arg); -} - -void uct_rc_mlx5_ep_pending_purge(uct_ep_h tl_ep, - uct_pending_purge_callback_t cb, - void *arg) -{ - uct_rc_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_iface_t); - uct_rc_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_ep_t); - uct_purge_cb_args_t args = {cb, arg}; - - ucs_arbiter_group_purge(&iface->tx.arbiter, &ep->arb_group, - uct_rc_mlx5_ep_arbiter_purge_cb, &args); -} - UCS_CLASS_DEFINE(uct_rc_mlx5_ep_t, uct_rc_ep_t); UCS_CLASS_DEFINE_NEW_FUNC(uct_rc_mlx5_ep_t, uct_ep_t, const uct_ep_params_t *); UCS_CLASS_DEFINE_DELETE_FUNC(uct_rc_mlx5_ep_t, uct_ep_t); diff --git a/src/uct/ib/rc/accel/rc_mlx5_iface.c b/src/uct/ib/rc/accel/rc_mlx5_iface.c index 3220d93d2c2..e83056251a0 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_iface.c +++ b/src/uct/ib/rc/accel/rc_mlx5_iface.c @@ -53,6 +53,7 @@ ucs_config_field_t uct_rc_mlx5_iface_config_table[] = { static uct_rc_iface_ops_t uct_rc_mlx5_iface_ops; +static uct_iface_ops_t uct_rc_mlx5_iface_tl_ops; #ifdef ENABLE_STATS ucs_stats_class_t uct_rc_mlx5_iface_stats_class = { @@ -66,7 +67,8 @@ ucs_stats_class_t uct_rc_mlx5_iface_stats_class = { #endif void uct_rc_mlx5_iface_check_rx_completion(uct_rc_mlx5_iface_common_t *iface, - struct mlx5_cqe64 *cqe) + struct mlx5_cqe64 *cqe, + int poll_flags) { uct_ib_mlx5_cq_t *cq = &iface->cq[UCT_IB_DIR_RX]; struct mlx5_err_cqe *ecqe = (void*)cqe; @@ -87,13 +89,47 @@ void uct_rc_mlx5_iface_check_rx_completion(uct_rc_mlx5_iface_common_t *iface, /* TODO: Check if ib_stride_index valid for error CQE */ uct_rc_mlx5_iface_release_srq_seg(iface, seg, cqe, wqe_ctr, UCS_OK, iface->super.super.config.rx_headroom_offset, - &iface->super.super.release_desc); + &iface->super.super.release_desc, + poll_flags); } else { ucs_assert((ecqe->op_own >> 4) != MLX5_CQE_INVALID); uct_ib_mlx5_check_completion(&iface->super.super, cq, cqe); } } +static UCS_F_ALWAYS_INLINE void +uct_rc_mlx5_iface_update_tx_res(uct_rc_iface_t *rc_iface, + uct_rc_mlx5_ep_t *rc_mlx5_ep, uint16_t hw_ci) +{ + uct_ib_mlx5_txwq_t *txwq = &rc_mlx5_ep->tx.wq; + uct_rc_txqp_t *txqp = &rc_mlx5_ep->super.txqp; + uint16_t bb_num; + + bb_num = uct_ib_mlx5_txwq_update_bb(txwq, hw_ci) - + uct_rc_txqp_available(txqp); + + /* Must always have positive number of released resources. 
The first + * completion will report bb_num=1 (because prev_sw_pi is initialized to -1) + * and all the rest report the amount of BBs the previous WQE has consumed. + */ + ucs_assertv(bb_num > 0, "hw_ci=%d prev_sw_pi=%d available=%d bb_num=%d", + hw_ci, txwq->prev_sw_pi, txqp->available, bb_num); + + uct_rc_txqp_available_add(txqp, bb_num); + ucs_assert(uct_rc_txqp_available(txqp) <= txwq->bb_max); + + uct_rc_iface_update_reads(rc_iface); + + rc_iface->tx.cq_available += bb_num; + ucs_assertv(rc_iface->tx.cq_available <= rc_iface->config.tx_cq_len, + "cq_available=%d tx_cq_len=%d bb_num=%d txwq=%p txqp=%p", + rc_iface->tx.cq_available, rc_iface->config.tx_cq_len, bb_num, + txwq, txqp); + + ucs_arbiter_dispatch(&rc_iface->tx.arbiter, 1, uct_rc_ep_process_pending, + NULL); +} + static UCS_F_ALWAYS_INLINE unsigned uct_rc_mlx5_iface_poll_tx(uct_rc_mlx5_iface_common_t *iface) { @@ -107,47 +143,56 @@ uct_rc_mlx5_iface_poll_tx(uct_rc_mlx5_iface_common_t *iface) return 0; } - UCS_STATS_UPDATE_COUNTER(iface->super.stats, UCT_RC_IFACE_STAT_TX_COMPLETION, 1); + UCS_STATS_UPDATE_COUNTER(iface->super.stats, + UCT_RC_IFACE_STAT_TX_COMPLETION, 1); ucs_memory_cpu_load_fence(); qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER); ep = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, qp_num), uct_rc_mlx5_ep_t); - /* TODO: temporary workaround for uct_ep_flush(cancel) case when EP has been - * destroyed but successful CQE was not polled out from the CQ */ - if (ucs_unlikely(ep == NULL)) { - ucs_debug(UCT_IB_IFACE_FMT": qp_num %x not found", - UCT_IB_IFACE_ARG(&iface->super.super), qp_num); - return 1; - } + ucs_assert(ep != NULL); hw_ci = ntohs(cqe->wqe_counter); - ucs_trace_poll("rc_mlx5 iface %p tx_cqe: ep %p qpn 0x%x hw_ci %d", iface, ep, - qp_num, hw_ci); + ucs_trace_poll("rc_mlx5 iface %p tx_cqe: ep %p qpn 0x%x hw_ci %d", iface, + ep, qp_num, hw_ci); uct_rc_mlx5_txqp_process_tx_cqe(&ep->super.txqp, cqe, hw_ci); - - uct_rc_mlx5_common_update_tx_res(&iface->super, &ep->tx.wq, &ep->super.txqp, - hw_ci); - ucs_arbiter_group_schedule(&iface->super.tx.arbiter, &ep->super.arb_group); - ucs_arbiter_dispatch(&iface->super.tx.arbiter, 1, uct_rc_ep_process_pending, - NULL); + uct_rc_mlx5_iface_update_tx_res(&iface->super, ep, hw_ci); return 1; } -unsigned uct_rc_mlx5_iface_progress(void *arg) +static UCS_F_ALWAYS_INLINE unsigned +uct_rc_mlx5_iface_progress(void *arg, int flags) { uct_rc_mlx5_iface_common_t *iface = arg; unsigned count; - count = uct_rc_mlx5_iface_common_poll_rx(iface, UCT_RC_MLX5_POLL_FLAG_HAS_EP); - if (count > 0) { + count = uct_rc_mlx5_iface_common_poll_rx(iface, flags); + if (!uct_rc_iface_poll_tx(&iface->super, count)) { return count; } - return uct_rc_mlx5_iface_poll_tx(iface); + + return count + uct_rc_mlx5_iface_poll_tx(iface); +} + +static unsigned uct_rc_mlx5_iface_progress_cyclic(void *arg) +{ + return uct_rc_mlx5_iface_progress(arg, UCT_RC_MLX5_POLL_FLAG_HAS_EP); +} + +static unsigned uct_rc_mlx5_iface_progress_ll(void *arg) +{ + return uct_rc_mlx5_iface_progress(arg, UCT_RC_MLX5_POLL_FLAG_HAS_EP | + UCT_RC_MLX5_POLL_FLAG_LINKED_LIST); +} + +static unsigned uct_rc_mlx5_iface_progress_tm(void *arg) +{ + return uct_rc_mlx5_iface_progress(arg, UCT_RC_MLX5_POLL_FLAG_HAS_EP | + UCT_RC_MLX5_POLL_FLAG_TM); } static ucs_status_t uct_rc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) @@ -187,7 +232,7 @@ static ucs_status_t uct_rc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr static void uct_rc_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface, void 
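The accounting consolidated into uct_rc_mlx5_iface_update_tx_res() above works in building blocks (BBs): each WQE occupies a variable number of 64-byte BBs, and a completion carrying hw_ci releases everything up to that producer index. A toy model of the release step, matching in spirit the bb_num > 0 assertion in the patch; field names and the exact computation are illustrative:

```c
#include <stdint.h>
#include <assert.h>

typedef struct {
    uint16_t prev_sw_pi; /* producer index seen by the last completion,
                            seeded with (uint16_t)-1 so the first completion
                            always releases at least one BB */
    int16_t  available;  /* BBs currently free in the send queue */
    int16_t  bb_max;     /* send queue capacity in BBs */
} txwq_model_t;

static uint16_t txwq_release_bbs(txwq_model_t *wq, uint16_t hw_ci)
{
    uint16_t released = (uint16_t)(hw_ci - wq->prev_sw_pi); /* 16-bit wrap */

    assert(released > 0);
    wq->prev_sw_pi  = hw_ci;
    wq->available  += released;
    assert(wq->available <= wq->bb_max);
    return released;
}
```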
*arg, - ucs_status_t status) + ucs_status_t ep_status) { struct mlx5_cqe64 *cqe = arg; uct_rc_iface_t *iface = ucs_derived_of(ib_iface, uct_rc_iface_t); @@ -196,35 +241,30 @@ uct_rc_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface, void *arg, uct_rc_mlx5_ep_t *ep = ucs_derived_of(uct_rc_iface_lookup_ep(iface, qp_num), uct_rc_mlx5_ep_t); + uint16_t pi = ntohs(cqe->wqe_counter); ucs_log_level_t log_lvl; - uct_ib_mlx5_txwq_t txwq_copy; - size_t txwq_size; - ucs_status_t err_handler_status; + ucs_status_t status; - if (!ep) { + ucs_assert(ep != NULL); + uct_rc_txqp_purge_outstanding(iface, &ep->super.txqp, ep_status, pi, 0); + + /* Do not invoke pending requests on a failed endpoint */ + ucs_arbiter_group_desched(&iface->tx.arbiter, &ep->super.arb_group); + uct_rc_mlx5_iface_update_tx_res(iface, ep, pi); + + if (ep->super.flags & (UCT_RC_EP_FLAG_ERR_HANDLER_INVOKED | + UCT_RC_EP_FLAG_FLUSH_CANCEL)) { return; } - /* Create a copy of RC txwq for completion error reporting, since the QP - * would be released by set_ep_failed()*/ - txwq_copy = ep->tx.wq; - txwq_size = UCS_PTR_BYTE_DIFF(ep->tx.wq.qstart, ep->tx.wq.qend); - txwq_copy.qstart = ucs_malloc(txwq_size, "rc_txwq_copy"); - if (txwq_copy.qstart != NULL) { - memcpy(txwq_copy.qstart, ep->tx.wq.qstart, txwq_size); - txwq_copy.qend = UCS_PTR_BYTE_OFFSET(txwq_copy.qstart, txwq_size); - } + ep->super.flags |= UCT_RC_EP_FLAG_ERR_HANDLER_INVOKED; - err_handler_status = uct_rc_mlx5_ep_handle_failure(ep, status, - ep->tx.wq.sw_pi); - log_lvl = uct_ib_iface_failure_log_level(ib_iface, - err_handler_status, - status); + status = uct_iface_handle_ep_err(&iface->super.super.super, + &ep->super.super.super, ep_status); + log_lvl = uct_base_iface_failure_log_level(&ib_iface->super, status, + ep_status); - uct_ib_mlx5_completion_with_err(ib_iface, arg, - txwq_copy.qstart ? 
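The rewritten failure handler below purges the endpoint, deschedules its pending queue, and then invokes the user error handler at most once, gated by the ERR_HANDLER_INVOKED and FLUSH_CANCEL flags. A small sketch of the once-only gate; the flag names and values model UCT_RC_EP_FLAG_* but are assumptions here:

```c
enum {
    EP_FLAG_ERR_INVOKED  = 1u << 0,
    EP_FLAG_FLUSH_CANCEL = 1u << 1
};

static int should_invoke_err_handler(unsigned *ep_flags)
{
    if (*ep_flags & (EP_FLAG_ERR_INVOKED | EP_FLAG_FLUSH_CANCEL)) {
        return 0; /* already reported, or the user canceled the EP */
    }
    *ep_flags |= EP_FLAG_ERR_INVOKED;
    return 1;
}
```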
&txwq_copy : NULL, - log_lvl); - ucs_free(txwq_copy.qstart); + uct_ib_mlx5_completion_with_err(ib_iface, arg, &ep->tx.wq, log_lvl); } static void uct_rc_mlx5_iface_progress_enable(uct_iface_h tl_iface, unsigned flags) @@ -253,7 +293,7 @@ ucs_status_t uct_rc_mlx5_iface_create_qp(uct_rc_mlx5_iface_common_t *iface, struct mlx5dv_qp_init_attr dv_attr = {}; if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP) { - attr->mmio_mode = iface->tx.mmio_mode; + attr->uidx = 0xffffff; status = uct_ib_mlx5_devx_create_qp(ib_iface, qp, txwq, attr); if (status != UCS_OK) { return status; @@ -317,20 +357,6 @@ ucs_status_t uct_rc_mlx5_iface_create_qp(uct_rc_mlx5_iface_common_t *iface, return status; } -static UCS_F_MAYBE_UNUSED unsigned uct_rc_mlx5_iface_progress_tm(void *arg) -{ - uct_rc_mlx5_iface_common_t *iface = arg; - unsigned count; - - count = uct_rc_mlx5_iface_common_poll_rx(iface, - UCT_RC_MLX5_POLL_FLAG_HAS_EP | - UCT_RC_MLX5_POLL_FLAG_TM); - if (count > 0) { - return count; - } - return uct_rc_mlx5_iface_poll_tx(iface); -} - #if IBV_HW_TM static ucs_status_t uct_rc_mlx5_iface_tag_recv_zcopy(uct_iface_h tl_iface, uct_tag_t tag, @@ -355,6 +381,34 @@ static ucs_status_t uct_rc_mlx5_iface_tag_recv_cancel(uct_iface_h tl_iface, } #endif +static ucs_status_t +uct_rc_mlx5_iface_parse_srq_topo(uct_ib_mlx5_md_t *md, + uct_rc_mlx5_iface_common_config_t *config, + uct_rc_mlx5_srq_topo_t *topo_p) + +{ + int i; + + for (i = 0; i < config->srq_topo.count; ++i) { + if (!strcasecmp(config->srq_topo.types[i], "list")) { + *topo_p = UCT_RC_MLX5_SRQ_TOPO_LIST; + return UCS_OK; + } else if (!strcasecmp(config->srq_topo.types[i], "cyclic")) { + /* real cyclic list requires DevX support */ + if (!(md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ)) { + continue; + } + *topo_p = UCT_RC_MLX5_SRQ_TOPO_CYCLIC; + return UCS_OK; + } else if (!strcasecmp(config->srq_topo.types[i], "cyclic_emulated")) { + *topo_p = UCT_RC_MLX5_SRQ_TOPO_CYCLIC_EMULATED; + return UCS_OK; + } + } + + return UCS_ERR_INVALID_PARAM; +} + static ucs_status_t uct_rc_mlx5_iface_preinit(uct_rc_mlx5_iface_common_t *iface, uct_md_h tl_md, uct_rc_iface_common_config_t *rc_config, @@ -362,16 +416,20 @@ static ucs_status_t uct_rc_mlx5_iface_preinit(uct_rc_mlx5_iface_common_t *iface, const uct_iface_params_t *params, uct_ib_iface_init_attr_t *init_attr) { - uct_ib_mlx5_md_t *md = ucs_derived_of(tl_md, uct_ib_mlx5_md_t); + uct_ib_mlx5_md_t *md = ucs_derived_of(tl_md, uct_ib_mlx5_md_t); #if IBV_HW_TM - uct_ib_device_t UCS_V_UNUSED *dev = &md->super.dev; + uct_ib_device_t *dev = &md->super.dev; struct ibv_tmh tmh; int mtu; int tm_params; - ucs_status_t status; #endif + ucs_status_t status; - iface->config.cyclic_srq_enable = mlx5_config->cyclic_srq_enable; + status = uct_rc_mlx5_iface_parse_srq_topo(md, mlx5_config, + &iface->config.srq_topo); + if (status != UCS_OK) { + return status; + } #if IBV_HW_TM /* Both eager and rndv callbacks should be provided for @@ -398,12 +456,10 @@ static ucs_status_t uct_rc_mlx5_iface_preinit(uct_rc_mlx5_iface_common_t *iface, iface->tm.eager_unexp.cb = params->eager_cb; iface->tm.rndv_unexp.cb = params->rndv_cb; - iface->tm.eager_unexp.arg = (params->field_mask & - UCT_IFACE_PARAM_FIELD_HW_TM_EAGER_ARG) ? - params->eager_arg : NULL; - iface->tm.rndv_unexp.arg = (params->field_mask & - UCT_IFACE_PARAM_FIELD_HW_TM_RNDV_ARG) ? 
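uct_rc_mlx5_iface_parse_srq_topo() below implements the preference-list semantics of the new SRQ_TOPO variable: walk the user-ordered list and take the first topology the device can provide, skipping "cyclic" when DEVX SRQ support is missing. A self-contained sketch mirroring that logic, with have_devx_srq standing in for the UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ check:

```c
#include <strings.h>

typedef enum {
    SRQ_TOPO_LIST,
    SRQ_TOPO_CYCLIC,
    SRQ_TOPO_CYCLIC_EMULATED
} srq_topo_t;

static int parse_srq_topo(const char **types, int count, int have_devx_srq,
                          srq_topo_t *topo_p)
{
    int i;

    for (i = 0; i < count; ++i) {
        if (!strcasecmp(types[i], "list")) {
            *topo_p = SRQ_TOPO_LIST;
            return 0;
        } else if (!strcasecmp(types[i], "cyclic") && have_devx_srq) {
            /* a real cyclic SRQ requires DEVX support, otherwise skip */
            *topo_p = SRQ_TOPO_CYCLIC;
            return 0;
        } else if (!strcasecmp(types[i], "cyclic_emulated")) {
            *topo_p = SRQ_TOPO_CYCLIC_EMULATED;
            return 0;
        }
    }
    return -1; /* no supported type in the preference list */
}
```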
- params->rndv_arg : NULL; + iface->tm.eager_unexp.arg = UCT_IFACE_PARAM_VALUE(params, eager_arg, + HW_TM_EAGER_ARG, NULL); + iface->tm.rndv_unexp.arg = UCT_IFACE_PARAM_VALUE(params, eager_arg, + HW_TM_RNDV_ARG, NULL); iface->tm.unexpected_cnt = 0; iface->tm.num_outstanding = 0; iface->tm.num_tags = ucs_min(IBV_DEVICE_TM_CAPS(dev, max_num_tags), @@ -517,7 +573,11 @@ uct_rc_mlx5_iface_init_rx(uct_rc_iface_t *rc_iface, return status; } - iface->super.progress = uct_rc_mlx5_iface_progress; + if (iface->config.srq_topo == UCT_RC_MLX5_SRQ_TOPO_LIST) { + iface->super.progress = uct_rc_mlx5_iface_progress_ll; + } else { + iface->super.progress = uct_rc_mlx5_iface_progress_cyclic; + } return UCS_OK; } @@ -560,10 +620,9 @@ int uct_rc_mlx5_iface_is_reachable(const uct_iface_h tl_iface, return uct_ib_iface_is_reachable(tl_iface, dev_addr, iface_addr); } -UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_common_t, - uct_rc_iface_ops_t *ops, - uct_md_h tl_md, uct_worker_h worker, - const uct_iface_params_t *params, +UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_common_t, uct_rc_iface_ops_t *ops, + uct_iface_ops_t *tl_ops, uct_md_h tl_md, + uct_worker_h worker, const uct_iface_params_t *params, uct_rc_iface_common_config_t *rc_config, uct_rc_mlx5_iface_common_config_t *mlx5_config, uct_ib_iface_init_attr_t *init_attr) @@ -583,8 +642,8 @@ UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_common_t, init_attr->rx_hdr_len = UCT_RC_MLX5_MP_ENABLED(self) ? 0 : sizeof(uct_rc_mlx5_hdr_t); - UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, ops, tl_md, worker, params, - rc_config, init_attr); + UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, ops, tl_ops, tl_md, worker, + params, rc_config, init_attr); dev = uct_ib_iface_device(&self->super.super); self->tx.mmio_mode = mlx5_config->super.mmio_mode; @@ -595,6 +654,13 @@ UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_common_t, self->tm.am_desc.offset = self->super.super.config.rx_headroom_offset; } + status = uct_ib_mlx5_iface_select_sl(&self->super.super, + &mlx5_config->super, + &rc_config->super); + if (status != UCS_OK) { + return status; + } + status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_TX], &self->cq[UCT_IB_DIR_TX]); if (status != UCS_OK) { @@ -726,7 +792,6 @@ UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_t, init_attr.fc_req_size = sizeof(uct_rc_pending_req_t); init_attr.flags = UCT_IB_CQ_IGNORE_OVERRUN; - init_attr.rx_hdr_len = sizeof(uct_rc_mlx5_hdr_t); init_attr.cq_len[UCT_IB_DIR_TX] = config->super.tx_cq_len; init_attr.qp_type = IBV_QPT_RC; @@ -734,7 +799,8 @@ UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_t, init_attr.flags |= UCT_IB_TM_SUPPORTED; } - UCS_CLASS_CALL_SUPER_INIT(uct_rc_mlx5_iface_common_t, &uct_rc_mlx5_iface_ops, + UCS_CLASS_CALL_SUPER_INIT(uct_rc_mlx5_iface_common_t, + &uct_rc_mlx5_iface_ops, &uct_rc_mlx5_iface_tl_ops, tl_md, worker, params, &config->super.super, &config->rc_mlx5_common, &init_attr); @@ -764,14 +830,32 @@ static UCS_CLASS_DEFINE_NEW_FUNC(uct_rc_mlx5_iface_t, uct_iface_t, uct_md_h, static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rc_mlx5_iface_t, uct_iface_t); static uct_rc_iface_ops_t uct_rc_mlx5_iface_ops = { - { - { + .super = { + .super = { + .iface_estimate_perf = uct_base_iface_estimate_perf, + .iface_vfs_refresh = (uct_iface_vfs_refresh_func_t)ucs_empty_function, + }, + .create_cq = uct_ib_mlx5_create_cq, + .arm_cq = uct_rc_mlx5_iface_common_arm_cq, + .event_cq = uct_rc_mlx5_iface_common_event_cq, + .handle_failure = uct_rc_mlx5_iface_handle_failure, + }, + .init_rx = uct_rc_mlx5_iface_init_rx, + .cleanup_rx = uct_rc_mlx5_iface_cleanup_rx, + .fc_ctrl = uct_rc_mlx5_ep_fc_ctrl, + 
.fc_handler = uct_rc_iface_fc_handler, + .cleanup_qp = uct_rc_mlx5_ep_cleanup_qp, + .ep_post_check = uct_rc_mlx5_ep_post_check, +}; + +static uct_iface_ops_t uct_rc_mlx5_iface_tl_ops = { .ep_put_short = uct_rc_mlx5_ep_put_short, .ep_put_bcopy = uct_rc_mlx5_ep_put_bcopy, .ep_put_zcopy = uct_rc_mlx5_ep_put_zcopy, .ep_get_bcopy = uct_rc_mlx5_ep_get_bcopy, .ep_get_zcopy = uct_rc_mlx5_ep_get_zcopy, .ep_am_short = uct_rc_mlx5_ep_am_short, + .ep_am_short_iov = uct_rc_mlx5_ep_am_short_iov, .ep_am_bcopy = uct_rc_mlx5_ep_am_bcopy, .ep_am_zcopy = uct_rc_mlx5_ep_am_zcopy, .ep_atomic_cswap64 = uct_rc_mlx5_ep_atomic_cswap64, @@ -781,10 +865,10 @@ static uct_rc_iface_ops_t uct_rc_mlx5_iface_ops = { .ep_atomic64_fetch = uct_rc_mlx5_ep_atomic64_fetch, .ep_atomic32_fetch = uct_rc_mlx5_ep_atomic32_fetch, .ep_pending_add = uct_rc_ep_pending_add, - .ep_pending_purge = uct_rc_mlx5_ep_pending_purge, + .ep_pending_purge = uct_rc_ep_pending_purge, .ep_flush = uct_rc_mlx5_ep_flush, .ep_fence = uct_rc_mlx5_ep_fence, - .ep_check = uct_rc_mlx5_ep_check, + .ep_check = uct_rc_ep_check, .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_rc_mlx5_ep_t), .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_rc_mlx5_ep_t), .ep_get_address = uct_rc_mlx5_ep_get_address, @@ -811,19 +895,7 @@ static uct_rc_iface_ops_t uct_rc_mlx5_iface_ops = { .iface_get_address = uct_rc_mlx5_iface_get_address, .iface_get_device_address = uct_ib_iface_get_device_address, .iface_is_reachable = uct_rc_mlx5_iface_is_reachable - }, - .create_cq = uct_ib_mlx5_create_cq, - .arm_cq = uct_rc_mlx5_iface_common_arm_cq, - .event_cq = uct_rc_mlx5_iface_common_event_cq, - .handle_failure = uct_rc_mlx5_iface_handle_failure, - .set_ep_failed = uct_rc_mlx5_ep_set_failed, - }, - .init_rx = uct_rc_mlx5_iface_init_rx, - .cleanup_rx = uct_rc_mlx5_iface_cleanup_rx, - .fc_ctrl = uct_rc_mlx5_ep_fc_ctrl, - .fc_handler = uct_rc_iface_fc_handler, - .cleanup_qp = uct_rc_mlx5_ep_cleanup_qp, -}; + }; static ucs_status_t uct_rc_mlx5_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, diff --git a/src/uct/ib/rc/base/rc_ep.c b/src/uct/ib/rc/base/rc_ep.c index abb5001efd3..13b74a3c40f 100644 --- a/src/uct/ib/rc/base/rc_ep.c +++ b/src/uct/ib/rc/base/rc_ep.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -46,6 +47,8 @@ static ucs_stats_class_t uct_rc_txqp_stats_class = { }; #endif +static ucs_status_t uct_rc_ep_check_progress(uct_pending_req_t *self); + ucs_status_t uct_rc_txqp_init(uct_rc_txqp_t *txqp, uct_rc_iface_t *iface, uint32_t qp_num UCS_STATS_ARG(ucs_stats_node_t* stats_parent)) @@ -64,6 +67,14 @@ void uct_rc_txqp_cleanup(uct_rc_iface_t *iface, uct_rc_txqp_t *txqp) UCS_STATS_NODE_FREE(txqp->stats); } +void uct_rc_txqp_vfs_populate(uct_rc_txqp_t *txqp, void *parent_obj) +{ + ucs_vfs_obj_add_ro_file(parent_obj, ucs_vfs_show_primitive, + &txqp->unsignaled, UCS_VFS_TYPE_U16, "unsignaled"); + ucs_vfs_obj_add_ro_file(parent_obj, ucs_vfs_show_primitive, + &txqp->available, UCS_VFS_TYPE_I16, "available"); +} + ucs_status_t uct_rc_fc_init(uct_rc_fc_t *fc, int16_t winsize UCS_STATS_ARG(ucs_stats_node_t* stats_parent)) { @@ -155,8 +166,9 @@ UCS_CLASS_INIT_FUNC(uct_rc_ep_t, uct_rc_iface_t *iface, uint32_t qp_num, UCS_STATIC_ASSERT(UCT_RC_EP_FC_MASK < UINT8_MAX); ucs_arbiter_group_init(&self->arb_group); - ucs_list_add_head(&iface->ep_list, &self->list); + + ucs_debug("created rc ep %p", self); return UCS_OK; err_txqp_cleanup: @@ -346,7 +358,7 @@ ucs_arbiter_cb_result_t uct_rc_ep_process_pending(ucs_arbiter_t *arbiter, void *arg) { uct_pending_req_t *req 
= ucs_container_of(elem, uct_pending_req_t, priv);
-    uct_rc_ep_t *ep       = ucs_container_of(group, uct_rc_ep_t, arb_group);;
+    uct_rc_ep_t *ep       = ucs_container_of(group, uct_rc_ep_t, arb_group);
     uct_rc_iface_t *iface = ucs_derived_of(ep->super.super.iface,
                                            uct_rc_iface_t);
     ucs_status_t status;
@@ -358,12 +370,18 @@ ucs_arbiter_cb_result_t uct_rc_ep_process_pending(ucs_arbiter_t *arbiter,
     } else if (!uct_rc_iface_has_tx_resources(iface)) {
         /* No iface resources */
         return UCS_ARBITER_CB_RESULT_STOP;
-    } else {
-        /* No ep resources */
-        ucs_assertv(!uct_rc_ep_has_tx_resources(ep),
-                    "pending callback returned error but send resources are available");
-        return UCS_ARBITER_CB_RESULT_DESCHED_GROUP;
     }
+
+    /* No pending operations, except no-op, flush(CANCEL), and others which
+     * don't consume TX resources, are allowed to remain scheduled on an
+     * arbiter group for which flush(CANCEL) was done */
+    ucs_assert(!(ep->flags & UCT_RC_EP_FLAG_FLUSH_CANCEL));
+
+    /* No ep resources */
+    ucs_assertv(!uct_rc_ep_has_tx_resources(ep),
+                "pending callback returned error, but send resources are"
+                " available");
+    return UCS_ARBITER_CB_RESULT_DESCHED_GROUP;
 }

 ucs_arbiter_cb_result_t uct_rc_ep_arbiter_purge_cb(ucs_arbiter_t *arbiter,
@@ -379,8 +397,11 @@ ucs_arbiter_cb_result_t uct_rc_ep_arbiter_purge_cb(ucs_arbiter_t *arbiter,
                                                    arb_group);
     uct_rc_pending_req_t *freq;

-    /* Invoke user's callback only if it is not internal FC message */
-    if (ucs_likely(req->func != uct_rc_ep_fc_grant)) {
+    if (req->func == uct_rc_ep_check_progress) {
+        ep->flags &= ~UCT_RC_EP_FLAG_KEEPALIVE_PENDING;
+        ucs_mpool_put(req);
+    } else if (ucs_likely(req->func != uct_rc_ep_fc_grant)) {
+        /* Invoke the user's callback only if it is not an internal FC message */
         if (cb != NULL) {
             cb(req, cb_args->arg);
         } else {
@@ -430,9 +451,10 @@ void uct_rc_txqp_purge_outstanding(uct_rc_iface_t *iface, uct_rc_txqp_t *txqp,
     ucs_queue_for_each_extract(op, &txqp->outstanding, queue,
                                UCS_CIRCULAR_COMPARE16(op->sn, <=, sn)) {
         if (op->handler != (uct_rc_send_handler_t)ucs_mpool_put) {
-            if (warn) {
-                ucs_warn("destroying rc ep %p with uncompleted operation %p",
-                         txqp, op);
+            /* Allow a clean flush(CANCEL) op coming from the destroy flow */
+            if (warn && (op->handler != uct_rc_ep_flush_op_completion_handler)) {
+                ucs_warn("destroying txqp %p with uncompleted operation %p handler %s",
+                         txqp, op, ucs_debug_get_symbol_name(op->handler));
             }

             if (op->user_comp != NULL) {
@@ -476,7 +498,13 @@ ucs_status_t uct_rc_ep_flush(uct_rc_ep_t *ep, int16_t max_available,
                                            uct_rc_iface_t);

     if (!uct_rc_iface_has_tx_resources(iface) ||
-        !uct_rc_ep_has_tx_resources(ep)) {
+        (uct_rc_txqp_available(&ep->txqp) <= 0)) {
+        return UCS_ERR_NO_RESOURCE;
+    }
+
+    /* Ignore FC limitations when performing flush(CANCEL) */
+    if (!uct_rc_fc_has_resources(iface, &ep->fc) &&
+        !(flags & UCT_FLUSH_FLAG_CANCEL)) {
         return UCS_ERR_NO_RESOURCE;
     }

@@ -485,9 +513,93 @@ ucs_status_t uct_rc_ep_flush(uct_rc_ep_t *ep, int16_t max_available,
         return UCS_OK;
     }

+    if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) {
+        ep->flags |= UCT_RC_EP_FLAG_FLUSH_CANCEL;
+    }
+
     return UCS_INPROGRESS;
 }
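
The flush(CANCEL) path above only latches UCT_RC_EP_FLAG_FLUSH_CANCEL and
returns UCS_INPROGRESS; outstanding operations complete later with
UCS_ERR_CANCELED. A minimal caller-side sketch of this protocol (a
hypothetical helper, not part of this patch), assuming a connected uct_ep_h
and its uct_worker_h:

/* Cancel whatever is outstanding on an endpoint before destroying it:
 * purge the pending queue, then repeat flush(CANCEL) while progressing
 * the worker until the flush fully completes. */
static void ep_cancel_and_destroy(uct_ep_h ep, uct_worker_h worker)
{
    ucs_status_t status;

    uct_ep_pending_purge(ep, NULL, NULL);
    do {
        status = uct_ep_flush(ep, UCT_FLUSH_FLAG_CANCEL, NULL);
        uct_worker_progress(worker);
    } while ((status == UCS_ERR_NO_RESOURCE) || (status == UCS_INPROGRESS));
    uct_ep_destroy(ep);
}
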
+static ucs_status_t uct_rc_ep_check_internal(uct_ep_h tl_ep)
+{
+    uct_rc_ep_t *ep         = ucs_derived_of(tl_ep, uct_rc_ep_t);
+    uct_rc_iface_t *iface   = ucs_derived_of(tl_ep->iface,
+                                             uct_rc_iface_t);
+    uct_rc_iface_ops_t *ops = ucs_derived_of(iface->super.ops, uct_rc_iface_ops_t);
+
+    /* If no TX resources are available, then there is at least one signaled
+     * operation in flight which will provide the actual peer status, so just
+     * return without any action */
+    UCT_RC_CHECK_TXQP_RET(iface, ep, UCS_OK);
+
+    /* If no iface resources are available, return NO_RESOURCE so that the
+     * request is added to the pending queue */
+    UCT_RC_CHECK_CQE_RET(iface, ep, UCS_ERR_NO_RESOURCE);
+
+    ops->ep_post_check(tl_ep);
+
+    return UCS_OK;
+}
+
+static ucs_status_t uct_rc_ep_check_progress(uct_pending_req_t *self)
+{
+    uct_rc_pending_req_t *req = ucs_derived_of(self, uct_rc_pending_req_t);
+    uct_rc_ep_t *ep           = ucs_derived_of(req->ep, uct_rc_ep_t);
+    ucs_status_t status;
+
+    ucs_assert(ep->flags & UCT_RC_EP_FLAG_KEEPALIVE_PENDING);
+
+    status = uct_rc_ep_check_internal(req->ep);
+    if (status == UCS_OK) {
+        ep->flags &= ~UCT_RC_EP_FLAG_KEEPALIVE_PENDING;
+        ucs_mpool_put(req);
+    } else {
+        ucs_assert(status == UCS_ERR_NO_RESOURCE);
+    }
+
+    return status;
+}
+
+ucs_status_t
+uct_rc_ep_check(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp)
+{
+    uct_rc_ep_t *ep       = ucs_derived_of(tl_ep, uct_rc_ep_t);
+    uct_rc_iface_t *iface = ucs_derived_of(tl_ep->iface,
+                                           uct_rc_iface_t);
+    uct_rc_pending_req_t *req;
+    ucs_status_t status;
+
+    UCT_EP_KEEPALIVE_CHECK_PARAM(flags, comp);
+
+    ucs_assert(ep->flags & UCT_RC_EP_FLAG_CONNECTED);
+
+    if (ep->flags & UCT_RC_EP_FLAG_KEEPALIVE_PENDING) {
+        /* a keepalive request is already in the pending queue and will be
+         * processed when resources are available */
+        return UCS_OK;
+    }
+
+    status = uct_rc_ep_check_internal(tl_ep);
+    if (status != UCS_ERR_NO_RESOURCE) {
+        ucs_assert(status == UCS_OK);
+        return status;
+    }
+
+    /* there are no iface resources, so add a pending request */
+    req = ucs_mpool_get(&iface->tx.pending_mp);
+    if (req == NULL) {
+        return UCS_ERR_NO_MEMORY;
+    }
+
+    req->ep         = &ep->super.super;
+    req->super.func = uct_rc_ep_check_progress;
+    status          = uct_rc_ep_pending_add(tl_ep, &req->super, 0);
+    ep->flags      |= UCT_RC_EP_FLAG_KEEPALIVE_PENDING;
+    ucs_assert_always(status == UCS_OK);
+
+    return UCS_OK;
+}
+
 #define UCT_RC_DEFINE_ATOMIC_HANDLER_FUNC(_num_bits, _is_be) \
 void UCT_RC_DEFINE_ATOMIC_HANDLER_FUNC_NAME(_num_bits, _is_be) \
     (uct_rc_iface_send_op_t *op, const void *resp) \
diff --git a/src/uct/ib/rc/base/rc_ep.h b/src/uct/ib/rc/base/rc_ep.h
index 9eef07f16d4..32def16d902 100644
--- a/src/uct/ib/rc/base/rc_ep.h
+++ b/src/uct/ib/rc/base/rc_ep.h
@@ -10,7 +10,7 @@
 #include "rc_iface.h"

 #include
-#include
+#include

 #define RC_UNSIGNALED_INF UINT16_MAX
@@ -42,32 +42,38 @@ enum {
     /* Keepalive Request scheduled: indicates that keepalive request
      * is scheduled in pending queue and no more keepalive actions
      * are needed */
-    UCT_RC_EP_FLAG_KEEPALIVE_PENDING = UCS_BIT(0),
+    UCT_RC_EP_FLAG_KEEPALIVE_PENDING   = UCS_BIT(0),

     /* EP is connected to peer */
-    UCT_RC_EP_FLAG_CONNECTED = UCS_BIT(1),
+    UCT_RC_EP_FLAG_CONNECTED           = UCS_BIT(1),
+
+    /* Flush cancel was executed on EP */
+    UCT_RC_EP_FLAG_FLUSH_CANCEL        = UCS_BIT(2),
+
+    /* Error handler was already called, or flush(CANCEL) disabled it */
+    UCT_RC_EP_FLAG_ERR_HANDLER_INVOKED = UCS_BIT(3),

     /* Soft Credit Request: indicates that peer needs to piggy-back credits
      * grant to counter AM (if any). Can be bundled with
      * UCT_RC_EP_FLAG_FC_GRANT */
-    UCT_RC_EP_FLAG_FC_SOFT_REQ = UCS_BIT(UCT_AM_ID_BITS),
+    UCT_RC_EP_FLAG_FC_SOFT_REQ         = UCS_BIT(UCT_AM_ID_BITS),

     /* Hard Credit Request: indicates that wnd is close to be exhausted.
      * The peer must send separate AM with credit grant as soon as it
      * receives AM with this bit set.
Can be bundled with * UCT_RC_EP_FLAG_FC_GRANT */ - UCT_RC_EP_FLAG_FC_HARD_REQ = UCS_BIT((UCT_AM_ID_BITS) + 1), + UCT_RC_EP_FLAG_FC_HARD_REQ = UCS_BIT((UCT_AM_ID_BITS) + 1), /* Credit Grant: ep should update its FC wnd as soon as it receives AM with * this bit set. Can be bundled with either soft or hard request bits */ - UCT_RC_EP_FLAG_FC_GRANT = UCS_BIT((UCT_AM_ID_BITS) + 2), + UCT_RC_EP_FLAG_FC_GRANT = UCS_BIT((UCT_AM_ID_BITS) + 2), /* Special FC AM with Credit Grant: Just an empty message indicating * credit grant. Can't be bundled with any other FC flag (as it consumes * all 3 FC bits). */ - UCT_RC_EP_FC_PURE_GRANT = (UCT_RC_EP_FLAG_FC_HARD_REQ | - UCT_RC_EP_FLAG_FC_SOFT_REQ | - UCT_RC_EP_FLAG_FC_GRANT) + UCT_RC_EP_FC_PURE_GRANT = (UCT_RC_EP_FLAG_FC_HARD_REQ | + UCT_RC_EP_FLAG_FC_SOFT_REQ | + UCT_RC_EP_FLAG_FC_GRANT) }; /* @@ -277,6 +283,9 @@ void uct_rc_txqp_purge_outstanding(uct_rc_iface_t *iface, uct_rc_txqp_t *txqp, ucs_status_t uct_rc_ep_flush(uct_rc_ep_t *ep, int16_t max_available, unsigned flags); +ucs_status_t +uct_rc_ep_check(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp); + void uct_rc_ep_cleanup_qp(uct_rc_iface_t *iface, uct_rc_ep_t *ep, uct_rc_ep_cleanup_ctx_t *cleanup_ctx, uint32_t qp_num); @@ -296,6 +305,7 @@ ucs_status_t uct_rc_txqp_init(uct_rc_txqp_t *txqp, uct_rc_iface_t *iface, uint32_t qp_num UCS_STATS_ARG(ucs_stats_node_t* stats_parent)); void uct_rc_txqp_cleanup(uct_rc_iface_t *iface, uct_rc_txqp_t *txqp); +void uct_rc_txqp_vfs_populate(uct_rc_txqp_t *txqp, void *parent_obj); static inline int16_t uct_rc_txqp_available(uct_rc_txqp_t *txqp) { @@ -380,6 +390,16 @@ uct_rc_txqp_add_send_comp(uct_rc_iface_t *iface, uct_rc_txqp_t *txqp, uct_rc_txqp_add_send_op_sn(txqp, op, sn); } +static inline void +uct_rc_ep_init_send_op(uct_rc_iface_send_op_t *op, unsigned flags, + uct_completion_t *comp, + uct_rc_send_handler_t handler) +{ + op->flags = flags; + op->user_comp = comp; + op->handler = handler; +} + static UCS_F_ALWAYS_INLINE ucs_status_t uct_rc_txqp_add_flush_comp(uct_rc_iface_t *iface, uct_base_ep_t *ep, uct_rc_txqp_t *txqp, uct_completion_t *comp, @@ -394,13 +414,11 @@ uct_rc_txqp_add_flush_comp(uct_rc_iface_t *iface, uct_base_ep_t *ep, return UCS_ERR_NO_MEMORY; } - op->flags = 0; - op->user_comp = comp; + uct_rc_ep_init_send_op(op, 0, comp, uct_rc_ep_flush_op_completion_handler); + op->iface = iface; uct_rc_txqp_add_send_op_sn(txqp, op, sn); - VALGRIND_MAKE_MEM_DEFINED(op, sizeof(*op)); /* handler set by mpool init */ } UCT_TL_EP_STAT_FLUSH_WAIT(ep); - return UCS_INPROGRESS; } diff --git a/src/uct/ib/rc/base/rc_iface.c b/src/uct/ib/rc/base/rc_iface.c index 7547141edaa..ed70989aa80 100644 --- a/src/uct/ib/rc/base/rc_iface.c +++ b/src/uct/ib/rc/base/rc_iface.c @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -15,6 +15,7 @@ #include #include #include +#include static const char *uct_rc_fence_mode_values[] = { @@ -90,6 +91,11 @@ ucs_config_field_t uct_rc_iface_common_config_table[] = { "Maximal number of bytes simultaneously transferred by get/RDMA_READ operations.", ucs_offsetof(uct_rc_iface_common_config_t, tx.max_get_bytes), UCS_CONFIG_TYPE_MEMUNITS}, + {"TX_POLL_ALWAYS", "n", + "When enabled, TX completions are polled every time the progress function is invoked.\n" + "Otherwise poll TX completions only if no RX completions found.", + ucs_offsetof(uct_rc_iface_common_config_t, tx.poll_always), UCS_CONFIG_TYPE_BOOL}, + {NULL} }; @@ -139,21 +145,10 @@ static ucs_mpool_ops_t uct_rc_pending_mpool_ops = { .obj_cleanup = NULL }; -static void -uct_rc_iface_flush_comp_init(ucs_mpool_t *mp, void *obj, void *chunk) -{ - uct_rc_iface_t *iface = ucs_container_of(mp, uct_rc_iface_t, tx.send_op_mp); - uct_rc_iface_send_op_t *op = obj; - - op->handler = uct_rc_ep_flush_op_completion_handler; - op->flags = 0; - op->iface = iface; -} - static ucs_mpool_ops_t uct_rc_send_op_mpool_ops = { .chunk_alloc = ucs_mpool_chunk_malloc, .chunk_release = ucs_mpool_chunk_free, - .obj_init = uct_rc_iface_flush_comp_init, + .obj_init = NULL, .obj_cleanup = NULL }; @@ -354,8 +349,8 @@ ucs_status_t uct_rc_iface_fc_handler(uct_rc_iface_t *iface, unsigned qp_num, ucs_assert(iface->config.fc_enabled); - if (ep == NULL) { - /* We get fc for ep which is being removed so should ignore it */ + if ((ep == NULL) || (ep->flags & UCT_RC_EP_FLAG_FLUSH_CANCEL)) { + /* We get fc for ep which is being removed or canceled so should ignore it */ goto out; } @@ -409,8 +404,7 @@ ucs_status_t uct_rc_iface_fc_handler(uct_rc_iface_t *iface, unsigned qp_num, if (status == UCS_ERR_NO_RESOURCE){ /* force add request to group & schedule group to eliminate * FC deadlock */ - uct_pending_req_arb_group_push_head(&iface->tx.arbiter, - &ep->arb_group, &fc_req->super); + uct_pending_req_arb_group_push_head(&ep->arb_group, &fc_req->super); ucs_arbiter_group_schedule(&iface->tx.arbiter, &ep->arb_group); } else { ucs_assertv_always(status == UCS_OK, "Failed to send FC grant msg: %s", @@ -450,7 +444,7 @@ static ucs_status_t uct_rc_iface_tx_ops_init(uct_rc_iface_t *iface) status = ucs_mpool_init(&iface->tx.send_op_mp, 0, sizeof(*op), 0, UCS_SYS_CACHE_LINE_SIZE, 256, UINT_MAX, &uct_rc_send_op_mpool_ops, - "flush-comps-only"); + "send-ops-mpool"); return status; } @@ -516,42 +510,47 @@ static int uct_rc_iface_config_limit_value(const char *name, } } -UCS_CLASS_INIT_FUNC(uct_rc_iface_t, uct_rc_iface_ops_t *ops, uct_md_h md, - uct_worker_h worker, const uct_iface_params_t *params, +UCS_CLASS_INIT_FUNC(uct_rc_iface_t, uct_rc_iface_ops_t *ops, + uct_iface_ops_t *tl_ops, uct_md_h md, uct_worker_h worker, + const uct_iface_params_t *params, const uct_rc_iface_common_config_t *config, - uct_ib_iface_init_attr_t *init_attr) + const uct_ib_iface_init_attr_t *init_attr) { uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev; uint32_t max_ib_msg_size; ucs_status_t status; - - UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, &ops->super, md, worker, params, - &config->super, init_attr); - - self->tx.cq_available = init_attr->cq_len[UCT_IB_DIR_TX] - 1; - self->rx.srq.available = 0; - self->rx.srq.quota = 0; - self->config.tx_qp_len = config->super.tx.queue_len; - self->config.tx_min_sge = config->super.tx.min_sge; - self->config.tx_min_inline = config->super.tx.min_inline; - self->config.tx_ops_count = init_attr->cq_len[UCT_IB_DIR_TX]; - self->config.min_rnr_timer = 
uct_ib_to_rnr_fabric_time(config->tx.rnr_timeout); - self->config.timeout = uct_ib_to_qp_fabric_time(config->tx.timeout); - self->config.rnr_retry = uct_rc_iface_config_limit_value( - "RNR_RETRY_COUNT", - config->tx.rnr_retry_count, - UCT_RC_QP_MAX_RETRY_COUNT); - self->config.retry_cnt = uct_rc_iface_config_limit_value( - "RETRY_COUNT", - config->tx.retry_count, - UCT_RC_QP_MAX_RETRY_COUNT); - self->config.max_rd_atomic = config->max_rd_atomic; - self->config.ooo_rw = config->ooo_rw; + unsigned tx_cq_size; + + UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, &ops->super, tl_ops, md, worker, + params, &config->super, init_attr); + + tx_cq_size = uct_ib_cq_size(&self->super, init_attr, + UCT_IB_DIR_TX); + self->tx.cq_available = tx_cq_size - 1; + self->rx.srq.available = 0; + self->rx.srq.quota = 0; + self->config.tx_qp_len = config->super.tx.queue_len; + self->config.tx_min_sge = config->super.tx.min_sge; + self->config.tx_min_inline = config->super.tx.min_inline; + self->config.tx_poll_always = config->tx.poll_always; + self->config.tx_ops_count = tx_cq_size; + self->config.min_rnr_timer = uct_ib_to_rnr_fabric_time(config->tx.rnr_timeout); + self->config.timeout = uct_ib_to_qp_fabric_time(config->tx.timeout); + self->config.rnr_retry = uct_rc_iface_config_limit_value( + "RNR_RETRY_COUNT", + config->tx.rnr_retry_count, + UCT_RC_QP_MAX_RETRY_COUNT); + self->config.retry_cnt = uct_rc_iface_config_limit_value( + "RETRY_COUNT", + config->tx.retry_count, + UCT_RC_QP_MAX_RETRY_COUNT); + self->config.max_rd_atomic = config->max_rd_atomic; + self->config.ooo_rw = config->ooo_rw; #if UCS_ENABLE_ASSERT - self->config.tx_cq_len = init_attr->cq_len[UCT_IB_DIR_TX]; - self->tx.in_pending = 0; + self->config.tx_cq_len = tx_cq_size; + self->tx.in_pending = 0; #endif - max_ib_msg_size = uct_ib_iface_port_attr(&self->super)->max_msg_sz; + max_ib_msg_size = uct_ib_iface_port_attr(&self->super)->max_msg_sz; if (config->tx.max_get_zcopy == UCS_MEMUNITS_AUTO) { self->config.max_get_zcopy = max_ib_msg_size; @@ -588,7 +587,7 @@ UCS_CLASS_INIT_FUNC(uct_rc_iface_t, uct_rc_iface_ops_t *ops, uct_md_h md, } /* Create RX buffers mempool */ - status = uct_ib_iface_recv_mpool_init(&self->super, &config->super, + status = uct_ib_iface_recv_mpool_init(&self->super, &config->super, params, "rc_recv_desc", &self->rx.mp); if (status != UCS_OK) { goto err; @@ -652,13 +651,13 @@ UCS_CLASS_INIT_FUNC(uct_rc_iface_t, uct_rc_iface_ops_t *ops, uct_md_h md, * Then FC window size is the same for all endpoints as well. * TODO: Make wnd size to be a property of the particular interface. 
* We could distribute it via rc address then.*/ - self->config.fc_wnd_size = ucs_min(config->fc.wnd_size, - config->super.rx.queue_len); - self->config.fc_hard_thresh = ucs_max((int)(self->config.fc_wnd_size * - config->fc.hard_thresh), 1); + self->config.fc_wnd_size = ucs_min(config->fc.wnd_size, + config->super.rx.queue_len); + self->config.fc_hard_thresh = ucs_max((int)(self->config.fc_wnd_size * + config->fc.hard_thresh), 1); } else { - self->config.fc_wnd_size = INT16_MAX; - self->config.fc_hard_thresh = 0; + self->config.fc_wnd_size = INT16_MAX; + self->config.fc_hard_thresh = 0; } return UCS_OK; @@ -908,3 +907,16 @@ ucs_status_t uct_rc_iface_fence(uct_iface_h tl_iface, unsigned flags) UCT_TL_IFACE_STAT_FENCE(&iface->super.super); return UCS_OK; } + +void uct_rc_iface_vfs_populate(uct_rc_iface_t *iface) +{ + ucs_vfs_obj_add_ro_file(iface, ucs_vfs_show_primitive, + &iface->tx.cq_available, UCS_VFS_TYPE_INT, + "cq_available"); + ucs_vfs_obj_add_ro_file(iface, ucs_vfs_show_primitive, + &iface->tx.reads_available, UCS_VFS_TYPE_SSIZET, + "reads_available"); + ucs_vfs_obj_add_ro_file(iface, ucs_vfs_show_primitive, + &iface->tx.reads_completed, UCS_VFS_TYPE_SSIZET, + "reads_completed"); +} diff --git a/src/uct/ib/rc/base/rc_iface.h b/src/uct/ib/rc/base/rc_iface.h index 5577be9b11b..d243ead0aa7 100644 --- a/src/uct/ib/rc/base/rc_iface.h +++ b/src/uct/ib/rc/base/rc_iface.h @@ -23,9 +23,9 @@ #define UCT_RC_QP_TABLE_MEMB_ORDER (UCT_IB_QPN_ORDER - UCT_RC_QP_TABLE_ORDER) #define UCT_RC_QP_MAX_RETRY_COUNT 7 -#define UCT_RC_CHECK_AM_SHORT(_am_id, _length, _max_inline) \ +#define UCT_RC_CHECK_AM_SHORT(_am_id, _length, _header_t, _max_inline) \ UCT_CHECK_AM_ID(_am_id); \ - UCT_CHECK_LENGTH(sizeof(uct_rc_am_short_hdr_t) + _length, 0, _max_inline, "am_short"); + UCT_CHECK_LENGTH(sizeof(_header_t) + _length, 0, _max_inline, "am_short"); #define UCT_RC_CHECK_ZCOPY_DATA(_header_length, _length, _seg_size) \ UCT_CHECK_LENGTH(_header_length + _length, 0, _seg_size, "am_zcopy payload"); \ @@ -157,6 +157,7 @@ typedef struct uct_rc_iface_common_config { unsigned rnr_retry_count; size_t max_get_zcopy; size_t max_get_bytes; + int poll_always; } tx; struct { @@ -188,7 +189,8 @@ typedef struct uct_rc_iface_ops { uct_rc_hdr_t *hdr, unsigned length, uint32_t imm_data, uint16_t lid, unsigned flags); - void (*cleanup_qp)(uct_ib_async_event_wait_t *cleanup_ctx); + unsigned (*cleanup_qp)(void *arg); + void (*ep_post_check)(uct_ep_h tl_ep); } uct_rc_iface_ops_t; @@ -234,6 +236,7 @@ struct uct_rc_iface { unsigned tx_min_inline; unsigned tx_ops_count; uint16_t tx_moderation; + uint8_t tx_poll_always; /* Threshold to send "soft" FC credit request. The peer will try to * piggy-back credits grant to the counter AM, if any. 
*/ @@ -275,9 +278,10 @@ struct uct_rc_iface { /* Progress function (either regular or TM aware) */ ucs_callback_t progress; }; -UCS_CLASS_DECLARE(uct_rc_iface_t, uct_rc_iface_ops_t*, uct_md_h, uct_worker_h, - const uct_iface_params_t*, const uct_rc_iface_common_config_t*, - uct_ib_iface_init_attr_t*); +UCS_CLASS_DECLARE(uct_rc_iface_t, uct_rc_iface_ops_t*, uct_iface_ops_t*, + uct_md_h, uct_worker_h, const uct_iface_params_t*, + const uct_rc_iface_common_config_t*, + const uct_ib_iface_init_attr_t*); struct uct_rc_iface_send_op { @@ -294,6 +298,7 @@ struct uct_rc_iface_send_op { void *unpack_arg; /* get_bcopy / desc */ uct_rc_iface_t *iface; /* should not be used with get_bcopy completions */ + uct_ep_h ep; /* ep on which we sent ep_check */ }; uct_completion_t *user_comp; #ifndef NVALGRIND @@ -380,6 +385,8 @@ ucs_status_t uct_rc_iface_init_rx(uct_rc_iface_t *iface, ucs_status_t uct_rc_iface_fence(uct_iface_h tl_iface, unsigned flags); +void uct_rc_iface_vfs_populate(uct_rc_iface_t *iface); + static UCS_F_ALWAYS_INLINE ucs_status_t uct_rc_fc_ctrl(uct_ep_t *ep, unsigned op, uct_rc_pending_req_t *req) { @@ -529,4 +536,10 @@ uct_rc_iface_invoke_pending_cb(uct_rc_iface_t *iface, uct_pending_req_t *req) return status; } +static UCS_F_ALWAYS_INLINE int +uct_rc_iface_poll_tx(uct_rc_iface_t *iface, unsigned count) +{ + return (count == 0) || iface->config.tx_poll_always; +} + #endif diff --git a/src/uct/ib/rc/verbs/rc_verbs.h b/src/uct/ib/rc/verbs/rc_verbs.h index b75b819fe98..cfce29f6869 100644 --- a/src/uct/ib/rc/verbs/rc_verbs.h +++ b/src/uct/ib/rc/verbs/rc_verbs.h @@ -27,11 +27,17 @@ enum { }; +enum { + UCT_RC_VERBS_FLUSH_MODE_RDMA_WRITE_0, + UCT_RC_VERBS_FLUSH_MODE_FLOW_CONTROL, + UCT_RC_VERBS_FLUSH_MODE_AUTO, + UCT_RC_VERBS_FLUSH_MODE_LAST +}; + + typedef struct uct_rc_verbs_ep_address { uint8_t flags; uct_ib_uint24_t qp_num; - uint64_t flush_addr; - uint32_t flush_rkey; } UCS_S_PACKED uct_rc_verbs_ep_address_t; @@ -49,10 +55,6 @@ typedef struct uct_rc_verbs_ep { uct_rc_verbs_txcnt_t txcnt; uct_ib_fence_info_t fi; struct ibv_qp *qp; - struct { - uintptr_t remote_addr; - uint32_t rkey; - } flush; } uct_rc_verbs_ep_t; @@ -63,6 +65,7 @@ typedef struct uct_rc_verbs_iface_config { uct_rc_iface_config_t super; size_t max_am_hdr; unsigned tx_max_wr; + unsigned flush_mode; } uct_rc_verbs_iface_config_t; @@ -74,23 +77,20 @@ typedef struct uct_rc_verbs_iface { struct ibv_srq *srq; struct ibv_send_wr inl_am_wr; struct ibv_send_wr inl_rwrite_wr; - struct ibv_sge inl_sge[2]; + struct ibv_sge inl_sge[UCT_IB_MAX_IOV]; uct_rc_am_short_hdr_t am_inl_hdr; ucs_mpool_t short_desc_mp; uct_rc_iface_send_desc_t *fc_desc; /* used when max_inline is zero */ - struct ibv_mr *flush_mr; /* MR for writing dummy value to flush */ - void *flush_mem; struct { size_t short_desc_size; size_t max_inline; size_t max_send_sge; unsigned tx_max_wr; + uint8_t flush_by_fc; } config; } uct_rc_verbs_iface_t; -ucs_status_t uct_rc_verbs_iface_flush_mem_create(uct_rc_verbs_iface_t *iface); - UCS_CLASS_DECLARE(uct_rc_verbs_ep_t, const uct_ep_params_t *); UCS_CLASS_DECLARE_NEW_FUNC(uct_rc_verbs_ep_t, uct_ep_t, const uct_ep_params_t *); UCS_CLASS_DECLARE_DELETE_FUNC(uct_rc_verbs_ep_t, uct_ep_t); @@ -122,6 +122,9 @@ ucs_status_t uct_rc_verbs_ep_get_zcopy(uct_ep_h tl_ep, ucs_status_t uct_rc_verbs_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, const void *buffer, unsigned length); +ucs_status_t uct_rc_verbs_ep_am_short_iov(uct_ep_h ep, uint8_t id, + const uct_iov_t *iov, size_t iovcnt); + ssize_t uct_rc_verbs_ep_am_bcopy(uct_ep_h tl_ep, 
uint8_t id, uct_pack_callback_t pack_cb, void *arg, unsigned flags); @@ -148,18 +151,17 @@ ucs_status_t uct_rc_verbs_ep_flush(uct_ep_h tl_ep, unsigned flags, ucs_status_t uct_rc_verbs_ep_fence(uct_ep_h tl_ep, unsigned flags); +void uct_rc_verbs_ep_post_check(uct_ep_h tl_ep); + ucs_status_t uct_rc_verbs_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, uct_rc_pending_req_t *req); -ucs_status_t uct_rc_verbs_ep_handle_failure(uct_rc_verbs_ep_t *ep, - ucs_status_t status); - ucs_status_t uct_rc_verbs_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr); ucs_status_t uct_rc_verbs_ep_connect_to_ep(uct_ep_h tl_ep, const uct_device_addr_t *dev_addr, const uct_ep_addr_t *ep_addr); -void uct_rc_verbs_ep_cleanup_qp(uct_ib_async_event_wait_t *wait_ctx); +unsigned uct_rc_verbs_ep_cleanup_qp(void *arg); #endif diff --git a/src/uct/ib/rc/verbs/rc_verbs_ep.c b/src/uct/ib/rc/verbs/rc_verbs_ep.c index 5ed5c6aa098..cac8ab98623 100644 --- a/src/uct/ib/rc/verbs/rc_verbs_ep.c +++ b/src/uct/ib/rc/verbs/rc_verbs_ep.c @@ -264,7 +264,7 @@ ucs_status_t uct_rc_verbs_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_verbs_iface_t); uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); - UCT_RC_CHECK_AM_SHORT(id, length, iface->config.max_inline); + UCT_RC_CHECK_AM_SHORT(id, length, uct_rc_am_short_hdr_t, iface->config.max_inline); UCT_RC_CHECK_RES_AND_FC(&iface->super, &ep->super, id); uct_rc_verbs_iface_fill_inl_am_sge(iface, id, hdr, buffer, length); UCT_TL_EP_STAT_OP(&ep->super.super, AM, SHORT, sizeof(hdr) + length); @@ -275,6 +275,25 @@ ucs_status_t uct_rc_verbs_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, return UCS_OK; } +ucs_status_t uct_rc_verbs_ep_am_short_iov(uct_ep_h tl_ep, uint8_t id, + const uct_iov_t *iov, size_t iovcnt) +{ + uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_verbs_iface_t); + uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); + + UCT_RC_CHECK_AM_SHORT(id, uct_iov_total_length(iov, iovcnt), uct_rc_hdr_t, + iface->config.max_inline); + UCT_RC_CHECK_RES_AND_FC(&iface->super, &ep->super, id); + UCT_CHECK_IOV_SIZE(iovcnt, UCT_IB_MAX_IOV - 1, "uct_rc_verbs_ep_am_short_iov"); + uct_rc_verbs_iface_fill_inl_am_sge_iov(iface, id, iov, iovcnt); + UCT_TL_EP_STAT_OP(&ep->super.super, AM, SHORT, uct_iov_total_length(iov, iovcnt)); + uct_rc_verbs_ep_post_send(iface, ep, &iface->inl_am_wr, + IBV_SEND_INLINE | IBV_SEND_SOLICITED, INT_MAX); + UCT_RC_UPDATE_FC(&iface->super, &ep->super, id); + + return UCS_OK; +} + ssize_t uct_rc_verbs_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_cb, void *arg, unsigned flags) @@ -383,7 +402,7 @@ ucs_status_t uct_rc_verbs_ep_atomic_cswap64(uct_ep_h tl_ep, uint64_t compare, ui remote_addr, rkey, comp); } -static ucs_status_t uct_rc_verbs_ep_post_flush(uct_rc_verbs_ep_t *ep) +static void uct_rc_verbs_ep_post_flush(uct_rc_verbs_ep_t *ep, int send_flags) { uct_rc_verbs_iface_t *iface = ucs_derived_of(ep->super.super.super.iface, uct_rc_verbs_iface_t); @@ -391,48 +410,51 @@ static ucs_status_t uct_rc_verbs_ep_post_flush(uct_rc_verbs_ep_t *ep) struct ibv_sge sge; int inl_flag; - UCT_RC_CHECK_RES(&iface->super, &ep->super); + if (iface->config.flush_by_fc || (iface->config.max_inline == 0)) { + /* Flush by flow control pure grant, in case the device does not + * support 0-size RDMA_WRITE or does not support inline. 
+ */ + sge.addr = (uintptr_t)(iface->fc_desc + 1); + sge.length = sizeof(uct_rc_hdr_t); + sge.lkey = iface->fc_desc->lkey; + wr.sg_list = &sge; + wr.num_sge = 1; + wr.opcode = IBV_WR_SEND; + inl_flag = 0; + } else { + /* Flush by empty RDMA_WRITE */ + wr.sg_list = NULL; + wr.num_sge = 0; + wr.opcode = IBV_WR_RDMA_WRITE; + wr.wr.rdma.remote_addr = 0; + wr.wr.rdma.rkey = 0; + inl_flag = IBV_SEND_INLINE; + } + wr.next = NULL; - /* - * Send small RDMA_WRITE as a flush operation - * (some adapters do not support 0-size RDMA_WRITE or inline sends) - */ - sge.addr = (uintptr_t)iface->flush_mem; - sge.length = 1; - sge.lkey = iface->flush_mr->lkey; - wr.next = NULL; - wr.sg_list = &sge; - wr.num_sge = 1; - wr.opcode = IBV_WR_RDMA_WRITE; - wr.wr.rdma.remote_addr = ep->flush.remote_addr; - wr.wr.rdma.rkey = ep->flush.rkey; - inl_flag = (iface->config.max_inline >= sge.length) ? - IBV_SEND_INLINE : 0; - - uct_rc_verbs_ep_post_send(iface, ep, &wr, inl_flag | IBV_SEND_SIGNALED, 1); - return UCS_OK; + uct_rc_verbs_ep_post_send(iface, ep, &wr, inl_flag | send_flags, 1); } ucs_status_t uct_rc_verbs_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp) { uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rc_verbs_iface_t); - uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); + uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); + int already_canceled = ep->super.flags & UCT_RC_EP_FLAG_FLUSH_CANCEL; ucs_status_t status; - if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) { - uct_ep_pending_purge(&ep->super.super.super, NULL, 0); - uct_rc_verbs_ep_handle_failure(ep, UCS_ERR_CANCELED); - return UCS_OK; - } - status = uct_rc_ep_flush(&ep->super, iface->config.tx_max_wr, flags); if (status != UCS_INPROGRESS) { return status; } if (uct_rc_txqp_unsignaled(&ep->super.txqp) != 0) { - status = uct_rc_verbs_ep_post_flush(ep); + UCT_RC_CHECK_RES(&iface->super, &ep->super); + uct_rc_verbs_ep_post_flush(ep, IBV_SEND_SIGNALED); + } + + if (ucs_unlikely((flags & UCT_FLUSH_FLAG_CANCEL) && !already_canceled)) { + status = uct_ib_modify_qp(ep->qp, IBV_QPS_ERR); if (status != UCS_OK) { return status; } @@ -449,6 +471,13 @@ ucs_status_t uct_rc_verbs_ep_fence(uct_ep_h tl_ep, unsigned flags) return uct_rc_ep_fence(tl_ep, &ep->fi, 1); } +void uct_rc_verbs_ep_post_check(uct_ep_h tl_ep) +{ + uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); + + uct_rc_verbs_ep_post_flush(ep, 0); +} + ucs_status_t uct_rc_verbs_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, uct_rc_pending_req_t *req) { @@ -492,21 +521,6 @@ ucs_status_t uct_rc_verbs_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, return UCS_OK; } -ucs_status_t uct_rc_verbs_ep_handle_failure(uct_rc_verbs_ep_t *ep, - ucs_status_t status) -{ - uct_rc_iface_t *iface = ucs_derived_of(ep->super.super.super.iface, - uct_rc_iface_t); - - iface->tx.cq_available += ep->txcnt.pi - ep->txcnt.ci; - /* Reset CI to prevent cq_available overrun on ep_destroy */ - ep->txcnt.ci = ep->txcnt.pi; - uct_rc_txqp_purge_outstanding(iface, &ep->super.txqp, status, ep->txcnt.pi, 0); - - return iface->super.ops->set_ep_failed(&iface->super, &ep->super.super.super, - status); -} - ucs_status_t uct_rc_verbs_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr) { uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface, @@ -514,17 +528,9 @@ ucs_status_t uct_rc_verbs_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr) uct_rc_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t); uct_ib_md_t *md = uct_ib_iface_md(&iface->super.super); uct_rc_verbs_ep_address_t 
*rc_addr = (uct_rc_verbs_ep_address_t*)addr; - ucs_status_t status; uint8_t mr_id; - status = uct_rc_verbs_iface_flush_mem_create(iface); - if (status != UCS_OK) { - return status; - } - rc_addr->flags = 0; - rc_addr->flush_addr = (uintptr_t)iface->flush_mem; - rc_addr->flush_rkey = iface->flush_mr->rkey; uct_ib_pack_uint24(rc_addr->qp_num, ep->qp->qp_num); if (md->ops->get_atomic_mr_id(md, &mr_id) == UCS_OK) { @@ -561,9 +567,6 @@ ucs_status_t uct_rc_verbs_ep_connect_to_ep(uct_ep_h tl_ep, return status; } - ep->flush.remote_addr = rc_addr->flush_addr; - ep->flush.rkey = rc_addr->flush_rkey; - if (rc_addr->flags & UCT_RC_VERBS_ADDR_HAS_ATOMIC_MR) { ep->super.atomic_mr_offset = uct_ib_md_atomic_offset(*(uint8_t*)(rc_addr + 1)); } else { @@ -620,14 +623,14 @@ typedef struct { struct ibv_qp *qp; } uct_rc_verbs_ep_cleanup_ctx_t; -void uct_rc_verbs_ep_cleanup_qp(uct_ib_async_event_wait_t *wait_ctx) +unsigned uct_rc_verbs_ep_cleanup_qp(void *arg) { - uct_rc_verbs_ep_cleanup_ctx_t *ep_cleanup_ctx - = ucs_derived_of(wait_ctx, uct_rc_verbs_ep_cleanup_ctx_t); + uct_rc_verbs_ep_cleanup_ctx_t *ep_cleanup_ctx = arg; uint32_t qp_num = ep_cleanup_ctx->qp->qp_num; uct_ib_destroy_qp(ep_cleanup_ctx->qp); uct_rc_ep_cleanup_qp_done(&ep_cleanup_ctx->super, qp_num); + return 1; } UCS_CLASS_CLEANUP_FUNC(uct_rc_verbs_ep_t) @@ -640,7 +643,6 @@ UCS_CLASS_CLEANUP_FUNC(uct_rc_verbs_ep_t) ucs_assert_always(ep_cleanup_ctx != NULL); ep_cleanup_ctx->qp = self->qp; - /* TODO should be removed by flush */ uct_rc_txqp_purge_outstanding(&iface->super, &self->super.txqp, UCS_ERR_CANCELED, self->txcnt.pi, 1); /* NOTE: usually, ci == pi here, but if user calls diff --git a/src/uct/ib/rc/verbs/rc_verbs_iface.c b/src/uct/ib/rc/verbs/rc_verbs_iface.c index f72401431e6..174491ce150 100644 --- a/src/uct/ib/rc/verbs/rc_verbs_iface.c +++ b/src/uct/ib/rc/verbs/rc_verbs_iface.c @@ -23,6 +23,14 @@ #include static uct_rc_iface_ops_t uct_rc_verbs_iface_ops; +static uct_iface_ops_t uct_rc_verbs_iface_tl_ops; + +static const char *uct_rc_verbs_flush_mode_names[] = { + [UCT_RC_VERBS_FLUSH_MODE_RDMA_WRITE_0] = "write0", + [UCT_RC_VERBS_FLUSH_MODE_FLOW_CONTROL] = "fc", + [UCT_RC_VERBS_FLUSH_MODE_AUTO] = "auto", + [UCT_RC_VERBS_FLUSH_MODE_LAST] = NULL +}; static ucs_config_field_t uct_rc_verbs_iface_config_table[] = { {"RC_", "", NULL, @@ -39,17 +47,44 @@ static ucs_config_field_t uct_rc_verbs_iface_config_table[] = { "a minimum between this value and the TX queue length. 
-1 means no limit.", ucs_offsetof(uct_rc_verbs_iface_config_t, tx_max_wr), UCS_CONFIG_TYPE_UINT}, + {"FLUSH_MODE", "auto", + "Method to use for posting flush operation:\n" + " - write0 : Post empty RDMA_WRITE\n" + " - fc : Send flow control message\n" + " - auto : Select automatically based on device support", + ucs_offsetof(uct_rc_verbs_iface_config_t, flush_mode), + UCS_CONFIG_TYPE_ENUM(uct_rc_verbs_flush_mode_names)}, + {NULL} }; +static unsigned uct_rc_verbs_get_tx_res_count(uct_rc_verbs_ep_t *ep, + struct ibv_wc *wc) +{ + return wc->wr_id - ep->txcnt.ci; +} + +static UCS_F_ALWAYS_INLINE void +uct_rc_verbs_update_tx_res(uct_rc_iface_t *iface, uct_rc_verbs_ep_t *ep, + unsigned count) +{ + ep->txcnt.ci += count; + uct_rc_txqp_available_add(&ep->super.txqp, count); + iface->tx.cq_available += count; + uct_rc_iface_update_reads(iface); + ucs_arbiter_dispatch(&iface->tx.arbiter, 1, uct_rc_ep_process_pending, + NULL); +} + static void uct_rc_verbs_handle_failure(uct_ib_iface_t *ib_iface, void *arg, - ucs_status_t status) + ucs_status_t ep_status) { struct ibv_wc *wc = arg; uct_rc_iface_t *iface = ucs_derived_of(ib_iface, uct_rc_iface_t); ucs_log_level_t log_lvl = UCS_LOG_LEVEL_FATAL; uct_rc_verbs_ep_t *ep; - ucs_status_t err_handler_status; + ucs_status_t status; + unsigned count; ep = ucs_derived_of(uct_rc_iface_lookup_ep(iface, wc->qp_num), uct_rc_verbs_ep_t); @@ -57,22 +92,31 @@ static void uct_rc_verbs_handle_failure(uct_ib_iface_t *ib_iface, void *arg, return; } - err_handler_status = uct_rc_verbs_ep_handle_failure(ep, status); - log_lvl = uct_ib_iface_failure_log_level(ib_iface, - err_handler_status, status); + count = uct_rc_verbs_get_tx_res_count(ep, wc); + uct_rc_txqp_purge_outstanding(iface, &ep->super.txqp, ep_status, + ep->txcnt.ci + count, 0); + + /* Don't need to invoke UCT pending requests for a given UCT EP */ + ucs_arbiter_group_desched(&iface->tx.arbiter, &ep->super.arb_group); + uct_rc_verbs_update_tx_res(iface, ep, count); + + if (ep->super.flags & (UCT_RC_EP_FLAG_ERR_HANDLER_INVOKED | + UCT_RC_EP_FLAG_FLUSH_CANCEL)) { + return; + } + + ep->super.flags |= UCT_RC_EP_FLAG_ERR_HANDLER_INVOKED; + + status = uct_iface_handle_ep_err(&iface->super.super.super, + &ep->super.super.super, ep_status); + log_lvl = uct_base_iface_failure_log_level(&ib_iface->super, status, + ep_status); ucs_log(log_lvl, "send completion with error: %s qpn 0x%x wrid 0x%lx vendor_err 0x%x", ibv_wc_status_str(wc->status), wc->qp_num, wc->wr_id, wc->vendor_err); } -static ucs_status_t uct_rc_verbs_ep_set_failed(uct_ib_iface_t *iface, - uct_ep_h ep, ucs_status_t status) -{ - return uct_set_ep_failed(&UCS_CLASS_NAME(uct_rc_verbs_ep_t), ep, - &iface->super.super, status); -} - ucs_status_t uct_rc_verbs_wc_to_ucs_status(enum ibv_wc_status status) { switch (status) @@ -82,6 +126,8 @@ ucs_status_t uct_rc_verbs_wc_to_ucs_status(enum ibv_wc_status status) case IBV_WC_RETRY_EXC_ERR: case IBV_WC_RNR_RETRY_EXC_ERR: return UCS_ERR_ENDPOINT_TIMEOUT; + case IBV_WC_WR_FLUSH_ERR: + return UCS_ERR_CANCELED; default: return UCS_ERR_IO_ERROR; } @@ -107,22 +153,14 @@ uct_rc_verbs_iface_poll_tx(uct_rc_verbs_iface_t *iface) continue; } - count = wc[i].wr_id - ep->txcnt.ci; + count = uct_rc_verbs_get_tx_res_count(ep, &wc[i]); ucs_trace_poll("rc_verbs iface %p tx_wc wrid 0x%lx ep %p qpn 0x%x count %d", iface, wc[i].wr_id, ep, wc[i].qp_num, count); - ep->txcnt.ci += count; - - uct_rc_txqp_completion_desc(&ep->super.txqp, ep->txcnt.ci); - - uct_rc_txqp_available_add(&ep->super.txqp, count); - iface->super.tx.cq_available += count; 
- - uct_rc_iface_update_reads(&iface->super); + uct_rc_txqp_completion_desc(&ep->super.txqp, ep->txcnt.ci + count); ucs_arbiter_group_schedule(&iface->super.tx.arbiter, &ep->super.arb_group); - ucs_arbiter_dispatch(&iface->super.tx.arbiter, 1, - uct_rc_ep_process_pending, NULL); + uct_rc_verbs_update_tx_res(&iface->super, ep, count); } return num_wcs; @@ -134,7 +172,7 @@ static unsigned uct_rc_verbs_iface_progress(void *arg) unsigned count; count = uct_rc_verbs_iface_poll_rx_common(iface); - if (count > 0) { + if (!uct_rc_iface_poll_tx(&iface->super, count)) { return count; } @@ -145,7 +183,6 @@ static void uct_rc_verbs_iface_init_inl_wrs(uct_rc_verbs_iface_t *iface) { memset(&iface->inl_am_wr, 0, sizeof(iface->inl_am_wr)); iface->inl_am_wr.sg_list = iface->inl_sge; - iface->inl_am_wr.num_sge = 2; iface->inl_am_wr.opcode = IBV_WR_SEND; iface->inl_am_wr.send_flags = IBV_SEND_INLINE; @@ -174,6 +211,7 @@ static ucs_status_t uct_rc_verbs_iface_query(uct_iface_h tl_iface, uct_iface_att return status; } + iface_attr->cap.flags |= UCT_IFACE_FLAG_EP_CHECK; iface_attr->latency.m += 1e-9; /* 1 ns per each extra QP */ iface_attr->overhead = 75e-9; /* Software overhead */ @@ -185,47 +223,6 @@ static ucs_status_t uct_rc_verbs_iface_query(uct_iface_h tl_iface, uct_iface_att return UCS_OK; } -ucs_status_t uct_rc_verbs_iface_flush_mem_create(uct_rc_verbs_iface_t *iface) -{ - uct_ib_md_t *md = uct_ib_iface_md(&iface->super.super); - ucs_status_t status; - struct ibv_mr *mr; - void *mem; - - if (iface->flush_mr != NULL) { - ucs_assert(iface->flush_mem != NULL); - return UCS_OK; - } - - /* - * Map a whole page for the remote side to issue a dummy RDMA_WRITE on it, - * to flush its outstanding operations. A whole page is used to prevent any - * other allocations from using same page, so it would be fork-safe. 
- */ - mem = ucs_mmap(NULL, ucs_get_page_size(), PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANONYMOUS, -1, 0, "flush_mem"); - if (mem == MAP_FAILED) { - ucs_error("failed to allocate page for remote flush: %m"); - status = UCS_ERR_NO_MEMORY; - goto err; - } - - status = uct_ib_reg_mr(md->pd, mem, ucs_get_page_size(), - UCT_IB_MEM_ACCESS_FLAGS, &mr, 0); - if (status != UCS_OK) { - goto err_munmap; - } - - iface->flush_mem = mem; - iface->flush_mr = mr; - return UCS_OK; - -err_munmap: - ucs_munmap(mem, ucs_get_page_size()); -err: - return status; -} - static ucs_status_t uct_rc_iface_verbs_init_rx(uct_rc_iface_t *rc_iface, const uct_rc_iface_common_config_t *config) @@ -249,21 +246,24 @@ static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_md_h tl_md, { uct_rc_verbs_iface_config_t *config = ucs_derived_of(tl_config, uct_rc_verbs_iface_config_t); + uct_ib_iface_config_t *ib_config = &config->super.super.super; + uct_ib_iface_init_attr_t init_attr = {}; + uct_ib_qp_attr_t attr = {}; + const char *dev_name; ucs_status_t status; - uct_ib_iface_init_attr_t init_attr = {}; - uct_ib_qp_attr_t attr = {}; struct ibv_qp *qp; uct_rc_hdr_t *hdr; init_attr.fc_req_size = sizeof(uct_rc_pending_req_t); init_attr.rx_hdr_len = sizeof(uct_rc_hdr_t); init_attr.qp_type = IBV_QPT_RC; - init_attr.cq_len[UCT_IB_DIR_RX] = config->super.super.super.rx.queue_len; + init_attr.cq_len[UCT_IB_DIR_RX] = ib_config->rx.queue_len; init_attr.cq_len[UCT_IB_DIR_TX] = config->super.tx_cq_len; - init_attr.seg_size = config->super.super.super.seg_size; + init_attr.seg_size = ib_config->seg_size; - UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, &uct_rc_verbs_iface_ops, tl_md, - worker, params, &config->super.super, &init_attr); + UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, &uct_rc_verbs_iface_ops, + &uct_rc_verbs_iface_tl_ops, tl_md, worker, params, + &config->super.super, &init_attr); self->config.tx_max_wr = ucs_min(config->tx_max_wr, self->super.config.tx_qp_len); @@ -271,8 +271,7 @@ static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_md_h tl_md, self->config.tx_max_wr / 4); self->super.config.fence_mode = (uct_rc_fence_mode_t)config->super.super.fence_mode; self->super.progress = uct_rc_verbs_iface_progress; - self->flush_mem = NULL; - self->flush_mr = NULL; + self->super.super.config.sl = uct_ib_iface_config_select_sl(ib_config); if ((config->super.super.fence_mode == UCT_RC_FENCE_MODE_WEAK) || (config->super.super.fence_mode == UCT_RC_FENCE_MODE_AUTO)) { @@ -294,6 +293,17 @@ static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_md_h tl_md, self->config.short_desc_size = ucs_max(UCT_IB_MAX_ATOMIC_SIZE, self->config.short_desc_size); + /* Flush mode */ + if (config->flush_mode == UCT_RC_VERBS_FLUSH_MODE_AUTO) { + /* Use flow control for flush on older devices */ + dev_name = uct_ib_device_name( + uct_ib_iface_device(&self->super.super)); + self->config.flush_by_fc = (strstr(dev_name, "mthca") == dev_name); + } else { + self->config.flush_by_fc = (config->flush_mode == + UCT_RC_VERBS_FLUSH_MODE_FLOW_CONTROL); + } + /* Create AM headers and Atomic mempool */ status = uct_iface_mpool_init(&self->super.super.super, &self->short_desc_mp, @@ -301,7 +311,7 @@ static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_md_h tl_md, self->config.short_desc_size, sizeof(uct_rc_iface_send_desc_t), UCS_SYS_CACHE_LINE_SIZE, - &config->super.super.super.tx.mp, + &ib_config->tx.mp, self->super.config.tx_qp_len, uct_rc_iface_send_desc_init, "rc_verbs_short_desc"); @@ -332,7 +342,7 @@ static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_md_h tl_md, 
ucs_assertv_always(self->config.max_send_sge > 1, /* need 1 iov for am header*/ "max_send_sge %zu", self->config.max_send_sge); - if (self->config.max_inline < sizeof(*hdr)) { + if ((self->config.max_inline < sizeof(*hdr)) || self->config.flush_by_fc) { self->fc_desc = ucs_mpool_get(&self->short_desc_mp); ucs_assert_always(self->fc_desc != NULL); hdr = (uct_rc_hdr_t*)(self->fc_desc + 1); @@ -414,11 +424,6 @@ static UCS_CLASS_CLEANUP_FUNC(uct_rc_verbs_iface_t) uct_rc_iface_cleanup_eps(&self->super); - if (self->flush_mr != NULL) { - uct_ib_dereg_mr(self->flush_mr); - ucs_assert(self->flush_mem != NULL); - ucs_munmap(self->flush_mem, ucs_get_page_size()); - } if (self->fc_desc != NULL) { ucs_mpool_put(self->fc_desc); } @@ -431,10 +436,9 @@ static UCS_CLASS_DEFINE_NEW_FUNC(uct_rc_verbs_iface_t, uct_iface_t, uct_md_h, const uct_iface_config_t*); static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rc_verbs_iface_t, uct_iface_t); -static uct_rc_iface_ops_t uct_rc_verbs_iface_ops = { - { - { +static uct_iface_ops_t uct_rc_verbs_iface_tl_ops = { .ep_am_short = uct_rc_verbs_ep_am_short, + .ep_am_short_iov = uct_rc_verbs_ep_am_short_iov, .ep_am_bcopy = uct_rc_verbs_ep_am_bcopy, .ep_am_zcopy = uct_rc_verbs_ep_am_zcopy, .ep_put_short = uct_rc_verbs_ep_put_short, @@ -452,6 +456,7 @@ static uct_rc_iface_ops_t uct_rc_verbs_iface_ops = { .ep_pending_purge = uct_rc_ep_pending_purge, .ep_flush = uct_rc_verbs_ep_flush, .ep_fence = uct_rc_verbs_ep_fence, + .ep_check = uct_rc_ep_check, .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_rc_verbs_ep_t), .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_rc_verbs_ep_t), .ep_get_address = uct_rc_verbs_ep_get_address, @@ -468,18 +473,25 @@ static uct_rc_iface_ops_t uct_rc_verbs_iface_ops = { .iface_get_address = ucs_empty_function_return_success, .iface_get_device_address = uct_ib_iface_get_device_address, .iface_is_reachable = uct_ib_iface_is_reachable, + }; + +static uct_rc_iface_ops_t uct_rc_verbs_iface_ops = { + .super = { + .super = { + .iface_estimate_perf = uct_base_iface_estimate_perf, + .iface_vfs_refresh = (uct_iface_vfs_refresh_func_t)ucs_empty_function, + }, + .create_cq = uct_ib_verbs_create_cq, + .arm_cq = uct_ib_iface_arm_cq, + .event_cq = (uct_ib_iface_event_cq_func_t)ucs_empty_function, + .handle_failure = uct_rc_verbs_handle_failure, }, - .create_cq = uct_ib_verbs_create_cq, - .arm_cq = uct_ib_iface_arm_cq, - .event_cq = (uct_ib_iface_event_cq_func_t)ucs_empty_function, - .handle_failure = uct_rc_verbs_handle_failure, - .set_ep_failed = uct_rc_verbs_ep_set_failed, - }, - .init_rx = uct_rc_iface_verbs_init_rx, - .cleanup_rx = uct_rc_iface_verbs_cleanup_rx, - .fc_ctrl = uct_rc_verbs_ep_fc_ctrl, - .fc_handler = uct_rc_iface_fc_handler, - .cleanup_qp = uct_rc_verbs_ep_cleanup_qp, + .init_rx = uct_rc_iface_verbs_init_rx, + .cleanup_rx = uct_rc_iface_verbs_cleanup_rx, + .fc_ctrl = uct_rc_verbs_ep_fc_ctrl, + .fc_handler = uct_rc_iface_fc_handler, + .cleanup_qp = uct_rc_verbs_ep_cleanup_qp, + .ep_post_check = uct_rc_verbs_ep_post_check, }; static ucs_status_t diff --git a/src/uct/ib/rc/verbs/rc_verbs_impl.h b/src/uct/ib/rc/verbs/rc_verbs_impl.h index ccb4c6c4409..ed0c18233b4 100644 --- a/src/uct/ib/rc/verbs/rc_verbs_impl.h +++ b/src/uct/ib/rc/verbs/rc_verbs_impl.h @@ -134,11 +134,26 @@ uct_rc_verbs_iface_fill_inl_am_sge(uct_rc_verbs_iface_t *iface, const void *buffer, unsigned length) { uct_rc_am_short_hdr_t *am = &iface->am_inl_hdr; - am->rc_hdr.am_id = id; - am->am_hdr = hdr; + + am->rc_hdr.am_id = id; + am->am_hdr = hdr; + iface->inl_am_wr.num_sge = 2; 
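+    /* inl_sge[0] is filled with the AM short header and inl_sge[1] with the
+     * user payload by uct_rc_verbs_iface_fill_inl_sge() below, so num_sge is
+     * set back to 2 on this path; the iov flavor below recomputes num_sge
+     * from the iov list instead */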
uct_rc_verbs_iface_fill_inl_sge(iface, am, sizeof(*am), buffer, length); } +static inline void +uct_rc_verbs_iface_fill_inl_am_sge_iov(uct_rc_verbs_iface_t *iface, uint8_t id, + const uct_iov_t *iov, size_t iovcnt) +{ + uct_rc_hdr_t *rch = &iface->am_inl_hdr.rc_hdr; + + rch->am_id = id; + iface->inl_sge[0].addr = (uintptr_t)rch; + iface->inl_sge[0].length = sizeof(*rch); + iface->inl_am_wr.num_sge = uct_ib_verbs_sge_fill_iov(iface->inl_sge + 1, iov, + iovcnt) + 1; +} + #define UCT_RC_VERBS_FILL_SGE(_wr, _sge, _length) \ _wr.sg_list = &_sge; \ _wr.num_sge = 1; \ diff --git a/src/uct/ib/rdmacm/Makefile.am b/src/uct/ib/rdmacm/Makefile.am index 0e4ad2403d8..4f3a9af106f 100644 --- a/src/uct/ib/rdmacm/Makefile.am +++ b/src/uct/ib/rdmacm/Makefile.am @@ -1,12 +1,12 @@ # -# Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. +# Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. # See file LICENSE for terms. # if HAVE_RDMACM # rdmacm is under IB, but it's actually a uct module, because it defines its own -# memory domain component +# component module_LTLIBRARIES = libuct_rdmacm.la libuct_rdmacm_la_CPPFLAGS = $(BASE_CPPFLAGS) $(IBVERBS_CPPFLAGS) $(RDMACM_CPPFLAGS) libuct_rdmacm_la_CFLAGS = $(BASE_CFLAGS) @@ -16,27 +16,15 @@ libuct_rdmacm_la_LIBADD = $(RDMACM_LIBS) $(top_builddir)/src/ucs/libucs.la \ libuct_rdmacm_la_LDFLAGS = $(IBVERBS_LDFLAGS) $(RDMACM_LDFLAGS) -version-info $(SOVERSION) noinst_HEADERS = \ - rdmacm_md.h \ - rdmacm_iface.h \ - rdmacm_ep.h \ - rdmacm_def.h - -libuct_rdmacm_la_SOURCES = \ - rdmacm_md.c \ - rdmacm_iface.c \ - rdmacm_ep.c - -if HAVE_RDMACM_QP_LESS -noinst_HEADERS += \ rdmacm_cm.h \ rdmacm_listener.h \ rdmacm_cm_ep.h -libuct_rdmacm_la_SOURCES += \ +libuct_rdmacm_la_SOURCES = \ + rdmacm_component.c \ rdmacm_cm.c \ rdmacm_listener.c \ rdmacm_cm_ep.c -endif # HAVE_RDMACM_QP_LESS include $(top_srcdir)/config/module.am diff --git a/src/uct/ib/rdmacm/configure.m4 b/src/uct/ib/rdmacm/configure.m4 index 35f078f06e7..9265da17352 100644 --- a/src/uct/ib/rdmacm/configure.m4 +++ b/src/uct/ib/rdmacm/configure.m4 @@ -1,5 +1,5 @@ # -# Copyright (C) Mellanox Technologies Ltd. 2001-2017. ALL RIGHTS RESERVED. +# Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. # # See file LICENSE for terms. 
# @@ -8,7 +8,6 @@ # Check for RDMACM support # rdmacm_happy="no" -rdmacm_qp_less_happy="no" AC_ARG_WITH([rdmacm], [AS_HELP_STRING([--with-rdmacm=(DIR)], [Enable the use of RDMACM (default is guess).])], [], [with_rdmacm=guess]) @@ -29,23 +28,18 @@ AS_IF([test "x$with_rdmacm" != xno], AC_CHECK_HEADER([$ucx_check_rdmacm_dir/include/rdma/rdma_cma.h], [ - AC_CHECK_LIB([rdmacm], [rdma_create_id], - [uct_modules="${uct_modules}:rdmacm" - rdmacm_happy="yes" - AS_IF([test "$ucx_check_rdmacm_dir" != /usr], - [ - AC_SUBST(RDMACM_CPPFLAGS, ["-I$ucx_check_rdmacm_dir/include"]) - AC_SUBST(RDMACM_LDFLAGS, ["-L$ucx_check_rdmacm_dir/lib$libsuff"])]) - AC_SUBST(RDMACM_LIBS, [-lrdmacm]) - # QP less support - AC_CHECK_DECLS([rdma_establish, rdma_init_qp_attr], - [rdmacm_qp_less_happy="yes" - AC_DEFINE([HAVE_RDMACM_QP_LESS], 1, [RDMACM QP less support])], - [], [#include <$ucx_check_rdmacm_dir/include/rdma/rdma_cma.h>]) - ], - [AC_MSG_WARN([RDMACM requested but librdmacm is not found]) - AC_MSG_ERROR([Please install librdmacm and librdmacm-devel or disable rdmacm support]) - ]) + AC_CHECK_LIB([rdmacm], [rdma_establish], + [uct_modules="${uct_modules}:rdmacm" + rdmacm_happy="yes" + AS_IF([test "$ucx_check_rdmacm_dir" != /usr], + [ + AC_SUBST(RDMACM_CPPFLAGS, ["-I$ucx_check_rdmacm_dir/include"]) + AC_SUBST(RDMACM_LDFLAGS, ["-L$ucx_check_rdmacm_dir/lib$libsuff"])]) + AC_SUBST(RDMACM_LIBS, [-lrdmacm])], + [AS_IF([test "x$with_rdmacm" != xguess], + [AC_MSG_ERROR([RDMACM requested but librdmacm is not found or does not provide rdma_establish() API])], + [AC_MSG_WARN([RDMACM requested but librdmacm is not found or does not provide rdma_establish() API])]) + ]) ], [ AS_IF([test "x$with_rdmacm" != xguess], @@ -59,5 +53,4 @@ AS_IF([test "x$with_rdmacm" != xno], ) AM_CONDITIONAL([HAVE_RDMACM], [test "x$rdmacm_happy" != xno]) -AM_CONDITIONAL([HAVE_RDMACM_QP_LESS], [test "x$rdmacm_qp_less_happy" != xno]) AC_CONFIG_FILES([src/uct/ib/rdmacm/Makefile]) diff --git a/src/uct/ib/rdmacm/rdmacm_cm.c b/src/uct/ib/rdmacm/rdmacm_cm.c index 6ae34442622..1e05af71249 100644 --- a/src/uct/ib/rdmacm/rdmacm_cm.c +++ b/src/uct/ib/rdmacm/rdmacm_cm.c @@ -1,11 +1,11 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ #ifdef HAVE_CONFIG_H -# include "config.h" /* Defines HAVE_RDMACM_QP_LESS */ +# include "config.h" #endif #include "rdmacm_cm_ep.h" @@ -41,9 +41,11 @@ ucs_status_t uct_rdmacm_cm_ack_event(struct rdma_cm_event *event) return UCS_OK; } -ucs_status_t uct_rdmacm_cm_reject(struct rdma_cm_id *id) +ucs_status_t uct_rdmacm_cm_reject(uct_rdmacm_cm_t *cm, struct rdma_cm_id *id) { uct_rdmacm_priv_data_hdr_t hdr; + char remote_ip_port_str[UCS_SOCKADDR_STRING_LEN]; + char local_ip_port_str[UCS_SOCKADDR_STRING_LEN]; hdr.length = 0; hdr.status = (uint8_t)UCS_ERR_REJECTED; @@ -51,13 +53,69 @@ ucs_status_t uct_rdmacm_cm_reject(struct rdma_cm_id *id) ucs_trace("reject on cm_id %p", id); if (rdma_reject(id, &hdr, sizeof(hdr))) { - ucs_error("rdma_reject (id=%p) failed with error: %m", id); - return UCS_ERR_IO_ERROR; + uct_cm_peer_error(&cm->super, + "rdma_reject (id=%p local addr=%s remote addr=%s) " + "failed with error: %m", id, + ucs_sockaddr_str(rdma_get_local_addr(id), + local_ip_port_str, + UCS_SOCKADDR_STRING_LEN), + ucs_sockaddr_str(rdma_get_peer_addr(id), + remote_ip_port_str, + UCS_SOCKADDR_STRING_LEN)); + return UCS_ERR_CONNECTION_RESET; + } + + return UCS_OK; +} + +ucs_status_t uct_rdmacm_cm_get_cq(uct_rdmacm_cm_t *cm, struct ibv_context *verbs, + struct ibv_cq **cq_p) +{ + struct ibv_cq *cq; + khiter_t iter; + int ret; + + iter = kh_put(uct_rdmacm_cm_cqs, &cm->cqs, + ibv_get_device_guid(verbs->device), &ret); + if (ret == -1) { + ucs_error("cm %p: cannot allocate hash entry for CQ", cm); + return UCS_ERR_NO_MEMORY; } + if (ret == 0) { + /* already exists so use it */ + cq = kh_value(&cm->cqs, iter); + } else { + /* Create a dummy completion queue */ + cq = ibv_create_cq(verbs, 1, NULL, NULL, 0); + if (cq == NULL) { + kh_del(uct_rdmacm_cm_cqs, &cm->cqs, iter); + ucs_error("ibv_create_cq() failed: %m"); + return UCS_ERR_IO_ERROR; + } + + kh_value(&cm->cqs, iter) = cq; + } + + *cq_p = cq; return UCS_OK; } +void uct_rdmacm_cm_cqs_cleanup(uct_rdmacm_cm_t *cm) +{ + struct ibv_cq *cq; + int ret; + + kh_foreach_value(&cm->cqs, cq, { + ret = ibv_destroy_cq(cq); + if (ret != 0) { + ucs_warn("ibv_destroy_cq() returned %d: %m", ret); + } + }); + + kh_destroy_inplace(uct_rdmacm_cm_cqs, &cm->cqs); +} + size_t uct_rdmacm_cm_get_max_conn_priv() { return UCT_RDMACM_TCP_PRIV_DATA_LEN - sizeof(uct_rdmacm_priv_data_hdr_t); @@ -74,6 +132,7 @@ static ucs_status_t uct_rdmacm_cm_query(uct_cm_h cm, uct_cm_attr_t *cm_attr) static void uct_rdmacm_cm_handle_event_addr_resolved(struct rdma_cm_event *event) { uct_rdmacm_cm_ep_t *cep = (uct_rdmacm_cm_ep_t*)event->id->context; + uct_rdmacm_cm_t *cm = uct_rdmacm_cm_ep_get_cm(cep); char ep_str[UCT_RDMACM_EP_STRING_LEN]; uct_cm_remote_data_t remote_data; @@ -83,54 +142,47 @@ static void uct_rdmacm_cm_handle_event_addr_resolved(struct rdma_cm_event *event uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), event->id); - if (rdma_resolve_route(event->id, 1000 /* TODO */)) { - ucs_error("%s: rdma_resolve_route failed: %m", + if (rdma_resolve_route(event->id, uct_rdmacm_cm_get_timeout(cm))) { + ucs_diag("%s: rdma_resolve_route failed: %m", uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN)); remote_data.field_mask = 0; - uct_rdmacm_cm_ep_set_failed(cep, &remote_data, UCS_ERR_IO_ERROR); + uct_rdmacm_cm_ep_set_failed(cep, &remote_data, UCS_ERR_UNREACHABLE); } } static void uct_rdmacm_cm_handle_event_route_resolved(struct rdma_cm_event *event) { - uct_rdmacm_cm_ep_t *cep = (uct_rdmacm_cm_ep_t*)event->id->context; - uct_cm_remote_data_t remote_data; - ucs_status_t 
status; - struct rdma_conn_param conn_param; - char ep_str[UCT_RDMACM_EP_STRING_LEN]; + uct_rdmacm_cm_ep_t *cep = (uct_rdmacm_cm_ep_t*)event->id->context; + uint8_t pack_priv_data[UCT_RDMACM_TCP_PRIV_DATA_LEN]; + size_t pack_priv_data_length; + ucs_status_t status; ucs_assert(event->id == cep->id); - memset(&conn_param, 0, sizeof(conn_param)); - conn_param.private_data = ucs_alloca(uct_rdmacm_cm_get_max_conn_priv() + - sizeof(uct_rdmacm_priv_data_hdr_t)); - - status = uct_rdmacm_cm_ep_pack_cb(cep, &conn_param); - if (status != UCS_OK) { - cep->status = status; - cep->flags |= UCT_RDMACM_CM_EP_FAILED; - return; + if (cep->super.resolve_cb != NULL) { + status = uct_rdmacm_cm_ep_resolve_cb(cep); + goto out; } - status = uct_rdamcm_cm_ep_set_qp_num(&conn_param, cep); + ucs_assert(cep->super.priv_pack_cb != NULL); + status = uct_rdmacm_cm_ep_pack_cb(cep, pack_priv_data, + &pack_priv_data_length); if (status != UCS_OK) { - remote_data.field_mask = 0; - uct_rdmacm_cm_ep_set_failed(cep, &remote_data, status); - return; + goto out; } - ucs_trace("%s rdma_connect, cm_id %p", - uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), cep->id); + status = uct_rdmacm_cm_ep_send_priv_data(cep, pack_priv_data, + pack_priv_data_length); - if (rdma_connect(cep->id, &conn_param)) { - ucs_error("%s rdma_connect failed: %m", - uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN)); - remote_data.field_mask = 0; - uct_rdmacm_cm_ep_set_failed(cep, &remote_data, UCS_ERR_IO_ERROR); +out: + if (status != UCS_OK) { + cep->status = status; + cep->flags |= UCT_RDMACM_CM_EP_FAILED; } } -static ucs_status_t uct_rdmacm_cm_id_to_dev_addr(struct rdma_cm_id *cm_id, +static ucs_status_t uct_rdmacm_cm_id_to_dev_addr(uct_rdmacm_cm_t *cm, + struct rdma_cm_id *cm_id, uct_device_addr_t **dev_addr_p, size_t *dev_addr_len_p) { @@ -154,9 +206,10 @@ static ucs_status_t uct_rdmacm_cm_id_to_dev_addr(struct rdma_cm_id *cm_id, qp_attr.qp_state = IBV_QPS_RTR; ret = rdma_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); if (ret) { - ucs_error("rdma_init_qp_attr (id=%p, qp_state=%d) failed: %m", - cm_id, qp_attr.qp_state); - return UCS_ERR_IO_ERROR; + uct_cm_peer_error(&cm->super, + "rdma_init_qp_attr (id=%p, qp_state=%d) failed: %m", + cm_id, qp_attr.qp_state); + return UCS_ERR_CONNECTION_RESET; } ret = ibv_query_port(cm_id->pd->context, cm_id->port_num, &port_attr); @@ -224,7 +277,9 @@ static ucs_status_t uct_rdmacm_cm_id_to_dev_addr(struct rdma_cm_id *cm_id, return UCS_OK; } -static void uct_rdmacm_cm_handle_event_connect_request(struct rdma_cm_event *event) +static void +uct_rdmacm_cm_handle_event_connect_request(uct_rdmacm_cm_t *cm, + struct rdma_cm_event *event) { uct_rdmacm_priv_data_hdr_t *hdr = (uct_rdmacm_priv_data_hdr_t*) event->param.conn.private_data; @@ -242,7 +297,7 @@ static void uct_rdmacm_cm_handle_event_connect_request(struct rdma_cm_event *eve uct_rdmacm_cm_id_to_dev_name(event->id, dev_name); - status = uct_rdmacm_cm_id_to_dev_addr(event->id, &dev_addr, &addr_length); + status = uct_rdmacm_cm_id_to_dev_addr(cm, event->id, &dev_addr, &addr_length); if (status != UCS_OK) { goto err; } @@ -283,7 +338,7 @@ static void uct_rdmacm_cm_handle_event_connect_request(struct rdma_cm_event *eve err_free_dev_addr: ucs_free(dev_addr); err: - uct_rdmacm_cm_reject(event->id); + uct_rdmacm_cm_reject(cm, event->id); uct_rdmacm_cm_destroy_id(event->id); uct_rdmacm_cm_ack_event(event); } @@ -313,11 +368,12 @@ static void uct_rdmacm_cm_handle_event_connect_response(struct rdma_cm_event *ev remote_data.conn_priv_data = hdr + 1; 
remote_data.conn_priv_data_length = hdr->length; - status = uct_rdmacm_cm_id_to_dev_addr(event->id, &dev_addr, &addr_length); + status = uct_rdmacm_cm_id_to_dev_addr(uct_rdmacm_cm_ep_get_cm(cep), + event->id, &dev_addr, &addr_length); if (status != UCS_OK) { - ucs_error("%s client (ep=%p id=%p) failed to process a connect response ", - uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), - cep, event->id); + ucs_diag("%s client (ep=%p id=%p) failed to process a connect response", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + cep, event->id); uct_rdmacm_cm_ep_set_failed(cep, &remote_data, status); return; } @@ -345,15 +401,28 @@ static void uct_rdmacm_cm_handle_event_established(struct rdma_cm_event *event) uct_rdmacm_cm_ep_server_conn_notify_cb(cep, UCS_OK); } +static const char* +uct_rdmacm_cm_event_status_str(const struct rdma_cm_event *event) +{ + if (event->event == RDMA_CM_EVENT_REJECTED) { + /* If it is a REJECTED event, the status is a transport-specific reject + * reason */ + return strerror(ECONNREFUSED); + } + + /* RDMACM returns a negative errno as an event status */ + return strerror(-event->status); +} + static void uct_rdmacm_cm_handle_event_disconnected(struct rdma_cm_event *event) { uct_rdmacm_cm_ep_t *cep = event->id->context; char ep_str[UCT_RDMACM_EP_STRING_LEN]; uct_cm_remote_data_t remote_data; - ucs_debug("%s got disconnect event, status %d", + ucs_debug("%s got disconnect event, status %s (%d)", uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), - event->status); + uct_rdmacm_cm_event_status_str(event), event->status); cep->flags |= UCT_RDMACM_CM_EP_GOT_DISCONNECT; /* calling error_cb instead of disconnect CB directly handles out-of-order @@ -407,9 +476,10 @@ static void uct_rdmacm_cm_handle_error_event(struct rdma_cm_event *event) log_level = UCS_LOG_LEVEL_ERROR; } - ucs_log(log_level, "%s got error event %s, event status %d ", + ucs_log(log_level, "%s got error event %s, event status %s (%d)", uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), - rdma_event_str(event->event), event->status); + rdma_event_str(event->event), uct_rdmacm_cm_event_status_str(event), + event->status); if (uct_rdmacm_ep_is_connected(cep) && !(cep->flags & UCT_RDMACM_CM_EP_FAILED)) { @@ -430,10 +500,13 @@ uct_rdmacm_cm_process_event(uct_rdmacm_cm_t *cm, struct rdma_cm_event *event) uint8_t ack_event = 1; char ip_port_str[UCS_SOCKADDR_STRING_LEN]; - ucs_trace("rdmacm event (fd=%d cm_id %p cm %p event_channel %p status %s): %s. Peer: %s.", - cm->ev_ch->fd, event->id, cm, cm->ev_ch, strerror(event->status), + ucs_trace("rdmacm event (fd=%d cm_id %p cm %p event_channel %p status %s" + " (%d)): %s. Peer: %s.", + cm->ev_ch->fd, event->id, cm, cm->ev_ch, + uct_rdmacm_cm_event_status_str(event), event->status, rdma_event_str(event->event), - ucs_sockaddr_str(remote_addr, ip_port_str, UCS_SOCKADDR_STRING_LEN)); + ucs_sockaddr_str(remote_addr, ip_port_str, + UCS_SOCKADDR_STRING_LEN)); /* The following applies for rdma_cm_id of type RDMA_PS_TCP only */ ucs_assert(event->id->ps == RDMA_PS_TCP); @@ -451,7 +524,7 @@ uct_rdmacm_cm_process_event(uct_rdmacm_cm_t *cm, struct rdma_cm_event *event) break; case RDMA_CM_EVENT_CONNECT_REQUEST: /* Server side event */ - uct_rdmacm_cm_handle_event_connect_request(event); + uct_rdmacm_cm_handle_event_connect_request(cm, event); /* The server will ack the event after accepting/rejecting the request * (in ep_create). 
*/ ack_event = 0; @@ -534,6 +607,7 @@ static uct_cm_ops_t uct_rdmacm_cm_ops = { static uct_iface_ops_t uct_rdmacm_cm_iface_ops = { .ep_pending_purge = ucs_empty_function, + .ep_connect = uct_rdmacm_cm_ep_connect, .ep_disconnect = uct_rdmacm_cm_ep_disconnect, .cm_ep_conn_notify = uct_rdmacm_cm_ep_conn_notify, .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_rdmacm_cm_ep_t), @@ -541,6 +615,7 @@ static uct_iface_ops_t uct_rdmacm_cm_iface_ops = { .ep_put_bcopy = (uct_ep_put_bcopy_func_t)ucs_empty_function_return_unsupported, .ep_get_bcopy = (uct_ep_get_bcopy_func_t)ucs_empty_function_return_unsupported, .ep_am_short = (uct_ep_am_short_func_t)ucs_empty_function_return_unsupported, + .ep_am_short_iov = (uct_ep_am_short_iov_func_t)ucs_empty_function_return_unsupported, .ep_am_bcopy = (uct_ep_am_bcopy_func_t)ucs_empty_function_return_unsupported, .ep_atomic_cswap64 = (uct_ep_atomic_cswap64_func_t)ucs_empty_function_return_unsupported, .ep_atomic64_post = (uct_ep_atomic64_post_func_t)ucs_empty_function_return_unsupported, @@ -567,20 +642,70 @@ static uct_iface_ops_t uct_rdmacm_cm_iface_ops = { .iface_is_reachable = (uct_iface_is_reachable_func_t)ucs_empty_function_return_zero }; +static ucs_status_t +uct_rdmacm_cm_ipstr_to_sockaddr(const char *ip_str, struct sockaddr **saddr_p, + const char *debug_name) +{ + struct sockaddr_storage *sa_storage; + ucs_status_t status; + + /* NULL-pointer for empty parameter */ + if (ip_str[0] == '\0') { + sa_storage = NULL; + goto out; + } + + sa_storage = ucs_calloc(1, sizeof(struct sockaddr_storage), debug_name); + if (sa_storage == NULL) { + status = UCS_ERR_NO_MEMORY; + ucs_error("cannot allocate memory for rdmacm source address"); + goto err; + } + + status = ucs_sock_ipstr_to_sockaddr(ip_str, sa_storage); + if (status != UCS_OK) { + goto err_free; + } + +out: + *saddr_p = (struct sockaddr*)sa_storage; + return UCS_OK; + +err_free: + ucs_free(sa_storage); +err: + return status; +} + UCS_CLASS_INIT_FUNC(uct_rdmacm_cm_t, uct_component_h component, uct_worker_h worker, const uct_cm_config_t *config) { + const uct_rdmacm_cm_config_t *rdmacm_config = ucs_derived_of(config, + uct_rdmacm_cm_config_t); uct_priv_worker_t *worker_priv; ucs_status_t status; + ucs_log_level_t log_lvl; UCS_CLASS_CALL_SUPER_INIT(uct_cm_t, &uct_rdmacm_cm_ops, &uct_rdmacm_cm_iface_ops, worker, component, config); - self->ev_ch = rdma_create_event_channel(); + kh_init_inplace(uct_rdmacm_cm_cqs, &self->cqs); + + self->ev_ch = rdma_create_event_channel(); if (self->ev_ch == NULL) { - ucs_error("rdma_create_event_channel failed: %m"); - status = UCS_ERR_IO_ERROR; + if (errno == ENODEV) { + status = UCS_ERR_NO_DEVICE; + log_lvl = UCS_LOG_LEVEL_DIAG; + } else if (errno == ENOENT) { + status = UCS_ERR_IO_ERROR; + log_lvl = UCS_LOG_LEVEL_WARN; + } else { + status = UCS_ERR_IO_ERROR; + log_lvl = UCS_LOG_LEVEL_ERROR; + } + + ucs_log(log_lvl, "rdma_create_event_channel failed: %m"); goto err; } @@ -601,11 +726,22 @@ UCS_CLASS_INIT_FUNC(uct_rdmacm_cm_t, uct_component_h component, goto err_destroy_ev_ch; } + status = uct_rdmacm_cm_ipstr_to_sockaddr(rdmacm_config->src_addr, + &self->config.src_addr, + "rdmacm_src_addr"); + if (status != UCS_OK) { + goto ucs_async_remove_handler; + } + + self->config.timeout = rdmacm_config->timeout; + ucs_debug("created rdmacm_cm %p with event_channel %p (fd=%d)", self, self->ev_ch, self->ev_ch->fd); return UCS_OK; +ucs_async_remove_handler: + ucs_async_remove_handler(self->ev_ch->fd, 1); err_destroy_ev_ch: rdma_destroy_event_channel(self->ev_ch); err: @@ -616,6 +752,8 @@ 
UCS_CLASS_CLEANUP_FUNC(uct_rdmacm_cm_t) { ucs_status_t status; + ucs_free(self->config.src_addr); + status = ucs_async_remove_handler(self->ev_ch->fd, 1); if (status != UCS_OK) { ucs_warn("failed to remove event handler for fd %d: %s", @@ -624,6 +762,7 @@ UCS_CLASS_CLEANUP_FUNC(uct_rdmacm_cm_t) ucs_trace("destroying event_channel %p on cm %p", self->ev_ch, self); rdma_destroy_event_channel(self->ev_ch); + uct_rdmacm_cm_cqs_cleanup(self); } UCS_CLASS_DEFINE(uct_rdmacm_cm_t, uct_cm_t); diff --git a/src/uct/ib/rdmacm/rdmacm_cm.h b/src/uct/ib/rdmacm/rdmacm_cm.h index 6a236719c79..9a35e14ef77 100644 --- a/src/uct/ib/rdmacm/rdmacm_cm.h +++ b/src/uct/ib/rdmacm/rdmacm_cm.h @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -8,17 +8,49 @@ #define UCT_RDMACM_CM_H #include -#include "rdmacm_def.h" +#include +#include + +#include + + +KHASH_MAP_INIT_INT64(uct_rdmacm_cm_cqs, struct ibv_cq*); + + +#define UCT_RDMACM_TCP_PRIV_DATA_LEN 56 /** See rdma_connect(3) */ +#define UCT_RDMACM_EP_FLAGS_STRING_LEN 128 /** A string to hold the + representation of the ep flags */ +#define UCT_RDMACM_EP_STRING_LEN 192 /** A string to hold the ep info */ + + +typedef struct uct_rdmacm_priv_data_hdr { + uint8_t length; /* length of the private data */ + uint8_t status; +} uct_rdmacm_priv_data_hdr_t; /** * An rdmacm connection manager */ typedef struct uct_rdmacm_cm { - uct_cm_t super; - struct rdma_event_channel *ev_ch; + uct_cm_t super; + struct rdma_event_channel *ev_ch; + khash_t(uct_rdmacm_cm_cqs) cqs; + + struct { + struct sockaddr *src_addr; + double timeout; + } config; } uct_rdmacm_cm_t; + +typedef struct uct_rdmacm_cm_config { + uct_cm_config_t super; + char *src_addr; + double timeout; +} uct_rdmacm_cm_config_t; + + UCS_CLASS_DECLARE_NEW_FUNC(uct_rdmacm_cm_t, uct_cm_t, uct_component_h, uct_worker_h, const uct_cm_config_t *); UCS_CLASS_DECLARE_DELETE_FUNC(uct_rdmacm_cm_t, uct_cm_t); @@ -32,10 +64,25 @@ uct_rdmacm_cm_get_async(uct_rdmacm_cm_t *cm) return wpriv->async; } +static inline void +uct_rdmacm_cm_id_to_dev_name(struct rdma_cm_id *cm_id, char *dev_name) +{ + ucs_snprintf_zero(dev_name, UCT_DEVICE_NAME_MAX, "%s:%d", + ibv_get_device_name(cm_id->verbs->device), + cm_id->port_num); +} + ucs_status_t uct_rdmacm_cm_destroy_id(struct rdma_cm_id *id); ucs_status_t uct_rdmacm_cm_ack_event(struct rdma_cm_event *event); -ucs_status_t uct_rdmacm_cm_reject(struct rdma_cm_id *id); +ucs_status_t uct_rdmacm_cm_reject(uct_rdmacm_cm_t *cm, struct rdma_cm_id *id); + +ucs_status_t uct_rdmacm_cm_get_cq(uct_rdmacm_cm_t *cm, struct ibv_context *verbs, + struct ibv_cq **cq); + +void uct_rdmacm_cm_cqs_cleanup(uct_rdmacm_cm_t *cm); + +size_t uct_rdmacm_cm_get_max_conn_priv(); #endif diff --git a/src/uct/ib/rdmacm/rdmacm_cm_ep.c b/src/uct/ib/rdmacm/rdmacm_cm_ep.c index db832f01912..5b5ddff264c 100644 --- a/src/uct/ib/rdmacm/rdmacm_cm_ep.c +++ b/src/uct/ib/rdmacm/rdmacm_cm_ep.c @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -11,13 +11,15 @@ #include "rdmacm_cm_ep.h" #include "rdmacm_cm.h" #include +#include +#include const char* uct_rdmacm_cm_ep_str(uct_rdmacm_cm_ep_t *cep, char *str, size_t max_len) { - struct sockaddr *local_addr = rdma_get_local_addr(cep->id); - struct sockaddr *remote_addr = rdma_get_peer_addr(cep->id); + struct sockaddr *local_addr = cep->id ? rdma_get_local_addr(cep->id) : NULL; + struct sockaddr *remote_addr = cep->id ? rdma_get_peer_addr(cep->id) : NULL; char flags_buf[UCT_RDMACM_EP_FLAGS_STRING_LEN]; char local_ip_port_str[UCS_SOCKADDR_STRING_LEN]; char remote_ip_port_str[UCS_SOCKADDR_STRING_LEN]; @@ -33,13 +35,13 @@ const char* uct_rdmacm_cm_ep_str(uct_rdmacm_cm_ep_t *cep, char *str, NULL }; - if (ucs_sockaddr_is_known_af(local_addr)) { + if ((local_addr != NULL) && ucs_sockaddr_is_known_af(local_addr)) { ucs_sockaddr_str(local_addr, local_ip_port_str, UCS_SOCKADDR_STRING_LEN); } else { ucs_strncpy_safe(local_ip_port_str, "", UCS_SOCKADDR_STRING_LEN); } - if (ucs_sockaddr_is_known_af(remote_addr)) { + if ((remote_addr != NULL) && ucs_sockaddr_is_known_af(remote_addr)) { ucs_sockaddr_str(remote_addr, remote_ip_port_str, UCS_SOCKADDR_STRING_LEN); } else { ucs_strncpy_safe(remote_ip_port_str, "", UCS_SOCKADDR_STRING_LEN); @@ -147,7 +149,7 @@ ucs_status_t uct_rdmacm_cm_ep_conn_notify(uct_ep_h ep) return cep->status; } -static void uct_rdmacm_cm_ep_destroy_dummy_cq_qp(uct_rdmacm_cm_ep_t *cep) +static void uct_rdmacm_cm_ep_destroy_dummy_qp(uct_rdmacm_cm_ep_t *cep) { int ret; @@ -158,39 +160,20 @@ static void uct_rdmacm_cm_ep_destroy_dummy_cq_qp(uct_rdmacm_cm_ep_t *cep) } } - if (cep->cq != NULL) { - ret = ibv_destroy_cq(cep->cq); - if (ret != 0) { - ucs_warn("ibv_destroy_cq() returned %d: %m", ret); - } - } - cep->qp = NULL; - cep->cq = NULL; } -static ucs_status_t uct_rdmacm_cm_create_dummy_cq_qp(struct rdma_cm_id *id, - struct ibv_cq **cq_p, - struct ibv_qp **qp_p) +static ucs_status_t uct_rdmacm_cm_create_dummy_qp(struct rdma_cm_id *id, + struct ibv_cq *cq, + struct ibv_qp **qp_p) { - struct ibv_qp_init_attr qp_init_attr; - ucs_status_t status; - struct ibv_cq *cq; + struct ibv_qp_init_attr qp_init_attr = {0}; struct ibv_qp *qp; - /* Create a dummy completion queue */ - cq = ibv_create_cq(id->verbs, 1, NULL, NULL, 0); - if (cq == NULL) { - ucs_error("ibv_create_cq() failed: %m"); - status = UCS_ERR_IO_ERROR; - goto err; - } - /* Create a dummy UD qp */ - memset(&qp_init_attr, 0, sizeof(qp_init_attr)); - qp_init_attr.send_cq = cq; - qp_init_attr.recv_cq = cq; - qp_init_attr.qp_type = IBV_QPT_UD; + qp_init_attr.send_cq = cq; + qp_init_attr.recv_cq = cq; + qp_init_attr.qp_type = IBV_QPT_UD; qp_init_attr.cap.max_send_wr = 2; qp_init_attr.cap.max_recv_wr = 2; qp_init_attr.cap.max_send_sge = 1; @@ -199,86 +182,70 @@ static ucs_status_t uct_rdmacm_cm_create_dummy_cq_qp(struct rdma_cm_id *id, qp = ibv_create_qp(id->pd, &qp_init_attr); if (qp == NULL) { ucs_error("failed to create a dummy ud qp. 
%m"); - status = UCS_ERR_IO_ERROR; - goto err_destroy_cq; + return UCS_ERR_IO_ERROR; } ucs_debug("created ud QP %p with qp_num: 0x%x and cq %p on rdmacm_id %p", qp, qp->qp_num, cq, id); - *cq_p = cq; *qp_p = qp; - return UCS_OK; - -err_destroy_cq: - ibv_destroy_cq(cq); -err: - return status; } -ucs_status_t +static ucs_status_t uct_rdamcm_cm_ep_set_qp_num(struct rdma_conn_param *conn_param, uct_rdmacm_cm_ep_t *cep) { - ucs_status_t status; - struct ibv_qp *qp; struct ibv_cq *cq; + ucs_status_t status; + + status = uct_rdmacm_cm_get_cq(uct_rdmacm_cm_ep_get_cm(cep), cep->id->verbs, + &cq); + if (status != UCS_OK) { + return status; + } /* create a dummy qp in order to get a unique qp_num to provide to librdmacm */ - status = uct_rdmacm_cm_create_dummy_cq_qp(cep->id, &cq, &qp); + status = uct_rdmacm_cm_create_dummy_qp(cep->id, cq, &cep->qp); if (status != UCS_OK) { return status; } - cep->cq = cq; - cep->qp = qp; - conn_param->qp_num = qp->qp_num; + conn_param->qp_num = cep->qp->qp_num; return UCS_OK; } ucs_status_t uct_rdmacm_cm_ep_pack_cb(uct_rdmacm_cm_ep_t *cep, - struct rdma_conn_param *conn_param) + void *private_data, + size_t *priv_data_length_p) { - uct_rdmacm_priv_data_hdr_t *hdr; - ucs_status_t status; - char dev_name[UCT_DEVICE_NAME_MAX]; - size_t priv_data_ret; uct_cm_ep_priv_data_pack_args_t pack_args; - uct_rdmacm_cm_id_to_dev_name(cep->id, dev_name); - /* Pack data to send inside rdmacm's conn_param to the remote peer */ - hdr = (uct_rdmacm_priv_data_hdr_t*)conn_param->private_data; pack_args.field_mask = UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME; - ucs_strncpy_safe(pack_args.dev_name, dev_name, UCT_DEVICE_NAME_MAX); - - status = uct_cm_ep_pack_cb(&cep->super, cep->super.user_data, &pack_args, - hdr + 1, uct_rdmacm_cm_get_max_conn_priv(), - &priv_data_ret); + uct_rdmacm_cm_id_to_dev_name(cep->id, pack_args.dev_name); - if (status != UCS_OK) { - goto err; - } - - ucs_assert_always(priv_data_ret <= UINT8_MAX); - hdr->length = (uint8_t)priv_data_ret; - hdr->status = UCS_OK; - - conn_param->private_data_len = sizeof(*hdr) + hdr->length; + return uct_cm_ep_pack_cb(&cep->super, cep->super.user_data, &pack_args, + private_data, uct_rdmacm_cm_get_max_conn_priv(), + priv_data_length_p); +} - return UCS_OK; +ucs_status_t uct_rdmacm_cm_ep_resolve_cb(uct_rdmacm_cm_ep_t *cep) +{ + uct_cm_ep_resolve_args_t args; -err: - return status; + args.field_mask = UCT_CM_EP_RESOLVE_ARGS_FIELD_DEV_NAME; + uct_rdmacm_cm_id_to_dev_name(cep->id, args.dev_name); + return uct_cm_ep_resolve_cb(&cep->super, &args); } static ucs_status_t uct_rdamcm_cm_ep_client_init(uct_rdmacm_cm_ep_t *cep, const uct_ep_params_t *params) { - uct_rdmacm_cm_t *rdmacm_cm = uct_rdmacm_cm_ep_get_cm(cep); uct_cm_base_ep_t *cm_ep = &cep->super; - char ip_port_str[UCS_SOCKADDR_STRING_LEN]; + uct_rdmacm_cm_t *rdmacm_cm = uct_rdmacm_cm_ep_get_cm(cep); + char src_ip_port_str[UCS_SOCKADDR_STRING_LEN]; + char dst_ip_port_str[UCS_SOCKADDR_STRING_LEN]; char ep_str[UCT_RDMACM_EP_STRING_LEN]; ucs_status_t status; @@ -308,12 +275,17 @@ static ucs_status_t uct_rdamcm_cm_ep_client_init(uct_rdmacm_cm_ep_t *cep, * thread. Therefore, all ep fields have to be initialized before this * function is called. 
*/ ucs_trace("%s: rdma_resolve_addr on cm_id %p", - uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), cep->id); - if (rdma_resolve_addr(cep->id, NULL, (struct sockaddr*)params->sockaddr->addr, - 1000/* TODO */)) { - ucs_error("rdma_resolve_addr() to dst addr %s failed: %m", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + cep->id); + if (rdma_resolve_addr(cep->id, rdmacm_cm->config.src_addr, + (struct sockaddr*)params->sockaddr->addr, + uct_rdmacm_cm_get_timeout(rdmacm_cm))) { + ucs_error("rdma_resolve_addr(src=%s, dst=%s) failed (%d): %m", + ucs_sockaddr_str((struct sockaddr*)rdmacm_cm->config.src_addr, + src_ip_port_str, UCS_SOCKADDR_STRING_LEN), ucs_sockaddr_str((struct sockaddr*)params->sockaddr->addr, - ip_port_str, UCS_SOCKADDR_STRING_LEN)); + dst_ip_port_str, UCS_SOCKADDR_STRING_LEN), + errno); status = UCS_ERR_IO_ERROR; goto err_destroy_id; } @@ -329,13 +301,17 @@ static ucs_status_t uct_rdamcm_cm_ep_client_init(uct_rdmacm_cm_ep_t *cep, static ucs_status_t uct_rdamcm_cm_ep_server_init(uct_rdmacm_cm_ep_t *cep, const uct_ep_params_t *params) { - struct rdma_cm_event *event = (struct rdma_cm_event*)params->conn_request; - uct_rdmacm_cm_t *cm = uct_rdmacm_cm_ep_get_cm(cep); - uct_cm_base_ep_t *cm_ep = &cep->super; - struct rdma_conn_param conn_param; - ucs_status_t status; - char ep_str[UCT_RDMACM_EP_STRING_LEN]; + struct rdma_cm_event *event = (struct rdma_cm_event*)params->conn_request; + uct_rdmacm_cm_t *cm = uct_rdmacm_cm_ep_get_cm(cep); + uct_cm_base_ep_t *cm_ep = &cep->super; + uint8_t pack_priv_data[UCT_RDMACM_TCP_PRIV_DATA_LEN]; + size_t pack_priv_data_length; + const void *priv_data; + size_t priv_data_length; + ucs_status_t status; + char ep_str[UCT_RDMACM_EP_STRING_LEN]; + cep->id = event->id; cep->flags |= UCT_RDMACM_CM_EP_ON_SERVER; if (event->listen_id->channel != cm->ev_ch) { @@ -359,53 +335,126 @@ static ucs_status_t uct_rdamcm_cm_ep_server_init(uct_rdmacm_cm_ep_t *cep, uct_cm_ep_server_conn_notify_callback_t, ucs_empty_function); if (status != UCS_OK) { - goto err; + goto err_reject; } - cep->id = event->id; cep->id->context = cep; - memset(&conn_param, 0, sizeof(conn_param)); - conn_param.private_data = ucs_alloca(uct_rdmacm_cm_get_max_conn_priv() + - sizeof(uct_rdmacm_priv_data_hdr_t)); + if (ucs_test_all_flags(params->field_mask, + UCT_EP_PARAM_FIELD_PRIV_DATA | + UCT_EP_PARAM_FIELD_PRIV_DATA_LENGTH)) { + priv_data = params->private_data; + priv_data_length = params->private_data_length; + } else if (params->field_mask & UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB) { + status = uct_rdmacm_cm_ep_pack_cb(cep, pack_priv_data, + &pack_priv_data_length); + if (status != UCS_OK) { + goto err_reject; + } + + priv_data = &pack_priv_data; + priv_data_length = pack_priv_data_length; + } else { + priv_data = NULL; + priv_data_length = 0; + } - status = uct_rdmacm_cm_ep_pack_cb(cep, &conn_param); + status = uct_rdmacm_cm_ep_send_priv_data(cep, priv_data, priv_data_length); if (status != UCS_OK) { - goto err_reject; + goto err; + } + + return uct_rdmacm_cm_ack_event(event); +err_reject: + uct_rdmacm_cm_reject(cm, cep->id); +err: + uct_rdmacm_cm_destroy_id(cep->id); + cep->id = NULL; + uct_rdmacm_cm_ack_event(event); + return status; +} + +ucs_status_t +uct_rdmacm_cm_ep_send_priv_data(uct_rdmacm_cm_ep_t *cep, const void *priv_data, + size_t priv_data_length) +{ + struct rdma_conn_param conn_param = {0}; + uct_rdmacm_priv_data_hdr_t *hdr; + uct_cm_remote_data_t remote_data; + char ep_str[UCT_RDMACM_EP_STRING_LEN]; + ucs_status_t status; + + if (priv_data_length 
> uct_rdmacm_cm_get_max_conn_priv()) { + status = UCS_ERR_EXCEEDS_LIMIT; + goto err; } status = uct_rdamcm_cm_ep_set_qp_num(&conn_param, cep); if (status != UCS_OK) { - goto err_reject; + goto err; } - ucs_trace("%s: rdma_accept on cm_id %p", - uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), - event->id); + conn_param.private_data = ucs_alloca(UCT_RDMACM_TCP_PRIV_DATA_LEN); + conn_param.private_data_len = sizeof(*hdr) + priv_data_length; - if (rdma_accept(event->id, &conn_param)) { - uct_cm_ep_peer_error(&cep->super, "rdma_accept(on id=%p) failed: %m", - event->id); - uct_rdmacm_cm_ep_destroy_dummy_cq_qp(cep); - status = UCS_ERR_IO_ERROR; - goto err; + hdr = (uct_rdmacm_priv_data_hdr_t*)conn_param.private_data; + hdr->status = UCS_OK; + hdr->length = priv_data_length; + if (priv_data != NULL) { + memcpy(hdr + 1, priv_data, priv_data_length); + } + + if (cep->flags & UCT_RDMACM_CM_EP_ON_CLIENT) { + ucs_trace("%s rdma_connect on cm_id %p", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + cep->id); + if (rdma_connect(cep->id, &conn_param)) { + uct_cm_ep_peer_error(&cep->super, + "rdma_connect(on id=%p) failed: %m", cep->id); + status = UCS_ERR_IO_ERROR; + + /* If priv_pack_cb was specified, it means that the error was detected + * while sending CM private data during handling of the "route resolved" + * RDMACM event; otherwise, the error was detected when creating the UCT + * EP and should be returned to the user as the uct_ep_create() status */ + if (cep->super.priv_pack_cb != NULL) { + uct_rdmacm_cm_ep_set_failed(cep, &remote_data, status); + } + goto err; + } + } else { + ucs_assert(cep->flags & UCT_RDMACM_CM_EP_ON_SERVER); + ucs_trace("%s: rdma_accept on cm_id %p", + uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), + cep->id); + if (rdma_accept(cep->id, &conn_param)) { + uct_cm_ep_peer_error(&cep->super, + "rdma_accept(on id=%p) failed: %m", cep->id); + status = UCS_ERR_CONNECTION_RESET; + goto err; + } } - uct_rdmacm_cm_ack_event(event); return UCS_OK; -err_reject: - uct_rdmacm_cm_reject(event->id); err: - UCS_ASYNC_BLOCK(uct_rdmacm_cm_ep_get_async(cep)); - cep->status = status; - cep->flags |= UCT_RDMACM_CM_EP_FAILED; - UCS_ASYNC_UNBLOCK(uct_rdmacm_cm_ep_get_async(cep)); - uct_rdmacm_cm_destroy_id(event->id); - uct_rdmacm_cm_ack_event(event); + uct_rdmacm_cm_ep_destroy_dummy_qp(cep); + remote_data.field_mask = 0; return status; } +ucs_status_t +uct_rdmacm_cm_ep_connect(uct_ep_h ep, const uct_ep_connect_params_t *params) +{ + uct_rdmacm_cm_ep_t *cep = ucs_derived_of(ep, uct_rdmacm_cm_ep_t); + const void *priv_data; + size_t priv_data_length; + + uct_ep_connect_params_get(params, &priv_data, &priv_data_length); + return uct_rdmacm_cm_ep_send_priv_data(cep, priv_data, priv_data_length); +} + ucs_status_t uct_rdmacm_cm_ep_disconnect(uct_ep_h ep, unsigned flags) { uct_rdmacm_cm_ep_t *cep = ucs_derived_of(ep, uct_rdmacm_cm_ep_t); @@ -467,10 +516,11 @@ ucs_status_t uct_rdmacm_cm_ep_disconnect(uct_ep_h ep, unsigned flags) goto out; } - ucs_debug("%s: (id=%p) disconnecting from peer :%s", + ucs_debug("%s: (id=%p) disconnected from peer %s", uct_rdmacm_cm_ep_str(cep, ep_str, UCT_RDMACM_EP_STRING_LEN), - cep->id, ucs_sockaddr_str(rdma_get_peer_addr(cep->id), ip_port_str, - UCS_SOCKADDR_STRING_LEN)); + cep->id, + ucs_sockaddr_str(rdma_get_peer_addr(cep->id), ip_port_str, + UCS_SOCKADDR_STRING_LEN)); status = UCS_OK; out: @@ -485,10 +535,10 @@ UCS_CLASS_INIT_FUNC(uct_rdmacm_cm_ep_t, const uct_ep_params_t *params) UCS_CLASS_CALL_SUPER_INIT(uct_cm_base_ep_t, params); - 
self->cq = NULL; self->qp = NULL; self->flags = 0; self->status = UCS_OK; + self->id = NULL; if (params->field_mask & UCT_EP_PARAM_FIELD_SOCKADDR) { status = uct_rdamcm_cm_ep_client_init(self, params); @@ -522,7 +572,7 @@ UCS_CLASS_CLEANUP_FUNC(uct_rdmacm_cm_ep_t) UCS_ASYNC_BLOCK(worker_priv->async); - uct_rdmacm_cm_ep_destroy_dummy_cq_qp(self); + uct_rdmacm_cm_ep_destroy_dummy_qp(self); /* rdma_destroy_id() cleans all events not yet reported on progress thread, * so no events would be reported to the user after destroying the id */ @@ -531,6 +581,6 @@ UCS_CLASS_CLEANUP_FUNC(uct_rdmacm_cm_ep_t) UCS_ASYNC_UNBLOCK(worker_priv->async); } -UCS_CLASS_DEFINE(uct_rdmacm_cm_ep_t, uct_base_ep_t); +UCS_CLASS_DEFINE(uct_rdmacm_cm_ep_t, uct_cm_base_ep_t); UCS_CLASS_DEFINE_NEW_FUNC(uct_rdmacm_cm_ep_t, uct_ep_t, const uct_ep_params_t *); UCS_CLASS_DEFINE_DELETE_FUNC(uct_rdmacm_cm_ep_t, uct_ep_t); diff --git a/src/uct/ib/rdmacm/rdmacm_cm_ep.h b/src/uct/ib/rdmacm/rdmacm_cm_ep.h index b37ae2d8213..4f533c59036 100644 --- a/src/uct/ib/rdmacm/rdmacm_cm_ep.h +++ b/src/uct/ib/rdmacm/rdmacm_cm_ep.h @@ -1,11 +1,13 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ #include "rdmacm_listener.h" +#include + /** * RDMACM endpoint that is opened on a connection manager @@ -13,7 +15,6 @@ typedef struct uct_rdmacm_cm_ep { uct_cm_base_ep_t super; struct rdma_cm_id *id; /* The rdmacm id that is created per this ep */ - struct ibv_cq *cq; /* Dummy cq used for creating a dummy qp */ struct ibv_qp *qp; /* Dummy qp used for generating a unique qp_num */ uint8_t flags; ucs_status_t status; @@ -50,19 +51,30 @@ ucs_async_context_t *uct_rdmacm_cm_ep_get_async(uct_rdmacm_cm_ep_t *cep) return uct_rdmacm_cm_get_async(uct_rdmacm_cm_ep_get_cm(cep)); } +static inline int uct_rdmacm_cm_get_timeout(uct_rdmacm_cm_t *cm) +{ + return UCS_MSEC_PER_SEC * cm->config.timeout; +} + UCS_CLASS_DECLARE_NEW_FUNC(uct_rdmacm_cm_ep_t, uct_ep_t, const uct_ep_params_t *); UCS_CLASS_DECLARE_DELETE_FUNC(uct_rdmacm_cm_ep_t, uct_ep_t); +ucs_status_t +uct_rdmacm_cm_ep_send_priv_data(uct_rdmacm_cm_ep_t *cep, const void *priv_data, + size_t priv_data_length); + +ucs_status_t uct_rdmacm_cm_ep_connect(uct_ep_h ep, + const uct_ep_connect_params_t *params); + ucs_status_t uct_rdmacm_cm_ep_disconnect(uct_ep_h ep, unsigned flags); ucs_status_t uct_rdmacm_cm_ep_conn_notify(uct_ep_h ep); ucs_status_t uct_rdmacm_cm_ep_pack_cb(uct_rdmacm_cm_ep_t *cep, - struct rdma_conn_param *conn_param); + void *private_data, + size_t *priv_data_length); -ucs_status_t -uct_rdamcm_cm_ep_set_qp_num(struct rdma_conn_param *conn_param, - uct_rdmacm_cm_ep_t *cep); +ucs_status_t uct_rdmacm_cm_ep_resolve_cb(uct_rdmacm_cm_ep_t *cep); void uct_rdmacm_cm_ep_error_cb(uct_rdmacm_cm_ep_t *cep, uct_cm_remote_data_t *remote_data, diff --git a/src/uct/ib/rdmacm/rdmacm_component.c b/src/uct/ib/rdmacm/rdmacm_component.c new file mode 100644 index 00000000000..082331962df --- /dev/null +++ b/src/uct/ib/rdmacm/rdmacm_component.c @@ -0,0 +1,59 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2017-2021. ALL RIGHTS RESERVED. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "rdmacm_cm.h" + + +static ucs_config_field_t uct_rdmacm_cm_config_table[] = { + {"CM_", "", NULL, + ucs_offsetof(uct_rdmacm_cm_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_cm_config_table)}, + + {"SOURCE_ADDRESS", "", + "If non-empty, specifies the local source address (IPv4 or IPv6) to use\n" + "when creating a client connection", + ucs_offsetof(uct_rdmacm_cm_config_t, src_addr), UCS_CONFIG_TYPE_STRING}, + + {"TIMEOUT", "10s", + "Timeout for RDMA address and route resolution operations", + ucs_offsetof(uct_rdmacm_cm_config_t, timeout), UCS_CONFIG_TYPE_TIME}, + + {NULL} +}; + +static ucs_status_t +uct_rdmacm_query_md_resources(uct_component_t *component, + uct_md_resource_desc_t **resources_p, + unsigned *num_resources_p) +{ + *resources_p = NULL; + *num_resources_p = 0; + return UCS_OK; +} + +uct_component_t uct_rdmacm_component = { + .query_md_resources = uct_rdmacm_query_md_resources, + .md_open = ucs_empty_function_return_unsupported, + .cm_open = UCS_CLASS_NEW_FUNC_NAME(uct_rdmacm_cm_t), + .rkey_unpack = ucs_empty_function_return_unsupported, + .rkey_ptr = ucs_empty_function_return_unsupported, + .rkey_release = ucs_empty_function_return_success, + .name = "rdmacm", + .md_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, + .cm_config = { + .name = "RDMA-CM connection manager", + .prefix = "RDMA_CM_", + .table = uct_rdmacm_cm_config_table, + .size = sizeof(uct_rdmacm_cm_config_t), + }, + .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_rdmacm_component), + .flags = UCT_COMPONENT_FLAG_CM +}; + +UCT_COMPONENT_REGISTER(&uct_rdmacm_component) diff --git a/src/uct/ib/rdmacm/rdmacm_def.h b/src/uct/ib/rdmacm/rdmacm_def.h deleted file mode 100644 index 6220be24665..00000000000 --- a/src/uct/ib/rdmacm/rdmacm_def.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * * Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. - * * See file LICENSE for terms. 
- * */ - -#ifndef UCT_RDMACM_H -#define UCT_RDMACM_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define UCT_RDMACM_TL_NAME "rdmacm" -#define UCT_RDMACM_UDP_PRIV_DATA_LEN 136 /** See rdma_accept(3) */ -#define UCT_RDMACM_TCP_PRIV_DATA_LEN 56 /** See rdma_connect(3) */ -#define UCT_RDMACM_EP_FLAGS_STRING_LEN 128 /** A string to hold the - representation of the ep flags */ -#define UCT_RDMACM_EP_STRING_LEN 192 /** A string to hold the ep info */ - -typedef struct uct_rdmacm_iface uct_rdmacm_iface_t; -typedef struct uct_rdmacm_ep uct_rdmacm_ep_t; - -typedef struct uct_rdmacm_priv_data_hdr { - uint8_t length; /* length of the private data */ - uint8_t status; -} uct_rdmacm_priv_data_hdr_t; - -typedef struct uct_rdmacm_ctx { - struct rdma_cm_id *cm_id; - uct_rdmacm_ep_t *ep; - ucs_list_link_t list; /* for list of used cm_ids */ -} uct_rdmacm_ctx_t; - -size_t uct_rdmacm_cm_get_max_conn_priv(); - -ucs_status_t uct_rdmacm_resolve_addr(struct rdma_cm_id *cm_id, - struct sockaddr *addr, int timeout_ms, - ucs_log_level_t log_level); - -ucs_status_t uct_rdmacm_ep_resolve_addr(uct_rdmacm_ep_t *ep); - -ucs_status_t uct_rdmacm_ep_set_cm_id(uct_rdmacm_iface_t *iface, uct_rdmacm_ep_t *ep); - -static inline void uct_rdmacm_cm_id_to_dev_name(struct rdma_cm_id *cm_id, char *dev_name) -{ - ucs_snprintf_zero(dev_name, UCT_DEVICE_NAME_MAX, "%s:%d", - ibv_get_device_name(cm_id->verbs->device), cm_id->port_num); -} - -#endif /* UCT_RDMACM_H */ diff --git a/src/uct/ib/rdmacm/rdmacm_ep.c b/src/uct/ib/rdmacm/rdmacm_ep.c deleted file mode 100644 index fd170e15034..00000000000 --- a/src/uct/ib/rdmacm/rdmacm_ep.c +++ /dev/null @@ -1,268 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2017-2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "rdmacm_ep.h" - - -#define UCT_RDMACM_CB_FLAGS_CHECK(_flags) \ - do { \ - UCT_CB_FLAGS_CHECK(_flags); \ - if (!((_flags) & UCT_CB_FLAG_ASYNC)) { \ - return UCS_ERR_UNSUPPORTED; \ - } \ - } while (0) - - -ucs_status_t uct_rdmacm_ep_resolve_addr(uct_rdmacm_ep_t *ep) -{ - uct_rdmacm_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_rdmacm_iface_t); - ucs_status_t status; - - UCS_ASYNC_BLOCK(iface->super.worker->async); - - status = uct_rdmacm_resolve_addr(ep->cm_id_ctx->cm_id, - (struct sockaddr *)&ep->remote_addr, - UCS_MSEC_PER_SEC * iface->config.addr_resolve_timeout, - UCS_LOG_LEVEL_ERROR); - - UCS_ASYNC_UNBLOCK(iface->super.worker->async); - return status; -} - -ucs_status_t uct_rdmacm_ep_set_cm_id(uct_rdmacm_iface_t *iface, uct_rdmacm_ep_t *ep) -{ - ucs_status_t status; - - UCS_ASYNC_BLOCK(iface->super.worker->async); - - /* create a cm_id for the client side */ - if (iface->cm_id_quota > 0) { - /* Create an id for this interface. Events associated with this id will be - * reported on the event_channel that was created on iface init. */ - ep->cm_id_ctx = ucs_malloc(sizeof(*ep->cm_id_ctx), "client cm_id_ctx"); - if (ep->cm_id_ctx == NULL) { - status = UCS_ERR_NO_MEMORY; - goto out; - } - - if (rdma_create_id(iface->event_ch, &ep->cm_id_ctx->cm_id, - ep->cm_id_ctx, RDMA_PS_UDP)) { - ucs_error("rdma_create_id() failed: %m"); - status = UCS_ERR_IO_ERROR; - goto out_free; - } - - ep->cm_id_ctx->ep = ep; - ucs_list_add_tail(&iface->used_cm_ids_list, &ep->cm_id_ctx->list); - iface->cm_id_quota--; - ucs_debug("ep %p, new cm_id %p. 
cm_id_in_quota %d", ep, - ep->cm_id_ctx->cm_id, iface->cm_id_quota); - status = UCS_OK; - goto out; - } else { - ep->cm_id_ctx = NULL; - status = UCS_ERR_NO_RESOURCE; - goto out; - } - -out_free: - ucs_free(ep->cm_id_ctx); -out: - UCS_ASYNC_UNBLOCK(iface->super.worker->async); - return status; -} - -static inline void uct_rdmacm_ep_add_to_pending(uct_rdmacm_iface_t *iface, uct_rdmacm_ep_t *ep) -{ - UCS_ASYNC_BLOCK(iface->super.worker->async); - ucs_list_add_tail(&iface->pending_eps_list, &ep->list_elem); - ep->is_on_pending = 1; - UCS_ASYNC_UNBLOCK(iface->super.worker->async); -} - -static UCS_CLASS_INIT_FUNC(uct_rdmacm_ep_t, const uct_ep_params_t *params) -{ - uct_rdmacm_iface_t *iface = ucs_derived_of(params->iface, - uct_rdmacm_iface_t); - const ucs_sock_addr_t *sockaddr = params->sockaddr; - char ip_port_str[UCS_SOCKADDR_STRING_LEN]; - ucs_status_t status; - - UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super); - - if (iface->is_server) { - /* TODO allow an interface to be used both for server and client */ - return UCS_ERR_UNSUPPORTED; - } - - if (!(params->field_mask & UCT_EP_PARAM_FIELD_SOCKADDR)) { - return UCS_ERR_INVALID_PARAM; - } - - UCT_RDMACM_CB_FLAGS_CHECK((params->field_mask & - UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS) ? - params->sockaddr_cb_flags : 0); - - /* Initialize these fields before calling rdma_resolve_addr to avoid a race - * where they are used before being initialized (from the async thread - * - after an RDMA_CM_EVENT_ROUTE_RESOLVED event) */ - self->pack_cb = (params->field_mask & - UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB) ? - params->sockaddr_pack_cb : NULL; - self->pack_cb_arg = (params->field_mask & - UCT_EP_PARAM_FIELD_USER_DATA) ? - params->user_data : NULL; - self->pack_cb_flags = (params->field_mask & - UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS) ? - params->sockaddr_cb_flags : 0; - pthread_mutex_init(&self->ops_mutex, NULL); - ucs_queue_head_init(&self->ops); - - /* Save the remote address */ - if (sockaddr->addr->sa_family == AF_INET) { - memcpy(&self->remote_addr, sockaddr->addr, sizeof(struct sockaddr_in)); - } else if (sockaddr->addr->sa_family == AF_INET6) { - memcpy(&self->remote_addr, sockaddr->addr, sizeof(struct sockaddr_in6)); - } else { - ucs_error("rdmacm ep: unknown remote sa_family=%d", sockaddr->addr->sa_family); - status = UCS_ERR_IO_ERROR; - goto err; - } - - self->slow_prog_id = UCS_CALLBACKQ_ID_NULL; - - status = uct_rdmacm_ep_set_cm_id(iface, self); - if (status == UCS_ERR_NO_RESOURCE) { - goto add_to_pending; - } else if (status != UCS_OK) { - goto err; - } - - self->is_on_pending = 0; - - /* After rdma_resolve_addr(), the client will wait for an - * RDMA_CM_EVENT_ADDR_RESOLVED event on the event_channel - * to proceed with the connection establishment. - * This event will be retrieved from the event_channel by the async thread. - * All endpoints share the interface's event_channel. */ - status = uct_rdmacm_ep_resolve_addr(self); - if (status != UCS_OK) { - goto err; - } - - goto out; - -add_to_pending: - /* Add the ep to the pending queue of eps since there is no - * available cm_id for it */ - uct_rdmacm_ep_add_to_pending(iface, self); -out: - ucs_debug("created an RDMACM endpoint on iface %p. 
event_channel: %p, " - "iface cm_id: %p remote addr: %s", - iface, iface->event_ch, iface->cm_id, - ucs_sockaddr_str((struct sockaddr *)sockaddr->addr, - ip_port_str, UCS_SOCKADDR_STRING_LEN)); - self->status = UCS_INPROGRESS; - return UCS_OK; - -err: - pthread_mutex_destroy(&self->ops_mutex); - - return status; -} - -static UCS_CLASS_CLEANUP_FUNC(uct_rdmacm_ep_t) -{ - uct_rdmacm_iface_t *iface = ucs_derived_of(self->super.super.iface, uct_rdmacm_iface_t); - uct_rdmacm_ctx_t *cm_id_ctx; - - ucs_debug("rdmacm_ep %p: destroying", self); - - UCS_ASYNC_BLOCK(iface->super.worker->async); - if (self->is_on_pending) { - ucs_list_del(&self->list_elem); - self->is_on_pending = 0; - } - - /* remove the slow progress function in case it was placed on the slow progress - * chain but wasn't invoked yet */ - uct_worker_progress_unregister_safe(&iface->super.worker->super, - &self->slow_prog_id); - - pthread_mutex_destroy(&self->ops_mutex); - if (!ucs_queue_is_empty(&self->ops)) { - ucs_warn("destroying endpoint %p with not completed operations", self); - } - - /* mark this ep as destroyed so that arriving events on it won't try to - * use it */ - if (self->cm_id_ctx != NULL) { - cm_id_ctx = self->cm_id_ctx->cm_id->context; - cm_id_ctx->ep = NULL; - ucs_debug("ep destroy: cm_id %p", cm_id_ctx->cm_id); - } - UCS_ASYNC_UNBLOCK(iface->super.worker->async); -} - -UCS_CLASS_DEFINE(uct_rdmacm_ep_t, uct_base_ep_t) -UCS_CLASS_DEFINE_NEW_FUNC(uct_rdmacm_ep_t, uct_ep_t, const uct_ep_params_t *); -UCS_CLASS_DEFINE_DELETE_FUNC(uct_rdmacm_ep_t, uct_ep_t); - -static unsigned uct_rdmacm_client_err_handle_progress(void *arg) -{ - uct_rdmacm_ep_t *rdmacm_ep = arg; - uct_rdmacm_iface_t *iface = ucs_derived_of(rdmacm_ep->super.super.iface, - uct_rdmacm_iface_t); - - ucs_trace_func("err_handle ep=%p", rdmacm_ep); - UCS_ASYNC_BLOCK(iface->super.worker->async); - - rdmacm_ep->slow_prog_id = UCS_CALLBACKQ_ID_NULL; - uct_set_ep_failed(&UCS_CLASS_NAME(uct_rdmacm_ep_t), &rdmacm_ep->super.super, - rdmacm_ep->super.super.iface, rdmacm_ep->status); - - UCS_ASYNC_UNBLOCK(iface->super.worker->async); - return 0; -} - -void uct_rdmacm_ep_set_failed(uct_iface_t *iface, uct_ep_h ep, ucs_status_t status) -{ - uct_rdmacm_iface_t *rdmacm_iface = ucs_derived_of(iface, uct_rdmacm_iface_t); - uct_rdmacm_ep_t *rdmacm_ep = ucs_derived_of(ep, uct_rdmacm_ep_t); - - if (rdmacm_iface->super.err_handler_flags & UCT_CB_FLAG_ASYNC) { - uct_set_ep_failed(&UCS_CLASS_NAME(uct_rdmacm_ep_t), &rdmacm_ep->super.super, - &rdmacm_iface->super.super, status); - } else { - /* invoke the error handling flow from the main thread */ - rdmacm_ep->status = status; - uct_worker_progress_register_safe(&rdmacm_iface->super.worker->super, - uct_rdmacm_client_err_handle_progress, - rdmacm_ep, UCS_CALLBACKQ_FLAG_ONESHOT, - &rdmacm_ep->slow_prog_id); - } -} - -/** - * Caller must lock ep->ops_mutex - */ -void uct_rdmacm_ep_invoke_completions(uct_rdmacm_ep_t *ep, ucs_status_t status) -{ - uct_rdmacm_ep_op_t *op; - - ucs_assert(pthread_mutex_trylock(&ep->ops_mutex) == EBUSY); - - ucs_queue_for_each_extract(op, &ep->ops, queue_elem, 1) { - pthread_mutex_unlock(&ep->ops_mutex); - uct_invoke_completion(op->user_comp, status); - ucs_free(op); - pthread_mutex_lock(&ep->ops_mutex); - } - /* coverity[missing_unlock] */ -} diff --git a/src/uct/ib/rdmacm/rdmacm_ep.h b/src/uct/ib/rdmacm/rdmacm_ep.h deleted file mode 100644 index 3eb323c288e..00000000000 --- a/src/uct/ib/rdmacm/rdmacm_ep.h +++ /dev/null @@ -1,44 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2017-2019. 
ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#ifndef UCT_RDMACM_EP_H -#define UCT_RDMACM_EP_H - -#include "rdmacm_iface.h" - - -typedef struct uct_rdmacm_ep_op uct_rdmacm_ep_op_t; - -struct uct_rdmacm_ep_op { - ucs_queue_elem_t queue_elem; - uct_completion_t *user_comp; -}; - - -struct uct_rdmacm_ep { - uct_base_ep_t super; - uct_cm_ep_priv_data_pack_callback_t pack_cb; - void *pack_cb_arg; - uint32_t pack_cb_flags; - int is_on_pending; - - pthread_mutex_t ops_mutex; /* guards ops and status */ - ucs_queue_head_t ops; - ucs_status_t status; /* client EP status */ - - ucs_list_link_t list_elem; /* for the pending_eps_list */ - struct sockaddr_storage remote_addr; - uct_worker_cb_id_t slow_prog_id; - uct_rdmacm_ctx_t *cm_id_ctx; -}; - -UCS_CLASS_DECLARE_NEW_FUNC(uct_rdmacm_ep_t, uct_ep_t, const uct_ep_params_t *); -UCS_CLASS_DECLARE_DELETE_FUNC(uct_rdmacm_ep_t, uct_ep_t); - -void uct_rdmacm_ep_set_failed(uct_iface_t *iface, uct_ep_h ep, ucs_status_t status); - -void uct_rdmacm_ep_invoke_completions(uct_rdmacm_ep_t *ep, ucs_status_t status); - -#endif diff --git a/src/uct/ib/rdmacm/rdmacm_iface.c b/src/uct/ib/rdmacm/rdmacm_iface.c deleted file mode 100644 index d13497ceb43..00000000000 --- a/src/uct/ib/rdmacm/rdmacm_iface.c +++ /dev/null @@ -1,640 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2017-2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "rdmacm_iface.h" -#include "rdmacm_ep.h" -#include -#include - - -enum uct_rdmacm_process_event_flags { - UCT_RDMACM_PROCESS_EVENT_DESTROY_CM_ID_FLAG = UCS_BIT(0), - UCT_RDMACM_PROCESS_EVENT_ACK_EVENT_FLAG = UCS_BIT(1) -}; - -static ucs_config_field_t uct_rdmacm_iface_config_table[] = { - {"", "", NULL, - ucs_offsetof(uct_rdmacm_iface_config_t, super), - UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, - - {"BACKLOG", "1024", - "Maximum number of pending connections for an rdma_cm_id.", - ucs_offsetof(uct_rdmacm_iface_config_t, backlog), UCS_CONFIG_TYPE_UINT}, - - {"CM_ID_QUOTA", "64", - "How many rdma_cm connections can progress simultaneously.", - ucs_offsetof(uct_rdmacm_iface_config_t, cm_id_quota), UCS_CONFIG_TYPE_UINT}, - - {NULL} -}; - -static UCS_CLASS_DECLARE_DELETE_FUNC(uct_rdmacm_iface_t, uct_iface_t); - -static ucs_status_t uct_rdmacm_iface_query(uct_iface_h tl_iface, - uct_iface_attr_t *iface_attr) -{ - uct_rdmacm_iface_t *rdmacm_iface = ucs_derived_of(tl_iface, uct_rdmacm_iface_t); - struct sockaddr *addr; - ucs_status_t status; - - uct_base_iface_query(&rdmacm_iface->super, iface_attr); - - iface_attr->iface_addr_len = sizeof(ucs_sock_addr_t); - iface_attr->device_addr_len = 0; - iface_attr->cap.flags = UCT_IFACE_FLAG_CONNECT_TO_SOCKADDR | - UCT_IFACE_FLAG_CB_ASYNC | - UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE; - /* User's private data size is UCT_RDMACM_UDP_PRIV_DATA_LEN minus room for - * the private_data header (to hold the length of the data) */ - iface_attr->max_conn_priv = UCT_RDMACM_MAX_CONN_PRIV; - - if (rdmacm_iface->is_server) { - addr = rdma_get_local_addr(rdmacm_iface->cm_id); - status = ucs_sockaddr_copy((struct sockaddr *)&iface_attr->listen_sockaddr, - addr); - if (status != UCS_OK) { - return status; - } - } - - return UCS_OK; -} - -static ucs_status_t uct_rdmacm_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *iface_addr) -{ - ucs_sock_addr_t *rdmacm_addr = (ucs_sock_addr_t *)iface_addr; - - rdmacm_addr->addr = NULL; - rdmacm_addr->addrlen = 0; - return UCS_OK; -} - -static ucs_status_t 
uct_rdmacm_accept(struct rdma_cm_id *id) -{ - /* The server will not send any reply data back to the client */ - struct rdma_conn_param conn_param = {0}; - - /* Accepting the connection will generate the RDMA_CM_EVENT_ESTABLISHED - * event on the client side. */ - if (rdma_accept(id, &conn_param)) { - ucs_error("rdma_accept(to id=%p) failed: %m", id); - return UCS_ERR_IO_ERROR; - } - - return UCS_OK; -} - -static ucs_status_t uct_rdmacm_iface_accept(uct_iface_h tl_iface, - uct_conn_request_h conn_request) -{ - struct rdma_cm_event *event = conn_request; - ucs_status_t status; - - ucs_trace("accepting event %p with id %p", event, event->id); - status = uct_rdmacm_accept(event->id); - rdma_destroy_id(event->id); - rdma_ack_cm_event(event); - - return status; -} - -static ucs_status_t uct_rdmacm_iface_reject(uct_iface_h tl_iface, - uct_conn_request_h conn_request) -{ - struct rdma_cm_event *event = conn_request; - ucs_status_t status = UCS_OK; - uct_rdmacm_priv_data_hdr_t hdr = { - .length = 0, - .status = (uint8_t)UCS_ERR_REJECTED - }; - - ucs_trace("rejecting event %p with id %p", event, event->id); - if (rdma_reject(event->id, &hdr, sizeof(hdr))) { - ucs_warn("rdma_reject(id=%p) failed: %m", event->id); - status = UCS_ERR_IO_ERROR; - } - - rdma_destroy_id(event->id); - rdma_ack_cm_event(event); - return status; -} - -static ucs_status_t uct_rdmacm_ep_flush(uct_ep_h tl_ep, unsigned flags, - uct_completion_t *comp) -{ - uct_rdmacm_ep_t *ep = ucs_derived_of(tl_ep, uct_rdmacm_ep_t); - ucs_status_t status; - uct_rdmacm_ep_op_t *op; - - pthread_mutex_lock(&ep->ops_mutex); - status = ep->status; - if ((status == UCS_INPROGRESS) && (comp != NULL)) { - op = ucs_malloc(sizeof(*op), "uct_rdmacm_ep_flush op"); - if (op != NULL) { - op->user_comp = comp; - ucs_queue_push(&ep->ops, &op->queue_elem); - } else { - status = UCS_ERR_NO_MEMORY; - } - } - pthread_mutex_unlock(&ep->ops_mutex); - - return status; -} - -static uct_iface_ops_t uct_rdmacm_iface_ops = { - .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_rdmacm_ep_t), - .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_rdmacm_ep_t), - .ep_flush = uct_rdmacm_ep_flush, - .ep_fence = uct_base_ep_fence, - .ep_pending_purge = ucs_empty_function, - .iface_accept = uct_rdmacm_iface_accept, - .iface_reject = uct_rdmacm_iface_reject, - .iface_progress_enable = (uct_iface_progress_enable_func_t)ucs_empty_function_return_success, - .iface_progress_disable = (uct_iface_progress_disable_func_t)ucs_empty_function_return_success, - .iface_progress = ucs_empty_function_return_zero, - .iface_flush = uct_base_iface_flush, - .iface_fence = uct_base_iface_fence, - .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_rdmacm_iface_t), - .iface_query = uct_rdmacm_iface_query, - .iface_is_reachable = (uct_iface_is_reachable_func_t)ucs_empty_function_return_zero, - .iface_get_device_address = (uct_iface_get_device_address_func_t)ucs_empty_function_return_success, - .iface_get_address = uct_rdmacm_iface_get_address -}; - -ucs_status_t uct_rdmacm_resolve_addr(struct rdma_cm_id *cm_id, - struct sockaddr *addr, int timeout_ms, - ucs_log_level_t log_level) -{ - char ip_port_str[UCS_SOCKADDR_STRING_LEN]; - - if (rdma_resolve_addr(cm_id, NULL, addr, timeout_ms)) { - ucs_log(log_level, "rdma_resolve_addr(addr=%s) failed: %m", - ucs_sockaddr_str(addr, ip_port_str, UCS_SOCKADDR_STRING_LEN)); - return UCS_ERR_IO_ERROR; - } - return UCS_OK; -} - -void uct_rdmacm_iface_client_start_next_ep(uct_rdmacm_iface_t *iface) -{ - ucs_status_t status; - uct_rdmacm_ep_t *ep, *tmp; - - 
UCS_ASYNC_BLOCK(iface->super.worker->async); - - /* try to start an ep from the pending eps list */ - ucs_list_for_each_safe(ep, tmp, &iface->pending_eps_list, list_elem) { - status = uct_rdmacm_ep_set_cm_id(iface, ep); - if (status != UCS_OK) { - continue; - } - - ucs_list_del(&ep->list_elem); - ep->is_on_pending = 0; - - status = uct_rdmacm_ep_resolve_addr(ep); - if (status == UCS_OK) { - break; - } - - uct_rdmacm_ep_set_failed(&iface->super.super, &ep->super.super, status); - } - - UCS_ASYNC_UNBLOCK(iface->super.worker->async); -} - -static void uct_rdmacm_client_handle_failure(uct_rdmacm_iface_t *iface, - uct_rdmacm_ep_t *ep, - ucs_status_t status) -{ - ucs_assert(!iface->is_server); - if (ep != NULL) { - pthread_mutex_lock(&ep->ops_mutex); - uct_rdmacm_ep_set_failed(&iface->super.super, &ep->super.super, status); - uct_rdmacm_ep_invoke_completions(ep, status); - pthread_mutex_unlock(&ep->ops_mutex); - } -} - -static void uct_rdmacm_iface_process_conn_req(uct_rdmacm_iface_t *iface, - struct rdma_cm_event *event, - struct sockaddr *remote_addr) -{ - uct_rdmacm_priv_data_hdr_t *hdr; - - hdr = (uct_rdmacm_priv_data_hdr_t*) event->param.ud.private_data; - ucs_assert(hdr->status == UCS_OK); - - /* TODO check the iface's cb_flags to determine when to invoke this callback. - * currently only UCT_CB_FLAG_ASYNC is supported so the cb is invoked from here */ - iface->conn_request_cb(&iface->super.super, iface->conn_request_arg, - /* connection request*/ - event, - /* private data */ - UCS_PTR_BYTE_OFFSET(event->param.ud.private_data, - sizeof(uct_rdmacm_priv_data_hdr_t)), - /* length */ - hdr->length); -} - -/** - * Release a cm_id. This function should be called when the async context - * is locked. - */ -static void uct_rdmacm_iface_release_cm_id(uct_rdmacm_iface_t *iface, - uct_rdmacm_ctx_t **cm_id_ctx_p) -{ - uct_rdmacm_ctx_t *cm_id_ctx = *cm_id_ctx_p; - - ucs_trace("destroying cm_id %p", cm_id_ctx->cm_id); - - ucs_list_del(&cm_id_ctx->list); - if (cm_id_ctx->ep != NULL) { - cm_id_ctx->ep->cm_id_ctx = NULL; - } - rdma_destroy_id(cm_id_ctx->cm_id); - ucs_free(cm_id_ctx); - iface->cm_id_quota++; - - *cm_id_ctx_p = NULL; -} - -static unsigned -uct_rdmacm_iface_process_event(uct_rdmacm_iface_t *iface, - struct rdma_cm_event *event) -{ - struct sockaddr *remote_addr = rdma_get_peer_addr(event->id); - uct_rdmacm_md_t *rdmacm_md = (uct_rdmacm_md_t *)iface->super.md; - unsigned ret_flags = UCT_RDMACM_PROCESS_EVENT_ACK_EVENT_FLAG; - uct_rdmacm_ep_t *ep = NULL; - uct_cm_ep_priv_data_pack_args_t pack_args; - char ip_port_str[UCS_SOCKADDR_STRING_LEN]; - char dev_name[UCT_DEVICE_NAME_MAX]; - uct_rdmacm_priv_data_hdr_t *hdr; - struct rdma_conn_param conn_param; - uct_rdmacm_ctx_t *cm_id_ctx; - ssize_t priv_data_ret; - ucs_status_t status; - - if (iface->is_server) { - ucs_assert((iface->cm_id == event->id) || - ((event->event == RDMA_CM_EVENT_CONNECT_REQUEST) && - (iface->cm_id == event->listen_id))); - } else { - cm_id_ctx = event->id->context; - ep = cm_id_ctx->ep; - } - - ucs_trace("rdmacm event (fd=%d cm_id %p) on %s (ep=%p): %s. Peer: %s.", - iface->event_ch->fd, event->id, (iface->is_server ? 
"server" : "client"), - ep, rdma_event_str(event->event), - ucs_sockaddr_str(remote_addr, ip_port_str, UCS_SOCKADDR_STRING_LEN)); - - status = UCS_ERR_UNREACHABLE; - /* The following applies for rdma_cm_id of type RDMA_PS_UDP only */ - switch (event->event) { - case RDMA_CM_EVENT_ADDR_RESOLVED: - /* Client - resolve the route to the server */ - if (ep == NULL) { - /* received an event on an non-existing ep - an already destroyed ep */ - ret_flags |= UCT_RDMACM_PROCESS_EVENT_DESTROY_CM_ID_FLAG; - } else if (rdma_resolve_route(event->id, UCS_MSEC_PER_SEC * - rdmacm_md->addr_resolve_timeout)) { - ucs_error("rdma_resolve_route(to addr=%s) failed: %m", - ucs_sockaddr_str(remote_addr, ip_port_str, - UCS_SOCKADDR_STRING_LEN)); - ret_flags |= UCT_RDMACM_PROCESS_EVENT_DESTROY_CM_ID_FLAG; - uct_rdmacm_client_handle_failure(iface, ep, UCS_ERR_INVALID_ADDR); - } - break; - - case RDMA_CM_EVENT_ROUTE_RESOLVED: - /* Client - send a connection request to the server */ - if (ep == NULL) { - /* received an event on an non-existing ep - an already destroyed ep */ - ret_flags |= UCT_RDMACM_PROCESS_EVENT_DESTROY_CM_ID_FLAG; - } else { - memset(&conn_param, 0, sizeof(conn_param)); - conn_param.private_data = ucs_alloca(UCT_RDMACM_MAX_CONN_PRIV + - sizeof(uct_rdmacm_priv_data_hdr_t)); - - uct_rdmacm_cm_id_to_dev_name(ep->cm_id_ctx->cm_id, dev_name); - - hdr = (uct_rdmacm_priv_data_hdr_t*)conn_param.private_data; - pack_args.field_mask = UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME; - ucs_strncpy_safe(pack_args.dev_name, dev_name, UCT_DEVICE_NAME_MAX); - /* TODO check the ep's cb_flags to determine when to invoke this callback. - * currently only UCT_CB_FLAG_ASYNC is supported so the cb is invoked from here */ - priv_data_ret = ep->pack_cb(ep->pack_cb_arg, &pack_args, hdr + 1); - if (priv_data_ret < 0) { - ucs_trace("rdmacm client (iface=%p cm_id=%p fd=%d) failed to fill " - "private data. 
status: %s", - iface, event->id, iface->event_ch->fd, - ucs_status_string((ucs_status_t)priv_data_ret)); - ret_flags |= UCT_RDMACM_PROCESS_EVENT_DESTROY_CM_ID_FLAG; - uct_rdmacm_client_handle_failure(iface, ep, (ucs_status_t)priv_data_ret); - break; - } - - hdr->length = (uint8_t)priv_data_ret; - hdr->status = UCS_OK; - /* The private_data starts with the header of the user's private data - * and then the private data itself */ - conn_param.private_data_len = sizeof(*hdr) + hdr->length; - - if (rdma_connect(event->id, &conn_param)) { - ucs_error("rdma_connect(to addr=%s) failed: %m", - ucs_sockaddr_str(remote_addr, ip_port_str, - UCS_SOCKADDR_STRING_LEN)); - ret_flags |= UCT_RDMACM_PROCESS_EVENT_DESTROY_CM_ID_FLAG; - uct_rdmacm_client_handle_failure(iface, ep, - UCS_ERR_SOME_CONNECTS_FAILED); - } - } - break; - - case RDMA_CM_EVENT_CONNECT_REQUEST: - /* Server - handle a connection request from the client */ - ucs_assert(iface->is_server); - uct_rdmacm_iface_process_conn_req(iface, event, remote_addr); - ret_flags &= ~UCT_RDMACM_PROCESS_EVENT_ACK_EVENT_FLAG; - break; - - case RDMA_CM_EVENT_REJECTED: - /* Client - server rejected the connection request */ - ucs_warn("rdmacm connection request to %s rejected, id %p", - ucs_sockaddr_str(remote_addr, ip_port_str, - UCS_SOCKADDR_STRING_LEN), event->id); - - ret_flags |= UCT_RDMACM_PROCESS_EVENT_DESTROY_CM_ID_FLAG; - uct_rdmacm_client_handle_failure(iface, ep, UCS_ERR_REJECTED); - break; - - case RDMA_CM_EVENT_ESTABLISHED: - /* Client - connection is ready */ - ucs_assert(!iface->is_server); - ret_flags |= UCT_RDMACM_PROCESS_EVENT_DESTROY_CM_ID_FLAG; - if (ep != NULL) { - pthread_mutex_lock(&ep->ops_mutex); - ep->status = UCS_OK; - uct_rdmacm_ep_invoke_completions(ep, UCS_OK); - pthread_mutex_unlock(&ep->ops_mutex); - } - break; - - /* client error events */ - case RDMA_CM_EVENT_UNREACHABLE: - hdr = (uct_rdmacm_priv_data_hdr_t *)event->param.ud.private_data; - if ((hdr != NULL) && (event->param.ud.private_data_len > 0) && - ((ucs_status_t)hdr->status == UCS_ERR_REJECTED)) { - ucs_assert(hdr->length == 0); - ucs_assert(event->param.ud.private_data_len >= sizeof(*hdr)); - ucs_assert(!iface->is_server); - status = UCS_ERR_REJECTED; - } - /* Fall through */ - case RDMA_CM_EVENT_ADDR_ERROR: - case RDMA_CM_EVENT_ROUTE_ERROR: - case RDMA_CM_EVENT_CONNECT_RESPONSE: - /* client and server error events */ - case RDMA_CM_EVENT_CONNECT_ERROR: - case RDMA_CM_EVENT_DISCONNECTED: - /* Server/Client - connection was disconnected */ - if (status != UCS_ERR_REJECTED) { - ucs_error("received event %s. status = %d. 
Peer: %s.", - rdma_event_str(event->event), event->status, - ucs_sockaddr_str(remote_addr, ip_port_str, UCS_SOCKADDR_STRING_LEN)); - } - - if (!iface->is_server) { - ret_flags |= UCT_RDMACM_PROCESS_EVENT_DESTROY_CM_ID_FLAG; - uct_rdmacm_client_handle_failure(iface, ep, status); - } - break; - - default: - ucs_warn("unexpected RDMACM event: %d", event->event); - break; - } - - return ret_flags; -} - -static void uct_rdmacm_iface_event_handler(int fd, ucs_event_set_types_t events, - void *arg) -{ - uct_rdmacm_iface_t *iface = arg; - uct_rdmacm_ctx_t *cm_id_ctx = NULL; - struct rdma_cm_event *event; - unsigned proc_event_flags; - int ret; - - for (;;) { - /* Fetch an event */ - ret = rdma_get_cm_event(iface->event_ch, &event); - if (ret) { - /* EAGAIN (in a non-blocking rdma_get_cm_event) means that - * there are no more events */ - if (errno != EAGAIN) { - ucs_warn("rdma_get_cm_event() failed: %m"); - } - return; - } - - proc_event_flags = uct_rdmacm_iface_process_event(iface, event); - if (!iface->is_server) { - cm_id_ctx = (uct_rdmacm_ctx_t *)event->id->context; - } - - if (proc_event_flags & UCT_RDMACM_PROCESS_EVENT_ACK_EVENT_FLAG) { - ret = rdma_ack_cm_event(event); - if (ret) { - ucs_warn("rdma_ack_cm_event() failed: %m"); - } - } - - if ((proc_event_flags & UCT_RDMACM_PROCESS_EVENT_DESTROY_CM_ID_FLAG) && - (cm_id_ctx != NULL)) { - uct_rdmacm_iface_release_cm_id(iface, &cm_id_ctx); - uct_rdmacm_iface_client_start_next_ep(iface); - } - } -} - -static UCS_CLASS_INIT_FUNC(uct_rdmacm_iface_t, uct_md_h md, uct_worker_h worker, - const uct_iface_params_t *params, - const uct_iface_config_t *tl_config) -{ - uct_rdmacm_iface_config_t *config = ucs_derived_of(tl_config, uct_rdmacm_iface_config_t); - char ip_port_str[UCS_SOCKADDR_STRING_LEN]; - uct_rdmacm_md_t *rdmacm_md; - struct sockaddr *listen_addr; - ucs_status_t status; - - UCT_CHECK_PARAM(params->field_mask & UCT_IFACE_PARAM_FIELD_OPEN_MODE, - "UCT_IFACE_PARAM_FIELD_OPEN_MODE is not defined"); - - UCT_CHECK_PARAM((params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER) || - (params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_CLIENT), - "Invalid open mode %zu", params->open_mode); - - UCT_CHECK_PARAM(!(params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER) || - (params->field_mask & UCT_IFACE_PARAM_FIELD_SOCKADDR), - "UCT_IFACE_PARAM_FIELD_SOCKADDR is not defined for UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER"); - - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_rdmacm_iface_ops, md, worker, - params, tl_config - UCS_STATS_ARG((params->field_mask & - UCT_IFACE_PARAM_FIELD_STATS_ROOT) ? 
- params->stats_root : NULL) - UCS_STATS_ARG(UCT_RDMACM_TL_NAME)); - - rdmacm_md = ucs_derived_of(self->super.md, uct_rdmacm_md_t); - - if (self->super.worker->async == NULL) { - ucs_error("rdmacm must have async != NULL"); - return UCS_ERR_INVALID_PARAM; - } - if (self->super.worker->async->mode == UCS_ASYNC_MODE_SIGNAL) { - ucs_warn("rdmacm does not support SIGIO"); - } - - self->config.addr_resolve_timeout = rdmacm_md->addr_resolve_timeout; - - self->event_ch = rdma_create_event_channel(); - if (self->event_ch == NULL) { - ucs_error("rdma_create_event_channel(open_mode=%zu) failed: %m", - params->open_mode); - status = UCS_ERR_IO_ERROR; - goto err; - } - - /* Set the event_channel fd to non-blocking mode - * (so that rdma_get_cm_event won't be blocking) */ - status = ucs_sys_fcntl_modfl(self->event_ch->fd, O_NONBLOCK, 0); - if (status != UCS_OK) { - goto err_destroy_event_channel; - } - - if (params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER) { - self->is_server = 1; - - /* Create an id for this interface. Events associated with this id will be - * reported on the event_channel that was previously created. */ - if (rdma_create_id(self->event_ch, &self->cm_id, NULL, RDMA_PS_UDP)) { - ucs_error("rdma_create_id() failed: %m"); - status = UCS_ERR_IO_ERROR; - goto err_destroy_event_channel; - } - - listen_addr = (struct sockaddr *)params->mode.sockaddr.listen_sockaddr.addr; - if (rdma_bind_addr(self->cm_id, listen_addr)) { - status = (errno == EADDRINUSE || errno == EADDRNOTAVAIL) ? - UCS_ERR_BUSY : UCS_ERR_IO_ERROR; - ucs_error("rdma_bind_addr(addr=%s) failed: %m", - ucs_sockaddr_str(listen_addr, ip_port_str, - UCS_SOCKADDR_STRING_LEN)); - goto err_destroy_id; - } - - if (rdma_listen(self->cm_id, config->backlog)) { - ucs_error("rdma_listen(cm_id:=%p event_channel=%p addr=%s) failed: %m", - self->cm_id, self->event_ch, - ucs_sockaddr_str(listen_addr, ip_port_str, - UCS_SOCKADDR_STRING_LEN)); - status = UCS_ERR_IO_ERROR; - goto err_destroy_id; - } - - ucs_debug("rdma_cm id %p listening on %s:%d", self->cm_id, - ucs_sockaddr_str(listen_addr, ip_port_str, - UCS_SOCKADDR_STRING_LEN), - ntohs(rdma_get_src_port(self->cm_id))); - - if (!(params->mode.sockaddr.cb_flags & UCT_CB_FLAG_ASYNC)) { - ucs_fatal("Synchronous callback is not supported"); - } - - self->cb_flags = params->mode.sockaddr.cb_flags; - self->conn_request_cb = params->mode.sockaddr.conn_request_cb; - self->conn_request_arg = params->mode.sockaddr.conn_request_arg; - } else { - self->cm_id = NULL; - self->is_server = 0; - } - - self->cm_id_quota = config->cm_id_quota; - ucs_list_head_init(&self->pending_eps_list); - ucs_list_head_init(&self->used_cm_ids_list); - - /* Server and client register an event handler for incoming messages */ - status = ucs_async_set_event_handler(self->super.worker->async->mode, - self->event_ch->fd, UCS_EVENT_SET_EVREAD, - uct_rdmacm_iface_event_handler, - self, self->super.worker->async); - if (status != UCS_OK) { - ucs_error("failed to set event handler"); - goto err_destroy_id; - } - - - ucs_debug("created an RDMACM iface %p. 
event_channel: %p, fd: %d, cm_id: %p", - self, self->event_ch, self->event_ch->fd, self->cm_id); - return UCS_OK; - -err_destroy_id: - if (self->is_server) { - rdma_destroy_id(self->cm_id); - } -err_destroy_event_channel: - rdma_destroy_event_channel(self->event_ch); -err: - return status; -} - -static UCS_CLASS_CLEANUP_FUNC(uct_rdmacm_iface_t) -{ - uct_rdmacm_ctx_t *cm_id_ctx, *tmp_cm_id_ctx; - - ucs_async_remove_handler(self->event_ch->fd, 1); - if (self->is_server) { - rdma_destroy_id(self->cm_id); - } - - UCS_ASYNC_BLOCK(self->super.worker->async); - - ucs_list_for_each_safe(cm_id_ctx, tmp_cm_id_ctx, - &self->used_cm_ids_list, list) { - uct_rdmacm_iface_release_cm_id(self, &cm_id_ctx); - } - - UCS_ASYNC_UNBLOCK(self->super.worker->async); - - rdma_destroy_event_channel(self->event_ch); -} - -UCS_CLASS_DEFINE(uct_rdmacm_iface_t, uct_base_iface_t); -static UCS_CLASS_DEFINE_NEW_FUNC(uct_rdmacm_iface_t, uct_iface_t, uct_md_h, - uct_worker_h, const uct_iface_params_t *, - const uct_iface_config_t *); -static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rdmacm_iface_t, uct_iface_t); - -static ucs_status_t -uct_rdmacm_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, - unsigned *num_tl_devices_p) -{ - *num_tl_devices_p = 0; - *tl_devices_p = NULL; - return UCS_OK; -} - -UCT_TL_DEFINE(&uct_rdmacm_component, rdmacm, uct_rdmacm_query_tl_devices, - uct_rdmacm_iface_t, "RDMACM_", uct_rdmacm_iface_config_table, - uct_rdmacm_iface_config_t); diff --git a/src/uct/ib/rdmacm/rdmacm_iface.h b/src/uct/ib/rdmacm/rdmacm_iface.h deleted file mode 100644 index a10297f3c36..00000000000 --- a/src/uct/ib/rdmacm/rdmacm_iface.h +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#ifndef UCT_RDMACM_IFACE_H -#define UCT_RDMACM_IFACE_H - -#include "rdmacm_def.h" -#include "rdmacm_md.h" - -#define UCT_RDMACM_MAX_CONN_PRIV \ - (UCT_RDMACM_UDP_PRIV_DATA_LEN) - (sizeof(uct_rdmacm_priv_data_hdr_t)) - -typedef struct uct_rdmacm_iface_config { - uct_iface_config_t super; - unsigned backlog; - unsigned cm_id_quota; -} uct_rdmacm_iface_config_t; - - -struct uct_rdmacm_iface { - uct_base_iface_t super; - - struct rdma_cm_id *cm_id; - struct rdma_event_channel *event_ch; - - uint8_t is_server; - /** Fields used only for server side */ - void *conn_request_arg; - uct_sockaddr_conn_request_callback_t conn_request_cb; - uint32_t cb_flags; - - /** Field used only for client side */ - ucs_list_link_t pending_eps_list; - ucs_list_link_t used_cm_ids_list; - int cm_id_quota; /* num of cm_ids in the quota*/ - - struct { - double addr_resolve_timeout; - } config; -}; - -void uct_rdmacm_iface_client_start_next_ep(uct_rdmacm_iface_t *iface); - -extern uct_component_t uct_rdmacm_component; - -#endif diff --git a/src/uct/ib/rdmacm/rdmacm_listener.c b/src/uct/ib/rdmacm/rdmacm_listener.c index 3e2b0379b41..22d2caf7a5a 100644 --- a/src/uct/ib/rdmacm/rdmacm_listener.c +++ b/src/uct/ib/rdmacm/rdmacm_listener.c @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2019. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2019-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -10,6 +10,8 @@ #include "rdmacm_listener.h" +#include + #define UCS_RDMACM_MAX_BACKLOG_PATH "/proc/sys/net/rdma_ucm/max_backlog" @@ -35,6 +37,7 @@ UCS_CLASS_INIT_FUNC(uct_rdmacm_listener_t, uct_cm_h cm, const uct_listener_params_t *params) { uct_rdmacm_cm_t *rdmacm_cm = ucs_derived_of(cm, uct_rdmacm_cm_t); + int id_reuse_optval = 1; char ip_port_str[UCS_SOCKADDR_STRING_LEN]; ucs_status_t status; int backlog; @@ -51,12 +54,32 @@ UCS_CLASS_INIT_FUNC(uct_rdmacm_listener_t, uct_cm_h cm, goto err; } + if (rdmacm_cm->super.config.reuse_addr) { + if (rdma_set_option(self->id, RDMA_OPTION_ID, RDMA_OPTION_ID_REUSEADDR, + &id_reuse_optval, sizeof(id_reuse_optval))) { + ucs_error("rdma_set_option(REUSEADDR) failed: %m"); + status = UCS_ERR_IO_ERROR; + goto err_destroy_id; + } + } + if (rdma_bind_addr(self->id, (struct sockaddr*)saddr)) { - status = ((errno == EADDRINUSE) || (errno == EADDRNOTAVAIL)) ? - UCS_ERR_BUSY : UCS_ERR_IO_ERROR; - ucs_error("rdma_bind_addr(addr=%s) failed: %m", - ucs_sockaddr_str(saddr, ip_port_str, - UCS_SOCKADDR_STRING_LEN)); + switch (errno) { + case EADDRINUSE: + case EADDRNOTAVAIL: + status = UCS_ERR_BUSY; + break; + case ENODEV: + status = UCS_ERR_NO_DEVICE; + break; + default: + status = UCS_ERR_IO_ERROR; + break; + } + + ucs_diag("rdma_bind_addr(addr=%s) failed: %m", + ucs_sockaddr_str(saddr, ip_port_str, + UCS_SOCKADDR_STRING_LEN)); goto err_destroy_id; } @@ -76,9 +99,8 @@ UCS_CLASS_INIT_FUNC(uct_rdmacm_listener_t, uct_cm_h cm, } ucs_debug("created an RDMACM listener %p on cm %p with cm_id: %p. " - "listening on %s:%d", self, cm, self->id, - ucs_sockaddr_str(saddr, ip_port_str, UCS_SOCKADDR_STRING_LEN), - ntohs(rdma_get_src_port(self->id))); + "listening on %s", self, cm, self->id, + ucs_sockaddr_str(saddr, ip_port_str, UCS_SOCKADDR_STRING_LEN)); return UCS_OK; @@ -91,15 +113,16 @@ UCS_CLASS_INIT_FUNC(uct_rdmacm_listener_t, uct_cm_h cm, ucs_status_t uct_rdmacm_listener_reject(uct_listener_h listener, uct_conn_request_h conn_request) { - uct_rdmacm_listener_t *rdmacm_listener = ucs_derived_of(listener, uct_rdmacm_listener_t); + uct_rdmacm_listener_t *rdmacm_listener = ucs_derived_of(listener, + uct_rdmacm_listener_t); + uct_rdmacm_cm_t *rdmacm_cm = ucs_derived_of(listener->cm, + uct_rdmacm_cm_t); struct rdma_cm_event *event = (struct rdma_cm_event*)conn_request; ucs_assert_always(rdmacm_listener->id == event->listen_id); - uct_rdmacm_cm_reject(event->id); - + uct_rdmacm_cm_reject(rdmacm_cm, event->id); uct_rdmacm_cm_destroy_id(event->id); - return uct_rdmacm_cm_ack_event(event); } diff --git a/src/uct/ib/rdmacm/rdmacm_md.c b/src/uct/ib/rdmacm/rdmacm_md.c deleted file mode 100644 index 1df92b1ef3c..00000000000 --- a/src/uct/ib/rdmacm/rdmacm_md.c +++ /dev/null @@ -1,265 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2017-219. ALL RIGHTS RESERVED. - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * See file LICENSE for terms. 
- */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "rdmacm_md.h" -#include "rdmacm_cm.h" - - -static ucs_config_field_t uct_rdmacm_md_config_table[] = { - {"", "", NULL, - ucs_offsetof(uct_rdmacm_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)}, - - {"ADDR_RESOLVE_TIMEOUT", "500ms", - "Time to wait for address resolution to complete", - ucs_offsetof(uct_rdmacm_md_config_t, addr_resolve_timeout), UCS_CONFIG_TYPE_TIME}, - - {NULL} -}; - -static void uct_rdmacm_md_close(uct_md_h md); - -static uct_md_ops_t uct_rdmacm_md_ops = { - .close = uct_rdmacm_md_close, - .query = uct_rdmacm_md_query, - .is_sockaddr_accessible = uct_rdmacm_is_sockaddr_accessible, - .detect_memory_type = ucs_empty_function_return_unsupported, -}; - -static void uct_rdmacm_md_close(uct_md_h md) -{ - uct_rdmacm_md_t *rdmacm_md = ucs_derived_of(md, uct_rdmacm_md_t); - ucs_free(rdmacm_md); -} - -ucs_status_t uct_rdmacm_md_query(uct_md_h md, uct_md_attr_t *md_attr) -{ - md_attr->cap.flags = UCT_MD_FLAG_SOCKADDR; - md_attr->cap.reg_mem_types = 0; - md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - md_attr->cap.detect_mem_types = 0; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = 0; - md_attr->rkey_packed_size = 0; - md_attr->reg_cost = ucs_linear_func_make(0, 0); - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); - return UCS_OK; -} - -static enum rdma_cm_event_type -uct_rdmacm_get_event_type(struct rdma_event_channel *event_ch) -{ - enum rdma_cm_event_type event_type; - struct rdma_cm_event *event; - int ret; - - /* Fetch an event */ - ret = rdma_get_cm_event(event_ch, &event); - if (ret) { - ucs_warn("rdma_get_cm_event() failed: %m"); - return RDMA_CM_EVENT_ADDR_RESOLVED; - } - - event_type = event->event; - ret = rdma_ack_cm_event(event); - if (ret) { - ucs_warn("rdma_ack_cm_event() failed. event status: %d. %m.", event->status); - } - - return event_type; -} - -static int uct_rdmacm_is_addr_route_resolved(struct rdma_cm_id *cm_id, - struct sockaddr *addr, - int timeout_ms) -{ - char ip_port_str[UCS_SOCKADDR_STRING_LEN]; - enum rdma_cm_event_type event_type; - ucs_status_t status; - - status = uct_rdmacm_resolve_addr(cm_id, addr, timeout_ms, UCS_LOG_LEVEL_DEBUG); - if (status != UCS_OK) { - return 0; - } - - event_type = uct_rdmacm_get_event_type(cm_id->channel); - if (event_type != RDMA_CM_EVENT_ADDR_RESOLVED) { - ucs_debug("failed to resolve address (addr = %s). RDMACM event %s.", - ucs_sockaddr_str(addr, ip_port_str, UCS_SOCKADDR_STRING_LEN), - rdma_event_str(event_type)); - return 0; - } - - if (cm_id->verbs->device->transport_type == IBV_TRANSPORT_IWARP) { - ucs_debug("%s: iWarp support is not implemented", - ucs_sockaddr_str(addr, ip_port_str, UCS_SOCKADDR_STRING_LEN)); - return 0; - } - - if (rdma_resolve_route(cm_id, timeout_ms)) { - ucs_debug("rdma_resolve_route(addr = %s) failed: %m", - ucs_sockaddr_str(addr, ip_port_str, UCS_SOCKADDR_STRING_LEN)); - return 0; - } - - event_type = uct_rdmacm_get_event_type(cm_id->channel); - if (event_type != RDMA_CM_EVENT_ROUTE_RESOLVED) { - ucs_debug("failed to resolve route to addr = %s. 
RDMACM event %s.", - ucs_sockaddr_str(addr, ip_port_str, UCS_SOCKADDR_STRING_LEN), - rdma_event_str(event_type)); - return 0; - } - - return 1; -} - -int uct_rdmacm_is_sockaddr_accessible(uct_md_h md, const ucs_sock_addr_t *sockaddr, - uct_sockaddr_accessibility_t mode) -{ - uct_rdmacm_md_t *rdmacm_md = ucs_derived_of(md, uct_rdmacm_md_t); - struct rdma_event_channel *event_ch = NULL; - struct rdma_cm_id *cm_id = NULL; - int is_accessible = 0; - char ip_port_str[UCS_SOCKADDR_STRING_LEN]; - - if ((mode != UCT_SOCKADDR_ACC_LOCAL) && (mode != UCT_SOCKADDR_ACC_REMOTE)) { - ucs_error("Unknown sockaddr accessibility mode %d", mode); - return 0; - } - - event_ch = rdma_create_event_channel(); - if (event_ch == NULL) { - ucs_error("rdma_create_event_channel() failed: %m"); - goto out; - } - - if (rdma_create_id(event_ch, &cm_id, NULL, RDMA_PS_UDP)) { - ucs_error("rdma_create_id() failed: %m"); - goto out_destroy_event_channel; - } - - if (mode == UCT_SOCKADDR_ACC_LOCAL) { - /* Server side to check if can bind to the given sockaddr */ - if (rdma_bind_addr(cm_id, (struct sockaddr *)sockaddr->addr)) { - ucs_debug("rdma_bind_addr(addr = %s) failed: %m", - ucs_sockaddr_str((struct sockaddr *)sockaddr->addr, - ip_port_str, UCS_SOCKADDR_STRING_LEN)); - goto out_destroy_id; - } - - if (ucs_sockaddr_is_inaddr_any((struct sockaddr *)sockaddr->addr)) { - is_accessible = 1; - goto out_print; - } - } - - /* Client and server sides check if can access the given sockaddr. - * The timeout needs to be passed in ms */ - is_accessible = uct_rdmacm_is_addr_route_resolved(cm_id, - (struct sockaddr *)sockaddr->addr, - UCS_MSEC_PER_SEC * rdmacm_md->addr_resolve_timeout); - if (!is_accessible) { - goto out_destroy_id; - } - -out_print: - ucs_debug("address %s (port %d) is accessible from rdmacm_md %p with mode: %d", - ucs_sockaddr_str((struct sockaddr *)sockaddr->addr, ip_port_str, - UCS_SOCKADDR_STRING_LEN), - ntohs(rdma_get_src_port(cm_id)), rdmacm_md, mode); - -out_destroy_id: - rdma_destroy_id(cm_id); -out_destroy_event_channel: - rdma_destroy_event_channel(event_ch); -out: - return is_accessible; -} - -static ucs_status_t -uct_rdmacm_query_md_resources(uct_component_t *component, - uct_md_resource_desc_t **resources_p, - unsigned *num_resources_p) -{ - struct rdma_event_channel *event_ch = NULL; - - /* Create a dummy event channel to check if RDMACM can be used */ - event_ch = rdma_create_event_channel(); - if (event_ch == NULL) { - ucs_debug("could not create an RDMACM event channel. %m. 
" - "Disabling the RDMACM resource"); - return uct_md_query_empty_md_resource(resources_p, num_resources_p); - - } - - rdma_destroy_event_channel(event_ch); - - return uct_md_query_single_md_resource(component, resources_p, - num_resources_p); -} - -static ucs_status_t -uct_rdmacm_md_open(uct_component_t *component, const char *md_name, - const uct_md_config_t *uct_md_config, uct_md_h *md_p) -{ - uct_rdmacm_md_config_t *md_config = ucs_derived_of(uct_md_config, - uct_rdmacm_md_config_t); - uct_rdmacm_md_t *md; - ucs_status_t status; - - md = ucs_malloc(sizeof(*md), "rdmacm_md"); - if (md == NULL) { - status = UCS_ERR_NO_MEMORY; - goto out; - } - - md->super.ops = &uct_rdmacm_md_ops; - md->super.component = &uct_rdmacm_component; - md->addr_resolve_timeout = md_config->addr_resolve_timeout; - - /* cppcheck-suppress autoVariables */ - *md_p = &md->super; - status = UCS_OK; - -out: - return status; -} - -uct_component_t uct_rdmacm_component = { - .query_md_resources = uct_rdmacm_query_md_resources, - .md_open = uct_rdmacm_md_open, -#if HAVE_RDMACM_QP_LESS - .cm_open = UCS_CLASS_NEW_FUNC_NAME(uct_rdmacm_cm_t), -#else - .cm_open = ucs_empty_function_return_unsupported, -#endif - .rkey_unpack = ucs_empty_function_return_unsupported, - .rkey_ptr = ucs_empty_function_return_unsupported, - .rkey_release = ucs_empty_function_return_success, - .name = "rdmacm", - .md_config = { - .name = "RDMA-CM memory domain", - .prefix = "RDMACM_", - .table = uct_rdmacm_md_config_table, - .size = sizeof(uct_rdmacm_md_config_t), - }, - .cm_config = { - .name = "RDMA-CM connection manager", - .prefix = "RDMA_CM_", - .table = uct_cm_config_table, - .size = sizeof(uct_cm_config_t), - }, - .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_rdmacm_component), -#if HAVE_RDMACM_QP_LESS - .flags = UCT_COMPONENT_FLAG_CM -#else - .flags = 0 -#endif -}; -UCT_COMPONENT_REGISTER(&uct_rdmacm_component) diff --git a/src/uct/ib/rdmacm/rdmacm_md.h b/src/uct/ib/rdmacm/rdmacm_md.h deleted file mode 100644 index cd93010aae2..00000000000 --- a/src/uct/ib/rdmacm/rdmacm_md.h +++ /dev/null @@ -1,38 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#ifndef UCT_RDMACM_MD_H_ -#define UCT_RDMACM_MD_H_ - -#include "rdmacm_def.h" -#include -#include -#include -#include - -/** - * RDMACM memory domain. - */ -typedef struct uct_rdmacm_md { - uct_md_t super; - double addr_resolve_timeout; -} uct_rdmacm_md_t; - -/** - * RDMACM memory domain configuration. - */ -typedef struct uct_rdmacm_md_config { - uct_md_config_t super; - double addr_resolve_timeout; -} uct_rdmacm_md_config_t; - -extern uct_component_t uct_rdmacm_component; - -ucs_status_t uct_rdmacm_md_query(uct_md_h md, uct_md_attr_t *md_attr); - -int uct_rdmacm_is_sockaddr_accessible(uct_md_h md, const ucs_sock_addr_t *sockaddr, - uct_sockaddr_accessibility_t mode); - -#endif diff --git a/src/uct/ib/ud/accel/ud_mlx5.c b/src/uct/ib/ud/accel/ud_mlx5.c index 9eed3f9bed8..a9fffa3d94b 100644 --- a/src/uct/ib/ud/accel/ud_mlx5.c +++ b/src/uct/ib/ud/accel/ud_mlx5.c @@ -224,18 +224,20 @@ UCS_CLASS_DEFINE_DELETE_FUNC(uct_ud_mlx5_ep_t, uct_ep_t); * Generic inline+iov post-send function * The caller should check that header size + sg list would not exceed WQE size. */ -static UCS_F_ALWAYS_INLINE ucs_status_t -uct_ud_mlx5_ep_inline_iov_post(uct_ep_h tl_ep, uint8_t am_id, - /* inl. header */ const void *header, size_t header_size, - /* inl. 
data */ const void *data, size_t data_size, - /* iov data */ const uct_iov_t *iov, size_t iovcnt, - uint32_t packet_flags, uct_completion_t *comp, - unsigned stat_ops_counter, unsigned stat_bytes_counter, - const char *func_name) +static UCS_F_ALWAYS_INLINE ucs_status_t uct_ud_mlx5_ep_inline_iov_post( + uct_ep_h tl_ep, uint8_t am_id, + /* inl. header */ const void *header, size_t header_size, + /* inl. data */ const void *data, size_t data_size, + /* inl. iov */ const uct_iov_t *inl_iov, size_t inl_iovcnt, + /* iov data */ const uct_iov_t *iov, size_t iovcnt, + uint32_t packet_flags, uct_completion_t *comp, + unsigned stat_ops_counter, unsigned stat_bytes_counter, + const char *func_name) { uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ud_mlx5_iface_t); uct_ud_mlx5_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_mlx5_ep_t); + size_t inl_iov_size = uct_iov_total_length(inl_iov, inl_iovcnt); struct mlx5_wqe_inl_data_seg *inl; struct mlx5_wqe_ctrl_seg *ctrl; size_t inline_size, wqe_size; @@ -243,6 +245,7 @@ uct_ud_mlx5_ep_inline_iov_post(uct_ep_h tl_ep, uint8_t am_id, uct_ud_send_skb_t *skb; ucs_status_t status; uct_ud_neth_t *neth; + ucs_iov_iter_t iov_iter; UCT_CHECK_AM_ID(am_id); UCT_UD_CHECK_ZCOPY_LENGTH(&iface->super, header_size + data_size, @@ -260,7 +263,7 @@ uct_ud_mlx5_ep_inline_iov_post(uct_ep_h tl_ep, uint8_t am_id, ctrl = uct_ud_mlx5_ep_get_next_wqe(iface, ep, &wqe_size, &next_seg); inl = next_seg; - inline_size = sizeof(*neth) + header_size + data_size; + inline_size = sizeof(*neth) + header_size + data_size + inl_iov_size; inl->byte_count = htonl(inline_size | MLX5_INLINE_SEG); wqe_size += sizeof(*inl) + inline_size; skb->len = inline_size; @@ -287,6 +290,13 @@ uct_ud_mlx5_ep_inline_iov_post(uct_ep_h tl_ep, uint8_t am_id, /* copy inline "data" */ uct_ib_mlx5_inline_copy(wqe_data, data, data_size, &iface->tx.wq); + /* copy inline iov */ + if (inl_iovcnt > 0) { + ucs_assert(data_size == 0); + uct_ib_mlx5_inline_iov_copy(wqe_data, inl_iov, inl_iovcnt, inl_iov_size, + &iface->tx.wq); + } + /* set iov to dptr */ if (iovcnt > 0) { wqe_size = ucs_align_up_pow2(wqe_size, UCT_IB_MLX5_WQE_SEG_SIZE); @@ -300,7 +310,12 @@ uct_ud_mlx5_ep_inline_iov_post(uct_ep_h tl_ep, uint8_t am_id, memcpy(skb->neth, neth, sizeof(*neth) + header_size); memcpy(UCS_PTR_BYTE_OFFSET(skb->neth + 1, header_size), data, data_size); - + if (inl_iovcnt > 0) { + ucs_assert((data_size == 0) && (header_size == 0)); + ucs_iov_iter_init(&iov_iter); + uct_iov_to_buffer(inl_iov, inl_iovcnt, &iov_iter, skb->neth + 1, + SIZE_MAX); + } if (iovcnt > 0) { uct_ud_skb_set_zcopy_desc(skb, iov, iovcnt, comp); status = UCS_INPROGRESS; @@ -313,8 +328,8 @@ uct_ud_mlx5_ep_inline_iov_post(uct_ep_h tl_ep, uint8_t am_id, UCS_STATS_UPDATE_COUNTER(ep->super.super.stats, stat_ops_counter, 1); UCS_STATS_UPDATE_COUNTER(ep->super.super.stats, stat_bytes_counter, - header_size + data_size + - uct_iov_total_length(iov, iovcnt)); + header_size + data_size + inl_iov_size + + uct_iov_total_length(iov, iovcnt)); out: uct_ud_leave(&iface->super); return status; @@ -329,15 +344,14 @@ uct_ud_mlx5_ep_short_common(uct_ep_h tl_ep, uint8_t am_id, { UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + header_size + data_size, 0, uct_ud_mlx5_max_inline(), func_name); - return uct_ud_mlx5_ep_inline_iov_post(tl_ep, am_id, - header, header_size, + + return uct_ud_mlx5_ep_inline_iov_post(tl_ep, am_id, header, header_size, data, data_size, - /* iov */ NULL, 0, - packet_flags, + /* inline iov */ NULL, 0, + /* iov */ NULL, 0, packet_flags, /* completion */ NULL, 
stat_ops_counter, - UCT_EP_STAT_BYTES_SHORT, - func_name); + UCT_EP_STAT_BYTES_SHORT, func_name); } static ucs_status_t @@ -352,6 +366,28 @@ uct_ud_mlx5_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, "uct_ud_mlx5_ep_am_short"); } +static ucs_status_t uct_ud_mlx5_ep_am_short_iov(uct_ep_h tl_ep, uint8_t id, + const uct_iov_t *iov, + size_t iovcnt) +{ + char dummy = 0; /* pass dummy pointer to 0-length header and data to avoid + * compiler warnings */ + + UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + uct_iov_total_length(iov, iovcnt), + 0, uct_ud_mlx5_max_inline(), + "uct_ud_mlx5_ep_am_short_iov"); + + return uct_ud_mlx5_ep_inline_iov_post( + tl_ep, id, + /* inl. header */ &dummy, 0, + /* inl. data */ &dummy, 0, + /* inl. iov */ iov, iovcnt, + /* iov */ NULL, 0, + /* packet flags */ UCT_UD_PACKET_FLAG_AM, + /* completion */ NULL, UCT_EP_STAT_AM, UCT_EP_STAT_BYTES_SHORT, + "uct_ud_mlx5_ep_am_short_iov"); +} + static ssize_t uct_ud_mlx5_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_cb, void *arg, unsigned flags) @@ -401,15 +437,17 @@ uct_ud_mlx5_ep_am_zcopy(uct_ep_h tl_ep, uint8_t id, const void *header, UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + header_length, 0, UCT_IB_MLX5_AM_ZCOPY_MAX_HDR(UCT_IB_MLX5_AV_FULL_SIZE), "am_zcopy header"); - return uct_ud_mlx5_ep_inline_iov_post(tl_ep, id, - /* inl. header */ &dummy, 0, - /* inl. data */ header, header_length, - /* iov */ iov, iovcnt, - /* packet flags */ UCT_UD_PACKET_FLAG_AM | - UCT_UD_PACKET_FLAG_ACK_REQ, - /* completion */ comp, - UCT_EP_STAT_AM, UCT_EP_STAT_BYTES_ZCOPY, - "uct_ud_mlx5_ep_am_zcopy"); + + return uct_ud_mlx5_ep_inline_iov_post( + tl_ep, id, + /* inl. header */ &dummy, 0, + /* inl. data */ header, header_length, + /* inl. iov */ NULL, 0, + /* iov */ iov, iovcnt, + /* packet flags */ UCT_UD_PACKET_FLAG_AM | + UCT_UD_PACKET_FLAG_ACK_REQ, + /* completion */ comp, UCT_EP_STAT_AM, UCT_EP_STAT_BYTES_ZCOPY, + "uct_ud_mlx5_ep_am_zcopy"); } static ucs_status_t @@ -461,7 +499,8 @@ uct_ud_mlx5_iface_poll_rx(uct_ud_mlx5_iface_t *iface, int is_async) VALGRIND_MAKE_MEM_DEFINED(packet, len); if (!uct_ud_iface_check_grh(&iface->super, packet, - uct_ib_mlx5_cqe_is_grh_present(cqe))) { + uct_ib_mlx5_cqe_is_grh_present(cqe), + cqe->flags_rqpn & 0xFF)) { ucs_mpool_put_inline(desc); goto out; } @@ -508,23 +547,24 @@ uct_ud_mlx5_iface_poll_tx(uct_ud_mlx5_iface_t *iface, int is_async) static unsigned uct_ud_mlx5_iface_progress(uct_iface_h tl_iface) { uct_ud_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_mlx5_iface_t); - ucs_status_t status; - unsigned n, count = 0; + unsigned n, count; uct_ud_enter(&iface->super); - uct_ud_iface_dispatch_async_comps(&iface->super); - status = uct_ud_iface_dispatch_pending_rx(&iface->super); - if (ucs_likely(status == UCS_OK)) { + count = uct_ud_iface_dispatch_async_comps(&iface->super, NULL); + count += uct_ud_iface_dispatch_pending_rx(&iface->super); + + if (ucs_likely(count == 0)) { do { - n = uct_ud_mlx5_iface_poll_rx(iface, 0); + n = uct_ud_mlx5_iface_poll_rx(iface, 0); count += n; } while ((n > 0) && (count < iface->super.super.config.rx_max_poll)); + count += uct_ud_mlx5_iface_poll_tx(iface, 0); } - count += uct_ud_mlx5_iface_poll_tx(iface, 0); uct_ud_iface_progress_pending(&iface->super, 0); uct_ud_leave(&iface->super); + return count; } @@ -644,13 +684,6 @@ static ucs_status_t uct_ud_mlx5_iface_arm_cq(uct_ib_iface_t *ib_iface, #endif } -static ucs_status_t uct_ud_mlx5_ep_set_failed(uct_ib_iface_t *iface, - uct_ep_h ep, ucs_status_t status) -{ - return 
uct_set_ep_failed(&UCS_CLASS_NAME(uct_ud_mlx5_ep_t), ep,
-                             &iface->super.super, status);
-}
-
 static void uct_ud_mlx5_iface_event_cq(uct_ib_iface_t *ib_iface,
                                        uct_ib_dir_t dir)
 {
@@ -664,7 +697,9 @@ static ucs_status_t uct_ud_mlx5_iface_create_qp(uct_ib_iface_t *ib_iface,
                                                 struct ibv_qp **qp_p)
 {
     uct_ud_mlx5_iface_t *iface = ucs_derived_of(ib_iface, uct_ud_mlx5_iface_t);
-    uct_ib_mlx5_qp_t *qp       = &iface->tx.wq.super;
+    uct_ib_mlx5_md_t *ib_md    = ucs_derived_of(ib_iface->super.md,
+                                                uct_ib_mlx5_md_t);
+    uct_ib_mlx5_qp_t *qp       = &iface->tx.wq.super;
     uct_ib_mlx5_qp_attr_t attr = {};
     ucs_status_t status;

@@ -672,12 +707,34 @@ static ucs_status_t uct_ud_mlx5_iface_create_qp(uct_ib_iface_t *ib_iface,
     attr.mmio_mode = UCT_IB_MLX5_MMIO_MODE_LAST;

     status = uct_ib_mlx5_iface_create_qp(ib_iface, qp, &attr);
+    if (status != UCS_OK) {
+        return status;
+    }
+
+    status = uct_ib_mlx5_txwq_init(iface->super.super.super.worker,
+                                   iface->tx.mmio_mode, &iface->tx.wq,
+                                   qp->verbs.qp);
     if (status != UCS_OK) {
-        return status;
+        goto err_destroy_qp;
     }

     *qp_p = qp->verbs.qp;
     return status;
+
+err_destroy_qp:
+    uct_ib_mlx5_destroy_qp(ib_md, qp);
+    return status;
+}
+
+static void uct_ud_mlx5_iface_destroy_qp(uct_ud_iface_t *ud_iface)
+{
+    uct_ud_mlx5_iface_t *iface = ucs_derived_of(ud_iface, uct_ud_mlx5_iface_t);
+    uct_ib_mlx5_md_t *ib_md    = ucs_derived_of(ud_iface->super.super.md,
+                                                uct_ib_mlx5_md_t);
+    uct_ib_mlx5_qp_t *qp       = &iface->tx.wq.super;
+
+    uct_ib_mlx5_qp_mmio_cleanup(qp, iface->tx.wq.reg);
+    uct_ib_mlx5_destroy_qp(ib_md, qp);
 }

 static void UCS_CLASS_DELETE_FUNC_NAME(uct_ud_mlx5_iface_t)(uct_iface_t*);
@@ -694,16 +751,38 @@ static void uct_ud_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface, void *arg
 }

 static uct_ud_iface_ops_t uct_ud_mlx5_iface_ops = {
-    {
-        {
+    .super = {
+        .super = {
+            .iface_estimate_perf = uct_base_iface_estimate_perf,
+            .iface_vfs_refresh   = (uct_iface_vfs_refresh_func_t)ucs_empty_function,
+        },
+        .create_cq      = uct_ib_mlx5_create_cq,
+        .arm_cq         = uct_ud_mlx5_iface_arm_cq,
+        .event_cq       = uct_ud_mlx5_iface_event_cq,
+        .handle_failure = uct_ud_mlx5_iface_handle_failure,
+    },
+    .async_progress          = uct_ud_mlx5_iface_async_progress,
+    .send_ctl                = uct_ud_mlx5_ep_send_ctl,
+    .ep_free                 = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_mlx5_ep_t),
+    .create_qp               = uct_ud_mlx5_iface_create_qp,
+    .destroy_qp              = uct_ud_mlx5_iface_destroy_qp,
+    .unpack_peer_address     = uct_ud_mlx5_iface_unpack_peer_address,
+    .ep_get_peer_address     = uct_ud_mlx5_ep_get_peer_address,
+    .get_peer_address_length = uct_ud_mlx5_get_peer_address_length,
+    .peer_address_str        = uct_ud_mlx5_iface_peer_address_str,
+};
+
+static uct_iface_ops_t uct_ud_mlx5_iface_tl_ops = {
     .ep_put_short             = uct_ud_mlx5_ep_put_short,
     .ep_am_short              = uct_ud_mlx5_ep_am_short,
+    .ep_am_short_iov          = uct_ud_mlx5_ep_am_short_iov,
     .ep_am_bcopy              = uct_ud_mlx5_ep_am_bcopy,
     .ep_am_zcopy              = uct_ud_mlx5_ep_am_zcopy,
     .ep_pending_add           = uct_ud_ep_pending_add,
     .ep_pending_purge         = uct_ud_ep_pending_purge,
     .ep_flush                 = uct_ud_ep_flush,
     .ep_fence                 = uct_base_ep_fence,
+    .ep_check                 = uct_ud_ep_check,
     .ep_create                = uct_ud_mlx5_ep_create,
     .ep_destroy               = uct_ud_ep_disconnect,
     .ep_get_address           = uct_ud_ep_get_address,
@@ -721,21 +800,6 @@ static uct_ud_iface_ops_t uct_ud_mlx5_iface_ops = {
     .iface_get_device_address = uct_ib_iface_get_device_address,
     .iface_get_address        = uct_ud_iface_get_address,
     .iface_is_reachable       = uct_ib_iface_is_reachable
-    },
-    .create_cq      = uct_ib_mlx5_create_cq,
-    .arm_cq         = uct_ud_mlx5_iface_arm_cq,
-    .event_cq       = uct_ud_mlx5_iface_event_cq,
-    .handle_failure = uct_ud_mlx5_iface_handle_failure,
-    
.set_ep_failed = uct_ud_mlx5_ep_set_failed, - }, - .async_progress = uct_ud_mlx5_iface_async_progress, - .send_ctl = uct_ud_mlx5_ep_send_ctl, - .ep_free = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_mlx5_ep_t), - .create_qp = uct_ud_mlx5_iface_create_qp, - .unpack_peer_address = uct_ud_mlx5_iface_unpack_peer_address, - .ep_get_peer_address = uct_ud_mlx5_ep_get_peer_address, - .get_peer_address_length = uct_ud_mlx5_get_peer_address_length, - .peer_address_str = uct_ud_mlx5_iface_peer_address_str }; static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_iface_t, @@ -755,26 +819,28 @@ static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_iface_t, init_attr.cq_len[UCT_IB_DIR_TX] = config->super.super.tx.queue_len * UCT_IB_MLX5_MAX_BB; init_attr.cq_len[UCT_IB_DIR_RX] = config->super.super.rx.queue_len; + self->tx.mmio_mode = config->mlx5_common.mmio_mode; self->tx.wq.super.type = UCT_IB_MLX5_OBJ_TYPE_LAST; UCS_CLASS_CALL_SUPER_INIT(uct_ud_iface_t, &uct_ud_mlx5_iface_ops, - md, worker, params, &config->super, &init_attr); + &uct_ud_mlx5_iface_tl_ops, md, worker, params, + &config->super, &init_attr); self->super.config.max_inline = uct_ud_mlx5_max_inline(); - status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_TX], &self->cq[UCT_IB_DIR_TX]); + status = uct_ib_mlx5_iface_select_sl(&self->super.super, + &config->mlx5_common, + &config->super.super); if (status != UCS_OK) { return status; } - status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_RX], &self->cq[UCT_IB_DIR_RX]); + status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_TX], &self->cq[UCT_IB_DIR_TX]); if (status != UCS_OK) { return status; } - status = uct_ib_mlx5_txwq_init(self->super.super.super.worker, - config->mlx5_common.mmio_mode, &self->tx.wq, - self->super.qp); + status = uct_ib_mlx5_get_cq(self->super.super.cq[UCT_IB_DIR_RX], &self->cq[UCT_IB_DIR_RX]); if (status != UCS_OK) { return status; } @@ -813,10 +879,6 @@ static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_iface_t, static UCS_CLASS_CLEANUP_FUNC(uct_ud_mlx5_iface_t) { ucs_trace_func(""); - uct_ud_iface_remove_async_handlers(&self->super); - uct_ud_enter(&self->super); - uct_ib_mlx5_qp_mmio_cleanup(&self->tx.wq.super, self->tx.wq.reg); - uct_ud_leave(&self->super); } UCS_CLASS_DEFINE(uct_ud_mlx5_iface_t, uct_ud_iface_t); diff --git a/src/uct/ib/ud/accel/ud_mlx5.h b/src/uct/ib/ud/accel/ud_mlx5.h index eb13a0064d7..8108351c4e2 100644 --- a/src/uct/ib/ud/accel/ud_mlx5.h +++ b/src/uct/ib/ud/accel/ud_mlx5.h @@ -36,6 +36,7 @@ typedef struct { uct_ud_iface_t super; struct { uct_ib_mlx5_txwq_t wq; + uct_ib_mlx5_mmio_mode_t mmio_mode; } tx; struct { uct_ib_mlx5_rxwq_t wq; diff --git a/src/uct/ib/ud/base/ud_def.h b/src/uct/ib/ud/base/ud_def.h index e95fd0b612b..13b29d701f2 100644 --- a/src/uct/ib/ud/base/ud_def.h +++ b/src/uct/ib/ud/base/ud_def.h @@ -40,7 +40,7 @@ typedef uint16_t uct_ud_psn_t; typedef struct uct_ud_iface uct_ud_iface_t; typedef struct uct_ud_ep uct_ud_ep_t; typedef struct uct_ud_ctl_hdr uct_ud_ctl_hdr_t; -typedef uct_ib_qpnum_t uct_ud_iface_addr_t; +typedef struct uct_ud_iface_addr uct_ud_iface_addr_t; typedef struct uct_ud_ep_addr uct_ud_ep_addr_t; typedef struct uct_ud_iface_peer uct_ud_iface_peer_t; @@ -162,8 +162,8 @@ typedef struct uct_ud_send_skb { * Call user completion handler */ typedef struct uct_ud_comp_desc { - uct_completion_t *comp; - ucs_status_t status; /* used in case of failure */ + uct_completion_t *comp; + uct_ud_ep_t *ep; } uct_ud_comp_desc_t; diff --git a/src/uct/ib/ud/base/ud_ep.c b/src/uct/ib/ud/base/ud_ep.c index 828e91b31c8..406167b2472 100644 --- 
a/src/uct/ib/ud/base/ud_ep.c +++ b/src/uct/ib/ud/base/ud_ep.c @@ -168,6 +168,7 @@ uct_ud_ep_window_release_inline(uct_ud_iface_t *iface, uct_ud_ep_t *ep, int is_async, int invalidate_resend) { uct_ud_send_skb_t *skb; + uct_ud_comp_desc_t *cdesc; ucs_queue_for_each_extract(skb, &ep->tx.window, queue, uct_ud_skb_is_completed(skb, ack_psn)) { @@ -180,14 +181,15 @@ uct_ud_ep_window_release_inline(uct_ud_iface_t *iface, uct_ud_ep_t *ep, uct_ud_skb_release(skb, 1); } else if (ucs_likely(!is_async)) { /* dispatch user completion immediately */ - uct_ud_iface_dispatch_comp(iface, uct_ud_comp_desc(skb)->comp, - status); + cdesc = uct_ud_comp_desc(skb); + uct_completion_update_status(cdesc->comp, status); + uct_ud_iface_dispatch_comp(iface, cdesc->comp); uct_ud_skb_release(skb, 1); } else { /* Don't call user completion from async context. Instead, put * it on a queue which will be progressed from main thread. */ - uct_ud_iface_add_async_comp(iface, skb, status); + uct_ud_iface_add_async_comp(iface, ep, skb, status); } } } @@ -225,6 +227,11 @@ static void uct_ud_ep_purge_outstanding(uct_ud_ep_t *ep) static void uct_ud_ep_purge(uct_ud_ep_t *ep, ucs_status_t status) { + uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_ud_iface_t); + + uct_ud_iface_dispatch_async_comps(iface, ep); + /* reset the maximal TX psn value to the default, since we should be able * to do TX operation after purging of the EP and uct_ep_flush(LOCAL) * operation has to return UCS_OK */ @@ -254,8 +261,9 @@ static unsigned uct_ud_ep_deferred_timeout_handler(void *arg) uct_ud_ep_purge(ep, UCS_ERR_ENDPOINT_TIMEOUT); - status = iface->super.ops->set_ep_failed(&iface->super, &ep->super.super, - UCS_ERR_ENDPOINT_TIMEOUT); + status = uct_iface_handle_ep_err(&iface->super.super.super, + &ep->super.super, + UCS_ERR_ENDPOINT_TIMEOUT); if (status != UCS_OK) { ucs_fatal("UD endpoint %p to "UCT_UD_EP_PEER_NAME_FMT": " "unhandled timeout error", @@ -1013,7 +1021,7 @@ ucs_status_t uct_ud_ep_flush_nolock(uct_ud_iface_t *iface, uct_ud_ep_t *ep, } else { /* Otherwise, add the skb after async completions */ ucs_assert(ep->tx.resend_count == 0); - uct_ud_iface_add_async_comp(iface, skb, UCS_OK); + uct_ud_iface_add_async_comp(iface, ep, skb, UCS_OK); } ucs_trace_data("added dummy flush skb %p psn %d user_comp %p", skb, @@ -1026,7 +1034,7 @@ ucs_status_t uct_ud_ep_flush_nolock(uct_ud_iface_t *iface, uct_ud_ep_t *ep, ucs_status_t uct_ud_ep_flush(uct_ep_h ep_h, unsigned flags, uct_completion_t *comp) { - uct_ud_ep_t *ep = ucs_derived_of(ep_h, uct_ud_ep_t); + uct_ud_ep_t *ep = ucs_derived_of(ep_h, uct_ud_ep_t); uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t); ucs_status_t status; @@ -1035,7 +1043,7 @@ ucs_status_t uct_ud_ep_flush(uct_ep_h ep_h, unsigned flags, if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) { uct_ep_pending_purge(ep_h, NULL, 0); - uct_ud_iface_dispatch_async_comps(iface); + uct_ud_iface_dispatch_async_comps(iface, ep); uct_ud_ep_purge(ep, UCS_ERR_CANCELED); /* FIXME make flush(CANCEL) operation truly non-blocking and wait until * all of the outstanding sends are completed. 
Without this, zero-copy
@@ -1062,6 +1070,27 @@ ucs_status_t uct_ud_ep_flush(uct_ep_h ep_h, unsigned flags,
     return status;
 }
 
+ucs_status_t uct_ud_ep_check(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp)
+{
+    uct_ud_ep_t *ep       = ucs_derived_of(tl_ep, uct_ud_ep_t);
+    uct_ud_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_ud_iface_t);
+    char dummy            = 0;
+
+    UCT_EP_KEEPALIVE_CHECK_PARAM(flags, comp);
+
+    uct_ud_enter(iface);
+    if (/* skip the check if the EP is not connected yet, or if there is a
+         * signaled operation in flight which already provides the peer status */
+        !uct_ud_ep_is_connected(ep) ||
+        !uct_ud_ep_is_last_ack_received(ep)) {
+        uct_ud_leave(iface);
+        return UCS_OK;
+    }
+    uct_ud_leave(iface);
+
+    return uct_ep_put_short(tl_ep, &dummy, 0, 0, 0);
+}
+
 static uct_ud_send_skb_t *uct_ud_ep_prepare_crep(uct_ud_ep_t *ep)
 {
     uct_ud_send_skb_t *skb;
@@ -1286,7 +1315,9 @@ static void uct_ud_ep_send_ack(uct_ud_iface_t *iface, uct_ud_ep_t *ep)
     skb->neth->packet_type = ep->dest_ep_id;
     if (uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_ACK_REQ)) {
         skb->neth->packet_type |= UCT_UD_PACKET_FLAG_ACK_REQ;
-        ctl_flags |= UCT_UD_IFACE_SEND_CTL_FLAG_SOLICITED;
+        if (ep->tx.tick >= iface->config.min_poke_time) {
+            ctl_flags |= UCT_UD_IFACE_SEND_CTL_FLAG_SOLICITED;
+        }
     }
 
     if (uct_ud_ep_ctl_op_check(ep, UCT_UD_EP_OP_NACK)) {
@@ -1596,6 +1627,9 @@ void uct_ud_ep_disconnect(uct_ep_h tl_ep)
     /* schedule flush */
     uct_ud_ep_flush(tl_ep, 0, NULL);
 
+    /* cancel outstanding user operations */
+    uct_ud_ep_purge(ep, UCS_ERR_CANCELED);
+
     /* the EP will be destroyed by interface destroy or timeout in
      * uct_ud_ep_timer */
diff --git a/src/uct/ib/ud/base/ud_ep.h b/src/uct/ib/ud/base/ud_ep.h
index 1f57fdf3480..7427f8e2951 100644
--- a/src/uct/ib/ud/base/ud_ep.h
+++ b/src/uct/ib/ud/base/ud_ep.h
@@ -303,6 +303,8 @@ ucs_status_t uct_ud_ep_flush(uct_ep_h ep, unsigned flags,
 ucs_status_t uct_ud_ep_flush_nolock(uct_ud_iface_t *iface, uct_ud_ep_t *ep,
                                     uct_completion_t *comp);
 
+ucs_status_t uct_ud_ep_check(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp);
+
 ucs_status_t uct_ud_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr);
 
 ucs_status_t uct_ud_ep_create_connected_common(const uct_ep_params_t *params,
diff --git a/src/uct/ib/ud/base/ud_iface.c b/src/uct/ib/ud/base/ud_iface.c
index 01ec3a52435..5ca415da486 100644
--- a/src/uct/ib/ud/base/ud_iface.c
+++ b/src/uct/ib/ud/base/ud_iface.c
@@ -1,5 +1,5 @@
 /**
-* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
+* Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
*/ @@ -147,6 +147,14 @@ static void uct_ud_iface_send_skb_init(uct_iface_h tl_iface, void *obj, skb->flags = UCT_UD_SEND_SKB_FLAG_INVALID; } +static void uct_ud_iface_destroy_qp(uct_ud_iface_t *ud_iface) +{ + uct_ud_iface_ops_t *ops = ucs_derived_of(ud_iface->super.ops, + uct_ud_iface_ops_t); + + ops->destroy_qp(ud_iface); +} + static ucs_status_t uct_ud_iface_create_qp(uct_ud_iface_t *self, const uct_ud_iface_config_t *config) { @@ -160,7 +168,7 @@ uct_ud_iface_create_qp(uct_ud_iface_t *self, const uct_ud_iface_config_t *config qp_init_attr.sq_sig_all = 0; qp_init_attr.cap.max_send_wr = config->super.tx.queue_len; qp_init_attr.cap.max_recv_wr = config->super.rx.queue_len; - qp_init_attr.cap.max_send_sge = 2; + qp_init_attr.cap.max_send_sge = config->super.tx.min_sge + 1; qp_init_attr.cap.max_recv_sge = 1; qp_init_attr.cap.max_inline_data = config->super.tx.min_inline; @@ -202,8 +210,9 @@ uct_ud_iface_create_qp(uct_ud_iface_t *self, const uct_ud_iface_config_t *config } return UCS_OK; + err_destroy_qp: - uct_ib_destroy_qp(self->qp); + uct_ud_iface_destroy_qp(self); return UCS_ERR_INVALID_PARAM; } @@ -404,8 +413,9 @@ static ucs_status_t uct_ud_iface_gid_hash_init(uct_ud_iface_t *iface, return status; } -UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, - uct_worker_h worker, const uct_iface_params_t *params, +UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, + uct_iface_ops_t *tl_ops, uct_md_h md, uct_worker_h worker, + const uct_iface_params_t *params, const uct_ud_iface_config_t *config, uct_ib_iface_init_attr_t *init_attr) { @@ -443,7 +453,7 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, init_attr->seg_size = ucs_min(mtu, config->super.seg_size); init_attr->qp_type = IBV_QPT_UD; - UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, &ops->super, md, worker, + UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, &ops->super, tl_ops, md, worker, params, &config->super, init_attr); if (self->super.super.worker->async == NULL) { @@ -460,6 +470,7 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, self->rx.quota = 0; self->config.tx_qp_len = config->super.tx.queue_len; self->config.peer_timeout = ucs_time_from_sec(config->peer_timeout); + self->config.min_poke_time = ucs_time_from_sec(config->min_poke_time); self->config.check_grh_dgid = config->dgid_check && uct_ib_iface_is_roce(&self->super); @@ -508,14 +519,14 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, UCT_UD_IFACE_HOOK_INIT(self); + ucs_ptr_array_init(&self->eps, "ud_eps"); + status = uct_ud_iface_create_qp(self, config); if (status != UCS_OK) { - return UCS_ERR_INVALID_PARAM; + goto err_eps_array; } - ucs_ptr_array_init(&self->eps, "ud_eps"); - - status = uct_ib_iface_recv_mpool_init(&self->super, &config->super, + status = uct_ib_iface_recv_mpool_init(&self->super, &config->super, params, "ud_recv_skb", &self->rx.mp); if (status != UCS_OK) { goto err_qp; @@ -570,7 +581,8 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, err_rx_mpool: ucs_mpool_cleanup(&self->rx.mp, 1); err_qp: - uct_ib_destroy_qp(self->qp); + uct_ud_iface_destroy_qp(self); +err_eps_array: ucs_ptr_array_cleanup(&self->eps); return status; } @@ -591,6 +603,8 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ud_iface_t) { ucs_trace_func(""); + uct_ud_iface_remove_async_handlers(self); + /* TODO: proper flush and connection termination */ uct_ud_enter(self); ucs_conn_match_cleanup(&self->conn_match_ctx); @@ -602,7 +616,7 @@ static 
UCS_CLASS_CLEANUP_FUNC(uct_ud_iface_t)
     /* TODO: qp to error state and cleanup all wqes */
     uct_ud_iface_free_pending_rx(self);
     ucs_mpool_cleanup(&self->rx.mp, 0);
-    uct_ib_destroy_qp(self->qp);
+    uct_ud_iface_destroy_qp(self);
     ucs_debug("iface(%p): ptr_array cleanup", self);
     ucs_ptr_array_cleanup(&self->eps);
     ucs_arbiter_cleanup(&self->tx.pending_q);
@@ -623,15 +637,26 @@ ucs_config_field_t uct_ud_iface_config_table[] = {
     {"TIMEOUT", "5.0m", "Transport timeout",
      ucs_offsetof(uct_ud_iface_config_t, peer_timeout), UCS_CONFIG_TYPE_TIME},
+
     {"TIMER_TICK", "10ms", "Initial timeout for retransmissions",
      ucs_offsetof(uct_ud_iface_config_t, timer_tick), UCS_CONFIG_TYPE_TIME},
+
     {"TIMER_BACKOFF", "2.0",
      "Timeout multiplier for resending trigger (must be >= "
      UCS_PP_MAKE_STRING(UCT_UD_MIN_TIMER_TIMER_BACKOFF) ")",
      ucs_offsetof(uct_ud_iface_config_t, timer_backoff), UCS_CONFIG_TYPE_DOUBLE},
+
     {"ASYNC_TIMER_TICK", "100ms", "Resolution for async timer",
      ucs_offsetof(uct_ud_iface_config_t, event_timer_tick), UCS_CONFIG_TYPE_TIME},
+
+    {"MIN_POKE_TIME", "250ms",
+     "Minimal interval to send an ACK request with the solicited flag, to wake up\n"
+     "the remote peer in case it is not actively calling progress.\n"
+     "Smaller values may incur performance overhead, while extremely large\n"
+     "values can cause delays in the presence of packet drops.",
+     ucs_offsetof(uct_ud_iface_config_t, min_poke_time), UCS_CONFIG_TYPE_TIME},
+
     {"ETH_DGID_CHECK", "y",
      "Enable checking destination GID for incoming packets of Ethernet network.\n"
      "Mismatched packets are silently dropped.",
@@ -669,6 +694,7 @@ ucs_status_t uct_ud_iface_query(uct_ud_iface_t *iface,
                            UCT_IFACE_FLAG_CONNECT_TO_EP |
                            UCT_IFACE_FLAG_CONNECT_TO_IFACE |
                            UCT_IFACE_FLAG_PENDING |
+                           UCT_IFACE_FLAG_EP_CHECK |
                            UCT_IFACE_FLAG_CB_SYNC |
                            UCT_IFACE_FLAG_CB_ASYNC |
                            UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE;
@@ -797,17 +823,26 @@ uct_ud_send_skb_t *uct_ud_iface_ctl_skb_get(uct_ud_iface_t *iface)
     return skb;
 }
 
-void uct_ud_iface_dispatch_async_comps_do(uct_ud_iface_t *iface)
+unsigned
+uct_ud_iface_dispatch_async_comps_do(uct_ud_iface_t *iface, uct_ud_ep_t *ep)
 {
-    uct_ud_comp_desc_t *cdesc;
+    unsigned count = 0;
     uct_ud_send_skb_t *skb;
+    uct_ud_comp_desc_t *cdesc;
 
     ucs_queue_for_each_extract(skb, &iface->tx.async_comp_q, queue, 1) {
         ucs_assert(!(skb->flags & UCT_UD_SEND_SKB_FLAG_RESENDING));
         cdesc = uct_ud_comp_desc(skb);
-        uct_ud_iface_dispatch_comp(iface, cdesc->comp, cdesc->status);
-        uct_ud_skb_release(skb, 0);
+        ucs_assert(cdesc->ep != NULL);
+
+        if ((ep == NULL) || (ep == cdesc->ep)) {
+            uct_ud_iface_dispatch_comp(iface, cdesc->comp);
+            uct_ud_skb_release(skb, 0);
+        }
+        ++count;
     }
+
+    return count;
 }
 
 static void uct_ud_iface_free_async_comps(uct_ud_iface_t *iface)
@@ -819,31 +854,27 @@ static void uct_ud_iface_free_async_comps(uct_ud_iface_t *iface)
     }
 }
 
-ucs_status_t uct_ud_iface_dispatch_pending_rx_do(uct_ud_iface_t *iface)
+unsigned uct_ud_iface_dispatch_pending_rx_do(uct_ud_iface_t *iface)
 {
-    int count;
+    unsigned max_poll = iface->super.config.rx_max_poll;
+    int count         = 0;
     uct_ud_recv_skb_t *skb;
     uct_ud_neth_t *neth;
-    unsigned max_poll = iface->super.config.rx_max_poll;
+    void *hdr;
 
-    count = 0;
     do {
-        skb = ucs_queue_pull_elem_non_empty(&iface->rx.pending_q, uct_ud_recv_skb_t, u.am.queue);
-        neth = (uct_ud_neth_t *)((char *)uct_ib_iface_recv_desc_hdr(&iface->super,
-                                                                    (uct_ib_iface_recv_desc_t *)skb) +
-                                 UCT_IB_GRH_LEN);
-        uct_ib_iface_invoke_am_desc(&iface->super,
-                                    uct_ud_neth_get_am_id(neth),
-                                    neth + 1,
-                                    skb->u.am.len,
-                                    &skb->super);
-        count++;
-        if (count >= 
max_poll) { - return UCS_ERR_NO_RESOURCE; - } - } while (!ucs_queue_is_empty(&iface->rx.pending_q)); - - return UCS_OK; + skb = ucs_queue_pull_elem_non_empty(&iface->rx.pending_q, + uct_ud_recv_skb_t, u.am.queue); + hdr = uct_ib_iface_recv_desc_hdr(&iface->super, + (uct_ib_iface_recv_desc_t*)skb); + neth = (uct_ud_neth_t*)UCS_PTR_BYTE_OFFSET(hdr, UCT_IB_GRH_LEN); + + uct_ib_iface_invoke_am_desc(&iface->super, uct_ud_neth_get_am_id(neth), + neth + 1, skb->u.am.len, &skb->super); + ++count; + } while ((count < max_poll) && !ucs_queue_is_empty(&iface->rx.pending_q)); + + return count; } static void uct_ud_iface_free_pending_rx(uct_ud_iface_t *iface) @@ -875,6 +906,8 @@ ucs_status_t uct_ud_iface_event_arm(uct_iface_h tl_iface, unsigned events) status = uct_ib_iface_pre_arm(&iface->super); if (status != UCS_OK) { + ucs_trace("iface %p: pre arm failed status %s", iface, + ucs_status_string(status)); goto out; } @@ -882,21 +915,35 @@ ucs_status_t uct_ud_iface_event_arm(uct_iface_h tl_iface, unsigned events) if ((events & (UCT_EVENT_RECV | UCT_EVENT_RECV_SIG)) && !ucs_queue_is_empty(&iface->rx.pending_q)) { + ucs_trace("iface %p: arm failed, has %lu unhandled receives", iface, + ucs_queue_length(&iface->rx.pending_q)); status = UCS_ERR_BUSY; goto out; } - /* Check if some send completions were not delivered yet */ - if ((events & UCT_EVENT_SEND_COMP) && - !ucs_queue_is_empty(&iface->tx.async_comp_q)) - { - status = UCS_ERR_BUSY; - goto out; + if (events & UCT_EVENT_SEND_COMP) { + /* Check if some send completions were not delivered yet */ + if (!ucs_queue_is_empty(&iface->tx.async_comp_q)) { + ucs_trace("iface %p: arm failed, has %lu async send comp", iface, + ucs_queue_length(&iface->tx.async_comp_q)); + status = UCS_ERR_BUSY; + goto out; + } + + /* Check if we have pending operations which need to be progressed */ + if (iface->tx.async_before_pending) { + ucs_trace("iface %p: arm failed, has async-before-pending flag", + iface); + status = UCS_ERR_BUSY; + goto out; + } } if (events & UCT_EVENT_SEND_COMP) { status = iface->super.ops->arm_cq(&iface->super, UCT_IB_DIR_TX, 0); if (status != UCS_OK) { + ucs_trace("iface %p: arm cq failed status %s", iface, + ucs_status_string(status)); goto out; } } @@ -905,10 +952,13 @@ ucs_status_t uct_ud_iface_event_arm(uct_iface_h tl_iface, unsigned events) /* we may get send completion through ACKs as well */ status = iface->super.ops->arm_cq(&iface->super, UCT_IB_DIR_RX, 0); if (status != UCS_OK) { + ucs_trace("iface %p: arm cq failed status %s", iface, + ucs_status_string(status)); goto out; } } + ucs_trace("iface %p: arm cq ok", iface); status = UCS_OK; out: uct_ud_leave(iface); diff --git a/src/uct/ib/ud/base/ud_iface.h b/src/uct/ib/ud/base/ud_iface.h index deef3fb0880..68d52e9a130 100644 --- a/src/uct/ib/ud/base/ud_iface.h +++ b/src/uct/ib/ud/base/ud_iface.h @@ -53,6 +53,7 @@ typedef struct uct_ud_iface_config { uct_ib_iface_config_t super; uct_ud_iface_common_config_t ud_common; double peer_timeout; + double min_poke_time; double timer_tick; double timer_backoff; double event_timer_tick; @@ -107,6 +108,7 @@ typedef struct uct_ud_iface_ops { void (*ep_free)(uct_ep_h ep); ucs_status_t (*create_qp)(uct_ib_iface_t *iface, uct_ib_qp_attr_t *attr, struct ibv_qp **qp_p); + void (*destroy_qp)(uct_ud_iface_t *ud_iface); ucs_status_t (*unpack_peer_address)(uct_ud_iface_t *iface, const uct_ib_address_t *ib_addr, const uct_ud_iface_addr_t *if_addr, @@ -179,6 +181,7 @@ struct uct_ud_iface { } tx; struct { ucs_time_t peer_timeout; + ucs_time_t min_poke_time; unsigned 
tx_qp_len;
         unsigned                 max_inline;
         int                      check_grh_dgid;
@@ -207,10 +210,9 @@ struct uct_ud_iface {
 };
 
-UCS_CLASS_DECLARE(uct_ud_iface_t, uct_ud_iface_ops_t*, uct_md_h,
-                  uct_worker_h, const uct_iface_params_t*,
-                  const uct_ud_iface_config_t*,
-                  uct_ib_iface_init_attr_t*)
+UCS_CLASS_DECLARE(uct_ud_iface_t, uct_ud_iface_ops_t*, uct_iface_ops_t*,
+                  uct_md_h, uct_worker_h, const uct_iface_params_t*,
+                  const uct_ud_iface_config_t*, uct_ib_iface_init_attr_t*)
 
 
 struct uct_ud_ctl_hdr {
@@ -332,7 +334,7 @@ uct_ud_ep_t *uct_ud_iface_cep_get_ep(uct_ud_iface_t *iface,
 
 void uct_ud_iface_cep_remove_ep(uct_ud_iface_t *iface, uct_ud_ep_t *ep);
 
-ucs_status_t uct_ud_iface_dispatch_pending_rx_do(uct_ud_iface_t *iface);
+unsigned uct_ud_iface_dispatch_pending_rx_do(uct_ud_iface_t *iface);
 
 ucs_status_t uct_ud_iface_event_arm(uct_iface_h tl_iface, unsigned events);
 
@@ -346,7 +348,8 @@ void uct_ud_iface_ctl_skb_complete(uct_ud_iface_t *iface,
 void uct_ud_iface_send_completion(uct_ud_iface_t *iface, uint16_t sn,
                                   int is_async);
 
-void uct_ud_iface_dispatch_async_comps_do(uct_ud_iface_t *iface);
+unsigned
+uct_ud_iface_dispatch_async_comps_do(uct_ud_iface_t *iface, uct_ud_ep_t *ep);
 
 static UCS_F_ALWAYS_INLINE int
 uct_ud_iface_can_tx(uct_ud_iface_t *iface)
@@ -380,18 +383,9 @@ static UCS_F_ALWAYS_INLINE void uct_ud_leave(uct_ud_iface_t *iface)
 }
 
-static UCS_F_ALWAYS_INLINE unsigned
-uct_ud_grh_get_dgid_len(struct ibv_grh *grh)
-{
-    static const uint8_t ipmask = 0xf0;
-    uint8_t ipver               = ((*(uint8_t*)grh) & ipmask);
-
-    return (ipver == (6 << 4)) ? UCS_IPV6_ADDR_LEN : UCS_IPV4_ADDR_LEN;
-}
-
-
 static UCS_F_ALWAYS_INLINE int
-uct_ud_iface_check_grh(uct_ud_iface_t *iface, void *packet, int is_grh_present)
+uct_ud_iface_check_grh(uct_ud_iface_t *iface, void *packet, int is_grh_present,
+                       uint8_t roce_pkt_type)
 {
     struct ibv_grh *grh = (struct ibv_grh *)packet;
     size_t gid_len;
@@ -408,7 +402,25 @@ uct_ud_iface_check_grh(uct_ud_iface_t *iface, void *packet, int is_grh_present)
         return 1;
     }
 
-    gid_len = uct_ud_grh_get_dgid_len(grh);
+    /*
+     * Take the packet type from the CQE, because:
+     * 1. According to Annex17_RoCEv2 (A17.4.5.1):
+     *    For UD, the Completion Queue Entry (CQE) includes remote address
+     *    information (InfiniBand Specification Vol. 1 Rev 1.2.1 Section 11.4.2.1).
+     *    For RoCEv2, the remote address information comprises the source L2
+     *    Address and a flag that indicates if the received frame is an IPv4,
+     *    IPv6 or RoCE packet.
+     * 2. According to the PRM, for responder UD/DC over RoCE, sl represents the
+     *    RoCE packet type as:
+     *    bit 3    : when set, the R-RoCE frame contains a UDP header, otherwise not
+     *    Bits[2:0]: L3_Header_Type, as defined below
+     *     - 0x0 : GRH  - (RoCE v1.0)
+     *     - 0x1 : IPv6 - (RoCE v1.5/v2.0)
+     *     - 0x2 : IPv4 - (RoCE v1.5/v2.0)
+     */
+    gid_len = ((roce_pkt_type & UCT_IB_CQE_SL_PKTYPE_MASK) == 0x2) ? 
+ UCS_IPV4_ADDR_LEN : UCS_IPV6_ADDR_LEN; + if (ucs_likely((gid_len == iface->gid_table.last_len) && uct_ud_gid_equal(&grh->dgid, &iface->gid_table.last, gid_len))) { @@ -537,23 +549,25 @@ uct_ud_iface_add_ctl_desc(uct_ud_iface_t *iface, uct_ud_ctl_desc_t *cdesc) } -static UCS_F_ALWAYS_INLINE ucs_status_t +static UCS_F_ALWAYS_INLINE unsigned uct_ud_iface_dispatch_pending_rx(uct_ud_iface_t *iface) { if (ucs_likely(ucs_queue_is_empty(&iface->rx.pending_q))) { - return UCS_OK; + return 0; } + return uct_ud_iface_dispatch_pending_rx_do(iface); } -static UCS_F_ALWAYS_INLINE void -uct_ud_iface_dispatch_async_comps(uct_ud_iface_t *iface) +static UCS_F_ALWAYS_INLINE unsigned +uct_ud_iface_dispatch_async_comps(uct_ud_iface_t *iface, uct_ud_ep_t *ep) { if (ucs_likely(ucs_queue_is_empty(&iface->tx.async_comp_q))) { - return; + return 0; } - uct_ud_iface_dispatch_async_comps_do(iface); + + return uct_ud_iface_dispatch_async_comps_do(iface, ep); } #if ENABLE_PARAMS_CHECK diff --git a/src/uct/ib/ud/base/ud_inl.h b/src/uct/ib/ud/base/ud_inl.h index 2b78f95c036..7e650894d12 100644 --- a/src/uct/ib/ud/base/ud_inl.h +++ b/src/uct/ib/ud/base/ud_inl.h @@ -238,23 +238,22 @@ uct_ud_skb_bcopy(uct_ud_send_skb_t *skb, uct_pack_callback_t pack_cb, void *arg) } static UCS_F_ALWAYS_INLINE void -uct_ud_iface_dispatch_comp(uct_ud_iface_t *iface, uct_completion_t *comp, - ucs_status_t status) +uct_ud_iface_dispatch_comp(uct_ud_iface_t *iface, uct_completion_t *comp) { /* Avoid reordering with pending queue - if we have any pending requests, * prevent send operations from the completion callback */ uct_ud_iface_raise_pending_async_ev(iface); - uct_invoke_completion(comp, status); + uct_invoke_completion(comp, UCS_OK); } static UCS_F_ALWAYS_INLINE void -uct_ud_iface_add_async_comp(uct_ud_iface_t *iface, uct_ud_send_skb_t *skb, - ucs_status_t status) +uct_ud_iface_add_async_comp(uct_ud_iface_t *iface, uct_ud_ep_t *ep, + uct_ud_send_skb_t *skb, ucs_status_t status) { uct_ud_comp_desc_t *cdesc = uct_ud_comp_desc(skb); - cdesc->status = status; + cdesc->ep = ep; + uct_completion_update_status(cdesc->comp, status); ucs_queue_push(&iface->tx.async_comp_q, &skb->queue); } - diff --git a/src/uct/ib/ud/verbs/ud_verbs.c b/src/uct/ib/ud/verbs/ud_verbs.c index 3b15ff9ba70..f3d7a41c734 100644 --- a/src/uct/ib/ud/verbs/ud_verbs.c +++ b/src/uct/ib/ud/verbs/ud_verbs.c @@ -102,9 +102,9 @@ static inline void uct_ud_verbs_ep_tx_inlv(uct_ud_verbs_iface_t *iface, uct_ud_verbs_ep_t *ep, const void *buffer, unsigned length) { - iface->tx.sge[1].addr = (uintptr_t)buffer; - iface->tx.sge[1].length = length; - ucs_assert(iface->tx.wr_inl.num_sge == 2); + iface->tx.sge[1].addr = (uintptr_t)buffer; + iface->tx.sge[1].length = length; + iface->tx.wr_inl.num_sge = 2; uct_ud_verbs_post_send(iface, ep, &iface->tx.wr_inl, IBV_SEND_INLINE, 2); } @@ -196,6 +196,41 @@ ucs_status_t uct_ud_verbs_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t hdr, return UCS_OK; } +static ucs_status_t uct_ud_verbs_ep_am_short_iov(uct_ep_h tl_ep, uint8_t id, + const uct_iov_t *iov, size_t iovcnt) +{ + uct_ud_verbs_ep_t *ep = ucs_derived_of(tl_ep, uct_ud_verbs_ep_t); + uct_ud_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ud_verbs_iface_t); + uct_ud_send_skb_t *skb; + ucs_status_t status; + + UCT_CHECK_IOV_SIZE(iovcnt, (size_t)iface->config.max_send_sge, + "uct_ud_verbs_ep_am_short_iov"); + UCT_CHECK_LENGTH(sizeof(uct_ud_neth_t) + uct_iov_total_length(iov, iovcnt), 0, + iface->super.config.max_inline, "am_short"); + + uct_ud_enter(&iface->super); + + status = 
uct_ud_am_skb_common(&iface->super, &ep->super, id, &skb); + if (status != UCS_OK) { + uct_ud_leave(&iface->super); + return status; + } + + skb->len = iface->tx.sge[0].length = sizeof(uct_ud_neth_t); + iface->tx.sge[0].addr = (uintptr_t)skb->neth; + iface->tx.wr_inl.num_sge = uct_ib_verbs_sge_fill_iov(iface->tx.sge + 1, + iov, iovcnt) + 1; + uct_ud_verbs_post_send(iface, ep, &iface->tx.wr_inl, IBV_SEND_INLINE, + iface->tx.wr_inl.num_sge); + + uct_ud_iface_complete_tx_skb(&iface->super, &ep->super, skb); + UCT_TL_EP_STAT_OP(&ep->super.super, AM, SHORT, uct_iov_total_length(iov, iovcnt)); + uct_ud_leave(&iface->super); + + return UCS_OK; +} + static ssize_t uct_ud_verbs_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_cb, void *arg, unsigned flags) @@ -367,7 +402,7 @@ uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface, int is_async) UCT_IB_IFACE_VERBS_FOREACH_RXWQE(&iface->super.super, i, packet, wc, num_wcs) { if (!uct_ud_iface_check_grh(&iface->super, packet, - wc[i].wc_flags & IBV_WC_GRH)) { + wc[i].wc_flags & IBV_WC_GRH, wc[i].sl)) { ucs_mpool_put_inline((void*)wc[i].wr_id); continue; } @@ -386,13 +421,6 @@ uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface, int is_async) return num_wcs; } -static ucs_status_t uct_ud_verbs_ep_set_failed(uct_ib_iface_t *iface, - uct_ep_h ep, ucs_status_t status) -{ - return uct_set_ep_failed(&UCS_CLASS_NAME(uct_ud_verbs_ep_t), ep, - &iface->super.super, status); -} - static unsigned uct_ud_verbs_iface_async_progress(uct_ud_iface_t *ud_iface) { uct_ud_verbs_iface_t *iface = ucs_derived_of(ud_iface, uct_ud_verbs_iface_t); @@ -413,19 +441,18 @@ static unsigned uct_ud_verbs_iface_async_progress(uct_ud_iface_t *ud_iface) static unsigned uct_ud_verbs_iface_progress(uct_iface_h tl_iface) { uct_ud_verbs_iface_t *iface = ucs_derived_of(tl_iface, uct_ud_verbs_iface_t); - ucs_status_t status; unsigned count; uct_ud_enter(&iface->super); - uct_ud_iface_dispatch_async_comps(&iface->super); - status = uct_ud_iface_dispatch_pending_rx(&iface->super); - if (status == UCS_OK) { + + count = uct_ud_iface_dispatch_async_comps(&iface->super, NULL); + count += uct_ud_iface_dispatch_pending_rx(&iface->super); + + if (ucs_likely(count == 0)) { count = uct_ud_verbs_iface_poll_rx(iface, 0); if (count == 0) { - count = uct_ud_verbs_iface_poll_tx(iface, 0); + count += uct_ud_verbs_iface_poll_tx(iface, 0); } - } else { - count = 0; } uct_ud_iface_progress_pending(&iface->super, 0); @@ -519,19 +546,46 @@ uct_ud_verbs_ep_create(const uct_ep_params_t *params, uct_ep_h *ep_p) return uct_ud_verbs_ep_t_new(params, ep_p); } +static void uct_ud_verbs_iface_destroy_qp(uct_ud_iface_t *ud_iface) +{ + uct_ib_destroy_qp(ud_iface->qp); +} + static void UCS_CLASS_DELETE_FUNC_NAME(uct_ud_verbs_iface_t)(uct_iface_t*); static uct_ud_iface_ops_t uct_ud_verbs_iface_ops = { - { - { + .super = { + .super = { + .iface_estimate_perf = uct_base_iface_estimate_perf, + .iface_vfs_refresh = (uct_iface_vfs_refresh_func_t)ucs_empty_function, + }, + .create_cq = uct_ib_verbs_create_cq, + .arm_cq = uct_ib_iface_arm_cq, + .event_cq = (uct_ib_iface_event_cq_func_t)ucs_empty_function, + .handle_failure = (uct_ib_iface_handle_failure_func_t)ucs_empty_function_do_assert, + }, + .async_progress = uct_ud_verbs_iface_async_progress, + .send_ctl = uct_ud_verbs_ep_send_ctl, + .ep_free = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_verbs_ep_t), + .create_qp = uct_ib_iface_create_qp, + .destroy_qp = uct_ud_verbs_iface_destroy_qp, + .unpack_peer_address = uct_ud_verbs_iface_unpack_peer_address, + 
.ep_get_peer_address = uct_ud_verbs_ep_get_peer_address, + .get_peer_address_length = uct_ud_verbs_get_peer_address_length, + .peer_address_str = uct_ud_verbs_iface_peer_address_str, +}; + +static uct_iface_ops_t uct_ud_verbs_iface_tl_ops = { .ep_put_short = uct_ud_verbs_ep_put_short, .ep_am_short = uct_ud_verbs_ep_am_short, + .ep_am_short_iov = uct_ud_verbs_ep_am_short_iov, .ep_am_bcopy = uct_ud_verbs_ep_am_bcopy, .ep_am_zcopy = uct_ud_verbs_ep_am_zcopy, .ep_pending_add = uct_ud_ep_pending_add, .ep_pending_purge = uct_ud_ep_pending_purge, .ep_flush = uct_ud_ep_flush, .ep_fence = uct_base_ep_fence, + .ep_check = uct_ud_ep_check, .ep_create = uct_ud_verbs_ep_create, .ep_destroy = uct_ud_ep_disconnect, .ep_get_address = uct_ud_ep_get_address, @@ -549,21 +603,6 @@ static uct_ud_iface_ops_t uct_ud_verbs_iface_ops = { .iface_get_device_address = uct_ib_iface_get_device_address, .iface_get_address = uct_ud_iface_get_address, .iface_is_reachable = uct_ib_iface_is_reachable - }, - .create_cq = uct_ib_verbs_create_cq, - .arm_cq = uct_ib_iface_arm_cq, - .event_cq = (uct_ib_iface_event_cq_func_t)ucs_empty_function, - .handle_failure = (uct_ib_iface_handle_failure_func_t)ucs_empty_function_do_assert, - .set_ep_failed = uct_ud_verbs_ep_set_failed, - }, - .async_progress = uct_ud_verbs_iface_async_progress, - .send_ctl = uct_ud_verbs_ep_send_ctl, - .ep_free = UCS_CLASS_DELETE_FUNC_NAME(uct_ud_verbs_ep_t), - .create_qp = uct_ib_iface_create_qp, - .unpack_peer_address = uct_ud_verbs_iface_unpack_peer_address, - .ep_get_peer_address = uct_ud_verbs_ep_get_peer_address, - .get_peer_address_length = uct_ud_verbs_get_peer_address_length, - .peer_address_str = uct_ud_verbs_iface_peer_address_str }; static UCS_F_NOINLINE void @@ -624,19 +663,21 @@ static UCS_CLASS_INIT_FUNC(uct_ud_verbs_iface_t, uct_md_h md, uct_worker_h worke const uct_iface_params_t *params, const uct_iface_config_t *tl_config) { - uct_ud_iface_config_t *config = ucs_derived_of(tl_config, - uct_ud_iface_config_t); + uct_ud_iface_config_t *config = ucs_derived_of(tl_config, + uct_ud_iface_config_t); uct_ib_iface_init_attr_t init_attr = {}; ucs_status_t status; ucs_trace_func(""); - init_attr.cq_len[UCT_IB_DIR_TX] = config->super.tx.queue_len; - init_attr.cq_len[UCT_IB_DIR_RX] = config->super.rx.queue_len; + init_attr.cq_len[UCT_IB_DIR_TX] = config->super.tx.queue_len; + init_attr.cq_len[UCT_IB_DIR_RX] = config->super.rx.queue_len; - UCS_CLASS_CALL_SUPER_INIT(uct_ud_iface_t, &uct_ud_verbs_iface_ops, md, + UCS_CLASS_CALL_SUPER_INIT(uct_ud_iface_t, &uct_ud_verbs_iface_ops, &uct_ud_verbs_iface_tl_ops, md, worker, params, config, &init_attr); + self->super.super.config.sl = uct_ib_iface_config_select_sl(&config->super); + memset(&self->tx.wr_inl, 0, sizeof(self->tx.wr_inl)); self->tx.wr_inl.opcode = IBV_WR_SEND; self->tx.wr_inl.wr_id = 0xBEEBBEEB; @@ -644,7 +685,6 @@ static UCS_CLASS_INIT_FUNC(uct_ud_verbs_iface_t, uct_md_h md, uct_worker_h worke self->tx.wr_inl.imm_data = 0; self->tx.wr_inl.next = 0; self->tx.wr_inl.sg_list = self->tx.sge; - self->tx.wr_inl.num_sge = 2; memset(&self->tx.wr_skb, 0, sizeof(self->tx.wr_skb)); self->tx.wr_skb.opcode = IBV_WR_SEND; @@ -679,7 +719,6 @@ static UCS_CLASS_INIT_FUNC(uct_ud_verbs_iface_t, uct_md_h md, uct_worker_h worke static UCS_CLASS_CLEANUP_FUNC(uct_ud_verbs_iface_t) { ucs_trace_func(""); - uct_ud_iface_remove_async_handlers(&self->super); } UCS_CLASS_DEFINE(uct_ud_verbs_iface_t, uct_ud_iface_t); diff --git a/src/uct/rocm/base/rocm_base.c b/src/uct/rocm/base/rocm_base.c index 718d80e6d43..f2dedee7f61 
100644 --- a/src/uct/rocm/base/rocm_base.c +++ b/src/uct/rocm/base/rocm_base.c @@ -11,7 +11,6 @@ #include -#include #include @@ -110,7 +109,8 @@ ucs_status_t uct_rocm_base_query_devices(uct_md_h md, unsigned *num_tl_devices_p) { return uct_single_device_resource(md, md->component->name, - UCT_DEVICE_TYPE_ACC, tl_devices_p, + UCT_DEVICE_TYPE_ACC, + UCS_SYS_DEVICE_ID_UNKNOWN, tl_devices_p, num_tl_devices_p); } @@ -177,8 +177,8 @@ ucs_status_t uct_rocm_base_detect_memory_type(uct_md_h md, const void *addr, hsa_status_t status; hsa_amd_pointer_info_t info; + *mem_type_p = UCS_MEMORY_TYPE_HOST; if (addr == NULL) { - *mem_type_p = UCS_MEMORY_TYPE_HOST; return UCS_OK; } @@ -199,6 +199,93 @@ ucs_status_t uct_rocm_base_detect_memory_type(uct_md_h md, const void *addr, return UCS_ERR_INVALID_ADDR; } +ucs_status_t uct_rocm_base_mem_query(uct_md_h md, const void *addr, + const size_t length, + uct_md_mem_attr_t *mem_attr_p) +{ + ucs_status_t status; + ucs_memory_type_t mem_type; + + status = uct_rocm_base_detect_memory_type(md, addr, length, &mem_type); + if (status != UCS_OK) { + return status; + } + + if (mem_attr_p->field_mask & UCT_MD_MEM_ATTR_FIELD_MEM_TYPE) { + mem_attr_p->mem_type = mem_type; + } + + if (mem_attr_p->field_mask & UCT_MD_MEM_ATTR_FIELD_SYS_DEV) { + mem_attr_p->sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; + } + + if (mem_attr_p->field_mask & UCT_MD_MEM_ATTR_FIELD_BASE_ADDRESS) { + mem_attr_p->base_address = (void*) addr; + } + + if (mem_attr_p->field_mask & UCT_MD_MEM_ATTR_FIELD_ALLOC_LENGTH) { + mem_attr_p->alloc_length = length; + } + + return UCS_OK; +} + +static hsa_status_t uct_rocm_hsa_pool_callback(hsa_amd_memory_pool_t pool, void* data) +{ + int allowed; + uint32_t flags; + hsa_amd_segment_t segment; + + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &allowed); + if (allowed) { + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + if (HSA_AMD_SEGMENT_GLOBAL != segment) { + return HSA_STATUS_SUCCESS; + } + + hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags); + if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { + *((hsa_amd_memory_pool_t*)data) = pool; + return HSA_STATUS_INFO_BREAK; + } + } + return HSA_STATUS_SUCCESS; +} + +ucs_status_t uct_rocm_base_get_link_type(hsa_amd_link_info_type_t *link_type) +{ + hsa_amd_memory_pool_link_info_t link_info; + hsa_agent_t agent1, agent2; + hsa_amd_memory_pool_t pool; + hsa_status_t status; + + *link_type = HSA_AMD_LINK_INFO_TYPE_PCIE; + + if (uct_rocm_base_agents.num_gpu < 2) { + return UCS_OK; + } + + agent1 = uct_rocm_base_agents.gpu_agents[0]; + agent2 = uct_rocm_base_agents.gpu_agents[1]; + + status = hsa_amd_agent_iterate_memory_pools(agent2, + uct_rocm_hsa_pool_callback, (void*)&pool); + if ((status != HSA_STATUS_SUCCESS) && (status != HSA_STATUS_INFO_BREAK)) { + ucs_debug("Could not iterate HSA memory pools: 0x%x", status); + return UCS_ERR_UNSUPPORTED; + } + + status = hsa_amd_agent_memory_pool_get_info(agent1, pool, + HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, &link_info); + if (status != HSA_STATUS_SUCCESS) { + ucs_debug("Could not get HSA memory pool info: 0x%x", status); + return UCS_ERR_UNSUPPORTED; + } + + *link_type = link_info.link_type; + return UCS_OK; +} + UCS_MODULE_INIT() { UCS_MODULE_FRAMEWORK_DECLARE(uct_rocm); UCS_MODULE_FRAMEWORK_LOAD(uct_rocm, 0); diff --git a/src/uct/rocm/base/rocm_base.h b/src/uct/rocm/base/rocm_base.h index d818b73c005..aeeed20a3db 100644 --- a/src/uct/rocm/base/rocm_base.h +++ 
b/src/uct/rocm/base/rocm_base.h @@ -10,6 +10,7 @@ #include #include #include +#include hsa_status_t uct_rocm_base_init(void); @@ -29,5 +30,9 @@ hsa_status_t uct_rocm_base_get_ptr_info(void *ptr, size_t size, ucs_status_t uct_rocm_base_detect_memory_type(uct_md_h md, const void *addr, size_t length, ucs_memory_type_t *mem_type_p); +ucs_status_t uct_rocm_base_mem_query(uct_md_h md, const void *addr, + const size_t length, + uct_md_mem_attr_t *mem_attr_p); +ucs_status_t uct_rocm_base_get_link_type(hsa_amd_link_info_type_t *type); #endif diff --git a/src/uct/rocm/copy/rocm_copy_ep.c b/src/uct/rocm/copy/rocm_copy_ep.c index 04095ad2515..e1752ccdb6f 100644 --- a/src/uct/rocm/copy/rocm_copy_ep.c +++ b/src/uct/rocm/copy/rocm_copy_ep.c @@ -9,13 +9,18 @@ #include "rocm_copy_ep.h" #include "rocm_copy_iface.h" +#include "rocm_copy_md.h" + +#include #include #include #include #include #include +#include + #define uct_rocm_memcpy_h2d(_d,_s,_l) memcpy((_d),(_s),(_l)) #define uct_rocm_memcpy_d2h(_d,_s,_l) ucs_memcpy_nontemporal((_d),(_s),(_l)) @@ -40,23 +45,64 @@ UCS_CLASS_DEFINE_DELETE_FUNC(uct_rocm_copy_ep_t, uct_ep_t); ucs_trace_data(_fmt " to %"PRIx64"(%+ld)", ## __VA_ARGS__, (_remote_addr), \ (_rkey)) -static UCS_F_ALWAYS_INLINE ucs_status_t -uct_rocm_copy_ep_zcopy(uct_ep_h tl_ep, - uint64_t remote_addr, - const uct_iov_t *iov, - int is_put) +ucs_status_t uct_rocm_copy_ep_zcopy(uct_ep_h tl_ep, + uint64_t remote_addr, + const uct_iov_t *iov, + uct_rkey_t rkey, + int is_put) { - size_t size = uct_iov_get_length(iov); + size_t size = uct_iov_get_length(iov); + uct_rocm_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rocm_copy_iface_t); + hsa_signal_t signal = iface->hsa_signal; + uct_rocm_copy_key_t *rocm_copy_key = (uct_rocm_copy_key_t *) rkey; + + hsa_status_t status; + hsa_agent_t agent; + void *src_addr, *dst_addr; + void *host_ptr, *dev_ptr, *mapped_ptr; + size_t offset; + + ucs_trace("remote addr %p rkey %p size %zu", + (void*)remote_addr, (void*)rkey, size); + + if (is_put) { /* Host-to-Device */ + host_ptr = iov->buffer; + dev_ptr = (void *)remote_addr; + } else { /* Device-to-Host */ + dev_ptr = (void *)remote_addr; + host_ptr = iov->buffer; + } + + offset = (uint64_t) host_ptr - rocm_copy_key->vaddr; + mapped_ptr = UCS_PTR_BYTE_OFFSET(rocm_copy_key->dev_ptr, offset); - if (!size) { - return UCS_OK; + ucs_trace("host_ptr %p offset %zu dev_ptr %p mapped_ptr %p", + host_ptr, offset, rocm_copy_key->dev_ptr, mapped_ptr); + + status = uct_rocm_base_get_ptr_info(dev_ptr, size, NULL, NULL, &agent); + if (status != HSA_STATUS_SUCCESS) { + const char *addr_type = is_put ? 
"DST" : "SRC"; + ucs_error("%s addr %p/%lx is not ROCM memory", addr_type, dev_ptr, size); + return UCS_ERR_INVALID_ADDR; } - if (is_put) - uct_rocm_memcpy_h2d((void *)remote_addr, iov->buffer, size); - else - uct_rocm_memcpy_d2h(iov->buffer, (void *)remote_addr, size); + if (is_put) { + src_addr = mapped_ptr; + dst_addr = dev_ptr; + } else { + src_addr = dev_ptr; + dst_addr = mapped_ptr; + } + + hsa_signal_store_screlease(signal, 1); + ucs_trace("hsa async copy from src %p to dst %p, len %ld", + src_addr, dst_addr, size); + status = hsa_amd_memory_async_copy(dst_addr, agent, + src_addr, agent, + size, 0, NULL, signal); + while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, + UINT64_MAX, HSA_WAIT_STATE_ACTIVE)); return UCS_OK; } @@ -64,9 +110,16 @@ ucs_status_t uct_rocm_copy_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, si uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp) { + size_t size = uct_iov_get_length(iov); + uct_rocm_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rocm_copy_iface_t); ucs_status_t status; - status = uct_rocm_copy_ep_zcopy(tl_ep, remote_addr, iov, 0); + if (size < iface->config.d2h_thresh) { + uct_rocm_memcpy_d2h(iov->buffer, (void *)remote_addr, size); + status = UCS_OK; + } else { + status = uct_rocm_copy_ep_zcopy(tl_ep, remote_addr, iov, rkey, 0); + } UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), GET, ZCOPY, uct_iov_total_length(iov, iovcnt)); @@ -79,16 +132,22 @@ ucs_status_t uct_rocm_copy_ep_put_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, si uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp) { + size_t size = uct_iov_get_length(iov); + uct_rocm_copy_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rocm_copy_iface_t); ucs_status_t status; - status = uct_rocm_copy_ep_zcopy(tl_ep, remote_addr, iov, 1); + if (size < iface->config.h2d_thresh) { + uct_rocm_memcpy_h2d((void *)remote_addr, iov->buffer, size); + status = UCS_OK; + } else { + status = uct_rocm_copy_ep_zcopy(tl_ep, remote_addr, iov, rkey, 1); + } UCT_TL_EP_STAT_OP(ucs_derived_of(tl_ep, uct_base_ep_t), PUT, ZCOPY, uct_iov_total_length(iov, iovcnt)); uct_rocm_copy_trace_data(remote_addr, rkey, "GET_ZCOPY [length %zu]", uct_iov_total_length(iov, iovcnt)); return status; - } diff --git a/src/uct/rocm/copy/rocm_copy_iface.c b/src/uct/rocm/copy/rocm_copy_iface.c index 1d6b1a0bb7c..c3e1bf892a3 100644 --- a/src/uct/rocm/copy/rocm_copy_iface.c +++ b/src/uct/rocm/copy/rocm_copy_iface.c @@ -22,6 +22,14 @@ static ucs_config_field_t uct_rocm_copy_iface_config_table[] = { ucs_offsetof(uct_rocm_copy_iface_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, + {"D2H_THRESH", "16k", + "Threshold for switching to hsa memcpy for device-to-host copies", + ucs_offsetof(uct_rocm_copy_iface_config_t, d2h_thresh), UCS_CONFIG_TYPE_MEMUNITS}, + + {"H2D_THRESH", "1m", + "Threshold for switching to hsa memcpy for host-to-device copies", + ucs_offsetof(uct_rocm_copy_iface_config_t, h2d_thresh), UCS_CONFIG_TYPE_MEMUNITS}, + {NULL} }; @@ -126,18 +134,25 @@ static UCS_CLASS_INIT_FUNC(uct_rocm_copy_iface_t, uct_md_h md, uct_worker_h work const uct_iface_params_t *params, const uct_iface_config_t *tl_config) { - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_rocm_copy_iface_ops, md, worker, - params, tl_config UCS_STATS_ARG(params->stats_root) + uct_rocm_copy_iface_config_t *config = ucs_derived_of(tl_config, + uct_rocm_copy_iface_config_t); + + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_rocm_copy_iface_ops, NULL, + md, worker, params, + tl_config 
UCS_STATS_ARG(params->stats_root) UCS_STATS_ARG(UCT_ROCM_COPY_TL_NAME)); - self->id = ucs_generate_uuid((uintptr_t)self); + self->id = ucs_generate_uuid((uintptr_t)self); + self->config.d2h_thresh = config->d2h_thresh; + self->config.h2d_thresh = config->h2d_thresh; + hsa_signal_create(1, 0, NULL, &self->hsa_signal); return UCS_OK; } static UCS_CLASS_CLEANUP_FUNC(uct_rocm_copy_iface_t) { - + hsa_signal_destroy(self->hsa_signal); } UCS_CLASS_DEFINE(uct_rocm_copy_iface_t, uct_base_iface_t); diff --git a/src/uct/rocm/copy/rocm_copy_iface.h b/src/uct/rocm/copy/rocm_copy_iface.h index e1b4f0604af..77cccfb80e6 100644 --- a/src/uct/rocm/copy/rocm_copy_iface.h +++ b/src/uct/rocm/copy/rocm_copy_iface.h @@ -8,17 +8,26 @@ #include +#include + #define UCT_ROCM_COPY_TL_NAME "rocm_cpy" typedef uint64_t uct_rocm_copy_iface_addr_t; typedef struct uct_rocm_copy_iface { - uct_base_iface_t super; - uct_rocm_copy_iface_addr_t id; + uct_base_iface_t super; + uct_rocm_copy_iface_addr_t id; + hsa_signal_t hsa_signal; + struct { + size_t d2h_thresh; + size_t h2d_thresh; + } config; } uct_rocm_copy_iface_t; typedef struct uct_rocm_copy_iface_config { - uct_iface_config_t super; + uct_iface_config_t super; + size_t d2h_thresh; + size_t h2d_thresh; } uct_rocm_copy_iface_config_t; #endif diff --git a/src/uct/rocm/copy/rocm_copy_md.c b/src/uct/rocm/copy/rocm_copy_md.c index 453a484d47d..0430800ac14 100644 --- a/src/uct/rocm/copy/rocm_copy_md.c +++ b/src/uct/rocm/copy/rocm_copy_md.c @@ -15,7 +15,9 @@ #include #include #include +#include #include +#include #include #include @@ -25,19 +27,24 @@ static ucs_config_field_t uct_rocm_copy_md_config_table[] = { ucs_offsetof(uct_rocm_copy_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)}, + {"RCACHE", "try", "Enable using memory registration cache", + ucs_offsetof(uct_rocm_copy_md_config_t, enable_rcache), + UCS_CONFIG_TYPE_TERNARY}, + {NULL} }; static ucs_status_t uct_rocm_copy_md_query(uct_md_h md, uct_md_attr_t *md_attr) { - md_attr->cap.flags = UCT_MD_FLAG_REG; - md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); + md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; + md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST) | + UCS_BIT(UCS_MEMORY_TYPE_ROCM); + md_attr->cap.alloc_mem_types = 0; md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); - md_attr->cap.detect_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM) | - UCS_BIT(UCS_MEMORY_TYPE_ROCM_MANAGED); + md_attr->cap.detect_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); md_attr->cap.max_alloc = 0; md_attr->cap.max_reg = ULONG_MAX; - md_attr->rkey_packed_size = 0; + md_attr->rkey_packed_size = sizeof(uct_rocm_copy_key_t); md_attr->reg_cost = ucs_linear_func_make(0, 0); memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; @@ -46,6 +53,12 @@ static ucs_status_t uct_rocm_copy_md_query(uct_md_h md, uct_md_attr_t *md_attr) static ucs_status_t uct_rocm_copy_mkey_pack(uct_md_h md, uct_mem_h memh, void *rkey_buffer) { + uct_rocm_copy_key_t *packed = (uct_rocm_copy_key_t *)rkey_buffer; + uct_rocm_copy_mem_t *mem_hndl = (uct_rocm_copy_mem_t *)memh; + + packed->vaddr = (uint64_t) mem_hndl->vaddr; + packed->dev_ptr = mem_hndl->dev_ptr; + return UCS_OK; } @@ -53,40 +66,91 @@ static ucs_status_t uct_rocm_copy_rkey_unpack(uct_component_t *component, const void *rkey_buffer, uct_rkey_t *rkey_p, void **handle_p) { - *rkey_p = 0xdeadbeef; + uct_rocm_copy_key_t *packed = (uct_rocm_copy_key_t *)rkey_buffer; + uct_rocm_copy_key_t *key; + + key = 
ucs_malloc(sizeof(uct_rocm_copy_key_t), "uct_rocm_copy_key_t"); + if (NULL == key) { + ucs_error("failed to allocate memory for uct_rocm_copy_key_t"); + return UCS_ERR_NO_MEMORY; + } + + key->vaddr = packed->vaddr; + key->dev_ptr = packed->dev_ptr; + *handle_p = NULL; + *rkey_p = (uintptr_t)key; + return UCS_OK; } static ucs_status_t uct_rocm_copy_rkey_release(uct_component_t *component, uct_rkey_t rkey, void *handle) { + ucs_assert(NULL == handle); + ucs_free((void *)rkey); return UCS_OK; } -static ucs_status_t uct_rocm_copy_mem_reg(uct_md_h md, void *address, size_t length, - unsigned flags, uct_mem_h *memh_p) +static ucs_status_t uct_rocm_copy_mem_reg_internal( + uct_md_h uct_md, void *address, size_t length, + unsigned flags, uct_rocm_copy_mem_t *mem_hndl) { + void *dev_addr = NULL; hsa_status_t status; - void *lock_addr; if(address == NULL) { - *memh_p = address; + memset(mem_hndl, 0, sizeof(*mem_hndl)); return UCS_OK; } - status = hsa_amd_memory_lock(address, length, NULL, 0, &lock_addr); - if (status != HSA_STATUS_SUCCESS) { + status = hsa_amd_memory_lock(address, length, NULL, 0, &dev_addr); + if ((status != HSA_STATUS_SUCCESS) || (dev_addr == NULL)) { return UCS_ERR_IO_ERROR; } - *memh_p = address; + mem_hndl->vaddr = address; + mem_hndl->dev_ptr = dev_addr; + mem_hndl->reg_size = length; + + ucs_trace("Registered addr %p len %zu dev addr %p", address, length, dev_addr); + return UCS_OK; +} + +static ucs_status_t uct_rocm_copy_mem_reg(uct_md_h md, void *address, size_t length, + unsigned flags, uct_mem_h *memh_p) +{ + uct_rocm_copy_mem_t *mem_hndl = NULL; + void *start, *end; + size_t len, page_size; + ucs_status_t status; + + mem_hndl = ucs_malloc(sizeof(uct_rocm_copy_mem_t), "rocm_copy handle"); + if (NULL == mem_hndl) { + ucs_error("failed to allocate memory for rocm_copy_mem_t"); + return UCS_ERR_NO_MEMORY; + } + + page_size = ucs_get_page_size(); + start = ucs_align_down_pow2_ptr(address, page_size); + end = ucs_align_up_pow2_ptr(UCS_PTR_BYTE_OFFSET(address, length), page_size); + len = UCS_PTR_BYTE_DIFF(start, end); + ucs_assert_always(start <= end); + + status = uct_rocm_copy_mem_reg_internal(md, address, len, 0, mem_hndl); + if (status != UCS_OK) { + ucs_free(mem_hndl); + return status; + } + + *memh_p = mem_hndl; return UCS_OK; } static ucs_status_t uct_rocm_copy_mem_dereg(uct_md_h md, uct_mem_h memh) { - void *address = (void *)memh; + uct_rocm_copy_mem_t *mem_hndl = (uct_rocm_copy_mem_t *)memh; + void *address = mem_hndl->vaddr; hsa_status_t status; if (address == NULL) { @@ -98,29 +162,130 @@ static ucs_status_t uct_rocm_copy_mem_dereg(uct_md_h md, uct_mem_h memh) return UCS_ERR_IO_ERROR; } + ucs_trace("Deregistered addr %p len %zu", address, mem_hndl->reg_size); return UCS_OK; } static void uct_rocm_copy_md_close(uct_md_h uct_md) { uct_rocm_copy_md_t *md = ucs_derived_of(uct_md, uct_rocm_copy_md_t); + if (md->rcache != NULL) { + ucs_rcache_destroy(md->rcache); + } + ucs_free(md); } static uct_md_ops_t md_ops = { - .close = uct_rocm_copy_md_close, - .query = uct_rocm_copy_md_query, - .mkey_pack = uct_rocm_copy_mkey_pack, - .mem_reg = uct_rocm_copy_mem_reg, - .mem_dereg = uct_rocm_copy_mem_dereg, - .detect_memory_type = uct_rocm_base_detect_memory_type + .close = uct_rocm_copy_md_close, + .query = uct_rocm_copy_md_query, + .mkey_pack = uct_rocm_copy_mkey_pack, + .mem_reg = uct_rocm_copy_mem_reg, + .mem_dereg = uct_rocm_copy_mem_dereg, + .mem_query = uct_rocm_base_mem_query, + .detect_memory_type = uct_rocm_base_detect_memory_type, + .is_sockaddr_accessible = 
ucs_empty_function_return_zero_int,
+};
+
+static inline uct_rocm_copy_rcache_region_t*
+uct_rocm_copy_rcache_region_from_memh(uct_mem_h memh)
+{
+    return ucs_container_of(memh, uct_rocm_copy_rcache_region_t, memh);
+}
+
+static ucs_status_t
+uct_rocm_copy_mem_rcache_reg(uct_md_h uct_md, void *address, size_t length,
+                             unsigned flags, uct_mem_h *memh_p)
+{
+    uct_rocm_copy_md_t *md = ucs_derived_of(uct_md, uct_rocm_copy_md_t);
+    ucs_rcache_region_t *rregion;
+    ucs_status_t status;
+    uct_rocm_copy_mem_t *memh;
+
+    status = ucs_rcache_get(md->rcache, (void *)address, length, PROT_READ|PROT_WRITE,
+                            &flags, &rregion);
+    if (status != UCS_OK) {
+        return status;
+    }
+
+    ucs_assert(rregion->refcount > 0);
+    memh    = &ucs_derived_of(rregion, uct_rocm_copy_rcache_region_t)->memh;
+    *memh_p = memh;
+    return UCS_OK;
+}
+
+static ucs_status_t uct_rocm_copy_mem_rcache_dereg(uct_md_h uct_md, uct_mem_h memh)
+{
+    uct_rocm_copy_md_t *md = ucs_derived_of(uct_md, uct_rocm_copy_md_t);
+    uct_rocm_copy_rcache_region_t *region = uct_rocm_copy_rcache_region_from_memh(memh);
+
+    ucs_rcache_region_put(md->rcache, &region->super);
+    return UCS_OK;
+}
+
+static uct_md_ops_t md_rcache_ops = {
+    .close                  = uct_rocm_copy_md_close,
+    .query                  = uct_rocm_copy_md_query,
+    .mkey_pack              = uct_rocm_copy_mkey_pack,
+    .mem_reg                = uct_rocm_copy_mem_rcache_reg,
+    .mem_dereg              = uct_rocm_copy_mem_rcache_dereg,
+    .mem_query              = uct_rocm_base_mem_query,
+    .detect_memory_type     = uct_rocm_base_detect_memory_type,
+    .is_sockaddr_accessible = ucs_empty_function_return_zero_int,
+};
+
+static ucs_status_t
+uct_rocm_copy_rcache_mem_reg_cb(void *context, ucs_rcache_t *rcache,
+                                void *arg, ucs_rcache_region_t *rregion,
+                                uint16_t rcache_mem_reg_flags)
+{
+    uct_rocm_copy_md_t *md = context;
+    int *flags             = arg;
+    uct_rocm_copy_rcache_region_t *region;
+
+    region = ucs_derived_of(rregion, uct_rocm_copy_rcache_region_t);
+    return uct_rocm_copy_mem_reg_internal(&md->super, (void*)region->super.super.start,
+                                          region->super.super.end -
+                                          region->super.super.start,
+                                          *flags, &region->memh);
+}
+
+static void uct_rocm_copy_rcache_mem_dereg_cb(void *context, ucs_rcache_t *rcache,
+                                              ucs_rcache_region_t *rregion)
+{
+    uct_rocm_copy_md_t *md = context;
+    uct_rocm_copy_rcache_region_t *region;
+
+    region = ucs_derived_of(rregion, uct_rocm_copy_rcache_region_t);
+    (void)uct_rocm_copy_mem_dereg(&md->super, &region->memh);
+}
+
+static void uct_rocm_copy_rcache_dump_region_cb(void *context, ucs_rcache_t *rcache,
+                                                ucs_rcache_region_t *rregion, char *buf,
+                                                size_t max)
+{
+    uct_rocm_copy_rcache_region_t *region = ucs_derived_of(rregion,
+                                                           uct_rocm_copy_rcache_region_t);
+    uct_rocm_copy_mem_t *memh = &region->memh;
+
+    snprintf(buf, max, "dev ptr:%p", memh->dev_ptr);
+}
+
+static ucs_rcache_ops_t uct_rocm_copy_rcache_ops = {
+    .mem_reg     = uct_rocm_copy_rcache_mem_reg_cb,
+    .mem_dereg   = uct_rocm_copy_rcache_mem_dereg_cb,
+    .dump_region = uct_rocm_copy_rcache_dump_region_cb
 };
 
 static ucs_status_t
 uct_rocm_copy_md_open(uct_component_h component, const char *md_name,
-                      const uct_md_config_t *md_config, uct_md_h *md_p)
+                      const uct_md_config_t *config, uct_md_h *md_p)
 {
+    const uct_rocm_copy_md_config_t *md_config =
+        ucs_derived_of(config, uct_rocm_copy_md_config_t);
+    ucs_status_t status;
     uct_rocm_copy_md_t *md;
+    ucs_rcache_params_t rcache_params;
 
     md = ucs_malloc(sizeof(uct_rocm_copy_md_t), "uct_rocm_copy_md_t");
     if (NULL == md) {
@@ -130,9 +295,41 @@ uct_rocm_copy_md_open(uct_component_h component, const char *md_name,
     md->super.ops       = &md_ops;
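+    /* Editor's note (illustration, not in the original patch): enable_rcache
+     * is a ternary option - with the default "try", a failure of
+     * ucs_rcache_create() below silently falls back to direct registration;
+     * with "yes" the failure makes md_open fail; with "no" the cache is never
+     * created. */
     md->super.component = 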
&uct_rocm_copy_component; + md->rcache = NULL; + md->reg_cost = ucs_linear_func_make(0, 0); + + if (md_config->enable_rcache != UCS_NO) { + rcache_params.region_struct_size = sizeof(uct_rocm_copy_rcache_region_t); + rcache_params.alignment = ucs_get_page_size(); + rcache_params.max_alignment = ucs_get_page_size(); + rcache_params.ucm_events = UCM_EVENT_MEM_TYPE_FREE; + rcache_params.ucm_event_priority = md_config->rcache.event_prio; + rcache_params.context = md; + rcache_params.ops = &uct_rocm_copy_rcache_ops; + rcache_params.flags = 0; + status = ucs_rcache_create(&rcache_params, "rocm_copy", NULL, &md->rcache); + if (status == UCS_OK) { + md->super.ops = &md_rcache_ops; + md->reg_cost = ucs_linear_func_make(0, 0); + } else { + ucs_assert(md->rcache == NULL); + if (md_config->enable_rcache == UCS_YES) { + status = UCS_ERR_IO_ERROR; + goto err; + } else { + ucs_debug("could not create registration cache for: %s", + ucs_status_string(status)); + } + } + } *md_p = (uct_md_h) md; - return UCS_OK; + status = UCS_OK; +out: + return status; +err: + ucs_free(md); + goto out; } uct_component_t uct_rocm_copy_component = { diff --git a/src/uct/rocm/copy/rocm_copy_md.h b/src/uct/rocm/copy/rocm_copy_md.h index 642d20275c4..790e8cc3227 100644 --- a/src/uct/rocm/copy/rocm_copy_md.h +++ b/src/uct/rocm/copy/rocm_copy_md.h @@ -7,16 +7,59 @@ #define UCT_ROCM_COPY_MD_H #include +#include +#include extern uct_component_t uct_rocm_copy_component; +/* + * @brief rocm_copy MD descriptor + */ typedef struct uct_rocm_copy_md { - struct uct_md super; + uct_md_t super; /**< Domain info */ + ucs_rcache_t *rcache; /**< Registration cache (can be NULL) */ + ucs_linear_func_t reg_cost; /**< Memory registration cost */ } uct_rocm_copy_md_t; + +/** + * rocm copy domain configuration. + */ typedef struct uct_rocm_copy_md_config { - uct_md_config_t super; + uct_md_config_t super; + ucs_ternary_auto_value_t enable_rcache;/**< Enable registration cache */ + uct_md_rcache_config_t rcache; /**< Registration cache config */ + ucs_linear_func_t uc_reg_cost; /**< Memory registration cost estimation + without using the cache */ } uct_rocm_copy_md_config_t; + +/** + * @brief rocm copy mem handle + */ +typedef struct uct_rocm_copy_mem { + void *vaddr; + void *dev_ptr; + size_t reg_size; +} uct_rocm_copy_mem_t; + + +/** + * @brief rocm copy packed and remote key for get/put + */ +typedef struct uct_rocm_copy_key { + uint64_t vaddr; /**< CPU address being mapped */ + void *dev_ptr; /**< GPU accessible address */ +} uct_rocm_copy_key_t; + + +/** + * rocm memory region in the registration cache. 
+ */ +typedef struct uct_rocm_copy_rcache_region { + ucs_rcache_region_t super; + uct_rocm_copy_mem_t memh; /**< mr exposed to the user as the memh */ +} uct_rocm_copy_rcache_region_t; + #endif diff --git a/src/uct/rocm/gdr/rocm_gdr_iface.c b/src/uct/rocm/gdr/rocm_gdr_iface.c index 4fb20073625..58d05a29fea 100644 --- a/src/uct/rocm/gdr/rocm_gdr_iface.c +++ b/src/uct/rocm/gdr/rocm_gdr_iface.c @@ -121,8 +121,9 @@ static UCS_CLASS_INIT_FUNC(uct_rocm_gdr_iface_t, uct_md_h md, uct_worker_h worke const uct_iface_params_t *params, const uct_iface_config_t *tl_config) { - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_rocm_gdr_iface_ops, md, worker, - params, tl_config UCS_STATS_ARG(params->stats_root) + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_rocm_gdr_iface_ops, NULL, + md, worker, params, + tl_config UCS_STATS_ARG(params->stats_root) UCS_STATS_ARG(UCT_ROCM_GDR_TL_NAME)); self->id = ucs_generate_uuid((uintptr_t)self); diff --git a/src/uct/rocm/gdr/rocm_gdr_md.c b/src/uct/rocm/gdr/rocm_gdr_md.c index c50df47c2d7..a8308849d95 100644 --- a/src/uct/rocm/gdr/rocm_gdr_md.c +++ b/src/uct/rocm/gdr/rocm_gdr_md.c @@ -33,6 +33,7 @@ static ucs_status_t uct_rocm_gdr_md_query(uct_md_h md, uct_md_attr_t *md_attr) md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); + md_attr->cap.alloc_mem_types = 0; md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); md_attr->cap.detect_mem_types = 0; md_attr->cap.max_alloc = 0; @@ -116,6 +117,7 @@ static uct_md_ops_t md_ops = { .mkey_pack = uct_rocm_gdr_mkey_pack, .mem_reg = uct_rocm_gdr_mem_reg, .mem_dereg = uct_rocm_gdr_mem_dereg, + .mem_query = ucs_empty_function_return_unsupported, .detect_memory_type = ucs_empty_function_return_unsupported, }; diff --git a/src/uct/rocm/ipc/rocm_ipc_iface.c b/src/uct/rocm/ipc/rocm_ipc_iface.c index cdffd0642af..b58eb0bdfb0 100644 --- a/src/uct/rocm/ipc/rocm_ipc_iface.c +++ b/src/uct/rocm/ipc/rocm_ipc_iface.c @@ -27,6 +27,26 @@ static ucs_config_field_t uct_rocm_ipc_iface_config_table[] = { {NULL} }; +static double uct_rocm_ipc_iface_get_bw() +{ + double bw = 30.0 * UCS_GBYTE; + hsa_amd_link_info_type_t type; + + uct_rocm_base_get_link_type(&type); + switch (type) { + case HSA_AMD_LINK_INFO_TYPE_PCIE: + bw = 200.0 * UCS_GBYTE; + break; + case HSA_AMD_LINK_INFO_TYPE_XGMI: + bw = 400.0 * UCS_GBYTE; + break; + default: + bw = 100.0 * UCS_GBYTE; + break; + } + return bw; +} + static uint64_t uct_rocm_ipc_iface_node_guid(uct_base_iface_t *iface) { return ucs_machine_guid() * @@ -87,11 +107,11 @@ static ucs_status_t uct_rocm_ipc_iface_query(uct_iface_h tl_iface, UCT_IFACE_FLAG_PENDING | UCT_IFACE_FLAG_CONNECT_TO_IFACE; - /* TODO: get accurate info */ - iface_attr->latency = ucs_linear_func_make(80e-9, 0); - iface_attr->bandwidth.dedicated = 10.0 * UCS_GBYTE; /* 10 GB */ - iface_attr->bandwidth.shared = 0; - iface_attr->overhead = 0.4e-6; /* 0.4 us */ + iface_attr->latency = ucs_linear_func_make(1e-9, 0); + iface_attr->bandwidth.dedicated = 0; + iface_attr->bandwidth.shared = uct_rocm_ipc_iface_get_bw(); + iface_attr->overhead = 0; + iface_attr->priority = 0; return UCS_OK; } @@ -204,8 +224,9 @@ static UCS_CLASS_INIT_FUNC(uct_rocm_ipc_iface_t, uct_md_h md, uct_worker_h worke { ucs_status_t status; - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_rocm_ipc_iface_ops, md, worker, - params, tl_config UCS_STATS_ARG(params->stats_root) + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_rocm_ipc_iface_ops, NULL, + md, worker, params, + tl_config 
UCS_STATS_ARG(params->stats_root) UCS_STATS_ARG(UCT_ROCM_IPC_TL_NAME)); status = ucs_mpool_init(&self->signal_pool, diff --git a/src/uct/rocm/ipc/rocm_ipc_md.c b/src/uct/rocm/ipc/rocm_ipc_md.c index 24b50696a0e..eb9974982e6 100644 --- a/src/uct/rocm/ipc/rocm_ipc_md.c +++ b/src/uct/rocm/ipc/rocm_ipc_md.c @@ -27,6 +27,7 @@ static ucs_status_t uct_rocm_ipc_md_query(uct_md_h md, uct_md_attr_t *md_attr) md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); + md_attr->cap.alloc_mem_types = 0; md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); md_attr->cap.detect_mem_types = 0; md_attr->cap.max_alloc = 0; @@ -113,12 +114,14 @@ uct_rocm_ipc_md_open(uct_component_h component, const char *md_name, const uct_md_config_t *uct_md_config, uct_md_h *md_p) { static uct_md_ops_t md_ops = { - .close = (uct_md_close_func_t)ucs_empty_function, - .query = uct_rocm_ipc_md_query, - .mkey_pack = uct_rocm_ipc_mkey_pack, - .mem_reg = uct_rocm_ipc_mem_reg, - .mem_dereg = uct_rocm_ipc_mem_dereg, - .detect_memory_type = ucs_empty_function_return_unsupported, + .close = (uct_md_close_func_t)ucs_empty_function, + .query = uct_rocm_ipc_md_query, + .mkey_pack = uct_rocm_ipc_mkey_pack, + .mem_reg = uct_rocm_ipc_mem_reg, + .mem_dereg = uct_rocm_ipc_mem_dereg, + .mem_query = ucs_empty_function_return_unsupported, + .detect_memory_type = ucs_empty_function_return_unsupported, + .is_sockaddr_accessible = ucs_empty_function_return_zero_int, }; static uct_md_t md = { .ops = &md_ops, diff --git a/src/uct/sm/base/sm_ep.c b/src/uct/sm/base/sm_ep.c index 283624b1f48..9d49a678c33 100644 --- a/src/uct/sm/base/sm_ep.c +++ b/src/uct/sm/base/sm_ep.c @@ -228,25 +228,3 @@ ucs_status_t uct_sm_ep_atomic_cswap32(uct_ep_h tl_ep, uint32_t compare, UCT_TL_EP_STAT_ATOMIC(ucs_derived_of(tl_ep, uct_base_ep_t)); return UCS_OK; } - -ucs_status_t uct_sm_ep_check(const char *proc, ucs_time_t starttime, - unsigned flags, uct_completion_t *comp) -{ - ucs_time_t createtime; - ucs_status_t status; - - UCT_CHECK_PARAM(comp == NULL, "Unsupported completion on ep_check"); - UCT_CHECK_PARAM(flags == 0, "Unsupported flags: %u", flags); - - status = ucs_sys_get_file_time(proc, UCS_SYS_FILE_TIME_CTIME, &createtime); - if ((status != UCS_OK) || (starttime != createtime)) { - return UCS_ERR_ENDPOINT_TIMEOUT; - } - - return UCS_OK; -} - -int uct_sm_ep_get_process_proc_dir(char *buffer, size_t max_len, pid_t pid) -{ - return snprintf(buffer, max_len, "/proc/%d", (int)pid); -} diff --git a/src/uct/sm/base/sm_ep.h b/src/uct/sm/base/sm_ep.h index e8a696afcdf..78454fbbe78 100644 --- a/src/uct/sm/base/sm_ep.h +++ b/src/uct/sm/base/sm_ep.h @@ -41,9 +41,4 @@ ucs_status_t uct_sm_ep_atomic32_fetch(uct_ep_h ep, uct_atomic_op_t opcode, uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp); -ucs_status_t uct_sm_ep_check(const char *proc, ucs_time_t starttime, - unsigned flags, uct_completion_t *comp); - -int uct_sm_ep_get_process_proc_dir(char *buffer, size_t max_len, pid_t pid); - #endif diff --git a/src/uct/sm/base/sm_iface.c b/src/uct/sm/base/sm_iface.c index feb5a01af9c..3be00a622f1 100644 --- a/src/uct/sm/base/sm_iface.c +++ b/src/uct/sm/base/sm_iface.c @@ -18,19 +18,6 @@ #include -#define UCS_SM_IFACE_ADDR_FLAG_EXT UCS_BIT(63) - - -typedef struct { - uint64_t id; -} ucs_sm_iface_base_device_addr_t; - -typedef struct { - ucs_sm_iface_base_device_addr_t super; - ucs_sys_ns_t ipc_ns; -} ucs_sm_iface_ext_device_addr_t; - - ucs_config_field_t uct_sm_iface_config_table[] = { {"", "", NULL, 
ucs_offsetof(uct_sm_iface_config_t, super), @@ -48,38 +35,16 @@ uct_sm_base_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_ unsigned *num_tl_devices_p) { return uct_single_device_resource(md, UCT_SM_DEVICE_NAME, - UCT_DEVICE_TYPE_SHM, tl_devices_p, + UCT_DEVICE_TYPE_SHM, + UCS_SYS_DEVICE_ID_UNKNOWN, tl_devices_p, num_tl_devices_p); } - -/* read boot_id GUID or use machine_guid */ -static uint64_t uct_sm_iface_get_system_id() -{ - uint64_t high; - uint64_t low; - ucs_status_t status; - - status = ucs_sys_get_boot_id(&high, &low); - if (status == UCS_OK) { - return high ^ low; - } - - return ucs_machine_guid(); -} - -ucs_status_t UCS_F_NOOPTIMIZE /* GCC failed to compile it in release mode */ +ucs_status_t uct_sm_iface_get_device_address(uct_iface_t *tl_iface, uct_device_addr_t *addr) { - ucs_sm_iface_ext_device_addr_t *ext_addr = (void*)addr; - - ext_addr->super.id = uct_sm_iface_get_system_id() & ~UCS_SM_IFACE_ADDR_FLAG_EXT; - - if (!ucs_sys_ns_is_default(UCS_SYS_NS_TYPE_IPC)) { - ext_addr->super.id |= UCS_SM_IFACE_ADDR_FLAG_EXT; - ext_addr->ipc_ns = ucs_sys_get_ns(UCS_SYS_NS_TYPE_IPC); - } - + uct_iface_get_local_address((uct_iface_local_addr_ns_t*)addr, + UCS_SYS_NS_TYPE_IPC); return UCS_OK; } @@ -87,32 +52,8 @@ int uct_sm_iface_is_reachable(const uct_iface_h tl_iface, const uct_device_addr_t *dev_addr, const uct_iface_addr_t *iface_addr) { - ucs_sm_iface_ext_device_addr_t *ext_addr = (void*)dev_addr; - ucs_sm_iface_ext_device_addr_t my_addr = {}; - ucs_status_t status; - - status = uct_sm_iface_get_device_address(tl_iface, - (uct_device_addr_t*)&my_addr); - if (status != UCS_OK) { - ucs_error("failed to get device address"); - return 0; - } - - /* do not merge these evaluations into single 'if' due - * to clags compilation warning */ - /* check if both processes are on same host and - * both of them are in root (or non-root) pid namespace */ - if (ext_addr->super.id != my_addr.super.id) { - return 0; - } - - if (!(ext_addr->super.id & UCS_SM_IFACE_ADDR_FLAG_EXT)) { - return 1; /* both processes are in root namespace */ - } - - /* ok, we are in non-root PID namespace - return 1 if ID of - * namespaces are same */ - return ext_addr->ipc_ns == my_addr.ipc_ns; + return uct_iface_local_is_reachable((uct_iface_local_addr_ns_t*)dev_addr, + UCS_SYS_NS_TYPE_IPC); } ucs_status_t uct_sm_iface_fence(uct_iface_t *tl_iface, unsigned flags) @@ -132,11 +73,12 @@ ucs_status_t uct_sm_ep_fence(uct_ep_t *tl_ep, unsigned flags) size_t uct_sm_iface_get_device_addr_len() { return ucs_sys_ns_is_default(UCS_SYS_NS_TYPE_IPC) ? - sizeof(ucs_sm_iface_base_device_addr_t) : - sizeof(ucs_sm_iface_ext_device_addr_t); + sizeof(uct_iface_local_addr_base_t) : + sizeof(uct_iface_local_addr_ns_t); } -UCS_CLASS_INIT_FUNC(uct_sm_iface_t, uct_iface_ops_t *ops, uct_md_h md, +UCS_CLASS_INIT_FUNC(uct_sm_iface_t, uct_iface_ops_t *ops, + uct_iface_internal_ops_t *internal_ops, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *tl_config) { @@ -150,12 +92,12 @@ UCS_CLASS_INIT_FUNC(uct_sm_iface_t, uct_iface_ops_t *ops, uct_md_h md, return UCS_ERR_UNSUPPORTED; } - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, ops, md, worker, params, - tl_config - UCS_STATS_ARG((params->field_mask & - UCT_IFACE_PARAM_FIELD_STATS_ROOT) ? - params->stats_root : NULL) - UCS_STATS_ARG(params->mode.device.dev_name)); + UCS_CLASS_CALL_SUPER_INIT( + uct_base_iface_t, ops, internal_ops, md, worker, params, + tl_config UCS_STATS_ARG( + (params->field_mask & UCT_IFACE_PARAM_FIELD_STATS_ROOT) ? 
+ params->stats_root : + NULL) UCS_STATS_ARG(params->mode.device.dev_name)); self->config.bandwidth = sm_config->bandwidth; diff --git a/src/uct/sm/base/sm_iface.h b/src/uct/sm/base/sm_iface.h index d4c48879228..e6c76ff2e0f 100644 --- a/src/uct/sm/base/sm_iface.h +++ b/src/uct/sm/base/sm_iface.h @@ -48,7 +48,8 @@ size_t uct_sm_iface_get_device_addr_len(); ucs_status_t uct_sm_ep_fence(uct_ep_t *tl_ep, unsigned flags); -UCS_CLASS_DECLARE(uct_sm_iface_t, uct_iface_ops_t*, uct_md_h, uct_worker_h, - const uct_iface_params_t*, const uct_iface_config_t*); +UCS_CLASS_DECLARE(uct_sm_iface_t, uct_iface_ops_t*, uct_iface_internal_ops_t*, + uct_md_h, uct_worker_h, const uct_iface_params_t*, + const uct_iface_config_t*); #endif diff --git a/src/uct/sm/mm/base/mm_ep.c b/src/uct/sm/mm/base/mm_ep.c index c82b75d1053..058d9650b2d 100644 --- a/src/uct/sm/mm/base/mm_ep.c +++ b/src/uct/sm/mm/base/mm_ep.c @@ -12,6 +12,7 @@ #include "mm_ep.h" #include "uct/sm/base/sm_ep.h" +#include #include @@ -19,6 +20,7 @@ typedef enum { UCT_MM_SEND_AM_BCOPY, UCT_MM_SEND_AM_SHORT, + UCT_MM_SEND_AM_SHORT_IOV } uct_mm_send_op_t; @@ -84,7 +86,6 @@ uct_mm_ep_get_remote_seg(uct_mm_ep_t *ep, uct_mm_seg_id_t seg_id, size_t length, return uct_mm_ep_attach_remote_seg(ep, seg_id, length, address_p); } - /* send a signal to remote interface using Unix-domain socket */ static void uct_mm_ep_signal_remote(uct_mm_ep_t *ep) { @@ -92,15 +93,18 @@ static void uct_mm_ep_signal_remote(uct_mm_ep_t *ep) char dummy = 0; int ret; + ucs_trace("ep %p: signal remote", ep); + for (;;) { ret = sendto(iface->signal_fd, &dummy, sizeof(dummy), 0, - (const struct sockaddr*)&ep->signal.sockaddr, - ep->signal.addrlen); + (const struct sockaddr*)&ep->fifo_ctl->signal_sockaddr, + ep->fifo_ctl->signal_addrlen); if (ucs_unlikely(ret < 0)) { if (errno == EINTR) { /* Interrupted system call - retry */ continue; - } if ((errno == EAGAIN) || (errno == ECONNREFUSED)) { + } + if ((errno == EAGAIN) || (errno == ECONNREFUSED)) { /* If we failed to signal because buffer is full - ignore the error * since it means the remote side would get a signal anyway. * If the remote side is not there - ignore the error as well. @@ -113,8 +117,8 @@ static void uct_mm_ep_signal_remote(uct_mm_ep_t *ep) } } else { ucs_assert(ret == sizeof(dummy)); - ucs_trace("sent wakeup from socket %d to %p", iface->signal_fd, - (const struct sockaddr*)&ep->signal.sockaddr); + ucs_trace("sent wakeup from socket %d to %s", iface->signal_fd, + ep->fifo_ctl->signal_sockaddr.sun_path); return; } } @@ -158,10 +162,9 @@ static UCS_CLASS_INIT_FUNC(uct_mm_ep_t, const uct_ep_params_t *params) /* Initialize remote FIFO control structure */ uct_mm_iface_set_fifo_ptrs(fifo_ptr, &self->fifo_ctl, &self->fifo_elems); - self->cached_tail = self->fifo_ctl->tail; - self->signal.addrlen = self->fifo_ctl->signal_addrlen; - self->signal.sockaddr = self->fifo_ctl->signal_sockaddr; - self->keepalive = NULL; + self->cached_tail = self->fifo_ctl->tail; + self->keepalive = NULL; + ucs_arbiter_elem_init(&self->arb_elem); ucs_debug("created mm ep %p, connected to remote FIFO id 0x%"PRIx64, self, addr->fifo_seg_id); @@ -225,20 +228,20 @@ static inline void uct_mm_ep_update_cached_tail(uct_mm_ep_t *ep) /* A common mm active message sending function. * The first parameter indicates the origin of the call. 
- * is_short = 1 - perform AM short sending - * is_short = 0 - perform AM bcopy sending */ -static UCS_F_ALWAYS_INLINE ssize_t -uct_mm_ep_am_common_send(uct_mm_send_op_t send_op, uct_mm_ep_t *ep, - uct_mm_iface_t *iface, uint8_t am_id, size_t length, - uint64_t header, const void *payload, - uct_pack_callback_t pack_cb, void *arg) +static UCS_F_ALWAYS_INLINE ssize_t uct_mm_ep_am_common_send( + uct_mm_send_op_t send_op, uct_mm_ep_t *ep, uct_mm_iface_t *iface, + uint8_t am_id, size_t length, uint64_t header, const void *payload, + uct_pack_callback_t pack_cb, void *arg, const uct_iov_t *iov, + size_t iovcnt) { uct_mm_fifo_element_t *elem; ucs_status_t status; void *base_address; uint8_t elem_flags; uint64_t head; + ucs_iov_iter_t iov_iter; + void *desc_data; UCT_CHECK_AM_ID(am_id); @@ -255,7 +258,12 @@ uct_mm_ep_am_common_send(uct_mm_send_op_t send_op, uct_mm_ep_t *ep, /* update the local copy of the tail to its actual value on the remote peer */ uct_mm_ep_update_cached_tail(ep); if (!UCT_MM_EP_IS_ABLE_TO_SEND(head, ep->cached_tail, iface->config.fifo_size)) { - UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1); + ucs_arbiter_group_push_head_elem_always(&ep->arb_group, + &ep->arb_elem); + ucs_arbiter_group_schedule_nonempty(&iface->arbiter, + &ep->arb_group); + UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, + 1); return UCS_ERR_NO_RESOURCE; } } @@ -276,8 +284,9 @@ uct_mm_ep_am_common_send(uct_mm_send_op_t send_op, uct_mm_ep_t *ep, elem_flags = UCT_MM_FIFO_ELEM_FLAG_INLINE; elem->length = length + sizeof(header); - uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_SEND, am_id, - elem + 1, length + sizeof(header), "TX: AM_SHORT"); + uct_mm_iface_trace_am(iface, UCT_AM_TRACE_TYPE_SEND, elem_flags, am_id, + elem + 1, elem->length, + head & ~UCT_MM_IFACE_FIFO_HEAD_EVENT_ARMED); UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, sizeof(header) + length); break; case UCT_MM_SEND_AM_BCOPY: @@ -289,17 +298,27 @@ uct_mm_ep_am_common_send(uct_mm_send_op_t send_op, uct_mm_ep_t *ep, return status; } - length = pack_cb(UCS_PTR_BYTE_OFFSET(base_address, - elem->desc.offset), - arg); + desc_data = UCS_PTR_BYTE_OFFSET(base_address, elem->desc.offset); + length = pack_cb(desc_data, arg); elem_flags = 0; elem->length = length; - uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_SEND, am_id, - UCS_PTR_BYTE_OFFSET(base_address, elem->desc.offset), - length, "TX: AM_BCOPY"); + uct_mm_iface_trace_am(iface, UCT_AM_TRACE_TYPE_SEND, elem_flags, am_id, + desc_data, elem->length, + head & ~UCT_MM_IFACE_FIFO_HEAD_EVENT_ARMED); UCT_TL_EP_STAT_OP(&ep->super, AM, BCOPY, length); break; + case UCT_MM_SEND_AM_SHORT_IOV: + elem_flags = UCT_MM_FIFO_ELEM_FLAG_INLINE; + ucs_iov_iter_init(&iov_iter); + elem->length = uct_iov_to_buffer(iov, iovcnt, &iov_iter, elem + 1, + SIZE_MAX); + + uct_mm_iface_trace_am(iface, UCT_AM_TRACE_TYPE_SEND, elem_flags, am_id, + elem + 1, elem->length, + head & ~UCT_MM_IFACE_FIFO_HEAD_EVENT_ARMED); + UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, elem->length); + break; } elem->am_id = am_id; @@ -321,6 +340,7 @@ uct_mm_ep_am_common_send(uct_mm_send_op_t send_op, uct_mm_ep_t *ep, switch (send_op) { case UCT_MM_SEND_AM_SHORT: + case UCT_MM_SEND_AM_SHORT_IOV: return UCS_OK; case UCT_MM_SEND_AM_BCOPY: return length; @@ -341,7 +361,23 @@ ucs_status_t uct_mm_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t header, return (ucs_status_t)uct_mm_ep_am_common_send(UCT_MM_SEND_AM_SHORT, ep, iface, id, length, header, - payload, NULL, NULL); + payload, NULL, NULL, NULL, 0); +} + 
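+/* Usage sketch (editor's illustration, not part of this patch): a caller that
+ * would pass a 64-bit header plus payload to uct_ep_am_short() can instead
+ * describe both pieces with an iov array, assuming hdr and payload are valid:
+ *
+ *     uct_iov_t iov[2];
+ *     iov[0].buffer = &hdr;     iov[0].length = sizeof(hdr);
+ *     iov[0].stride = 0;        iov[0].count  = 1;
+ *     iov[1].buffer = payload;  iov[1].length = payload_len;
+ *     iov[1].stride = 0;        iov[1].count  = 1;
+ *     status = uct_ep_am_short_iov(ep, am_id, iov, 2);
+ *
+ * The total length must fit in a single FIFO element, as checked below. */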
+ucs_status_t uct_mm_ep_am_short_iov(uct_ep_h tl_ep, uint8_t id, + const uct_iov_t *iov, size_t iovcnt) +{ + uct_mm_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_mm_iface_t); + uct_mm_ep_t *ep = ucs_derived_of(tl_ep, uct_mm_ep_t); + + UCT_CHECK_LENGTH(uct_iov_total_length(iov, iovcnt), 0, + iface->config.fifo_elem_size - + sizeof(uct_mm_fifo_element_t), + "am_short_iov"); + + return (ucs_status_t)uct_mm_ep_am_common_send(UCT_MM_SEND_AM_SHORT_IOV, ep, + iface, id, 0, 0, NULL, NULL, + NULL, iov, iovcnt); } ssize_t uct_mm_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_cb, @@ -351,7 +387,7 @@ ssize_t uct_mm_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_ uct_mm_ep_t *ep = ucs_derived_of(tl_ep, uct_mm_ep_t); return uct_mm_ep_am_common_send(UCT_MM_SEND_AM_BCOPY, ep, iface, id, 0, 0, - NULL, pack_cb, arg); + NULL, pack_cb, arg, NULL, 0); } static inline int uct_mm_ep_has_tx_resources(uct_mm_ep_t *ep) @@ -388,9 +424,9 @@ ucs_arbiter_cb_result_t uct_mm_ep_process_pending(ucs_arbiter_t *arbiter, ucs_arbiter_elem_t *elem, void *arg) { - uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, priv); uct_mm_ep_t *ep = ucs_container_of(group, uct_mm_ep_t, arb_group); unsigned *count = (unsigned*)arg; + uct_pending_req_t *req; ucs_status_t status; /* update the local tail with its actual value from the remote peer @@ -401,6 +437,12 @@ ucs_arbiter_cb_result_t uct_mm_ep_process_pending(ucs_arbiter_t *arbiter, return UCS_ARBITER_CB_RESULT_RESCHED_GROUP; } + if (elem == &ep->arb_elem) { + return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; + } + + req = ucs_container_of(elem, uct_pending_req_t, priv); + ucs_trace_data("progressing pending request %p", req); status = req->func(req); ucs_trace_data("status returned from progress pending: %s", @@ -428,16 +470,21 @@ static ucs_arbiter_cb_result_t uct_mm_ep_arbiter_purge_cb(ucs_arbiter_t *arbiter { uct_mm_ep_t *ep = ucs_container_of(group, uct_mm_ep_t, arb_group); - uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, - priv); uct_purge_cb_args_t *cb_args = arg; uct_pending_purge_callback_t cb = cb_args->cb; + uct_pending_req_t *req; + + if (elem == &ep->arb_elem) { + return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; + } + req = ucs_container_of(elem, uct_pending_req_t, priv); if (cb != NULL) { cb(req, cb_args->arg); } else { ucs_warn("ep=%p canceling user pending request %p", ep, req); } + return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; } @@ -476,38 +523,8 @@ ucs_status_t uct_mm_ep_flush(uct_ep_h tl_ep, unsigned flags, ucs_status_t uct_mm_ep_check(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp) { - uct_mm_ep_t *ep = ucs_derived_of(tl_ep, uct_mm_ep_t); - uct_iface_h iface = ep->super.super.iface; - ucs_status_t status; - int proc_len; - - if (ep->keepalive == NULL) { - proc_len = uct_sm_ep_get_process_proc_dir(NULL, 0, - ep->fifo_ctl->owner.pid); - if (proc_len <= 0) { - return UCS_ERR_INVALID_PARAM; - } - - ep->keepalive = ucs_malloc(sizeof(*ep->keepalive) + proc_len + 1, - "mm_ep->keepalive"); - if (ep->keepalive == NULL) { - status = UCS_ERR_NO_MEMORY; - goto err_set_ep_failed; - } - - ep->keepalive->starttime = ep->fifo_ctl->owner.starttime; - uct_sm_ep_get_process_proc_dir(ep->keepalive->proc, proc_len + 1, - ep->fifo_ctl->owner.pid); - } - - status = uct_sm_ep_check(ep->keepalive->proc, ep->keepalive->starttime, - flags, comp); - if (status != UCS_OK) { - goto err_set_ep_failed; - } - return UCS_OK; + uct_mm_ep_t *ep = ucs_derived_of(tl_ep, uct_mm_ep_t); -err_set_ep_failed: - return 
uct_set_ep_failed(&UCS_CLASS_NAME(uct_mm_ep_t), &ep->super.super, - iface, status); + return uct_ep_keepalive_check(tl_ep, &ep->keepalive, ep->fifo_ctl->pid, + flags, comp); } diff --git a/src/uct/sm/mm/base/mm_ep.h b/src/uct/sm/mm/base/mm_ep.h index 32f923571a1..dcf5a8cfa48 100644 --- a/src/uct/sm/mm/base/mm_ep.h +++ b/src/uct/sm/mm/base/mm_ep.h @@ -18,41 +18,37 @@ KHASH_INIT(uct_mm_remote_seg, uintptr_t, uct_mm_remote_seg_t, 1, kh_int64_hash_func, kh_int64_hash_equal) -/* owner of segment process information. we have to cache this value - * because some transports terminate segment when process gone (xpmem) */ -typedef struct uct_mm_keepalive_info { - ucs_time_t starttime; /* Process starttime */ - char proc[]; /* Process owner proc dir */ -} uct_mm_keepalive_info_t; - /** * MM transport endpoint */ typedef struct uct_mm_ep { uct_base_ep_t super; - /* Remote peer */ - uct_mm_fifo_ctl_t *fifo_ctl; /* pointer to the destination's ctl struct in the receive fifo */ - void *fifo_elems; /* fifo elements (destination's receive fifo) */ + /* pointer to the destination's ctl struct in the receive fifo */ + uct_mm_fifo_ctl_t *fifo_ctl; + + /* fifo elements (destination's receive fifo) */ + void *fifo_elems; - uint64_t cached_tail; /* the sender's own copy of the remote FIFO's tail. - it is not always updated with the actual remote tail value */ + /* the sender's own copy of the remote FIFO's tail. + it is not always updated with the actual remote tail value */ + uint64_t cached_tail; /* mapped remote memory chunks to which remote descriptors belong to. * (after attaching to them) */ khash_t(uct_mm_remote_seg) remote_segs; - void *remote_iface_addr; /* remote md-specific address, can be NULL */ + /* remote md-specific address, can be NULL */ + void *remote_iface_addr; - ucs_arbiter_group_t arb_group; /* the group that holds this ep's pending operations */ + /* group that holds this ep's pending operations */ + ucs_arbiter_group_t arb_group; - /* Used for signaling remote side wakeup */ - struct { - struct sockaddr_un sockaddr; /* address of signaling socket */ - socklen_t addrlen; /* address length of signaling socket */ - } signal; + /* placeholder arbiter element to make sure that we would not be able to arm + the interface as long as one of the endpoints is unable to send */ + ucs_arbiter_elem_t arb_elem; - uct_mm_keepalive_info_t *keepalive; /* keepalive info */ + uct_keepalive_info_t *keepalive; /* keepalive info */ } uct_mm_ep_t; @@ -61,6 +57,10 @@ UCS_CLASS_DECLARE_DELETE_FUNC(uct_mm_ep_t, uct_ep_t); ucs_status_t uct_mm_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t header, const void *payload, unsigned length); + +ucs_status_t uct_mm_ep_am_short_iov(uct_ep_h tl_ep, uint8_t id, + const uct_iov_t *iov, size_t iovcnt); + ssize_t uct_mm_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_cb, void *arg, unsigned flags); diff --git a/src/uct/sm/mm/base/mm_iface.c b/src/uct/sm/mm/base/mm_iface.c index 7654dea4afc..c8f9636117f 100644 --- a/src/uct/sm/mm/base/mm_iface.c +++ b/src/uct/sm/mm/base/mm_iface.c @@ -1,6 +1,6 @@ /** * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED. - * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. * See file LICENSE for terms. 
 */
@@ -61,6 +61,9 @@ ucs_config_field_t uct_mm_iface_config_table[] = {
      "Maximal number of receive completions to pick during RX poll",
      ucs_offsetof(uct_mm_iface_config_t, fifo_max_poll), UCS_CONFIG_TYPE_ULUNITS},
 
+    {"ERROR_HANDLING", "n", "Expose error handling support capability",
+     ucs_offsetof(uct_mm_iface_config_t, error_handling), UCS_CONFIG_TYPE_BOOL},
+
     {NULL}
 };
 
@@ -120,6 +123,8 @@ static ucs_status_t uct_mm_iface_query(uct_iface_h tl_iface,
 {
     uct_mm_iface_t *iface = ucs_derived_of(tl_iface, uct_mm_iface_t);
     uct_mm_md_t *md       = ucs_derived_of(iface->super.super.md, uct_mm_md_t);
+    int attach_shm_file;
+    ucs_status_t status;
 
     uct_base_iface_query(&iface->super.super, iface_attr);
 
@@ -146,7 +151,7 @@ static ucs_status_t uct_mm_iface_query(uct_iface_h tl_iface,
     iface_attr->cap.am.max_zcopy       = 0;
     iface_attr->cap.am.opt_zcopy_align = UCS_SYS_CACHE_LINE_SIZE;
     iface_attr->cap.am.align_mtu       = iface_attr->cap.am.opt_zcopy_align;
-    iface_attr->cap.am.max_iov         = 1;
+    iface_attr->cap.am.max_iov         = SIZE_MAX;
 
     iface_attr->iface_addr_len         = sizeof(uct_mm_iface_addr_t) +
                                          md->iface_addr_len;
@@ -161,8 +166,21 @@ static ucs_status_t uct_mm_iface_query(uct_iface_h tl_iface,
                                          UCT_IFACE_FLAG_AM_BCOPY |
                                          UCT_IFACE_FLAG_PENDING |
                                          UCT_IFACE_FLAG_CB_SYNC |
-                                         UCT_IFACE_FLAG_EP_CHECK |
-                                         UCT_IFACE_FLAG_CONNECT_TO_IFACE;
+                                         UCT_IFACE_FLAG_CONNECT_TO_IFACE |
+                                         iface->config.extra_cap_flags;
+
+    status = uct_mm_md_mapper_ops(md)->query(&attach_shm_file);
+    ucs_assert_always(status == UCS_OK);
+    if (attach_shm_file) {
+        /*
+         * Only MM transports that attach to an SHM file can support error
+         * handling mechanisms such as EP check, which detects whether a peer
+         * went down; otherwise there is no safe way to test for process
+         * existence (touching the shared memory block of a dead peer raises
+         * a bus error). */
+        iface_attr->cap.flags |= UCT_IFACE_FLAG_EP_CHECK;
+    }
+
     iface_attr->cap.event_flags = UCT_IFACE_FLAG_EVENT_SEND_COMP |
                                   UCT_IFACE_FLAG_EVENT_RECV |
                                   UCT_IFACE_FLAG_EVENT_FD;
@@ -219,17 +237,17 @@ uct_mm_assign_desc_to_fifo_elem(uct_mm_iface_t *iface,
     return UCS_OK;
 }
 
-static UCS_F_ALWAYS_INLINE void
-uct_mm_iface_process_recv(uct_mm_iface_t *iface,
-                          uct_mm_fifo_element_t* elem)
+static UCS_F_ALWAYS_INLINE void uct_mm_iface_process_recv(uct_mm_iface_t *iface)
 {
+    uct_mm_fifo_element_t *elem = iface->read_index_elem;
     ucs_status_t status;
-    void *data;
+    void                  *data;
 
     if (ucs_likely(elem->flags & UCT_MM_FIFO_ELEM_FLAG_INLINE)) {
         /* read short (inline) messages from the FIFO elements */
-        uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_RECV,
-                           elem->am_id, elem + 1, elem->length, "RX: AM_SHORT");
+        uct_mm_iface_trace_am(iface, UCT_AM_TRACE_TYPE_RECV, elem->flags,
+                              elem->am_id, elem + 1, elem->length,
+                              iface->read_index);
         uct_mm_iface_invoke_am(iface, elem->am_id, elem + 1, elem->length, 0);
         return;
     }
@@ -243,9 +261,8 @@ uct_mm_iface_process_recv(uct_mm_iface_t *iface,
     /* read bcopy messages from the receive descriptors */
     data = elem->desc_data;
     VALGRIND_MAKE_MEM_DEFINED(data, elem->length);
-
-    uct_iface_trace_am(&iface->super.super, UCT_AM_TRACE_TYPE_RECV,
-                       elem->am_id, data, elem->length, "RX: AM_BCOPY");
+    uct_mm_iface_trace_am(iface, UCT_AM_TRACE_TYPE_RECV, elem->flags,
+                          elem->am_id, data, elem->length, iface->read_index);
 
     status = uct_mm_iface_invoke_am(iface, elem->am_id, data, elem->length,
                                     UCT_CB_PARAM_FLAG_DESC);
@@ -279,7 +296,7 @@ uct_mm_iface_poll_fifo(uct_mm_iface_t *iface)
     ucs_assert(iface->read_index <=
                (iface->recv_fifo_ctl->head & ~UCT_MM_IFACE_FIFO_HEAD_EVENT_ARMED));
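+        /* Editor's note (illustration, not in the original patch): one bit of
+         * recv_fifo_ctl->head is reserved as the EVENT_ARMED flag, set by a
+         * receiver that goes to sleep; it must be masked out, as in the
+         * assert above, before head can be compared with read_index as a
+         * plain element counter. */
-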
uct_mm_iface_process_recv(iface, iface->read_index_elem); + uct_mm_iface_process_recv(iface); /* raise the read_index */ iface->read_index++; @@ -353,43 +370,74 @@ static ucs_status_t uct_mm_iface_event_fd_get(uct_iface_h tl_iface, int *fd_p) return UCS_OK; } -static ucs_status_t uct_mm_iface_event_fd_arm(uct_iface_h tl_iface, - unsigned events) + +static ucs_status_t +uct_mm_iface_event_fd_arm(uct_iface_h tl_iface, unsigned events) { uct_mm_iface_t *iface = ucs_derived_of(tl_iface, uct_mm_iface_t); char dummy[UCT_MM_IFACE_MAX_SIG_EVENTS]; /* pop multiple signals at once */ uint64_t head, prev_head; int ret; - /* Make the next sender which writes to the FIFO signal the receiver */ - head = iface->recv_fifo_ctl->head; - prev_head = ucs_atomic_cswap64(ucs_unaligned_ptr(&iface->recv_fifo_ctl->head), - head, head | UCT_MM_IFACE_FIFO_HEAD_EVENT_ARMED); - if (prev_head != head) { - /* race with sender; need to retry */ + if ((events & UCT_EVENT_SEND_COMP) && + !ucs_arbiter_is_empty(&iface->arbiter)) { + /* if we have outstanding send operations, can't go to sleep */ return UCS_ERR_BUSY; } + if (!(events & UCT_EVENT_RECV)) { + /* Nothing to do anymore */ + return UCS_OK; + } + + /* Make the next sender which writes to the FIFO signal the receiver */ + head = iface->recv_fifo_ctl->head; if ((head & ~UCT_MM_IFACE_FIFO_HEAD_EVENT_ARMED) > iface->read_index) { - /* 'read_index' is being written but not ready yet */ + /* head element was not read yet */ + ucs_trace("iface %p: cannot arm, head %" PRIu64 " read_index %" PRIu64, + iface, head & ~UCT_MM_IFACE_FIFO_HEAD_EVENT_ARMED, + iface->read_index); return UCS_ERR_BUSY; } + if (!(head & UCT_MM_IFACE_FIFO_HEAD_EVENT_ARMED)) { + /* Try to mark the head index as armed in an atomic way; fail if any + sender managed to update the head at the same time */ + prev_head = ucs_atomic_cswap64( + ucs_unaligned_ptr(&iface->recv_fifo_ctl->head), head, + head | UCT_MM_IFACE_FIFO_HEAD_EVENT_ARMED); + if (prev_head != head) { + /* race with sender; need to retry */ + ucs_assert(!(prev_head & UCT_MM_IFACE_FIFO_HEAD_EVENT_ARMED)); + ucs_trace("iface %p: cannot arm, head %" PRIu64 + " prev_head %" PRIu64, + iface, head, prev_head); + return UCS_ERR_BUSY; + } + } + + /* check for pending events */ ret = recvfrom(iface->signal_fd, &dummy, sizeof(dummy), 0, NULL, 0); if (ret > 0) { + ucs_trace("iface %p: cannot arm, got a signal", iface); return UCS_ERR_BUSY; } else if (ret == -1) { if (errno == EAGAIN) { + ucs_trace("iface %p: armed head %" PRIu64 " read_index %" PRIu64, + iface, head & ~UCT_MM_IFACE_FIFO_HEAD_EVENT_ARMED, + iface->read_index); return UCS_OK; } else if (errno == EINTR) { return UCS_ERR_BUSY; } else { - ucs_error("failed to retrieve message from signal pipe: %m"); + ucs_error("iface %p: failed to retrieve message from socket: %m", + iface); return UCS_ERR_IO_ERROR; } } else { ucs_assert(ret == 0); - return UCS_OK; + ucs_trace("iface %p: remote socket closed", iface); + return UCS_ERR_CONNECTION_RESET; } } @@ -400,6 +448,7 @@ static uct_iface_ops_t uct_mm_iface_ops = { .ep_put_bcopy = uct_sm_ep_put_bcopy, .ep_get_bcopy = uct_sm_ep_get_bcopy, .ep_am_short = uct_mm_ep_am_short, + .ep_am_short_iov = uct_mm_ep_am_short_iov, .ep_am_bcopy = uct_mm_ep_am_bcopy, .ep_atomic_cswap64 = uct_sm_ep_atomic_cswap64, .ep_atomic64_post = uct_sm_ep_atomic64_post, @@ -561,12 +610,13 @@ static UCS_CLASS_INIT_FUNC(uct_mm_iface_t, uct_md_h md, uct_worker_h worker, uct_mm_iface_config_t *mm_config = ucs_derived_of(tl_config, uct_mm_iface_config_t); uct_mm_fifo_element_t* 
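The rewritten arm path above hinges on a single atomic compare-and-swap that sets the ARMED bit in the shared FIFO head: a sender that advances the head concurrently makes the CAS fail, so the receiver gets UCS_ERR_BUSY and retries instead of sleeping through a wakeup. A compact standalone model using C11 atomics (bit position and return codes are illustrative):

    #include <stdatomic.h>
    #include <stdint.h>

    #define FIFO_HEAD_EVENT_ARMED (1ull << 63) /* illustrative bit position */

    /* Returns 0 if armed, -1 if the caller must retry (unread elements or
     * a lost race). 'head' is shared with senders; 'read_index' is the
     * receiver's private cursor. */
    static int fifo_try_arm(_Atomic uint64_t *head, uint64_t read_index)
    {
        uint64_t cur = atomic_load_explicit(head, memory_order_acquire);

        if ((cur & ~FIFO_HEAD_EVENT_ARMED) > read_index) {
            return -1; /* data pending: poll it first, don't sleep */
        }

        if (cur & FIFO_HEAD_EVENT_ARMED) {
            return 0; /* already armed */
        }

        /* a sender that bumps 'head' in between makes this CAS fail */
        if (!atomic_compare_exchange_strong(head, &cur,
                                            cur | FIFO_HEAD_EVENT_ARMED)) {
            return -1; /* race with sender: retry */
        }
        return 0;
    }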
fifo_elem_p; + size_t alignment, align_offset; ucs_status_t status; unsigned i; - char proc[32]; - UCS_CLASS_CALL_SUPER_INIT(uct_sm_iface_t, &uct_mm_iface_ops, md, - worker, params, tl_config); + UCS_CLASS_CALL_SUPER_INIT(uct_sm_iface_t, &uct_mm_iface_ops, + &uct_base_iface_internal_ops, md, worker, params, + tl_config); if (ucs_derived_of(worker, uct_priv_worker_t)->thread_mode == UCS_THREAD_MODE_MULTI) { ucs_error("Shared memory transport does not support multi-threaded worker"); @@ -603,6 +653,10 @@ static UCS_CLASS_INIT_FUNC(uct_mm_iface_t, uct_md_h md, uct_worker_h worker, UCT_MM_IFACE_FIFO_MAX_POLL : /* trim by the maximum unsigned integer value */ ucs_min(mm_config->fifo_max_poll, UINT_MAX)); + + self->config.extra_cap_flags = (mm_config->error_handling == UCS_YES) ? + UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE : + 0ul; self->fifo_prev_wnd_cons = 0; self->fifo_poll_count = self->config.fifo_max_poll; /* cppcheck-suppress internalAstError */ @@ -628,21 +682,13 @@ static UCS_CLASS_INIT_FUNC(uct_mm_iface_t, uct_md_h md, uct_worker_h worker, uct_mm_iface_set_fifo_ptrs(self->recv_fifo_mem.address, &self->recv_fifo_ctl, &self->recv_fifo_elems); - self->recv_fifo_ctl->head = 0; - self->recv_fifo_ctl->tail = 0; - self->recv_fifo_ctl->owner.pid = getpid(); - self->read_index = 0; - self->read_index_elem = UCT_MM_IFACE_GET_FIFO_ELEM(self, - self->recv_fifo_elems, - self->read_index); - uct_sm_ep_get_process_proc_dir(proc, sizeof(proc), - self->recv_fifo_ctl->owner.pid); - status = ucs_sys_get_file_time(proc, UCS_SYS_FILE_TIME_CTIME, - &self->recv_fifo_ctl->owner.starttime); - if (status != UCS_OK) { - ucs_error("mm_iface failed to get process starttime"); - return status; - } + self->recv_fifo_ctl->head = 0; + self->recv_fifo_ctl->tail = 0; + self->recv_fifo_ctl->pid = getpid(); + self->read_index = 0; + self->read_index_elem = UCT_MM_IFACE_GET_FIFO_ELEM(self, + self->recv_fifo_elems, + self->read_index); /* create a unix file descriptor to receive event notifications */ status = uct_mm_iface_create_signal_fd(self); @@ -650,17 +696,22 @@ static UCS_CLASS_INIT_FUNC(uct_mm_iface_t, uct_md_h md, uct_worker_h worker, goto err_free_fifo; } + status = uct_iface_param_am_alignment(params, self->config.seg_size, + sizeof(uct_mm_recv_desc_t), + sizeof(uct_mm_recv_desc_t), + &alignment, &align_offset); + if (status != UCS_OK) { + goto err_close_signal_fd; + } + /* create a memory pool for receive descriptors */ - status = uct_iface_mpool_init(&self->super.super, - &self->recv_desc_mp, - sizeof(uct_mm_recv_desc_t) + self->rx_headroom + - self->config.seg_size, - sizeof(uct_mm_recv_desc_t), - UCS_SYS_CACHE_LINE_SIZE, - &mm_config->mp, + status = uct_iface_mpool_init(&self->super.super, &self->recv_desc_mp, + sizeof(uct_mm_recv_desc_t) + + self->rx_headroom + + self->config.seg_size, + align_offset, alignment, &mm_config->mp, mm_config->mp.bufs_grow, - uct_mm_iface_recv_desc_init, - "mm_recv_desc"); + uct_mm_iface_recv_desc_init, "mm_recv_desc"); if (status != UCS_OK) { ucs_error("failed to create a receive descriptor memory pool for the MM transport"); goto err_close_signal_fd; diff --git a/src/uct/sm/mm/base/mm_iface.h b/src/uct/sm/mm/base/mm_iface.h index 367a2b910d3..eaaf39ec756 100644 --- a/src/uct/sm/mm/base/mm_iface.h +++ b/src/uct/sm/mm/base/mm_iface.h @@ -22,8 +22,12 @@ enum { - UCT_MM_FIFO_ELEM_FLAG_OWNER = UCS_BIT(0), /* new/old info */ - UCT_MM_FIFO_ELEM_FLAG_INLINE = UCS_BIT(1), /* if inline or not */ + /* FIFO element polarity, changes every cycle to indicate the element is + written by the 
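The receive-descriptor pool above is now created with an (alignment, align_offset) pair computed once by uct_iface_param_am_alignment(), so the caller's alignment request applies to the AM payload rather than to the descriptor header. The arithmetic reduces to the standalone sketch below, a minimal model with illustrative names rather than the mpool internals:

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Allocate one element of 'size' bytes so that (element + payload_offset)
     * is 'alignment'-aligned: align the payload position first, then step
     * back by the offset to find the element start. */
    static void *alloc_with_align_offset(size_t size, size_t payload_offset,
                                         size_t alignment, void **raw_p)
    {
        uintptr_t addr, payload;
        void *raw = malloc(size + payload_offset + alignment);

        if (raw == NULL) {
            return NULL;
        }

        addr    = (uintptr_t)raw + payload_offset;
        payload = (addr + alignment - 1) & ~(uintptr_t)(alignment - 1);
        *raw_p  = raw; /* keep the raw pointer for free() */

        assert((payload % alignment) == 0);
        return (void*)(payload - payload_offset); /* element start */
    }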
sender */ + UCT_MM_FIFO_ELEM_FLAG_OWNER = UCS_BIT(0), + + /* Whether the element data is inline or in receive descriptor */ + UCT_MM_FIFO_ELEM_FLAG_INLINE = UCS_BIT(1), }; @@ -48,6 +52,19 @@ enum { uct_mm_md_mapper_call(md, _func, ## __VA_ARGS__); \ }) + +#define uct_mm_iface_trace_am(_iface, _type, _flags, _am_id, _data, _length, \ + _elem_sn) \ + uct_iface_trace_am(&(_iface)->super.super, _type, _am_id, _data, _length, \ + "%cX [%lu] %c%c", \ + ((_type) == UCT_AM_TRACE_TYPE_RECV) ? 'R' : \ + ((_type) == UCT_AM_TRACE_TYPE_SEND) ? 'T' : \ + '?', \ + (_elem_sn), \ + ((_flags) & UCT_MM_FIFO_ELEM_FLAG_OWNER) ? 'o' : '-', \ + ((_flags) & UCT_MM_FIFO_ELEM_FLAG_INLINE) ? 'i' : '-') + + /* AIMD (additive increase/multiplicative decrease) algorithm adopted for FIFO * polling mechanism to adjust FIFO polling window. * - FIFO window is increased if the number of completed RX operations during @@ -81,9 +98,10 @@ typedef struct uct_mm_iface_config { size_t fifo_max_poll; /* Maximal RX completions to pick * during RX poll */ double release_fifo_factor; /* Tail index update frequency */ - ucs_ternary_value_t hugetlb_mode; /* Enable using huge pages for + ucs_ternary_auto_value_t hugetlb_mode; /* Enable using huge pages for * shared memory buffers */ unsigned fifo_elem_size; /* Size of the FIFO element size */ + int error_handling; /* Exposing of error handling cap */ uct_iface_mpool_config_t mp; } uct_mm_iface_config_t; @@ -111,10 +129,7 @@ typedef struct uct_mm_fifo_ctl { /* 2nd cacheline */ volatile uint64_t tail; /* How much was consumed */ - struct { - pid_t pid; /* Process owner pid */ - ucs_time_t starttime; /* Process starttime */ - } owner; + pid_t pid; /* Process owner pid */ } UCS_S_PACKED UCS_V_ALIGNED(UCS_SYS_CACHE_LINE_SIZE) uct_mm_fifo_ctl_t; @@ -204,6 +219,7 @@ typedef struct uct_mm_iface { unsigned fifo_elem_size; unsigned seg_size; /* size of the receive descriptor (for payload)*/ unsigned fifo_max_poll; + uint64_t extra_cap_flags; } config; } uct_mm_iface_t; @@ -212,13 +228,14 @@ typedef struct uct_mm_iface { * Define a memory-mapper transport for MM. * * @param _name Component name token - * @param _md_ops Memory domain operations, of type uct_mm_md_ops_t. + * @param _md_ops Memory domain operations, of type uct_mm_md_ops_t * @param _rkey_unpack Remote key unpack function * @param _rkey_release Remote key release function - * @param _cfg_prefix Prefix for configuration variables. 
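The OWNER flag documented above implements a ring-buffer polarity scheme: the writer flips the bit each time it wraps the FIFO, and the reader accepts an element only when the bit matches the parity of its own position. A standalone model, assuming a power-of-two FIFO length and this parity convention (the exact convention in the code may differ):

    #include <stdbool.h>
    #include <stdint.h>

    #define ELEM_FLAG_OWNER 0x1u

    /* Element is new iff its owner bit matches the parity of the number of
     * full cycles the reader has completed over the ring. */
    static bool elem_is_ready(uint8_t elem_flags, uint64_t read_index,
                              unsigned fifo_shift /* log2(fifo length) */)
    {
        unsigned cycle_parity = (read_index >> fifo_shift) & 1;
        return (elem_flags & ELEM_FLAG_OWNER) == cycle_parity;
    }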
+ * @param _cfg_prefix Prefix for configuration variables
+ * @param _cfg_table Configuration table
 */
#define UCT_MM_TL_DEFINE(_name, _md_ops, _rkey_unpack, _rkey_release, \
- _cfg_prefix) \
+ _cfg_prefix, _cfg_table) \
 \
 UCT_MM_COMPONENT_DEFINE(uct_##_name##_component, _name, _md_ops, \
 _rkey_unpack, _rkey_release, _cfg_prefix) \
@@ -227,8 +244,8 @@ typedef struct uct_mm_iface {
 _name, \
 uct_sm_base_query_tl_devices, \
 uct_mm_iface_t, \
- "MM_", \
- uct_mm_iface_config_table, \
+ _cfg_prefix, \
+ _cfg_table, \
 uct_mm_iface_config_t);
diff --git a/src/uct/sm/mm/base/mm_md.c b/src/uct/sm/mm/base/mm_md.c
index 1b66c2c7d55..0d4b5d8e887 100644
--- a/src/uct/sm/mm/base/mm_md.c
+++ b/src/uct/sm/mm/base/mm_md.c
@@ -36,8 +36,9 @@ ucs_status_t uct_mm_query_md_resources(uct_component_t *component,
 unsigned *num_resources_p)
{
 ucs_status_t status;
+ int UCS_V_UNUSED attach_shm_file;
- status = uct_mm_mdc_mapper_ops(component)->query();
+ status = uct_mm_mdc_mapper_ops(component)->query(&attach_shm_file);
 switch (status) {
 case UCS_OK:
 return uct_md_query_single_md_resource(component, resources_p,
diff --git a/src/uct/sm/mm/base/mm_md.h b/src/uct/sm/mm/base/mm_md.h
index a74f57364a2..6b6a11e6b7b 100644
--- a/src/uct/sm/mm/base/mm_md.h
+++ b/src/uct/sm/mm/base/mm_md.h
@@ -44,8 +44,8 @@ typedef struct uct_mm_remote_seg {
 * MM memory domain configuration
 */
typedef struct uct_mm_md_config {
- uct_md_config_t super;
- ucs_ternary_value_t hugetlb_mode; /* Enable using huge pages */
+ uct_md_config_t super;
+ ucs_ternary_auto_value_t hugetlb_mode; /* Enable using huge pages */
} uct_mm_md_config_t;
@@ -60,8 +60,16 @@ typedef struct uct_mm_md {
} uct_mm_md_t;
-/* Check if available on current machine */
-typedef ucs_status_t (*uct_mm_mapper_query_func_t)();
+/* Check if available on current machine.
+ *
+ * @param [out] attach_shm_file_p Flag set to indicate whether the MM
+ * transport attaches to an SHM file or to a
+ * process region.
+ *
+ * @return UCS_OK if the MM transport is available on the machine, error code
+ * otherwise.
+ */
+typedef ucs_status_t (*uct_mm_mapper_query_func_t)(int *attach_shm_file_p);
/* Return the size of memory-domain specific iface address (e.g mmap path) */
@@ -107,13 +115,13 @@ typedef void
 * Memory mapper operations - used to implement MD and TL functionality
 */
typedef struct uct_mm_mapper_ops {
- uct_md_ops_t super;
- uct_mm_mapper_query_func_t query;
- uct_mm_mapper_iface_addr_length_func_t iface_addr_length;
- uct_mm_mapper_iface_addr_pack_func_t iface_addr_pack;
- uct_mm_mapper_mem_attach_func_t mem_attach;
- uct_mm_mapper_mem_detach_func_t mem_detach;
- uct_mm_mapper_is_reachable_func_t is_reachable;
+ uct_md_ops_t super;
+ uct_mm_mapper_query_func_t query;
+ uct_mm_mapper_iface_addr_length_func_t iface_addr_length;
+ uct_mm_mapper_iface_addr_pack_func_t iface_addr_pack;
+ uct_mm_mapper_mem_attach_func_t mem_attach;
+ uct_mm_mapper_mem_detach_func_t mem_detach;
+ uct_mm_mapper_is_reachable_func_t is_reachable;
} uct_mm_md_mapper_ops_t;
@@ -147,6 +155,8 @@ typedef struct uct_mm_component {
 * @param _var Variable for MM component.
 * @param _name String which is the component name.
 * @param _md_ops Mapper operations, of type uct_mm_mapper_ops_t.
+ * @param _rkey_unpack Remote key unpack function.
+ * @param _rkey_release Remote key release function.
 * @param _cfg_prefix Prefix for configuration environment vars.
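With the new _cfg_table argument, each mapper supplies its own interface configuration table; the posix, sysv and xpmem hunks below each add one that inherits the base MM options. A hypothetical mapper "foo" would follow the same recipe — a sketch reusing the macros above, with invented uct_foo_* names:

    /* Hypothetical mapper "foo": inherit the base MM options under the
     * FOO_ prefix and hand the table to UCT_MM_TL_DEFINE (sketch only;
     * the uct_foo_* symbols do not exist in the tree). */
    static ucs_config_field_t uct_foo_iface_config_table[] = {
        {"MM_", "", NULL, 0, UCS_CONFIG_TYPE_TABLE(uct_mm_iface_config_table)},

        {NULL}
    };

    UCT_MM_TL_DEFINE(foo, &uct_foo_md_ops, uct_foo_rkey_unpack,
                     uct_foo_rkey_release, "FOO_", uct_foo_iface_config_table)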
*/ #define UCT_MM_COMPONENT_DEFINE(_var, _name, _md_ops, _rkey_unpack, \ diff --git a/src/uct/sm/mm/posix/mm_posix.c b/src/uct/sm/mm/posix/mm_posix.c index cf00383961c..b03f2cb427f 100644 --- a/src/uct/sm/mm/posix/mm_posix.c +++ b/src/uct/sm/mm/posix/mm_posix.c @@ -77,11 +77,23 @@ static ucs_config_field_t uct_posix_md_config_table[] = { {NULL} }; +static ucs_config_field_t uct_posix_iface_config_table[] = { + {"MM_", "", NULL, 0, UCS_CONFIG_TYPE_TABLE(uct_mm_iface_config_table)}, + + {NULL} +}; + static int uct_posix_use_shm_open(const uct_posix_md_config_t *posix_config) { return !strcmp(posix_config->dir, UCT_POSIX_SHM_OPEN_DIR); } +static ucs_status_t uct_posix_query(int *attach_shm_file_p) +{ + *attach_shm_file_p = 1; + return UCS_OK; +} + static size_t uct_posix_iface_addr_length(uct_mm_md_t *md) { const uct_posix_md_config_t *posix_config = @@ -667,13 +679,13 @@ static uct_mm_md_mapper_ops_t uct_posix_md_ops = { .is_sockaddr_accessible = ucs_empty_function_return_zero_int, .detect_memory_type = ucs_empty_function_return_unsupported }, - .query = ucs_empty_function_return_success, - .iface_addr_length = uct_posix_iface_addr_length, - .iface_addr_pack = uct_posix_iface_addr_pack, - .mem_attach = uct_posix_mem_attach, - .mem_detach = uct_posix_mem_detach, - .is_reachable = uct_posix_is_reachable + .query = uct_posix_query, + .iface_addr_length = uct_posix_iface_addr_length, + .iface_addr_pack = uct_posix_iface_addr_pack, + .mem_attach = uct_posix_mem_attach, + .mem_detach = uct_posix_mem_detach, + .is_reachable = uct_posix_is_reachable }; UCT_MM_TL_DEFINE(posix, &uct_posix_md_ops, uct_posix_rkey_unpack, - uct_posix_rkey_release, "POSIX_") + uct_posix_rkey_release, "POSIX_", uct_posix_iface_config_table) diff --git a/src/uct/sm/mm/sysv/mm_sysv.c b/src/uct/sm/mm/sysv/mm_sysv.c index 1938f896e11..97e711b4749 100644 --- a/src/uct/sm/mm/sysv/mm_sysv.c +++ b/src/uct/sm/mm/sysv/mm_sysv.c @@ -34,6 +34,12 @@ static ucs_config_field_t uct_sysv_md_config_table[] = { {NULL} }; +static ucs_config_field_t uct_sysv_iface_config_table[] = { + {"MM_", "", NULL, 0, UCS_CONFIG_TYPE_TABLE(uct_mm_iface_config_table)}, + + {NULL} +}; + static ucs_status_t uct_sysv_md_query(uct_md_h md, uct_md_attr_t *md_attr) { uct_mm_md_query(md, md_attr, 1); @@ -139,6 +145,12 @@ uct_sysv_md_mkey_pack(uct_md_h md, uct_mem_h memh, void *rkey_buffer) return UCS_OK; } +static ucs_status_t uct_sysv_query(int *attach_shm_file_p) +{ + *attach_shm_file_p = 1; + return UCS_OK; +} + static ucs_status_t uct_sysv_mem_attach(uct_mm_md_t *md, uct_mm_seg_id_t seg_id, size_t length, const void *iface_addr, uct_mm_remote_seg_t *rseg) @@ -176,7 +188,7 @@ uct_sysv_rkey_release(uct_component_t *component, uct_rkey_t rkey, void *handle) } static uct_mm_md_mapper_ops_t uct_sysv_md_ops = { - .super = { + .super = { .close = uct_mm_md_close, .query = uct_sysv_md_query, .mem_alloc = uct_sysv_mem_alloc, @@ -188,13 +200,13 @@ static uct_mm_md_mapper_ops_t uct_sysv_md_ops = { .is_sockaddr_accessible = ucs_empty_function_return_zero_int, .detect_memory_type = ucs_empty_function_return_unsupported }, - .query = ucs_empty_function_return_success, - .iface_addr_length = ucs_empty_function_return_zero_size_t, - .iface_addr_pack = ucs_empty_function_return_success, - .mem_attach = uct_sysv_mem_attach, - .mem_detach = uct_sysv_mem_detach, - .is_reachable = ucs_empty_function_return_one_int + .query = uct_sysv_query, + .iface_addr_length = ucs_empty_function_return_zero_size_t, + .iface_addr_pack = ucs_empty_function_return_success, + .mem_attach = 
uct_sysv_mem_attach, + .mem_detach = uct_sysv_mem_detach, + .is_reachable = ucs_empty_function_return_one_int }; UCT_MM_TL_DEFINE(sysv, &uct_sysv_md_ops, uct_sysv_rkey_unpack, - uct_sysv_rkey_release, "SYSV_") + uct_sysv_rkey_release, "SYSV_", uct_sysv_iface_config_table) diff --git a/src/uct/sm/mm/xpmem/mm_xpmem.c b/src/uct/sm/mm/xpmem/mm_xpmem.c index 2761ef40984..dc2d429863e 100644 --- a/src/uct/sm/mm/xpmem/mm_xpmem.c +++ b/src/uct/sm/mm/xpmem/mm_xpmem.c @@ -70,6 +70,12 @@ static ucs_config_field_t uct_xpmem_md_config_table[] = { {NULL} }; +static ucs_config_field_t uct_xpmem_iface_config_table[] = { + {"MM_", "", NULL, 0, UCS_CONFIG_TYPE_TABLE(uct_mm_iface_config_table)}, + + {NULL} +}; + UCS_STATIC_INIT { ucs_recursive_spinlock_init(&uct_xpmem_remote_mem_lock, 0); kh_init_inplace(xpmem_remote_mem, &uct_xpmem_remote_mem_hash); @@ -88,7 +94,7 @@ UCS_STATIC_CLEANUP { ucs_recursive_spinlock_destroy(&uct_xpmem_remote_mem_lock); } -static ucs_status_t uct_xpmem_query() +static ucs_status_t uct_xpmem_query(int *attach_shm_file_p) { int version; @@ -100,6 +106,9 @@ static ucs_status_t uct_xpmem_query() } ucs_debug("xpmem version: %d", version); + + *attach_shm_file_p = 0; + return UCS_OK; } @@ -110,7 +119,7 @@ static ucs_status_t uct_xpmem_md_query(uct_md_h md, uct_md_attr_t *md_attr) md_attr->cap.flags |= UCT_MD_FLAG_REG; md_attr->reg_cost = ucs_linear_func_make(60.0e-9, 0); md_attr->cap.max_reg = ULONG_MAX; - md_attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); md_attr->rkey_packed_size = sizeof(uct_xpmem_packed_rkey_t); return UCS_OK; @@ -262,6 +271,8 @@ uct_xpmem_rmem_add(xpmem_segid_t xsegid, uct_xpmem_remote_mem_t **rmem_p) rcache_params.ops = &uct_xpmem_rcache_ops; rcache_params.context = rmem; rcache_params.flags = UCS_RCACHE_FLAG_NO_PFN_CHECK; + rcache_params.max_regions = ULONG_MAX; + rcache_params.max_size = SIZE_MAX; status = ucs_rcache_create(&rcache_params, "xpmem_remote_mem", ucs_stats_get_root(), &rmem->rcache); @@ -538,13 +549,13 @@ static uct_mm_md_mapper_ops_t uct_xpmem_md_ops = { .is_sockaddr_accessible = ucs_empty_function_return_zero_int, .detect_memory_type = ucs_empty_function_return_unsupported }, - .query = uct_xpmem_query, - .iface_addr_length = uct_xpmem_iface_addr_length, - .iface_addr_pack = uct_xpmem_iface_addr_pack, - .mem_attach = uct_xpmem_mem_attach, - .mem_detach = uct_xpmem_mem_detach, - .is_reachable = ucs_empty_function_return_one_int + .query = uct_xpmem_query, + .iface_addr_length = uct_xpmem_iface_addr_length, + .iface_addr_pack = uct_xpmem_iface_addr_pack, + .mem_attach = uct_xpmem_mem_attach, + .mem_detach = uct_xpmem_mem_detach, + .is_reachable = ucs_empty_function_return_one_int }; UCT_MM_TL_DEFINE(xpmem, &uct_xpmem_md_ops, uct_xpmem_rkey_unpack, - uct_xpmem_rkey_release, "XPMEM_") + uct_xpmem_rkey_release, "XPMEM_", uct_xpmem_iface_config_table) diff --git a/src/uct/sm/scopy/base/scopy_iface.c b/src/uct/sm/scopy/base/scopy_iface.c index 51fe7a62ad4..96e6bb710d3 100644 --- a/src/uct/sm/scopy/base/scopy_iface.c +++ b/src/uct/sm/scopy/base/scopy_iface.c @@ -83,7 +83,8 @@ void uct_scopy_iface_query(uct_scopy_iface_t *iface, uct_iface_attr_t *iface_att iface_attr->latency = ucs_linear_func_make(80e-9, 0); /* 80 ns */ } -UCS_CLASS_INIT_FUNC(uct_scopy_iface_t, uct_scopy_iface_ops_t *ops, uct_md_h md, +UCS_CLASS_INIT_FUNC(uct_scopy_iface_t, uct_iface_ops_t *ops, + uct_scopy_iface_ops_t *scopy_ops, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const 
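The xpmem hunk above (and the cma/knem MD hunks that follow) narrows reg_mem_types from the UCS_MEMORY_TYPES_CPU_ACCESSIBLE mask to the single UCS_BIT(UCS_MEMORY_TYPE_HOST) bit. These capability fields are ordinary bitmasks indexed by memory type; the standalone model below shows the effect, with illustrative enum values and the assumption that the broad mask also covers CPU-accessible device types such as managed memory:

    #include <stdio.h>

    #define BIT(i) (1ull << (i))

    enum { MEM_TYPE_HOST, MEM_TYPE_CUDA, MEM_TYPE_CUDA_MANAGED, MEM_TYPE_LAST };

    int main(void)
    {
        /* "any CPU-accessible type" vs "host memory only" */
        unsigned long long cpu_accessible = BIT(MEM_TYPE_HOST) |
                                            BIT(MEM_TYPE_CUDA_MANAGED);
        unsigned long long host_only      = BIT(MEM_TYPE_HOST);

        printf("managed regs allowed: broad=%d narrow=%d\n",
               !!(cpu_accessible & BIT(MEM_TYPE_CUDA_MANAGED)),
               !!(host_only & BIT(MEM_TYPE_CUDA_MANAGED)));
        return 0;
    }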
uct_iface_config_t *tl_config) { @@ -92,9 +93,10 @@ UCS_CLASS_INIT_FUNC(uct_scopy_iface_t, uct_scopy_iface_ops_t *ops, uct_md_h md, size_t elem_size; ucs_status_t status; - UCS_CLASS_CALL_SUPER_INIT(uct_sm_iface_t, &ops->super, md, worker, params, tl_config); + UCS_CLASS_CALL_SUPER_INIT(uct_sm_iface_t, ops, &scopy_ops->super, md, + worker, params, tl_config); - self->tx = ops->ep_tx; + self->tx = scopy_ops->ep_tx; self->config.max_iov = ucs_min(config->max_iov, ucs_iov_get_max()); self->config.seg_size = config->seg_size; self->config.tx_quota = config->tx_quota; diff --git a/src/uct/sm/scopy/base/scopy_iface.h b/src/uct/sm/scopy/base/scopy_iface.h index f0a7893a49f..5de28b571b0 100644 --- a/src/uct/sm/scopy/base/scopy_iface.h +++ b/src/uct/sm/scopy/base/scopy_iface.h @@ -54,15 +54,16 @@ typedef struct uct_scopy_iface { typedef struct uct_scopy_iface_ops { - uct_iface_ops_t super; - uct_scopy_ep_tx_func_t ep_tx; + uct_iface_internal_ops_t super; + uct_scopy_ep_tx_func_t ep_tx; } uct_scopy_iface_ops_t; void uct_scopy_iface_query(uct_scopy_iface_t *iface, uct_iface_attr_t *iface_attr); -UCS_CLASS_DECLARE(uct_scopy_iface_t, uct_scopy_iface_ops_t*, uct_md_h, uct_worker_h, - const uct_iface_params_t*, const uct_iface_config_t*); +UCS_CLASS_DECLARE(uct_scopy_iface_t, uct_iface_ops_t*, uct_scopy_iface_ops_t*, + uct_md_h, uct_worker_h, const uct_iface_params_t*, + const uct_iface_config_t*); unsigned uct_scopy_iface_progress(uct_iface_h tl_iface); diff --git a/src/uct/sm/scopy/cma/cma_ep.c b/src/uct/sm/scopy/cma/cma_ep.c index cd2a5364c4c..1b1d96a2f8e 100644 --- a/src/uct/sm/scopy/cma/cma_ep.c +++ b/src/uct/sm/scopy/cma/cma_ep.c @@ -37,18 +37,19 @@ const struct { static UCS_CLASS_INIT_FUNC(uct_cma_ep_t, const uct_ep_params_t *params) { - UCT_CHECK_PARAM(params->field_mask & UCT_EP_PARAM_FIELD_IFACE_ADDR, - "UCT_EP_PARAM_FIELD_IFACE_ADDR and UCT_EP_PARAM_FIELD_DEV_ADDR are not defined"); - + UCT_EP_PARAMS_CHECK_DEV_IFACE_ADDRS(params); UCS_CLASS_CALL_SUPER_INIT(uct_scopy_ep_t, params); + self->remote_pid = *(const pid_t*)params->iface_addr & ~UCT_CMA_IFACE_ADDR_FLAG_PID_NS; + self->keepalive = NULL; + return UCS_OK; } static UCS_CLASS_CLEANUP_FUNC(uct_cma_ep_t) { - /* No op */ + ucs_free(self->keepalive); } UCS_CLASS_DEFINE(uct_cma_ep_t, uct_scopy_ep_t) @@ -61,11 +62,15 @@ ucs_status_t uct_cma_ep_tx(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iov_cnt, uct_scopy_tx_op_t tx_op) { uct_cma_ep_t *ep = ucs_derived_of(tl_ep, uct_cma_ep_t); + uct_base_iface_t *iface = ucs_derived_of(tl_ep->iface, + uct_base_iface_t); size_t local_iov_idx = 0; size_t UCS_V_UNUSED remote_iov_idx = 0; size_t local_iov_cnt = UCT_SM_MAX_IOV; size_t total_iov_length; struct iovec local_iov[UCT_SM_MAX_IOV], remote_iov; + ucs_log_level_t log_lvl; + ucs_status_t status; ssize_t ret; ucs_assert(*length_p != 0); @@ -82,9 +87,15 @@ ucs_status_t uct_cma_ep_tx(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iov_cnt, local_iov_cnt - local_iov_idx, &remote_iov, 1, 0); if (ucs_unlikely(ret < 0)) { - ucs_error("%s(pid=%d length=%zu) returned %zd: %m", - uct_cma_ep_fn[tx_op].name, ep->remote_pid, - remote_iov.iov_len, ret); + status = uct_iface_handle_ep_err(&iface->super, &ep->super.super.super, + UCS_ERR_CONNECTION_RESET); + log_lvl = uct_base_iface_failure_log_level(iface, status, + UCS_ERR_CONNECTION_RESET); + + ucs_log(log_lvl, "%s(pid=%d length=%zu) returned %zd: %m", + uct_cma_ep_fn[tx_op].name, ep->remote_pid, remote_iov.iov_len, + ret); + return UCS_ERR_IO_ERROR; } @@ -93,3 +104,12 @@ ucs_status_t uct_cma_ep_tx(uct_ep_h tl_ep, const 
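In the rewritten uct_cma_ep_tx error path above, a failed transfer is first handed to the interface's error handler as UCS_ERR_CONNECTION_RESET, and the log level is derived from the resulting status, so an expected peer exit no longer spams the error log. CMA moves data with process_vm_readv()/process_vm_writev(); a standalone sketch of classifying a failure, where the errno-to-verdict mapping is an illustrative assumption:

    #define _GNU_SOURCE
    #include <errno.h>
    #include <sys/types.h>
    #include <sys/uio.h>

    typedef enum { TX_OK, TX_PEER_DOWN, TX_IO_ERROR } tx_status_t;

    static tx_status_t cma_read_once(pid_t pid, void *dst, void *remote_src,
                                     size_t len)
    {
        struct iovec local  = { .iov_base = dst,        .iov_len = len };
        struct iovec remote = { .iov_base = remote_src, .iov_len = len };
        ssize_t ret = process_vm_readv(pid, &local, 1, &remote, 1, 0);

        if (ret >= 0) {
            /* a short read (ret < len) would need a retry loop in real code */
            return TX_OK;
        }
        /* ESRCH: the peer exited; report it as a connection reset so the
         * upper layer can run its error-handling callback instead of
         * treating it as a hard I/O failure */
        return (errno == ESRCH) ? TX_PEER_DOWN : TX_IO_ERROR;
    }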
uct_iov_t *iov, size_t iov_cnt, *length_p = ret; return UCS_OK; } + +ucs_status_t uct_cma_ep_check(const uct_ep_h tl_ep, unsigned flags, + uct_completion_t *comp) +{ + uct_cma_ep_t *ep = ucs_derived_of(tl_ep, uct_cma_ep_t); + + return uct_ep_keepalive_check(tl_ep, &ep->keepalive, ep->remote_pid, flags, + comp); +} diff --git a/src/uct/sm/scopy/cma/cma_ep.h b/src/uct/sm/scopy/cma/cma_ep.h index 82030f6a9cd..0354f73c783 100644 --- a/src/uct/sm/scopy/cma/cma_ep.h +++ b/src/uct/sm/scopy/cma/cma_ep.h @@ -13,8 +13,9 @@ typedef struct uct_cma_ep { - uct_scopy_ep_t super; - pid_t remote_pid; + uct_scopy_ep_t super; + pid_t remote_pid; + uct_keepalive_info_t *keepalive; } uct_cma_ep_t; @@ -26,4 +27,7 @@ ucs_status_t uct_cma_ep_tx(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iov_cnt, uint64_t remote_addr, uct_rkey_t rkey, uct_scopy_tx_op_t tx_op); +ucs_status_t uct_cma_ep_check(const uct_ep_h tl_ep, unsigned flags, + uct_completion_t *comp); + #endif diff --git a/src/uct/sm/scopy/cma/cma_iface.c b/src/uct/sm/scopy/cma/cma_iface.c index 0d39c793173..44d46ef2f2f 100644 --- a/src/uct/sm/scopy/cma/cma_iface.c +++ b/src/uct/sm/scopy/cma/cma_iface.c @@ -62,6 +62,8 @@ static ucs_status_t uct_cma_iface_query(uct_iface_h tl_iface, iface_attr->bandwidth.dedicated = iface->super.super.config.bandwidth; iface_attr->bandwidth.shared = 0; iface_attr->overhead = 0.4e-6; /* 0.4 us */ + iface_attr->cap.flags |= UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE | + UCT_IFACE_FLAG_EP_CHECK; return UCS_OK; } @@ -86,38 +88,45 @@ uct_cma_iface_is_reachable(const uct_iface_h tl_iface, static UCS_CLASS_DECLARE_DELETE_FUNC(uct_cma_iface_t, uct_iface_t); +static uct_iface_ops_t uct_cma_iface_tl_ops = { + .ep_put_zcopy = uct_scopy_ep_put_zcopy, + .ep_get_zcopy = uct_scopy_ep_get_zcopy, + .ep_pending_add = ucs_empty_function_return_busy, + .ep_pending_purge = ucs_empty_function, + .ep_flush = uct_scopy_ep_flush, + .ep_fence = uct_sm_ep_fence, + .ep_check = uct_cma_ep_check, + .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_cma_ep_t), + .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_cma_ep_t), + .iface_flush = uct_scopy_iface_flush, + .iface_fence = uct_sm_iface_fence, + .iface_progress_enable = ucs_empty_function, + .iface_progress_disable = ucs_empty_function, + .iface_progress = uct_scopy_iface_progress, + .iface_event_fd_get = ucs_empty_function_return_unsupported, + .iface_event_arm = uct_scopy_iface_event_arm, + .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_cma_iface_t), + .iface_query = uct_cma_iface_query, + .iface_get_address = uct_cma_iface_get_address, + .iface_get_device_address = uct_sm_iface_get_device_address, + .iface_is_reachable = uct_cma_iface_is_reachable, +}; + static uct_scopy_iface_ops_t uct_cma_iface_ops = { .super = { - .ep_put_zcopy = uct_scopy_ep_put_zcopy, - .ep_get_zcopy = uct_scopy_ep_get_zcopy, - .ep_pending_add = ucs_empty_function_return_busy, - .ep_pending_purge = ucs_empty_function, - .ep_flush = uct_scopy_ep_flush, - .ep_fence = uct_sm_ep_fence, - .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_cma_ep_t), - .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_cma_ep_t), - .iface_flush = uct_scopy_iface_flush, - .iface_fence = uct_sm_iface_fence, - .iface_progress_enable = ucs_empty_function, - .iface_progress_disable = ucs_empty_function, - .iface_progress = uct_scopy_iface_progress, - .iface_event_fd_get = ucs_empty_function_return_unsupported, - .iface_event_arm = uct_scopy_iface_event_arm, - .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_cma_iface_t), - .iface_query = uct_cma_iface_query, - .iface_get_address = 
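With .ep_check wired into the CMA ops table above, the transport now serves the generic UCT liveness probe. From the caller's side the entry point is uct_ep_check(ep, flags, comp); a hedged usage sketch, assuming the uct_completion_t field layout of recent UCX releases:

    /* Sketch: probe a peer and learn the result via the completion
     * callback. Assumes an initialized 'ep'; the completion wiring below
     * is the generic uct_completion_t pattern, not transport-specific. */
    static void peer_check_done(uct_completion_t *comp)
    {
        /* comp->status is UCS_OK if the peer is alive, or an error such
         * as a connection-reset status if it is gone */
    }

    static ucs_status_t probe_peer(uct_ep_h ep)
    {
        static uct_completion_t comp = {
            .func   = peer_check_done,
            .count  = 1,
            .status = UCS_OK
        };

        return uct_ep_check(ep, 0, &comp);
    }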
uct_cma_iface_get_address, - .iface_get_device_address = uct_sm_iface_get_device_address, - .iface_is_reachable = uct_cma_iface_is_reachable + .iface_estimate_perf = uct_base_iface_estimate_perf, + .iface_vfs_refresh = (uct_iface_vfs_refresh_func_t)ucs_empty_function, }, - .ep_tx = uct_cma_ep_tx + .ep_tx = uct_cma_ep_tx, }; static UCS_CLASS_INIT_FUNC(uct_cma_iface_t, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *tl_config) { - UCS_CLASS_CALL_SUPER_INIT(uct_scopy_iface_t, &uct_cma_iface_ops, md, - worker, params, tl_config); + UCS_CLASS_CALL_SUPER_INIT(uct_scopy_iface_t, &uct_cma_iface_tl_ops, + &uct_cma_iface_ops, md, worker, params, + tl_config); return UCS_OK; } diff --git a/src/uct/sm/scopy/cma/cma_md.c b/src/uct/sm/scopy/cma/cma_md.c index c4fb1398924..1621a3ebfc4 100644 --- a/src/uct/sm/scopy/cma/cma_md.c +++ b/src/uct/sm/scopy/cma/cma_md.c @@ -143,14 +143,15 @@ uct_cma_md_open(uct_component_t *component, const char *md_name, const uct_md_config_t *md_config, uct_md_h *md_p) { static uct_md_ops_t md_ops = { - .close = (uct_md_close_func_t)ucs_empty_function, - .query = uct_cma_md_query, - .mem_alloc = (uct_md_mem_alloc_func_t)ucs_empty_function_return_success, - .mem_free = (uct_md_mem_free_func_t)ucs_empty_function_return_success, - .mkey_pack = (uct_md_mkey_pack_func_t)ucs_empty_function_return_success, - .mem_reg = uct_cma_mem_reg, - .mem_dereg = (uct_md_mem_dereg_func_t)ucs_empty_function_return_success, - .detect_memory_type = ucs_empty_function_return_unsupported, + .close = (uct_md_close_func_t)ucs_empty_function, + .query = uct_cma_md_query, + .mem_alloc = (uct_md_mem_alloc_func_t)ucs_empty_function_return_success, + .mem_free = (uct_md_mem_free_func_t)ucs_empty_function_return_success, + .mkey_pack = (uct_md_mkey_pack_func_t)ucs_empty_function_return_success, + .mem_reg = uct_cma_mem_reg, + .mem_dereg = (uct_md_mem_dereg_func_t)ucs_empty_function_return_success, + .is_sockaddr_accessible = ucs_empty_function_return_zero_int, + .detect_memory_type = ucs_empty_function_return_unsupported, }; static uct_md_t md = { .ops = &md_ops, @@ -165,7 +166,8 @@ ucs_status_t uct_cma_md_query(uct_md_h md, uct_md_attr_t *md_attr) { md_attr->rkey_packed_size = 0; md_attr->cap.flags = UCT_MD_FLAG_REG; - md_attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); + md_attr->cap.alloc_mem_types = 0; md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); md_attr->cap.detect_mem_types = 0; md_attr->cap.max_alloc = 0; diff --git a/src/uct/sm/scopy/knem/knem_iface.c b/src/uct/sm/scopy/knem/knem_iface.c index f3e78225600..0efcecb8cd8 100644 --- a/src/uct/sm/scopy/knem/knem_iface.c +++ b/src/uct/sm/scopy/knem/knem_iface.c @@ -41,38 +41,44 @@ static ucs_status_t uct_knem_iface_query(uct_iface_h tl_iface, static UCS_CLASS_DECLARE_DELETE_FUNC(uct_knem_iface_t, uct_iface_t); +static uct_iface_ops_t uct_knem_iface_tl_ops = { + .ep_put_zcopy = uct_scopy_ep_put_zcopy, + .ep_get_zcopy = uct_scopy_ep_get_zcopy, + .ep_pending_add = ucs_empty_function_return_busy, + .ep_pending_purge = ucs_empty_function, + .ep_flush = uct_scopy_ep_flush, + .ep_fence = uct_sm_ep_fence, + .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_knem_ep_t), + .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_knem_ep_t), + .iface_flush = uct_scopy_iface_flush, + .iface_fence = uct_sm_iface_fence, + .iface_progress_enable = ucs_empty_function, + .iface_progress_disable = ucs_empty_function, + .iface_progress = uct_scopy_iface_progress, 
+ .iface_event_fd_get = ucs_empty_function_return_unsupported, + .iface_event_arm = uct_scopy_iface_event_arm, + .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_knem_iface_t), + .iface_query = uct_knem_iface_query, + .iface_get_device_address = uct_sm_iface_get_device_address, + .iface_get_address = ucs_empty_function_return_success, + .iface_is_reachable = uct_sm_iface_is_reachable, +}; + static uct_scopy_iface_ops_t uct_knem_iface_ops = { .super = { - .ep_put_zcopy = uct_scopy_ep_put_zcopy, - .ep_get_zcopy = uct_scopy_ep_get_zcopy, - .ep_pending_add = ucs_empty_function_return_busy, - .ep_pending_purge = ucs_empty_function, - .ep_flush = uct_scopy_ep_flush, - .ep_fence = uct_sm_ep_fence, - .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_knem_ep_t), - .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_knem_ep_t), - .iface_flush = uct_scopy_iface_flush, - .iface_fence = uct_sm_iface_fence, - .iface_progress_enable = ucs_empty_function, - .iface_progress_disable = ucs_empty_function, - .iface_progress = uct_scopy_iface_progress, - .iface_event_fd_get = ucs_empty_function_return_unsupported, - .iface_event_arm = uct_scopy_iface_event_arm, - .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_knem_iface_t), - .iface_query = uct_knem_iface_query, - .iface_get_device_address = uct_sm_iface_get_device_address, - .iface_get_address = ucs_empty_function_return_success, - .iface_is_reachable = uct_sm_iface_is_reachable + .iface_estimate_perf = uct_base_iface_estimate_perf, + .iface_vfs_refresh = (uct_iface_vfs_refresh_func_t)ucs_empty_function, }, - .ep_tx = uct_knem_ep_tx + .ep_tx = uct_knem_ep_tx, }; static UCS_CLASS_INIT_FUNC(uct_knem_iface_t, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, const uct_iface_config_t *tl_config) { - UCS_CLASS_CALL_SUPER_INIT(uct_scopy_iface_t, &uct_knem_iface_ops, md, - worker, params, tl_config); + UCS_CLASS_CALL_SUPER_INIT(uct_scopy_iface_t, &uct_knem_iface_tl_ops, + &uct_knem_iface_ops, md, worker, params, + tl_config); self->knem_md = (uct_knem_md_t *)md; return UCS_OK; diff --git a/src/uct/sm/scopy/knem/knem_md.c b/src/uct/sm/scopy/knem/knem_md.c index 3402e0e61cb..f6824eaacde 100644 --- a/src/uct/sm/scopy/knem/knem_md.c +++ b/src/uct/sm/scopy/knem/knem_md.c @@ -39,7 +39,8 @@ ucs_status_t uct_knem_md_query(uct_md_h uct_md, uct_md_attr_t *md_attr) md_attr->rkey_packed_size = sizeof(uct_knem_key_t); md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; - md_attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); + md_attr->cap.alloc_mem_types = 0; md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); md_attr->cap.detect_mem_types = 0; md_attr->cap.max_alloc = 0; @@ -275,12 +276,13 @@ static ucs_status_t uct_knem_mem_rcache_dereg(uct_md_h uct_md, uct_mem_h memh) } static uct_md_ops_t uct_knem_md_rcache_ops = { - .close = uct_knem_md_close, - .query = uct_knem_md_query, - .mkey_pack = uct_knem_rkey_pack, - .mem_reg = uct_knem_mem_rcache_reg, - .mem_dereg = uct_knem_mem_rcache_dereg, - .detect_memory_type = ucs_empty_function_return_unsupported, + .close = uct_knem_md_close, + .query = uct_knem_md_query, + .mkey_pack = uct_knem_rkey_pack, + .mem_reg = uct_knem_mem_rcache_reg, + .mem_dereg = uct_knem_mem_rcache_dereg, + .is_sockaddr_accessible = ucs_empty_function_return_zero_int, + .detect_memory_type = ucs_empty_function_return_unsupported, }; @@ -352,16 +354,15 @@ uct_knem_md_open(uct_component_t *component, const char *md_name, } if (md_config->rcache_enable != UCS_NO) { + 
uct_md_set_rcache_params(&rcache_params, &md_config->rcache); rcache_params.region_struct_size = sizeof(uct_knem_rcache_region_t); - rcache_params.alignment = md_config->rcache.alignment; rcache_params.max_alignment = ucs_get_page_size(); rcache_params.ucm_events = UCM_EVENT_VM_UNMAPPED; - rcache_params.ucm_event_priority = md_config->rcache.event_prio; rcache_params.context = knem_md; rcache_params.ops = &uct_knem_rcache_ops; rcache_params.flags = UCS_RCACHE_FLAG_PURGE_ON_FORK; - status = ucs_rcache_create(&rcache_params, "knem rcache device", - ucs_stats_get_root(), &knem_md->rcache); + status = ucs_rcache_create(&rcache_params, "knem", ucs_stats_get_root(), + &knem_md->rcache); if (status == UCS_OK) { knem_md->super.ops = &uct_knem_md_rcache_ops; knem_md->reg_cost = ucs_linear_func_make(md_config->rcache.overhead, diff --git a/src/uct/sm/scopy/knem/knem_md.h b/src/uct/sm/scopy/knem/knem_md.h index f29d9c4f060..f371c961fdd 100644 --- a/src/uct/sm/scopy/knem/knem_md.h +++ b/src/uct/sm/scopy/knem/knem_md.h @@ -39,9 +39,9 @@ typedef struct uct_knem_key { * KNEM memory domain configuration. */ typedef struct uct_knem_md_config { - uct_md_config_t super; - ucs_ternary_value_t rcache_enable; - uct_md_rcache_config_t rcache; + uct_md_config_t super; + ucs_ternary_auto_value_t rcache_enable; + uct_md_rcache_config_t rcache; } uct_knem_md_config_t; /** diff --git a/src/uct/sm/self/self.c b/src/uct/sm/self/self.c index b0c7bfb7a7b..02578e9f9be 100644 --- a/src/uct/sm/self/self.c +++ b/src/uct/sm/self/self.c @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -9,6 +9,7 @@ #include "self.h" +#include #include #include #include @@ -47,6 +48,16 @@ static ucs_config_field_t uct_self_iface_config_table[] = { {NULL} }; +static ucs_config_field_t uct_self_md_config_table[] = { + {"", "", NULL, ucs_offsetof(uct_self_md_config_t, super), + UCS_CONFIG_TYPE_TABLE(uct_md_config_table)}, + + {"NUM_DEVICES", "1", "Number of \"self\" devices to create", + ucs_offsetof(uct_self_md_config_t, num_devices), UCS_CONFIG_TYPE_INT}, + + {NULL} +}; + static ucs_status_t uct_self_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *attr) { @@ -106,7 +117,7 @@ static ucs_status_t uct_self_iface_query(uct_iface_h tl_iface, uct_iface_attr_t attr->cap.am.opt_zcopy_align = 1; attr->cap.am.align_mtu = attr->cap.am.opt_zcopy_align; attr->cap.am.max_hdr = 0; - attr->cap.am.max_iov = 1; + attr->cap.am.max_iov = SIZE_MAX; attr->latency = ucs_linear_func_make(0, 0); attr->bandwidth.dedicated = 6911.0 * UCS_MBYTE; @@ -165,6 +176,7 @@ static UCS_CLASS_INIT_FUNC(uct_self_iface_t, uct_md_h md, uct_worker_h worker, { uct_self_iface_config_t *config = ucs_derived_of(tl_config, uct_self_iface_config_t); + size_t align_offset, alignment; ucs_status_t status; UCT_CHECK_PARAM(params->field_mask & UCT_IFACE_PARAM_FIELD_OPEN_MODE, @@ -179,20 +191,27 @@ static UCS_CLASS_INIT_FUNC(uct_self_iface_t, uct_md_h md, uct_worker_h worker, return UCS_ERR_INVALID_PARAM; } - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_self_iface_ops, md, worker, - params, tl_config - UCS_STATS_ARG((params->field_mask & - UCT_IFACE_PARAM_FIELD_STATS_ROOT) ? 
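The knem hunk above swaps the hand-copied rcache fields (alignment, UCM event priority) for a single uct_md_set_rcache_params() call. A plausible shape of such a helper is sketched below; this is an assumption about its body inferred from the fields the hunk deletes, not a quote of the real function:

    /* Plausible consolidation helper: copy the user-tunable rcache knobs
     * from the MD config in one place (sketch/assumption). */
    static void md_set_rcache_params(ucs_rcache_params_t *params,
                                     const uct_md_rcache_config_t *cfg)
    {
        params->alignment          = cfg->alignment;
        params->ucm_event_priority = cfg->event_prio;
        /* defaults matching the unlimited values seen in the xpmem hunk */
        params->max_regions        = ULONG_MAX;
        params->max_size           = SIZE_MAX;
    }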
- params->stats_root : NULL) - UCS_STATS_ARG(UCT_SELF_NAME)); + UCS_CLASS_CALL_SUPER_INIT( + uct_base_iface_t, &uct_self_iface_ops, + &uct_base_iface_internal_ops, md, worker, params, + tl_config UCS_STATS_ARG( + (params->field_mask & UCT_IFACE_PARAM_FIELD_STATS_ROOT) ? + params->stats_root : + NULL) UCS_STATS_ARG(UCT_SELF_NAME)); self->id = ucs_generate_uuid((uintptr_t)self); self->send_size = config->seg_size; - status = ucs_mpool_init(&self->msg_mp, 0, self->send_size, 0, - UCS_SYS_CACHE_LINE_SIZE, - 2, /* 2 elements are enough for most of communications */ - UINT_MAX, &uct_self_iface_mpool_ops, "self_msg_desc"); + status = uct_iface_param_am_alignment(params, self->send_size, 0, 0, + &alignment, &align_offset); + if (status != UCS_OK) { + return status; + } + + status = ucs_mpool_init( + &self->msg_mp, 0, self->send_size, align_offset, alignment, + 2, /* 2 elements are enough for most of communications */ + UINT_MAX, &uct_self_iface_mpool_ops, "self_msg_desc"); if (UCS_STATUS_IS_ERR(status)) { return status; @@ -220,9 +239,27 @@ static ucs_status_t uct_self_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, unsigned *num_tl_devices_p) { - return uct_single_device_resource(md, UCT_SM_DEVICE_NAME, - UCT_DEVICE_TYPE_SELF, - tl_devices_p, num_tl_devices_p); + uct_self_md_t *self_md = ucs_derived_of(md, uct_self_md_t); + int i; + uct_tl_device_resource_t *devices; + + devices = ucs_calloc(self_md->num_devices, sizeof(*devices), + "device resource"); + if (NULL == devices) { + ucs_error("failed to allocate device resource"); + return UCS_ERR_NO_MEMORY; + } + + for (i = 0; i < self_md->num_devices; i++) { + ucs_snprintf_zero(devices[i].name, sizeof(devices->name), "%s%d", + UCT_SM_DEVICE_NAME, i); + devices[i].type = UCT_DEVICE_TYPE_SELF; + devices[i].sys_device = UCS_SYS_DEVICE_ID_UNKNOWN; + } + + *tl_devices_p = devices; + *num_tl_devices_p = self_md->num_devices; + return UCS_OK; } static UCS_CLASS_INIT_FUNC(uct_self_ep_t, const uct_ep_params_t *params) @@ -263,6 +300,31 @@ ucs_status_t uct_self_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t header, return UCS_OK; } +ucs_status_t uct_self_ep_am_short_iov(uct_ep_h tl_ep, uint8_t id, + const uct_iov_t *iov, size_t iovcnt) +{ + uct_self_iface_t *iface = ucs_derived_of(tl_ep->iface, + uct_self_iface_t); + uct_self_ep_t UCS_V_UNUSED *ep = ucs_derived_of(tl_ep, uct_self_ep_t); + void *send_buffer; + ucs_iov_iter_t iov_iter; + size_t length; + + UCT_CHECK_AM_ID(id); + UCT_CHECK_LENGTH(uct_iov_total_length(iov, iovcnt), 0, iface->send_size, + "am_short_iov"); + + ucs_iov_iter_init(&iov_iter); + send_buffer = UCT_SELF_IFACE_SEND_BUFFER_GET(iface); + length = uct_iov_to_buffer(iov, iovcnt, &iov_iter, send_buffer, + SIZE_MAX); + + UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, length); + uct_self_iface_sendrecv_am(iface, id, send_buffer, length, "SHORT_IOV"); + + return UCS_OK; +} + ssize_t uct_self_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, uct_pack_callback_t pack_cb, void *arg, unsigned flags) @@ -289,6 +351,7 @@ static uct_iface_ops_t uct_self_iface_ops = { .ep_put_bcopy = uct_sm_ep_put_bcopy, .ep_get_bcopy = uct_sm_ep_get_bcopy, .ep_am_short = uct_self_ep_am_short, + .ep_am_short_iov = uct_self_ep_am_short_iov, .ep_am_bcopy = uct_self_ep_am_bcopy, .ep_atomic_cswap64 = uct_sm_ep_atomic_cswap64, .ep_atomic64_post = uct_sm_ep_atomic64_post, @@ -323,7 +386,7 @@ static ucs_status_t uct_self_md_query(uct_md_h md, uct_md_attr_t *attr) /* Dummy memory registration provided. 
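uct_self_ep_am_short_iov() above (like the MM variant registered earlier) flattens the caller's iov list into the bounce buffer with uct_iov_to_buffer() after checking the total length against send_size. The gather itself is a few lines of pointer arithmetic; a standalone equivalent:

    #include <stddef.h>
    #include <string.h>
    #include <sys/uio.h>

    /* Copy an iov list into one contiguous buffer; returns bytes copied.
     * 'cap' guards the destination, mirroring the send-size check done
     * before the copy in the transport code. */
    static size_t iov_gather(void *dst, size_t cap,
                             const struct iovec *iov, size_t iovcnt)
    {
        size_t off = 0, n, i;

        for (i = 0; (i < iovcnt) && (off < cap); ++i) {
            n = iov[i].iov_len;
            if (n > cap - off) {
                n = cap - off;
            }
            memcpy((char*)dst + off, iov[i].iov_base, n);
            off += n;
        }
        return off;
    }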
No real memory handling exists */ attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; /* TODO ignore rkey in rma/amo ops */ - attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); attr->cap.detect_mem_types = 0; attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); attr->cap.max_alloc = 0; @@ -345,6 +408,8 @@ static ucs_status_t uct_self_mem_reg(uct_md_h md, void *address, size_t length, static ucs_status_t uct_self_md_open(uct_component_t *component, const char *md_name, const uct_md_config_t *config, uct_md_h *md_p) { + uct_self_md_config_t *md_config = ucs_derived_of(config, + uct_self_md_config_t); static uct_md_ops_t md_ops = { .close = ucs_empty_function, .query = uct_self_md_query, @@ -353,12 +418,14 @@ static ucs_status_t uct_self_md_open(uct_component_t *component, const char *md_ .mem_dereg = ucs_empty_function_return_success, .detect_memory_type = ucs_empty_function_return_unsupported }; - static uct_md_t md = { - .ops = &md_ops, - .component = &uct_self_component - }; - *md_p = &md; + static uct_self_md_t md; + + md.super.ops = &md_ops; + md.super.component = &uct_self_component; + md.num_devices = md_config->num_devices; + + *md_p = &md.super; return UCS_OK; } @@ -383,7 +450,12 @@ static uct_component_t uct_self_component = { .rkey_ptr = ucs_empty_function_return_unsupported, .rkey_release = ucs_empty_function_return_success, .name = UCT_SELF_NAME, - .md_config = UCT_MD_DEFAULT_CONFIG_INITIALIZER, + .md_config = { + .name = "Self memory domain", + .prefix = "SELF_", + .table = uct_self_md_config_table, + .size = sizeof(uct_self_md_config_t), + }, .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_self_component), .flags = 0 diff --git a/src/uct/sm/self/self.h b/src/uct/sm/self/self.h index f9a4b610cb1..c1420a92c05 100644 --- a/src/uct/sm/self/self.h +++ b/src/uct/sm/self/self.h @@ -20,6 +20,24 @@ typedef struct uct_self_iface_config { } uct_self_iface_config_t; +/** + * @brief self device MD descriptor + */ +typedef struct uct_self_md { + uct_md_t super; + size_t num_devices; /* Number of devices to create */ +} uct_self_md_t; + + +/** + * @brief self device MD configuration + */ +typedef struct uct_self_md_config { + uct_md_config_t super; + size_t num_devices; /* Number of devices to create */ +} uct_self_md_config_t; + + typedef struct uct_self_iface { uct_base_iface_t super; uct_self_iface_addr_t id; /* Unique identifier for the instance */ diff --git a/src/uct/tcp/sockcm/sockcm_def.h b/src/uct/tcp/sockcm/sockcm_def.h deleted file mode 100644 index 81195a084e6..00000000000 --- a/src/uct/tcp/sockcm/sockcm_def.h +++ /dev/null @@ -1,44 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. - * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. 
- */ - -#ifndef UCT_SOCKCM_H -#define UCT_SOCKCM_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define UCT_SOCKCM_TL_NAME "sockcm" -#define UCT_SOCKCM_PRIV_DATA_LEN 2048 - -typedef struct uct_sockcm_iface uct_sockcm_iface_t; -typedef struct uct_sockcm_ep uct_sockcm_ep_t; - -typedef struct uct_sockcm_conn_param { - ssize_t length; - int fd; - char private_data[UCT_SOCKCM_PRIV_DATA_LEN]; -} uct_sockcm_conn_param_t; - -typedef struct uct_sockcm_ctx { - int sock_fd; - size_t recv_len; - uct_sockcm_iface_t *iface; - uct_sockcm_conn_param_t conn_param; - ucs_list_link_t list; -} uct_sockcm_ctx_t; - -ucs_status_t uct_sockcm_ep_set_sock_id(uct_sockcm_ep_t *ep); -void uct_sockcm_ep_put_sock_id(uct_sockcm_ctx_t *sock_id_ctx); - -#endif /* UCT_SOCKCM_H */ diff --git a/src/uct/tcp/sockcm/sockcm_ep.c b/src/uct/tcp/sockcm/sockcm_ep.c deleted file mode 100644 index 9f0250b34dd..00000000000 --- a/src/uct/tcp/sockcm/sockcm_ep.c +++ /dev/null @@ -1,407 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2017-2019. ALL RIGHTS RESERVED. - * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "sockcm_ep.h" -#include -#include -#include -#include - -#define UCT_SOCKCM_CB_FLAGS_CHECK(_flags) \ - do { \ - UCT_CB_FLAGS_CHECK(_flags); \ - if (!((_flags) & UCT_CB_FLAG_ASYNC)) { \ - return UCS_ERR_UNSUPPORTED; \ - } \ - } while (0) - -ucs_status_t uct_sockcm_ep_set_sock_id(uct_sockcm_ep_t *ep) -{ - ucs_status_t status; - struct sockaddr *dest_addr = NULL; - - ep->sock_id_ctx = ucs_malloc(sizeof(*ep->sock_id_ctx), "client sock_id_ctx"); - if (ep->sock_id_ctx == NULL) { - return UCS_ERR_NO_MEMORY; - } - - dest_addr = (struct sockaddr *) &(ep->remote_addr); - - status = ucs_socket_create(dest_addr->sa_family, SOCK_STREAM, - &ep->sock_id_ctx->sock_fd); - if (status != UCS_OK) { - ucs_debug("unable to create client socket for sockcm"); - ucs_free(ep->sock_id_ctx); - return status; - } - - return UCS_OK; -} - -void uct_sockcm_ep_put_sock_id(uct_sockcm_ctx_t *sock_id_ctx) -{ - close(sock_id_ctx->sock_fd); - ucs_free(sock_id_ctx); -} - -ucs_status_t uct_sockcm_ep_send_client_info(uct_sockcm_ep_t *ep) -{ - uct_sockcm_iface_t *iface = ucs_derived_of(ep->super.super.iface, - uct_sockcm_iface_t); - uct_cm_ep_priv_data_pack_args_t pack_args; - uct_sockcm_conn_param_t conn_param; - char dev_name[UCT_DEVICE_NAME_MAX]; - ucs_status_t status; - - memset(&conn_param, 0, sizeof(uct_sockcm_conn_param_t)); - - /* get interface name associated with the connected client fd; use that for pack_cb */ - status = ucs_sockaddr_get_ifname(ep->sock_id_ctx->sock_fd, dev_name, - UCT_DEVICE_NAME_MAX); - if (UCS_OK != status) { - goto out; - } - - pack_args.field_mask = UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME; - ucs_strncpy_safe(pack_args.dev_name, dev_name, UCT_DEVICE_NAME_MAX); - - conn_param.length = ep->pack_cb(ep->pack_cb_arg, &pack_args, - (void*)conn_param.private_data); - if (conn_param.length < 0) { - ucs_error("sockcm client (iface=%p, ep = %p) failed to fill " - "private data. 
status: %s", - iface, ep, ucs_status_string((ucs_status_t)conn_param.length)); - status = UCS_ERR_IO_ERROR; - goto out; - } - - ucs_assert(conn_param.length <= UCT_SOCKCM_PRIV_DATA_LEN); - - status = ucs_socket_send(ep->sock_id_ctx->sock_fd, &conn_param, - sizeof(uct_sockcm_conn_param_t)); - -out: - return status; -} - -static const char* -uct_sockcm_ep_conn_state_str(uct_sockcm_ep_conn_state_t state) -{ - switch (state) { - case UCT_SOCKCM_EP_CONN_STATE_SOCK_CONNECTING: - return "UCT_SOCKCM_EP_CONN_STATE_SOCK_CONNECTING"; - case UCT_SOCKCM_EP_CONN_STATE_INFO_SENT: - return "UCT_SOCKCM_EP_CONN_STATE_INFO_SENT"; - case UCT_SOCKCM_EP_CONN_STATE_CLOSED: - return "UCT_SOCKCM_EP_CONN_STATE_CLOSED"; - case UCT_SOCKCM_EP_CONN_STATE_CONNECTED: - return "UCT_SOCKCM_EP_CONN_STATE_CONNECTED"; - default: - ucs_fatal("invaild sockcm endpoint state %d", state); - } -} - -static void uct_sockcm_change_state(uct_sockcm_ep_t *ep, - uct_sockcm_ep_conn_state_t conn_state, - ucs_status_t status) -{ - uct_sockcm_iface_t *iface = ucs_derived_of(ep->super.super.iface, - uct_sockcm_iface_t); - - pthread_mutex_lock(&ep->ops_mutex); - ucs_debug("changing ep with status %s from state %s to state %s, status %s", - ucs_status_string(ep->status), - uct_sockcm_ep_conn_state_str(ep->conn_state), - uct_sockcm_ep_conn_state_str(conn_state), - ucs_status_string(status)); - if ((ep->status != UCS_OK) && - (ep->conn_state == UCT_SOCKCM_EP_CONN_STATE_CLOSED)) { - /* Do not handle failure twice for closed EP */ - pthread_mutex_unlock(&ep->ops_mutex); - return; - } - - ep->status = status; - ep->conn_state = conn_state; - - if (conn_state == UCT_SOCKCM_EP_CONN_STATE_CLOSED) { - uct_sockcm_ep_set_failed(&iface->super.super, &ep->super.super, status); - } - - uct_sockcm_ep_invoke_completions(ep, status); - pthread_mutex_unlock(&ep->ops_mutex); -} - -static void uct_sockcm_handle_sock_connect(uct_sockcm_ep_t *ep) -{ - char sockaddr_str[UCS_SOCKADDR_STRING_LEN]; - int fd = ep->sock_id_ctx->sock_fd; - ucs_status_t status; - - if (!ucs_socket_is_connected(fd)) { - ucs_error("failed to connect to %s", - ucs_sockaddr_str((struct sockaddr*)&ep->remote_addr, - sockaddr_str, sizeof(sockaddr_str))); - uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CLOSED, - UCS_ERR_UNREACHABLE); - goto err; - } - - status = uct_sockcm_ep_send_client_info(ep); - if (status != UCS_OK) { - ucs_error("failed to send client info: %s", ucs_status_string(status)); - uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CLOSED, status); - goto err; - } - - ep->conn_state = UCT_SOCKCM_EP_CONN_STATE_INFO_SENT; - - /* Call current handler when server responds to sent message */ - if (UCS_OK != ucs_async_modify_handler(fd, UCS_EVENT_SET_EVREAD)) { - ucs_error("failed to modify async handler for fd %d", fd); - uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CLOSED, - UCS_ERR_IO_ERROR); - goto err; - } - - return; - -err: - status = ucs_async_modify_handler(fd, 0); - if (status != UCS_OK) { - ucs_debug("unable to modify handler"); - } -} - -static void uct_sockcm_handle_info_sent(uct_sockcm_ep_t *ep) -{ - ucs_status_t status; - size_t recv_len; - char notif_val; - - recv_len = sizeof(notif_val); - status = ucs_socket_recv_nb(ep->sock_id_ctx->sock_fd, ¬if_val, - &recv_len); - if (UCS_ERR_NO_PROGRESS == status) { - /* will call recv again when ready */ - return; - } - - ucs_async_remove_handler(ep->sock_id_ctx->sock_fd, 0); - - if (UCS_OK != status) { - /* receive notif failed, close the connection */ - uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CLOSED, 
status); - return; - } - - if (notif_val == UCT_SOCKCM_IFACE_NOTIFY_ACCEPT) { - ucs_debug("event_handler OK after accept"); - uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CONNECTED, UCS_OK); - } else { - ucs_debug("event_handler REJECTED after reject"); - uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CLOSED, - UCS_ERR_REJECTED); - } -} - -static void uct_sockcm_ep_event_handler(int fd, ucs_event_set_types_t events, - void *arg) -{ - uct_sockcm_ep_t *ep = (uct_sockcm_ep_t *) arg; - - switch (ep->conn_state) { - case UCT_SOCKCM_EP_CONN_STATE_SOCK_CONNECTING: - uct_sockcm_handle_sock_connect(ep); - break; - case UCT_SOCKCM_EP_CONN_STATE_INFO_SENT: - uct_sockcm_handle_info_sent(ep); - break; - case UCT_SOCKCM_EP_CONN_STATE_CONNECTED: - if (UCS_OK != ucs_async_modify_handler(fd, 0)) { - ucs_warn("unable to turn off event notifications on %d", fd); - } - uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CONNECTED, UCS_OK); - break; - case UCT_SOCKCM_EP_CONN_STATE_CLOSED: - default: - ucs_debug("handling closed/default state, ep %p fd %d", ep, fd); - uct_sockcm_change_state(ep, UCT_SOCKCM_EP_CONN_STATE_CLOSED, - UCS_ERR_IO_ERROR); - break; - } -} - -static UCS_CLASS_INIT_FUNC(uct_sockcm_ep_t, const uct_ep_params_t *params) -{ - const ucs_sock_addr_t *sockaddr = params->sockaddr; - uct_sockcm_iface_t *iface = NULL; - struct sockaddr *param_sockaddr = NULL; - char ip_port_str[UCS_SOCKADDR_STRING_LEN]; - ucs_status_t status; - size_t sockaddr_len; - - iface = ucs_derived_of(params->iface, uct_sockcm_iface_t); - UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super); - - if (iface->is_server) { - return UCS_ERR_UNSUPPORTED; - } - - if (!(params->field_mask & UCT_EP_PARAM_FIELD_SOCKADDR)) { - return UCS_ERR_INVALID_PARAM; - } - - UCT_SOCKCM_CB_FLAGS_CHECK((params->field_mask & - UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS) ? - params->sockaddr_cb_flags : 0); - - self->pack_cb = (params->field_mask & - UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB) ? - params->sockaddr_pack_cb : NULL; - self->pack_cb_arg = (params->field_mask & - UCT_EP_PARAM_FIELD_USER_DATA) ? - params->user_data : NULL; - self->pack_cb_flags = (params->field_mask & - UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS) ? 
- params->sockaddr_cb_flags : 0; - pthread_mutex_init(&self->ops_mutex, NULL); - ucs_queue_head_init(&self->ops); - - param_sockaddr = (struct sockaddr *) sockaddr->addr; - if (UCS_OK != ucs_sockaddr_sizeof(param_sockaddr, &sockaddr_len)) { - ucs_error("sockcm ep: unknown remote sa_family=%d", - sockaddr->addr->sa_family); - status = UCS_ERR_IO_ERROR; - goto err; - } - - memcpy(&self->remote_addr, param_sockaddr, sockaddr_len); - - self->slow_prog_id = UCS_CALLBACKQ_ID_NULL; - - status = uct_sockcm_ep_set_sock_id(self); - if (status != UCS_OK) { - goto err; - } - - status = ucs_sys_fcntl_modfl(self->sock_id_ctx->sock_fd, O_NONBLOCK, 0); - if (status != UCS_OK) { - goto sock_err; - } - - status = ucs_socket_connect(self->sock_id_ctx->sock_fd, param_sockaddr); - if (UCS_STATUS_IS_ERR(status)) { - self->conn_state = UCT_SOCKCM_EP_CONN_STATE_CLOSED; - goto sock_err; - } - - self->conn_state = UCT_SOCKCM_EP_CONN_STATE_SOCK_CONNECTING; - self->status = UCS_INPROGRESS; - - /* set ep->status before event handler call to avoid simultaneous writes to state*/ - status = ucs_async_set_event_handler(iface->super.worker->async->mode, - self->sock_id_ctx->sock_fd, - UCS_EVENT_SET_EVWRITE, - uct_sockcm_ep_event_handler, - self, iface->super.worker->async); - if (status != UCS_OK) { - goto sock_err; - } - - ucs_debug("created an SOCKCM endpoint on iface %p, " - "remote addr: %s", iface, - ucs_sockaddr_str(param_sockaddr, - ip_port_str, UCS_SOCKADDR_STRING_LEN)); - return UCS_OK; - -sock_err: - uct_sockcm_ep_put_sock_id(self->sock_id_ctx); -err: - ucs_debug("error in sock connect"); - pthread_mutex_destroy(&self->ops_mutex); - - return status; -} - -static UCS_CLASS_CLEANUP_FUNC(uct_sockcm_ep_t) -{ - uct_sockcm_iface_t *iface = ucs_derived_of(self->super.super.iface, - uct_sockcm_iface_t); - - ucs_debug("sockcm_ep %p: destroying", self); - - UCS_ASYNC_BLOCK(iface->super.worker->async); - - ucs_async_remove_handler(self->sock_id_ctx->sock_fd, 1); - uct_sockcm_ep_put_sock_id(self->sock_id_ctx); - - uct_worker_progress_unregister_safe(&iface->super.worker->super, - &self->slow_prog_id); - - pthread_mutex_destroy(&self->ops_mutex); - if (!ucs_queue_is_empty(&self->ops)) { - ucs_warn("destroying endpoint %p with not completed operations", self); - } - - UCS_ASYNC_UNBLOCK(iface->super.worker->async); -} - -UCS_CLASS_DEFINE(uct_sockcm_ep_t, uct_base_ep_t) -UCS_CLASS_DEFINE_NEW_FUNC(uct_sockcm_ep_t, uct_ep_t, const uct_ep_params_t *); -UCS_CLASS_DEFINE_DELETE_FUNC(uct_sockcm_ep_t, uct_ep_t); - -static unsigned uct_sockcm_client_err_handle_progress(void *arg) -{ - uct_sockcm_ep_t *sockcm_ep = arg; - uct_sockcm_iface_t *iface = ucs_derived_of(sockcm_ep->super.super.iface, - uct_sockcm_iface_t); - - ucs_trace_func("err_handle ep=%p", sockcm_ep); - UCS_ASYNC_BLOCK(iface->super.worker->async); - - sockcm_ep->slow_prog_id = UCS_CALLBACKQ_ID_NULL; - uct_set_ep_failed(&UCS_CLASS_NAME(uct_sockcm_ep_t), &sockcm_ep->super.super, - sockcm_ep->super.super.iface, sockcm_ep->status); - - UCS_ASYNC_UNBLOCK(iface->super.worker->async); - return 0; -} - -void uct_sockcm_ep_set_failed(uct_iface_t *iface, uct_ep_h ep, ucs_status_t status) -{ - uct_sockcm_iface_t *sockcm_iface = ucs_derived_of(iface, uct_sockcm_iface_t); - uct_sockcm_ep_t *sockcm_ep = ucs_derived_of(ep, uct_sockcm_ep_t); - - if (sockcm_iface->super.err_handler_flags & UCT_CB_FLAG_ASYNC) { - uct_set_ep_failed(&UCS_CLASS_NAME(uct_sockcm_ep_t), &sockcm_ep->super.super, - &sockcm_iface->super.super, status); - } else { - sockcm_ep->status = status; - 
uct_worker_progress_register_safe(&sockcm_iface->super.worker->super, - uct_sockcm_client_err_handle_progress, - sockcm_ep, UCS_CALLBACKQ_FLAG_ONESHOT, - &sockcm_ep->slow_prog_id); - } -} - -void uct_sockcm_ep_invoke_completions(uct_sockcm_ep_t *ep, ucs_status_t status) -{ - uct_sockcm_ep_op_t *op; - - ucs_assert(pthread_mutex_trylock(&ep->ops_mutex) == EBUSY); - - ucs_queue_for_each_extract(op, &ep->ops, queue_elem, 1) { - pthread_mutex_unlock(&ep->ops_mutex); - uct_invoke_completion(op->user_comp, status); - ucs_free(op); - pthread_mutex_lock(&ep->ops_mutex); - } -} diff --git a/src/uct/tcp/sockcm/sockcm_ep.h b/src/uct/tcp/sockcm/sockcm_ep.h deleted file mode 100644 index 9f0bd64a833..00000000000 --- a/src/uct/tcp/sockcm/sockcm_ep.h +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2017-2019. ALL RIGHTS RESERVED. - * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#ifndef UCT_SOCKCM_EP_H -#define UCT_SOCKCM_EP_H - -#include "sockcm_iface.h" - -typedef struct uct_sockcm_ep_op uct_sockcm_ep_op_t; - -typedef enum uct_sockcm_ep_conn_state { - UCT_SOCKCM_EP_CONN_STATE_SOCK_CONNECTING, - UCT_SOCKCM_EP_CONN_STATE_INFO_SENT, - UCT_SOCKCM_EP_CONN_STATE_CLOSED, - UCT_SOCKCM_EP_CONN_STATE_CONNECTED -} uct_sockcm_ep_conn_state_t; - -struct uct_sockcm_ep_op { - ucs_queue_elem_t queue_elem; - uct_completion_t *user_comp; -}; - -struct uct_sockcm_ep { - uct_base_ep_t super; - uct_cm_ep_priv_data_pack_callback_t pack_cb; - void *pack_cb_arg; - uint32_t pack_cb_flags; - uct_sockcm_ep_conn_state_t conn_state; - - pthread_mutex_t ops_mutex; /* guards ops and status */ - ucs_queue_head_t ops; - ucs_status_t status; /* client EP status */ - - struct sockaddr_storage remote_addr; - uct_worker_cb_id_t slow_prog_id; - uct_sockcm_ctx_t *sock_id_ctx; -}; - -UCS_CLASS_DECLARE_NEW_FUNC(uct_sockcm_ep_t, uct_ep_t, const uct_ep_params_t *); -UCS_CLASS_DECLARE_DELETE_FUNC(uct_sockcm_ep_t, uct_ep_t); - -void uct_sockcm_ep_set_failed(uct_iface_t *iface, uct_ep_h ep, ucs_status_t status); - -void uct_sockcm_ep_invoke_completions(uct_sockcm_ep_t *ep, ucs_status_t status); - -#endif diff --git a/src/uct/tcp/sockcm/sockcm_iface.c b/src/uct/tcp/sockcm/sockcm_iface.c deleted file mode 100644 index df936820f45..00000000000 --- a/src/uct/tcp/sockcm/sockcm_iface.c +++ /dev/null @@ -1,431 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2017-2019. ALL RIGHTS RESERVED. - * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. 
- */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "sockcm_iface.h" -#include "sockcm_ep.h" - -#include -#include -#include -#include - - -enum uct_sockcm_process_event_flags { - UCT_SOCKCM_PROCESS_EVENT_DESTROY_SOCK_ID_FLAG = UCS_BIT(0), - UCT_SOCKCM_PROCESS_EVENT_ACK_EVENT_FLAG = UCS_BIT(1) -}; - -static ucs_config_field_t uct_sockcm_iface_config_table[] = { - {"", "", NULL, - ucs_offsetof(uct_sockcm_iface_config_t, super), - UCS_CONFIG_TYPE_TABLE(uct_iface_config_table)}, - - {"BACKLOG", "1024", - "Maximum number of pending connections for a listening socket.", - ucs_offsetof(uct_sockcm_iface_config_t, backlog), UCS_CONFIG_TYPE_UINT}, - - {NULL} -}; - -static UCS_CLASS_DECLARE_DELETE_FUNC(uct_sockcm_iface_t, uct_iface_t); - -static ucs_status_t uct_sockcm_iface_query(uct_iface_h tl_iface, - uct_iface_attr_t *iface_attr) -{ - uct_sockcm_iface_t *iface = ucs_derived_of(tl_iface, uct_sockcm_iface_t); - struct sockaddr_storage addr; - ucs_status_t status; - - uct_base_iface_query(&iface->super, iface_attr); - - iface_attr->iface_addr_len = sizeof(ucs_sock_addr_t); - iface_attr->device_addr_len = 0; - iface_attr->cap.flags = UCT_IFACE_FLAG_CONNECT_TO_SOCKADDR | - UCT_IFACE_FLAG_CB_ASYNC | - UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE; - iface_attr->max_conn_priv = UCT_SOCKCM_MAX_CONN_PRIV; - - if (iface->is_server) { - socklen_t len = sizeof(struct sockaddr_storage); - if (getsockname(iface->listen_fd, (struct sockaddr *)&addr, &len)) { - ucs_error("sockcm_iface: getsockname failed %m"); - return UCS_ERR_IO_ERROR; - } - - status = ucs_sockaddr_copy((struct sockaddr *)&iface_attr->listen_sockaddr, - (const struct sockaddr *)&addr); - if (status != UCS_OK) { - return status; - } - } - - return UCS_OK; -} - -static ucs_status_t uct_sockcm_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *iface_addr) -{ - ucs_sock_addr_t *sockcm_addr = (ucs_sock_addr_t *)iface_addr; - - sockcm_addr->addr = NULL; - sockcm_addr->addrlen = 0; - return UCS_OK; -} - -static ucs_status_t uct_sockcm_iface_notify_client(int notif_val, - uct_conn_request_h conn_request) -{ - char notif = notif_val; - int fd; - - fd = ((uct_sockcm_ctx_t *) conn_request)->sock_fd; - - return ucs_socket_send(fd, ¬if, sizeof(notif)); -} - -static ucs_status_t uct_sockcm_iface_accept(uct_iface_h tl_iface, - uct_conn_request_h conn_request) -{ - return uct_sockcm_iface_notify_client(UCT_SOCKCM_IFACE_NOTIFY_ACCEPT, conn_request); -} - -static ucs_status_t uct_sockcm_iface_reject(uct_iface_h tl_iface, - uct_conn_request_h conn_request) -{ - return uct_sockcm_iface_notify_client(UCT_SOCKCM_IFACE_NOTIFY_REJECT, conn_request); -} - -static ucs_status_t uct_sockcm_ep_flush(uct_ep_h tl_ep, unsigned flags, - uct_completion_t *comp) -{ - uct_sockcm_ep_t *ep = ucs_derived_of(tl_ep, uct_sockcm_ep_t); - ucs_status_t status; - uct_sockcm_ep_op_t *op; - - pthread_mutex_lock(&ep->ops_mutex); - status = ep->status; - if ((status == UCS_INPROGRESS) && (comp != NULL)) { - op = ucs_malloc(sizeof(*op), "uct_sockcm_ep_flush op"); - if (op != NULL) { - op->user_comp = comp; - ucs_queue_push(&ep->ops, &op->queue_elem); - } else { - status = UCS_ERR_NO_MEMORY; - } - } - pthread_mutex_unlock(&ep->ops_mutex); - - return status; -} - - -static uct_iface_ops_t uct_sockcm_iface_ops = { - .ep_create = UCS_CLASS_NEW_FUNC_NAME(uct_sockcm_ep_t), - .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_sockcm_ep_t), - .ep_flush = uct_sockcm_ep_flush, - .ep_fence = uct_base_ep_fence, - .ep_pending_purge = ucs_empty_function, - .iface_accept = 
uct_sockcm_iface_accept, - .iface_reject = uct_sockcm_iface_reject, - .iface_progress_enable = (uct_iface_progress_enable_func_t)ucs_empty_function_return_success, - .iface_progress_disable = (uct_iface_progress_disable_func_t)ucs_empty_function_return_success, - .iface_progress = ucs_empty_function_return_zero, - .iface_flush = uct_base_iface_flush, - .iface_fence = uct_base_iface_fence, - .iface_close = UCS_CLASS_DELETE_FUNC_NAME(uct_sockcm_iface_t), - .iface_query = uct_sockcm_iface_query, - .iface_is_reachable = (uct_iface_is_reachable_func_t)ucs_empty_function_return_zero, - .iface_get_device_address = (uct_iface_get_device_address_func_t)ucs_empty_function_return_success, - .iface_get_address = uct_sockcm_iface_get_address -}; - -static ucs_status_t uct_sockcm_iface_process_conn_req(uct_sockcm_ctx_t *sock_id_ctx) -{ - uct_sockcm_iface_t *iface = sock_id_ctx->iface; - uct_sockcm_conn_param_t *conn_param = &sock_id_ctx->conn_param; - - ucs_debug("process conn req conn_param = %p, conn_param->length = %ld", - conn_param, conn_param->length); - iface->conn_request_cb(&iface->super.super, iface->conn_request_arg, sock_id_ctx, - conn_param->private_data, conn_param->length); - return UCS_OK; -} - -static void uct_sockcm_iface_recv_handler(int fd, ucs_event_set_types_t events, - void *arg) -{ - uct_sockcm_ctx_t *sock_id_ctx = (uct_sockcm_ctx_t *) arg; - ucs_status_t status; - size_t recv_len; - - /* attempt another receive only if initial receive was not successful */ - recv_len = sizeof(uct_sockcm_conn_param_t) - sock_id_ctx->recv_len; - if (recv_len == 0) { - goto out_remove_handler; - } - - status = ucs_socket_recv_nb(sock_id_ctx->sock_fd, - UCS_PTR_BYTE_OFFSET(&sock_id_ctx->conn_param, - sock_id_ctx->recv_len), - &recv_len); - if ((status == UCS_ERR_CANCELED) || (status == UCS_ERR_IO_ERROR)) { - ucs_warn("recv failed in recv handler"); - /* TODO: clean up resources allocated for client endpoint? */ - return; - } - - sock_id_ctx->recv_len += ((UCS_ERR_NO_PROGRESS == status) ? 
0 : recv_len); - if (sock_id_ctx->recv_len != sizeof(uct_sockcm_conn_param_t)) { - /* handler should be notified when remaining pieces show up */ - return; - } - - if (UCS_OK != uct_sockcm_iface_process_conn_req((uct_sockcm_ctx_t*)arg)) { - ucs_error("unable to process connection request"); - } - -out_remove_handler: - status = ucs_async_modify_handler(fd, 0); - if (status != UCS_OK) { - ucs_debug("unable to modify handler"); - } -} - -static void uct_sockcm_iface_event_handler(int fd, ucs_event_set_types_t events, - void *arg) -{ - size_t recv_len = 0; - uct_sockcm_iface_t *iface = arg; - uct_sockcm_ctx_t *sock_id_ctx = NULL; - struct sockaddr peer_addr; - socklen_t addrlen; - int accept_fd; - char ip_port_str[UCS_SOCKADDR_STRING_LEN]; - ucs_status_t status; - - addrlen = sizeof(struct sockaddr); - accept_fd = accept(iface->listen_fd, (struct sockaddr*)&peer_addr, &addrlen); - if (accept_fd == -1) { - if ((errno == EAGAIN) || (errno == EINTR)) { - ucs_debug("accept(fd=%d) failed: %m", iface->listen_fd); - } else { - /* accept failed here, let the client try again */ - ucs_warn("accept(fd=%d) failed with non-recoverable error %m", - iface->listen_fd); - } - return; - } - - ucs_debug("sockcm_iface %p: accepted connection from %s at fd %d %m", iface, - ucs_sockaddr_str(&peer_addr, ip_port_str, - UCS_SOCKADDR_STRING_LEN), accept_fd); - - /* Unlike rdmacm, socket connect/accept does not permit exchange of - * connection parameters but we need to use send/recv on top of that - * We simulate that with an explicit receive */ - - sock_id_ctx = ucs_malloc(sizeof(uct_sockcm_ctx_t), "accepted sock_id_ctx"); - if (sock_id_ctx == NULL) { - ucs_error("sockcm_listener: unable to create mem for accepted fd"); - close(accept_fd); - return; - } - - sock_id_ctx->recv_len = 0; - sock_id_ctx->sock_fd = accept_fd; - sock_id_ctx->iface = iface; - - status = ucs_sys_fcntl_modfl(sock_id_ctx->sock_fd, O_NONBLOCK, 0); - if (status != UCS_OK) { - ucs_error("sockcm_listener: unable make accepted fd non-blocking"); - goto err; - } - - recv_len = sizeof(sock_id_ctx->conn_param); - - status = ucs_socket_recv_nb(accept_fd, &sock_id_ctx->conn_param, &recv_len); - if (UCS_OK != status) { - sock_id_ctx->recv_len = ((UCS_ERR_NO_PROGRESS == status) ? 
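Both receive paths here keep a running recv_len and retry until the fixed-size connection parameter structure has fully arrived, since a non-blocking socket may deliver it in fragments. A standalone sketch of the same accumulation, using plain POSIX recv() and a stand-in record type:

#include <errno.h>
#include <stddef.h>
#include <sys/socket.h>

/* Accumulate a fixed-size record over a non-blocking socket, as the
 * handlers above do with conn_param; conn_rec_t is a stand-in type. */
typedef struct {
    char payload[256];
} conn_rec_t;

typedef struct {
    conn_rec_t rec;
    size_t     got; /* bytes received so far, like sock_id_ctx->recv_len */
} recv_state_t;

/* Returns 1 when the record is complete, 0 to keep waiting for EVREAD
 * events, -1 on peer close or a hard error. */
static int recv_record_nb(int fd, recv_state_t *st)
{
    ssize_t n;

    while (st->got < sizeof(st->rec)) {
        n = recv(fd, (char*)&st->rec + st->got, sizeof(st->rec) - st->got, 0);
        if (n > 0) {
            st->got += (size_t)n;
        } else if ((n < 0) && ((errno == EAGAIN) || (errno == EWOULDBLOCK))) {
            return 0;
        } else {
            return -1;
        }
    }
    return 1;
}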
0: recv_len); - status = ucs_async_set_event_handler(iface->super.worker->async->mode, - sock_id_ctx->sock_fd, - UCS_EVENT_SET_EVREAD, - uct_sockcm_iface_recv_handler, - sock_id_ctx, - iface->super.worker->async); - if (status != UCS_OK) { - ucs_fatal("sockcm_listener: unable to create handler for new connection"); - goto err; - } - ucs_debug("assigning recv handler for message from client"); - } else { - ucs_debug("not assigning recv handler for message from client"); - if (UCS_OK != uct_sockcm_iface_process_conn_req(sock_id_ctx)) { - ucs_error("Unable to process connection request"); - } - } - - UCS_ASYNC_BLOCK(iface->super.worker->async); - ucs_list_add_tail(&iface->used_sock_ids_list, &sock_id_ctx->list); - UCS_ASYNC_UNBLOCK(iface->super.worker->async); - - return; - -err: - uct_sockcm_ep_put_sock_id(sock_id_ctx); - return; -} - -static UCS_CLASS_INIT_FUNC(uct_sockcm_iface_t, uct_md_h md, uct_worker_h worker, - const uct_iface_params_t *params, - const uct_iface_config_t *tl_config) -{ - uct_sockcm_iface_config_t *config = ucs_derived_of(tl_config, - uct_sockcm_iface_config_t); - char ip_port_str[UCS_SOCKADDR_STRING_LEN]; - ucs_status_t status; - struct sockaddr *param_sockaddr; - int param_sockaddr_len; - - UCT_CHECK_PARAM(params->field_mask & UCT_IFACE_PARAM_FIELD_OPEN_MODE, - "UCT_IFACE_PARAM_FIELD_OPEN_MODE is not defined"); - - UCT_CHECK_PARAM((params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER) || - (params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_CLIENT), - "Invalid open mode %"PRIu64, params->open_mode); - - UCT_CHECK_PARAM(!(params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER) || - (params->field_mask & UCT_IFACE_PARAM_FIELD_SOCKADDR), - "UCT_IFACE_PARAM_FIELD_SOCKADDR is not defined " - "for UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER"); - - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_sockcm_iface_ops, md, worker, - params, tl_config - UCS_STATS_ARG((params->field_mask & - UCT_IFACE_PARAM_FIELD_STATS_ROOT) ? - params->stats_root : NULL) - UCS_STATS_ARG(UCT_SOCKCM_TL_NAME)); - - if (self->super.worker->async == NULL) { - ucs_error("sockcm must have async != NULL"); - return UCS_ERR_INVALID_PARAM; - } - if (self->super.worker->async->mode == UCS_ASYNC_MODE_SIGNAL) { - ucs_warn("sockcm does not support SIGIO"); - } - - self->listen_fd = -1; - - if (params->open_mode & UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER) { - - if (!(params->mode.sockaddr.cb_flags & UCT_CB_FLAG_ASYNC)) { - return UCS_ERR_INVALID_PARAM; - } - - param_sockaddr = (struct sockaddr *)params->mode.sockaddr.listen_sockaddr.addr; - param_sockaddr_len = params->mode.sockaddr.listen_sockaddr.addrlen; - - status = ucs_socket_create(param_sockaddr->sa_family, SOCK_STREAM, - &self->listen_fd); - if (status != UCS_OK) { - return status; - } - - status = ucs_sys_fcntl_modfl(self->listen_fd, O_NONBLOCK, 0); - if (status != UCS_OK) { - goto err_close_sock; - } - - if (0 > bind(self->listen_fd, param_sockaddr, param_sockaddr_len)) { - ucs_error("bind(fd=%d) failed: %m", self->listen_fd); - status = (errno == EADDRINUSE) ? 
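The bind() error split completed just below (EADDRINUSE maps to UCS_ERR_BUSY, everything else to an I/O error) is the crux of the listener bring-up. A self-contained POSIX sketch of the same sequence, with stand-in status codes in place of ucs_status_t:

#include <errno.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <unistd.h>

/* Condensed listener bring-up with the same errno split on bind(). */
enum { ST_OK = 0, ST_BUSY = -1, ST_IO_ERROR = -2 };

static int listener_open(const struct sockaddr *sa, socklen_t salen,
                         int backlog, int *fd_p)
{
    int flags;
    int fd;

    fd = socket(sa->sa_family, SOCK_STREAM, 0);
    if (fd < 0) {
        return ST_IO_ERROR;
    }

    /* Same effect as ucs_sys_fcntl_modfl(fd, O_NONBLOCK, 0) above */
    flags = fcntl(fd, F_GETFL, 0);
    if ((flags < 0) || (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0)) {
        goto err;
    }

    if (bind(fd, sa, salen) < 0) {
        /* "address already in use" is the only retryable condition */
        close(fd);
        return (errno == EADDRINUSE) ? ST_BUSY : ST_IO_ERROR;
    }

    if (listen(fd, backlog) < 0) {
        goto err;
    }

    *fd_p = fd;
    return ST_OK;

err:
    close(fd);
    return ST_IO_ERROR;
}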
UCS_ERR_BUSY : UCS_ERR_IO_ERROR; - goto err_close_sock; - } - - if (0 > listen(self->listen_fd, config->backlog)) { - ucs_error("listen(fd=%d; backlog=%d)", self->listen_fd, - config->backlog); - status = UCS_ERR_IO_ERROR; - goto err_close_sock; - } - - status = ucs_async_set_event_handler(self->super.worker->async->mode, - self->listen_fd, - UCS_EVENT_SET_EVREAD | - UCS_EVENT_SET_EVERR, - uct_sockcm_iface_event_handler, - self, self->super.worker->async); - if (status != UCS_OK) { - goto err_close_sock; - } - - ucs_debug("iface (%p) sockcm id %d listening on %s", self, - self->listen_fd, - ucs_sockaddr_str(param_sockaddr, ip_port_str, - UCS_SOCKADDR_STRING_LEN)); - - self->cb_flags = params->mode.sockaddr.cb_flags; - self->conn_request_cb = params->mode.sockaddr.conn_request_cb; - self->conn_request_arg = params->mode.sockaddr.conn_request_arg; - self->is_server = 1; - } else { - self->is_server = 0; - } - - ucs_list_head_init(&self->used_sock_ids_list); - - return UCS_OK; - - err_close_sock: - close(self->listen_fd); - return status; -} - -static UCS_CLASS_CLEANUP_FUNC(uct_sockcm_iface_t) -{ - uct_sockcm_ctx_t *sock_id_ctx; - - if (self->is_server) { - if (-1 != self->listen_fd) { - ucs_debug("cleaning listen_fd = %d", self->listen_fd); - ucs_async_remove_handler(self->listen_fd, 1); - close(self->listen_fd); - } - } - - UCS_ASYNC_BLOCK(self->super.worker->async); - - while (!ucs_list_is_empty(&self->used_sock_ids_list)) { - sock_id_ctx = ucs_list_extract_head(&self->used_sock_ids_list, - uct_sockcm_ctx_t, list); - ucs_debug("cleaning server fd = %d", sock_id_ctx->sock_fd); - ucs_async_remove_handler(sock_id_ctx->sock_fd, 1); - uct_sockcm_ep_put_sock_id(sock_id_ctx); - } - - UCS_ASYNC_UNBLOCK(self->super.worker->async); -} - -UCS_CLASS_DEFINE(uct_sockcm_iface_t, uct_base_iface_t); -static UCS_CLASS_DEFINE_NEW_FUNC(uct_sockcm_iface_t, uct_iface_t, uct_md_h, - uct_worker_h, const uct_iface_params_t *, - const uct_iface_config_t *); -static UCS_CLASS_DEFINE_DELETE_FUNC(uct_sockcm_iface_t, uct_iface_t); - -static ucs_status_t -uct_sockcm_query_tl_devices(uct_md_h md, uct_tl_device_resource_t **tl_devices_p, - unsigned *num_tl_devices_p) -{ - *num_tl_devices_p = 0; - *tl_devices_p = NULL; - return UCS_OK; -} - -UCT_TL_DEFINE(&uct_sockcm_component, sockcm, uct_sockcm_query_tl_devices, - uct_sockcm_iface_t, "SOCKCM_", uct_sockcm_iface_config_table, - uct_sockcm_iface_config_t); diff --git a/src/uct/tcp/sockcm/sockcm_iface.h b/src/uct/tcp/sockcm/sockcm_iface.h deleted file mode 100644 index e39fd0f8f4f..00000000000 --- a/src/uct/tcp/sockcm/sockcm_iface.h +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. - * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. 
- */ - -#ifndef UCT_SOCKCM_IFACE_H -#define UCT_SOCKCM_IFACE_H - -#include "sockcm_def.h" -#include "sockcm_md.h" - -#define UCT_SOCKCM_MAX_CONN_PRIV \ - (UCT_SOCKCM_PRIV_DATA_LEN - sizeof(ssize_t)) - - -typedef enum uct_sockcm_iface_notify { - UCT_SOCKCM_IFACE_NOTIFY_ACCEPT = 0, - UCT_SOCKCM_IFACE_NOTIFY_REJECT -} uct_sockcm_iface_notify_t; - -typedef struct uct_sockcm_iface_config { - uct_iface_config_t super; - unsigned backlog; -} uct_sockcm_iface_config_t; - -struct uct_sockcm_iface { - uct_base_iface_t super; - - int listen_fd; - - uint8_t is_server; - /* Fields used only for server side */ - void *conn_request_arg; - uct_sockaddr_conn_request_callback_t conn_request_cb; - uint32_t cb_flags; - - /* Field used only for client side */ - ucs_list_link_t used_sock_ids_list; -}; -#endif diff --git a/src/uct/tcp/sockcm/sockcm_md.c b/src/uct/tcp/sockcm/sockcm_md.c deleted file mode 100644 index 0610efcc771..00000000000 --- a/src/uct/tcp/sockcm/sockcm_md.c +++ /dev/null @@ -1,143 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. - * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#ifdef HAVE_CONFIG_H -# include "config.h" -#endif - -#include "sockcm_md.h" - -#define UCT_SOCKCM_NAME "sockcm" - -static ucs_config_field_t uct_sockcm_md_config_table[] = { - {"", "", NULL, - ucs_offsetof(uct_sockcm_md_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_md_config_table)}, - {NULL} -}; - -static void uct_sockcm_md_close(uct_md_h md); - -static uct_md_ops_t uct_sockcm_md_ops = { - .close = uct_sockcm_md_close, - .query = uct_sockcm_md_query, - .is_sockaddr_accessible = uct_sockcm_is_sockaddr_accessible, - .detect_memory_type = ucs_empty_function_return_unsupported, -}; - -static void uct_sockcm_md_close(uct_md_h md) -{ - uct_sockcm_md_t *sockcm_md = ucs_derived_of(md, uct_sockcm_md_t); - ucs_free(sockcm_md); -} - -ucs_status_t uct_sockcm_md_query(uct_md_h md, uct_md_attr_t *md_attr) -{ - md_attr->cap.flags = UCT_MD_FLAG_SOCKADDR; - md_attr->cap.reg_mem_types = 0; - md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - md_attr->cap.detect_mem_types = 0; - md_attr->cap.max_alloc = 0; - md_attr->cap.max_reg = 0; - md_attr->rkey_packed_size = 0; - md_attr->reg_cost = ucs_linear_func_make(0, 0); - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); - return UCS_OK; -} - -int uct_sockcm_is_sockaddr_accessible(uct_md_h md, const ucs_sock_addr_t *sockaddr, - uct_sockaddr_accessibility_t mode) -{ - struct sockaddr *param_sockaddr = NULL; - int is_accessible = 0; - int sock_fd = -1; - size_t sockaddr_len = 0; - char ip_port_str[UCS_SOCKADDR_STRING_LEN]; - - param_sockaddr = (struct sockaddr *) sockaddr->addr; - - if ((mode != UCT_SOCKADDR_ACC_LOCAL) && (mode != UCT_SOCKADDR_ACC_REMOTE)) { - ucs_error("Unknown sockaddr accessibility mode %d", mode); - return 0; - } - - sock_fd = socket(param_sockaddr->sa_family, SOCK_STREAM, 0); - if (-1 == sock_fd) { - return 0; - } - - if (UCS_OK != ucs_sockaddr_sizeof(param_sockaddr, &sockaddr_len)) { - ucs_debug("family != AF_INET and != AF_INET6"); - goto out_destroy_id; - } - - if (mode == UCT_SOCKADDR_ACC_LOCAL) { - ucs_debug("addr_len = %ld", (long int) sockaddr_len); - - if (bind(sock_fd, param_sockaddr, sockaddr_len)) { - ucs_debug("bind(addr = %s) failed: %m", - ucs_sockaddr_str((struct sockaddr *)sockaddr->addr, - ip_port_str, UCS_SOCKADDR_STRING_LEN)); - goto out_destroy_id; - } - - if (ucs_sockaddr_is_inaddr_any(param_sockaddr)) { - is_accessible = 1; - goto 
out_print; - } - } - - is_accessible = 1; /* if UCT_SOCKADDR_ACC_REMOTE == mode*/ - - out_print: - ucs_debug("address %s is accessible from sockcm_md %p with mode: %d", - ucs_sockaddr_str(param_sockaddr, ip_port_str, - UCS_SOCKADDR_STRING_LEN), - ucs_derived_of(md, uct_sockcm_md_t), mode); - - out_destroy_id: - close(sock_fd); - - return is_accessible; -} - -static ucs_status_t -uct_sockcm_md_open(uct_component_t *component, const char *md_name, - const uct_md_config_t *config, uct_md_h *md_p) -{ - uct_sockcm_md_t *md; - - md = ucs_malloc(sizeof(*md), "sockcm_md"); - if (md == NULL) { - return UCS_ERR_NO_MEMORY; - } - - md->super.ops = &uct_sockcm_md_ops; - md->super.component = &uct_sockcm_component; - - /* cppcheck-suppress autoVariables */ - *md_p = &md->super; - return UCS_OK; -} - -uct_component_t uct_sockcm_component = { - .query_md_resources = uct_md_query_single_md_resource, - .md_open = uct_sockcm_md_open, - .cm_open = ucs_empty_function_return_unsupported, - .rkey_unpack = ucs_empty_function_return_unsupported, - .rkey_ptr = ucs_empty_function_return_unsupported, - .rkey_release = ucs_empty_function_return_unsupported, - .name = UCT_SOCKCM_NAME, - .md_config = { - .name = "Sock-CM memory domain", - .prefix = "SOCKCM_", - .table = uct_sockcm_md_config_table, - .size = sizeof(uct_sockcm_md_config_t), - }, - .cm_config = UCS_CONFIG_EMPTY_GLOBAL_LIST_ENTRY, - .tl_list = UCT_COMPONENT_TL_LIST_INITIALIZER(&uct_sockcm_component), - .flags = 0 -}; -UCT_COMPONENT_REGISTER(&uct_sockcm_component) diff --git a/src/uct/tcp/sockcm/sockcm_md.h b/src/uct/tcp/sockcm/sockcm_md.h deleted file mode 100644 index 7b7cfa6d927..00000000000 --- a/src/uct/tcp/sockcm/sockcm_md.h +++ /dev/null @@ -1,37 +0,0 @@ -/** - * Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. - * Copyright (C) NVIDIA Corporation. 2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. - */ - -#ifndef UCT_SOCKCM_MD_H_ -#define UCT_SOCKCM_MD_H_ - -#include "sockcm_def.h" -#include -#include -#include - -/* - * SOCKCM memory domain. - */ -typedef struct uct_sockcm_md { - uct_md_t super; -} uct_sockcm_md_t; - -/* - * SOCKCM memory domain configuration. - */ -typedef struct uct_sockcm_md_config { - uct_md_config_t super; -} uct_sockcm_md_config_t; - -extern uct_component_t uct_sockcm_component; - -ucs_status_t uct_sockcm_md_query(uct_md_h md, uct_md_attr_t *md_attr); - -int uct_sockcm_is_sockaddr_accessible(uct_md_h md, - const ucs_sock_addr_t *sockaddr, - uct_sockaddr_accessibility_t mode); - -#endif diff --git a/src/uct/tcp/tcp.h b/src/uct/tcp/tcp.h index 14ba86cef89..c234c179a59 100644 --- a/src/uct/tcp/tcp.h +++ b/src/uct/tcp/tcp.h @@ -60,6 +60,13 @@ /* Maximal value for connection sequence number */ #define UCT_TCP_CM_CONN_SN_MAX UINT64_MAX +/* The seconds the connection needs to remain idle before TCP starts sending + * keepalive probes */ +#define UCT_TCP_EP_DEFAULT_KEEPALIVE_IDLE 10 + +/* The seconds between individual keepalive probes */ +#define UCT_TCP_EP_DEFAULT_KEEPALIVE_INTVL 1 + /** * TCP EP connection manager ID @@ -157,11 +164,11 @@ typedef enum uct_tcp_cm_conn_event { /* Connection request from a EP that has TX capability to a EP that * has to be able to receive AM data (i.e. has to have RX capability). */ UCT_TCP_CM_CONN_REQ = UCS_BIT(0), - /* Connection acknowledgment from a EP that accepts a conenction from + /* Connection acknowledgment from a EP that accepts a connection from * initiator of a connection request. 
*/ UCT_TCP_CM_CONN_ACK = UCS_BIT(1), /* Connection acknowledgment + Connection request. The message is sent - * from a EP that accepts remote conenction when it was in + * from an EP that accepts remote connection when it was in * `UCT_TCP_EP_CONN_STATE_CONNECTING` state (i.e. original * `UCT_TCP_CM_CONN_REQ` wasn't sent yet) and want to have RX capability * on a peer's EP in order to send AM data. */ @@ -205,11 +212,13 @@ typedef struct uct_tcp_am_hdr { */ typedef enum uct_tcp_ep_am_id { /* AM ID reserved for TCP internal Connection Manager messages */ - UCT_TCP_EP_CM_AM_ID = UCT_AM_ID_MAX, + UCT_TCP_EP_CM_AM_ID = UCT_AM_ID_MAX, /* AM ID reserved for TCP internal PUT REQ message */ - UCT_TCP_EP_PUT_REQ_AM_ID = UCT_AM_ID_MAX + 1, + UCT_TCP_EP_PUT_REQ_AM_ID = UCT_AM_ID_MAX + 1, + /* AM ID reserved for TCP internal PUT ACK message */ + UCT_TCP_EP_PUT_ACK_AM_ID = UCT_AM_ID_MAX + 2, - /* AM ID reserved for TCP internal PUT ACK message */ - UCT_TCP_EP_PUT_ACK_AM_ID = UCT_AM_ID_MAX + 2 + /* AM ID reserved for TCP internal KEEPALIVE message */ + UCT_TCP_EP_KEEPALIVE_AM_ID = UCT_AM_ID_MAX + 3 } uct_tcp_ep_am_id_t; @@ -271,13 +280,46 @@ typedef struct uct_tcp_ep_zcopy_tx { } uct_tcp_ep_zcopy_tx_t; +/** + * TCP device address flags + */ +typedef enum uct_tcp_device_addr_flags { + /** + * Device address is extended by additional information: + * @ref uct_iface_local_addr_ns_t for loopback reachability + */ + UCT_TCP_DEVICE_ADDR_FLAG_LOOPBACK = UCS_BIT(0) +} uct_tcp_device_addr_flags_t; + + +/** + * TCP device address + */ +typedef struct uct_tcp_device_addr { + uint8_t flags; /* Flags of type @ref uct_tcp_device_addr_flags_t */ + uint8_t sa_family; /* Address family of packed address */ + /* The following packed fields follow: + * 1. in_addr/in6_addr structure in case of non-loopback interface + * 2. @ref uct_iface_local_addr_ns_t in case of loopback interface + */ +} UCS_S_PACKED uct_tcp_device_addr_t; + + +/** + * TCP iface address + */ +typedef struct uct_tcp_iface_addr { + uint16_t port; /* Listening port of iface */ +} UCS_S_PACKED uct_tcp_iface_addr_t; + + /** * TCP endpoint address */ typedef struct uct_tcp_ep_addr { - in_port_t iface_addr; /* Interface address */ - ucs_ptr_map_key_t ptr_map_key; /* PTR map key, used by EPs created with - * CONNECT_TO_EP method */ + uct_tcp_iface_addr_t iface_addr; /* TCP iface address */ + ucs_ptr_map_key_t ptr_map_key; /* PTR map key, used by EPs created with + * CONNECT_TO_EP method */ } UCS_S_PACKED uct_tcp_ep_addr_t; @@ -336,10 +378,10 @@ typedef struct uct_tcp_iface { size_t rx_seg_size; /* RX AM buffer size */ size_t sendv_thresh; /* Minimum size of user's payload from which * non-blocking vector send should be used */ - struct { - size_t max_iov; /* Maximum supported IOVs limited by + size_t max_iov; /* Maximum supported IOVs limited by * user configuration and service buffers * (TCP protocol and user's AM headers) */ + struct { size_t max_hdr; /* Maximum supported AM Zcopy header */ size_t hdr_offset; /* Offset in TX buffer to empty space that * can be used for AM Zcopy header */ @@ -356,6 +398,16 @@ typedef struct uct_tcp_iface { unsigned syn_cnt; /* Number of SYN retransmits that TCP should send * before aborting the attempt to connect. * It cannot exceed 255. */ + struct { + ucs_time_t idle; /* The time the connection needs to remain + * idle before TCP starts sending keepalive + * probes (TCP_KEEPIDLE socket option) */ + unsigned cnt; /* The maximum number of keepalive probes TCP + * should send before dropping the connection + * (TCP_KEEPCNT socket option).
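The three fields documented here map one-to-one onto Linux TCP socket options, which is also why the endpoint later converts ucs_time_t values to whole seconds before applying them. A hedged, standalone sketch of applying the options:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Apply idle/interval/count keepalive settings to a connected TCP socket
 * (Linux option names). Returns -1 as soon as one setsockopt() fails. */
static int tcp_keepalive_apply(int fd, int idle_sec, int intvl_sec, int cnt)
{
    const int on = 1;

    if ((setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle_sec,
                    sizeof(idle_sec)) != 0) ||
        (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl_sec,
                    sizeof(intvl_sec)) != 0) ||
        (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt)) != 0)) {
        return -1;
    }

    /* The per-option values take effect only once SO_KEEPALIVE itself is
     * enabled on the socket. */
    return setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
}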
*/ + ucs_time_t intvl; /* The time between individual keepalive + * probes (TCP_KEEPINTVL socket option). */ + } keepalive; } config; struct { @@ -386,6 +438,11 @@ typedef struct uct_tcp_iface_config { uct_iface_mpool_config_t tx_mpool; uct_iface_mpool_config_t rx_mpool; ucs_range_spec_t port_range; + struct { + ucs_time_t idle; + unsigned cnt; + ucs_time_t intvl; + } keepalive; } uct_tcp_iface_config_t; @@ -429,6 +486,10 @@ ucs_status_t uct_tcp_ep_init(uct_tcp_iface_t *iface, int fd, const struct sockaddr_in *dest_addr, uct_tcp_ep_t **ep_p); +ucs_status_t uct_tcp_ep_set_dest_addr(const uct_device_addr_t *dev_addr, + const uct_iface_addr_t *iface_addr, + struct sockaddr *dest_addr); + uint64_t uct_tcp_ep_get_cm_id(const uct_tcp_ep_t *ep); ucs_status_t uct_tcp_ep_create(const uct_ep_params_t *params, @@ -476,6 +537,9 @@ void uct_tcp_ep_pending_queue_dispatch(uct_tcp_ep_t *ep); ucs_status_t uct_tcp_ep_am_short(uct_ep_h uct_ep, uint8_t am_id, uint64_t header, const void *payload, unsigned length); +ucs_status_t uct_tcp_ep_am_short_iov(uct_ep_h uct_ep, uint8_t am_id, + const uct_iov_t *iov, size_t iovcnt); + ssize_t uct_tcp_ep_am_bcopy(uct_ep_h uct_ep, uint8_t am_id, uct_pack_callback_t pack_cb, void *arg, unsigned flags); @@ -498,6 +562,9 @@ void uct_tcp_ep_pending_purge(uct_ep_h tl_ep, uct_pending_purge_callback_t cb, ucs_status_t uct_tcp_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp); +ucs_status_t +uct_tcp_ep_check(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp); + ucs_status_t uct_tcp_cm_send_event(uct_tcp_ep_t *ep, uct_tcp_cm_conn_event_t event, int log_error); @@ -535,6 +602,8 @@ ucs_status_t uct_tcp_cm_handle_incoming_conn(uct_tcp_iface_t *iface, ucs_status_t uct_tcp_cm_conn_start(uct_tcp_ep_t *ep); +int uct_tcp_keepalive_is_enabled(uct_tcp_iface_t *iface); + static inline void uct_tcp_iface_outstanding_inc(uct_tcp_iface_t *iface) { iface->outstanding++; diff --git a/src/uct/tcp/tcp_cm.c b/src/uct/tcp/tcp_cm.c index 8014636c3e4..1b6d002636c 100644 --- a/src/uct/tcp/tcp_cm.c +++ b/src/uct/tcp/tcp_cm.c @@ -8,6 +8,7 @@ #endif #include "tcp.h" +#include "tcp/tcp.h" #include @@ -58,6 +59,7 @@ void uct_tcp_cm_change_conn_state(uct_tcp_ep_t *ep, } break; case UCT_TCP_EP_CONN_STATE_CLOSED: + ucs_assert(ep->events == 0); if (old_conn_state == UCT_TCP_EP_CONN_STATE_CLOSED) { return; } @@ -182,7 +184,7 @@ ucs_status_t uct_tcp_cm_send_event(uct_tcp_ep_t *ep, pkt_buf = ucs_alloca(pkt_length); pkt_hdr = (uct_tcp_am_hdr_t*)(UCS_PTR_BYTE_OFFSET(pkt_buf, magic_number_length)); - pkt_hdr->am_id = UCT_AM_ID_MAX; + pkt_hdr->am_id = UCT_TCP_EP_CM_AM_ID; pkt_hdr->length = cm_pkt_length; if (event == UCT_TCP_CM_CONN_REQ) { @@ -421,7 +423,7 @@ uct_tcp_cm_simult_conn_accept_remote_conn(uct_tcp_ep_t *accept_ep, ucs_assertv(connect_ep->events == 0, "Requested epoll events must be 0-ed for ep=%p", connect_ep); - close(connect_ep->fd); + ucs_close_fd(&connect_ep->fd); connect_ep->fd = accept_ep->fd; /* 2. 
Migrate RX from the EP allocated during accepting connection to @@ -599,7 +601,6 @@ uct_tcp_cm_handle_conn_req(uct_tcp_ep_t **ep_p, uct_tcp_cm_insert_ep(iface, ep); } -out_connect_ep: uct_tcp_cm_change_conn_state(ep, UCT_TCP_EP_CONN_STATE_CONNECTED); return 1; diff --git a/src/uct/tcp/tcp_ep.c b/src/uct/tcp/tcp_ep.c index 4a504f33ce9..a247e21b96f 100644 --- a/src/uct/tcp/tcp_ep.c +++ b/src/uct/tcp/tcp_ep.c @@ -8,6 +8,7 @@ #endif #include "tcp.h" +#include "tcp/tcp.h" #include @@ -16,7 +17,7 @@ static unsigned uct_tcp_ep_progress_data_tx(void *arg); static unsigned uct_tcp_ep_progress_data_rx(void *arg); static unsigned uct_tcp_ep_progress_magic_number_rx(void *arg); -static unsigned uct_tcp_ep_failed_progress(void *arg); +static unsigned uct_tcp_ep_destroy_progress(void *arg); const uct_tcp_cm_state_t uct_tcp_ep_cm_state[] = { [UCT_TCP_EP_CONN_STATE_CLOSED] = { @@ -136,8 +137,6 @@ int uct_tcp_ep_is_self(const uct_tcp_ep_t *ep) static void uct_tcp_ep_cleanup(uct_tcp_ep_t *ep) { - uct_tcp_ep_addr_cleanup(&ep->peer_addr); - if (ep->tx.buf != NULL) { uct_tcp_ep_ctx_reset(&ep->tx); } @@ -151,14 +150,24 @@ static void uct_tcp_ep_cleanup(uct_tcp_ep_t *ep) ucs_close_fd(&ep->stale_fd); } +static void uct_tcp_ep_ptr_map_verify(uct_tcp_ep_t *ep, int on_ptr_map) +{ + ucs_assert(ep->flags & UCT_TCP_EP_FLAG_CONNECT_TO_EP); + if (on_ptr_map) { + ucs_assert(ep->flags & UCT_TCP_EP_FLAG_ON_PTR_MAP); + } else { + ucs_assert(!(ep->flags & UCT_TCP_EP_FLAG_ON_PTR_MAP)); + } + ucs_assert(!(ep->flags & UCT_TCP_EP_FLAG_ON_MATCH_CTX)); +} + static void uct_tcp_ep_ptr_map_add(uct_tcp_ep_t *ep) { uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_tcp_iface_t); ucs_status_t status; - ucs_assert(ep->flags & UCT_TCP_EP_FLAG_CONNECT_TO_EP); - ucs_assert(!(ep->flags & UCT_TCP_EP_FLAG_ON_MATCH_CTX)); + uct_tcp_ep_ptr_map_verify(ep, 0); status = ucs_ptr_map_put(&iface->ep_ptr_map, ep, 1, &ep->cm_id.ptr_map_key); @@ -167,35 +176,55 @@ static void uct_tcp_ep_ptr_map_add(uct_tcp_ep_t *ep) ep->flags |= UCT_TCP_EP_FLAG_ON_PTR_MAP; } +static void uct_tcp_ep_ptr_map_removed(uct_tcp_ep_t *ep) +{ + uct_tcp_ep_ptr_map_verify(ep, 1); + ep->flags &= ~UCT_TCP_EP_FLAG_ON_PTR_MAP; +} + static void uct_tcp_ep_ptr_map_del(uct_tcp_ep_t *ep) { uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, uct_tcp_iface_t); ucs_status_t status; - ucs_assert(ep->flags & UCT_TCP_EP_FLAG_CONNECT_TO_EP); - ucs_assert(ep->flags & UCT_TCP_EP_FLAG_ON_PTR_MAP); - ucs_assert(!(ep->flags & UCT_TCP_EP_FLAG_ON_MATCH_CTX)); - status = ucs_ptr_map_del(&iface->ep_ptr_map, ep->cm_id.ptr_map_key); ucs_assert_always(status == UCS_OK); + uct_tcp_ep_ptr_map_removed(ep); +} - ep->flags &= ~UCT_TCP_EP_FLAG_ON_PTR_MAP; +static uct_tcp_ep_t * +uct_tcp_ep_ptr_map_get(uct_tcp_iface_t *iface, ucs_ptr_map_key_t ptr_map_key) +{ + ucs_status_t status; + uct_tcp_ep_t *ep; + void *ptr; + + status = ucs_ptr_map_get(&iface->ep_ptr_map, ptr_map_key, 0, &ptr); + if (ucs_likely(status == UCS_OK)) { + ep = ptr; + uct_tcp_ep_ptr_map_verify(ep, 1); + return ep; + } + + return NULL; } -uct_tcp_ep_t* uct_tcp_ep_ptr_map_retrieve(uct_tcp_iface_t *iface, +uct_tcp_ep_t *uct_tcp_ep_ptr_map_retrieve(uct_tcp_iface_t *iface, ucs_ptr_map_key_t ptr_map_key) { + ucs_status_t status; uct_tcp_ep_t *ep; + void *ptr; - ep = ucs_ptr_map_get(&iface->ep_ptr_map, ptr_map_key); - if (ep != NULL) { - ucs_assert(ep->flags & UCT_TCP_EP_FLAG_ON_PTR_MAP); - ucs_assert(!(ep->flags & UCT_TCP_EP_FLAG_ON_MATCH_CTX)); - uct_tcp_ep_ptr_map_del(ep); + status = ucs_ptr_map_get(&iface->ep_ptr_map, 
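uct_tcp_ep_ptr_map_get() and uct_tcp_ep_ptr_map_retrieve() differ only in the extract argument of the ucs_ptr_map_get() call being completed here: 0 peeks at the entry, 1 removes it and transfers ownership to the caller. A toy single-slot map showing the same dual-mode contract (simplified; the real map hashes many keys):

#include <stddef.h>

/* Toy single-slot map illustrating the peek-vs-extract lookup contract. */
typedef struct {
    unsigned long key;
    void         *value;
    int           in_use;
} toy_map_t;

static void *toy_map_get(toy_map_t *m, unsigned long key, int extract)
{
    void *value;

    if (!m->in_use || (m->key != key)) {
        return NULL; /* corresponds to a failed ucs_ptr_map_get() */
    }

    value = m->value;
    if (extract) {
        m->in_use = 0; /* retrieve(): the entry leaves the map for good */
    }
    return value;
}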
ptr_map_key, 1, &ptr); + if (ucs_likely(status == UCS_OK)) { + ep = ptr; + uct_tcp_ep_ptr_map_removed(ep); + return ep; } - return ep; + return NULL; } static UCS_CLASS_INIT_FUNC(uct_tcp_ep_t, uct_tcp_iface_t *iface, @@ -283,7 +312,7 @@ uct_tcp_ep_failed_remove_filter(const ucs_callbackq_elem_t *elem, void *arg) uct_tcp_ep_t *ep = (uct_tcp_ep_t*)arg; ucs_assert(ep->flags & UCT_TCP_EP_FLAG_FAILED); - return (elem->cb == uct_tcp_ep_failed_progress) && (elem->arg == ep); + return (elem->cb == uct_tcp_ep_destroy_progress) && (elem->arg == ep); } static int @@ -295,11 +324,57 @@ uct_tcp_ep_progress_rx_remove_filter(const ucs_callbackq_elem_t *elem, return (elem->cb == uct_tcp_ep_progress_data_rx) && (elem->arg == ep); } +static UCS_F_ALWAYS_INLINE void +uct_tcp_ep_tx_started(uct_tcp_ep_t *ep, const uct_tcp_am_hdr_t *hdr) +{ + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + + ep->tx.length += sizeof(*hdr) + hdr->length; + iface->outstanding += ep->tx.length; +} + +static UCS_F_ALWAYS_INLINE void +uct_tcp_ep_tx_completed(uct_tcp_ep_t *ep, size_t sent_length) +{ + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + + iface->outstanding -= sent_length; + ep->tx.offset += sent_length; +} + +static UCS_F_ALWAYS_INLINE void +uct_tcp_ep_zcopy_completed(uct_tcp_ep_t *ep, uct_completion_t *comp, + ucs_status_t status) +{ + ep->flags &= ~UCT_TCP_EP_FLAG_ZCOPY_TX; + if (comp != NULL) { + uct_invoke_completion(comp, status); + } +} + +static void uct_tcp_ep_purge(uct_tcp_ep_t *ep) +{ + uct_tcp_ep_put_completion_t *put_comp; + uct_tcp_ep_zcopy_tx_t *ctx; + + if (ep->flags & UCT_TCP_EP_FLAG_ZCOPY_TX) { + ctx = (uct_tcp_ep_zcopy_tx_t*)ep->tx.buf; + uct_tcp_ep_zcopy_completed(ep, ctx->comp, UCS_ERR_CANCELED); + uct_tcp_ep_tx_completed(ep, ep->tx.length - ep->tx.offset); + } + + ucs_queue_for_each_extract(put_comp, &ep->put_comp_q, elem, 1) { + uct_invoke_completion(put_comp->comp, UCS_ERR_CANCELED); + ucs_mpool_put_inline(put_comp); + } +} + static UCS_CLASS_CLEANUP_FUNC(uct_tcp_ep_t) { uct_tcp_iface_t *iface = ucs_derived_of(self->super.super.iface, uct_tcp_iface_t); - uct_tcp_ep_put_completion_t *put_comp; if (self->flags & UCT_TCP_EP_FLAG_ON_MATCH_CTX) { uct_tcp_cm_remove_ep(iface, self); @@ -312,10 +387,7 @@ static UCS_CLASS_CLEANUP_FUNC(uct_tcp_ep_t) } uct_tcp_ep_remove_ctx_cap(self, UCT_TCP_EP_CTX_CAPS); - - ucs_queue_for_each_extract(put_comp, &self->put_comp_q, elem, 1) { - ucs_free(put_comp); - } + uct_tcp_ep_purge(self); if (self->flags & UCT_TCP_EP_FLAG_FAILED) { /* a failed EP callback can be still scheduled on the UCT worker, @@ -328,8 +400,9 @@ static UCS_CLASS_CLEANUP_FUNC(uct_tcp_ep_t) ucs_callbackq_remove_if(&iface->super.worker->super.progress_q, uct_tcp_ep_progress_rx_remove_filter, self); - uct_tcp_cm_change_conn_state(self, UCT_TCP_EP_CONN_STATE_CLOSED); uct_tcp_ep_cleanup(self); + uct_tcp_cm_change_conn_state(self, UCT_TCP_EP_CONN_STATE_CLOSED); + uct_tcp_ep_addr_cleanup(&self->peer_addr); ucs_debug("tcp_ep %p: destroyed on iface %p", self, iface); } @@ -357,31 +430,23 @@ void uct_tcp_ep_destroy(uct_ep_h tl_ep) uct_tcp_cm_remove_ep(iface, ep); /* remove TX capability, but still will be able to receive data */ uct_tcp_ep_remove_ctx_cap(ep, UCT_TCP_EP_FLAG_CTX_TYPE_TX); + /* purge all outstanding operations (GET/PUT Zcopy, flush operations) */ + uct_tcp_ep_purge(ep); uct_tcp_cm_insert_ep(iface, ep); } else { uct_tcp_ep_destroy_internal(tl_ep); } } -static unsigned uct_tcp_ep_failed_progress(void *arg) +static unsigned 
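The relocated uct_tcp_ep_tx_started()/uct_tcp_ep_tx_completed() helpers keep the per-iface outstanding-byte counter in lockstep with the endpoint's TX window; the iface flush logic effectively waits for that counter to drain. A simplified sketch of the bookkeeping, assuming the TX buffer was empty when the message was posted:

#include <stddef.h>

/* Stand-in types for the TX accounting above. */
typedef struct {
    size_t outstanding; /* bytes posted but not yet written to sockets */
} iface_acct_t;

typedef struct {
    size_t length; /* total bytes of the message being sent */
    size_t offset; /* bytes already pushed into the socket */
} ep_tx_t;

static void tx_started(iface_acct_t *iface, ep_tx_t *tx, size_t total)
{
    tx->length         += total;
    iface->outstanding += total;
}

static void tx_completed(iface_acct_t *iface, ep_tx_t *tx, size_t sent)
{
    iface->outstanding -= sent;
    tx->offset         += sent;
}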
uct_tcp_ep_destroy_progress(void *arg) { - uct_tcp_ep_t *ep = (uct_tcp_ep_t*)arg; - uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, - uct_tcp_iface_t); + uct_tcp_ep_t *ep = (uct_tcp_ep_t*)arg; + ucs_assert(!(ep->flags & UCT_TCP_EP_FLAG_CTX_TYPE_TX)); ucs_assert(ep->flags & UCT_TCP_EP_FLAG_FAILED); /* Reset FAILED flag to not remove callback in the EP destructor */ ep->flags &= ~UCT_TCP_EP_FLAG_FAILED; - - if (ep->flags & UCT_TCP_EP_FLAG_CTX_TYPE_TX) { - uct_tcp_cm_change_conn_state(ep, UCT_TCP_EP_CONN_STATE_CLOSED); - uct_set_ep_failed(&UCS_CLASS_NAME(uct_tcp_ep_t), - &ep->super.super, &iface->super.super, - UCS_ERR_ENDPOINT_TIMEOUT); - } else { - uct_tcp_ep_destroy_internal(&ep->super.super); - } - + uct_tcp_ep_destroy_internal(&ep->super.super); return 1; } @@ -401,10 +466,19 @@ void uct_tcp_ep_set_failed(uct_tcp_ep_t *ep) } uct_tcp_ep_mod_events(ep, 0, ep->events); - ep->flags |= UCT_TCP_EP_FLAG_FAILED; - uct_worker_progress_register_safe(&iface->super.worker->super, - uct_tcp_ep_failed_progress, ep, - UCS_CALLBACKQ_FLAG_ONESHOT, &cb_id); + + if (ep->flags & UCT_TCP_EP_FLAG_CTX_TYPE_TX) { + ucs_debug("tcp_ep %p: calling error handler (flags: %x)", ep, + ep->flags); + uct_tcp_cm_change_conn_state(ep, UCT_TCP_EP_CONN_STATE_CLOSED); + uct_iface_handle_ep_err(ep->super.super.iface, &ep->super.super, + UCS_ERR_ENDPOINT_TIMEOUT); + } else { + ep->flags |= UCT_TCP_EP_FLAG_FAILED; + uct_worker_progress_register_safe(&iface->super.worker->super, + uct_tcp_ep_destroy_progress, ep, + UCS_CALLBACKQ_FLAG_ONESHOT, &cb_id); + } } static inline void uct_tcp_ep_ctx_move(uct_tcp_ep_ctx_t *to_ctx, @@ -418,6 +492,60 @@ static inline void uct_tcp_ep_ctx_move(uct_tcp_ep_ctx_t *to_ctx, memset(from_ctx, 0, sizeof(*from_ctx)); } +static int uct_tcp_ep_time_seconds(ucs_time_t time_val, int auto_val) +{ + if (time_val == UCS_TIME_AUTO) { + return auto_val; + } + + return ucs_max(1, (int)ucs_time_to_sec(time_val)); +} + +static ucs_status_t uct_tcp_ep_keepalive_enable(uct_tcp_ep_t *ep) +{ +#ifdef UCT_TCP_EP_KEEPALIVE + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + const int optval = 1; + int idle_sec; + int intvl_sec; + ucs_status_t status; + + if (!uct_tcp_keepalive_is_enabled(iface)) { + return UCS_OK; + } + + idle_sec = uct_tcp_ep_time_seconds(iface->config.keepalive.idle, + UCT_TCP_EP_DEFAULT_KEEPALIVE_IDLE); + intvl_sec = uct_tcp_ep_time_seconds(iface->config.keepalive.intvl, + UCT_TCP_EP_DEFAULT_KEEPALIVE_INTVL); + + status = ucs_socket_setopt(ep->fd, IPPROTO_TCP, TCP_KEEPINTVL, + &intvl_sec, sizeof(intvl_sec)); + if (status != UCS_OK) { + return status; + } + + status = ucs_socket_setopt(ep->fd, IPPROTO_TCP, TCP_KEEPCNT, + &iface->config.keepalive.cnt, + sizeof(iface->config.keepalive.cnt)); + if (status != UCS_OK) { + return status; + } + + status = ucs_socket_setopt(ep->fd, IPPROTO_TCP, TCP_KEEPIDLE, + &idle_sec, sizeof(idle_sec)); + if (status != UCS_OK) { + return status; + } + + return ucs_socket_setopt(ep->fd, SOL_SOCKET, SO_KEEPALIVE, + &optval, sizeof(optval)); +#else /* UCT_TCP_EP_KEEPALIVE */ + return UCS_OK; +#endif /* UCT_TCP_EP_KEEPALIVE */ +} + static ucs_status_t uct_tcp_ep_create_socket_and_connect(uct_tcp_ep_t *ep) { uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, @@ -435,6 +563,11 @@ static ucs_status_t uct_tcp_ep_create_socket_and_connect(uct_tcp_ep_t *ep) goto err; } + status = uct_tcp_ep_keepalive_enable(ep); + if (status != UCS_OK) { + goto err; + } + status = uct_tcp_cm_conn_start(ep); if (status != UCS_OK) { goto 
err; @@ -529,7 +662,7 @@ static ucs_status_t uct_tcp_ep_connect(uct_tcp_ep_t *ep) } } else { /* EP that connects to self or EP created using CONNECT_TO_EP mustn't - * go here and always create socket and conenct to a peer */ + * go here and always create socket and connect to a peer */ ucs_assert(!uct_tcp_ep_is_self(ep) && !(ep->flags & UCT_TCP_EP_FLAG_CONNECT_TO_EP)); ucs_assert((peer_ep != NULL) && (peer_ep->fd != -1) && @@ -563,17 +696,36 @@ static ucs_status_t uct_tcp_ep_connect(uct_tcp_ep_t *ep) return UCS_OK; } -void uct_tcp_ep_set_dest_addr(const uct_device_addr_t *dev_addr, - const uct_iface_addr_t *iface_addr, - struct sockaddr_in *dest_addr) +ucs_status_t uct_tcp_ep_set_dest_addr(const uct_device_addr_t *dev_addr, + const uct_iface_addr_t *iface_addr, + struct sockaddr *dest_addr) { + uct_tcp_device_addr_t *tcp_dev_addr = (uct_tcp_device_addr_t*)dev_addr; + uct_tcp_iface_addr_t *tcp_iface_addr = (uct_tcp_iface_addr_t*)iface_addr; + const struct in_addr loopback_addr = { + .s_addr = htonl(INADDR_LOOPBACK) + }; + const void *in_addr; + ucs_status_t status; + memset(dest_addr, 0, sizeof(*dest_addr)); + + if (tcp_dev_addr->flags & UCT_TCP_DEVICE_ADDR_FLAG_LOOPBACK) { + in_addr = &loopback_addr; + } else { + in_addr = tcp_dev_addr + 1; + } + /* TODO: handle AF_INET6 */ - dest_addr->sin_family = AF_INET; - dest_addr->sin_port = *(const in_port_t*)iface_addr; - dest_addr->sin_addr = *(const struct in_addr*) - ucs_sockaddr_get_inet_addr((const struct sockaddr*) - dev_addr); + dest_addr->sa_family = tcp_dev_addr->sa_family; + ucs_assert(dest_addr->sa_family == AF_INET); + + status = ucs_sockaddr_set_inet_addr(dest_addr, in_addr); + if (status != UCS_OK) { + return status; + } + + return ucs_sockaddr_set_port(dest_addr, ntohs(tcp_iface_addr->port)); } uint64_t uct_tcp_ep_get_cm_id(const uct_tcp_ep_t *ep) @@ -594,8 +746,12 @@ ucs_status_t uct_tcp_ep_create(const uct_ep_params_t *params, uct_ep_h *ep_p) if (ucs_test_all_flags(params->field_mask, UCT_EP_PARAM_FIELD_DEV_ADDR | UCT_EP_PARAM_FIELD_IFACE_ADDR)) { - uct_tcp_ep_set_dest_addr(params->dev_addr, params->iface_addr, - &dest_addr); + status = uct_tcp_ep_set_dest_addr(params->dev_addr, params->iface_addr, + (struct sockaddr*)&dest_addr); + if (status != UCS_OK) { + return status; + } + ep_dest_addr = &dest_addr; } @@ -625,6 +781,7 @@ ucs_status_t uct_tcp_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *ep_addr) ucs_assert(ep->flags & UCT_TCP_EP_FLAG_CONNECT_TO_EP); addr->ptr_map_key = ep->cm_id.ptr_map_key; + return uct_iface_get_address(tl_ep->iface, (uct_iface_addr_t*)&addr->iface_addr); } @@ -637,6 +794,7 @@ ucs_status_t uct_tcp_ep_connect_to_ep(uct_ep_h tl_ep, uct_tcp_iface_t UCS_V_UNUSED *iface = ucs_derived_of(ep->super.super.iface, uct_tcp_iface_t); uct_tcp_ep_addr_t *addr = (uct_tcp_ep_addr_t*)ep_addr; + ucs_status_t status; ucs_assert(ep->flags & UCT_TCP_EP_FLAG_CONNECT_TO_EP); @@ -648,19 +806,37 @@ ucs_status_t uct_tcp_ep_connect_to_ep(uct_ep_h tl_ep, return UCS_OK; } - uct_tcp_ep_set_dest_addr(dev_addr, (uct_iface_addr_t*)&addr->iface_addr, - &ep->peer_addr); + if (uct_tcp_ep_ptr_map_get(iface, ep->cm_id.ptr_map_key) == NULL) { + /* If the EP doesn't exist anymore in the local EP PTR MAP, it means the + * EP was already connected to a peer's EP and removed from the EP PTR + * MAP as a part of CONN_REQ handling (only not connected yet EPs are + * contained in a PTR MAP) and then disconnected before a user called + * uct_ep_connect_to_ep() for the EP. 
+ * So, just report to the caller that the connection was already reset */ + ucs_assert(uct_tcp_cm_ep_accept_conn(ep)); + ucs_assert(ep->conn_state == UCT_TCP_EP_CONN_STATE_CLOSED); + ucs_assert(ep->conn_retries > 0); + return UCS_ERR_CONNECTION_RESET; + } + + status = uct_tcp_ep_set_dest_addr(dev_addr, + (uct_iface_addr_t*)&addr->iface_addr, + (struct sockaddr*)&ep->peer_addr); + if (status != UCS_OK) { + return status; + } if (!uct_tcp_cm_ep_accept_conn(ep)) { ucs_assert(ep->conn_state == UCT_TCP_EP_CONN_STATE_CLOSED); - /* EP that are created as CONNECT_TO_EP has to be full-duplex, set RX + /* EPs which are created as CONNECT_TO_EP have to be full-duplex, set RX * capability as well as TX (that's set in uct_tcp_ep_connect()) */ uct_tcp_ep_add_ctx_cap(ep, UCT_TCP_EP_FLAG_CTX_TYPE_RX); uct_tcp_ep_ptr_map_del(ep); - /* Use remote peer connection sequence number value, since the EP has to - * send the CONN_REQ to the peer has to find its EP in the EP PTR map */ + /* Use the peer's EP PTR map key, since the EP has to send the CONN_REQ + * to the peer which has to find an EP created for this connection in + * the EP PTR map */ ep->cm_id.ptr_map_key = addr->ptr_map_key; return uct_tcp_ep_connect(ep); } @@ -721,7 +897,7 @@ static inline void uct_tcp_ep_handle_put_ack(uct_tcp_ep_t *ep, (UCS_CIRCULAR_COMPARE32(put_comp->wait_put_sn, <=, put_ack->sn))) { uct_invoke_completion(put_comp->comp, UCS_OK); - ucs_free(put_comp); + ucs_mpool_put_inline(put_comp); } } @@ -737,36 +913,6 @@ void uct_tcp_ep_pending_queue_dispatch(uct_tcp_ep_t *ep) { } } -static UCS_F_ALWAYS_INLINE void -uct_tcp_ep_tx_started(uct_tcp_ep_t *ep, const uct_tcp_am_hdr_t *hdr) -{ - uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, - uct_tcp_iface_t); - - ep->tx.length += sizeof(*hdr) + hdr->length; - iface->outstanding += ep->tx.length; -} - -static UCS_F_ALWAYS_INLINE void -uct_tcp_ep_tx_completed(uct_tcp_ep_t *ep, size_t sent_length) -{ - uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, - uct_tcp_iface_t); - - iface->outstanding -= sent_length; - ep->tx.offset += sent_length; -} - -static UCS_F_ALWAYS_INLINE void -uct_tcp_ep_zcopy_completed(uct_tcp_ep_t *ep, uct_completion_t *comp, - ucs_status_t status) -{ - ep->flags &= ~UCT_TCP_EP_FLAG_ZCOPY_TX; - if (comp != NULL) { - uct_invoke_completion(comp, status); - } -} - static void uct_tcp_ep_handle_disconnected(uct_tcp_ep_t *ep, ucs_status_t status) { uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, @@ -1245,6 +1391,9 @@ static unsigned uct_tcp_ep_progress_am_rx(uct_tcp_ep_t *ep) ucs_assert(hdr->length == sizeof(uint32_t)); uct_tcp_ep_handle_put_ack(ep, (uct_tcp_ep_put_ack_hdr_t*)(hdr + 1)); handled++; + } else if (hdr->am_id == UCT_TCP_EP_KEEPALIVE_AM_ID) { + /* just ignore keepalive requests */ + handled++; } else { ucs_assert(hdr->am_id == UCT_TCP_EP_CM_AM_ID); handled += 1 + uct_tcp_cm_handle_conn_pkt(&ep, hdr + 1, hdr->length); @@ -1435,17 +1584,17 @@ uct_tcp_ep_am_send(uct_tcp_ep_t *ep, const uct_tcp_am_hdr_t *hdr) static const void* uct_tcp_ep_am_sendv_get_trace_payload(uct_tcp_am_hdr_t *hdr, const void *header, - const struct iovec *payload_iov, + const struct iovec *iov, size_t iov_cnt, int short_sendv) { if (short_sendv == 0) { return header; } - /* If user requested trace data, we copy header and payload + /* If user requested trace data, we copy iov * to EP TX buffer in order to trace correct data */ - uct_am_short_fill_data(hdr + 1, *(const uint64_t*)header, - payload_iov->iov_base, payload_iov->iov_len); + ucs_iov_copy(iov + 1,
iov_cnt - 1, 0, hdr + 1, SIZE_MAX, UCS_IOV_COPY_TO_BUF); + return (hdr + 1); } @@ -1474,8 +1623,8 @@ uct_tcp_ep_am_sendv(uct_tcp_ep_t *ep, int short_sendv, uct_tcp_am_hdr_t *hdr, uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, hdr->am_id, /* the function will be invoked only in case of * data tracing is enabled */ - uct_tcp_ep_am_sendv_get_trace_payload(hdr, header, - &iov[2], short_sendv), + uct_tcp_ep_am_sendv_get_trace_payload(hdr, header, iov, + iov_cnt, short_sendv), hdr->length, "SEND: ep %p fd %d sent %zu/%zu bytes, " "moved by offset %zu, iov cnt %zu " "[addr %p len %zu] [addr %p len %zu]", @@ -1530,6 +1679,33 @@ static void uct_tcp_ep_post_put_ack(uct_tcp_ep_t *ep) ep->flags &= ~UCT_TCP_EP_FLAG_PUT_RX_SENDING_ACK; } +static inline ucs_status_t +uct_tcp_ep_am_short_sendv(uct_tcp_ep_t *ep, uct_tcp_iface_t *iface, + uct_tcp_am_hdr_t *hdr, uint64_t header, struct iovec *iov, + size_t iov_cnt) +{ + ucs_status_t status; + size_t offset; + + status = uct_tcp_ep_am_sendv(ep, 1, hdr, iface->config.tx_seg_size, &header, iov, + iov_cnt); + if (ucs_unlikely(status != UCS_OK)) { + return status; + } + + if (uct_tcp_ep_ctx_buf_need_progress(&ep->tx)) { + /* Copy only user's header and payload to the TX buffer, + * TCP AM header is placed at the beginning of the buffer */ + offset = (ep->tx.offset >= sizeof(*hdr)) ? ep->tx.offset - sizeof(*hdr) : 0; + + ucs_iov_copy(&iov[1], iov_cnt - 1, offset, + UCS_PTR_BYTE_OFFSET(hdr + 1, offset), + ep->tx.length - sizeof(*hdr) - offset, UCS_IOV_COPY_TO_BUF); + } + + return status; +} + ucs_status_t uct_tcp_ep_am_short(uct_ep_h uct_ep, uint8_t am_id, uint64_t header, const void *payload, unsigned length) { @@ -1538,7 +1714,6 @@ ucs_status_t uct_tcp_ep_am_short(uct_ep_h uct_ep, uint8_t am_id, uint64_t header uct_tcp_am_hdr_t *hdr = NULL; struct iovec iov[UCT_TCP_EP_AM_SHORTV_IOV_COUNT]; uint32_t UCS_V_UNUSED payload_length; - size_t offset; ucs_status_t status; UCT_CHECK_LENGTH(length + sizeof(header), 0, @@ -1560,11 +1735,6 @@ ucs_status_t uct_tcp_ep_am_short(uct_ep_h uct_ep, uint8_t am_id, uint64_t header if (length <= iface->config.sendv_thresh) { uct_am_short_fill_data(hdr + 1, header, payload, length); status = uct_tcp_ep_am_send(ep, hdr); - if (ucs_unlikely(status != UCS_OK)) { - return status; - } - - UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, payload_length); } else { iov[0].iov_base = hdr; iov[0].iov_len = sizeof(*hdr); @@ -1575,27 +1745,57 @@ ucs_status_t uct_tcp_ep_am_short(uct_ep_h uct_ep, uint8_t am_id, uint64_t header iov[2].iov_base = (void*)payload; iov[2].iov_len = length; - status = uct_tcp_ep_am_sendv(ep, 1, hdr, iface->config.tx_seg_size, - &header, iov, UCT_TCP_EP_AM_SHORTV_IOV_COUNT); - if (ucs_unlikely(status != UCS_OK)) { - return status; - } + status = uct_tcp_ep_am_short_sendv(ep, iface, hdr, header, iov, + UCT_TCP_EP_AM_SHORTV_IOV_COUNT); + } - UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, payload_length); + if (ucs_unlikely(status != UCS_OK)) { + return status; + } - if (uct_tcp_ep_ctx_buf_need_progress(&ep->tx)) { - /* Copy only user's header and payload to the TX buffer, - * TCP AM header is placed at the beginning of the buffer */ - offset = ((ep->tx.offset >= sizeof(*hdr)) ? 
- (ep->tx.offset - sizeof(*hdr)) : 0); + UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, payload_length); - ucs_iov_copy(&iov[1], UCT_TCP_EP_AM_SHORTV_IOV_COUNT - 1, - offset, UCS_PTR_BYTE_OFFSET(hdr + 1, offset), - ep->tx.length - sizeof(*hdr) - offset, - UCS_IOV_COPY_TO_BUF); - } + return status; +} + +ucs_status_t uct_tcp_ep_am_short_iov(uct_ep_h uct_ep, uint8_t am_id, + const uct_iov_t *uct_iov, size_t uct_iov_cnt) +{ + uct_tcp_ep_t *ep = ucs_derived_of(uct_ep, uct_tcp_ep_t); + uct_tcp_iface_t *iface = ucs_derived_of(uct_ep->iface, uct_tcp_iface_t); + uct_tcp_am_hdr_t *hdr = NULL; + struct iovec *iov = ucs_alloca((uct_iov_cnt + 1) * sizeof(*iov)); + ucs_iov_iter_t uct_iov_iter; + size_t UCS_V_UNUSED payload_length; + ucs_status_t status; + + UCT_CHECK_AM_ID(am_id); + UCT_CHECK_IOV_SIZE(uct_iov_cnt, iface->config.max_iov, "am_short_iov"); + UCT_CHECK_LENGTH(uct_iov_total_length(uct_iov, uct_iov_cnt), 0, + iface->config.tx_seg_size - sizeof(uct_tcp_am_hdr_t), + "am_short_iov"); + + status = uct_tcp_ep_am_prepare(iface, ep, am_id, &hdr); + if (status != UCS_OK) { + return status; } + ucs_assertv(hdr != NULL, "ep=%p", ep); + + ucs_iov_iter_init(&uct_iov_iter); + iov[0].iov_base = hdr; + iov[0].iov_len = sizeof(*hdr); + hdr->length = payload_length = uct_iov_to_iovec(&iov[1], &uct_iov_cnt, + uct_iov, uct_iov_cnt, + SIZE_MAX, &uct_iov_iter); + status = uct_tcp_ep_am_short_sendv(ep, iface, hdr, 0, iov, + uct_iov_cnt + 1); + if (ucs_unlikely(status != UCS_OK)) { + return status; + } + + UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, payload_length); + return status; } @@ -1644,7 +1844,7 @@ uct_tcp_ep_prepare_zcopy(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep, uint8_t am_id uct_tcp_ep_zcopy_tx_t *ctx; ucs_status_t status; - UCT_CHECK_IOV_SIZE(iovcnt, iface->config.zcopy.max_iov, name); + UCT_CHECK_IOV_SIZE(iovcnt, iface->config.max_iov, name); UCT_CHECK_LENGTH(header_length, 0, iface->config.zcopy.max_hdr, name); status = uct_tcp_ep_am_prepare(iface, ep, am_id, &hdr); @@ -1672,7 +1872,7 @@ uct_tcp_ep_prepare_zcopy(uct_tcp_iface_t *iface, uct_tcp_ep_t *ep, uint8_t am_id /* User-defined payload */ ucs_iov_iter_init(&uct_iov_iter); - io_vec_cnt = iovcnt; + io_vec_cnt = iovcnt; *zcopy_payload_p = uct_iov_to_iovec(&ctx->iov[ctx->iov_cnt], &io_vec_cnt, iov, iovcnt, SIZE_MAX, &uct_iov_iter); *ctx_p = ctx; @@ -1723,12 +1923,38 @@ ucs_status_t uct_tcp_ep_am_zcopy(uct_ep_h uct_ep, uint8_t am_id, const void *hea return UCS_OK; } +static UCS_F_ALWAYS_INLINE ucs_status_t +uct_tcp_ep_put_comp_add(uct_tcp_ep_t *ep, uct_completion_t *comp, int wait_sn) +{ + uct_tcp_iface_t *iface = ucs_derived_of(ep->super.super.iface, + uct_tcp_iface_t); + uct_tcp_ep_put_completion_t *put_comp; + + if (comp == NULL) { + return UCS_OK; + } + + put_comp = ucs_mpool_get_inline(&iface->tx_mpool); + if (ucs_unlikely(put_comp == NULL)) { + ucs_error("tcp_ep %p: unable to allocate PUT completion from mpool", + ep); + return UCS_ERR_NO_MEMORY; + } + + put_comp->wait_put_sn = ep->tx.put_sn; + put_comp->comp = comp; + ucs_queue_push(&ep->put_comp_q, &put_comp->elem); + + return UCS_OK; +} + ucs_status_t uct_tcp_ep_put_zcopy(uct_ep_h uct_ep, const uct_iov_t *iov, size_t iovcnt, uint64_t remote_addr, uct_rkey_t rkey, uct_completion_t *comp) { uct_tcp_ep_t *ep = ucs_derived_of(uct_ep, uct_tcp_ep_t); - uct_tcp_iface_t *iface = ucs_derived_of(uct_ep->iface, uct_tcp_iface_t); + uct_tcp_iface_t *iface = ucs_derived_of(uct_ep->iface, + uct_tcp_iface_t); uct_tcp_ep_zcopy_tx_t *ctx = NULL; uct_tcp_ep_put_req_hdr_t put_req = {0}; /* Suppress Cppcheck 
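PUT completions are queued together with the sequence number they wait for, and uct_tcp_ep_handle_put_ack() releases every entry whose wait_put_sn is at or below the acknowledged sn using a wraparound-safe comparison. The same idea in isolation:

#include <stdint.h>

/* Wraparound-safe "a <= b" for 32-bit sequence numbers, in the spirit of
 * the UCS_CIRCULAR_COMPARE32 check above: the signed difference preserves
 * ordering across overflow, so 0xfffffffeu still precedes 0x1u. */
static inline int seq32_leq(uint32_t a, uint32_t b)
{
    return (int32_t)(a - b) <= 0;
}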
false-positive */ ucs_status_t status; @@ -1773,13 +1999,17 @@ ucs_status_t uct_tcp_ep_put_zcopy(uct_ep_h uct_ep, const uct_iov_t *iov, UCT_TL_EP_STAT_OP(&ep->super, PUT, ZCOPY, put_req.length); + status = uct_tcp_ep_put_comp_add(ep, comp, put_req.sn); + if (ucs_unlikely(status != UCS_OK)) { + return status; + } + if (uct_tcp_ep_ctx_buf_need_progress(&ep->tx)) { uct_tcp_ep_set_outstanding_zcopy(iface, ep, ctx, &put_req, - sizeof(put_req), comp); - return UCS_INPROGRESS; + sizeof(put_req), NULL); } - return UCS_OK; + return UCS_INPROGRESS; } ucs_status_t uct_tcp_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *req, @@ -1809,7 +2039,6 @@ ucs_status_t uct_tcp_ep_flush(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp) { uct_tcp_ep_t *ep = ucs_derived_of(tl_ep, uct_tcp_ep_t); - uct_tcp_ep_put_completion_t *put_comp; ucs_status_t status; if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) { @@ -1829,15 +2058,9 @@ ucs_status_t uct_tcp_ep_flush(uct_ep_h tl_ep, unsigned flags, } if (ep->flags & UCT_TCP_EP_FLAG_PUT_TX_WAITING_ACK) { - if (comp != NULL) { - put_comp = ucs_calloc(1, sizeof(*put_comp), "put completion"); - if (put_comp == NULL) { - return UCS_ERR_NO_MEMORY; - } - - put_comp->wait_put_sn = ep->tx.put_sn; - put_comp->comp = comp; - ucs_queue_push(&ep->put_comp_q, &put_comp->elem); + status = uct_tcp_ep_put_comp_add(ep, comp, ep->tx.put_sn); + if (status != UCS_OK) { + return status; } return UCS_INPROGRESS; @@ -1847,3 +2070,22 @@ ucs_status_t uct_tcp_ep_flush(uct_ep_h tl_ep, unsigned flags, return UCS_OK; } +ucs_status_t +uct_tcp_ep_check(uct_ep_h tl_ep, unsigned flags, uct_completion_t *comp) +{ + uct_tcp_ep_t *ep = ucs_derived_of(tl_ep, uct_tcp_ep_t); + uct_tcp_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_tcp_iface_t); + uct_tcp_am_hdr_t *hdr = NULL; /* init to suppress build warning */ + ucs_status_t status; + + UCT_EP_KEEPALIVE_CHECK_PARAM(flags, comp); + + status = uct_tcp_ep_am_prepare(iface, ep, UCT_TCP_EP_KEEPALIVE_AM_ID, &hdr); + if (status != UCS_OK) { + return (status == UCS_ERR_NO_RESOURCE) ? UCS_OK : status; + } + + ucs_assert(hdr != NULL); + hdr->length = 0; + return uct_tcp_ep_am_send(ep, hdr); +} diff --git a/src/uct/tcp/tcp_iface.c b/src/uct/tcp/tcp_iface.c index 3feb8193412..f26a84f0ae9 100644 --- a/src/uct/tcp/tcp_iface.c +++ b/src/uct/tcp/tcp_iface.c @@ -81,10 +81,29 @@ static ucs_config_field_t uct_tcp_iface_config_table[] = { ucs_offsetof(uct_tcp_iface_config_t, rx_mpool), ""), {"PORT_RANGE", "0", - "Generate a random TCP port number from that range. A value of zero means\n " + "Generate a random TCP port number from that range. 
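uct_tcp_ep_check() above is wired into the iface ops as .ep_check, so applications reach it through the generic uct_ep_check() entry point; a dead peer then surfaces through the iface error handler rather than through this call's return value. A hedged caller-side sketch:

#include <uct/api/uct.h>

/* Hypothetical caller-side probe: uct_ep_check() posts the zero-length
 * keepalive AM and the worker progresses it; peer failure is reported
 * asynchronously through the iface error handler. */
static void probe_peer(uct_ep_h ep, uct_worker_h worker)
{
    ucs_status_t status;

    status = uct_ep_check(ep, 0, NULL);
    if ((status != UCS_OK) && (status != UCS_INPROGRESS)) {
        return; /* transport rejected the probe outright */
    }

    (void)uct_worker_progress(worker);
}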
A value of zero means\n" "let the operating system select the port number.", ucs_offsetof(uct_tcp_iface_config_t, port_range), UCS_CONFIG_TYPE_RANGE_SPEC}, +#ifdef UCT_TCP_EP_KEEPALIVE + {"KEEPIDLE", UCS_PP_MAKE_STRING(UCT_TCP_EP_DEFAULT_KEEPALIVE_IDLE) "s", + "The time the connection needs to remain idle before TCP starts sending " + "keepalive probes.", + ucs_offsetof(uct_tcp_iface_config_t, keepalive.idle), + UCS_CONFIG_TYPE_TIME_UNITS}, + + {"KEEPCNT", "3", + "The maximum number of keepalive probes TCP should send before " + "dropping the connection.", + ucs_offsetof(uct_tcp_iface_config_t, keepalive.cnt), + UCS_CONFIG_TYPE_UINT}, + + {"KEEPINTVL", UCS_PP_MAKE_STRING(UCT_TCP_EP_DEFAULT_KEEPALIVE_INTVL) "s", + "The time between individual keepalive probes.", + ucs_offsetof(uct_tcp_iface_config_t, keepalive.intvl), + UCS_CONFIG_TYPE_TIME_UNITS}, +#endif /* UCT_TCP_EP_KEEPALIVE */ + {NULL} }; @@ -94,17 +113,60 @@ static UCS_CLASS_DEFINE_DELETE_FUNC(uct_tcp_iface_t, uct_iface_t); static ucs_status_t uct_tcp_iface_get_device_address(uct_iface_h tl_iface, uct_device_addr_t *addr) { - uct_tcp_iface_t *iface = ucs_derived_of(tl_iface, uct_tcp_iface_t); + uct_tcp_iface_t *iface = ucs_derived_of(tl_iface, uct_tcp_iface_t); + uct_tcp_device_addr_t *dev_addr = (uct_tcp_device_addr_t*)addr; + void *pack_ptr = dev_addr + 1; + const struct sockaddr *saddr = (struct sockaddr*)&iface->config.ifaddr; + const void *in_addr; + size_t ip_addr_len; + ucs_status_t status; + + dev_addr->flags = 0; + dev_addr->sa_family = iface->config.ifaddr.sin_family; + + if (ucs_sockaddr_is_inaddr_loopback(saddr)) { + dev_addr->flags |= UCT_TCP_DEVICE_ADDR_FLAG_LOOPBACK; + uct_iface_get_local_address(pack_ptr, UCS_SYS_NS_TYPE_NET); + } else { + in_addr = ucs_sockaddr_get_inet_addr(saddr); + status = ucs_sockaddr_inet_addr_sizeof(saddr, &ip_addr_len); + if (status != UCS_OK) { + return status; + } + + memcpy(pack_ptr, in_addr, ip_addr_len); + } - *(struct sockaddr_in*)addr = iface->config.ifaddr; return UCS_OK; } -static ucs_status_t uct_tcp_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *addr) +static size_t uct_tcp_iface_get_device_address_length(uct_tcp_iface_t *iface) { - uct_tcp_iface_t *iface = ucs_derived_of(tl_iface, uct_tcp_iface_t); + const struct sockaddr *saddr = (struct sockaddr*)&iface->config.ifaddr; + size_t addr_len = sizeof(uct_tcp_device_addr_t); + size_t in_addr_len; + ucs_status_t status; + + if (ucs_sockaddr_is_inaddr_loopback(saddr)) { + addr_len += sizeof(uct_iface_local_addr_ns_t); + } else { + status = ucs_sockaddr_inet_addr_sizeof(saddr, &in_addr_len); + ucs_assert_always(status == UCS_OK); - *(in_port_t*)addr = iface->config.ifaddr.sin_port; + addr_len += in_addr_len; + } + + return addr_len; +} + +static ucs_status_t +uct_tcp_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *addr) +{ + uct_tcp_iface_t *iface = ucs_derived_of(tl_iface, + uct_tcp_iface_t); + uct_tcp_iface_addr_t *iface_addr = (uct_tcp_iface_addr_t*)addr; + + iface_addr->port = iface->config.ifaddr.sin_port; return UCS_OK; } @@ -112,15 +174,26 @@ static int uct_tcp_iface_is_reachable(const uct_iface_h tl_iface, const uct_device_addr_t *dev_addr, const uct_iface_addr_t *iface_addr) { + uct_tcp_device_addr_t *tcp_dev_addr = (uct_tcp_device_addr_t*)dev_addr; + uct_iface_local_addr_ns_t *local_addr_ns; + + if (tcp_dev_addr->flags & UCT_TCP_DEVICE_ADDR_FLAG_LOOPBACK) { + local_addr_ns = (uct_iface_local_addr_ns_t*)(tcp_dev_addr + 1); + return uct_iface_local_is_reachable(local_addr_ns, + UCS_SYS_NS_TYPE_NET); + } + /* 
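The device address packed above is variable-length: a two-byte header followed either by the raw interface address or, for loopback, by a namespace record. A hedged unpacking sketch for the IPv4 case, with stand-in type and flag names:

#include <netinet/in.h>
#include <stdint.h>
#include <string.h>

#define ADDR_FLAG_LOOPBACK 0x1 /* stand-in for UCT_TCP_DEVICE_ADDR_FLAG_LOOPBACK */

typedef struct {
    uint8_t flags;     /* loopback indicator */
    uint8_t sa_family; /* AF_INET/AF_INET6 of the packed payload */
    /* payload follows: in_addr/in6_addr, or a namespace record */
} __attribute__((packed)) packed_dev_addr_t;

/* Recover the destination IPv4 address from a packed device address,
 * substituting 127.0.0.1 when the loopback flag is set, which is the
 * same substitution uct_tcp_ep_set_dest_addr() performs. */
static int unpack_ipv4(const packed_dev_addr_t *da, struct in_addr *out)
{
    if (da->sa_family != AF_INET) {
        return -1; /* IPv6 is left out here, as in the TODO above */
    }

    if (da->flags & ADDR_FLAG_LOOPBACK) {
        out->s_addr = htonl(INADDR_LOOPBACK);
    } else {
        memcpy(out, da + 1, sizeof(*out));
    }
    return 0;
}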
We always report that a peer is reachable. connect() call will * fail if the peer is unreachable when creating UCT/TCP EP */ return 1; } -static ucs_status_t uct_tcp_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *attr) +static ucs_status_t uct_tcp_iface_query(uct_iface_h tl_iface, + uct_iface_attr_t *attr) { uct_tcp_iface_t *iface = ucs_derived_of(tl_iface, uct_tcp_iface_t); - size_t am_buf_size = iface->config.tx_seg_size - sizeof(uct_tcp_am_hdr_t); + size_t am_buf_size = iface->config.tx_seg_size - + sizeof(uct_tcp_am_hdr_t); ucs_status_t status; int is_default; @@ -133,14 +206,15 @@ static ucs_status_t uct_tcp_iface_query(uct_iface_h tl_iface, uct_iface_attr_t * } attr->ep_addr_len = sizeof(uct_tcp_ep_addr_t); - attr->iface_addr_len = sizeof(in_port_t); - attr->device_addr_len = sizeof(struct sockaddr_in); + attr->iface_addr_len = sizeof(uct_tcp_iface_addr_t); + attr->device_addr_len = uct_tcp_iface_get_device_address_length(iface); attr->cap.flags = UCT_IFACE_FLAG_CONNECT_TO_IFACE | UCT_IFACE_FLAG_CONNECT_TO_EP | UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_PENDING | UCT_IFACE_FLAG_CB_SYNC | + UCT_IFACE_FLAG_EP_CHECK | UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE; attr->cap.event_flags = UCT_IFACE_FLAG_EVENT_SEND_COMP | UCT_IFACE_FLAG_EVENT_RECV | @@ -149,9 +223,13 @@ static ucs_status_t uct_tcp_iface_query(uct_iface_h tl_iface, uct_iface_attr_t * attr->cap.am.max_short = am_buf_size; attr->cap.am.max_bcopy = am_buf_size; - if (iface->config.zcopy.max_iov > UCT_TCP_EP_ZCOPY_SERVICE_IOV_COUNT) { + if (uct_tcp_keepalive_is_enabled(iface)) { + attr->cap.flags |= UCT_IFACE_FLAG_EP_KEEPALIVE; + } + + if (iface->config.max_iov > UCT_TCP_EP_ZCOPY_SERVICE_IOV_COUNT) { /* AM */ - attr->cap.am.max_iov = iface->config.zcopy.max_iov - + attr->cap.am.max_iov = iface->config.max_iov - UCT_TCP_EP_ZCOPY_SERVICE_IOV_COUNT; attr->cap.am.max_zcopy = iface->config.rx_seg_size - sizeof(uct_tcp_am_hdr_t); @@ -161,7 +239,7 @@ static ucs_status_t uct_tcp_iface_query(uct_iface_h tl_iface, uct_iface_attr_t * if (iface->config.put_enable) { /* PUT */ - attr->cap.put.max_iov = iface->config.zcopy.max_iov - + attr->cap.put.max_iov = iface->config.max_iov - UCT_TCP_EP_ZCOPY_SERVICE_IOV_COUNT; attr->cap.put.max_zcopy = UCT_TCP_EP_PUT_ZCOPY_MAX - UCT_TCP_EP_PUT_SERVICE_LENGTH; @@ -280,7 +358,7 @@ uct_tcp_iface_connect_handler(int listen_fd, ucs_event_set_types_t events, status = uct_tcp_cm_handle_incoming_conn(iface, &peer_addr, fd); if (status != UCS_OK) { - close(fd); + ucs_close_fd(&fd); return; } } @@ -316,6 +394,7 @@ ucs_status_t uct_tcp_iface_set_sockopt(uct_tcp_iface_t *iface, int fd, static uct_iface_ops_t uct_tcp_iface_ops = { .ep_am_short = uct_tcp_ep_am_short, + .ep_am_short_iov = uct_tcp_ep_am_short_iov, .ep_am_bcopy = uct_tcp_ep_am_bcopy, .ep_am_zcopy = uct_tcp_ep_am_zcopy, .ep_put_zcopy = uct_tcp_ep_put_zcopy, @@ -323,6 +402,7 @@ static uct_iface_ops_t uct_tcp_iface_ops = { .ep_pending_purge = uct_tcp_ep_pending_purge, .ep_flush = uct_tcp_ep_flush, .ep_fence = uct_base_ep_fence, + .ep_check = uct_tcp_ep_check, .ep_create = uct_tcp_ep_create, .ep_destroy = uct_tcp_ep_destroy, .ep_get_address = uct_tcp_ep_get_address, @@ -416,7 +496,7 @@ static ucs_status_t uct_tcp_iface_listener_init(uct_tcp_iface_t *iface) return UCS_OK; err_close_sock: - close(iface->listen_fd); + ucs_close_fd(&iface->listen_fd); err: return status; } @@ -495,12 +575,13 @@ static UCS_CLASS_INIT_FUNC(uct_tcp_iface_t, uct_md_h md, uct_worker_h worker, return UCS_ERR_INVALID_PARAM; } - 
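/*
 * Illustrative sketch (not from this patch): the KEEPIDLE/KEEPCNT/KEEPINTVL
 * values configured above correspond to the standard Linux TCP keepalive
 * socket options on a connected fd. The helper name and error convention
 * here are hypothetical.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int example_apply_keepalive(int fd, int idle_sec, int cnt, int intvl_sec)
{
    int on = 1;

    /* Enable probing, then tune idle time, probe count and probe interval */
    if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) ||
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle_sec, sizeof(idle_sec)) ||
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt)) ||
        setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl_sec, sizeof(intvl_sec))) {
        return -1;
    }

    return 0;
}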
UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &uct_tcp_iface_ops, md, worker, - params, tl_config - UCS_STATS_ARG((params->field_mask & - UCT_IFACE_PARAM_FIELD_STATS_ROOT) ? - params->stats_root : NULL) - UCS_STATS_ARG(params->mode.device.dev_name)); + UCS_CLASS_CALL_SUPER_INIT( + uct_base_iface_t, &uct_tcp_iface_ops, &uct_base_iface_internal_ops, + md, worker, params, + tl_config UCS_STATS_ARG( + (params->field_mask & UCT_IFACE_PARAM_FIELD_STATS_ROOT) ? + params->stats_root : + NULL) UCS_STATS_ARG(params->mode.device.dev_name)); ucs_strncpy_zero(self->if_name, params->mode.device.dev_name, sizeof(self->if_name)); @@ -520,19 +601,18 @@ static UCS_CLASS_INIT_FUNC(uct_tcp_iface_t, uct_md_h md, uct_worker_h worker, /* Maximum IOV count allowed by user's configuration (considering TCP * protocol and user's AM headers that use 1st and 2nd IOVs * correspondingly) and system constraints */ - self->config.zcopy.max_iov = ucs_min(config->max_iov + + self->config.max_iov = ucs_min(config->max_iov + UCT_TCP_EP_ZCOPY_SERVICE_IOV_COUNT, ucs_iov_get_max()); /* Use a remaining part of TX segment for AM Zcopy header */ self->config.zcopy.hdr_offset = (sizeof(uct_tcp_ep_zcopy_tx_t) + - sizeof(struct iovec) * - self->config.zcopy.max_iov); + sizeof(struct iovec) * self->config.max_iov); if ((self->config.zcopy.hdr_offset > self->config.tx_seg_size) && - (self->config.zcopy.max_iov > UCT_TCP_EP_ZCOPY_SERVICE_IOV_COUNT)) { + (self->config.max_iov > UCT_TCP_EP_ZCOPY_SERVICE_IOV_COUNT)) { ucs_error("AM Zcopy context (%zu) must be <= TX segment size (%zu). " "It can be adjusted by decreasing maximum IOV count (%zu)", self->config.zcopy.hdr_offset, self->config.tx_seg_size, - self->config.zcopy.max_iov); + self->config.max_iov); return UCS_ERR_INVALID_PARAM; } @@ -554,6 +634,20 @@ static UCS_CLASS_INIT_FUNC(uct_tcp_iface_t, uct_md_h md, uct_worker_h worker, self->sockopt.nodelay = config->sockopt_nodelay; self->sockopt.sndbuf = config->sockopt.sndbuf; self->sockopt.rcvbuf = config->sockopt.rcvbuf; + self->config.keepalive.cnt = config->keepalive.cnt; + self->config.keepalive.intvl = config->keepalive.intvl; + + if (config->keepalive.idle != UCS_MEMUNITS_AUTO) { + /* TCP iface configuration sets the keepalive interval */ + self->config.keepalive.idle = config->keepalive.idle; + } else if (params->field_mask & UCT_IFACE_PARAM_FIELD_KEEPALIVE_INTERVAL) { + /* User parameters set the keepalive interval */ + self->config.keepalive.idle = params->keepalive_interval; + } else { + /* Use the default keepalive interval */ + self->config.keepalive.idle = + ucs_time_from_sec(UCT_TCP_EP_DEFAULT_KEEPALIVE_IDLE); + } status = uct_tcp_iface_set_port_range(self, config); if (status != UCS_OK) { @@ -769,6 +863,17 @@ ucs_status_t uct_tcp_query_devices(uct_md_h md, return status; } +int uct_tcp_keepalive_is_enabled(uct_tcp_iface_t *iface) +{ +#ifdef UCT_TCP_EP_KEEPALIVE + return (iface->config.keepalive.idle != UCS_TIME_INFINITY) && + (iface->config.keepalive.cnt != 0) && + (iface->config.keepalive.intvl != UCS_TIME_INFINITY); +#else /* UCT_TCP_EP_KEEPALIVE */ + return 0; +#endif /* UCT_TCP_EP_KEEPALIVE */ +} + UCT_TL_DEFINE(&uct_tcp_component, tcp, uct_tcp_query_devices, uct_tcp_iface_t, UCT_TCP_CONFIG_PREFIX, uct_tcp_iface_config_table, uct_tcp_iface_config_t); diff --git a/src/uct/tcp/tcp_listener.c b/src/uct/tcp/tcp_listener.c index 05fa2d7ef7d..c273ca76d2d 100644 --- a/src/uct/tcp/tcp_listener.c +++ b/src/uct/tcp/tcp_listener.c @@ -115,7 +115,7 @@ UCS_CLASS_INIT_FUNC(uct_tcp_listener_t, uct_cm_h cm, } status = 
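/*
 * Illustrative sketch (not from this patch): the packed TCP device address
 * above is a small fixed header (flags + address family) followed by a
 * variable payload - the interface IP address, or a namespace record for
 * loopback. The types and names below are simplified stand-ins for
 * uct_tcp_device_addr_t; the length logic mirrors
 * uct_tcp_iface_get_device_address_length().
 */
#include <netinet/in.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/socket.h>

typedef struct {
    uint8_t flags;     /* e.g. a loopback flag */
    uint8_t sa_family; /* AF_INET or AF_INET6 */
} example_tcp_dev_addr_t;

static size_t example_dev_addr_len(int sa_family, int is_loopback,
                                   size_t ns_record_len)
{
    size_t payload = is_loopback ? ns_record_len :
                     (sa_family == AF_INET) ? sizeof(struct in_addr) :
                                              sizeof(struct in6_addr);

    return sizeof(example_tcp_dev_addr_t) + payload;
}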
ucs_socket_server_init(saddr, socklen, backlog, 0, - self->sockcm->allow_addr_inuse, + self->sockcm->super.config.reuse_addr, &self->listen_fd); if (status != UCS_OK) { goto err; diff --git a/src/uct/tcp/tcp_md.c b/src/uct/tcp/tcp_md.c index 3ee975759b6..30cb7784084 100644 --- a/src/uct/tcp/tcp_md.c +++ b/src/uct/tcp/tcp_md.c @@ -18,7 +18,8 @@ static ucs_status_t uct_tcp_md_query(uct_md_h md, uct_md_attr_t *attr) attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; /* TODO ignore rkey in rma/amo ops */ attr->cap.max_alloc = 0; - attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); + attr->cap.alloc_mem_types = 0; attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); attr->cap.detect_mem_types = 0; attr->cap.max_reg = ULONG_MAX; diff --git a/src/uct/tcp/tcp_sockcm.c b/src/uct/tcp/tcp_sockcm.c index 70a9a00bfe6..cb34ef5cfa2 100644 --- a/src/uct/tcp/tcp_sockcm.c +++ b/src/uct/tcp/tcp_sockcm.c @@ -15,7 +15,7 @@ ucs_config_field_t uct_tcp_sockcm_config_table[] = { - {"TCP_CM_", "", NULL, + {"CM_", "", NULL, ucs_offsetof(uct_tcp_sockcm_config_t, super), UCS_CONFIG_TYPE_TABLE(uct_cm_config_table)}, {"PRIV_DATA_LEN", "2048", @@ -26,10 +26,6 @@ ucs_config_field_t uct_tcp_sockcm_config_table[] = { UCT_TCP_SYN_CNT(ucs_offsetof(uct_tcp_sockcm_config_t, syn_cnt)), - {"ALLOW_ADDR_INUSE", "n", - "Allow using an address that is already in use by another socket.", - ucs_offsetof(uct_tcp_sockcm_config_t, allow_addr_inuse), UCS_CONFIG_TYPE_BOOL}, - {NULL} }; @@ -98,9 +94,9 @@ void uct_tcp_sa_data_handler(int fd, ucs_event_set_types_t events, void *arg) ucs_assertv(ep->fd == fd, "ep->fd %d fd %d, ep_state %d", ep->fd, fd, ep->state); - ucs_trace("ep %p on %s received event (state = %d)", ep, + ucs_trace("ep %p on %s received event 0x%x (state = %d)", ep, (ep->state & UCT_TCP_SOCKCM_EP_ON_SERVER) ? 
"server" : "client", - ep->state); + events, ep->state); if (events & UCS_EVENT_SET_EVERR) { status = uct_tcp_sockcm_event_err_to_ucs_err_log(fd, &log_level); @@ -134,6 +130,7 @@ void uct_tcp_sa_data_handler(int fd, ucs_event_set_types_t events, void *arg) static uct_iface_ops_t uct_tcp_sockcm_iface_ops = { .ep_pending_purge = (uct_ep_pending_purge_func_t)ucs_empty_function, + .ep_connect = uct_tcp_sockcm_ep_connect, .ep_disconnect = uct_tcp_sockcm_ep_disconnect, .cm_ep_conn_notify = uct_tcp_sockcm_cm_ep_conn_notify, .ep_destroy = UCS_CLASS_DELETE_FUNC_NAME(uct_tcp_sockcm_ep_t), @@ -141,6 +138,7 @@ static uct_iface_ops_t uct_tcp_sockcm_iface_ops = { .ep_put_bcopy = (uct_ep_put_bcopy_func_t)ucs_empty_function_return_unsupported, .ep_get_bcopy = (uct_ep_get_bcopy_func_t)ucs_empty_function_return_unsupported, .ep_am_short = (uct_ep_am_short_func_t)ucs_empty_function_return_unsupported, + .ep_am_short_iov = (uct_ep_am_short_iov_func_t)ucs_empty_function_return_unsupported, .ep_am_bcopy = (uct_ep_am_bcopy_func_t)ucs_empty_function_return_unsupported, .ep_atomic_cswap64 = (uct_ep_atomic_cswap64_func_t)ucs_empty_function_return_unsupported, .ep_atomic64_post = (uct_ep_atomic64_post_func_t)ucs_empty_function_return_unsupported, @@ -177,12 +175,11 @@ UCS_CLASS_INIT_FUNC(uct_tcp_sockcm_t, uct_component_h component, &uct_tcp_sockcm_iface_ops, worker, component, config); - self->priv_data_len = cm_config->priv_data_len - - sizeof(uct_tcp_sockcm_priv_data_hdr_t); - self->sockopt_sndbuf = cm_config->sockopt.sndbuf; - self->sockopt_rcvbuf = cm_config->sockopt.rcvbuf; - self->syn_cnt = cm_config->syn_cnt; - self->allow_addr_inuse = cm_config->allow_addr_inuse; + self->priv_data_len = cm_config->priv_data_len + + sizeof(uct_tcp_sockcm_priv_data_hdr_t); + self->sockopt_sndbuf = cm_config->sockopt.sndbuf; + self->sockopt_rcvbuf = cm_config->sockopt.rcvbuf; + self->syn_cnt = cm_config->syn_cnt; ucs_list_head_init(&self->ep_list); diff --git a/src/uct/tcp/tcp_sockcm.h b/src/uct/tcp/tcp_sockcm.h index 7fbed0fa9e8..3b62dff49ba 100644 --- a/src/uct/tcp/tcp_sockcm.h +++ b/src/uct/tcp/tcp_sockcm.h @@ -21,7 +21,6 @@ typedef struct uct_tcp_sockcm { size_t sockopt_rcvbuf; /** SO_RCVBUF */ unsigned syn_cnt; /** TCP_SYNCNT */ ucs_list_link_t ep_list; /** List of endpoints */ - int allow_addr_inuse; } uct_tcp_sockcm_t; /** @@ -32,7 +31,6 @@ typedef struct uct_tcp_sockcm_config { size_t priv_data_len; uct_tcp_send_recv_buf_config_t sockopt; unsigned syn_cnt; - int allow_addr_inuse; } uct_tcp_sockcm_config_t; diff --git a/src/uct/tcp/tcp_sockcm_ep.c b/src/uct/tcp/tcp_sockcm_ep.c index dee68ae5219..dad2a297d9c 100644 --- a/src/uct/tcp/tcp_sockcm_ep.c +++ b/src/uct/tcp/tcp_sockcm_ep.c @@ -9,12 +9,17 @@ #endif #include "tcp_sockcm_ep.h" +#include "tcp.h" #include #include #include #include +#define UCT_TCP_SOCKCM_EP_MAX_DEVICE_ADDR_LEN (sizeof(uct_tcp_device_addr_t) + \ + sizeof(struct in6_addr)) + + const char *uct_tcp_sockcm_cm_ep_peer_addr_str(uct_tcp_sockcm_ep_t *cep, char *buf, size_t max) { @@ -65,6 +70,43 @@ static void uct_tcp_sockcm_ep_server_notify_cb(uct_tcp_sockcm_ep_t *cep, uct_cm_ep_server_conn_notify_cb(&cep->super, status); } +static ucs_status_t +uct_tcp_sockcm_ep_pack_priv_data(uct_tcp_sockcm_ep_t *cep, const void *data, + size_t data_length) +{ + uct_tcp_sockcm_priv_data_hdr_t *hdr = + (uct_tcp_sockcm_priv_data_hdr_t*)cep->comm_ctx.buf; + + ucs_assert(cep->comm_ctx.offset == 0); + ucs_assert(!(cep->state & UCT_TCP_SOCKCM_EP_PRIV_DATA_PACKED)); + + if (data_length > 
uct_tcp_sockcm_ep_get_cm(cep)->priv_data_len) { + cep->state |= UCT_TCP_SOCKCM_EP_PACK_CB_FAILED; + return UCS_ERR_BUFFER_TOO_SMALL; + } + + if (data != NULL) { + memcpy(hdr + 1, data, data_length); + } + + hdr->length = data_length; + hdr->status = (uint8_t)UCS_OK; + cep->comm_ctx.length = sizeof(*hdr) + hdr->length; + cep->state |= UCT_TCP_SOCKCM_EP_PRIV_DATA_PACKED; + return UCS_OK; +} + +ucs_status_t uct_tcp_sockcm_ep_connect(uct_ep_h ep, + const uct_ep_connect_params_t *params) +{ + uct_tcp_sockcm_ep_t *cep = ucs_derived_of(ep, uct_tcp_sockcm_ep_t); + const void *priv_data; + size_t priv_data_length; + + uct_ep_connect_params_get(params, &priv_data, &priv_data_length); + return uct_tcp_sockcm_ep_pack_priv_data(cep, priv_data, priv_data_length); +} + ucs_status_t uct_tcp_sockcm_ep_disconnect(uct_ep_h ep, unsigned flags) { uct_tcp_sockcm_ep_t *cep = ucs_derived_of(ep, uct_tcp_sockcm_ep_t); @@ -144,6 +186,7 @@ ucs_status_t uct_tcp_sockcm_ep_disconnect(uct_ep_h ep, unsigned flags) void uct_tcp_sockcm_close_ep(uct_tcp_sockcm_ep_t *ep) { + ucs_assert(!(ep->state & UCT_TCP_SOCKCM_EP_SERVER_CONN_REQ_CB_INVOKED)); ucs_list_del(&ep->list); UCS_CLASS_DELETE(uct_tcp_sockcm_ep_t, ep); } @@ -187,10 +230,13 @@ void uct_tcp_sockcm_ep_handle_event_status(uct_tcp_sockcm_ep_t *ep, ((ep->state & UCT_TCP_SOCKCM_EP_ON_SERVER) ? "server" : "client"), ep, ep->fd, ep->state, events, reason, ucs_status_string(status)); - /* if the ep is on the server side but uct_ep_create wasn't called yet, - * destroy the ep here since uct_ep_destroy won't be called either */ - if ((ep->state & UCT_TCP_SOCKCM_EP_ON_SERVER) && - !(ep->state & UCT_TCP_SOCKCM_EP_SERVER_CREATED)) { + /* if the ep is on the server side but uct_ep_create wasn't called yet and + * connection request wasn't provided to the user, destroy the ep here since + * uct_ep_destroy won't be called either */ + if ((ep->state & (UCT_TCP_SOCKCM_EP_ON_SERVER | + UCT_TCP_SOCKCM_EP_SERVER_CREATED | + UCT_TCP_SOCKCM_EP_SERVER_CONN_REQ_CB_INVOKED)) == + UCT_TCP_SOCKCM_EP_ON_SERVER) { ucs_trace("closing server's internal ep %p (state=%d)", ep, ep->state); uct_tcp_sockcm_close_ep(ep); } else { @@ -204,10 +250,13 @@ ep->fd, ucs_status_string(async_status)); } - /* if the private data pack callback failed, then the upper layer already + /* if the resolve or pack callback failed, then the upper layer already * knows about it since it failed in it. in this case, no need to invoke * another upper layer callback. */ - if (!(ep->state & UCT_TCP_SOCKCM_EP_PACK_CB_FAILED)) { + if (!(ep->state & (UCT_TCP_SOCKCM_EP_RESOLVE_CB_FAILED | + UCT_TCP_SOCKCM_EP_PACK_CB_FAILED)) && + (ep->state & (UCT_TCP_SOCKCM_EP_SERVER_CREATED | + UCT_TCP_SOCKCM_EP_ON_CLIENT))) { uct_tcp_sockcm_ep_invoke_error_cb(ep, status); } @@ -279,15 +328,14 @@ static void uct_tcp_sockcm_ep_mark_tx_completed(uct_tcp_sockcm_ep_t *cep) } } -/** - * This function should be called with the lock held. 
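/*
 * Illustrative sketch (not from this patch): the connect-time private data
 * packed by uct_tcp_sockcm_ep_pack_priv_data() above is framed as a small
 * header followed by the user payload. The struct and helper below are
 * hypothetical simplifications of uct_tcp_sockcm_priv_data_hdr_t.
 */
#include <stdint.h>
#include <string.h>

typedef struct {
    uint32_t length; /* payload bytes that follow the header */
    uint8_t  status; /* 0 for a regular request */
} example_priv_hdr_t;

static size_t example_pack_priv_data(void *buf, size_t max_payload,
                                     const void *data, size_t length)
{
    example_priv_hdr_t *hdr = (example_priv_hdr_t*)buf;

    if (length > max_payload) {
        return 0; /* caller maps this to a buffer-too-small error */
    }

    hdr->length = (uint32_t)length;
    hdr->status = 0;
    if (data != NULL) {
        memcpy(hdr + 1, data, length);
    }

    return sizeof(*hdr) + length; /* total bytes to put on the wire */
}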
- */ ucs_status_t uct_tcp_sockcm_ep_progress_send(uct_tcp_sockcm_ep_t *cep) { + uct_tcp_sockcm_t UCS_V_UNUSED *tcp_sockcm = uct_tcp_sockcm_ep_get_cm(cep); ucs_status_t status; size_t sent_length; ucs_event_set_types_t events; + ucs_assert(ucs_async_is_blocked(tcp_sockcm->super.iface.worker->async)); ucs_assert(ucs_test_all_flags(cep->state, UCT_TCP_SOCKCM_EP_ON_CLIENT | UCT_TCP_SOCKCM_EP_PRIV_DATA_PACKED) || ucs_test_all_flags(cep->state, UCT_TCP_SOCKCM_EP_ON_SERVER | @@ -355,9 +403,11 @@ ucs_status_t uct_tcp_sockcm_ep_progress_send(uct_tcp_sockcm_ep_t *cep) ucs_status_t uct_tcp_sockcm_cm_ep_conn_notify(uct_ep_h ep) { - uct_tcp_sockcm_ep_t *cep = ucs_derived_of(ep, uct_tcp_sockcm_ep_t); - uct_tcp_sockcm_t *tcp_sockcm = uct_tcp_sockcm_ep_get_cm(cep); - uct_tcp_sockcm_priv_data_hdr_t *hdr; + uct_tcp_sockcm_ep_t *cep = + ucs_derived_of(ep, uct_tcp_sockcm_ep_t); + uct_tcp_sockcm_t *tcp_sockcm = uct_tcp_sockcm_ep_get_cm(cep); + uct_tcp_sockcm_priv_data_hdr_t *hdr = + (uct_tcp_sockcm_priv_data_hdr_t*)cep->comm_ctx.buf; char peer_str[UCS_SOCKADDR_STRING_LEN]; ucs_status_t status; @@ -375,9 +425,8 @@ ucs_status_t uct_tcp_sockcm_cm_ep_conn_notify(uct_ep_h ep) UCT_TCP_SOCKCM_EP_CLIENT_CONNECTED_CB_INVOKED)); ucs_assert(!(cep->state & UCT_TCP_SOCKCM_EP_CLIENT_NOTIFY_CALLED)); - hdr = (uct_tcp_sockcm_priv_data_hdr_t*)cep->comm_ctx.buf; - - hdr->length = 0; /* sending only the header in the notify message */ + /* sending only the header in the notify message */ + hdr->length = 0; hdr->status = (uint8_t)UCS_OK; cep->comm_ctx.length = sizeof(*hdr); @@ -392,37 +441,69 @@ ucs_status_t uct_tcp_sockcm_cm_ep_conn_notify(uct_ep_h ep) return status; } -static ucs_status_t uct_tcp_sockcm_ep_pack_priv_data(uct_tcp_sockcm_ep_t *cep) +static ucs_status_t +uct_tcp_sockcm_ep_invoke_resolve_cb(uct_tcp_sockcm_ep_t *cep, + const char *ifname) { - char ifname_str[UCT_DEVICE_NAME_MAX]; - uct_tcp_sockcm_priv_data_hdr_t *hdr; - size_t priv_data_ret; + uct_cm_ep_resolve_args_t resolve_args; ucs_status_t status; - uct_cm_ep_priv_data_pack_args_t pack_args; - /* get interface name associated with the connected client fd */ - status = ucs_sockaddr_get_ifname(cep->fd, ifname_str, sizeof(ifname_str)); - if (UCS_OK != status) { - goto out; + resolve_args.field_mask = UCT_CM_EP_RESOLVE_ARGS_FIELD_DEV_NAME; + ucs_strncpy_safe(resolve_args.dev_name, ifname, UCT_DEVICE_NAME_MAX); + status = uct_cm_ep_resolve_cb(&cep->super, &resolve_args); + cep->state |= UCT_TCP_SOCKCM_EP_RESOLVE_CB_INVOKED; + if (status != UCS_OK) { + cep->state |= UCT_TCP_SOCKCM_EP_RESOLVE_CB_FAILED; } - hdr = (uct_tcp_sockcm_priv_data_hdr_t*)cep->comm_ctx.buf; + return status; +} + +static ucs_status_t +uct_tcp_sockcm_ep_invoke_pack_cb(uct_tcp_sockcm_ep_t *cep, + const char *ifname) +{ + uct_cm_ep_priv_data_pack_args_t pack_args; + uct_tcp_sockcm_priv_data_hdr_t *hdr; + ucs_status_t status; + pack_args.field_mask = UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME; - ucs_strncpy_safe(pack_args.dev_name, ifname_str, UCT_DEVICE_NAME_MAX); + ucs_strncpy_safe(pack_args.dev_name, ifname, UCT_DEVICE_NAME_MAX); + ucs_assert(cep->comm_ctx.offset == 0); + hdr = (uct_tcp_sockcm_priv_data_hdr_t*)cep->comm_ctx.buf; status = uct_cm_ep_pack_cb(&cep->super, cep->super.user_data, &pack_args, hdr + 1, uct_tcp_sockcm_ep_get_cm(cep)->priv_data_len, - &priv_data_ret); + &hdr->length); if (status != UCS_OK) { cep->state |= UCT_TCP_SOCKCM_EP_PACK_CB_FAILED; - goto out; + return status; } - hdr->length = priv_data_ret; hdr->status = (uint8_t)UCS_OK; cep->comm_ctx.length = sizeof(*hdr) + 
hdr->length; cep->state |= UCT_TCP_SOCKCM_EP_PRIV_DATA_PACKED; + return UCS_OK; +} + +static ucs_status_t uct_tcp_sockcm_ep_resolve(uct_tcp_sockcm_ep_t *cep) +{ + char ifname_str[UCT_DEVICE_NAME_MAX]; + ucs_status_t status; + + /* get interface name associated with the connected client fd */ + status = ucs_sockaddr_get_ifname(cep->fd, ifname_str, sizeof(ifname_str)); + if (status != UCS_OK) { + goto out; + } + + if (cep->super.resolve_cb != NULL) { + status = uct_tcp_sockcm_ep_invoke_resolve_cb(cep, ifname_str); + } else { + ucs_assert(cep->super.priv_pack_cb != NULL); + status = uct_tcp_sockcm_ep_invoke_pack_cb(cep, ifname_str); + } out: return status; @@ -442,66 +523,109 @@ static int uct_tcp_sockcm_ep_send_skip_event(uct_tcp_sockcm_ep_t *cep) return cep->state & UCT_TCP_SOCKCM_EP_DATA_SENT; } else { ucs_assert(cep->state & UCT_TCP_SOCKCM_EP_ON_CLIENT); - return (cep->state & UCT_TCP_SOCKCM_EP_CLIENT_NOTIFY_SENT) || - ((cep->state & UCT_TCP_SOCKCM_EP_DATA_SENT) && - !(cep->state & UCT_TCP_SOCKCM_EP_CLIENT_NOTIFY_CALLED)); + /* if data already sent or not packed yet, then skip event */ + return (cep->state & (UCT_TCP_SOCKCM_EP_CLIENT_NOTIFY_SENT | + UCT_TCP_SOCKCM_EP_DATA_SENT)) || + !(cep->state & UCT_TCP_SOCKCM_EP_PRIV_DATA_PACKED); } } ucs_status_t uct_tcp_sockcm_ep_send(uct_tcp_sockcm_ep_t *cep) { - ucs_status_t status; + if (!(cep->state & (UCT_TCP_SOCKCM_EP_RESOLVE_CB_INVOKED | + UCT_TCP_SOCKCM_EP_PRIV_DATA_PACKED | + UCT_TCP_SOCKCM_EP_ON_SERVER))) { + ucs_assert(cep->state & UCT_TCP_SOCKCM_EP_ON_CLIENT); + return uct_tcp_sockcm_ep_resolve(cep); + } if (uct_tcp_sockcm_ep_send_skip_event(cep)) { + ucs_assert(!(cep->state & UCT_TCP_SOCKCM_EP_DISCONNECTING)); return UCS_OK; } - if (!(cep->state & UCT_TCP_SOCKCM_EP_PRIV_DATA_PACKED)) { - status = uct_tcp_sockcm_ep_pack_priv_data(cep); - if (status != UCS_OK) { - return status; - } + return uct_tcp_sockcm_ep_progress_send(cep); +} + +static ssize_t +uct_tcp_sockcm_ep_get_remote_device_addr(const uct_tcp_sockcm_ep_t *cep, + struct sockaddr_storage *saddr, + socklen_t *saddr_len_p, + uct_tcp_device_addr_t *remote_dev_addr, + size_t max_remote_dev_addr_len) +{ + ucs_status_t status; + size_t in_addr_len; + size_t remote_dev_addr_len; + + /* Get the device address of the remote peer associated with the connected + * fd */ + status = ucs_socket_getpeername(cep->fd, saddr, saddr_len_p); + if (status != UCS_OK) { + return status; } - return uct_tcp_sockcm_ep_progress_send(cep); + status = ucs_sockaddr_inet_addr_sizeof((struct sockaddr*)saddr, + &in_addr_len); + if (status != UCS_OK) { + return status; + } + + remote_dev_addr_len = sizeof(*remote_dev_addr) + in_addr_len; + if (remote_dev_addr_len > max_remote_dev_addr_len) { + return UCS_ERR_BUFFER_TOO_SMALL; + } + + remote_dev_addr->flags = 0u; + remote_dev_addr->sa_family = saddr->ss_family; + + memcpy(remote_dev_addr + 1, + ucs_sockaddr_get_inet_addr((struct sockaddr*)saddr), in_addr_len); + + return remote_dev_addr_len; } -static ucs_status_t uct_tcp_sockcm_ep_server_invoke_conn_req_cb(uct_tcp_sockcm_ep_t *cep) +static ucs_status_t +uct_tcp_sockcm_ep_server_invoke_conn_req_cb(uct_tcp_sockcm_ep_t *cep) { - uct_tcp_sockcm_priv_data_hdr_t *hdr = (uct_tcp_sockcm_priv_data_hdr_t *) - cep->comm_ctx.buf; - struct sockaddr_storage remote_dev_addr = {0}; /* Suppress Clang false-positive */ + uct_tcp_sockcm_priv_data_hdr_t *hdr = (uct_tcp_sockcm_priv_data_hdr_t*) + cep->comm_ctx.buf; + struct sockaddr_storage saddr = {0}; + uct_tcp_device_addr_t *remote_dev_addr = + 
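/*
 * Illustrative sketch (not from this patch): the first step of
 * uct_tcp_sockcm_ep_get_remote_device_addr() is a plain getpeername() on the
 * connected fd; the returned address family then selects the payload size
 * (in_addr vs in6_addr). Hypothetical helper name.
 */
#include <sys/socket.h>

static int example_peer_family(int fd)
{
    struct sockaddr_storage ss;
    socklen_t len = sizeof(ss);

    if (getpeername(fd, (struct sockaddr*)&ss, &len) != 0) {
        return -1;
    }

    return ss.ss_family; /* AF_INET or AF_INET6 */
}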
ucs_alloca(UCT_TCP_SOCKCM_EP_MAX_DEVICE_ADDR_LEN); + ssize_t remote_dev_addr_len; uct_cm_listener_conn_request_args_t conn_req_args; char peer_str[UCS_SOCKADDR_STRING_LEN]; char ifname_str[UCT_DEVICE_NAME_MAX]; uct_cm_remote_data_t remote_data; - socklen_t remote_dev_addr_len; + socklen_t saddr_len; ucs_sock_addr_t client_saddr; ucs_status_t status; - /* get the local interface name associated with the connected fd */ + /* Get the local interface name associated with the connected fd */ status = ucs_sockaddr_get_ifname(cep->fd, ifname_str, UCT_DEVICE_NAME_MAX); if (UCS_OK != status) { return status; } - /* get the device address of the remote peer associated with the connected fd */ - status = ucs_socket_getpeername(cep->fd, &remote_dev_addr, &remote_dev_addr_len); - if (status != UCS_OK) { - return status; + remote_dev_addr_len = uct_tcp_sockcm_ep_get_remote_device_addr( + cep, &saddr, &saddr_len, remote_dev_addr, + UCT_TCP_SOCKCM_EP_MAX_DEVICE_ADDR_LEN); + if (remote_dev_addr_len < 0) { + return (ucs_status_t)remote_dev_addr_len; } remote_data.field_mask = UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR | UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR_LENGTH | UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA | UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA_LENGTH; - remote_data.dev_addr = (uct_device_addr_t *)&remote_dev_addr; + remote_data.dev_addr = (uct_device_addr_t*)remote_dev_addr; remote_data.dev_addr_length = remote_dev_addr_len; remote_data.conn_priv_data = hdr + 1; remote_data.conn_priv_data_length = hdr->length; - client_saddr.addr = (struct sockaddr*)&remote_dev_addr; - client_saddr.addrlen = remote_dev_addr_len; + client_saddr.addr = (struct sockaddr*)&saddr; + client_saddr.addrlen = saddr_len; conn_req_args.field_mask = UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_DEV_NAME | UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CONN_REQUEST | @@ -523,39 +647,44 @@ static ucs_status_t uct_tcp_sockcm_ep_server_invoke_conn_req_cb(uct_tcp_sockcm_e * to uct_ep_create() which will be invoked by the user and therefore moving * over to its responsibility. 
*/ ucs_list_del(&cep->list); + cep->state |= UCT_TCP_SOCKCM_EP_SERVER_CONN_REQ_CB_INVOKED; cep->listener->conn_request_cb(&cep->listener->super, cep->listener->user_data, &conn_req_args); return UCS_OK; } -static ucs_status_t uct_tcp_sockcm_ep_client_invoke_connect_cb(uct_tcp_sockcm_ep_t *cep) +static ucs_status_t +uct_tcp_sockcm_ep_client_invoke_connect_cb(uct_tcp_sockcm_ep_t *cep) { - uct_tcp_sockcm_priv_data_hdr_t *hdr = (uct_tcp_sockcm_priv_data_hdr_t *) - cep->comm_ctx.buf; - struct sockaddr_storage remote_dev_addr = {0}; /* Suppress Clang false-positive */ - socklen_t remote_dev_addr_len; + uct_tcp_sockcm_priv_data_hdr_t *hdr = (uct_tcp_sockcm_priv_data_hdr_t*) + cep->comm_ctx.buf; + struct sockaddr_storage saddr = {0}; + uct_tcp_device_addr_t *remote_dev_addr = + ucs_alloca(UCT_TCP_SOCKCM_EP_MAX_DEVICE_ADDR_LEN); + ssize_t remote_dev_addr_len; uct_cm_remote_data_t remote_data; - ucs_status_t status; + socklen_t saddr_len; - /* get the device address of the remote peer associated with the connected fd */ - status = ucs_socket_getpeername(cep->fd, &remote_dev_addr, &remote_dev_addr_len); - if (status != UCS_OK) { - return status; + remote_dev_addr_len = uct_tcp_sockcm_ep_get_remote_device_addr( + cep, &saddr, &saddr_len, remote_dev_addr, + UCT_TCP_SOCKCM_EP_MAX_DEVICE_ADDR_LEN); + if (remote_dev_addr_len < 0) { + return (ucs_status_t)remote_dev_addr_len; } remote_data.field_mask = UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR | UCT_CM_REMOTE_DATA_FIELD_DEV_ADDR_LENGTH | UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA | UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA_LENGTH; - remote_data.dev_addr = (uct_device_addr_t *)&remote_dev_addr; + remote_data.dev_addr = (uct_device_addr_t*)remote_dev_addr; remote_data.dev_addr_length = remote_dev_addr_len; remote_data.conn_priv_data = hdr + 1; remote_data.conn_priv_data_length = hdr->length; uct_tcp_sockcm_ep_client_connect_cb(cep, &remote_data, (ucs_status_t)hdr->status); - return status; + return UCS_OK; } static ucs_status_t uct_tcp_sockcm_ep_server_handle_data_received(uct_tcp_sockcm_ep_t *cep) @@ -778,12 +907,46 @@ static ucs_status_t uct_tcp_sockcm_ep_client_init(uct_tcp_sockcm_ep_t *cep, return status; } +static ssize_t uct_tcp_sockcm_ep_pack_cb(uct_tcp_sockcm_ep_t *tcp_ep, + void *data_buf) +{ + uct_tcp_sockcm_t *tcp_sockcm = uct_tcp_sockcm_ep_get_cm(tcp_ep); + uct_cm_ep_priv_data_pack_args_t pack_args; + size_t priv_data_ret; + char ifname_str[UCT_DEVICE_NAME_MAX]; + ucs_status_t status; + + status = ucs_sockaddr_get_ifname(tcp_ep->fd, ifname_str, + sizeof(ifname_str)); + if (status != UCS_OK) { + return status; + } + + pack_args.field_mask = UCT_CM_EP_PRIV_DATA_PACK_ARGS_FIELD_DEVICE_NAME; + ucs_strncpy_safe(pack_args.dev_name, ifname_str, UCT_DEVICE_NAME_MAX); + status = uct_cm_ep_pack_cb(&tcp_ep->super, tcp_ep->super.user_data, + &pack_args, data_buf, tcp_sockcm->priv_data_len, + &priv_data_ret); + if (status != UCS_OK) { + tcp_ep->state |= UCT_TCP_SOCKCM_EP_PACK_CB_FAILED; + return status; + } + + return priv_data_ret; +} + +/** + * The caller has to block async. 
+ */ static ucs_status_t uct_tcp_sockcm_ep_server_create(uct_tcp_sockcm_ep_t *tcp_ep, const uct_ep_params_t *params, uct_ep_h *ep_p) { uct_tcp_sockcm_t *tcp_sockcm = uct_tcp_sockcm_ep_get_cm(tcp_ep); + void *data_buf = NULL; uct_tcp_sockcm_t *params_tcp_sockcm; + const void *priv_data; + ssize_t priv_data_length; ucs_async_context_t *new_async_ctx; ucs_status_t status; @@ -800,6 +963,11 @@ static ucs_status_t uct_tcp_sockcm_ep_server_create(uct_tcp_sockcm_ep_t *tcp_ep, goto err; } + if (tcp_ep->state & UCT_TCP_SOCKCM_EP_FAILED) { + status = UCS_ERR_CONNECTION_RESET; + goto err; + } + /* check if the server opened this ep, to the client, on a CM that is * different from the one it created its internal ep on earlier, when it * received the connection request from the client (the cm used by its listener) */ @@ -812,35 +980,26 @@ static ucs_status_t uct_tcp_sockcm_ep_server_create(uct_tcp_sockcm_ep_t *tcp_ep, } } - UCS_ASYNC_BLOCK(tcp_sockcm->super.iface.worker->async); - - UCS_CLASS_CLEANUP(uct_cm_base_ep_t, &tcp_ep->super); - - /* set the server's ep to use the cm from params and its iface + /* set the server's ep to use the cm from params. * (it could be the previous one it had - the one used by the listener or * a new one set by the user) */ - status = UCS_CLASS_INIT(uct_cm_base_ep_t, &tcp_ep->super, params); + status = uct_cm_ep_set_common_data(&tcp_ep->super, params); if (status != UCS_OK) { - ucs_error("failed to initialize a uct_cm_base_ep_t endpoint"); - goto err_unblock; + ucs_error("failed to set common data for a uct_cm_base_ep_t endpoint"); + goto err; } - params_tcp_sockcm = ucs_derived_of(params->cm, uct_tcp_sockcm_t); - ucs_assert(uct_tcp_sockcm_ep_get_cm(tcp_ep) == params_tcp_sockcm); - status = UCT_CM_SET_CB(params, UCT_EP_PARAM_FIELD_SOCKADDR_NOTIFY_CB_SERVER, tcp_ep->super.server.notify_cb, params->sockaddr_cb_server, uct_cm_ep_server_conn_notify_callback_t, ucs_empty_function); if (status != UCS_OK) { - goto err_unblock; + goto err; } /* the server's endpoint was already created by the listener, return it */ - *ep_p = &tcp_ep->super.super.super; - tcp_ep->state |= UCT_TCP_SOCKCM_EP_SERVER_CREATED; - - UCS_ASYNC_UNBLOCK(tcp_sockcm->super.iface.worker->async); + *ep_p = &tcp_ep->super.super.super; + params_tcp_sockcm = ucs_derived_of(params->cm, uct_tcp_sockcm_t); if (&tcp_sockcm->super != params->cm) { new_async_ctx = params_tcp_sockcm->super.iface.worker->async; @@ -855,20 +1014,59 @@ static ucs_status_t uct_tcp_sockcm_ep_server_create(uct_tcp_sockcm_ep_t *tcp_ep, goto err; } + /* set the server's ep to use the iface from the cm in params */ + uct_ep_set_iface(&tcp_ep->super.super.super, ¶ms->cm->iface.super); + + status = uct_base_ep_stats_reset(&tcp_ep->super.super, ¶ms->cm->iface); + if (status != UCS_OK) { + ucs_error("failed to reset the stats on ep %p: %s", + tcp_ep, ucs_status_string(status)); + goto err; + } + ucs_trace("moved tcp_sockcm ep %p from cm %p to cm %p", tcp_ep, tcp_sockcm, params_tcp_sockcm); } + ucs_assert(uct_tcp_sockcm_ep_get_cm(tcp_ep) == params_tcp_sockcm); ucs_trace("server completed endpoint creation (fd=%d cm=%p state=%d)", tcp_ep->fd, params_tcp_sockcm, tcp_ep->state); /* now that the server's ep was created, can try to send data */ - ucs_async_modify_handler(tcp_ep->fd, UCS_EVENT_SET_EVWRITE | UCS_EVENT_SET_EVREAD); - return UCS_OK; + ucs_async_modify_handler(tcp_ep->fd, UCS_EVENT_SET_EVWRITE | + UCS_EVENT_SET_EVREAD); + + if (ucs_test_all_flags(params->field_mask, + UCT_EP_PARAM_FIELD_PRIV_DATA | + UCT_EP_PARAM_FIELD_PRIV_DATA_LENGTH)) { + 
priv_data = params->private_data; + priv_data_length = params->private_data_length; + } else if (params->field_mask & UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB) { + data_buf = ucs_malloc(tcp_sockcm->priv_data_len, "tcp_priv_data"); + if (data_buf == NULL) { + status = UCS_ERR_NO_MEMORY; + goto err; + } + + priv_data = data_buf; + priv_data_length = uct_tcp_sockcm_ep_pack_cb(tcp_ep, data_buf); + if (priv_data_length < 0) { + status = (ucs_status_t)priv_data_length; + goto err; + } + } else { + priv_data = NULL; + priv_data_length = 0; + } + + status = uct_tcp_sockcm_ep_pack_priv_data(tcp_ep, priv_data, + priv_data_length); + if (status == UCS_OK) { + tcp_ep->state |= UCT_TCP_SOCKCM_EP_SERVER_CREATED; + } -err_unblock: - UCS_ASYNC_UNBLOCK(tcp_sockcm->super.iface.worker->async); err: + ucs_free(data_buf); return status; } @@ -911,6 +1109,7 @@ UCS_CLASS_INIT_FUNC(uct_tcp_sockcm_ep_t, const uct_ep_params_t *params) ucs_status_t uct_tcp_sockcm_ep_create(const uct_ep_params_t *params, uct_ep_h *ep_p) { uct_tcp_sockcm_ep_t *tcp_ep; + ucs_async_context_t *async; ucs_status_t status; if (params->field_mask & UCT_EP_PARAM_FIELD_SOCKADDR) { @@ -918,12 +1117,15 @@ ucs_status_t uct_tcp_sockcm_ep_create(const uct_ep_params_t *params, uct_ep_h *e return UCS_CLASS_NEW(uct_tcp_sockcm_ep_t, ep_p, params); } else if (params->field_mask & UCT_EP_PARAM_FIELD_CONN_REQUEST) { tcp_ep = (uct_tcp_sockcm_ep_t*)params->conn_request; + async = uct_tcp_sockcm_ep_get_cm(tcp_ep)->super.iface.worker->async; + UCS_ASYNC_BLOCK(async); status = uct_tcp_sockcm_ep_server_create(tcp_ep, params, ep_p); if (status != UCS_OK) { UCS_CLASS_DELETE(uct_tcp_sockcm_ep_t, tcp_ep); } + UCS_ASYNC_UNBLOCK(async); return status; } else { ucs_error("either UCT_EP_PARAM_FIELD_SOCKADDR or UCT_EP_PARAM_FIELD_CONN_REQUEST " @@ -948,6 +1150,6 @@ UCS_CLASS_CLEANUP_FUNC(uct_tcp_sockcm_ep_t) UCS_ASYNC_UNBLOCK(tcp_sockcm->super.iface.worker->async); } -UCS_CLASS_DEFINE(uct_tcp_sockcm_ep_t, uct_base_ep_t); +UCS_CLASS_DEFINE(uct_tcp_sockcm_ep_t, uct_cm_base_ep_t); UCS_CLASS_DEFINE_NEW_FUNC(uct_tcp_sockcm_ep_t, uct_ep_t, const uct_ep_params_t *); UCS_CLASS_DEFINE_DELETE_FUNC(uct_tcp_sockcm_ep_t, uct_ep_t); diff --git a/src/uct/tcp/tcp_sockcm_ep.h b/src/uct/tcp/tcp_sockcm_ep.h index ae10d2d1107..d11216b1ec9 100644 --- a/src/uct/tcp/tcp_sockcm_ep.h +++ b/src/uct/tcp/tcp_sockcm_ep.h @@ -26,8 +26,10 @@ typedef enum uct_tcp_sockcm_ep_state { (debug flag) */ UCT_TCP_SOCKCM_EP_PACK_CB_FAILED = UCS_BIT(15), /* the upper layer's priv_pack_cb failed */ UCT_TCP_SOCKCM_EP_SERVER_REJECT_CALLED = UCS_BIT(16), /* ep on the server called reject API call */ - UCT_TCP_SOCKCM_EP_SERVER_REJECT_SENT = UCS_BIT(17) /* ep on the server sent the reject message to the client */ - + UCT_TCP_SOCKCM_EP_SERVER_REJECT_SENT = UCS_BIT(17), /* ep on the server sent the reject message to the client */ + UCT_TCP_SOCKCM_EP_RESOLVE_CB_FAILED = UCS_BIT(18), /* the upper layer's resolve_cb failed */ + UCT_TCP_SOCKCM_EP_RESOLVE_CB_INVOKED = UCS_BIT(19), /* resolve_cb invoked */ + UCT_TCP_SOCKCM_EP_SERVER_CONN_REQ_CB_INVOKED = UCS_BIT(20) /* server ep was passed to a user via conn_req_cb */ } uct_tcp_sockcm_ep_state_t; @@ -63,6 +65,9 @@ void uct_tcp_sockcm_ep_close_fd(int *fd); ucs_status_t uct_tcp_sockcm_ep_create(const uct_ep_params_t *params, uct_ep_h* ep_p); +ucs_status_t uct_tcp_sockcm_ep_connect(uct_ep_h ep, + const uct_ep_connect_params_t *params); + ucs_status_t uct_tcp_sockcm_ep_disconnect(uct_ep_h ep, unsigned flags); ucs_status_t uct_tcp_sockcm_ep_send(uct_tcp_sockcm_ep_t *cep); diff --git 
a/src/uct/ugni/base/ugni_iface.c b/src/uct/ugni/base/ugni_iface.c index f4a6b1ec84b..a25a44eaf07 100644 --- a/src/uct/ugni/base/ugni_iface.c +++ b/src/uct/ugni/base/ugni_iface.c @@ -75,8 +75,9 @@ UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker, ucs_assert(params->open_mode & UCT_IFACE_OPEN_MODE_DEVICE); - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, uct_ugni_iface_ops, md, worker, - params, tl_config UCS_STATS_ARG(params->stats_root) + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, uct_ugni_iface_ops, NULL, md, + worker, params, + tl_config UCS_STATS_ARG(params->stats_root) UCS_STATS_ARG(UCT_UGNI_MD_NAME)); dev = uct_ugni_device_by_name(params->mode.device.dev_name); if (NULL == dev) { diff --git a/src/uct/ugni/base/ugni_md.c b/src/uct/ugni/base/ugni_md.c index a937ddb340d..48c6bc5038c 100644 --- a/src/uct/ugni/base/ugni_md.c +++ b/src/uct/ugni/base/ugni_md.c @@ -39,7 +39,8 @@ static ucs_status_t uct_ugni_md_query(uct_md_h md, uct_md_attr_t *md_attr) md_attr->cap.flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_MEMH | UCT_MD_FLAG_NEED_RKEY; - md_attr->cap.reg_mem_types = UCS_MEMORY_TYPES_CPU_ACCESSIBLE; + md_attr->cap.reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); + md_attr->cap.alloc_mem_types = 0; md_attr->cap.access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); md_attr->cap.detect_mem_types = 0; md_attr->cap.max_alloc = 0; @@ -180,24 +181,22 @@ uct_ugni_md_open(uct_component_h component,const char *md_name, const uct_md_config_t *md_config, uct_md_h *md_p) { ucs_status_t status = UCS_OK; + static uct_md_ops_t md_ops; + static uct_ugni_md_t md; pthread_mutex_lock(&uct_ugni_global_lock); - static uct_md_ops_t md_ops = { - .close = uct_ugni_md_close, - .query = uct_ugni_md_query, - .mem_alloc = (void*)ucs_empty_function, - .mem_free = (void*)ucs_empty_function, - .mem_reg = uct_ugni_mem_reg, - .mem_dereg = uct_ugni_mem_dereg, - .mkey_pack = uct_ugni_rkey_pack, - .detect_memory_type = ucs_empty_function_return_unsupported, - }; - - static uct_ugni_md_t md = { - .super.ops = &md_ops, - .super.component = &uct_ugni_component, - .ref_count = 0 - }; + md_ops.close = uct_ugni_md_close; + md_ops.query = uct_ugni_md_query; + md_ops.mem_alloc = (void*)ucs_empty_function; + md_ops.mem_free = (void*)ucs_empty_function; + md_ops.mem_reg = uct_ugni_mem_reg; + md_ops.mem_dereg = uct_ugni_mem_dereg; + md_ops.mkey_pack = uct_ugni_rkey_pack; + md_ops.detect_memory_type = ucs_empty_function_return_unsupported; + + md.super.ops = &md_ops; + md.super.component = &uct_ugni_component; + md.ref_count = 0; *md_p = &md.super; diff --git a/src/uct/ugni/rdma/ugni_rdma_ep.c b/src/uct/ugni/rdma/ugni_rdma_ep.c index e5a1a244802..044c0fb20d3 100644 --- a/src/uct/ugni/rdma/ugni_rdma_ep.c +++ b/src/uct/ugni/rdma/ugni_rdma_ep.c @@ -13,10 +13,14 @@ #include #define UCT_CHECK_PARAM_IOV(_iov, _iovcnt, _buffer, _length, _memh) \ + void *_buffer; \ + size_t _length; \ + uct_mem_h _memh; \ + \ UCT_CHECK_PARAM(1 == _iovcnt, "iov[iovcnt] has to be 1 at this time"); \ - void *_buffer = _iov[0].buffer; \ - size_t _length = _iov[0].length; \ - uct_mem_h _memh = _iov[0].memh; + _buffer = _iov[0].buffer; \ + _length = _iov[0].length; \ + _memh = _iov[0].memh; /* Endpoint operations */ static inline void uct_ugni_invoke_orig_comp(uct_ugni_rdma_fetch_desc_t *fma_desc, ucs_status_t status) @@ -759,14 +763,17 @@ ucs_status_t uct_ugni_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t UCS_CLASS_INIT_FUNC(uct_ugni_rdma_ep_t, const uct_ep_params_t *params) { - UCS_CLASS_CALL_SUPER_INIT(uct_ugni_ep_t, params); - 
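/*
 * Illustrative sketch (not from this patch): why the UCT_CHECK_PARAM_IOV
 * macro above hoists its declarations - a macro that declares variables must
 * emit all declarations before any statement so that its expansion remains
 * valid under strict C90 rules. Minimal hypothetical reproduction.
 */
#include <stddef.h>

typedef struct {
    void   *buffer;
    size_t  length;
} example_iov_t;

#define EXAMPLE_UNPACK_IOV(_iov, _buf, _len) \
    void *_buf;                              \
    size_t _len;                             \
                                             \
    _buf = (_iov)[0].buffer;                 \
    _len = (_iov)[0].length

static size_t example_first_length(const example_iov_t *iov)
{
    EXAMPLE_UNPACK_IOV(iov, buf, len);
    (void)buf;
    return len;
}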
UCT_EP_PARAMS_CHECK_DEV_IFACE_ADDRS(params); ucs_status_t rc; + uct_ugni_iface_t *iface; + const uct_sockaddr_ugni_t *iface_addr; + const uct_devaddr_ugni_t *ugni_dev_addr; - uct_ugni_iface_t *iface = ucs_derived_of(params->iface, uct_ugni_iface_t); - const uct_sockaddr_ugni_t *iface_addr = (const uct_sockaddr_ugni_t*)params->iface_addr; - const uct_devaddr_ugni_t *ugni_dev_addr = (const uct_devaddr_ugni_t *)params->dev_addr; + UCS_CLASS_CALL_SUPER_INIT(uct_ugni_ep_t, params); + UCT_EP_PARAMS_CHECK_DEV_IFACE_ADDRS(params); + iface = ucs_derived_of(params->iface, uct_ugni_iface_t); + iface_addr = (const uct_sockaddr_ugni_t*)params->iface_addr; + ugni_dev_addr = (const uct_devaddr_ugni_t *)params->dev_addr; ucs_debug("Connecting RDMA ep %p", self); rc = ugni_connect_ep(&self->super, iface, iface_addr, ugni_dev_addr); diff --git a/src/uct/ugni/rdma/ugni_rdma_iface.c b/src/uct/ugni/rdma/ugni_rdma_iface.c index 69e1ebac557..57775aef812 100644 --- a/src/uct/ugni/rdma/ugni_rdma_iface.c +++ b/src/uct/ugni/rdma/ugni_rdma_iface.c @@ -186,6 +186,7 @@ static uct_iface_ops_t uct_ugni_aries_rdma_iface_ops = { .ep_get_bcopy = uct_ugni_ep_get_bcopy, .ep_get_zcopy = uct_ugni_ep_get_zcopy, .ep_am_short = uct_ugni_ep_am_short, + .ep_am_short_iov = uct_base_ep_am_short_iov, .ep_atomic_cswap64 = uct_ugni_ep_atomic_cswap64, .ep_atomic_cswap32 = uct_ugni_ep_atomic_cswap32, .ep_atomic64_post = uct_ugni_ep_atomic64_post, @@ -217,6 +218,7 @@ static uct_iface_ops_t uct_ugni_gemini_rdma_iface_ops = { .ep_get_bcopy = uct_ugni_ep_get_bcopy, .ep_get_zcopy = uct_ugni_ep_get_zcopy, .ep_am_short = uct_ugni_ep_am_short, + .ep_am_short_iov = uct_base_ep_am_short_iov, .ep_atomic_cswap64 = uct_ugni_ep_atomic_cswap64, .ep_pending_add = uct_ugni_ep_pending_add, .ep_pending_purge = uct_ugni_ep_pending_purge, diff --git a/src/uct/ugni/smsg/ugni_smsg_iface.c b/src/uct/ugni/smsg/ugni_smsg_iface.c index 2e658e445b6..a9ed245e568 100644 --- a/src/uct/ugni/smsg/ugni_smsg_iface.c +++ b/src/uct/ugni/smsg/ugni_smsg_iface.c @@ -229,6 +229,7 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ugni_smsg_iface_t) static uct_iface_ops_t uct_ugni_smsg_iface_ops = { .ep_am_short = uct_ugni_smsg_ep_am_short, + .ep_am_short_iov = uct_base_ep_am_short_iov, .ep_am_bcopy = uct_ugni_smsg_ep_am_bcopy, .ep_pending_add = uct_ugni_ep_pending_add, .ep_pending_purge = uct_ugni_ep_pending_purge, diff --git a/src/uct/ugni/udt/ugni_udt_ep.c b/src/uct/ugni/udt/ugni_udt_ep.c index c20999ceed6..14be027748e 100644 --- a/src/uct/ugni/udt/ugni_udt_ep.c +++ b/src/uct/ugni/udt/ugni_udt_ep.c @@ -84,12 +84,17 @@ void uct_ugni_udt_ep_pending_purge(uct_ep_h tl_ep, static UCS_CLASS_INIT_FUNC(uct_ugni_udt_ep_t, const uct_ep_params_t *params) { + ucs_status_t rc; + uct_ugni_iface_t *iface; + const uct_sockaddr_ugni_t *iface_addr; + const uct_devaddr_ugni_t *ugni_dev_addr; + UCS_CLASS_CALL_SUPER_INIT(uct_ugni_ep_t, params); UCT_EP_PARAMS_CHECK_DEV_IFACE_ADDRS(params); - uct_ugni_iface_t *iface = ucs_derived_of(params->iface, uct_ugni_iface_t); - const uct_sockaddr_ugni_t *iface_addr = (const uct_sockaddr_ugni_t*)params->iface_addr; - const uct_devaddr_ugni_t *ugni_dev_addr = (const uct_devaddr_ugni_t *)params->dev_addr; - ucs_status_t rc; + + iface = ucs_derived_of(params->iface, uct_ugni_iface_t); + iface_addr = (const uct_sockaddr_ugni_t*)params->iface_addr; + ugni_dev_addr = (const uct_devaddr_ugni_t *)params->dev_addr; ucs_debug("Connecting UDT ep %p", self); rc = ugni_connect_ep(&self->super, iface, iface_addr, ugni_dev_addr); @@ -224,6 +229,7 @@ ucs_status_t 
uct_ugni_udt_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t heade { uct_ugni_udt_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_udt_iface_t); uct_ugni_udt_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_udt_ep_t); + ucs_status_t status; UCS_ASYNC_BLOCK(iface->super.super.worker->async); @@ -231,8 +237,8 @@ ucs_status_t uct_ugni_udt_ep_am_short(uct_ep_h tl_ep, uint8_t id, uint64_t heade iface->config.udt_seg_size - sizeof(header) - sizeof(uct_ugni_udt_header_t), "am_short"); ucs_trace_data("AM_SHORT [%p] am_id: %d buf=%p length=%u", iface, id, payload, length); - ucs_status_t status = uct_ugni_udt_ep_am_common_send(UCT_UGNI_UDT_AM_SHORT, ep, iface, id, length, - header, payload, NULL, NULL); + status = uct_ugni_udt_ep_am_common_send(UCT_UGNI_UDT_AM_SHORT, ep, iface, id, length, + header, payload, NULL, NULL); UCS_ASYNC_UNBLOCK(iface->super.super.worker->async); @@ -245,13 +251,14 @@ ssize_t uct_ugni_udt_ep_am_bcopy(uct_ep_h tl_ep, uint8_t id, { uct_ugni_udt_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_ugni_udt_iface_t); uct_ugni_udt_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_udt_ep_t); + ucs_status_t status; UCS_ASYNC_BLOCK(iface->super.super.worker->async); ucs_trace_data("AM_BCOPY [%p] am_id: %d buf=%p", iface, id, arg ); - ucs_status_t status = uct_ugni_udt_ep_am_common_send(UCT_UGNI_UDT_AM_BCOPY, ep, iface, id, 0, - 0, NULL, pack_cb, arg); + status = uct_ugni_udt_ep_am_common_send(UCT_UGNI_UDT_AM_BCOPY, ep, iface, id, 0, + 0, NULL, pack_cb, arg); UCS_ASYNC_UNBLOCK(iface->super.super.worker->async); return status; diff --git a/src/uct/ugni/udt/ugni_udt_iface.c b/src/uct/ugni/udt/ugni_udt_iface.c index 7fde0b38ee7..5a101eacda7 100644 --- a/src/uct/ugni/udt/ugni_udt_iface.c +++ b/src/uct/ugni/udt/ugni_udt_iface.c @@ -355,6 +355,7 @@ static UCS_CLASS_DEFINE_DELETE_FUNC(uct_ugni_udt_iface_t, uct_iface_t); static uct_iface_ops_t uct_ugni_udt_iface_ops = { .ep_am_short = uct_ugni_udt_ep_am_short, + .ep_am_short_iov = uct_base_ep_am_short_iov, .ep_am_bcopy = uct_ugni_udt_ep_am_bcopy, .ep_pending_add = uct_ugni_udt_ep_pending_add, .ep_pending_purge = uct_ugni_udt_ep_pending_purge, diff --git a/test/apps/Makefile.am b/test/apps/Makefile.am index 4e9c7caad13..2ed94c8a16f 100644 --- a/test/apps/Makefile.am +++ b/test/apps/Makefile.am @@ -20,7 +20,8 @@ noinst_PROGRAMS = \ test_ucp_dlopen \ test_ucs_dlopen \ test_link_map \ - test_dlopen_cfg_print + test_dlopen_cfg_print \ + test_init_mt objdir = $(shell sed -n -e 's/^objdir=\(.*\)$$/\1/p' $(LIBTOOL)) @@ -48,6 +49,35 @@ test_dlopen_cfg_print_CPPFLAGS = $(BASE_CPPFLAGS) -g \ test_dlopen_cfg_print_CFLAGS = $(BASE_CFLAGS) test_dlopen_cfg_print_LDADD = -ldl +test_init_mt_SOURCES = test_init_mt.c +test_init_mt_CPPFLAGS = $(BASE_CPPFLAGS) +test_init_mt_CFLAGS = $(BASE_CFLAGS) $(OPENMP_CFLAGS) +test_init_mt_LDADD = $(top_builddir)/src/ucp/libucp.la + +if HAVE_CUDA +noinst_PROGRAMS += test_cuda_hook_dynamic + +test_cuda_hook_dynamic_SOURCES = test_cuda_hook.c +test_cuda_hook_dynamic_CPPFLAGS = $(BASE_CPPFLAGS) $(CUDA_CPPFLAGS) +test_cuda_hook_dynamic_CFLAGS = $(BASE_CFLAGS) +test_cuda_hook_dynamic_LDFLAGS = $(CUDA_LDFLAGS) +test_cuda_hook_dynamic_LDADD = $(top_builddir)/src/ucp/libucp.la \ + $(top_builddir)/src/ucm/libucm.la \ + $(CUDA_LIBS) + +if HAVE_CUDA_STATIC +noinst_PROGRAMS += test_cuda_hook_static +test_cuda_hook_static_SOURCES = test_cuda_hook.c +test_cuda_hook_static_CPPFLAGS = $(BASE_CPPFLAGS) $(CUDA_CPPFLAGS) +test_cuda_hook_static_CFLAGS = $(BASE_CFLAGS) +test_cuda_hook_static_LDFLAGS = $(CUDA_LDFLAGS) +test_cuda_hook_static_LDADD = 
$(top_builddir)/src/ucp/libucp.la \ + $(top_builddir)/src/ucm/libucm.la \ + $(CUDA_STATIC_LIBS) -lcuda -lrt -ldl -lpthread +endif + +endif + if HAVE_TCMALLOC noinst_PROGRAMS += test_tcmalloc test_tcmalloc_SOURCES = test_tcmalloc.c diff --git a/test/apps/iodemo/Makefile.am b/test/apps/iodemo/Makefile.am index efbfb15e5ea..164aeb529c0 100644 --- a/test/apps/iodemo/Makefile.am +++ b/test/apps/iodemo/Makefile.am @@ -9,14 +9,27 @@ bin_PROGRAMS = io_demo noinst_HEADERS = \ ucx_wrapper.h +io_demo_LDFLAGS = -ldl + +if HAVE_CUDA +io_demo_CUDA_LIBS = $(CUDA_LIBS) +io_demo_LDFLAGS += $(CUDA_LDFLAGS) +io_demo_CUDA_CPPFLAGS = $(CUDA_CPPFLAGS) -DHAVE_CUDA +else +io_demo_CUDA_LIBS = +io_demo_CUDA_CPPFLAGS = +endif + + io_demo_CXXFLAGS = \ $(BASE_CXXFLAGS) -io_demo_CPPFLAGS = $(BASE_CPPFLAGS) +io_demo_CPPFLAGS = $(BASE_CPPFLAGS) $(io_demo_CUDA_CPPFLAGS) -io_demo_LDADD = \ +io_demo_LDADD = \ $(top_builddir)/src/ucs/libucs.la \ - $(top_builddir)/src/ucp/libucp.la + $(top_builddir)/src/ucp/libucp.la \ + $(io_demo_CUDA_LIBS) io_demo_SOURCES = \ ucx_wrapper.cc \ diff --git a/test/apps/iodemo/io_demo.cc b/test/apps/iodemo/io_demo.cc index 02b7bff8e56..b430db8c809 100644 --- a/test/apps/iodemo/io_demo.cc +++ b/test/apps/iodemo/io_demo.cc @@ -23,6 +23,12 @@ #include #include #include +#include <limits> + +#ifdef HAVE_CUDA +#include <cuda_runtime.h> +#include <cuda.h> +#endif #define ALIGNMENT 4096 #define BUSY_PROGRESS_COUNT 1000 @@ -50,8 +56,8 @@ typedef struct { int port_num; double connect_timeout; double client_timeout; - long client_retries; - double client_retry_interval; + long retries; + double retry_interval; double client_runtime_limit; double print_interval; size_t iomsg_size; @@ -60,30 +66,33 @@ typedef struct { size_t chunk_size; long iter_count; long window_size; + long conn_window_size; std::vector<io_op_t> operations; unsigned random_seed; size_t num_offcache_buffers; bool verbose; bool validate; + bool use_am; + ucs_memory_type_t memory_type; } options_t; #define LOG_PREFIX "[DEMO]" #define LOG UcxLog(LOG_PREFIX) #define VERBOSE_LOG UcxLog(LOG_PREFIX, _test_opts.verbose) - -template <class T> -class MemoryPool { +template <typename BufferType, bool use_offcache = false> class ObjectPool { public: - MemoryPool(size_t buffer_size, const std::string& name, size_t offcache = 0) : - _num_allocated(0), _buffer_size(buffer_size), _name(name) { - + ObjectPool(size_t buffer_size, const std::string &name, + size_t offcache = 0) : + _buffer_size(buffer_size), _num_allocated(0), _name(name) + { for (size_t i = 0; i < offcache; ++i) { _offcache_queue.push(get_free()); } } - ~MemoryPool() { + ~ObjectPool() + { while (!_offcache_queue.empty()) { _free_stack.push_back(_offcache_queue.front()); _offcache_queue.pop(); @@ -99,8 +108,9 @@ class MemoryPool { } } - inline T* get() { - T* item = get_free(); + inline BufferType *get() + { + BufferType *item = get_free(); if (use_offcache && !_offcache_queue.empty()) { _offcache_queue.push(item); @@ -111,7 +121,8 @@ class MemoryPool { return item; } - inline void put(T* item) { + inline void put(BufferType *item) + { _free_stack.push_back(item); } @@ -119,12 +130,26 @@ class MemoryPool { return _num_allocated; } + virtual ucs_memory_type_t memory_type() const + { + return UCS_MEMORY_TYPE_HOST; + } + +protected: + size_t buffer_size() const + { + return _buffer_size; + } + + virtual BufferType *construct() = 0; + private: - inline T* get_free() { - T* item; + inline BufferType *get_free() + { + BufferType *item; if (_free_stack.empty()) { - item = new T(_buffer_size, *this); + item = construct(); _num_allocated++; } else { item = _free_stack.back(); @@ -134,11 +159,52 @@ 
class MemoryPool { } private: - std::vector<T*> _free_stack; - std::queue<T*> _offcache_queue; - uint32_t _num_allocated; - size_t _buffer_size; - std::string _name; + size_t _buffer_size; + std::vector<BufferType*> _free_stack; + std::queue<BufferType*> _offcache_queue; + uint32_t _num_allocated; + std::string _name; +}; + +template <typename BufferType> +class MemoryPool : public ObjectPool<BufferType, false> { +public: + MemoryPool(size_t buffer_size, const std::string &name, + size_t offcache = 0) : + ObjectPool<BufferType, false>::ObjectPool(buffer_size, name, + offcache) + { + } + +public: + virtual BufferType *construct() + { + return new BufferType(this->buffer_size(), *this); + } +}; + +template <typename BufferType> +class BufferMemoryPool : public ObjectPool<BufferType, true> { +public: + BufferMemoryPool(size_t buffer_size, const std::string &name, + ucs_memory_type_t memory_type, size_t offcache = 0) : + ObjectPool<BufferType, true>(buffer_size, name, offcache), + _memory_type(memory_type) + { + } + + virtual BufferType *construct() + { + return BufferType::allocate(this->buffer_size(), *this, _memory_type); + } + + virtual ucs_memory_type_t memory_type() const + { + return _memory_type; + } + +private: + ucs_memory_type_t _memory_type; }; /** @@ -170,21 +236,75 @@ class IoDemoRandom { } } - static inline void fill(unsigned &seed, void *buffer, size_t size) { + template <typename unsigned_type> + static inline unsigned_type urand(unsigned_type max) + { + assert(max < std::numeric_limits<unsigned_type>::max()); + assert(unsigned_type(0) == std::numeric_limits<unsigned_type>::min()); + + return rand(_seed, unsigned_type(0), max - 1); + } + + static void *get_host_fill_buffer(void *buffer, size_t size, + ucs_memory_type_t memory_type) + { + static std::vector<uint8_t> _buffer; + + if (memory_type == UCS_MEMORY_TYPE_CUDA) { + _buffer.resize(size); + return _buffer.data(); + } + + return buffer; + } + + static void fill_commit(void *buffer, void *fill_buffer, size_t size, + ucs_memory_type_t memory_type) + { +#ifdef HAVE_CUDA + if (memory_type == UCS_MEMORY_TYPE_CUDA) { + cudaMemcpy(buffer, fill_buffer, size, cudaMemcpyDefault); + } +#endif + } + + static inline void fill(unsigned &seed, void *buffer, size_t size, + ucs_memory_type_t memory_type) + { + void *fill_buffer = get_host_fill_buffer(buffer, size, memory_type); size_t body_count = size / sizeof(uint64_t); size_t tail_count = size & (sizeof(uint64_t) - 1); - uint64_t *body = reinterpret_cast<uint64_t*>(buffer); + uint64_t *body = reinterpret_cast<uint64_t*>(fill_buffer); uint8_t *tail = reinterpret_cast<uint8_t*>(body + body_count); fill(seed, body, body_count); fill(seed, tail, tail_count); + + fill_commit(buffer, fill_buffer, size, memory_type); + } + + static const void *get_host_validate_buffer(const void *buffer, size_t size, + ucs_memory_type_t memory_type) + { +#ifdef HAVE_CUDA + static std::vector<uint8_t> _buffer; + + if (memory_type == UCS_MEMORY_TYPE_CUDA) { + _buffer.resize(size); + cudaMemcpy(_buffer.data(), buffer, size, cudaMemcpyDefault); + return _buffer.data(); + } +#endif + return buffer; } static inline size_t validate(unsigned &seed, const void *buffer, - size_t size) { + size_t size, ucs_memory_type_t memory_type) + { size_t body_count = size / sizeof(uint64_t); size_t tail_count = size & (sizeof(uint64_t) - 1); - const uint64_t *body = reinterpret_cast<const uint64_t*>(buffer); + const uint64_t *body = reinterpret_cast<const uint64_t*>( + get_host_validate_buffer(buffer, size, memory_type)); const uint8_t *tail = reinterpret_cast<const uint8_t*>(body + body_count); size_t err_pos = validate(seed, body, body_count); @@ -201,15 +321,17 @@ class IoDemoRandom { } private: - template <typename T> - static inline void fill(unsigned &seed, T *buffer, size_t count) { + template <typename T> + static inline void fill(unsigned &seed, T 
*buffer, size_t count) + { for (size_t i = 0; i < count; ++i) { buffer[i] = rand(seed); } } - template <typename T> - static inline size_t validate(unsigned &seed, const T *buffer, size_t count) { + template <typename T> + static inline size_t validate(unsigned &seed, const T *buffer, size_t count) + { for (size_t i = 0; i < count; ++i) { if (buffer[i] != rand(seed)) { return i; @@ -246,47 +368,113 @@ class P2pDemoCommon : public UcxContext { class Buffer { public: - Buffer(size_t size, MemoryPool<Buffer>& pool) : - _capacity(size), _buffer(memalign(ALIGNMENT, size)), _size(0), - _pool(pool) { - if (_buffer == NULL) { + Buffer(void *buffer, size_t size, BufferMemoryPool<Buffer> &pool, + ucs_memory_type_t memory_type = UCS_MEMORY_TYPE_HOST) : + _capacity(size), + _buffer(buffer), + _size(0), + _pool(pool), + _memory_type(memory_type) + { + } + + static Buffer *allocate(size_t size, BufferMemoryPool<Buffer> &pool, + ucs_memory_type_t memory_type) + { +#ifdef HAVE_CUDA + cudaError_t cerr; +#endif + void *buffer; + + switch (memory_type) { +#ifdef HAVE_CUDA + case UCS_MEMORY_TYPE_CUDA: + cerr = cudaMalloc(&buffer, size); + if (cerr != cudaSuccess) { + buffer = NULL; + } + break; + case UCS_MEMORY_TYPE_CUDA_MANAGED: + cerr = cudaMallocManaged(&buffer, size, cudaMemAttachGlobal); + if (cerr != cudaSuccess) { + buffer = NULL; + } + break; +#endif + case UCS_MEMORY_TYPE_HOST: + buffer = memalign(ALIGNMENT, size); + break; + default: + LOG << "ERROR: Unsupported memory type requested: " + << ucs_memory_type_names[memory_type]; + abort(); + } + if (buffer == NULL) { throw std::bad_alloc(); } + + return new Buffer(buffer, size, pool, memory_type); } - ~Buffer() { - free(_buffer); + ~Buffer() + { + switch (_memory_type) { +#ifdef HAVE_CUDA + case UCS_MEMORY_TYPE_CUDA: + case UCS_MEMORY_TYPE_CUDA_MANAGED: + cudaFree(_buffer); + break; +#endif + case UCS_MEMORY_TYPE_HOST: + free(_buffer); + break; + default: + /* Unreachable - would fail in ctor */ + abort(); + } } - void release() { + inline size_t capacity() const + { + return _capacity; + } + + void release() + { + _pool.put(this); } - inline void *buffer(size_t offset = 0) const { + inline void *buffer(size_t offset = 0) const + { return (uint8_t*)_buffer + offset; } - inline void resize(size_t size) { + inline void resize(size_t size) + { assert(size <= _capacity); _size = size; } - inline size_t size() const { + inline size_t size() const + { return _size; } public: - const size_t _capacity; + const size_t _capacity; private: - void* _buffer; - size_t _size; - MemoryPool<Buffer>& _pool; + void *_buffer; + size_t _size; + BufferMemoryPool<Buffer> &_pool; + ucs_memory_type_t _memory_type; }; class BufferIov { public: - BufferIov(size_t size, MemoryPool<BufferIov>& pool) : _pool(pool) { + BufferIov(size_t size, MemoryPool<BufferIov> &pool) : + _memory_type(UCS_MEMORY_TYPE_UNKNOWN), _pool(pool) + { _iov.reserve(size); } @@ -294,12 +482,14 @@ class P2pDemoCommon : public UcxContext { return _iov.size(); } - void init(size_t data_size, MemoryPool<Buffer> &chunk_pool, - uint32_t sn, bool validate) { + void init(size_t data_size, BufferMemoryPool<Buffer> &chunk_pool, + uint32_t sn, bool validate) + { assert(_iov.empty()); + _memory_type = chunk_pool.memory_type(); Buffer *chunk = chunk_pool.get(); - _iov.resize(get_chunk_cnt(data_size, chunk->_capacity)); + _iov.resize(get_chunk_cnt(data_size, chunk->capacity())); size_t remaining = init_chunk(0, chunk, data_size); for (size_t i = 1; i < _iov.size(); ++i) { @@ -309,11 +499,12 @@ class P2pDemoCommon : public UcxContext { assert(remaining == 0); if (validate) { - fill_data(sn); + fill_data(sn, _memory_type); } 
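/*
 * Illustrative sketch (not from this patch): the iov sizing above,
 * get_chunk_cnt(data_size, chunk->capacity()), is a ceiling division - the
 * payload is split into fixed-capacity chunks and the final chunk is resized
 * to the remainder. Hypothetical helper name.
 */
#include <stddef.h>

static size_t example_chunk_cnt(size_t data_size, size_t chunk_capacity)
{
    /* e.g. 10240 bytes with 4096-byte chunks -> 3 chunks (4096+4096+2048) */
    return (data_size + chunk_capacity - 1) / chunk_capacity;
}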
} - inline Buffer& operator[](size_t i) const { + inline Buffer &operator[](size_t i) const + { return *_iov[i]; } @@ -332,7 +523,8 @@ for (size_t iov_err_pos = 0, i = 0; i < _iov.size(); ++i) { size_t buf_err_pos = IoDemoRandom::validate(seed, _iov[i]->buffer(), - _iov[i]->size()); + _iov[i]->size(), + _memory_type); iov_err_pos += buf_err_pos; if (buf_err_pos < _iov[i]->size()) { return iov_err_pos; @@ -349,17 +541,20 @@ private: size_t init_chunk(size_t i, Buffer *chunk, size_t remaining) { _iov[i] = chunk; - _iov[i]->resize(std::min(_iov[i]->_capacity, remaining)); + _iov[i]->resize(std::min(_iov[i]->capacity(), remaining)); return remaining - _iov[i]->size(); } - void fill_data(unsigned seed) { + void fill_data(unsigned seed, ucs_memory_type_t memory_type) + { for (size_t i = 0; i < _iov.size(); ++i) { - IoDemoRandom::fill(seed, _iov[i]->buffer(), _iov[i]->size()); + IoDemoRandom::fill(seed, _iov[i]->buffer(), _iov[i]->size(), + memory_type); } } static const size_t _npos = static_cast<size_t>(-1); + ucs_memory_type_t _memory_type; std::vector<Buffer*> _iov; MemoryPool<BufferIov>& _pool; }; @@ -385,7 +580,7 @@ class P2pDemoCommon : public UcxContext { if (validate) { void *tail = reinterpret_cast<void*>(m + 1); size_t tail_size = _io_msg_size - sizeof(*m); - IoDemoRandom::fill(sn, tail, tail_size); + IoDemoRandom::fill(sn, tail, tail_size, UCS_MEMORY_TYPE_HOST); } } @@ -415,13 +610,15 @@ public: SendCompleteCallback(size_t buffer_size, MemoryPool<SendCompleteCallback>& pool) : - _op_counter(NULL), _counter(0), _iov(NULL), _pool(pool) { + _op_counter(NULL), _counter(0), _iov(NULL), _io_msg(NULL), + _pool(pool) { } - void init(BufferIov* iov, long* op_counter) { + void init(BufferIov* iov, long* op_counter, IoMessage *io_msg = NULL) { _op_counter = op_counter; _counter = iov->size(); _iov = iov; + _io_msg = io_msg; assert(_counter > 0); } @@ -434,6 +631,10 @@ ++(*_op_counter); } + if (_io_msg != NULL) { + (*_io_msg)(status); + } + _iov->release(); _pool.put(this); } @@ -442,18 +643,22 @@ long* _op_counter; size_t _counter; BufferIov* _iov; + IoMessage* _io_msg; MemoryPool<SendCompleteCallback>& _pool; }; - P2pDemoCommon(const options_t& test_opts) : - UcxContext(test_opts.iomsg_size, test_opts.connect_timeout), + P2pDemoCommon(const options_t &test_opts) : + UcxContext(test_opts.iomsg_size, test_opts.connect_timeout, + test_opts.use_am), _test_opts(test_opts), _io_msg_pool(test_opts.iomsg_size, "io messages"), _send_callback_pool(0, "send callbacks"), _data_buffers_pool(get_chunk_cnt(test_opts.max_data_size, - test_opts.chunk_size), "data iovs"), + test_opts.chunk_size), + "data iovs"), _data_chunks_pool(test_opts.chunk_size, "data chunks", - test_opts.num_offcache_buffers) { + test_opts.memory_type) + { } const options_t& opts() const { @@ -513,7 +718,8 @@ class P2pDemoCommon : public UcxContext { const void *buf = msg + 1; size_t buf_size = iomsg_size - sizeof(*msg); - size_t err_pos = IoDemoRandom::validate(seed, buf, buf_size); + size_t err_pos = IoDemoRandom::validate(seed, buf, buf_size, + UCS_MEMORY_TYPE_HOST); if (err_pos < buf_size) { LOG << "ERROR: io msg data corruption at " << err_pos << " position"; abort(); @@ -522,19 +728,13 @@ static void validate(const iomsg_t *msg, uint32_t sn, size_t iomsg_size) { if (sn != msg->sn) { - LOG << 
"ERROR: io msg sn mismatch " << sn << " != " << msg->sn; abort(); } validate(msg, iomsg_size); } - static double get_time() { - struct timeval tv; - gettimeofday(&tv, NULL); - return tv.tv_sec + (tv.tv_usec * 1e-6); - } - private: bool send_io_message(UcxConnection *conn, IoMessage *msg) { VERBOSE_LOG << "sending IO " << io_op_names[msg->msg()->op] << ", sn " @@ -555,7 +755,7 @@ class P2pDemoCommon : public UcxContext { MemoryPool _io_msg_pool; MemoryPool _send_callback_pool; MemoryPool _data_buffers_pool; - MemoryPool _data_chunks_pool; + BufferMemoryPool _data_chunks_pool; }; class DemoServer : public P2pDemoCommon { @@ -585,8 +785,15 @@ class DemoServer : public P2pDemoCommon { } if (status == UCS_OK) { - _server->send_io_message(_conn, IO_WRITE_COMP, _sn, 0, - _server->opts().validate); + if (_server->opts().use_am) { + IoMessage *m = _server->_io_msg_pool.get(); + m->init(IO_WRITE_COMP, _sn, 0, _server->opts().validate); + _conn->send_am(m->buffer(), _server->opts().iomsg_size, + NULL, 0ul, m); + } else { + _server->send_io_message(_conn, IO_WRITE_COMP, _sn, 0, + _server->opts().validate); + } if (_server->opts().validate) { validate(*_iov, _sn); } @@ -630,7 +837,33 @@ class DemoServer : public P2pDemoCommon { listen_addr.sin_addr.s_addr = INADDR_ANY; listen_addr.sin_port = htons(opts().port_num); - listen((const struct sockaddr*)&listen_addr, sizeof(listen_addr)); + for (long retry = 1;; ++retry) { + if (listen((const struct sockaddr*)&listen_addr, + sizeof(listen_addr))) { + break; + } + + if (retry > opts().retries) { + return; + } + + { + UcxLog log(LOG_PREFIX); + log << "restarting listener on " + << UcxContext::sockaddr_str((struct sockaddr*)&listen_addr, + sizeof(listen_addr)) + << " in " << opts().retry_interval << " seconds (retry " + << retry; + + if (opts().retries < std::numeric_limits::max()) { + log << "/" << opts().retries; + } + + log << ")"; + } + + sleep(opts().retry_interval); + } for (double prev_time = 0.0; ;) { try { @@ -667,6 +900,27 @@ class DemoServer : public P2pDemoCommon { send_io_message(conn, IO_READ_COMP, msg->sn, 0, opts().validate); } + void handle_io_am_read_request(UcxConnection* conn, const iomsg_t *msg) { + VERBOSE_LOG << "sending AM IO read data"; + assert(opts().max_data_size >= msg->data_size); + + IoMessage *m = _io_msg_pool.get(); + m->init(IO_READ_COMP, msg->sn, msg->data_size, opts().validate); + + BufferIov *iov = _data_buffers_pool.get(); + iov->init(msg->data_size, _data_chunks_pool, msg->sn, opts().validate); + + SendCompleteCallback *cb = _send_callback_pool.get(); + cb->init(iov, &_curr_state.read_count, m); + + assert(iov->size() == 1); + + // Send IO_READ_COMP as AM header and first iov element as payload + // (note that multi-iov send is not supported for IODEMO with AM yet) + conn->send_am(m->buffer(), opts().iomsg_size, (*iov)[0].buffer(), + (*iov)[0].size(), cb); + } + void handle_io_write_request(UcxConnection* conn, const iomsg_t *msg) { VERBOSE_LOG << "receiving IO write data"; assert(msg->data_size != 0); @@ -680,15 +934,31 @@ class DemoServer : public P2pDemoCommon { recv_data(conn, *iov, msg->sn, w); } + void handle_io_am_write_request(UcxConnection* conn, const iomsg_t *msg, + const UcxAmDesc &data_desc) { + VERBOSE_LOG << "receiving AM IO write data"; + assert(msg->data_size != 0); + + BufferIov *iov = _data_buffers_pool.get(); + IoWriteResponseCallback *w = _callback_pool.get(); + + iov->init(msg->data_size, _data_chunks_pool, msg->sn, opts().validate); + w->init(this, conn, msg->sn, iov, &_curr_state.write_count); + + 
assert(iov->size() == 1); + + conn->recv_am_data((*iov)[0].buffer(), (*iov)[0].size(), data_desc, w); + } + virtual void dispatch_connection_accepted(UcxConnection* conn) { ++_curr_state.active_conns; } virtual void dispatch_connection_error(UcxConnection *conn) { - LOG << "deleting connection with status " + LOG << "disconnecting connection with status " << ucs_status_string(conn->ucx_status()); --_curr_state.active_conns; - delete conn; + conn->disconnect(new UcxDisconnectCallback(*conn)); } virtual void dispatch_io_message(UcxConnection* conn, const void *buffer, @@ -713,6 +983,29 @@ class DemoServer : public P2pDemoCommon { } } + virtual void dispatch_am_message(UcxConnection* conn, const void *buffer, + size_t length, + const UcxAmDesc &data_desc) { + iomsg_t const *msg = reinterpret_cast(buffer); + + VERBOSE_LOG << "got io (AM) message " << io_op_names[msg->op] << " sn " + << msg->sn << " data size " << msg->data_size + << " conn " << conn; + + if (opts().validate) { + assert(length == opts().iomsg_size); + validate(msg, length); + } + + if (msg->op == IO_READ) { + handle_io_am_read_request(conn, msg); + } else if (msg->op == IO_WRITE) { + handle_io_am_write_request(conn, msg, data_desc); + } else { + LOG << "Invalid opcode: " << msg->op; + } + } + private: void save_prev_state() { _prev_state = _curr_state; @@ -736,38 +1029,97 @@ class DemoServer : public P2pDemoCommon { class DemoClient : public P2pDemoCommon { +private: + class DisconnectCallback : public UcxCallback { + public: + DisconnectCallback(DemoClient &client, UcxConnection &conn) : + _client(client), _conn(&conn) { + } + + virtual ~DisconnectCallback() { + delete _conn; + } + + virtual void operator()(ucs_status_t status) { + server_info_t &server_info = _client.get_server_info(_conn); + + _client._num_sent -= get_num_uncompleted(server_info); + + // Remove connection pointer + _client._server_index_lookup.erase(_conn); + + // Remove active servers entry + _client.active_servers_remove(server_info.active_index); + + reset_server_info(server_info); + delete this; + } + + private: + DemoClient &_client; + UcxConnection *_conn; + }; + public: typedef struct { UcxConnection* conn; long retry_count; /* Connect retry counter */ - long num_sent; /* Total number of sent operations */ + double prev_connect_time; /* timestamp of last connect attempt */ size_t active_index; /* Index in active vector */ + long num_sent[IO_OP_MAX]; /* Number of sent operations */ long num_completed[IO_OP_MAX]; /* Number of completed operations */ long prev_completed[IO_OP_MAX]; /* Completed in last report */ } server_info_t; + class ConnectCallback : public UcxCallback { + public: + ConnectCallback(DemoClient &client, size_t server_idx) : + _client(client), _server_idx(server_idx) + { + } + + virtual void operator()(ucs_status_t status) + { + if (status == UCS_OK) { + _client.connect_succeed(_server_idx); + } else { + _client.connect_failed(_server_idx); + } + + _client._connecting_servers.erase(_server_idx); + delete this; + } + + private: + DemoClient &_client; + const size_t _server_idx; + }; + class IoReadResponseCallback : public UcxCallback { public: IoReadResponseCallback(size_t buffer_size, MemoryPool& pool) : - _comp_counter(0), _io_counter(NULL), _server_io_counter(NULL), + _comp_counter(0), _client(NULL), + _server_index(std::numeric_limits::max()), _sn(0), _validate(false), _iov(NULL), _buffer(malloc(buffer_size)), - _buffer_size(buffer_size), _pool(pool) { + _buffer_size(buffer_size), _meta_comp_counter(0), _pool(pool) { if (_buffer == 
NULL) {
                 throw std::bad_alloc();
             }
         }

-        void init(long *io_counter, long *conn_io_counter,
-                  uint32_t sn, bool validate, BufferIov *iov) {
+        void init(DemoClient *client, size_t server_index,
+                  uint32_t sn, bool validate, BufferIov *iov,
+                  int meta_comp_counter = 1) {
             /* wait for all data chunks and the read response completion */
-            _comp_counter      = iov->size() + 1;
-            _io_counter        = io_counter;
-            _server_io_counter = conn_io_counter;
+            _comp_counter      = iov->size() + meta_comp_counter;
+            _client            = client;
+            _server_index      = server_index;
             _sn                = sn;
             _validate          = validate;
             _iov               = iov;
+            _meta_comp_counter = meta_comp_counter;
         }

         ~IoReadResponseCallback() {
@@ -779,12 +1131,21 @@ class DemoClient : public P2pDemoCommon {
                 return;
             }

-            ++(*_io_counter);
-            ++(*_server_io_counter);
+            assert(_server_index != std::numeric_limits<size_t>::max());
+            _client->handle_operation_completion(_server_index, IO_READ);
+
             if (_validate && (status == UCS_OK)) {
-                iomsg_t *msg = reinterpret_cast<iomsg_t*>(_buffer);
-                validate(msg, _sn, _buffer_size);
                 validate(*_iov, _sn);
+
+                if (_meta_comp_counter != 0) {
+                    // With tag API, we also wait for READ_COMP arrival, so we
+                    // need to validate it. With AM API, READ_COMP arrives as
+                    // the AM header together with the data descriptor, and we
+                    // validate it in place to avoid an unneeded memory copy
+                    // into this IoReadResponseCallback's _buffer.
+                    iomsg_t *msg = reinterpret_cast<iomsg_t*>(_buffer);
+                    validate(msg, _sn, _buffer_size);
+                }
             }

             _iov->release();
@@ -797,21 +1158,26 @@ class DemoClient : public P2pDemoCommon {
     private:
         long         _comp_counter;
-        long*        _io_counter;
-        long*        _server_io_counter;
+        DemoClient*  _client;
+        size_t       _server_index;
         uint32_t     _sn;
         bool         _validate;
         BufferIov*   _iov;
         void*        _buffer;
         const size_t _buffer_size;
+        int          _meta_comp_counter;
         MemoryPool&  _pool;
     };

-    DemoClient(const options_t& test_opts) :
-        P2pDemoCommon(test_opts), _prev_connect_time(0),
-        _num_sent(0), _num_completed(0),
-        _status(OK), _start_time(get_time()),
-        _read_callback_pool(opts().iomsg_size, "read callbacks") {
+    DemoClient(const options_t &test_opts) :
+        P2pDemoCommon(test_opts),
+        _num_active_servers_to_use(0),
+        _num_sent(0),
+        _num_completed(0),
+        _status(OK),
+        _start_time(get_time()),
+        _read_callback_pool(opts().iomsg_size, "read callbacks")
+    {
     }

     typedef enum {
@@ -821,27 +1187,67 @@ class DemoClient : public P2pDemoCommon {
     } status_t;

     size_t get_server_index(const UcxConnection *conn) {
-        return _server_index_lookup[conn];
+        assert(_server_index_lookup.size() == _active_servers.size());
+
+        std::map<const UcxConnection*, size_t>::const_iterator i =
+            _server_index_lookup.find(conn);
+        return (i == _server_index_lookup.end()) ?
_server_info.size() : + i->second; } - size_t do_io_read(server_info_t& server_info, uint32_t sn) { - size_t data_size = get_data_size(); - bool validate = opts().validate; + server_info_t &get_server_info(const UcxConnection *conn) { + const size_t server_index = get_server_index(conn); + + assert(server_index < _server_info.size()); + return _server_info[server_index]; + } + + void commit_operation(size_t server_index, io_op_t op) { + server_info_t& server_info = _server_info[server_index]; + + assert(get_num_uncompleted(server_info) < opts().conn_window_size); + + ++server_info.num_sent[op]; + ++_num_sent; + if (get_num_uncompleted(server_info) == opts().conn_window_size) { + active_servers_make_unused(server_info.active_index); + } + } + + void handle_operation_completion(size_t server_index, io_op_t op) { + assert(server_index < _server_info.size()); + server_info_t& server_info = _server_info[server_index]; + + assert(get_num_uncompleted(server_info) <= opts().conn_window_size); + assert(_server_index_lookup.find(server_info.conn) != + _server_index_lookup.end()); + assert(_num_completed < _num_sent); + + if (get_num_uncompleted(server_info) == opts().conn_window_size) { + active_servers_make_used(server_info.active_index); + } + + ++_num_completed; + ++server_info.num_completed[op]; + } + + size_t do_io_read(size_t server_index, uint32_t sn) { + server_info_t& server_info = _server_info[server_index]; + size_t data_size = get_data_size(); + bool validate = opts().validate; if (!send_io_message(server_info.conn, IO_READ, sn, data_size, validate)) { return 0; } - ++server_info.num_sent; - ++_num_sent; + commit_operation(server_index, IO_READ); BufferIov *iov = _data_buffers_pool.get(); IoReadResponseCallback *r = _read_callback_pool.get(); iov->init(data_size, _data_chunks_pool, sn, validate); - r->init(&_num_completed, &server_info.num_completed[IO_READ], sn, - validate, iov); + r->init(this, server_index, sn, validate, iov); recv_data(server_info.conn, *iov, sn, r); server_info.conn->recv_data(r->buffer(), opts().iomsg_size, sn, r); @@ -849,17 +1255,31 @@ class DemoClient : public P2pDemoCommon { return data_size; } - size_t do_io_write(server_info_t& server_info, uint32_t sn) { - size_t data_size = get_data_size(); - bool validate = opts().validate; + size_t do_io_read_am(size_t server_index, uint32_t sn) { + server_info_t& server_info = _server_info[server_index]; + size_t data_size = get_data_size(); + + commit_operation(server_index, IO_READ); + + IoMessage *m = _io_msg_pool.get(); + m->init(IO_READ, sn, data_size, opts().validate); + + server_info.conn->send_am(m->buffer(), opts().iomsg_size, NULL, 0, m); + + return data_size; + } + + size_t do_io_write(size_t server_index, uint32_t sn) { + server_info_t& server_info = _server_info[server_index]; + size_t data_size = get_data_size(); + bool validate = opts().validate; if (!send_io_message(server_info.conn, IO_WRITE, sn, data_size, validate)) { return 0; } - ++server_info.num_sent; - ++_num_sent; + commit_operation(server_index, IO_WRITE); BufferIov *iov = _data_buffers_pool.get(); SendCompleteCallback *cb = _send_callback_pool.get(); @@ -870,10 +1290,40 @@ class DemoClient : public P2pDemoCommon { VERBOSE_LOG << "sending data " << iov << " size " << data_size << " sn " << sn; send_data(server_info.conn, *iov, sn, cb); + return data_size; } - void close_uncompleted_servers(const char *reason) { + size_t do_io_write_am(size_t server_index, uint32_t sn) { + server_info_t& server_info = _server_info[server_index]; + size_t 
data_size = get_data_size(); + bool validate = opts().validate; + + commit_operation(server_index, IO_WRITE); + + IoMessage *m = _io_msg_pool.get(); + m->init(IO_WRITE, sn, data_size, validate); + + BufferIov *iov = _data_buffers_pool.get(); + iov->init(data_size, _data_chunks_pool, sn, validate); + + SendCompleteCallback *cb = _send_callback_pool.get(); + cb->init(iov, NULL, m); + + VERBOSE_LOG << "sending IO_WRITE (AM) data " << iov << " size " + << data_size << " sn " << sn; + + assert(iov->size() == 1); + + // Send IO_WRITE as AM header and first iov element as payload + // (note that multi-iov send is not supported for IODEMO with AM yet) + server_info.conn->send_am(m->buffer(), opts().iomsg_size, + (*iov)[0].buffer(), (*iov)[0].size(), cb); + + return data_size; + } + + void disconnect_uncompleted_servers(const char *reason) { std::vector server_idxs; server_idxs.reserve(_active_servers.size()); @@ -884,8 +1334,7 @@ class DemoClient : public P2pDemoCommon { } while (!server_idxs.empty()) { - size_t i = server_idxs.back(); - terminate_connection(_server_info[i].conn, reason); + disconnect_server(server_idxs.back(), reason); server_idxs.pop_back(); } } @@ -900,57 +1349,112 @@ class DemoClient : public P2pDemoCommon { if (msg->op >= IO_COMP_MIN) { assert(msg->op == IO_WRITE_COMP); - ++_num_completed; - ++_server_info[get_server_index(conn)].num_completed[IO_WRITE]; + + size_t server_index = get_server_index(conn); + if (server_index < _server_info.size()) { + handle_operation_completion(server_index, IO_WRITE); + } else { + /* do not increment _num_completed here since we decremented + * _num_sent on connection termination */ + LOG << "got WRITE completion on failed connection"; + } + } + } + + virtual void dispatch_am_message(UcxConnection* conn, const void *buffer, + size_t length, + const UcxAmDesc &data_desc) { + iomsg_t const *msg = reinterpret_cast(buffer); + + VERBOSE_LOG << "got AM io message " << io_op_names[msg->op] << " sn " + << msg->sn << " data size " << msg->data_size + << " conn " << conn; + + assert(msg->op >= IO_COMP_MIN); + + if (opts().validate) { + assert(length == opts().iomsg_size); + validate(msg, opts().iomsg_size); + } + + // Client can receive IO_WRITE_COMP or IO_READ_COMP only + size_t server_index = get_server_index(conn); + if (msg->op == IO_WRITE_COMP) { + assert(msg->op == IO_WRITE_COMP); + handle_operation_completion(server_index, IO_WRITE); + } else if (msg->op == IO_READ_COMP) { + BufferIov *iov = _data_buffers_pool.get(); + iov->init(msg->data_size, _data_chunks_pool, msg->sn, opts().validate); + + IoReadResponseCallback *r = _read_callback_pool.get(); + r->init(this, server_index, msg->sn, opts().validate, iov, 0); + + assert(iov->size() == 1); + + conn->recv_am_data((*iov)[0].buffer(), msg->data_size, data_desc, r); } } + static long get_num_uncompleted(const server_info_t& server_info) { + long num_uncompleted; + + num_uncompleted = (server_info.num_sent[IO_READ] + + server_info.num_sent[IO_WRITE]) - + (server_info.num_completed[IO_READ] + + server_info.num_completed[IO_WRITE]); + + assert(num_uncompleted >= 0); + + return num_uncompleted; + } + long get_num_uncompleted(size_t server_index) const { - return _server_info[server_index].num_sent - - (_server_info[server_index].num_completed[IO_READ] + - _server_info[server_index].num_completed[IO_WRITE]); + assert(server_index < _server_info.size()); + return get_num_uncompleted(_server_info[server_index]); } static void reset_server_info(server_info_t& server_info) { server_info.conn = NULL; - 
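
Note: commit_operation(), handle_operation_completion() and get_num_uncompleted() above maintain a per-connection send window: sent - completed never exceeds the window limit, and a connection leaves or re-enters the usable set exactly when that difference crosses the limit. A condensed standalone sketch of the invariant, with illustrative names (not part of this patch):

#include <cassert>

struct ConnWindow {
    long sent;
    long completed;
    long limit;

    ConnWindow(long window) : sent(0), completed(0), limit(window) {
    }

    long uncompleted() const {
        return sent - completed;
    }

    // mirrors commit_operation(): returns true when the window just filled,
    // i.e. the caller should mark the connection "unused"
    bool on_send() {
        assert(uncompleted() < limit);
        ++sent;
        return uncompleted() == limit;
    }

    // mirrors handle_operation_completion(): returns true when a credit
    // frees up on a previously full window, making the connection usable
    bool on_complete() {
        bool was_full = (uncompleted() == limit);
        ++completed;
        return was_full;
    }
};
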
server_info.num_sent = 0; for (int op = 0; op < IO_OP_MAX; ++op) { + server_info.num_sent[op] = 0; server_info.num_completed[op] = 0; server_info.prev_completed[op] = 0; } } virtual void dispatch_connection_error(UcxConnection *conn) { - terminate_connection(conn, ucs_status_string(conn->ucx_status())); + size_t server_index = get_server_index(conn); + if (server_index < _server_info.size()) { + disconnect_server(server_index, + ucs_status_string(conn->ucx_status())); + } } - void terminate_connection(UcxConnection *conn, const char *reason) { - LOG << "terminate connection " << conn << " due to " << reason; - size_t server_index = get_server_index(conn); + void disconnect_server(size_t server_index, const char *reason) { server_info_t& server_info = _server_info[server_index]; - // Remove connection pointer - _server_index_lookup.erase(conn); - - // Destroying the connection will complete its outstanding operations - delete conn; - - // Don't wait for any more completions on this connection - _num_sent -= get_num_uncompleted(server_index); - - // Replace in _active_servers by the last element in the vector - size_t active_index = server_info.active_index; - std::swap(_active_servers[active_index], _active_servers.back()); - assert(_active_servers.back() == server_index); + if (server_info.conn->is_disconnecting()) { + LOG << "not disconnecting " << server_info.conn << " with " + << get_num_uncompleted(server_info) << " uncompleted operations" + " (read: " << server_info.num_completed[IO_READ] << "/" + << server_info.num_sent[IO_READ] << "; write: " + << server_info.num_completed[IO_WRITE] << "/" + << server_info.num_sent[IO_WRITE] << ") due to \"" << reason + << "\" because disconnection is already in progress"; + return; + } - // Swap the active_index field with the "replacement" server_info - server_info_t& replacement_server_info = - _server_info[_active_servers[active_index]]; - std::swap(replacement_server_info.active_index, server_info.active_index); - assert(server_info.active_index == _active_servers.size() - 1); + LOG << "disconnecting connection " << server_info.conn << " with " + << get_num_uncompleted(server_info) << " uncompleted operations" + " (read: " << server_info.num_completed[IO_READ] << "/" + << server_info.num_sent[IO_READ] << "; write: " + << server_info.num_completed[IO_WRITE] << "/" + << server_info.num_sent[IO_WRITE] << ") due to \"" << reason + << "\""; - _active_servers.pop_back(); - reset_server_info(server_info); + // Destroying the connection will complete its outstanding operations + server_info.conn->disconnect(new DisconnectCallback(*this, + *server_info.conn)); } void wait_for_responses(long max_outstanding) { @@ -980,14 +1484,16 @@ class DemoClient : public P2pDemoCommon { if (elapsed_time > _test_opts.client_timeout) { LOG << "timeout waiting for " << (_num_sent - _num_completed) << " replies"; - close_uncompleted_servers("timeout for replies"); + disconnect_uncompleted_servers("timeout for replies"); timer_finished = true; } check_time_limit(curr_time); } } - UcxConnection* connect(const char* server) { + void connect(size_t server_index) + { + const char *server = opts().servers[server_index]; struct sockaddr_in connect_addr; std::string server_addr; int port_num; @@ -1011,11 +1517,19 @@ class DemoClient : public P2pDemoCommon { int ret = inet_pton(AF_INET, server_addr.c_str(), &connect_addr.sin_addr); if (ret != 1) { LOG << "invalid address " << server_addr; - return NULL; + abort(); } - return UcxContext::connect((const struct 
sockaddr*)&connect_addr, - sizeof(connect_addr)); + if (!_connecting_servers.insert(server_index).second) { + LOG << server_name(server_index) << " is already connecting"; + abort(); + } + + UcxConnection *conn = new UcxConnection(*this, opts().use_am); + _server_info[server_index].conn = conn; + conn->connect((const struct sockaddr*)&connect_addr, + sizeof(connect_addr), + new ConnectCallback(*this, server_index)); } const std::string server_name(size_t server_index) { @@ -1024,20 +1538,36 @@ class DemoClient : public P2pDemoCommon { return ss.str(); } + void connect_succeed(size_t server_index) + { + server_info_t &server_info = _server_info[server_index]; + long attempts = server_info.retry_count + 1; + + server_info.retry_count = 0; + server_info.prev_connect_time = 0.; + _server_index_lookup[server_info.conn] = server_index; + active_servers_add(server_index); + LOG << "Connected to " << server_name(server_index) << " after " + << attempts << " attempts"; + } + void connect_failed(size_t server_index) { server_info_t& server_info = _server_info[server_index]; + // The connection should close itself calling error handler + server_info.conn = NULL; + ++server_info.retry_count; UcxLog log(LOG_PREFIX); log << "Connect to " << server_name(server_index) << " failed" << " (retry " << server_info.retry_count; - if (opts().client_retries < std::numeric_limits::max()) { - log << "/" << opts().client_retries; + if (opts().retries < std::numeric_limits::max()) { + log << "/" << opts().retries; } log << ")"; - if (server_info.retry_count >= opts().client_retries) { + if (server_info.retry_count >= opts().retries) { /* If at least one server exceeded its retries, bail */ _status = CONN_RETRIES_EXCEEDED; } @@ -1057,43 +1587,44 @@ class DemoClient : public P2pDemoCommon { } double curr_time = get_time(); - if (curr_time < (_prev_connect_time + opts().client_retry_interval)) { - // Not enough time elapsed since previous connection attempt - return; - } - for (size_t server_index = 0; server_index < _server_info.size(); ++server_index) { server_info_t& server_info = _server_info[server_index]; if (server_info.conn != NULL) { - // Server is already connected + // Already connecting to this server continue; } // If retry count exceeded for at least one server, we should have // exited already assert(_status == OK); - assert(server_info.retry_count < opts().client_retries); + assert(server_info.retry_count < opts().retries); - server_info.conn = connect(opts().servers[server_index]); - if (server_info.conn == NULL) { - connect_failed(server_index); - if (_status != OK) { - break; - } + if (curr_time < (server_info.prev_connect_time + + opts().retry_interval)) { + // Not enough time elapsed since previous connection attempt continue; } - server_info.retry_count = 0; - _server_index_lookup[server_info.conn] = server_index; + connect(server_index); + server_info.prev_connect_time = curr_time; + assert(server_info.conn != NULL); + assert(_status == OK); + } + } - server_info.active_index = _active_servers.size(); - _active_servers.push_back(server_index); + size_t pick_server_index() const { + assert(_num_active_servers_to_use != 0); - LOG << "Connected to " << server_name(server_index); - } + /* Pick a random connected server to which the client has credits + * to send (its conn's window is not full) */ + size_t active_index = IoDemoRandom::rand(size_t(0), + _num_active_servers_to_use - 1); + size_t server_index = _active_servers[active_index]; + assert(get_num_uncompleted(server_index) < 
opts().conn_window_size); + assert(_server_info[server_index].conn != NULL); - _prev_connect_time = curr_time; + return server_index; } static inline bool is_control_iter(long iter) { @@ -1118,40 +1649,58 @@ class DemoClient : public P2pDemoCommon { op_info_t op_info[IO_OP_MAX] = {{0,0}}; while ((total_iter < opts().iter_count) && (_status == OK)) { - VERBOSE_LOG << " <<<< iteration " << total_iter << " >>>>"; - - wait_for_responses(opts().window_size - 1); + connect_all(is_control_iter(total_iter)); if (_status != OK) { break; } - connect_all(is_control_iter(total_iter)); + if (_active_servers.empty()) { + if (_connecting_servers.empty()) { + LOG << "All remote servers are down, reconnecting in " + << opts().retry_interval << " seconds"; + sleep(opts().retry_interval); + check_time_limit(get_time()); + } else { + progress(); + } + continue; + } + + VERBOSE_LOG << " <<<< iteration " << total_iter << " >>>>"; + long conns_window_size = opts().conn_window_size * + _active_servers.size(); + long max_outstanding = std::min(opts().window_size, + conns_window_size) - 1; + wait_for_responses(max_outstanding); if (_status != OK) { break; } - if (_active_servers.empty()) { - LOG << "All remote servers are down, reconnecting in " - << opts().client_retry_interval << " seconds"; - sleep(opts().client_retry_interval); - check_time_limit(get_time()); + if (_num_active_servers_to_use == 0) { + // It is possible that the number of active servers to use is 0 + // after wait_for_responses(), if some clients were closed in + // UCP Worker progress during handling of remote disconnection + // from servers continue; } - /* Pick random connected server */ - size_t active_index = IoDemoRandom::rand(size_t(0), - _active_servers.size() - 1); - size_t server_index = _active_servers[active_index]; - assert(_server_info[server_index].conn != NULL); - - io_op_t op = get_op(); + size_t server_index = pick_server_index(); + io_op_t op = get_op(); size_t size; switch (op) { case IO_READ: - size = do_io_read(_server_info[server_index], sn); + if (opts().use_am) { + size = do_io_read_am(server_index, sn); + } else { + size = do_io_read(server_index, sn); + } break; case IO_WRITE: - size = do_io_write(_server_info[server_index], sn); + if (opts().use_am) { + size = do_io_write_am(server_index, sn); + } else { + size = do_io_write(server_index, sn); + } break; default: abort(); @@ -1191,12 +1740,20 @@ class DemoClient : public P2pDemoCommon { for (size_t server_index = 0; server_index < _server_info.size(); ++server_index) { - LOG << "Disconnecting from server " << server_name(server_index); - delete _server_info[server_index].conn; - _server_info[server_index].conn = NULL; + LOG << "Disconnecting from " << server_name(server_index); + UcxConnection& conn = *_server_info[server_index].conn; + conn.disconnect(new DisconnectCallback(*this, conn)); + } + + if (!_active_servers.empty()) { + LOG << "Waiting for " << _active_servers.size() + << " disconnects to complete"; + do { + progress(); + } while (!_active_servers.empty()); } - _server_index_lookup.clear(); - _active_servers.clear(); + + assert(_server_index_lookup.empty()); return _status; } @@ -1265,12 +1822,16 @@ class DemoClient : public P2pDemoCommon { for (size_t server_index = 0; server_index < _server_info.size(); ++server_index) { server_info_t& server_info = _server_info[server_index]; - long delta_completed = server_info.num_completed[op_id] - - server_info.prev_completed[op_id]; - if (delta_completed < delta_min) { - delta_min = delta_completed; + long 
delta_completed = server_info.num_completed[op_id] -
+                                   server_info.prev_completed[op_id];
+            if ((delta_completed < delta_min) ||
+                ((delta_completed == delta_min) &&
+                 (server_info.retry_count >
+                  _server_info[min_index].retry_count))) {
                 min_index = server_index;
             }
+
+            delta_min = std::min(delta_completed, delta_min);
             delta_max = std::max(delta_completed, delta_max);

             server_info.prev_completed[op_id] =
@@ -1300,12 +1861,59 @@ class DemoClient : public P2pDemoCommon {
         }
     }

+    void active_servers_swap(size_t index1, size_t index2) {
+        size_t& active_server1 = _active_servers[index1];
+        size_t& active_server2 = _active_servers[index2];
+
+        std::swap(_server_info[active_server1].active_index,
+                  _server_info[active_server2].active_index);
+        std::swap(active_server1, active_server2);
+    }
+
+    void active_servers_add(size_t server_index) {
+        assert(_num_active_servers_to_use <= _active_servers.size());
+
+        _active_servers.push_back(server_index);
+        _server_info[server_index].active_index = _active_servers.size() - 1;
+        active_servers_make_used(_server_info[server_index].active_index);
+        assert(_num_active_servers_to_use <= _active_servers.size());
+    }
+
+    void active_servers_remove(size_t active_index) {
+        assert(active_index < _active_servers.size());
+
+        if (active_index < _num_active_servers_to_use) {
+            active_servers_make_unused(active_index);
+            active_index = _num_active_servers_to_use;
+        }
+
+        assert(active_index >= _num_active_servers_to_use);
+        active_servers_swap(active_index, _active_servers.size() - 1);
+        _active_servers.pop_back();
+    }
+
+    void active_servers_make_unused(size_t active_index) {
+        assert(active_index < _num_active_servers_to_use);
+        --_num_active_servers_to_use;
+        active_servers_swap(active_index, _num_active_servers_to_use);
+    }
+
+    void active_servers_make_used(size_t active_index) {
+        assert(active_index >= _num_active_servers_to_use);
+        active_servers_swap(active_index, _num_active_servers_to_use);
+        ++_num_active_servers_to_use;
+    }

 private:
     std::vector<server_info_t>             _server_info;
+    // Connection establishment is in progress
+    std::set<size_t>                       _connecting_servers;
+    // Active servers is the list of communicating servers
     std::vector<size_t>                    _active_servers;
+    // Number of active servers to use; a server becomes "unused" when its
+    // window of outstanding requests is full
+    size_t                                 _num_active_servers_to_use;
     std::map<const UcxConnection*, size_t> _server_index_lookup;
-    double                                 _prev_connect_time;
     long                                   _num_sent;
     long                                   _num_completed;
     status_t                               _status;
@@ -1388,8 +1996,41 @@ static void adjust_opts(options_t *test_opts) {
         test_opts->operations.push_back(IO_WRITE);
     }

+    if (test_opts->use_am &&
+        (test_opts->chunk_size < test_opts->max_data_size)) {
+        std::cout << "ignoring chunk size parameter, because it is not supported"
+                     " with AM API" << std::endl;
+        test_opts->chunk_size = test_opts->max_data_size;
+        return;
+    }
+
     test_opts->chunk_size = std::min(test_opts->chunk_size,
                                      test_opts->max_data_size);
+
+    // randomize servers to optimize startup
+    std::random_shuffle(test_opts->servers.begin(), test_opts->servers.end(),
+                        IoDemoRandom::urand);
+
+    UcxLog vlog(LOG_PREFIX, test_opts->verbose);
+    vlog << "List of servers:";
+    for (size_t i = 0; i < test_opts->servers.size(); ++i) {
+        vlog << " " << test_opts->servers[i];
+    }
+}
+
+static int parse_window_size(const char *optarg, long &window_size,
+                             const std::string &window_size_str) {
+    window_size = strtol(optarg, NULL, 0);
+    if ((window_size <= 0) ||
+        // strtol() returns LONG_MAX when the converted value falls outside
+        // the range of the return type
+        (window_size == std::numeric_limits<long>::max())) {
+        std::cout << "invalid " << window_size_str << " size '" << optarg
+                  << "'" << std::endl;
+        return -1;
+    }
+
+    return 0;
 }

 static int parse_args(int argc, char **argv, options_t *test_opts)
@@ -1401,8 +2042,8 @@ static int parse_args(int argc, char **argv, options_t *test_opts)
     test_opts->port_num             = 1337;
     test_opts->connect_timeout      = 20.0;
     test_opts->client_timeout       = 50.0;
-    test_opts->client_retries       = std::numeric_limits<long>::max();
-    test_opts->client_retry_interval = 5.0;
+    test_opts->retries              = std::numeric_limits<long>::max();
+    test_opts->retry_interval       = 5.0;
     test_opts->client_runtime_limit = std::numeric_limits<double>::max();
     test_opts->print_interval       = 1.0;
     test_opts->min_data_size        = 4096;
@@ -1411,25 +2052,29 @@ static int parse_args(int argc, char **argv, options_t *test_opts)
     test_opts->num_offcache_buffers = 0;
     test_opts->iomsg_size           = 256;
     test_opts->iter_count           = 1000;
-    test_opts->window_size          = 1;
-    test_opts->random_seed          = std::time(NULL);
+    test_opts->window_size          = 16;
+    test_opts->conn_window_size     = 16;
+    test_opts->random_seed          = std::time(NULL) ^ getpid();
     test_opts->verbose              = false;
     test_opts->validate             = false;
+    test_opts->use_am               = false;
+    test_opts->memory_type          = UCS_MEMORY_TYPE_HOST;

-    while ((c = getopt(argc, argv, "p:c:r:d:b:i:w:k:o:t:n:l:s:y:vqHP:")) != -1) {
+    while ((c = getopt(argc, argv, "p:c:r:d:b:i:w:a:k:o:t:n:l:s:y:vqAHP:m:")) !=
+           -1) {
         switch (c) {
         case 'p':
             test_opts->port_num = atoi(optarg);
             break;
         case 'c':
             if (strcmp(optarg, "inf")) {
-                test_opts->client_retries = strtol(optarg, NULL, 0);
+                test_opts->retries = strtol(optarg, NULL, 0);
             }
             break;
         case 'y':
-            if (set_time(optarg, &test_opts->client_retry_interval) != 0) {
+            if (set_time(optarg, &test_opts->retry_interval) != 0) {
                 std::cout << "invalid '" << optarg
-                          << "' value for client retry interval" << std::endl;
+                          << "' value for retry interval" << std::endl;
                 return -1;
             }
             break;
@@ -1457,7 +2102,16 @@ static int parse_args(int argc, char **argv, options_t *test_opts)
             }
             break;
         case 'w':
-            test_opts->window_size = atoi(optarg);
+            if (parse_window_size(optarg, test_opts->window_size,
+                                  "window") != 0) {
+                return -1;
+            }
+            break;
+        case 'a':
+            if (parse_window_size(optarg, test_opts->conn_window_size,
+                                  "per connection window") != 0) {
+                return -1;
+            }
             break;
         case 'k':
             test_opts->chunk_size = strtol(optarg, NULL, 0);
@@ -1519,42 +2173,68 @@ static int parse_args(int argc, char **argv, options_t *test_opts)
         case 'q':
             test_opts->validate = true;
             break;
+        case 'A':
+            test_opts->use_am = true;
+            break;
         case 'H':
             UcxLog::use_human_time = true;
             break;
         case 'P':
             test_opts->print_interval = atof(optarg);
             break;
+        case 'm':
+            if (!strcmp(optarg, "host")) {
+                test_opts->memory_type = UCS_MEMORY_TYPE_HOST;
+#ifdef HAVE_CUDA
+            } else if (!strcmp(optarg, "cuda")) {
+                test_opts->memory_type = UCS_MEMORY_TYPE_CUDA;
+            } else if (!strcmp(optarg, "cuda-managed")) {
+                test_opts->memory_type = UCS_MEMORY_TYPE_CUDA_MANAGED;
+#endif
+            } else {
+                std::cout << "Invalid '" << optarg << "' value for memory type"
                          << std::endl;
+                return -1;
+            }
+            break;
         case 'h':
         default:
             std::cout << "Usage: io_demo [options] [server_address]" << std::endl;
             std::cout << "   or  io_demo [options] [server_address0:port0] [server_address1:port1]..."
<< std::endl; std::cout << "" << std::endl; std::cout << "Supported options are:" << std::endl; - std::cout << " -p TCP port number to use" << std::endl; - std::cout << " -n Timeout for connecting to the peer (or \"inf\")" << std::endl; - std::cout << " -o Comma-separated string of IO operations [read|write]" << std::endl; - std::cout << " NOTE: if using several IO operations, performance" << std::endl; - std::cout << " measurments may be inaccurate" << std::endl; - std::cout << " -d : Range that should be used to get data" << std::endl; - std::cout << " size of IO payload" << std::endl; - std::cout << " -b Number of offcache IO buffers" << std::endl; - std::cout << " -i Number of iterations to run communication" << std::endl; - std::cout << " -w Number of outstanding requests" << std::endl; - std::cout << " -k Split the data transfer to chunks of this size" << std::endl; - std::cout << " -r Size of IO request packet" << std::endl; - std::cout << " -t Client timeout (or \"inf\")" << std::endl; - std::cout << " -c Number of connection retries on client" << std::endl; - std::cout << " (or \"inf\") for failure" << std::endl; - std::cout << " -y Client retry interval" << std::endl; - std::cout << " -l Time limit to run the IO client (or \"inf\")" << std::endl; - std::cout << " Examples: -l 17.5s; -l 10m; 15.5h" << std::endl; - std::cout << " -s Random seed to use for randomizing" << std::endl; - std::cout << " -v Set verbose mode" << std::endl; - std::cout << " -q Enable data integrity and transaction check" << std::endl; - std::cout << " -H Use human-readable timestamps" << std::endl; - std::cout << " -P Set report printing interval" << std::endl; + std::cout << " -p TCP port number to use" << std::endl; + std::cout << " -n Timeout for connecting to the peer (or \"inf\")" << std::endl; + std::cout << " -o Comma-separated string of IO operations [read|write]" << std::endl; + std::cout << " NOTE: if using several IO operations, performance" << std::endl; + std::cout << " measurements may be inaccurate" << std::endl; + std::cout << " -d : Range that should be used to get data" << std::endl; + std::cout << " size of IO payload" << std::endl; + std::cout << " -b Number of offcache IO buffers" << std::endl; + std::cout << " -i Number of iterations to run communication" << std::endl; + std::cout << " -w Number of outstanding requests" << std::endl; + std::cout << " -a Number of outstanding requests per connection" << std::endl; + std::cout << " -k Split the data transfer to chunks of this size" << std::endl; + std::cout << " -r Size of IO request packet" << std::endl; + std::cout << " -t Client timeout (or \"inf\")" << std::endl; + std::cout << " -c Number of connection retries on client or " << std::endl; + std::cout << " listen retries on server" << std::endl; + std::cout << " (or \"inf\") for failure" << std::endl; + std::cout << " -y Retry interval" << std::endl; + std::cout << " -l Time limit to run the IO client (or \"inf\")" << std::endl; + std::cout << " Examples: -l 17.5s; -l 10m; 15.5h" << std::endl; + std::cout << " -s Random seed to use for randomizing" << std::endl; + std::cout << " -v Set verbose mode" << std::endl; + std::cout << " -q Enable data integrity and transaction check" << std::endl; + std::cout << " -A Use UCP Active Messages API (use TAG API otherwise)" << std::endl; + std::cout << " -H Use human-readable timestamps" << std::endl; + std::cout << " -P Set report printing interval" << std::endl; std::cout << "" << std::endl; + std::cout << " -m Memory type to use. 
Possible values: host" +#ifdef HAVE_CUDA + << ", cuda, cuda-managed" +#endif + << std::endl; return -1; } } diff --git a/test/apps/iodemo/run_io_demo.sh b/test/apps/iodemo/run_io_demo.sh index 033a32aacb1..b09c5bbb29d 100755 --- a/test/apps/iodemo/run_io_demo.sh +++ b/test/apps/iodemo/run_io_demo.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # # Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. # @@ -289,17 +289,12 @@ build_server_args_list() { do key="$1" case $key in - -d) + -d|-P|-k|-r|-b|-n) value="$2" iodemo_server_args+=" $key $value" shift ;; - -P) - value="$2" - iodemo_server_args+=" $key $value" - shift - ;; - -q) + -q|-A|-v|-H) iodemo_server_args+=" $key" ;; *) @@ -347,17 +342,17 @@ create_mapping_bynode() # which means no node will have one process less. # The expression "(x + N - 1) % N" yields a number in the range 0..N-1 and # then adding 1 yields the equivalent of "x % N" in the range 1..N. - # + # remainder_client_index=$(((num_clients + num_hosts - 1) % num_hosts + 1)) remainder_server_index=$(((num_servers + num_hosts - 1) % num_hosts + 1)) show_var remainder_client_index show_var remainder_client_index - + host_index=0 for host in $(split_list ${host_list}) do # Add same amount of clients/servers on each host, except few last hosts - # which may have less (if mapping is not balanced) + # which may have less (if mapping is not balanced) num_clients_per_host[${host}]=$((max_clients_per_node - \ (host_index >= remainder_client_index))) num_servers_per_host[${host}]=$((max_servers_per_node - \ @@ -378,7 +373,7 @@ create_mapping_byslot() ${remaining_clients}) num_servers_per_host[${host}]=${node_num_servers} num_clients_per_host[${host}]=${node_num_clients} - + remaining_clients=$((remaining_clients - node_num_clients)) remaining_servers=$((remaining_servers - node_num_servers)) done @@ -471,7 +466,7 @@ make_scripts() # Add file header and startup cat >${command_file} <<-EOF - #!/bin/sh + #!/bin/bash # # Launch script for io_demo on ${host} with ${num_servers_per_host[${host}]} servers and ${num_clients_per_host[${host}]} clients # diff --git a/test/apps/iodemo/ucx_wrapper.cc b/test/apps/iodemo/ucx_wrapper.cc index 68bb73b03d5..579c0fc539c 100644 --- a/test/apps/iodemo/ucx_wrapper.cc +++ b/test/apps/iodemo/ucx_wrapper.cc @@ -12,9 +12,13 @@ #include #include +#include #include +#define AM_MSG_ID 0 + + struct ucx_request { UcxCallback *callback; UcxConnection *conn; @@ -25,6 +29,16 @@ struct ucx_request { ucs_list_link_t pos; }; +// Holds details of arrived AM message +struct UcxAmDesc { + UcxAmDesc(void *data, const ucp_am_recv_param_t *param) : + _data(data), _param(param) { + } + + void *_data; + const ucp_am_recv_param_t *_param; +}; + UcxCallback::~UcxCallback() { } @@ -41,9 +55,10 @@ EmptyCallback* EmptyCallback::get() { bool UcxLog::use_human_time = false; -UcxLog::UcxLog(const char* prefix, bool enable) : _enable(enable) +UcxLog::UcxLog(const char* prefix, bool enable) { if (!enable) { + _ss = NULL; return; } @@ -57,21 +72,56 @@ UcxLog::UcxLog(const char* prefix, bool enable) : _enable(enable) } else { snprintf(str, sizeof(str), "[%lu.%06lu] ", tv.tv_sec, tv.tv_usec); } - std::cout << str << prefix << " "; + + _ss = new std::stringstream(); + (*_ss) << str << prefix << " "; } UcxLog::~UcxLog() { - if (_enable) { - std::cout << std::endl; + if (_ss != NULL) { + (*_ss) << std::endl; + std::cout << (*_ss).str(); + delete _ss; } } #define UCX_LOG UcxLog("[UCX]", true) -UcxContext::UcxContext(size_t iomsg_size, double connect_timeout) : 
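
Note: the UcxLog change above accumulates the whole record in a heap-allocated stringstream and flushes it to std::cout once, from the destructor, so each record reaches the stream as a single bulk write instead of many small ones. A minimal standalone sketch of the idiom (the LineLog name is an illustrative stand-in, not part of this patch):

#include <iostream>
#include <sstream>

class LineLog {
public:
    LineLog() : _ss(new std::stringstream()) {
    }

    ~LineLog() {
        (*_ss) << std::endl;
        std::cout << _ss->str(); // one write per record
        delete _ss;
    }

    template <typename T>
    LineLog& operator<<(const T& value) {
        (*_ss) << value;
        return *this;
    }

private:
    std::stringstream* _ss;
};

// usage: LineLog() << "processed " << 42 << " requests";
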
+UcxContext::UcxAcceptCallback::UcxAcceptCallback(UcxContext &context, + UcxConnection &connection) : + _context(context), _connection(connection) +{ +} + +void UcxContext::UcxAcceptCallback::operator()(ucs_status_t status) +{ + if (status == UCS_OK) { + _context.dispatch_connection_accepted(&_connection); + } + + delete this; +} + +UcxContext::UcxDisconnectCallback::UcxDisconnectCallback(UcxConnection &conn) + : _conn(&conn) +{ +} + +UcxContext::UcxDisconnectCallback::~UcxDisconnectCallback() +{ + delete _conn; +} + +void UcxContext::UcxDisconnectCallback::operator()(ucs_status_t status) +{ + delete this; +} + +UcxContext::UcxContext(size_t iomsg_size, double connect_timeout, bool use_am) : _context(NULL), _worker(NULL), _listener(NULL), _iomsg_recv_request(NULL), - _iomsg_buffer(iomsg_size, '\0'), _connect_timeout(connect_timeout) + _iomsg_buffer(iomsg_size, '\0'), _connect_timeout(connect_timeout), + _use_am(use_am) { } @@ -97,8 +147,8 @@ bool UcxContext::init() ucp_params.field_mask = UCP_PARAM_FIELD_FEATURES | UCP_PARAM_FIELD_REQUEST_INIT | UCP_PARAM_FIELD_REQUEST_SIZE; - ucp_params.features = UCP_FEATURE_TAG | - UCP_FEATURE_STREAM; + ucp_params.features = _use_am ? UCP_FEATURE_AM : + UCP_FEATURE_TAG | UCP_FEATURE_STREAM; ucp_params.request_init = request_init; ucp_params.request_size = sizeof(ucx_request); ucs_status_t status = ucp_init(&ucp_params, NULL, &_context); @@ -107,7 +157,8 @@ bool UcxContext::init() return false; } - UCX_LOG << "created context " << _context; + UCX_LOG << "created context " << _context << " with " + << (_use_am ? "AM" : "TAG"); /* Create worker */ ucp_worker_params_t worker_params; @@ -123,7 +174,12 @@ bool UcxContext::init() UCX_LOG << "created worker " << _worker; - recv_io_message(); + if (_use_am) { + set_am_handler(am_recv_callback, this); + } else { + recv_io_message(); + } + return true; } @@ -150,24 +206,14 @@ bool UcxContext::listen(const struct sockaddr* saddr, size_t addrlen) return true; } -UcxConnection* UcxContext::connect(const struct sockaddr* saddr, size_t addrlen) -{ - UcxConnection *conn = new UcxConnection(*this, get_next_conn_id()); - if (!conn->connect(saddr, addrlen)) { - delete conn; - return NULL; - } - - add_connection(conn); - return conn; -} - void UcxContext::progress() { ucp_worker_progress(_worker); progress_io_message(); + progress_timed_out_conns(); progress_conn_requests(); progress_failed_connections(); + progress_disconnected_connections(); } uint32_t UcxContext::get_next_conn_id() @@ -187,6 +233,7 @@ void UcxContext::request_reset(ucx_request *r) r->completed = false; r->callback = NULL; r->conn = NULL; + r->status = UCS_OK; r->recv_length = 0; r->pos.next = NULL; r->pos.prev = NULL; @@ -201,8 +248,26 @@ void UcxContext::request_release(void *request) void UcxContext::connect_callback(ucp_conn_request_h conn_req, void *arg) { UcxContext *self = reinterpret_cast(arg); - UCX_LOG << "got new connection request " << conn_req; - self->_conn_requests.push_back(conn_req); + ucp_conn_request_attr_t conn_req_attr; + conn_req_t conn_request; + + conn_req_attr.field_mask = UCP_CONN_REQUEST_ATTR_FIELD_CLIENT_ADDR; + ucs_status_t status = ucp_conn_request_query(conn_req, &conn_req_attr); + if (status == UCS_OK) { + UCX_LOG << "got new connection request " << conn_req << " from client " + << UcxContext::sockaddr_str((const struct sockaddr*) + &conn_req_attr.client_address, + sizeof(conn_req_attr.client_address)); + } else { + UCX_LOG << "got new connection request " << conn_req + << ", ucp_conn_request_query() failed (" + << 
ucs_status_string(status) << ")"; + } + + conn_request.conn_request = conn_req; + gettimeofday(&conn_request.arrival_time, NULL); + + self->_conn_requests.push_back(conn_request); } void UcxContext::iomsg_recv_callback(void *request, ucs_status_t status, @@ -255,27 +320,54 @@ double UcxContext::connect_timeout() const return _connect_timeout; } +int UcxContext::is_timeout_elapsed(struct timeval const *tv_prior, double timeout) +{ + struct timeval tv_current, elapsed; + + gettimeofday(&tv_current, NULL); + timersub(&tv_current, tv_prior, &elapsed); + return ((elapsed.tv_sec + (elapsed.tv_usec * 1e-6)) > timeout); +} + +void UcxContext::progress_timed_out_conns() +{ + while (!_conns_in_progress.empty() && + (get_time() > _conns_in_progress.begin()->first)) { + UcxConnection *conn = _conns_in_progress.begin()->second; + _conns_in_progress.erase(_conns_in_progress.begin()); + conn->handle_connection_error(UCS_ERR_TIMED_OUT); + } +} + void UcxContext::progress_conn_requests() { while (!_conn_requests.empty()) { - UcxConnection *conn = new UcxConnection(*this, get_next_conn_id()); - if (conn->accept(_conn_requests.front())) { - add_connection(conn); - dispatch_connection_accepted(conn); + conn_req_t conn_request = _conn_requests.front(); + + if (is_timeout_elapsed(&conn_request.arrival_time, _connect_timeout)) { + UCX_LOG << "reject connection request " << conn_request.conn_request + << " since server's timeout (" << _connect_timeout + << " seconds) elapsed"; + ucp_listener_reject(_listener, conn_request.conn_request); } else { - delete conn; + UcxConnection *conn = new UcxConnection(*this, _use_am); + // Start accepting the connection request, and call + // UcxAcceptCallback when connection is established + conn->accept(conn_request.conn_request, + new UcxAcceptCallback(*this, *conn)); } + _conn_requests.pop_front(); } } void UcxContext::progress_io_message() { - if (!_iomsg_recv_request->completed) { + if (_use_am || !_iomsg_recv_request->completed) { return; } - uint32_t conn_id = _iomsg_recv_request->conn_id; + uint64_t conn_id = _iomsg_recv_request->conn_id; conn_map_t::iterator iter = _conns.find(conn_id); if (iter == _conns.end()) { UCX_LOG << "could not find connection with id " << conn_id; @@ -297,6 +389,19 @@ void UcxContext::progress_failed_connections() } } +void UcxContext::progress_disconnected_connections() +{ + std::list::iterator it = _disconnecting_conns.begin(); + while (it != _disconnecting_conns.end()) { + UcxConnection *conn = *it; + if (conn->disconnect_progress()) { + it = _disconnecting_conns.erase(it); + } else { + ++it; + } + } +} + UcxContext::wait_status_t UcxContext::wait_completion(ucs_status_ptr_t status_ptr, const char *title, double timeout) @@ -309,10 +414,7 @@ UcxContext::wait_completion(ucs_status_ptr_t status_ptr, const char *title, struct timeval tv_start; gettimeofday(&tv_start, NULL); do { - struct timeval tv_current, elapsed; - gettimeofday(&tv_current, NULL); - timersub(&tv_current, &tv_start, &elapsed); - if (elapsed.tv_sec + (elapsed.tv_usec * 1e-6) > timeout) { + if (is_timeout_elapsed(&tv_start, timeout)) { UCX_LOG << title << " request " << status_ptr << " timed out"; return WAIT_STATUS_TIMED_OUT; } @@ -352,6 +454,7 @@ void UcxContext::add_connection(UcxConnection *conn) { assert(_conns.find(conn->id()) == _conns.end()); _conns[conn->id()] = conn; + UCX_LOG << "added " << conn->get_log_prefix() << " to connection map"; } void UcxContext::remove_connection(UcxConnection *conn) @@ -359,9 +462,32 @@ void UcxContext::remove_connection(UcxConnection 
*conn) conn_map_t::iterator i = _conns.find(conn->id()); if (i != _conns.end()) { _conns.erase(i); + UCX_LOG << "removed " << conn->get_log_prefix() + << " from connection map"; + } +} + +void UcxContext::remove_connection_inprogress(UcxConnection *conn) +{ + // we expect to remove connections from the list close to the same order + // as created, so this linear search should be pretty fast + timeout_conn_t::iterator i; + for (i = _conns_in_progress.begin(); i != _conns_in_progress.end(); ++i) { + if (i->second == conn) { + _conns_in_progress.erase(i); + return; + } } } +void UcxContext::move_connection_to_disconnecting(UcxConnection *conn) +{ + remove_connection(conn); + assert(std::find(_disconnecting_conns.begin(), _disconnecting_conns.end(), + conn) == _disconnecting_conns.end()); + _disconnecting_conns.push_back(conn); +} + void UcxContext::dispatch_connection_accepted(UcxConnection* conn) { } @@ -369,22 +495,42 @@ void UcxContext::dispatch_connection_accepted(UcxConnection* conn) void UcxContext::handle_connection_error(UcxConnection *conn) { remove_connection(conn); + remove_connection_inprogress(conn); _failed_conns.push_back(conn); } void UcxContext::destroy_connections() { while (!_conn_requests.empty()) { - UCX_LOG << "reject connection request " << _conn_requests.front(); - ucp_listener_reject(_listener, _conn_requests.front()); + ucp_conn_request_h conn_req = _conn_requests.front().conn_request; + UCX_LOG << "reject connection request " << conn_req; + ucp_listener_reject(_listener, conn_req); _conn_requests.pop_front(); } - for (conn_map_t::iterator iter = _conns.begin(); iter != _conns.end(); ++iter) { - delete iter->second; + while (!_conns_in_progress.empty()) { + UcxConnection &conn = *_conns_in_progress.begin()->second; + _conns_in_progress.erase(_conns_in_progress.begin()); + conn.disconnect(new UcxDisconnectCallback(conn)); + } + + UCX_LOG << "destroy_connections"; + while (!_conns.empty()) { + UcxConnection &conn = *_conns.begin()->second; + _conns.erase(_conns.begin()); + conn.disconnect(new UcxDisconnectCallback(conn)); } - _conns.clear(); + while (!_disconnecting_conns.empty()) { + ucp_worker_progress(_worker); + progress_disconnected_connections(); + } +} + +double UcxContext::get_time() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec + (tv.tv_usec * 1e-6); } void UcxContext::destroy_listener() @@ -408,18 +554,65 @@ void UcxContext::destroy_worker() ucp_worker_destroy(_worker); } +ucs_status_t UcxContext::am_recv_callback(void *arg, const void *header, + size_t header_length, + void *data, size_t length, + const ucp_am_recv_param_t *param) +{ + UcxContext *self = reinterpret_cast(arg); + + assert(param->recv_attr & UCP_AM_RECV_ATTR_FIELD_REPLY_EP); + assert(self->_use_am); + + uint64_t conn_id = reinterpret_cast(param->reply_ep); + conn_map_t::iterator iter = self->_conns.find(conn_id); + if (iter == self->_conns.end()) { + // TODO: change this to assert when data dropping is implemented in AM + UCX_LOG << "could not find connection with ep " << param->reply_ep + << "(" << conn_id << ")"; + return UCS_OK; + } + + UcxConnection *conn = iter->second; + + UcxAmDesc data_desc(data, param); + + self->dispatch_am_message(conn, header, header_length, data_desc); + + return UCS_OK; +} + +void UcxContext::set_am_handler(ucp_am_recv_callback_t cb, void *arg) +{ + ucp_am_handler_param_t param; + + param.field_mask = UCP_AM_HANDLER_PARAM_FIELD_ID | + UCP_AM_HANDLER_PARAM_FIELD_CB | + UCP_AM_HANDLER_PARAM_FIELD_ARG; + param.id = AM_MSG_ID; + param.cb = cb; 
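
Note: set_am_handler() (whose parameter setup continues just below) registers a single AM id, and am_recv_callback() above demultiplexes incoming messages by the reply endpoint: the ep pointer value doubles as the connection key, which connect_am() later stores in _conn_id. A standalone sketch of that keying scheme, with illustrative names (not part of this patch):

#include <cstddef>
#include <cstdint>
#include <map>

struct Endpoint;   // opaque handle, stands in for ucp_ep_h
struct Connection; // stands in for UcxConnection

typedef std::map<uint64_t, Connection*> ConnMap;

// The ep pointer is unique for as long as the endpoint lives,
// so its integer value is a valid lookup key.
static void register_conn(ConnMap &conns, Endpoint *ep, Connection *conn)
{
    conns[reinterpret_cast<uint64_t>(ep)] = conn;
}

static Connection *lookup_conn(const ConnMap &conns, Endpoint *ep)
{
    ConnMap::const_iterator it = conns.find(reinterpret_cast<uint64_t>(ep));
    return (it == conns.end()) ? NULL : it->second;
}
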
+    param.arg        = arg;
+    ucp_worker_set_am_recv_handler(_worker, &param);
+}

 #define UCX_CONN_LOG UcxLog(_log_prefix, true)

 unsigned UcxConnection::_num_instances = 0;

-UcxConnection::UcxConnection(UcxContext &context, uint32_t conn_id) :
-    _context(context), _conn_id(conn_id), _remote_conn_id(0),
-    _ep(NULL), _close_request(NULL), _ucx_status(UCS_OK)
+UcxConnection::UcxConnection(UcxContext &context, bool use_am) :
+    _context(context),
+    _establish_cb(NULL),
+    _disconnect_cb(NULL),
+    _conn_id(context.get_next_conn_id()),
+    _remote_conn_id(0),
+    _ep(NULL),
+    _close_request(NULL),
+    _ucx_status(UCS_INPROGRESS),
+    _use_am(use_am)
 {
     ++_num_instances;
     struct sockaddr_in in_addr = {0};
-    in_addr.sin_family = AF_INET;
+    in_addr.sin_family         = AF_INET;
     set_log_prefix((const struct sockaddr*)&in_addr, sizeof(in_addr));
     ucs_list_head_init(&_all_requests);
     UCX_CONN_LOG << "created new connection " << this << " total: "
                  << _num_instances;
@@ -427,45 +620,38 @@ UcxConnection::UcxConnection(UcxContext &context, uint32_t conn_id) :

 UcxConnection::~UcxConnection()
 {
-    UCX_CONN_LOG << "destroying, ep is " << _ep;
-
-    // if _ep is NULL, connection was closed and removed by error handler
-    if (_ep != NULL) {
-        disconnect(UCP_EP_CLOSE_MODE_FORCE);
-    }
-
-    if (_close_request) {
-        _context.wait_completion(_close_request, "ep close");
-    }
-
-    // wait until all requests are completed
-    if (!ucs_list_is_empty(&_all_requests)) {
-        UCX_CONN_LOG << "waiting for " << ucs_list_length(&_all_requests) <<
-                        " uncompleted requests";
-    }
-    while (!ucs_list_is_empty(&_all_requests)) {
-        ucp_worker_progress(_context.worker());
-    }
+    /* the establish callback must have been destroyed by now, since it
+     * accesses the connection */
+    assert(_establish_cb == NULL);
+    assert(_disconnect_cb == NULL);
+    assert(_ep == NULL);
+    assert(ucs_list_is_empty(&_all_requests));
+    assert(!UCS_PTR_IS_PTR(_close_request));

     UCX_CONN_LOG << "released";
     --_num_instances;
 }

-bool UcxConnection::connect(const struct sockaddr* saddr, socklen_t addrlen)
+void UcxConnection::connect(const struct sockaddr *saddr, socklen_t addrlen,
+                            UcxCallback *callback)
 {
     set_log_prefix(saddr, addrlen);

     ucp_ep_params_t ep_params;
-    ep_params.field_mask = UCP_EP_PARAM_FIELD_FLAGS |
+    ep_params.field_mask       = UCP_EP_PARAM_FIELD_FLAGS |
                                  UCP_EP_PARAM_FIELD_SOCK_ADDR;
     ep_params.flags            = UCP_EP_PARAMS_FLAGS_CLIENT_SERVER;
     ep_params.sockaddr.addr    = saddr;
     ep_params.sockaddr.addrlen = addrlen;

-    return connect_common(ep_params);
+    char sockaddr_str[UCS_SOCKADDR_STRING_LEN];
+    UCX_CONN_LOG << "Connecting to "
+                 << ucs_sockaddr_str(saddr, sockaddr_str,
+                                     UCS_SOCKADDR_STRING_LEN);
+    connect_common(ep_params, callback);
 }

-bool UcxConnection::accept(ucp_conn_request_h conn_req)
+void UcxConnection::accept(ucp_conn_request_h conn_req, UcxCallback *callback)
 {
     ucp_conn_request_attr_t conn_req_attr;
     conn_req_attr.field_mask = UCP_CONN_REQUEST_ATTR_FIELD_CLIENT_ADDR;
@@ -481,8 +667,48 @@ bool UcxConnection::accept(ucp_conn_request_h conn_req)
     ucp_ep_params_t ep_params;
     ep_params.field_mask   = UCP_EP_PARAM_FIELD_CONN_REQUEST;
     ep_params.conn_request = conn_req;
+    connect_common(ep_params, callback);
+}
+
+void UcxConnection::disconnect(UcxCallback *callback)
+{
+    /* the establish callback must have been destroyed by now, since it
+     * accesses the connection */
+    assert(_establish_cb == NULL);
+    assert(_disconnect_cb == NULL);
+    assert(_ep != NULL);
+
+    UCX_CONN_LOG << "destroying, ep is " << _ep;
+    ep_close(UCP_EP_CLOSE_MODE_FORCE);
+
+    _disconnect_cb = callback;
+    if (ucs_list_is_empty(&_all_requests)) {
+        _context.move_connection_to_disconnecting(this);
+    } else {
+        cancel_all();
+    }
+}
+
+bool UcxConnection::disconnect_progress()
+{
+    assert(_ep == NULL);
+    assert(_disconnect_cb != NULL);
+
+    if (UCS_PTR_IS_PTR(_close_request)) {
+        if (ucp_request_check_status(_close_request) == UCS_INPROGRESS) {
+            return false;
+        } else {
+            ucp_request_free(_close_request);
+            _close_request = NULL;
+        }
+    }

-    return connect_common(ep_params);
+    assert(ucs_list_is_empty(&_all_requests));
+    UcxCallback *cb = _disconnect_cb;
+    _disconnect_cb  = NULL;
+    // invoke last since it can delete this object
+    (*cb)(UCS_OK);
+    return true;
 }

 bool UcxConnection::send_io_message(const void *buffer, size_t length,
@@ -515,6 +741,48 @@ bool UcxConnection::recv_data(void *buffer, size_t length, uint32_t sn,
     return process_request("ucp_tag_recv_nb", ptr_status, callback);
 }

+bool UcxConnection::send_am(const void *meta, size_t meta_length,
+                            const void *buffer, size_t length,
+                            UcxCallback* callback)
+{
+    if (_ep == NULL) {
+        return false;
+    }
+
+    ucp_request_param_t param;
+    param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK |
+                         UCP_OP_ATTR_FIELD_FLAGS;
+    param.cb.send      = common_request_callback_nbx;
+    param.flags        = UCP_AM_SEND_REPLY;
+    param.datatype     = 0; // make coverity happy
+
+    ucs_status_ptr_t sptr = ucp_am_send_nbx(_ep, AM_MSG_ID, meta, meta_length,
+                                            buffer, length, &param);
+    return process_request("ucp_am_send_nbx", sptr, callback);
+}
+
+bool UcxConnection::recv_am_data(void *buffer, size_t length,
+                                 const UcxAmDesc &data_desc,
+                                 UcxCallback* callback)
+{
+    assert(_ep != NULL);
+
+    if (!(data_desc._param->recv_attr & UCP_AM_RECV_ATTR_FLAG_RNDV)) {
+        memcpy(buffer, data_desc._data, length);
+        (*callback)(UCS_OK);
+        return true;
+    }
+
+    ucp_request_param_t params;
+    params.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK |
+                          UCP_OP_ATTR_FLAG_NO_IMM_CMPL;
+    params.cb.recv_am   = am_data_recv_callback;
+    ucs_status_ptr_t sp = ucp_am_recv_data_nbx(_context.worker(),
+                                               data_desc._data,
+                                               buffer, length, &params);
+    return process_request("ucp_am_recv_data_nbx", sp, callback);
+}
+
 void UcxConnection::cancel_all()
 {
     if (ucs_list_is_empty(&_all_requests)) {
@@ -524,11 +792,10 @@ void UcxConnection::cancel_all()
     ucx_request *request, *tmp;
     unsigned count = 0;
     ucs_list_for_each_safe(request, tmp, &_all_requests, pos) {
-        ucp_request_cancel(_context.worker(), request);
         ++count;
+        UCX_CONN_LOG << "canceling " << request << " request #" << count;
+        ucp_request_cancel(_context.worker(), request);
     }
-
-    UCX_CONN_LOG << "canceling " << count << " requests ";
 }

 ucp_tag_t UcxConnection::make_data_tag(uint32_t conn_id, uint32_t sn)
@@ -548,6 +815,18 @@ void UcxConnection::stream_send_callback(void *request, ucs_status_t status)
 void UcxConnection::stream_recv_callback(void *request, ucs_status_t status,
                                          size_t recv_len)
 {
+    ucx_request *r      = reinterpret_cast<ucx_request*>(request);
+    UcxConnection *conn = r->conn;
+
+    if (!conn->is_established()) {
+        assert(conn->_establish_cb == r->callback);
+        conn->established(status);
+    } else {
+        assert(UCS_STATUS_IS_ERR(conn->ucx_status()));
+    }
+
+    conn->request_completed(r);
+    UcxContext::request_release(r);
 }

 void UcxConnection::common_request_callback(void *request, ucs_status_t status)
@@ -555,6 +834,8 @@ void UcxConnection::common_request_callback(void *request, ucs_status_t status)
     ucx_request *r = reinterpret_cast<ucx_request*>(request);

     assert(!r->completed);
+    r->status = status;
+
     if (r->callback) {
         // already processed by send/recv function
         (*r->callback)(status);
@@ -563,7 +844,6 @@ void
UcxConnection::common_request_callback(void *request, ucs_status_t status)
     } else {
         // not yet processed by "process_request"
         r->completed = true;
-        r->status    = status;
     }
 }
 
@@ -573,6 +853,19 @@ void UcxConnection::data_recv_callback(void *request, ucs_status_t status,
     common_request_callback(request, status);
 }
 
+void UcxConnection::common_request_callback_nbx(void *request,
+                                                ucs_status_t status,
+                                                void *user_data)
+{
+    common_request_callback(request, status);
+}
+
+void UcxConnection::am_data_recv_callback(void *request, ucs_status_t status,
+                                          size_t length, void *user_data)
+{
+    common_request_callback(request, status);
+}
+
 void UcxConnection::error_callback(void *arg, ucp_ep_h ep, ucs_status_t status)
 {
     reinterpret_cast<UcxConnection*>(arg)->handle_connection_error(status);
@@ -592,10 +885,49 @@ void UcxConnection::set_log_prefix(const struct sockaddr* saddr,
     memcpy(_log_prefix, ss.str().c_str(), length);
 }
 
-bool UcxConnection::connect_common(ucp_ep_params_t& ep_params)
+void UcxConnection::connect_tag(UcxCallback *callback)
+{
+    const ucp_datatype_t dt_int = ucp_dt_make_contig(sizeof(uint32_t));
+    size_t recv_len;
+
+    // receive remote connection id
+    void *rreq = ucp_stream_recv_nb(_ep, &_remote_conn_id, 1, dt_int,
+                                    stream_recv_callback, &recv_len,
+                                    UCP_STREAM_RECV_FLAG_WAITALL);
+    if (UCS_PTR_IS_PTR(rreq)) {
+        process_request("conn_id receive", rreq, callback);
+        _context._conns_in_progress.push_back(std::make_pair(
+                UcxContext::get_time() + _context._connect_timeout, this));
+    } else {
+        established(UCS_PTR_STATUS(rreq));
+        if (rreq != NULL) {
+            // failed to receive
+            return;
+        }
+    }
+
+    // send local connection id
+    void *sreq = ucp_stream_send_nb(_ep, &_conn_id, 1, dt_int,
+                                    stream_send_callback, 0);
+    // no need to check the status here: if the endpoint fails, the error is
+    // handled by ep_params.err_handler.cb, which is set above
+    if (UCS_PTR_IS_PTR(sreq)) {
+        ucp_request_free(sreq);
+    }
+}
+
+void UcxConnection::connect_am(UcxCallback *callback)
+{
+    // With AM, use the ep pointer as the connection ID. The AM receive
+    // callback provides the reply ep, which can be used to find the proper
+    // connection.
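+    // For example (sketch only, not part of this change; assumes the
+    // server-side AM callback reads the reply ep out of ucp_am_recv_param_t):
+    //   conn_map_t::iterator it =
+    //       _conns.find(reinterpret_cast<uint64_t>(param->reply_ep));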
+    _conn_id = reinterpret_cast<uint64_t>(_ep);
+    established(UCS_OK);
+}
+
+void UcxConnection::connect_common(ucp_ep_params_t &ep_params,
+                                   UcxCallback *callback)
 {
-    UcxContext::wait_status_t wait_status;
-    double connect_timeout = _context.connect_timeout();
+    _establish_cb = callback;
 
     // create endpoint
     ep_params.field_mask |= UCP_EP_PARAM_FIELD_ERR_HANDLER |
@@ -608,50 +940,34 @@ bool UcxConnection::connect_common(ucp_ep_params_t& ep_params)
     if (status != UCS_OK) {
         assert(_ep == NULL);
         UCX_LOG << "ucp_ep_create() failed: " << ucs_status_string(status);
-        return false;
+        handle_connection_error(status);
+        return;
     }
 
-    UCX_CONN_LOG << "created endpoint " << _ep << ", exchanging connection id";
+    UCX_CONN_LOG << "created endpoint " << _ep << ", connection id "
+                 << _conn_id;
 
-    const ucp_datatype_t dt_int = ucp_dt_make_contig(sizeof(uint32_t));
+    if (_use_am) {
+        connect_am(callback);
+    } else {
+        connect_tag(callback);
+    }
 
-    // receive remote connection id
-    size_t recv_len;
-    void *rreq = ucp_stream_recv_nb(_ep, &_remote_conn_id, 1, dt_int,
-                                    stream_recv_callback, &recv_len,
-                                    UCP_STREAM_RECV_FLAG_WAITALL);
-    const char *rreq_title = "conn_id receive";
+    _context.add_connection(this);
+}
 
-    // send local connection id
-    void *sreq = ucp_stream_send_nb(_ep, &_conn_id, 1, dt_int,
-                                    stream_send_callback, 0);
-    const char *sreq_title = "conn_id send";
-
-    wait_status = _context.wait_completion(sreq, sreq_title, connect_timeout);
-    if (wait_status != UcxContext::WAIT_STATUS_OK) {
-        UCX_CONN_LOG << "failed to send remote connection id";
-        ep_close(UCP_EP_CLOSE_MODE_FORCE);
-        if (wait_status == UcxContext::WAIT_STATUS_TIMED_OUT) {
-            _context.wait_completion(sreq, sreq_title);
-        }
-        // wait for receive request as well, which should be canceled by ep close
-        _context.wait_completion(rreq, rreq_title);
-        return false;
+void UcxConnection::established(ucs_status_t status)
+{
+    if (!_use_am && (status == UCS_OK)) {
+        assert(_remote_conn_id != 0);
+        UCX_CONN_LOG << "Remote id is " << _remote_conn_id;
     }
 
-    // wait to complete receiving remote connection id
-    wait_status = _context.wait_completion(rreq, rreq_title, connect_timeout);
-    if (wait_status != UcxContext::WAIT_STATUS_OK) {
-        UCX_CONN_LOG << "failed to receive remote connection id";
-        ep_close(UCP_EP_CLOSE_MODE_FORCE);
-        if (wait_status == UcxContext::WAIT_STATUS_TIMED_OUT) {
-            _context.wait_completion(rreq, rreq_title);
-        }
-        return false;
-    }
+    _ucx_status = status;
+    _context.remove_connection_inprogress(this);
 
-    UCX_CONN_LOG << "remote id is " << _remote_conn_id;
-    return true;
+    (*_establish_cb)(status);
+    _establish_cb = NULL;
 }
 
 bool UcxConnection::send_common(const void *buffer, size_t length, ucp_tag_t tag,
@@ -676,26 +992,36 @@ void UcxConnection::request_completed(ucx_request *r)
 {
     assert(r->conn == this);
     ucs_list_del(&r->pos);
+
+    if (_disconnect_cb != NULL) {
+        UCX_CONN_LOG << "completing request " << r << " with status \""
+                     << ucs_status_string(r->status) << "\" (" << r->status
+                     << ")" << " during disconnect";
+
+        if (ucs_list_is_empty(&_all_requests)) {
+            _context.move_connection_to_disconnecting(this);
+        }
+    }
 }
 
 void UcxConnection::handle_connection_error(ucs_status_t status)
 {
+    if (UCS_STATUS_IS_ERR(_ucx_status)) {
+        return;
+    }
+
     UCX_CONN_LOG << "detected error: " << ucs_status_string(status);
     _ucx_status = status;
 
-    if (_remote_conn_id != 0) {
-        /* the upper layer should close the connection */
+    /* the upper layer should close the connection */
+    if (is_established()) {
         _context.handle_connection_error(this);
+    } else {
(*_establish_cb)(status); + _establish_cb = NULL; } } -void UcxConnection::disconnect(enum ucp_ep_close_mode mode) -{ - _context.remove_connection(this); - cancel_all(); - ep_close(mode); -} - void UcxConnection::ep_close(enum ucp_ep_close_mode mode) { static const char *mode_str[] = {"force", "flush"}; diff --git a/test/apps/iodemo/ucx_wrapper.h b/test/apps/iodemo/ucx_wrapper.h index f362c130a39..0e6d832bb23 100644 --- a/test/apps/iodemo/ucx_wrapper.h +++ b/test/apps/iodemo/ucx_wrapper.h @@ -9,19 +9,23 @@ #include #include +#include +#include #include #include #include +#include #include #include #include -#include +#include #define MAX_LOG_PREFIX_SIZE 64 /* Forward declarations */ class UcxConnection; struct ucx_request; +struct UcxAmDesc; /* * UCX callback for send/receive completion @@ -56,15 +60,15 @@ class UcxLog { ~UcxLog(); template - const UcxLog& operator<<(const T &t) const { - if (_enable) { - std::cout << t; + UcxLog& operator<<(const T &t) { + if (_ss != NULL) { + (*_ss) << t; } return *this; } private: - const bool _enable; + std::stringstream *_ss; }; @@ -72,8 +76,32 @@ class UcxLog { * Holds UCX global context and worker */ class UcxContext { + class UcxAcceptCallback : public UcxCallback { + public: + UcxAcceptCallback(UcxContext &context, UcxConnection &connection); + + virtual void operator()(ucs_status_t status); + + private: + UcxContext &_context; + UcxConnection &_connection; + }; + +protected: + class UcxDisconnectCallback : public UcxCallback { + public: + UcxDisconnectCallback(UcxConnection &conn); + + virtual ~UcxDisconnectCallback(); + + virtual void operator()(ucs_status_t status); + + private: + UcxConnection *_conn; + }; + public: - UcxContext(size_t iomsg_size, double connect_timeout); + UcxContext(size_t iomsg_size, double connect_timeout, bool use_am); virtual ~UcxContext(); @@ -81,16 +109,27 @@ class UcxContext { bool listen(const struct sockaddr* saddr, size_t addrlen); - UcxConnection* connect(const struct sockaddr* saddr, size_t addrlen); - void progress(); + static const std::string sockaddr_str(const struct sockaddr* saddr, + size_t addrlen); + + void destroy_connections(); + + static double get_time(); + protected: // Called when new IO message is received virtual void dispatch_io_message(UcxConnection* conn, const void *buffer, size_t length) = 0; + // Called when new AM message is received + // (note IO message can be bundled with data) + virtual void dispatch_am_message(UcxConnection* conn, const void *hdr, + size_t hdr_length, + const UcxAmDesc &data_desc) = 0; + // Called when there is a fatal failure on the connection virtual void dispatch_connection_error(UcxConnection* conn) = 0; @@ -104,6 +143,11 @@ class UcxContext { WAIT_STATUS_TIMED_OUT } wait_status_t; + typedef struct { + ucp_conn_request_h conn_request; + struct timeval arrival_time; + } conn_req_t; + friend class UcxConnection; static const ucp_tag_t IOMSG_TAG = 1ull << 63; @@ -121,19 +165,27 @@ class UcxContext { static void iomsg_recv_callback(void *request, ucs_status_t status, ucp_tag_recv_info *info); - static const std::string sockaddr_str(const struct sockaddr* saddr, - size_t addrlen); + static ucs_status_t am_recv_callback(void *arg, const void *header, + size_t header_length, + void *data, size_t length, + const ucp_am_recv_param_t *param); ucp_worker_h worker() const; double connect_timeout() const; + int is_timeout_elapsed(struct timeval const *tv_prior, double timeout); + + void progress_timed_out_conns(); + void progress_conn_requests(); void progress_io_message(); 
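+    // A progress() pass would poll the worker and then drain these queues;
+    // an illustrative sketch (not the actual definition):
+    //   while (ucp_worker_progress(_worker));
+    //   progress_io_message();
+    //   progress_conn_requests();
+    //   progress_timed_out_conns();
+    //   progress_failed_connections();
+    //   progress_disconnected_connections();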
     void progress_failed_connections();
 
+    void progress_disconnected_connections();
+
     wait_status_t wait_completion(ucs_status_ptr_t status_ptr,
                                   const char *title, double timeout = 1e6);
@@ -143,37 +195,50 @@ class UcxContext {
 
     void remove_connection(UcxConnection *conn);
 
-    void handle_connection_error(UcxConnection *conn);
+    void remove_connection_inprogress(UcxConnection *conn);
 
-    void destroy_connections();
+    void move_connection_to_disconnecting(UcxConnection *conn);
+
+    void handle_connection_error(UcxConnection *conn);
 
     void destroy_listener();
 
     void destroy_worker();
 
-    typedef std::map<uint32_t, UcxConnection*> conn_map_t;
-
-    ucp_context_h                  _context;
-    ucp_worker_h                   _worker;
-    ucp_listener_h                 _listener;
-    conn_map_t                     _conns;
-    std::deque<ucp_conn_request_h> _conn_requests;
-    std::deque<UcxConnection*>     _failed_conns;
-    ucx_request*                   _iomsg_recv_request;
-    std::string                    _iomsg_buffer;
-    double                         _connect_timeout;
+    void set_am_handler(ucp_am_recv_callback_t cb, void *arg);
+
+    typedef std::map<uint64_t, UcxConnection*>              conn_map_t;
+    typedef std::vector<std::pair<double, UcxConnection*> > timeout_conn_t;
+
+    ucp_context_h               _context;
+    ucp_worker_h                _worker;
+    ucp_listener_h              _listener;
+    conn_map_t                  _conns;
+    std::deque<conn_req_t>      _conn_requests;
+    timeout_conn_t              _conns_in_progress; // ordered in time
+    std::deque<UcxConnection*>  _failed_conns;
+    std::list<UcxConnection*>   _disconnecting_conns;
+    ucx_request                *_iomsg_recv_request;
+    std::string                 _iomsg_buffer;
+    double                      _connect_timeout;
+    bool                        _use_am;
 };
 
 
 class UcxConnection {
 public:
-    UcxConnection(UcxContext& context, uint32_t conn_id);
+    UcxConnection(UcxContext &context, bool use_am);
 
     ~UcxConnection();
 
-    bool connect(const struct sockaddr* saddr, socklen_t addrlen);
+    void connect(const struct sockaddr *saddr, socklen_t addrlen,
+                 UcxCallback *callback);
 
-    bool accept(ucp_conn_request_h conn_req);
+    void accept(ucp_conn_request_h conn_req, UcxCallback *callback);
+
+    void disconnect(UcxCallback *callback);
+
+    bool disconnect_progress();
 
     bool send_io_message(const void *buffer, size_t length,
                          UcxCallback* callback = EmptyCallback::get());
@@ -184,9 +249,16 @@ class UcxConnection {
     bool recv_data(void *buffer, size_t length, uint32_t sn,
                    UcxCallback* callback = EmptyCallback::get());
 
+    bool send_am(const void *meta, size_t meta_length,
+                 const void *buffer, size_t length,
+                 UcxCallback* callback = EmptyCallback::get());
+
+    bool recv_am_data(void *buffer, size_t length, const UcxAmDesc &data_desc,
+                      UcxCallback* callback = EmptyCallback::get());
+
     void cancel_all();
 
-    uint32_t id() const {
+    uint64_t id() const {
         return _conn_id;
    }
 
@@ -194,6 +266,20 @@ class UcxConnection {
        return _ucx_status;
    }
 
+    const char* get_log_prefix() const {
+        return _log_prefix;
+    }
+
+    bool is_established() const {
+        return _establish_cb == NULL;
+    }
+
+    bool is_disconnecting() const {
+        return _disconnect_cb != NULL;
+    }
+
+    void handle_connection_error(ucs_status_t status);
+
 private:
     static ucp_tag_t make_data_tag(uint32_t conn_id, uint32_t sn);
 
@@ -206,6 +292,12 @@ class UcxConnection {
 
     static void common_request_callback(void *request, ucs_status_t status);
 
+    static void common_request_callback_nbx(void *request, ucs_status_t status,
+                                            void *user_data);
+
+    static void am_data_recv_callback(void *request, ucs_status_t status,
+                                      size_t length, void *user_data);
+
     static void data_recv_callback(void *request, ucs_status_t status,
                                    ucp_tag_recv_info *info);
 
@@ -213,7 +305,13 @@ class UcxConnection {
 
     void set_log_prefix(const struct sockaddr* saddr, socklen_t addrlen);
 
-    bool connect_common(ucp_ep_params_t& ep_params);
+    void connect_common(ucp_ep_params_t &ep_params, UcxCallback *callback);
+
+    void connect_tag(UcxCallback *callback);
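+
+    // connect_common() dispatches to one of the two establishment paths
+    // declared here, matching the definitions in ucx_wrapper.cc:
+    //   _use_am ? connect_am(callback) : connect_tag(callback);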
+ + void connect_am(UcxCallback *callback); + + void established(ucs_status_t status); bool send_common(const void *buffer, size_t length, ucp_tag_t tag, UcxCallback* callback); @@ -222,25 +320,24 @@ class UcxConnection { void request_completed(ucx_request *r); - void handle_connection_error(ucs_status_t status); - - void disconnect(enum ucp_ep_close_mode mode); - void ep_close(enum ucp_ep_close_mode mode); bool process_request(const char *what, ucs_status_ptr_t ptr_status, UcxCallback* callback); - static unsigned _num_instances; - - UcxContext& _context; - uint32_t _conn_id; - uint32_t _remote_conn_id; - char _log_prefix[MAX_LOG_PREFIX_SIZE]; - ucp_ep_h _ep; - void* _close_request; - ucs_list_link_t _all_requests; - ucs_status_t _ucx_status; + static unsigned _num_instances; + + UcxContext &_context; + UcxCallback *_establish_cb; + UcxCallback *_disconnect_cb; + uint64_t _conn_id; + uint64_t _remote_conn_id; + char _log_prefix[MAX_LOG_PREFIX_SIZE]; + ucp_ep_h _ep; + void *_close_request; + ucs_list_link_t _all_requests; + ucs_status_t _ucx_status; + bool _use_am; }; #endif diff --git a/test/apps/profiling/ucx_profiling.c b/test/apps/profiling/ucx_profiling.c index 38c3eb645cf..0606a6af957 100644 --- a/test/apps/profiling/ucx_profiling.c +++ b/test/apps/profiling/ucx_profiling.c @@ -17,12 +17,14 @@ UCS_PROFILE_FUNC(double, calc_pi, (count), int count) { pi_d_4 = 0.0; /* Profile a block of code */ - UCS_PROFILE_CODE("leibnitz") { - for (n = 0; n < count; ++n) { - pi_d_4 += pow(-1.0, n) / (2 * n + 1); - - /* create a timestamp for each step */ - UCS_PROFILE_SAMPLE("step"); + { + UCS_PROFILE_CODE("leibnitz") { + for (n = 0; n < count; ++n) { + pi_d_4 += pow(-1.0, n) / (2 * n + 1); + + /* create a timestamp for each step */ + UCS_PROFILE_SAMPLE("step"); + } } } diff --git a/test/apps/test_cuda_hook.c b/test/apps/test_cuda_hook.c new file mode 100644 index 00000000000..11a1d8423bc --- /dev/null +++ b/test/apps/test_cuda_hook.c @@ -0,0 +1,130 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include + + +static void event_cb(ucm_event_type_t event_type, ucm_event_t *event, void *arg) +{ + int *count_p = arg; + const char *title; + + if (event_type == UCM_EVENT_MEM_TYPE_ALLOC) { + title = "allocate"; + } else if (event_type == UCM_EVENT_MEM_TYPE_FREE) { + title = "free"; + } else { + printf("unexpected memory event type %d\n", event_type); + return; + } + + printf("%s %s address %p size %zu\n", title, + ucs_memory_type_names[event->mem_type.mem_type], + event->mem_type.address, event->mem_type.size); + ++(*count_p); +} + +static void alloc_driver_api() +{ + CUdeviceptr dptr = 0; + CUcontext context; + CUdevice device; + CUresult res; + + res = cuInit(0); + if (res != CUDA_SUCCESS) { + printf("cuInit() failed: %d\n", res); + return; + } + + res = cuDeviceGet(&device, 0); + if (res != CUDA_SUCCESS) { + printf("cuDeviceGet(0) failed: %d\n", res); + return; + } + + res = cuCtxCreate(&context, 0, device); + if (res != CUDA_SUCCESS) { + printf("cuCtxCreate() failed: %d\n", res); + return; + } + + res = cuMemAlloc(&dptr, 4096); + printf("cuMemAlloc() returned 0x%lx result %d\n", (uintptr_t)dptr, res); + cuMemFree(dptr); + + cuCtxDetach(context); +} + +static void alloc_runtime_api() +{ + void *dptr = NULL; + cudaError_t res; + + res = cudaMalloc(&dptr, 4096); + printf("cudaMalloc() returned %p result %d\n", dptr, res); + cudaFree(dptr); +} + +int main(int argc, char **argv) +{ + static const ucm_event_type_t memtype_events = UCM_EVENT_MEM_TYPE_ALLOC | + UCM_EVENT_MEM_TYPE_FREE; + static const int num_expected_events = 2; + ucp_context_h context; + ucs_status_t status; + ucp_params_t params; + int use_driver_api; + int num_events; + int c; + + use_driver_api = 0; + while ((c = getopt(argc, argv, "d")) != -1) { + switch (c) { + case 'd': + use_driver_api = 1; + break; + default: + printf("Usage: test_cuda_hook [options]\n"); + printf("Options are:\n"); + printf(" -d : Use Cuda driver API (Default: use runtime API)\n"); + printf("\n"); + return -1; + } + } + + params.field_mask = UCP_PARAM_FIELD_FEATURES; + params.features = UCP_FEATURE_TAG | UCP_FEATURE_STREAM; + status = ucp_init(¶ms, NULL, &context); + if (status != UCS_OK) { + printf("failed to create context\n"); + return -1; + } + + num_events = 0; + ucm_set_event_handler(memtype_events, 1000, event_cb, &num_events); + + if (use_driver_api) { + alloc_driver_api(); + } else { + alloc_runtime_api(); + } + + ucm_unset_event_handler(memtype_events, event_cb, &num_events); + printf("got %d/%d memory events\n", num_events, num_expected_events); + + ucp_cleanup(context); + + return (num_events >= num_expected_events) ? 
0 : -1; +} diff --git a/test/apps/test_dlopen_cfg_print.c b/test/apps/test_dlopen_cfg_print.c index acc488e6491..f0d65c42acd 100644 --- a/test/apps/test_dlopen_cfg_print.c +++ b/test/apps/test_dlopen_cfg_print.c @@ -42,6 +42,7 @@ int main(int argc, char **argv) void *ucs_handle, *uct_handle; ucs_list_link_t *config_list; int i; + print_all_opts_func_t print_all_opts; /* unload and reload uct while ucs is loaded * would fail if uct global vars are kept on global lists in ucs */ @@ -52,7 +53,7 @@ int main(int argc, char **argv) } /* print all config table, to force going over the global list in ucs */ - print_all_opts_func_t print_all_opts = + print_all_opts = (print_all_opts_func_t)dlsym(ucs_handle, "ucs_config_parser_print_all_opts"); config_list = (ucs_list_link_t*)dlsym(ucs_handle, "ucs_config_global_list"); print_all_opts(stdout, "TEST_", 0, config_list); diff --git a/test/apps/test_init_mt.c b/test/apps/test_init_mt.c new file mode 100644 index 00000000000..612c506cc27 --- /dev/null +++ b/test/apps/test_init_mt.c @@ -0,0 +1,55 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include + +#if _OPENMP +#include +#endif + + +int main(int argc, char **argv) +{ + int count = 0; + +#pragma omp parallel + { + ucs_status_t ctx_status, worker_status; + ucp_context_h context; + ucp_worker_h worker; + ucp_params_t params; + ucp_worker_params_t wparams; + + params.field_mask = UCP_PARAM_FIELD_FEATURES; + params.features = UCP_FEATURE_TAG | UCP_FEATURE_STREAM; + ctx_status = ucp_init(¶ms, NULL, &context); + if (ctx_status == UCS_OK) { + wparams.field_mask = 0; + worker_status = ucp_worker_create(context, &wparams, &worker); + if (worker_status == UCS_OK) { + __sync_add_and_fetch(&count, 1); + } + } + +#pragma omp barrier + + if (ctx_status == UCS_OK) { + if (worker_status == UCS_OK) { + ucp_worker_destroy(worker); + } + ucp_cleanup(context); + } + } + +#pragma omp barrier + + printf("finished %d threads\n", count); + return 0; +} diff --git a/test/apps/test_ucx_tls.py b/test/apps/test_ucx_tls.py index d5e6d04207b..ae99cc5742c 100755 --- a/test/apps/test_ucx_tls.py +++ b/test/apps/test_ucx_tls.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python2 # # Copyright (C) Mellanox Technologies Ltd. 2017-. ALL RIGHTS RESERVED. 
# @@ -10,6 +10,8 @@ import os import re import commands +import itertools +import contextlib from distutils.version import LooseVersion from optparse import OptionParser @@ -80,6 +82,37 @@ "mlx5_override" : mlx5_am_override } +tl_aliases = { + "mm": ["posix", "sysv", "xpmem", ], + "sm": ["posix", "sysv", "xpmem", "knem", "cma", "rdmacm", "sockcm", ], + "shm": ["posix", "sysv", "xpmem", "knem", "cma", "rdmacm", "sockcm", ], + "ib": ["rc_verbs", "ud_verbs", "rc_mlx5", "ud_mlx5", "dc_mlx5", "rdmacm", ], + "ud_v": ["ud_verbs", "rdmacm", ], + "ud_x": ["ud_mlx5", "rdmacm", ], + "ud": ["ud_mlx5", "ud_verbs", "rdmacm", ], + "rc_v": ["rc_verbs", "ud_verbs:aux", "rdmacm", ], + "rc_x": ["rc_mlx5", "ud_mlx5:aux", "rdmacm", ], + "rc": ["rc_mlx5", "ud_mlx5:aux", "rc_verbs", "ud_verbs:aux", "rdmacm", ], + "dc": ["dc_mlx5", "rdmacm", ], + "dc_x": ["dc_mlx5", "rdmacm", ], + "ugni": ["ugni_smsg", "ugni_udt:aux", "ugni_rdma", ], + "cuda": ["cuda_copy", "cuda_ipc", "gdr_copy", ], + "rocm": ["rocm_copy", "rocm_ipc", "rocm_gdr", ], +} + +@contextlib.contextmanager +def _override_env(var_name, value): + if value is None: + yield + return + + prev_value = os.getenv(var_name) + os.putenv(var_name, value) + try: + yield + finally: + os.putenv(var_name, prev_value) if prev_value else os.unsetenv(var_name) + def exec_cmd(cmd): if options.verbose: print cmd @@ -91,18 +124,14 @@ def exec_cmd(cmd): return status, output -def find_am_transport(dev, neps, override = 0) : - - os.putenv("UCX_TLS", "ib") - os.putenv("UCX_NET_DEVICES", dev) - +def find_am_transport(dev, neps=1, override=0, tls="ib"): if (override): os.putenv("UCX_NUM_EPS", "2") + + with _override_env("UCX_TLS", tls), \ + _override_env("UCX_NET_DEVICES", dev): - status, output = exec_cmd(ucx_info + ucx_info_args + str(neps) + " | grep am") - - os.unsetenv("UCX_TLS") - os.unsetenv("UCX_NET_DEVICES") + status, output = exec_cmd(ucx_info + ucx_info_args + str(neps) + " | grep am") match = re.search(r'\d+:(\S+)/\S+', output) if match: @@ -112,7 +141,7 @@ def find_am_transport(dev, neps, override = 0) : return am_tls else: - return "no am tls" + return None def test_fallback_from_rc(dev, neps) : @@ -140,6 +169,61 @@ def test_fallback_from_rc(dev, neps) : os.unsetenv("UCX_TLS") +def test_ucx_tls_positive(tls): + # Use TLS list in "allow" mode and verify that the found tl is in the list + found_tl = find_am_transport(None, tls=tls) + print "Using UCX_TLS=" + tls + ", found TL: " + str(found_tl) + if tls == 'all': + return + tls = tls.split(',') + if found_tl in tls or "\\" + found_tl in tls: + return + for tl in tls: + if tl in tl_aliases and found_tl in tl_aliases[tl]: + return + print "Found TL doesn't belong to the allowed UCX_TLS" + sys.exit(1) + +def test_ucx_tls_negative(tls): + # Use TLS list in "negate" mode and verify that the found tl is not in the list + found_tl = find_am_transport(None, tls="^"+tls) + print "Using UCX_TLS=^" + tls + ", found TL: " + str(found_tl) + tls = tls.split(',') + if not found_tl or found_tl in tls: + print "No available TL found" + sys.exit(1) + for tl in tls: + if tl in tl_aliases and found_tl in tl_aliases[tl]: + print "Found TL belongs to the forbidden UCX_TLS" + sys.exit(1) + +def _powerset(iterable, with_empty_set=True): + iterable_list = list(iterable) + return itertools.chain.from_iterable( + itertools.combinations(iterable_list, r) for r in \ + range(0 if with_empty_set else 1, len(iterable_list) + 1)) + +def test_tls_allow_list(ucx_info): + status, output = exec_cmd(ucx_info + " -d | grep Transport | awk '{print $3}'") + 
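+    # `ucx_info -d` prints one "Transport: <name>" line per device/TL pair;
+    # the grep/awk pipeline above keeps only the transport names, one per
+    # line (assumed output format, e.g. "rc_verbs", "tcp", "posix")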
available_tls = set(output.splitlines()) + + # Add some basic variants (those that are available on this platform) + tls_variants = [tls_variant for tls_variant in ["tcp", "posix", "xpmem"] if \ + tls_variant in available_tls] + + # Add some IB variant (both strict and alias), if available + for tls_variant in available_tls: + if tls_variant.startswith("rc_") or tls_variant.startswith("dc_") or \ + tls_variant.startswith("ud_"): + tls_variants += ["ib", "\\" + tls_variant] + break + + tls_variants = _powerset(tls_variants, with_empty_set=False) + test_funcs = [test_ucx_tls_positive, test_ucx_tls_negative] + for (tls_variant, test_func) in \ + itertools.product(tls_variants, test_funcs): + test_func(",".join(tls_variant)) + parser = OptionParser() parser.add_option("-p", "--prefix", metavar="PATH", help = "root UCX directory") parser.add_option("-v", "--verbose", action="store_true", \ @@ -196,7 +280,7 @@ def test_fallback_from_rc(dev, neps) : for n_eps in sorted(dev_tl_map): tl = find_am_transport(dev + ':' + port, n_eps) print dev+':' + port + " eps: ", n_eps, " expected am tl: " + \ - dev_tl_map[n_eps] + " selected: " + tl + dev_tl_map[n_eps] + " selected: " + str(tl) if dev_tl_map[n_eps] != tl: sys.exit(1) @@ -204,7 +288,7 @@ def test_fallback_from_rc(dev, neps) : if override: tl = find_am_transport(dev + ':' + port, n_eps, 1) print dev+':' + port + " UCX_NUM_EPS=2 eps: ", n_eps, " expected am tl: " + \ - dev_tl_override_map[n_eps] + " selected: " + tl + dev_tl_override_map[n_eps] + " selected: " + str(tl) if dev_tl_override_map[n_eps] != tl: sys.exit(1) @@ -212,5 +296,8 @@ def test_fallback_from_rc(dev, neps) : if n_eps >= (rc_max_num_eps * 2): test_fallback_from_rc(dev + ':' + port, n_eps) +# Test UCX_TLS configuration (TL choice according to "allow" and "negate" lists) +test_tls_allow_list(ucx_info) + sys.exit(0) diff --git a/test/gtest/Makefile.am b/test/gtest/Makefile.am index 2589eef36ba..6f579e638e7 100644 --- a/test/gtest/Makefile.am +++ b/test/gtest/Makefile.am @@ -4,6 +4,7 @@ # Copyright (C) The University of Tennessee and the University of Tennessee Research Foundation. 2016. ALL RIGHTS RESERVED. # Copyright (C) Los Alamos National Security, LLC. 2018 ALL RIGHTS RESERVED. # Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. +# Copyright (C) ARM Ltd. 2020. ALL RIGHTS RESERVED. # # See file LICENSE for terms. 
# @@ -45,6 +46,7 @@ noinst_PROGRAMS = gtest gtestdir = $(includedir) gtest_LDADD = \ $(top_builddir)/src/ucs/libucs.la \ + $(top_builddir)/src/ucs/vfs/sock/libucs_vfs_sock.la \ $(top_builddir)/src/uct/libuct.la \ $(top_builddir)/src/ucm/libucm.la \ $(top_builddir)/src/ucp/libucp.la \ @@ -55,6 +57,7 @@ gtest_LDADD = \ gtest_CPPFLAGS = \ $(BASE_CPPFLAGS) \ + -DTOP_SRCDIR=\"$(top_srcdir)\" \ -I$(top_srcdir)/src \ -I$(top_srcdir)/test \ -I$(top_builddir)/src \ @@ -103,6 +106,7 @@ gtest_SOURCES = \ uct/test_progress.cc \ uct/test_uct_ep.cc \ uct/test_uct_perf.cc \ + uct/v2/test_uct_query.cc \ uct/test_zcopy_comp.cc \ uct/uct_p2p_test.cc \ uct/uct_test.cc \ @@ -114,10 +118,12 @@ gtest_SOURCES = \ uct/tcp/test_tcp.cc \ \ ucp/test_ucp_am.cc \ + ucp/test_ucp_ep.cc \ ucp/test_ucp_stream.cc \ ucp/test_ucp_peer_failure.cc \ ucp/test_ucp_atomic.cc \ ucp/test_ucp_dt.cc \ + ucp/test_ucp_tls.cc \ ucp/test_ucp_memheap.cc \ ucp/test_ucp_mmap.cc \ ucp/test_ucp_mem_type.cc \ @@ -146,6 +152,7 @@ gtest_SOURCES = \ ucs/test_algorithm.cc \ ucs/test_arbiter.cc \ ucs/test_async.cc \ + ucs/test_bitmap.cc \ ucs/test_callbackq.cc \ ucs/test_class.cc \ ucs/test_config.cc \ @@ -172,6 +179,7 @@ gtest_SOURCES = \ ucs/test_type.cc \ ucs/test_log.cc \ ucs/test_iov.cc \ + ucs/test_vfs.cc \ ucs/arch/test_x86_64.cc if HAVE_IB @@ -202,7 +210,7 @@ endif if HAVE_TL_RC gtest_SOURCES += \ uct/ib/test_rc.cc -endif +endif if HAVE_TL_DC gtest_SOURCES += \ uct/ib/test_dc.cc @@ -218,8 +226,10 @@ gtest_SOURCES += \ ucm/cuda_hooks.cc gtest_CPPFLAGS += \ $(CUDA_CPPFLAGS) +gtest_LDFLAGS += \ + $(CUDA_LDFLAGS) gtest_LDADD += \ - $(CUDA_LDFLAGS) \ + $(CUDA_LIBS) \ $(top_builddir)/src/uct/cuda/libuct_cuda.la endif diff --git a/test/gtest/common/gtest-all.cc b/test/gtest/common/gtest-all.cc index fa67e68e6ab..e6d31bd5b1e 100644 --- a/test/gtest/common/gtest-all.cc +++ b/test/gtest/common/gtest-all.cc @@ -7489,7 +7489,7 @@ void StackLowerThanAddress(const void* ptr, bool* result) { } bool StackGrowsDown() { - int dummy; + int dummy = 0; bool result; StackLowerThanAddress(&dummy, &result); return result; diff --git a/test/gtest/common/main.cc b/test/gtest/common/main.cc index c8f39543d6a..b7476cb59f9 100644 --- a/test/gtest/common/main.cc +++ b/test/gtest/common/main.cc @@ -22,7 +22,7 @@ double ucs::perf_retry_interval = 1.0; void parse_test_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "s:p:i:")) != -1) { + while ((c = getopt(argc, argv, "s:p:i:t:")) != -1) { switch (c) { case 's': ucs_gtest_random_seed = atoi(optarg); @@ -33,8 +33,12 @@ void parse_test_opts(int argc, char **argv) { case 'i': ucs::perf_retry_interval = atof(optarg); break; + case 't': + ucs::watchdog_timeout = atof(optarg); + break; default: - fprintf(stderr, "Usage: gtest [ -s rand-seed ] [ -p count ] [ -i interval ]\n"); + fprintf(stderr, "Usage: gtest [ -s rand-seed ] [ -p count ] " + "[ -i interval ] [ -t timeout ]\n"); exit(1); } } @@ -93,6 +97,9 @@ int main(int argc, char **argv) { ucs_global_opts.warn_unused_env_vars = 0; /* Avoid warnings if not all config vars are being used */ + /* set gpu context for tests that need it */ + mem_buffer::set_device_context(); + ret = ucs::watchdog_start(); if (ret != 0) { ADD_FAILURE() << "Unable to start watchdog - abort"; diff --git a/test/gtest/common/mem_buffer.cc b/test/gtest/common/mem_buffer.cc index 4dee7e3667b..6598101c874 100644 --- a/test/gtest/common/mem_buffer.cc +++ b/test/gtest/common/mem_buffer.cc @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. 
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. * Copyright (C) Advanced Micro Devices, Inc. 2019. ALL RIGHTS RESERVED. * * See file LICENSE for terms. @@ -11,6 +11,7 @@ #include "mem_buffer.h" +#include #include #include #include @@ -23,7 +24,7 @@ do { \ cudaError_t cerr = _code; \ if (cerr != cudaSuccess) { \ - UCS_TEST_ABORT(# _code << " failed" << _details); \ + UCS_TEST_ABORT(#_code << " failed with code " << cerr << _details); \ } \ } while (0) @@ -65,6 +66,11 @@ bool mem_buffer::is_rocm_supported() #endif } +bool mem_buffer::is_gpu_supported() +{ + return is_cuda_supported() || is_rocm_supported(); +} + const std::vector& mem_buffer::supported_mem_types() { static std::vector vec; @@ -84,6 +90,29 @@ const std::vector& mem_buffer::supported_mem_types() return vec; } +void mem_buffer::set_device_context() +{ + static __thread bool device_set = false; + + if (device_set) { + return; + } + +#if HAVE_CUDA + if (is_cuda_supported()) { + cudaSetDevice(0); + } +#endif + +#if HAVE_ROCM + if (is_rocm_supported()) { + hipSetDevice(0); + } +#endif + + device_set = true; +} + void *mem_buffer::allocate(size_t size, ucs_memory_type_t mem_type) { void *ptr; @@ -198,7 +227,7 @@ void mem_buffer::pattern_check(const void *buffer, size_t length) void mem_buffer::pattern_fill(void *buffer, size_t length, uint64_t seed, ucs_memory_type_t mem_type) { - if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(mem_type)) { + if (UCP_MEM_IS_HOST(mem_type)) { pattern_fill(buffer, length, seed); } else { ucs::auto_buffer temp(length); @@ -210,7 +239,7 @@ void mem_buffer::pattern_fill(void *buffer, size_t length, uint64_t seed, void mem_buffer::pattern_check(const void *buffer, size_t length, uint64_t seed, ucs_memory_type_t mem_type) { - if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(mem_type)) { + if (UCP_MEM_IS_HOST(mem_type)) { pattern_check(buffer, length, seed); } else { ucs::auto_buffer temp(length); @@ -224,12 +253,12 @@ void mem_buffer::copy_to(void *dst, const void *src, size_t length, { switch (dst_mem_type) { case UCS_MEMORY_TYPE_HOST: - case UCS_MEMORY_TYPE_CUDA_MANAGED: case UCS_MEMORY_TYPE_ROCM_MANAGED: memcpy(dst, src, length); break; #if HAVE_CUDA case UCS_MEMORY_TYPE_CUDA: + case UCS_MEMORY_TYPE_CUDA_MANAGED: CUDA_CALL(cudaMemcpy(dst, src, length, cudaMemcpyHostToDevice), ": dst=" << dst << " src=" << src << "length=" << length); CUDA_CALL(cudaDeviceSynchronize(), ""); @@ -251,12 +280,12 @@ void mem_buffer::copy_from(void *dst, const void *src, size_t length, { switch (src_mem_type) { case UCS_MEMORY_TYPE_HOST: - case UCS_MEMORY_TYPE_CUDA_MANAGED: case UCS_MEMORY_TYPE_ROCM_MANAGED: memcpy(dst, src, length); break; #if HAVE_CUDA case UCS_MEMORY_TYPE_CUDA: + case UCS_MEMORY_TYPE_CUDA_MANAGED: CUDA_CALL(cudaMemcpy(dst, src, length, cudaMemcpyDeviceToHost), ": dst=" << dst << " src=" << src << "length=" << length); CUDA_CALL(cudaDeviceSynchronize(), ""); @@ -276,7 +305,11 @@ void mem_buffer::copy_from(void *dst, const void *src, size_t length, bool mem_buffer::compare(const void *expected, const void *buffer, size_t length, ucs_memory_type_t mem_type) { - if (UCP_MEM_IS_ACCESSIBLE_FROM_CPU(mem_type)) { + /* don't access managed memory from CPU to avoid moving the pages + * from GPU to CPU during the test + */ + if ((mem_type == UCS_MEMORY_TYPE_HOST) || + (mem_type == UCS_MEMORY_TYPE_ROCM_MANAGED)) { return memcmp(expected, buffer, length) == 0; } else { ucs::auto_buffer temp(length); diff --git a/test/gtest/common/mem_buffer.h b/test/gtest/common/mem_buffer.h index 5a55faa16f2..1a02fe77a21 100644 --- 
a/test/gtest/common/mem_buffer.h
+++ b/test/gtest/common/mem_buffer.h
@@ -1,5 +1,5 @@
 /**
- * Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED.
+ * Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED.
 *
 * See file LICENSE for terms.
 */
@@ -64,6 +64,12 @@ class mem_buffer {
     /* return the string name of a memory type */
     static std::string mem_type_name(ucs_memory_type_t mem_type);
 
+    /* returns whether any other type of memory besides the CPU is supported */
+    static bool is_gpu_supported();
+
+    /* set device context if compiled with GPU support */
+    static void set_device_context();
+
     mem_buffer(size_t size, ucs_memory_type_t mem_type);
 
     virtual ~mem_buffer();
diff --git a/test/gtest/common/test.cc b/test/gtest/common/test.cc
index 9d78d2facd0..8b10b9deb3e 100644
--- a/test/gtest/common/test.cc
+++ b/test/gtest/common/test.cc
@@ -172,7 +172,8 @@ test_base::count_warns_logger(const char *file, unsigned line, const char *funct
     } else if (level == UCS_LOG_LEVEL_WARN) {
         ++m_total_warnings;
     }
-    if (m_first_warns_and_errors.size() < 5) {
+    if ((level <= UCS_LOG_LEVEL_WARN) &&
+        (m_first_warns_and_errors.size() < 5)) {
         /* Save the first few errors/warnings which cause the test to fail */
         va_list ap2;
         va_copy(ap2, ap);
@@ -206,69 +207,86 @@ void test_base::push_debug_message_with_limit(std::vector<std::string>& vec,
 }
 
 ucs_log_func_rc_t
-test_base::hide_errors_logger(const char *file, unsigned line, const char *function,
-                              ucs_log_level_t level,
-                              const ucs_log_component_config_t *comp_conf,
-                              const char *message, va_list ap)
+test_base::common_logger(ucs_log_level_t log_level_to_handle, bool print,
+                         std::vector<std::string> &messages_vec, size_t limit,
+                         const char *file, unsigned line, const char *function,
+                         ucs_log_level_t level,
+                         const ucs_log_component_config_t *comp_conf,
+                         const char *message, va_list ap)
 {
-    if (level == UCS_LOG_LEVEL_ERROR) {
-        pthread_mutex_lock(&m_logger_mutex);
-        va_list ap2;
-        va_copy(ap2, ap);
-        m_errors.push_back(format_message(message, ap2));
-        va_end(ap2);
-        level = UCS_LOG_LEVEL_DEBUG;
-        pthread_mutex_unlock(&m_logger_mutex);
+    if (level != log_level_to_handle) {
+        return UCS_LOG_FUNC_RC_CONTINUE;
+    }
+
+    // dump the formatted message to a stringstream
+    va_list ap2;
+    va_copy(ap2, ap);
+    std::istringstream iss(format_message(message, ap2));
+    va_end(ap2);
+
+    // save each line of the message to messages_vec, and print it if requested
+    pthread_mutex_lock(&m_logger_mutex);
+    std::string message_line;
+    while (getline(iss, message_line, '\n')) {
+        push_debug_message_with_limit(messages_vec, message_line, limit);
+        if (print) {
+            UCS_TEST_MESSAGE << "< " << message_line << " >";
+        }
+    }
+    pthread_mutex_unlock(&m_logger_mutex);
+
+    // if the message was not printed, pass it to default handler in debug level
+    if (!print) {
+        ucs_log_default_handler(file, line, function, UCS_LOG_LEVEL_DEBUG,
+                                comp_conf, message, ap);
     }
 
-    ucs_log_default_handler(file, line, function, level,
-                            &ucs_global_opts.log_component, message, ap);
     return UCS_LOG_FUNC_RC_STOP;
 }
 
 ucs_log_func_rc_t
-test_base::hide_warns_logger(const char *file, unsigned line, const char *function,
-                             ucs_log_level_t level,
+test_base::hide_errors_logger(const char *file, unsigned line,
+                              const char *function, ucs_log_level_t level,
+                              const ucs_log_component_config_t *comp_conf,
+                              const char *message, va_list ap)
+{
+    return common_logger(UCS_LOG_LEVEL_ERROR, false, m_errors,
+                         std::numeric_limits<size_t>::max(), file, line,
+                         function, level, comp_conf, message, ap);
+}
+
+ucs_log_func_rc_t
+test_base::hide_warns_logger(const char *file, unsigned line,
+                             const char *function, ucs_log_level_t level,
                              const ucs_log_component_config_t *comp_conf,
                              const char *message, va_list ap)
 {
-    if (level == UCS_LOG_LEVEL_WARN) {
-        pthread_mutex_lock(&m_logger_mutex);
-        va_list ap2;
-        va_copy(ap2, ap);
-        m_warnings.push_back(format_message(message, ap2));
-        va_end(ap2);
-        level = UCS_LOG_LEVEL_DEBUG;
-        pthread_mutex_unlock(&m_logger_mutex);
-    }
-
-    ucs_log_default_handler(file, line, function, level,
-                            &ucs_global_opts.log_component, message, ap);
-    return UCS_LOG_FUNC_RC_STOP;
+    return common_logger(UCS_LOG_LEVEL_WARN, false, m_warnings,
+                         std::numeric_limits<size_t>::max(), file, line,
+                         function, level, comp_conf, message, ap);
 }
 
 ucs_log_func_rc_t
-test_base::wrap_errors_logger(const char *file, unsigned line, const char *function,
-                              ucs_log_level_t level,
+test_base::wrap_errors_logger(const char *file, unsigned line,
+                              const char *function, ucs_log_level_t level,
                               const ucs_log_component_config_t *comp_conf,
                               const char *message, va_list ap)
 {
-    /* Ignore warnings about empty memory pool */
-    if (level == UCS_LOG_LEVEL_ERROR) {
-        pthread_mutex_lock(&m_logger_mutex);
-        std::istringstream iss(format_message(message, ap));
-        std::string text;
-        while (getline(iss, text, '\n')) {
-            push_debug_message_with_limit(m_errors, text, 1000);
-            UCS_TEST_MESSAGE << "< " << text << " >";
-        }
-        pthread_mutex_unlock(&m_logger_mutex);
-        return UCS_LOG_FUNC_RC_STOP;
-    }
+    return common_logger(UCS_LOG_LEVEL_ERROR, true, m_errors, 1000, file, line,
+                         function, level, comp_conf, message, ap);
+}
 
-    return UCS_LOG_FUNC_RC_CONTINUE;
+ucs_log_func_rc_t
+test_base::wrap_warns_logger(const char *file, unsigned line,
+                             const char *function, ucs_log_level_t level,
+                             const ucs_log_component_config_t *comp_conf,
+                             const char *message, va_list ap)
+{
+    return common_logger(UCS_LOG_LEVEL_WARN, true, m_warnings, 1000, file, line,
+                         function, level, comp_conf, message, ap);
 }
 
+
 unsigned test_base::num_errors()
 {
     return m_total_errors - m_num_errors_before;
@@ -288,7 +306,7 @@ void test_base::SetUpProxy() {
     m_errors.clear();
     m_warnings.clear();
     m_first_warns_and_errors.clear();
-    m_num_log_handlers_before = ucs_log_num_handlers();
+    m_num_log_handlers_before  = ucs_log_num_handlers();
     ucs_log_push_handler(count_warns_logger);
 
     try {
@@ -317,7 +335,10 @@ void test_base::TearDownProxy() {
 
     m_errors.clear();
 
-    ucs_log_pop_handler();
+    ucs_assert(ucs_log_get_current_indent() == 0);
+    if (ucs_log_num_handlers() > m_num_log_handlers_before) {
+        ucs_log_pop_handler();
+    }
 
     unsigned num_not_removed = ucs_log_num_handlers() - m_num_log_handlers_before;
     if (num_not_removed != 0) {
@@ -426,9 +447,11 @@ static void clear_dontcopy_regions_vma_cb(ucs_sys_vma_info_t *info, void *ctx) {
     if (info->flags & UCS_SYS_VMA_FLAG_DONTCOPY) {
         ret = madvise((void*)info->start, info->end - info->start,
                       MADV_DOFORK);
-        EXPECT_EQ(0, ret) << "errno: " << errno
-                          << std::hex << " 0x" << info->start
-                          << "-0x" << info->end;
+        if (ret != 0) {
+            UCS_TEST_MESSAGE << "madvise(DOFORK) failed, errno: " << errno
+                             << std::hex << " 0x" << info->start
+                             << "-0x" << info->end;
+        }
     }
 }
diff --git a/test/gtest/common/test.h b/test/gtest/common/test.h
index d974907e822..f91bd15277b 100644
--- a/test/gtest/common/test.h
+++ b/test/gtest/common/test.h
@@ -108,6 +108,12 @@ class test_base {
                       const ucs_log_component_config_t *comp_conf,
                       const char *message, va_list ap);
 
+    static ucs_log_func_rc_t
+    wrap_warns_logger(const char *file, unsigned line, const char *function,
+                      ucs_log_level_t
level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap); + unsigned num_errors(); unsigned num_warnings(); @@ -136,6 +142,14 @@ class test_base { const std::string& message, const size_t limit); + static ucs_log_func_rc_t + common_logger(ucs_log_level_t log_level_to_handle, bool print, + std::vector &messages_vec, size_t limit, + const char *file, unsigned line, const char *function, + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap); + static void *thread_func(void *arg); pthread_barrier_t m_barrier; diff --git a/test/gtest/common/test_helpers.cc b/test/gtest/common/test_helpers.cc index 69c2a8aa69b..30d3d256d36 100644 --- a/test/gtest/common/test_helpers.cc +++ b/test/gtest/common/test_helpers.cc @@ -6,9 +6,11 @@ #include "test_helpers.h" +#include #include #include #include +#include #include #include @@ -24,7 +26,7 @@ typedef std::pair test_result_t; const double test_timeout_in_sec = 60.; -const double watchdog_timeout_default = 900.; // 15 minutes +double watchdog_timeout = 900.; // 15 minutes static test_watchdog_t watchdog; @@ -76,7 +78,7 @@ void *watchdog_func(void *arg) watchdog.state = WATCHDOG_DEFAULT_SET; break; case WATCHDOG_DEFAULT_SET: - watchdog.timeout = watchdog_timeout_default; + watchdog.timeout = watchdog_timeout; watchdog.state = WATCHDOG_RUN; watchdog.kill_signal = SIGABRT; break; @@ -114,7 +116,7 @@ void watchdog_set(test_watchdog_state_t new_state, double new_timeout) void watchdog_set(test_watchdog_state_t new_state) { - watchdog_set(new_state, watchdog_timeout_default); + watchdog_set(new_state, watchdog_timeout); } void watchdog_set(double new_timeout) @@ -171,7 +173,7 @@ int watchdog_start() pthread_mutex_lock(&watchdog.mutex); watchdog.state = WATCHDOG_RUN; - watchdog.timeout = watchdog_timeout_default; + watchdog.timeout = watchdog_timeout; watchdog.kill_signal = SIGABRT; watchdog.watched_thread = pthread_self(); pthread_mutex_unlock(&watchdog.mutex); @@ -586,27 +588,35 @@ std::string exit_status_info(int exit_status) return ss.str().substr(2, std::string::npos); } -sock_addr_storage::sock_addr_storage() : m_size(0), m_is_valid(false) { +sock_addr_storage::sock_addr_storage(bool is_rdmacm_netdev) : + m_size(0), m_is_valid(false), m_is_rdmacm_netdev(is_rdmacm_netdev) +{ memset(&m_storage, 0, sizeof(m_storage)); } -sock_addr_storage::sock_addr_storage(const ucs_sock_addr_t &ucs_sock_addr) { +sock_addr_storage::sock_addr_storage(const ucs_sock_addr_t &ucs_sock_addr, + bool is_rdmacm_netdev) +{ if (sizeof(m_storage) < ucs_sock_addr.addrlen) { memset(&m_storage, 0, sizeof(m_storage)); - m_size = 0; - m_is_valid = false; + m_size = 0; + m_is_valid = false; + m_is_rdmacm_netdev = false; } else { - set_sock_addr(*ucs_sock_addr.addr, ucs_sock_addr.addrlen); + set_sock_addr(*ucs_sock_addr.addr, ucs_sock_addr.addrlen, + is_rdmacm_netdev); } } void sock_addr_storage::set_sock_addr(const struct sockaddr &addr, - const size_t size) { + const size_t size, bool is_rdmacm_netdev) +{ ASSERT_GE(sizeof(m_storage), size); ASSERT_TRUE(ucs::is_inet_addr(&addr)); memcpy(&m_storage, &addr, size); - m_size = size; - m_is_valid = true; + m_size = size; + m_is_valid = true; + m_is_rdmacm_netdev = is_rdmacm_netdev; } void sock_addr_storage::reset_to_any() { @@ -664,6 +674,11 @@ uint16_t sock_addr_storage::get_port() const { } } +bool sock_addr_storage::is_rdmacm_netdev() const +{ + return m_is_rdmacm_netdev; +} + size_t sock_addr_storage::get_addr_size() const { return m_size; } @@ -685,6 +700,18 @@ 
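+// Usage sketch for the accessor added below (hypothetical caller, not part
+// of this change): the raw in_addr/in6_addr bytes it returns are exactly
+// what inet_ntop() expects:
+//   char buf[INET6_ADDRSTRLEN];
+//   inet_ntop(saddr.get_sock_addr_ptr()->sa_family,
+//             saddr.get_sock_addr_in_buf(), buf, sizeof(buf));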
const struct sockaddr* sock_addr_storage::get_sock_addr_ptr() const { return m_is_valid ? (struct sockaddr *)(&m_storage) : NULL; } +const void* sock_addr_storage::get_sock_addr_in_buf() const { + const struct sockaddr* saddr = get_sock_addr_ptr(); + + ucs_assert_always(saddr != NULL); + ucs_assert_always((saddr->sa_family == AF_INET) || + (saddr->sa_family == AF_INET6)); + + return (saddr->sa_family == AF_INET) ? + (const void*)&((struct sockaddr_in*)saddr)->sin_addr : + (const void*)&((struct sockaddr_in6*)saddr)->sin6_addr; +} + std::ostream& operator<<(std::ostream& os, const sock_addr_storage& sa_storage) { return os << ucs::sockaddr_to_str(sa_storage.get_sock_addr_ptr()); @@ -705,6 +732,17 @@ void* auto_buffer::operator*() const { return m_ptr; }; +scoped_log_level::scoped_log_level(ucs_log_level_t level) + : m_prev_level(ucs_global_opts.log_component.log_level) +{ + ucs_global_opts.log_component.log_level = level; +} + +scoped_log_level::~scoped_log_level() +{ + ucs_global_opts.log_component.log_level = m_prev_level; +} + namespace detail { message_stream::message_stream(const std::string& title) { @@ -723,6 +761,27 @@ message_stream::~message_stream() { } // detail +scoped_async_lock::scoped_async_lock(ucs_async_context_t &async) : + m_async(async) +{ + UCS_ASYNC_BLOCK(&m_async); +} + +scoped_async_lock::~scoped_async_lock() +{ + UCS_ASYNC_UNBLOCK(&m_async); +} + +scoped_mutex_lock::scoped_mutex_lock(pthread_mutex_t &mutex) : m_mutex(mutex) +{ + pthread_mutex_lock(&m_mutex); +} + +scoped_mutex_lock::~scoped_mutex_lock() +{ + pthread_mutex_unlock(&m_mutex); +} + std::vector > supported_mem_type_pairs() { static std::vector > result; diff --git a/test/gtest/common/test_helpers.h b/test/gtest/common/test_helpers.h index 4e157d1cc31..3ad239ee56c 100644 --- a/test/gtest/common/test_helpers.h +++ b/test/gtest/common/test_helpers.h @@ -12,6 +12,7 @@ #include +#include #include #include #include @@ -176,8 +177,7 @@ namespace ucs { extern const double test_timeout_in_sec; -extern const double watchdog_timeout_default; - +extern double watchdog_timeout; extern std::set< const ::testing::TestInfo*> skipped_tests; typedef enum { @@ -266,7 +266,7 @@ ucs_time_t get_deadline(double timeout_in_sec = test_timeout_in_sec); */ int max_tcp_connections(); - + /** * Signal-safe sleep. 
*/ @@ -325,11 +325,13 @@ std::string sockaddr_to_str(const S *saddr) { */ class sock_addr_storage { public: - sock_addr_storage(); + sock_addr_storage(bool is_rdmacm_netdev = false); - sock_addr_storage(const ucs_sock_addr_t &ucs_sock_addr); + sock_addr_storage(const ucs_sock_addr_t &ucs_sock_addr, + bool is_rdmacm_netdev = false); - void set_sock_addr(const struct sockaddr &addr, const size_t size); + void set_sock_addr(const struct sockaddr &addr, const size_t size, + bool is_rdmacm_netdev = false); void reset_to_any(); @@ -339,6 +341,8 @@ class sock_addr_storage { uint16_t get_port() const; + bool is_rdmacm_netdev() const; + size_t get_addr_size() const; ucs_sock_addr_t to_ucs_sock_addr() const; @@ -347,10 +351,13 @@ class sock_addr_storage { const struct sockaddr* get_sock_addr_ptr() const; + const void* get_sock_addr_in_buf() const; + private: struct sockaddr_storage m_storage; size_t m_size; bool m_is_valid; + bool m_is_rdmacm_netdev; }; @@ -799,6 +806,17 @@ static void deleter(T *ptr) { delete ptr; } + +class scoped_log_level { +public: + scoped_log_level(ucs_log_level_t level); + ~scoped_log_level(); + +private: + const ucs_log_level_t m_prev_level; +}; + + extern int perf_retry_count; extern double perf_retry_interval; @@ -847,6 +865,29 @@ class message_stream { } // detail + +class scoped_async_lock { +public: + scoped_async_lock(ucs_async_context_t &async); + + ~scoped_async_lock(); + +private: + ucs_async_context_t &m_async; +}; + + +class scoped_mutex_lock { +public: + scoped_mutex_lock(pthread_mutex_t &mutex); + + ~scoped_mutex_lock(); + +private: + pthread_mutex_t &m_mutex; +}; + + /** * N-ary Cartesian product over the N vectors provided in the input vector * The cardinality of the result vector: diff --git a/test/gtest/common/test_obj_size.cc b/test/gtest/common/test_obj_size.cc index 9c3ddacd920..60d0ef5cc5e 100644 --- a/test/gtest/common/test_obj_size.cc +++ b/test/gtest/common/test_obj_size.cc @@ -30,6 +30,9 @@ extern "C" { # include # include #endif +#if HAVE_CUDA +# include +#endif } class test_obj_size : public ucs::test { @@ -48,7 +51,7 @@ UCS_TEST_F(test_obj_size, size) { #else EXPECTED_SIZE(ucp_ep_t, 64); /* TODO reduce request size to 240 or less after removing old protocols state */ - EXPECTED_SIZE(ucp_request_t, 296); + EXPECTED_SIZE(ucp_request_t, 272); EXPECTED_SIZE(ucp_recv_desc_t, 48); EXPECTED_SIZE(uct_ep_t, 8); EXPECTED_SIZE(uct_base_ep_t, 8); @@ -57,7 +60,7 @@ UCS_TEST_F(test_obj_size, size) { EXPECTED_SIZE(uct_tcp_ep_t, 160); # if HAVE_TL_RC EXPECTED_SIZE(uct_rc_ep_t, 64); - EXPECTED_SIZE(uct_rc_verbs_ep_t, 96); + EXPECTED_SIZE(uct_rc_verbs_ep_t, 80); # endif # if HAVE_TL_DC EXPECTED_SIZE(uct_dc_mlx5_ep_t, 32); @@ -66,6 +69,9 @@ UCS_TEST_F(test_obj_size, size) { EXPECTED_SIZE(uct_ud_ep_t, 248); EXPECTED_SIZE(uct_ud_verbs_ep_t, 264); # endif +# if HAVE_CUDA + EXPECTED_SIZE(uct_cuda_ipc_ep_t, 24); +# endif #endif } diff --git a/test/gtest/common/test_perf.cc b/test/gtest/common/test_perf.cc index 0000e1a3e54..1a5c7fc8642 100644 --- a/test/gtest/common/test_perf.cc +++ b/test/gtest/common/test_perf.cc @@ -3,7 +3,7 @@ * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. * Copyright (C) The University of Tennessee and The University * of Tennessee Research Foundation. 2015. ALL RIGHTS RESERVED. -* Copyright (C) ARM Ltd. 2016. ALL RIGHTS RESERVED. +* Copyright (C) ARM Ltd. 2016-2020. ALL RIGHTS RESERVED. * See file LICENSE for terms. 
*/ @@ -18,6 +18,11 @@ extern "C" { #include +#define UCP_ARM_PERF_TEST_MULTIPLIER 2 +#define UCT_ARM_PERF_TEST_MULTIPLIER 15 +#define UCT_PERF_TEST_MULTIPLIER 5 + + test_perf::rte_comm::rte_comm() { pthread_mutex_init(&m_mutex, NULL); } @@ -131,6 +136,7 @@ std::vector test_perf::get_affinity() { cpu_set_t affinity; int ret, nr_cpus; + CPU_ZERO(&affinity); ret = sched_getaffinity(getpid(), sizeof(affinity), &affinity); if (ret != 0) { ucs_error("Failed to get CPU affinity: %m"); @@ -163,8 +169,10 @@ void test_perf::set_affinity(int cpu) void* test_perf::thread_func(void *arg) { thread_arg *a = (thread_arg*)arg; + rte *r = reinterpret_cast(a->params.rte_group); test_result *result; + ucs_log_set_thread_name("perf-%d", r->index()); set_affinity(a->cpu); result = new test_result(); result->status = ucx_perf_run(&a->params, &result->result); @@ -180,19 +188,19 @@ test_perf::test_result test_perf::run_multi_threaded(const test_spec &test, unsi ucx_perf_params_t params; memset(¶ms, 0, sizeof(params)); - params.api = test.api; + params.api = test.api; params.command = test.command; params.test_type = test.test_type; params.thread_mode = UCS_THREAD_MODE_SINGLE; params.async_mode = UCS_ASYNC_THREAD_LOCK_TYPE; params.thread_count = 1; - params.wait_mode = UCX_PERF_WAIT_MODE_LAST; + params.wait_mode = test.wait_mode; params.flags = test.test_flags | flags; - params.am_hdr_size = 8; + params.uct.am_hdr_size = 8; params.alignment = ucs_get_page_size(); params.max_outstanding = test.max_outstanding; if (ucs::test_time_multiplier() == 1) { - params.warmup_iter = test.iters / 10; + params.warmup_iter = ucs_max(1, test.iters / 100); params.max_iter = test.iters; } else { params.warmup_iter = 0; @@ -252,8 +260,8 @@ test_perf::test_result test_perf::run_multi_threaded(const test_spec &test, unsi return result; } -void test_perf::run_test(const test_spec& test, unsigned flags, bool check_perf, - const std::string &tl_name, const std::string &dev_name) +double test_perf::run_test(const test_spec& test, unsigned flags, bool check_perf, + const std::string &tl_name, const std::string &dev_name) { std::vector cpus = get_affinity(); if (cpus.size() < 2) { @@ -271,7 +279,7 @@ void test_perf::run_test(const test_spec& test, unsigned flags, bool check_perf, if ((result.status == UCS_ERR_UNSUPPORTED) || (result.status == UCS_ERR_UNREACHABLE)) { - return; /* Skipped */ + return 0.0; /* Skipped */ } ASSERT_UCS_OK(result.status); @@ -292,15 +300,16 @@ void test_perf::run_test(const test_spec& test, unsigned flags, bool check_perf, } if (!check_perf) { - return; /* Skip */ + return value; /* Skip */ } else if ((value >= test.min) && (value <= test.max)) { - return; /* Success */ + return value; /* Success */ } else { ucs::safe_sleep(ucs::perf_retry_interval); } } - ADD_FAILURE() << "Invalid " << test.title << " performance, expected: " << - std::setprecision(3) << test.min << ".." << test.max; -} + ADD_FAILURE() << "Invalid " << test.title << " performance, expected: " + << std::setprecision(3) << test.min << ".." << test.max; + return 0.0; +} diff --git a/test/gtest/common/test_perf.h b/test/gtest/common/test_perf.h index 3b1a836a0c9..28e0d545237 100644 --- a/test/gtest/common/test_perf.h +++ b/test/gtest/common/test_perf.h @@ -1,5 +1,6 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. +* Copyright (C) ARM Ltd. 2020. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -19,6 +20,7 @@ class test_perf { ucx_perf_api_t api; ucx_perf_cmd_t command; ucx_perf_test_type_t test_type; + ucx_perf_wait_mode_t wait_mode; int data_layout; size_t msg_stride; size_t msglencnt; @@ -34,8 +36,8 @@ class test_perf { static std::vector get_affinity(); - void run_test(const test_spec& test, unsigned flags, bool check_perf, - const std::string &tl_name, const std::string &dev_name); + double run_test(const test_spec& test, unsigned flags, bool check_perf, const + std::string &tl_name, const std::string &dev_name); private: class rte_comm { diff --git a/test/gtest/common/test_watchdog.cc b/test/gtest/common/test_watchdog.cc index 9b7d6e27884..291806181bc 100644 --- a/test/gtest/common/test_watchdog.cc +++ b/test/gtest/common/test_watchdog.cc @@ -16,20 +16,20 @@ class test_watchdog : public ucs::test { ucs::watchdog_signal(); // all have to be set to their default values EXPECT_EQ(ucs::WATCHDOG_RUN, ucs::watchdog_get_state()); - EXPECT_EQ(ucs::watchdog_timeout_default, ucs::watchdog_get_timeout()); + EXPECT_EQ(ucs::watchdog_timeout, ucs::watchdog_get_timeout()); } }; UCS_TEST_F(test_watchdog, watchdog_set) { EXPECT_EQ(ucs::WATCHDOG_RUN, ucs::watchdog_get_state()); - EXPECT_EQ(ucs::watchdog_timeout_default, ucs::watchdog_get_timeout()); + EXPECT_EQ(ucs::watchdog_timeout, ucs::watchdog_get_timeout()); EXPECT_EQ(SIGABRT, ucs::watchdog_get_kill_signal()); ucs::watchdog_set(ucs::WATCHDOG_TEST); // when the test state is applied, the watchdog // changes state to WATCHDOG_DEFAULT_SET EXPECT_EQ(ucs::WATCHDOG_DEFAULT_SET, ucs::watchdog_get_state()); - EXPECT_EQ(ucs::watchdog_timeout_default, ucs::watchdog_get_timeout()); + EXPECT_EQ(ucs::watchdog_timeout, ucs::watchdog_get_timeout()); EXPECT_EQ(SIGTERM, ucs::watchdog_get_kill_signal()); reset_to_default(); @@ -54,12 +54,12 @@ UCS_TEST_F(test_watchdog, watchdog_set) { // when the timeout and the timeout applied, the watchdog // changes state to WATCHDOG_DEFAULT_SET EXPECT_EQ(ucs::WATCHDOG_RUN, ucs::watchdog_get_state()); - EXPECT_EQ(ucs::watchdog_timeout_default, ucs::watchdog_get_timeout()); + EXPECT_EQ(ucs::watchdog_timeout, ucs::watchdog_get_timeout()); EXPECT_EQ(SIGABRT, ucs::watchdog_get_kill_signal()); ucs::watchdog_set(ucs::WATCHDOG_DEFAULT_SET); EXPECT_EQ(ucs::WATCHDOG_RUN, ucs::watchdog_get_state()); - EXPECT_EQ(ucs::watchdog_timeout_default, ucs::watchdog_get_timeout()); + EXPECT_EQ(ucs::watchdog_timeout, ucs::watchdog_get_timeout()); EXPECT_EQ(SIGABRT, ucs::watchdog_get_kill_signal()); } diff --git a/test/gtest/configure.m4 b/test/gtest/configure.m4 index aff6c1de23b..0d1cf75f051 100644 --- a/test/gtest/configure.m4 +++ b/test/gtest/configure.m4 @@ -11,6 +11,12 @@ CHECK_COMPILER_FLAG([-fno-tree-vectorize], [-fno-tree-vectorize], [GTEST_CXXFLAGS="$GTEST_CXXFLAGS -fno-tree-vectorize"], []) +# error #186: pointless comparison of unsigned integer with zero +CHECK_COMPILER_FLAG([--diag_suppress 186], [--diag_suppress 186], + [AC_LANG_SOURCE([[int main(int argc, char** argv){return 0;}]])], + [GTEST_CXXFLAGS="$GTEST_CXXFLAGS --diag_suppress 186"], + []) + # error #236: controlling expression is constant CHECK_COMPILER_FLAG([--diag_suppress 236], [--diag_suppress 236], [AC_LANG_SOURCE([[int main(int argc, char** argv){return 0;}]])], diff --git a/test/gtest/ucm/malloc_hook.cc b/test/gtest/ucm/malloc_hook.cc index 7e0e56a69b2..5ee6efd65ac 100644 --- a/test/gtest/ucm/malloc_hook.cc +++ b/test/gtest/ucm/malloc_hook.cc @@ -22,6 +22,7 @@ extern "C" { #include #include #include +#include #include #ifdef HAVE_MALLOC_H #include @@ 
-207,8 +208,11 @@ class malloc_hook : public ucs::test {
     bistro_patch(const char* symbol, void *hook)
     {
         ucs_status_t status;
-
-        status = ucm_bistro_patch(symbol, hook, &m_rp);
+        void *func_ptr = ucm_reloc_get_orig(symbol, hook);
+        if (func_ptr == NULL) {
+            UCS_TEST_ABORT("could not find " << symbol);
+        }
+        status = ucm_bistro_patch(func_ptr, hook, symbol, NULL, &m_rp);
         ASSERT_UCS_OK(status);
         EXPECT_NE((intptr_t)m_rp, 0);
     }
@@ -218,6 +222,11 @@ class malloc_hook : public ucs::test {
         ucm_bistro_restore(m_rp);
     }
 
+    ucm_bistro_restore_point_t* rp()
+    {
+        return m_rp;
+    }
+
 protected:
     ucm_bistro_restore_point_t *m_rp;
 };
@@ -1034,8 +1043,6 @@ typedef int (munmap_f_t)(void *addr, size_t len);
 UCS_TEST_SKIP_COND_F(malloc_hook, bistro_patch, RUNNING_ON_VALGRIND) {
     const char *symbol = "munmap";
-    ucm_bistro_restore_point_t *rp = NULL;
-    ucs_status_t status;
     munmap_f_t *munmap_f;
     void *ptr;
     int res;
@@ -1043,30 +1050,27 @@ UCS_TEST_SKIP_COND_F(malloc_hook, bistro_patch, RUNNING_ON_VALGRIND) {
     uint64_t UCS_V_UNUSED origin;
 
     /* set hook to mmap call */
-    status = ucm_bistro_patch(symbol, (void*)bistro_hook<0>::munmap, &rp);
-    ASSERT_UCS_OK(status);
-    EXPECT_NE((intptr_t)rp, 0);
+    {
+        bistro_patch patch(symbol, (void*)bistro_hook<0>::munmap);
 
-    munmap_f = (munmap_f_t*)ucm_bistro_restore_addr(rp);
-    EXPECT_NE((intptr_t)munmap_f, 0);
+        munmap_f = (munmap_f_t*)ucm_bistro_restore_addr(patch.rp());
+        EXPECT_NE((intptr_t)munmap_f, 0);
 
-    /* save partial body of patched function */
-    patched = *(uint64_t*)munmap_f;
+        /* save partial body of patched function */
+        patched = *(uint64_t*)munmap_f;
 
-    bistro_call_counter = 0;
-    ptr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
-    EXPECT_NE(ptr, MAP_FAILED);
-
-    /* try to call munmap, we should jump into munmap_hook instead */
-    res = munmap_f(ptr, 4096);
-    EXPECT_EQ(res, 0);
-    /* due to cache coherency issues on ARM systems could be executed
-     * original function body, so, skip counter evaluation */
-    EXPECT_GT(bistro_call_counter, 0);
+        bistro_call_counter = 0;
+        ptr = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+        EXPECT_NE(ptr, MAP_FAILED);
 
-    /* restore original mmap body */
-    status = ucm_bistro_restore(rp);
-    ASSERT_UCS_OK(status);
+        /* try to call munmap, we should jump into munmap_hook instead */
+        res = munmap_f(ptr, 4096);
+        EXPECT_EQ(res, 0);
+        /* due to cache coherency issues on ARM systems could be executed
+         * original function body, so, skip counter evaluation */
+        EXPECT_GT(bistro_call_counter, 0);
+    }
 
     bistro_call_counter = 0;
     /* now try to call mmap, we should NOT jump into mmap_hook */
@@ -1201,6 +1205,80 @@ UCS_TEST_SKIP_COND_F(malloc_hook, test_event_unmap,
     EXPECT_TRUE(status == UCS_OK);
 }
 
+class memtype_hooks : public ucs::test_with_param<ucs_memory_type_t> {
+public:
+    void mem_event(ucm_event_type_t event_type, ucm_event_t *event)
+    {
+        m_events.push_back(event_t(event_type, *event));
+    }
+
+protected:
+    typedef std::pair<ucm_event_type_t, ucm_event_t> event_t;
+
+    bool is_event_fired(ucm_event_type_t event_type, void *address, size_t size)
+    {
+        for (size_t i = 0; i < m_events.size(); ++i) {
+            if (event_type != m_events[i].first) {
+                continue;
+            }
+
+            if ((event_type == UCM_EVENT_MEM_TYPE_ALLOC) &&
+                (m_events[i].second.mem_type.address == address) &&
+                (m_events[i].second.mem_type.mem_type == mem_type()) &&
+                (m_events[i].second.mem_type.size == size)) {
+                return true;
+            }
+
+            if ((event_type == UCM_EVENT_MEM_TYPE_FREE) &&
+                (m_events[i].second.mem_type.address == address) &&
+                (m_events[i].second.mem_type.size == size)) {
class malloc_hook_dlopen : public malloc_hook { protected: class library { diff --git a/test/gtest/ucp/test_ucp_am.cc b/test/gtest/ucp/test_ucp_am.cc index 6e6fd649b20..2b46b8525e8 100644 --- a/test/gtest/ucp/test_ucp_am.cc +++ b/test/gtest/ucp/test_ucp_am.cc @@ -1,5 +1,5 @@ /** - * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED. + * Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. * Copyright (c) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. * Copyright (C) Los Alamos National Security, LLC. 2018. ALL RIGHTS RESERVED. * @@ -11,6 +11,7 @@ #include #include +#include #include "ucp_datatype.h" #include "ucp_test.h" @@ -254,7 +255,7 @@ void test_ucp_am::do_set_am_handler_realloc_test() do_send_process_data_test(0, UCP_SEND_ID + 1, 0); } -UCS_TEST_P(test_ucp_am, send_process_am, "RNDV_THRESH=-1") +UCS_TEST_P(test_ucp_am, send_process_am) { set_handlers(UCP_SEND_ID); do_send_process_data_test(0, UCP_SEND_ID, 0); @@ -263,13 +264,22 @@ UCS_TEST_P(test_ucp_am, send_process_am, "RNDV_THRESH=-1") do_send_process_data_test(0, UCP_SEND_ID, UCP_AM_SEND_REPLY); } -UCS_TEST_P(test_ucp_am, send_process_am_release, "RNDV_THRESH=-1") +UCS_TEST_P(test_ucp_am, send_process_am_rndv, "RNDV_THRESH=1") +{ + set_handlers(UCP_SEND_ID); + do_send_process_data_test(0, UCP_SEND_ID, 0); + + set_reply_handlers(); + do_send_process_data_test(0, UCP_SEND_ID, UCP_AM_SEND_REPLY); +} + +UCS_TEST_P(test_ucp_am, send_process_am_release) { set_handlers(UCP_SEND_ID); do_send_process_data_test(UCP_RELEASE, 0, 0); } -UCS_TEST_P(test_ucp_am, send_process_iov_am, "RNDV_THRESH=-1") +UCS_TEST_P(test_ucp_am, send_process_iov_am) { ucs::detail::message_stream ms("INFO"); @@ -285,7 +295,7 @@ UCS_TEST_P(test_ucp_am, send_process_iov_am, "RNDV_THRESH=-1") } } -UCS_TEST_P(test_ucp_am, set_am_handler_realloc, "RNDV_THRESH=-1") +UCS_TEST_P(test_ucp_am, set_am_handler_realloc) { do_set_am_handler_realloc_test(); } @@ -295,12 +305,18 @@ UCP_INSTANTIATE_TEST_CASE(test_ucp_am) class test_ucp_am_nbx : public test_ucp_am_base { public: + static const uint64_t SEED = 0x1111111111111111lu; + test_ucp_am_nbx() { m_dt = ucp_dt_make_contig(1); m_am_received = false; + m_rx_dt = ucp_dt_make_contig(1); + m_rx_memtype = UCS_MEMORY_TYPE_HOST; + m_rx_buf = NULL; } +protected: size_t max_am_hdr() { ucp_worker_attr_t attr; @@ -337,8 +353,8 @@ class test_ucp_am_nbx : public test_ucp_am_base { } } - void set_am_data_handler(entity &e, uint16_t am_id, - ucp_am_recv_callback_t cb, void *arg)
+ void set_am_data_handler(entity &e, uint16_t am_id, ucp_am_recv_callback_t cb, + void *arg, unsigned flags = 0) { ucp_am_handler_param_t param; @@ -349,9 +365,21 @@ class test_ucp_am_nbx : public test_ucp_am_base { param.id = am_id; param.cb = cb; param.arg = arg; + + if (flags != 0) { + param.field_mask |= UCP_AM_HANDLER_PARAM_FIELD_FLAGS; + param.flags = flags; + } + ASSERT_UCS_OK(ucp_worker_set_am_recv_handler(e.worker(), &param)); } + void check_header(const void *header, size_t header_length) + { + std::string check_pattern((char*)header, header_length); + EXPECT_EQ(check_pattern, m_hdr); + } + ucs_status_ptr_t send_am(const ucp::data_type_desc_t& dt_desc, unsigned flags = 0, const void *hdr = NULL, unsigned hdr_length = 0) @@ -372,50 +400,61 @@ } void test_am_send_recv(size_t size, size_t header_size = 0ul, - unsigned flags = 0, bool hold_desc = false) + unsigned flags = 0, + ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_HOST, + unsigned data_cb_flags = 0) { - std::string sbuf(size, 'd'); - std::string hbuf(header_size, 'h'); + mem_buffer sbuf(size, mem_type); + mem_buffer::pattern_fill(sbuf.ptr(), size, SEED, mem_type); + m_hdr.resize(header_size); + ucs::fill_random(m_hdr); m_am_received = false; - set_am_data_handler(receiver(), TEST_AM_NBX_ID, am_data_cb, this); + set_am_data_handler(receiver(), TEST_AM_NBX_ID, am_data_cb, this, + data_cb_flags); - ucp::data_type_desc_t sdt_desc(m_dt, &sbuf[0], size); + ucp::data_type_desc_t sdt_desc(m_dt, sbuf.ptr(), size); - ucs_status_ptr_t sptr = send_am(sdt_desc, get_send_flag(), - hbuf.c_str(), header_size); + ucs_status_ptr_t sptr = send_am(sdt_desc, get_send_flag() | flags, + m_hdr.data(), m_hdr.size()); wait_for_flag(&m_am_received); request_wait(sptr); EXPECT_TRUE(m_am_received); } - void test_am(size_t size) + void test_am(size_t size, unsigned flags = 0) { size_t small_hdr_size = 8; - test_am_send_recv(size, small_hdr_size); - test_am_send_recv(size, 0); + test_am_send_recv(size, 0, flags); + test_am_send_recv(size, small_hdr_size, flags); if (max_am_hdr() > small_hdr_size) { - test_am_send_recv(size, max_am_hdr()); + test_am_send_recv(size, max_am_hdr(), flags); } } + void test_short_thresh(size_t max_short) + { + ucp_ep_config_t *ep_cfg = ucp_ep_config(sender().ep()); + + EXPECT_LE(max_short, ep_cfg->rndv.am_thresh.remote); + EXPECT_LE(max_short, ep_cfg->rndv.am_thresh.local); + EXPECT_LE(max_short, ep_cfg->rndv.rma_thresh.remote); + EXPECT_LE(max_short, ep_cfg->rndv.rma_thresh.local); + } + virtual ucs_status_t am_data_handler(const void *header, size_t header_length, void *data, size_t length, const ucp_am_recv_param_t *rx_param) { + ucs_status_t status; + EXPECT_FALSE(m_am_received); - EXPECT_EQ(std::string::npos, - std::string((const char*)data, length).find_first_not_of('d')); - if (header_length != 0) { - EXPECT_EQ(std::string::npos, - std::string((const char*)header, - header_length).find_first_not_of('h')); - } + check_header(header, header_length); bool has_reply_ep = get_send_flag(); @@ -423,11 +462,55 @@ class test_ucp_am_nbx : public test_ucp_am_base { UCP_AM_RECV_ATTR_FIELD_REPLY_EP); EXPECT_EQ(has_reply_ep, rx_param->reply_ep != NULL); - EXPECT_FALSE(rx_param->recv_attr & UCP_AM_RECV_ATTR_FLAG_RNDV); + if (!(rx_param->recv_attr & + (UCP_AM_RECV_ATTR_FLAG_RNDV | UCP_AM_RECV_ATTR_FLAG_DATA))) { + mem_buffer::pattern_check(data, length, SEED); + m_am_received = true; + return UCS_OK; + } - m_am_received = true; + m_rx_buf = mem_buffer::allocate(length, m_rx_memtype); + mem_buffer::pattern_fill(m_rx_buf, length,
0ul, m_rx_memtype); - return UCS_OK; + m_rx_dt_desc.make(m_rx_dt, m_rx_buf, length); + + uint32_t imm_compl_flag = UCP_OP_ATTR_FLAG_NO_IMM_CMPL * + (ucs::rand() % 2); + size_t rx_length = SIZE_MAX; + ucp_request_param_t params; + params.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_USER_DATA | + UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FIELD_RECV_INFO | + imm_compl_flag; + params.datatype = m_rx_dt_desc.dt(); + params.cb.recv_am = am_data_recv_cb; + params.user_data = this; + params.recv_info.length = &rx_length; + ucs_status_ptr_t sp = ucp_am_recv_data_nbx(receiver().worker(), + data, m_rx_dt_desc.buf(), + m_rx_dt_desc.count(), + &params); + if (UCS_PTR_IS_PTR(sp)) { + ucp_request_release(sp); + status = UCS_INPROGRESS; + } else { + EXPECT_EQ(NULL, sp); + EXPECT_EQ(rx_length, length); + am_recv_check_data(rx_length); + status = UCS_OK; + } + + return status; + } + + void am_recv_check_data(size_t length) + { + ASSERT_FALSE(m_am_received); + m_am_received = true; + mem_buffer::pattern_check(m_rx_buf, length, SEED, m_rx_memtype); + mem_buffer::release(m_rx_buf, m_rx_memtype); } static ucs_status_t am_data_cb(void *arg, const void *header, @@ -439,9 +522,49 @@ class test_ucp_am_nbx : public test_ucp_am_base { return self->am_data_handler(header, header_length, data, length, param); } + static ucs_status_t am_rx_check_cb(void *arg, const void *header, + size_t header_length, void *data, + size_t length, + const ucp_am_recv_param_t *param) + { + test_ucp_am_nbx *self = reinterpret_cast<test_ucp_am_nbx*>(arg); + self->m_am_received = true; + return UCS_OK; + } + + static ucs_status_t am_data_hold_cb(void *arg, const void *header, + size_t header_length, void *data, + size_t length, + const ucp_am_recv_param_t *param) + { + void **rx_data_p = reinterpret_cast<void**>(arg); + + EXPECT_TRUE(param->recv_attr & UCP_AM_RECV_ATTR_FLAG_DATA); + EXPECT_EQ(NULL, *rx_data_p); + + *rx_data_p = data; + + return UCS_INPROGRESS; + } + + static void am_data_recv_cb(void *request, ucs_status_t status, + size_t length, void *user_data) + { + test_ucp_am_nbx *self = reinterpret_cast<test_ucp_am_nbx*>(user_data); + + EXPECT_UCS_OK(status); + + self->am_recv_check_data(length); + } + static const uint16_t TEST_AM_NBX_ID = 0; ucp_datatype_t m_dt; volatile bool m_am_received; + std::string m_hdr; + ucp_datatype_t m_rx_dt; + ucs_memory_type_t m_rx_memtype; + ucp::data_type_desc_t m_rx_dt_desc; + void *m_rx_buf; }; UCS_TEST_P(test_ucp_am_nbx, set_invalid_handler) @@ -508,15 +631,281 @@ UCS_TEST_P(test_ucp_am_nbx, zero_send) test_am_send_recv(0, max_am_hdr()); } +UCS_TEST_P(test_ucp_am_nbx, rx_persistent_data) +{ + void *rx_data = NULL; + char data = 'd'; + + set_am_data_handler(receiver(), TEST_AM_NBX_ID, am_data_hold_cb, &rx_data, + UCP_AM_FLAG_PERSISTENT_DATA); + + ucp_request_param_t param; + + param.op_attr_mask = 0ul; + ucs_status_ptr_t sptr = ucp_am_send_nbx(sender().ep(), TEST_AM_NBX_ID, NULL, + 0ul, &data, sizeof(data), &param); + wait_for_flag(&rx_data); + EXPECT_TRUE(rx_data != NULL); + EXPECT_EQ(data, *reinterpret_cast<char*>(rx_data)); + + ucp_am_data_release(receiver().worker(), rx_data); + EXPECT_EQ(UCS_OK, request_wait(sptr)); +}
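The rx_persistent_data test relies on the UCP_AM_FLAG_PERSISTENT_DATA contract: a handler registered with this flag may return UCS_INPROGRESS to retain ownership of the delivered data pointer, and hand it back later with ucp_am_data_release(). A minimal standalone handler, as a hedged sketch using only the APIs shown in this patch:

static ucs_status_t hold_data_cb(void *arg, const void *header,
                                 size_t header_length, void *data,
                                 size_t length,
                                 const ucp_am_recv_param_t *param)
{
    /* keeping 'data' is only valid when UCP_AM_RECV_ATTR_FLAG_DATA is set,
     * i.e. the data pointer is persistent and may outlive the callback */
    if (!(param->recv_attr & UCP_AM_RECV_ATTR_FLAG_DATA)) {
        return UCS_OK;
    }
    *(void**)arg = data;   /* stash for later processing */
    return UCS_INPROGRESS; /* keep 'data' alive until ucp_am_data_release() */
}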
+ +// Check that max_short limits are adjusted when rndv threshold is set +UCS_TEST_P(test_ucp_am_nbx, max_short_thresh_rndv, "RNDV_THRESH=0") +{ + ucp_ep_config_t *ep_cfg = ucp_ep_config(sender().ep()); + + size_t max_short = static_cast<size_t>( + ep_cfg->am_u.max_eager_short.memtype_on + 1); + + test_short_thresh(max_short); + + size_t max_reply_short = static_cast<size_t>( + ep_cfg->am_u.max_reply_eager_short.memtype_on + 1); + + test_short_thresh(max_reply_short); +} + +// Check that max_short limits are adjusted when zcopy threshold is set +UCS_TEST_P(test_ucp_am_nbx, max_short_thresh_zcopy, "ZCOPY_THRESH=0") +{ + ucp_ep_config_t *ep_cfg = ucp_ep_config(sender().ep()); + + size_t max_short = static_cast<size_t>( + ep_cfg->am_u.max_eager_short.memtype_on + 1); + + EXPECT_LE(max_short, ep_cfg->am.zcopy_thresh[0]); + + + size_t max_reply_short = static_cast<size_t>( + ep_cfg->am_u.max_reply_eager_short.memtype_on + 1); + + EXPECT_LE(max_reply_short, ep_cfg->am.zcopy_thresh[0]); +} + UCP_INSTANTIATE_TEST_CASE(test_ucp_am_nbx) +class test_ucp_am_nbx_closed_ep : public test_ucp_am_nbx { +protected: + virtual ucp_ep_params_t get_ep_params() + { + ucp_ep_params_t ep_params = test_ucp_am_nbx::get_ep_params(); + ep_params.field_mask |= UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; + /* The error handling mode is required, since we need to handle the + * case when a receiver tries to fetch data on a closed EP */ + ep_params.err_mode = UCP_ERR_HANDLING_MODE_PEER; + return ep_params; + } + + void test_recv_on_closed_ep(size_t size, unsigned flags = 0, + bool poke_rx_progress = false, + bool rx_expected = false) + { + skip_loopback(); + test_am_send_recv(0, max_am_hdr()); // warmup wireup + + m_am_received = false; + std::vector<char> sbuf(size, 'd'); + ucp::data_type_desc_t sdt_desc(m_dt, &sbuf[0], size); + + set_am_data_handler(receiver(), TEST_AM_NBX_ID, am_rx_check_cb, this); + + ucs_status_ptr_t sreq = send_am(sdt_desc, flags); + + sender().progress(); + if (poke_rx_progress) { + receiver().progress(); + if (m_am_received) { + request_wait(sreq); + UCS_TEST_SKIP_R("received all AMs before ep closed"); + } + } + + void *close_req = receiver().disconnect_nb(0, 0, + UCP_EP_CLOSE_MODE_FLUSH); + ucs_time_t deadline = ucs::get_deadline(10); + while (!is_request_completed(close_req) && + (ucs_get_time() < deadline)) { + progress(); + } + + receiver().close_ep_req_free(close_req); + + if (rx_expected) { + request_wait(sreq); + wait_for_flag(&m_am_received); + } else { + // Send request may complete with error + // (rndv should complete with EP_TIMEOUT) + scoped_log_handler wrap_err(wrap_errors_logger); + request_wait(sreq); + } + + EXPECT_EQ(rx_expected, m_am_received); + } +}; + + +UCS_TEST_P(test_ucp_am_nbx_closed_ep, rx_short_am_on_closed_ep, "RNDV_THRESH=inf") +{ + // Single fragment message sent without REPLY flag is expected + // to be received even if remote side closes its ep + test_recv_on_closed_ep(8, 0, false, true); +} + +// All the following types of AM messages are expected to be dropped on the +// receiver side, when its ep is closed +UCS_TEST_P(test_ucp_am_nbx_closed_ep, rx_short_reply_am_on_closed_ep, "RNDV_THRESH=inf") +{ + test_recv_on_closed_ep(8, UCP_AM_SEND_REPLY); +} + +UCS_TEST_P(test_ucp_am_nbx_closed_ep, rx_long_am_on_closed_ep, "RNDV_THRESH=inf") +{ + test_recv_on_closed_ep(64 * UCS_KBYTE, 0, true); +} + +UCS_TEST_P(test_ucp_am_nbx_closed_ep, rx_long_reply_am_on_closed_ep, "RNDV_THRESH=inf") +{ + test_recv_on_closed_ep(64 * UCS_KBYTE, UCP_AM_SEND_REPLY, true); +} + +UCS_TEST_P(test_ucp_am_nbx_closed_ep, rx_rts_am_on_closed_ep, "RNDV_THRESH=32K") +{ + test_recv_on_closed_ep(64 * UCS_KBYTE, 0); +} + +UCS_TEST_P(test_ucp_am_nbx_closed_ep, rx_rts_reply_am_on_closed_ep, "RNDV_THRESH=32K") +{ + test_recv_on_closed_ep(64 * UCS_KBYTE, UCP_AM_SEND_REPLY); +} + +UCP_INSTANTIATE_TEST_CASE(test_ucp_am_nbx_closed_ep)
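The closed-EP tests above require endpoints created with UCP_ERR_HANDLING_MODE_PEER, so that a peer failure is reported through an error callback instead of being fatal. A hedged sketch of creating such an endpoint (function and variable names are illustrative, not from the patch):

static void ep_err_cb(void *arg, ucp_ep_h ep, ucs_status_t status)
{
    /* invoked on transport-level failure, e.g. UCS_ERR_CONNECTION_RESET */
    (void)arg; (void)ep; (void)status;
}

static ucs_status_t create_err_handling_ep(ucp_worker_h worker,
                                           const ucp_address_t *peer_addr,
                                           ucp_ep_h *ep_p)
{
    ucp_ep_params_t params;
    params.field_mask      = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS |
                             UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE |
                             UCP_EP_PARAM_FIELD_ERR_HANDLER;
    params.address         = peer_addr;
    params.err_mode        = UCP_ERR_HANDLING_MODE_PEER;
    params.err_handler.cb  = ep_err_cb;
    params.err_handler.arg = NULL;
    return ucp_ep_create(worker, &params, ep_p);
}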
+ + +class test_ucp_am_nbx_eager_memtype : public test_ucp_am_nbx { +public: + void init() + { + modify_config("RNDV_THRESH", "inf"); + test_ucp_am_nbx::init(); + m_rx_memtype = static_cast<ucs_memory_type_t>(get_variant_value(1)); + } + + static void base_test_generator(std::vector<ucp_test_variant> &variants) + { + // 1. Do not instantiate test case if no GPU memtypes supported. + // 2. Do not exclude host memory type, because this generator is used by + // test_ucp_am_nbx_rndv_memtype class to generate combinations like + // host<->cuda, cuda-managed<->host, etc. + if (!mem_buffer::is_gpu_supported()) { + return; + } + + add_variant_memtypes(variants, test_ucp_am_base::get_test_variants, + std::numeric_limits::max()); + } + + static void get_test_variants(std::vector<ucp_test_variant> &variants) + { + add_variant_memtypes(variants, base_test_generator, + std::numeric_limits::max()); + } +}; + +UCS_TEST_P(test_ucp_am_nbx_eager_memtype, basic) +{ + ucs_memory_type_t mt = static_cast<ucs_memory_type_t>(get_variant_value(0)); + test_am_send_recv(16 * UCS_KBYTE, 8, 0, mt); +} + +UCP_INSTANTIATE_TEST_CASE_GPU_AWARE(test_ucp_am_nbx_eager_memtype) + + +class test_ucp_am_nbx_eager_data_release : public test_ucp_am_nbx { +public: + test_ucp_am_nbx_eager_data_release() + { + modify_config("RNDV_THRESH", "inf"); + modify_config("ZCOPY_THRESH", "inf"); + m_data_ptr = NULL; + } + + virtual ucs_status_t + am_data_handler(const void *header, size_t header_length, void *data, + size_t length, const ucp_am_recv_param_t *rx_param) + { + EXPECT_FALSE(m_am_received); + EXPECT_TRUE(rx_param->recv_attr & UCP_AM_RECV_ATTR_FLAG_DATA); + + m_am_received = true; + m_data_ptr = data; + + check_header(header, header_length); + mem_buffer::pattern_check(data, length, SEED); + + return UCS_INPROGRESS; + } + + void test_data_release(size_t size) + { + size_t hdr_size = ucs_min(max_am_hdr(), 8); + test_am_send_recv(size, 0, 0, UCS_MEMORY_TYPE_HOST, + UCP_AM_FLAG_PERSISTENT_DATA); + ucp_am_data_release(receiver().worker(), m_data_ptr); + + test_am_send_recv(size, hdr_size, 0, UCS_MEMORY_TYPE_HOST, + UCP_AM_FLAG_PERSISTENT_DATA); + ucp_am_data_release(receiver().worker(), m_data_ptr); + } + + size_t fragment_size() + { + return ucp_ep_config(sender().ep())->am.max_bcopy - + sizeof(ucp_am_hdr_t); + } + +private: + void *m_data_ptr; +}; + +UCS_TEST_P(test_ucp_am_nbx_eager_data_release, short) +{ + test_data_release(1); +} + +UCS_TEST_P(test_ucp_am_nbx_eager_data_release, single) +{ + test_data_release(fragment_size() / 2); +} + +UCS_TEST_P(test_ucp_am_nbx_eager_data_release, multi) +{ + test_data_release(fragment_size() * 2); +} + +UCP_INSTANTIATE_TEST_CASE(test_ucp_am_nbx_eager_data_release) + + class test_ucp_am_nbx_dts : public test_ucp_am_nbx { public: static const uint64_t dts_bitmap = UCS_BIT(UCP_DATATYPE_CONTIG) | UCS_BIT(UCP_DATATYPE_IOV) | UCS_BIT(UCP_DATATYPE_GENERIC); + virtual ucp_ep_params_t get_ep_params() + { + ucp_ep_params_t ep_params = test_ucp_am_nbx::get_ep_params(); + + ep_params.field_mask |= UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; + ep_params.err_mode = static_cast<ucp_err_handling_mode_t>( + get_variant_value(3)); + return ep_params; + } + static void get_test_dts(std::vector<ucp_test_variant>& variants) { /* coverity[overrun-buffer-val] */ @@ -524,28 +913,57 @@ class test_ucp_am_nbx_dts : public test_ucp_am_nbx { dts_bitmap, ucp_datatype_class_names); } + static void base_test_generator(std::vector<ucp_test_variant> &variants) + { + /* push variant for the receive type, on top of existing dts variants */ + /* coverity[overrun-buffer-val] */ + add_variant_values(variants, get_test_dts, dts_bitmap, + ucp_datatype_class_names); + } + + static
void get_test_dts_reply(std::vector<ucp_test_variant>& variants) + { + add_variant_values(variants, base_test_generator, 0); + add_variant_values(variants, base_test_generator, UCP_AM_SEND_REPLY, + "reply"); + } + static void get_test_variants(std::vector<ucp_test_variant>& variants) { - add_variant_values(variants, get_test_dts, 0); - add_variant_values(variants, get_test_dts, UCP_AM_SEND_REPLY, "reply"); + add_variant_values(variants, get_test_dts_reply, + UCP_ERR_HANDLING_MODE_NONE); + add_variant_values(variants, get_test_dts_reply, + UCP_ERR_HANDLING_MODE_PEER, "errh"); } void init() { test_ucp_am_nbx::init(); - m_dt = make_dt(get_variant_value()); + m_dt = make_dt(get_variant_value(0)); + m_rx_dt = make_dt(get_variant_value(1)); } void cleanup() { destroy_dt(m_dt); + destroy_dt(m_rx_dt); test_ucp_am_nbx::cleanup(); } virtual unsigned get_send_flag() { - return get_variant_value(1); + return get_variant_value(2); + } + + virtual ucs_status_t + am_data_handler(const void *header, size_t header_length, void *data, + size_t length, const ucp_am_recv_param_t *rx_param) + { + EXPECT_FALSE(rx_param->recv_attr & UCP_AM_RECV_ATTR_FLAG_RNDV); + + return test_ucp_am_nbx::am_data_handler(header, header_length, data, + length, rx_param); } }; @@ -578,14 +996,23 @@ UCS_TEST_P(test_ucp_am_nbx_dts, long_zcopy_send, "ZCOPY_THRESH=1", test_am(64 * UCS_KBYTE); } +UCS_TEST_P(test_ucp_am_nbx_dts, send_eager_flag, "RNDV_THRESH=128") +{ + test_am(64 * UCS_KBYTE, UCP_AM_SEND_FLAG_EAGER); +} + UCP_INSTANTIATE_TEST_CASE(test_ucp_am_nbx_dts) -class test_ucp_am_nbx_rndv: public test_ucp_am_nbx { +class test_ucp_am_nbx_rndv : public test_ucp_am_nbx { public: + struct am_cb_args { + test_ucp_am_nbx_rndv *self; + void **desc; + }; + test_ucp_am_nbx_rndv() { - m_rx_dt = ucp_dt_make_contig(1); m_status = UCS_OK; modify_config("RNDV_THRESH", "128"); } @@ -594,43 +1021,18 @@ class test_ucp_am_nbx_rndv: public test_ucp_am_nbx { void *data, size_t length, const ucp_am_recv_param_t *rx_param) { - EXPECT_FALSE(m_am_received); EXPECT_TRUE(rx_param->recv_attr & UCP_AM_RECV_ATTR_FLAG_RNDV); EXPECT_FALSE(rx_param->recv_attr & UCP_AM_RECV_ATTR_FLAG_DATA); - m_rx_buf.resize(length, 'u'); - - m_rx_dt_desc.make(m_rx_dt, &m_rx_buf[0], length); - - ucp_request_param_t params; - params.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | - UCP_OP_ATTR_FIELD_USER_DATA | - UCP_OP_ATTR_FIELD_DATATYPE | - UCP_OP_ATTR_FLAG_NO_IMM_CMPL; - params.datatype = m_rx_dt_desc.dt(); - params.cb.recv_am = am_data_recv_cb; - params.user_data = this; - ucs_status_ptr_t sp = ucp_am_recv_data_nbx(receiver().worker(), - data, m_rx_dt_desc.buf(), - m_rx_dt_desc.count(), - &params); - EXPECT_TRUE(UCS_PTR_IS_PTR(sp)) << "sp is: " << sp; - ucp_request_release(sp); + ucs_status_t status = test_ucp_am_nbx::am_data_handler(header, + header_length, + data, length, + rx_param); + EXPECT_FALSE(UCS_STATUS_IS_ERR(status)); return UCS_INPROGRESS; } - static void am_data_recv_cb(void *request, ucs_status_t status, - size_t length, void *user_data) - { - test_ucp_am_nbx_rndv *self = reinterpret_cast<test_ucp_am_nbx_rndv*> - (user_data); - ASSERT_FALSE(self->m_am_received); - self->m_am_received = true; - EXPECT_UCS_OK(status); - EXPECT_EQ(self->m_rx_buf, std::vector<char>(length, 'd')); - } - static ucs_status_t am_data_reject_rndv_cb(void *arg, const void *header, size_t header_length, void *data, size_t length, @@ -658,10 +1060,25 @@ class test_ucp_am_nbx_rndv: public test_ucp_am_nbx { return UCS_INPROGRESS; } - ucp_datatype_t m_rx_dt; - ucs_status_t m_status; - ucp::data_type_desc_t m_rx_dt_desc; - std::vector<char> m_rx_buf; + static
ucs_status_t am_data_drop_rndv_cb(void *arg, + const void *header, + size_t header_length, + void *data, size_t length, + const ucp_am_recv_param_t *param) + { + struct am_cb_args *args = reinterpret_cast<struct am_cb_args*>(arg); + test_ucp_am_nbx_rndv *self = args->self; + void **data_desc_p = args->desc; + + *data_desc_p = data; + self->m_am_received = true; + + /* return UCS_OK without calling ucp_am_recv_data_nbx() + * to drop the message */ + return UCS_OK; + } + + ucs_status_t m_status; }; UCS_TEST_P(test_ucp_am_nbx_rndv, rndv_auto, "RNDV_SCHEME=auto") @@ -679,6 +1096,62 @@ UCS_TEST_P(test_ucp_am_nbx_rndv, rndv_put, "RNDV_SCHEME=put_zcopy") test_am_send_recv(64 * UCS_KBYTE); } +UCS_TEST_P(test_ucp_am_nbx_rndv, rndv_flag_zero_send, "RNDV_THRESH=inf") +{ + test_am_send_recv(0, 0, UCP_AM_SEND_FLAG_RNDV); +} + +UCS_TEST_P(test_ucp_am_nbx_rndv, rndv_flag_send, "RNDV_THRESH=inf") +{ + test_am_send_recv(64 * UCS_KBYTE, 0, UCP_AM_SEND_FLAG_RNDV); +} + +UCS_TEST_P(test_ucp_am_nbx_rndv, rndv_zero_send, "RNDV_THRESH=0") +{ + test_am_send_recv(0); +} + +UCS_TEST_P(test_ucp_am_nbx_rndv, just_header_rndv, "RNDV_THRESH=1") +{ + test_am_send_recv(0, max_am_hdr()); +} + +UCS_TEST_P(test_ucp_am_nbx_rndv, header_and_data_rndv, "RNDV_THRESH=128") +{ + test_am_send_recv(127, 1); +} + +UCS_TEST_SKIP_COND_P(test_ucp_am_nbx_rndv, invalid_recv_desc, + RUNNING_ON_VALGRIND, "RNDV_THRESH=1") +{ + void *data_desc = NULL; + void *rx_data = NULL; + char data = 'd'; + ucp_request_param_t param; + + struct am_cb_args args = { this, &data_desc }; + set_am_data_handler(receiver(), TEST_AM_NBX_ID, am_data_drop_rndv_cb, &args); + + param.op_attr_mask = 0ul; + + ucs_status_ptr_t sptr = ucp_am_send_nbx(sender().ep(), TEST_AM_NBX_ID, NULL, + 0ul, &data, sizeof(data), &param); + + wait_for_flag(&m_am_received); + + scoped_log_handler wrap_err(wrap_errors_logger); + /* attempt to recv data with invalid 'data_desc', since it was released + * because am_data_drop_rndv_cb() returned UCS_OK */ + ucs_status_ptr_t rptr = ucp_am_recv_data_nbx(receiver().worker(), + data_desc, + rx_data, sizeof(data), + &param); + + EXPECT_EQ(UCS_ERR_INVALID_PARAM, UCS_PTR_STATUS(rptr)); + + request_wait(sptr); +} + UCS_TEST_P(test_ucp_am_nbx_rndv, reject_rndv) { skip_loopback(); @@ -733,7 +1206,7 @@ UCS_TEST_P(test_ucp_am_nbx_rndv, deferred_reject_rndv) UCP_INSTANTIATE_TEST_CASE(test_ucp_am_nbx_rndv) -class test_ucp_am_nbx_rndv_dts: public test_ucp_am_nbx_rndv { +class test_ucp_am_nbx_rndv_dts : public test_ucp_am_nbx_rndv { public: static void get_test_variants(std::vector<ucp_test_variant>& variants) { @@ -767,3 +1240,30 @@ UCS_TEST_P(test_ucp_am_nbx_rndv_dts, rndv, "RNDV_THRESH=256") } UCP_INSTANTIATE_TEST_CASE(test_ucp_am_nbx_rndv_dts); + + +class test_ucp_am_nbx_rndv_memtype : public test_ucp_am_nbx_rndv { +public: + static void get_test_variants(std::vector<ucp_test_variant>& variants) { + // Test will not be instantiated if no GPU memtypes supported, because + // of the check for supported memory types in + // test_ucp_am_nbx_eager_memtype::get_test_variants + return test_ucp_am_nbx_eager_memtype::get_test_variants(variants); + } + + void init() + { + modify_config("RNDV_THRESH", "128"); + + test_ucp_am_nbx::init(); + m_rx_memtype = static_cast<ucs_memory_type_t>(get_variant_value(1)); + } +}; + +UCS_TEST_P(test_ucp_am_nbx_rndv_memtype, rndv) +{ + ucs_memory_type_t mt = static_cast<ucs_memory_type_t>(get_variant_value(0)); + test_am_send_recv(64 * UCS_KBYTE, 8, 0, mt); +} + +UCP_INSTANTIATE_TEST_CASE_GPU_AWARE(test_ucp_am_nbx_rndv_memtype);
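The rndv_flag_* tests above force the rendezvous protocol per-operation rather than through the RNDV_THRESH configuration. A minimal sketch of the sender side, assuming an existing endpoint ep, AM id id and buffer buf/len (names illustrative, not from the patch):

ucp_request_param_t param;
param.op_attr_mask = UCP_OP_ATTR_FIELD_FLAGS;
param.flags        = UCP_AM_SEND_FLAG_RNDV; /* always negotiate rendezvous */
void *req = ucp_am_send_nbx(ep, id, NULL, 0, buf, len, &param);
/* the receiver then sees UCP_AM_RECV_ATTR_FLAG_RNDV and must either fetch
 * the payload with ucp_am_recv_data_nbx() or drop it by returning UCS_OK */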
diff --git a/test/gtest/ucp/test_ucp_context.cc b/test/gtest/ucp/test_ucp_context.cc index fe87d8a036a..23fbfa74780 100644 --- a/test/gtest/ucp/test_ucp_context.cc +++ b/test/gtest/ucp/test_ucp_context.cc @@ -9,14 +9,23 @@ extern "C" { #include } - -class test_ucp_context : public ucp_test { -public: - static void get_test_variants(std::vector<ucp_test_variant>& variants) { - add_variant(variants, UCP_FEATURE_TAG | UCP_FEATURE_WAKEUP); - } +class test_ucp_lib_query : public ucs::test { }; +UCS_TEST_F(test_ucp_lib_query, test_max_thread_support) { + ucs_status_t status; + ucp_lib_attr_t params; + memset(&params, 0, sizeof(ucp_lib_attr_t)); + params.field_mask = UCP_LIB_ATTR_FIELD_MAX_THREAD_LEVEL; + status = ucp_lib_query(&params); + ASSERT_EQ(UCS_OK, status); +#if ENABLE_MT + EXPECT_EQ(UCS_THREAD_MODE_MULTI, params.max_thread_level); +#else + EXPECT_EQ(UCS_THREAD_MODE_SERIALIZED, params.max_thread_level); +#endif +} + UCS_TEST_P(test_ucp_context, minimal_field_mask) { ucs::handle<ucp_config_t*> config; UCS_TEST_CREATE_HANDLE(ucp_config_t*, config, ucp_config_release, diff --git a/test/gtest/ucp/test_ucp_dt.cc b/test/gtest/ucp/test_ucp_dt.cc index 3361e3a44ec..5110b068666 100644 --- a/test/gtest/ucp/test_ucp_dt.cc +++ b/test/gtest/ucp/test_ucp_dt.cc @@ -16,7 +16,7 @@ extern "C" { class test_ucp_dt_iov : public ucs::test { protected: size_t calc_iov_offset(const ucp_dt_iov_t *iov, size_t iov_indx, size_t iov_offs) { - size_t offset = iov_offs;; + size_t offset = iov_offs; for (size_t i = 0; i < iov_indx; ++i) { offset += iov[i].length; } diff --git a/test/gtest/ucp/test_ucp_ep.cc b/test/gtest/ucp/test_ucp_ep.cc new file mode 100644 index 00000000000..077feb3441b --- /dev/null +++ b/test/gtest/ucp/test_ucp_ep.cc @@ -0,0 +1,65 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#include "ucp_test.h" +#include + + +class test_ucp_ep : public ucp_test { +public: + static void get_test_variants(std::vector<ucp_test_variant> &variants) + { + add_variant(variants, UCP_FEATURE_TAG); + } + + /// @override + virtual void init() + { + ucp_test::init(); + sender().connect(&receiver(), get_ep_params()); + } +}; + +UCS_TEST_P(test_ucp_ep, ucp_query_ep) +{ + ucp_ep_h ep; + ucs_status_t status; + ucp_ep_evaluate_perf_param_t param; + ucp_ep_evaluate_perf_attr_t attr; + double estimated_time_0, estimated_time_1000; + + param.field_mask = UCP_EP_PERF_PARAM_FIELD_MESSAGE_SIZE; + attr.field_mask = UCP_EP_PERF_ATTR_FIELD_ESTIMATED_TIME; + param.message_size = 0; + create_entity(); + + ep = sender().ep(); + status = ucp_ep_evaluate_perf(ep, &param, &attr); + + EXPECT_EQ(status, UCS_OK); + EXPECT_GE(attr.estimated_time, 0); + estimated_time_0 = attr.estimated_time; + + param.message_size = 1000; + status = ucp_ep_evaluate_perf(ep, &param, &attr); + EXPECT_EQ(status, UCS_OK); + EXPECT_GT(attr.estimated_time, 0); + EXPECT_LT(attr.estimated_time, 10); + estimated_time_1000 = attr.estimated_time; + + param.message_size = 2000; + status = ucp_ep_evaluate_perf(ep, &param, &attr); + EXPECT_EQ(status, UCS_OK); + EXPECT_GT(attr.estimated_time, 0); + EXPECT_LT(attr.estimated_time, 10); + + /* Test time estimation sanity, by verifying constant increase per message + size (which represents current calculation model) */ + EXPECT_FLOAT_EQ(attr.estimated_time - estimated_time_1000, + estimated_time_1000 - estimated_time_0); +} + +UCP_INSTANTIATE_TEST_CASE(test_ucp_ep);
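The sanity check in ucp_query_ep assumes the current estimation model is affine in the message size, t(s) = t_0 + s/B, so t(2000) - t(1000) = t(1000) - t(0) = 1000/B; that equality is exactly what the final EXPECT_FLOAT_EQ verifies. Here t_0 and B stand for the per-message overhead and the estimated bandwidth; the symbols are illustrative, not from the patch.

diff --git a/test/gtest/ucp/test_ucp_mem_type.cc b/test/gtest/ucp/test_ucp_mem_type.cc index 7344df20176..92284d8f7e3 100644 --- a/test/gtest/ucp/test_ucp_mem_type.cc +++ b/test/gtest/ucp/test_ucp_mem_type.cc @@ -34,12 +34,12 @@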
UCS_TEST_P(test_ucp_mem_type, detect) { const size_t size = 256; const ucs_memory_type_t alloc_mem_type = mem_type(); + ucs_memory_info_t mem_info; mem_buffer b(size, alloc_mem_type); - ucs_memory_type_t detected_mem_type = - ucp_memory_type_detect(sender().ucph(), b.ptr(), size); - EXPECT_EQ(alloc_mem_type, detected_mem_type); + ucp_memory_detect(sender().ucph(), b.ptr(), size, &mem_info); + EXPECT_EQ(alloc_mem_type, mem_info.type); } UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_mem_type, all, "all") @@ -71,10 +71,12 @@ class test_ucp_mem_type_alloc_before_init : public test_ucp_mem_type { UCS_TEST_P(test_ucp_mem_type_alloc_before_init, xfer) { sender().connect(&receiver(), get_ep_params()); - EXPECT_EQ(mem_type(), ucp_memory_type_detect(sender().ucph(), - m_send_buffer->ptr(), m_size)); - EXPECT_EQ(mem_type(), ucp_memory_type_detect(receiver().ucph(), - m_recv_buffer->ptr(), m_size)); + ucs_memory_info_t mem_info; + ucp_memory_detect(sender().ucph(), m_send_buffer->ptr(), m_size, &mem_info); + EXPECT_EQ(mem_type(), mem_info.type) << "send buffer"; + ucp_memory_detect(receiver().ucph(), m_recv_buffer->ptr(), m_size, + &mem_info); + EXPECT_EQ(mem_type(), mem_info.type) << "receive buffer"; mem_buffer::pattern_fill(m_send_buffer->ptr(), m_size, SEED, mem_type()); diff --git a/test/gtest/ucp/test_ucp_mmap.cc b/test/gtest/ucp/test_ucp_mmap.cc index da81a606fd0..3e0da48dadc 100644 --- a/test/gtest/ucp/test_ucp_mmap.cc +++ b/test/gtest/ucp/test_ucp_mmap.cc @@ -17,21 +17,38 @@ extern "C" { class test_ucp_mmap : public ucp_test { public: + enum { + VARIANT_DEFAULT, + VARIANT_MAP_NONBLOCK, + VARIANT_PROTO_ENABLE + }; + static void get_test_variants(std::vector& variants) { - add_variant_with_value(variants, UCP_FEATURE_RMA, 0, ""); - add_variant_with_value(variants, UCP_FEATURE_RMA, UCP_MEM_MAP_NONBLOCK, + add_variant_with_value(variants, UCP_FEATURE_RMA, VARIANT_DEFAULT, ""); + add_variant_with_value(variants, UCP_FEATURE_RMA, VARIANT_MAP_NONBLOCK, "map_nb"); + add_variant_with_value(variants, UCP_FEATURE_RMA, VARIANT_PROTO_ENABLE, + "proto"); } virtual void init() { ucs::skip_on_address_sanitizer(); + if (get_variant_value() == VARIANT_PROTO_ENABLE) { + modify_config("PROTO_ENABLE", "y"); + } ucp_test::init(); + sender().connect(&receiver(), get_ep_params()); + if (!is_loopback()) { + receiver().connect(&sender(), get_ep_params()); + } } unsigned mem_map_flags() const { - return get_variant_value(); + return (get_variant_value() == VARIANT_MAP_NONBLOCK) ? 
+ UCP_MEM_MAP_NONBLOCK : + 0; } bool is_tl_rdma() { @@ -52,10 +69,16 @@ class test_ucp_mmap : public ucp_test { protected: bool resolve_rma(entity *e, ucp_rkey_h rkey); bool resolve_amo(entity *e, ucp_rkey_h rkey); - bool resolve_rma_bw(entity *e, ucp_rkey_h rkey); + bool resolve_rma_bw_get_zcopy(entity *e, ucp_rkey_h rkey); + bool resolve_rma_bw_put_zcopy(entity *e, ucp_rkey_h rkey); void test_length0(unsigned flags); - void test_rkey_management(entity *e, ucp_mem_h memh, bool is_dummy, + void test_rkey_management(ucp_mem_h memh, bool is_dummy, bool expect_rma_offload); + void test_rkey_proto(ucp_mem_h memh); + +private: + void compare_distance(const ucs_sys_dev_distance_t &dist1, + const ucs_sys_dev_distance_t &dist2); }; bool test_ucp_mmap::resolve_rma(entity *e, ucp_rkey_h rkey) @@ -98,14 +121,14 @@ bool test_ucp_mmap::resolve_amo(entity *e, ucp_rkey_h rkey) } } -bool test_ucp_mmap::resolve_rma_bw(entity *e, ucp_rkey_h rkey) +bool test_ucp_mmap::resolve_rma_bw_get_zcopy(entity *e, ucp_rkey_h rkey) { ucp_ep_config_t *ep_config = ucp_ep_config(e->ep()); ucp_lane_index_t lane; uct_rkey_t uct_rkey; lane = ucp_rkey_find_rma_lane(e->ucph(), ep_config, UCS_MEMORY_TYPE_HOST, - ep_config->rndv.get_zcopy_lanes, rkey, 0, + ep_config->rndv.get_zcopy.lanes, rkey, 0, &uct_rkey); if (lane != UCP_NULL_LANE) { return true; @@ -114,8 +137,24 @@ bool test_ucp_mmap::resolve_rma_bw(entity *e, ucp_rkey_h rkey) } } -void test_ucp_mmap::test_rkey_management(entity *e, ucp_mem_h memh, - bool is_dummy, bool expect_rma_offload) +bool test_ucp_mmap::resolve_rma_bw_put_zcopy(entity *e, ucp_rkey_h rkey) +{ + ucp_ep_config_t *ep_config = ucp_ep_config(e->ep()); + ucp_lane_index_t lane; + uct_rkey_t uct_rkey; + + lane = ucp_rkey_find_rma_lane(e->ucph(), ep_config, UCS_MEMORY_TYPE_HOST, + ep_config->rndv.put_zcopy.lanes, rkey, 0, + &uct_rkey); + if (lane != UCP_NULL_LANE) { + return true; + } else { + return false; + } +} + +void test_ucp_mmap::test_rkey_management(ucp_mem_h memh, bool is_dummy, + bool expect_rma_offload) { size_t rkey_size; void *rkey_buffer; @@ -124,47 +163,62 @@ void test_ucp_mmap::test_rkey_management(entity *e, ucp_mem_h memh, /* Some transports don't support memory registration, so the memory * can be inaccessible remotely. But it should always be possible * to pack/unpack a key, even if empty. 
*/ - status = ucp_rkey_pack(e->ucph(), memh, &rkey_buffer, &rkey_size); - if (status == UCS_ERR_UNSUPPORTED && !is_dummy) { + status = ucp_rkey_pack(sender().ucph(), memh, &rkey_buffer, &rkey_size); + if ((status == UCS_ERR_UNSUPPORTED) && !is_dummy) { return; } ASSERT_UCS_OK(status); - EXPECT_EQ(ucp_rkey_packed_size(e->ucph(), memh->md_map), rkey_size); + EXPECT_EQ(ucp_rkey_packed_size(sender().ucph(), memh->md_map, + UCS_SYS_DEVICE_ID_UNKNOWN, 0), + rkey_size); /* Unpack remote key buffer */ ucp_rkey_h rkey; - status = ucp_ep_rkey_unpack(e->ep(), rkey_buffer, &rkey); - if (status == UCS_ERR_UNREACHABLE && !is_dummy) { + status = ucp_ep_rkey_unpack(receiver().ep(), rkey_buffer, &rkey); + if ((status == UCS_ERR_UNREACHABLE) && !is_dummy) { ucp_rkey_buffer_release(rkey_buffer); return; } ASSERT_UCS_OK(status); /* Test ucp_rkey_packed_md_map() */ - EXPECT_EQ(rkey->md_map, ucp_rkey_packed_md_map(rkey_buffer)); + EXPECT_EQ(memh->md_map, ucp_rkey_packed_md_map(rkey_buffer)); - bool have_rma = resolve_rma(e, rkey); - bool have_amo = resolve_amo(e, rkey); - bool have_rma_bw = resolve_rma_bw(e, rkey); + /* rkey->md_map is a subset of all possible keys */ + EXPECT_TRUE(ucs_test_all_flags(memh->md_map, rkey->md_map)); + + bool have_rma = resolve_rma(&receiver(), rkey); + bool have_amo = resolve_amo(&receiver(), rkey); + bool have_rma_bw_get_zcopy = resolve_rma_bw_get_zcopy(&receiver(), rkey); + bool have_rma_bw_put_zcopy = resolve_rma_bw_put_zcopy(&receiver(), rkey); /* Test that lane resolution on the remote key returns consistent results */ for (int i = 0; i < 10; ++i) { - switch (ucs::rand() % 3) { + switch (ucs::rand() % 4) { case 0: - EXPECT_EQ(have_rma, resolve_rma(e, rkey)); + EXPECT_EQ(have_rma, resolve_rma(&receiver(), rkey)); break; case 1: - EXPECT_EQ(have_amo, resolve_amo(e, rkey)); + EXPECT_EQ(have_amo, resolve_amo(&receiver(), rkey)); break; case 2: - EXPECT_EQ(have_rma_bw, resolve_rma_bw(e, rkey)); + EXPECT_EQ(have_rma_bw_get_zcopy, + resolve_rma_bw_get_zcopy(&receiver(), rkey)); + break; + case 3: + EXPECT_EQ(have_rma_bw_put_zcopy, + resolve_rma_bw_put_zcopy(&receiver(), rkey)); break; } } - if (expect_rma_offload && is_dummy) { - EXPECT_NE(&ucp_rma_sw_proto, rkey->cache.rma_proto); + if (expect_rma_offload) { + if (is_dummy) { + EXPECT_EQ(&ucp_rma_sw_proto, rkey->cache.rma_proto); + } else { + EXPECT_EQ(&ucp_rma_basic_proto, rkey->cache.rma_proto); + } } /* Test obtaining direct-access pointer */ @@ -180,13 +234,75 @@ void test_ucp_mmap::test_rkey_management(entity *e, ucp_mem_h memh, ucp_rkey_buffer_release(rkey_buffer); } +void test_ucp_mmap::compare_distance(const ucs_sys_dev_distance_t &dist1, + const ucs_sys_dev_distance_t &dist2) +{ + EXPECT_NEAR(dist1.bandwidth, dist2.bandwidth, 600e6); /* 600 MBs accuracy */ + EXPECT_NEAR(dist1.latency, dist2.latency, 20e-9); /* 20 nsec accuracy */ +} + +void test_ucp_mmap::test_rkey_proto(ucp_mem_h memh) +{ + ucs_status_t status; + + /* Detect system device of the allocated memory */ + ucs_memory_info_t mem_info; + ucp_memory_detect(sender().ucph(), memh->address, memh->length, &mem_info); + EXPECT_EQ(memh->mem_type, mem_info.type); + + /* Collect distances from all devices in the system */ + uint64_t sys_dev_map = UCS_MASK(ucs_topo_num_devices()); + std::vector sys_distance(ucs_topo_num_devices()); + for (unsigned i = 0; i < sys_distance.size(); ++i) { + status = ucs_topo_get_distance(mem_info.sys_dev, i, &sys_distance[i]); + ASSERT_UCS_OK(status); + } + + /* Allocate buffer for packed rkey */ + size_t rkey_size = 
ucp_rkey_packed_size(sender().ucph(), memh->md_map, + mem_info.sys_dev, sys_dev_map); + std::string rkey_buffer(rkey_size, '0'); + + /* Pack the rkey and validate packed size */ + ssize_t packed_size = ucp_rkey_pack_uct(sender().ucph(), memh->md_map, + memh->uct, &mem_info, sys_dev_map, + &sys_distance[0], &rkey_buffer[0]); + ASSERT_EQ((ssize_t)rkey_size, packed_size); + + /* Unpack remote key buffer */ + ucp_rkey_h rkey; + status = ucp_ep_rkey_unpack_internal(receiver().ep(), &rkey_buffer[0], + rkey_size, &rkey); + ASSERT_UCS_OK(status); + + /* Check rkey configuration */ + if (receiver().ucph()->config.ext.proto_enable) { + ucp_rkey_config_t *rkey_config = ucp_rkey_config(receiver().worker(), + rkey); + ucp_ep_config_t *ep_config = ucp_ep_config(receiver().ep()); + + EXPECT_EQ(receiver().ep()->cfg_index, rkey_config->key.ep_cfg_index); + EXPECT_EQ(mem_info.sys_dev, rkey_config->key.sys_dev); + EXPECT_EQ(mem_info.type, rkey_config->key.mem_type); + + /* Compare original system distance and unpacked rkey system distance */ + for (ucp_lane_index_t lane = 0; lane < ep_config->key.num_lanes; + ++lane) { + ucs_sys_device_t sys_dev = ep_config->key.lanes[lane].dst_sys_dev; + compare_distance(rkey_config->lanes_distance[lane], + (sys_dev == UCS_SYS_DEVICE_ID_UNKNOWN) ? + ucs_topo_default_distance : + sys_distance[sys_dev]); + } + } + + ucp_rkey_destroy(rkey); +} UCS_TEST_P(test_ucp_mmap, alloc) { ucs_status_t status; bool is_dummy; - sender().connect(&sender(), get_ep_params()); - for (int i = 0; i < 1000 / ucs::test_time_multiplier(); ++i) { size_t size = ucs::rand() % (UCS_MBYTE); @@ -204,8 +320,8 @@ UCS_TEST_P(test_ucp_mmap, alloc) { ASSERT_UCS_OK(status); is_dummy = (size == 0); - test_rkey_management(&sender(), memh, is_dummy, - is_tl_rdma() || is_tl_shm()); + test_rkey_management(memh, is_dummy, is_tl_rdma() || is_tl_shm()); + test_rkey_proto(memh); status = ucp_mem_unmap(sender().ucph(), memh); ASSERT_UCS_OK(status); @@ -213,12 +329,9 @@ UCS_TEST_P(test_ucp_mmap, alloc) { } UCS_TEST_P(test_ucp_mmap, reg) { - ucs_status_t status; bool is_dummy; - sender().connect(&sender(), get_ep_params()); - for (int i = 0; i < 1000 / ucs::test_time_multiplier(); ++i) { size_t size = ucs::rand() % (UCS_MBYTE); @@ -239,7 +352,8 @@ UCS_TEST_P(test_ucp_mmap, reg) { ASSERT_UCS_OK(status); is_dummy = (size == 0); - test_rkey_management(&sender(), memh, is_dummy, is_tl_rdma()); + test_rkey_management(memh, is_dummy, is_tl_rdma()); + test_rkey_proto(memh); status = ucp_mem_unmap(sender().ucph(), memh); ASSERT_UCS_OK(status); @@ -254,8 +368,6 @@ UCS_TEST_P(test_ucp_mmap, reg_mem_type) { bool is_dummy; ucs_memory_type_t alloc_mem_type; - sender().connect(&sender(), get_ep_params()); - for (int i = 0; i < 1000 / ucs::test_time_multiplier(); ++i) { size_t size = ucs::rand() % (UCS_MBYTE); @@ -279,7 +391,14 @@ UCS_TEST_P(test_ucp_mmap, reg_mem_type) { ASSERT_UCS_OK(status); is_dummy = (size == 0); - test_rkey_management(&sender(), memh, is_dummy, is_tl_rdma()); + if (!is_dummy) { + EXPECT_EQ(alloc_mem_type, memh->mem_type); + } + test_rkey_management(memh, is_dummy, + is_tl_rdma() && + !UCP_MEM_IS_CUDA_MANAGED(alloc_mem_type) && + !UCP_MEM_IS_ROCM_MANAGED(alloc_mem_type)); + test_rkey_proto(memh); status = ucp_mem_unmap(sender().ucph(), memh); ASSERT_UCS_OK(status); @@ -295,8 +414,6 @@ void test_ucp_mmap::test_length0(unsigned flags) ucp_mem_map_params_t params; int i; - sender().connect(&sender(), get_ep_params()); - /* Check that ucp_mem_map accepts any value for buffer if size is 0 and * UCP_MEM_FLAG_ZERO_REG 
flag is passed to it. */ @@ -314,8 +431,13 @@ void test_ucp_mmap::test_length0(unsigned flags) status = ucp_mem_map(sender().ucph(), &params, &memh[1]); ASSERT_UCS_OK(status); + bool expect_rma_offload = is_tl_rdma() || + ((flags & UCP_MEM_MAP_ALLOCATE) && + is_tl_shm()); + for (i = 0; i < buf_num; i++) { - test_rkey_management(&sender(), memh[i], true, false); + test_rkey_management(memh[i], true, expect_rma_offload); + test_rkey_proto(memh[i]); status = ucp_mem_unmap(sender().ucph(), memh[i]); ASSERT_UCS_OK(status); } @@ -333,8 +455,6 @@ UCS_TEST_P(test_ucp_mmap, alloc_advise) { ucs_status_t status; bool is_dummy; - sender().connect(&sender(), get_ep_params()); - size_t size = 128 * UCS_MBYTE; ucp_mem_h memh; @@ -352,9 +472,11 @@ UCS_TEST_P(test_ucp_mmap, alloc_advise) { status = ucp_mem_map(sender().ucph(), &params, &memh); ASSERT_UCS_OK(status); - attr.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS | UCP_MEM_ATTR_FIELD_LENGTH; + attr.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS | UCP_MEM_ATTR_FIELD_LENGTH | + UCP_MEM_ATTR_FIELD_MEM_TYPE; status = ucp_mem_query(memh, &attr); ASSERT_UCS_OK(status); + EXPECT_EQ(attr.mem_type, UCS_MEMORY_TYPE_HOST); EXPECT_GE(attr.length, size); advise_params.field_mask = UCP_MEM_ADVISE_PARAM_FIELD_ADDRESS | @@ -367,19 +489,17 @@ UCS_TEST_P(test_ucp_mmap, alloc_advise) { ASSERT_UCS_OK(status); is_dummy = (size == 0); - test_rkey_management(&sender(), memh, is_dummy, is_tl_rdma() || is_tl_shm()); + test_rkey_management(memh, is_dummy, is_tl_rdma() || is_tl_shm()); + test_rkey_proto(memh); status = ucp_mem_unmap(sender().ucph(), memh); ASSERT_UCS_OK(status); } UCS_TEST_P(test_ucp_mmap, reg_advise) { - ucs_status_t status; bool is_dummy; - sender().connect(&sender(), get_ep_params()); - size_t size = 128 * UCS_MBYTE; void *ptr = malloc(size); @@ -410,10 +530,11 @@ UCS_TEST_P(test_ucp_mmap, reg_advise) { advise_params.address = mem_attr.address; advise_params.length = size; advise_params.advice = UCP_MADV_WILLNEED; - status = ucp_mem_advise(sender().ucph(), memh, &advise_params); + status = ucp_mem_advise(sender().ucph(), memh, &advise_params); ASSERT_UCS_OK(status); is_dummy = (size == 0); - test_rkey_management(&sender(), memh, is_dummy, is_tl_rdma()); + test_rkey_management(memh, is_dummy, is_tl_rdma()); + test_rkey_proto(memh); status = ucp_mem_unmap(sender().ucph(), memh); ASSERT_UCS_OK(status); @@ -425,8 +546,6 @@ UCS_TEST_P(test_ucp_mmap, fixed) { ucs_status_t status; bool is_dummy; - sender().connect(&sender(), get_ep_params()); - for (int i = 0; i < 1000 / ucs::test_time_multiplier(); ++i) { size_t size = (i + 1) * ((i % 2) ?
1000 : 1); void *ptr = ucs::mmap_fixed_address(); @@ -447,11 +566,12 @@ UCS_TEST_P(test_ucp_mmap, fixed) { EXPECT_GE(memh->length, size); is_dummy = (size == 0); - test_rkey_management(&sender(), memh, is_dummy, is_tl_rdma()); + test_rkey_management(memh, is_dummy, is_tl_rdma()); + test_rkey_proto(memh); status = ucp_mem_unmap(sender().ucph(), memh); ASSERT_UCS_OK(status); } } -UCP_INSTANTIATE_TEST_CASE(test_ucp_mmap) +UCP_INSTANTIATE_TEST_CASE_GPU_AWARE(test_ucp_mmap) diff --git a/test/gtest/ucp/test_ucp_peer_failure.cc b/test/gtest/ucp/test_ucp_peer_failure.cc index 81b81bcc499..7155e147d8e 100644 --- a/test/gtest/ucp/test_ucp_peer_failure.cc +++ b/test/gtest/ucp/test_ucp_peer_failure.cc @@ -24,8 +24,10 @@ class test_ucp_peer_failure : public ucp_test { ucp_ep_params_t get_ep_params(); protected: + static const int AM_ID = 0; + enum { - TEST_TAG = UCS_BIT(0), + TEST_AM = UCS_BIT(0), TEST_RMA = UCS_BIT(1), FAIL_IMM = UCS_BIT(2) }; @@ -37,6 +39,10 @@ class test_ucp_peer_failure : public ucp_test { typedef ucs::handle mem_handle_t; + void set_am_handler(entity &e); + static ucs_status_t + am_callback(void *arg, const void *header, size_t header_length, void *data, + size_t length, const ucp_am_recv_param_t *param); void set_timeouts(); static void err_cb(void *arg, ucp_ep_h ep, ucs_status_t status); ucp_ep_h stable_sender(); @@ -44,7 +50,6 @@ class test_ucp_peer_failure : public ucp_test { entity& stable_receiver(); entity& failing_receiver(); void *send_nb(ucp_ep_h ep, ucp_rkey_h rkey); - void *recv_nb(entity& e); static ucs_log_func_rc_t warn_unreleased_rdesc_handler(const char *file, unsigned line, const char *function, @@ -57,15 +62,14 @@ class test_ucp_peer_failure : public ucp_test { void get_rkey(ucp_ep_h ep, entity& dst, mem_handle_t& memh, ucs::handle& rkey); void set_rkeys(); - static void send_cb(void *request, ucs_status_t status); - static void recv_cb(void *request, ucs_status_t status, - ucp_tag_recv_info_t *info); + static void send_cb(void *request, ucs_status_t status, void *user_data); virtual void cleanup(); void do_test(size_t msg_size, int pre_msg_count, bool force_close, bool request_must_fail); + size_t m_am_rx_count; size_t m_err_count; ucs_status_t m_err_status; std::string m_sbuf, m_rbuf; @@ -77,16 +81,20 @@ class test_ucp_peer_failure : public ucp_test { UCP_INSTANTIATE_TEST_CASE(test_ucp_peer_failure) -test_ucp_peer_failure::test_ucp_peer_failure() : m_err_count(0), m_err_status(UCS_OK) { +test_ucp_peer_failure::test_ucp_peer_failure() : + m_am_rx_count(0), m_err_count(0), m_err_status(UCS_OK) +{ ucs::fill_random(m_sbuf); set_timeouts(); } -void test_ucp_peer_failure::get_test_variants(std::vector& variants) { - add_variant_with_value(variants, UCP_FEATURE_TAG, TEST_TAG, "tag"); +void test_ucp_peer_failure::get_test_variants( + std::vector &variants) +{ + add_variant_with_value(variants, UCP_FEATURE_AM, TEST_AM, "am"); add_variant_with_value(variants, UCP_FEATURE_RMA, TEST_RMA, "rma"); - add_variant_with_value(variants, UCP_FEATURE_TAG, TEST_TAG | FAIL_IMM, - "tag_fail_imm"); + add_variant_with_value(variants, UCP_FEATURE_AM, TEST_AM | FAIL_IMM, + "am_fail_imm"); add_variant_with_value(variants, UCP_FEATURE_RMA, TEST_RMA | FAIL_IMM, "rma_fail_imm"); } @@ -102,16 +110,43 @@ ucp_ep_params_t test_ucp_peer_failure::get_ep_params() { return params; } +void test_ucp_peer_failure::set_am_handler(entity &e) +{ + if (!(get_variant_value() & TEST_AM)) { + return; + } + + ucp_am_handler_param_t param; + param.field_mask = UCP_AM_HANDLER_PARAM_FIELD_ID | + 
UCP_AM_HANDLER_PARAM_FIELD_CB | + UCP_AM_HANDLER_PARAM_FIELD_ARG; + param.cb = am_callback; + param.arg = this; + param.id = AM_ID; + + ucs_status_t status = ucp_worker_set_am_recv_handler(e.worker(), &param); + ASSERT_UCS_OK(status); +} + +ucs_status_t +test_ucp_peer_failure::am_callback(void *arg, const void *header, + size_t header_length, void *data, + size_t length, + const ucp_am_recv_param_t *param) +{ + test_ucp_peer_failure *self = reinterpret_cast<test_ucp_peer_failure*>(arg); + ++self->m_am_rx_count; + return UCS_OK; +} + void test_ucp_peer_failure::set_timeouts() { - /* Set small TL timeouts to reduce testing time */ - m_env.push_back(new ucs::scoped_setenv("UCX_RC_TIMEOUT", "10ms")); - m_env.push_back(new ucs::scoped_setenv("UCX_RC_RNR_TIMEOUT", "10ms")); - m_env.push_back(new ucs::scoped_setenv("UCX_RC_RETRY_COUNT", "2")); + set_tl_timeouts(m_env); } void test_ucp_peer_failure::err_cb(void *arg, ucp_ep_h ep, ucs_status_t status) { test_ucp_peer_failure *self = reinterpret_cast<test_ucp_peer_failure*>(arg); - EXPECT_EQ(UCS_ERR_ENDPOINT_TIMEOUT, status); + EXPECT_TRUE((UCS_ERR_CONNECTION_RESET == status) || + (UCS_ERR_ENDPOINT_TIMEOUT == status)); self->m_err_status = status; ++self->m_err_count; } @@ -134,25 +169,19 @@ ucp_test::entity& test_ucp_peer_failure::failing_receiver() { return m_entities.at(m_entities.size() - 1 - FAILING_EP_INDEX); } -void *test_ucp_peer_failure::send_nb(ucp_ep_h ep, ucp_rkey_h rkey) { - if (get_variant_value() & TEST_TAG) { - return ucp_tag_send_nb(ep, &m_sbuf[0], m_sbuf.size(), DATATYPE, 0, - send_cb); - } else if (get_variant_value() & TEST_RMA) { - return ucp_put_nb(ep, &m_sbuf[0], m_sbuf.size(), (uintptr_t)&m_rbuf[0], - rkey, send_cb); - } else { - ucs_fatal("invalid test case"); - } -} - -void *test_ucp_peer_failure::recv_nb(entity& e) { - ucs_assert(m_rbuf.size() >= m_sbuf.size()); - if (get_variant_value() & TEST_TAG) { - return ucp_tag_recv_nb(e.worker(), &m_rbuf[0], m_rbuf.size(), DATATYPE, 0, - 0, recv_cb); +void *test_ucp_peer_failure::send_nb(ucp_ep_h ep, ucp_rkey_h rkey) +{ + ucp_request_param_t param; + param.op_attr_mask = UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FIELD_CALLBACK; + param.datatype = DATATYPE; + param.cb.send = send_cb; + if (get_variant_value() & TEST_AM) { + return ucp_am_send_nbx(ep, AM_ID, NULL, 0, &m_sbuf[0], m_sbuf.size(), + &param); } else if (get_variant_value() & TEST_RMA) { - return NULL; + return ucp_put_nbx(ep, &m_sbuf[0], m_sbuf.size(), (uintptr_t)&m_rbuf[0], + rkey, &param); } else { ucs_fatal("invalid test case"); } @@ -195,13 +224,27 @@ void test_ucp_peer_failure::fail_receiver() { } }
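The hunk above migrates from the callback-only ucp_tag_send_nb/ucp_put_nb calls to the ucp_*_nbx request API. The completion-wait pattern that the tests' request_wait() helper builds on is roughly the following sketch, assuming only the public request API (not taken from the patch):

static ucs_status_t wait_request(ucp_worker_h worker, void *request)
{
    ucs_status_t status;

    if (request == NULL) {         /* operation completed immediately */
        return UCS_OK;
    }
    if (UCS_PTR_IS_ERR(request)) { /* operation failed synchronously */
        return UCS_PTR_STATUS(request);
    }
    do {
        ucp_worker_progress(worker);
        status = ucp_request_check_status(request);
    } while (status == UCS_INPROGRESS);
    ucp_request_free(request);
    return status;
}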
-void test_ucp_peer_failure::smoke_test(bool stable_pair) { - void *rreq = recv_nb(stable_pair ? stable_receiver() : failing_receiver()); - void *sreq = send_nb(stable_pair ? stable_sender() : failing_sender(), - stable_pair ? m_stable_rkey : m_failing_rkey); +void test_ucp_peer_failure::smoke_test(bool stable_pair) +{ + ucp_ep_h send_ep = stable_pair ? stable_sender() : failing_sender(); + size_t am_count = m_am_rx_count; + + // Send and wait for completion + void *sreq = send_nb(send_ep, stable_pair ? m_stable_rkey : m_failing_rkey); request_wait(sreq); - request_wait(rreq); - EXPECT_EQ(m_sbuf, m_rbuf); + + if (get_variant_value() & TEST_AM) { + // Wait for the active message to be received + while (m_am_rx_count < (am_count + 1)) { + progress(); + } + } else if (get_variant_value() & TEST_RMA) { + // Flush the sender and expect data to arrive on receiver + void *freq = ucp_ep_flush_nb(send_ep, 0, + (ucp_send_callback_t)ucs_empty_function); + request_wait(freq); + EXPECT_EQ(m_sbuf, m_rbuf); + } } void test_ucp_peer_failure::unmap_memh(ucp_mem_h memh, ucp_context_h context) @@ -250,12 +293,8 @@ void test_ucp_peer_failure::set_rkeys() { } } -void test_ucp_peer_failure::send_cb(void *request, ucs_status_t status) -{ -} - -void test_ucp_peer_failure::recv_cb(void *request, ucs_status_t status, - ucp_tag_recv_info_t *info) +void test_ucp_peer_failure::send_cb(void *request, ucs_status_t status, + void *user_data) { } @@ -279,6 +318,8 @@ void test_ucp_peer_failure::do_test(size_t msg_size, int pre_msg_count, create_entity(); sender().connect(&stable_receiver(), get_ep_params(), STABLE_EP_INDEX); sender().connect(&failing_receiver(), get_ep_params(), FAILING_EP_INDEX); + set_am_handler(stable_receiver()); + set_am_handler(failing_receiver()); set_rkeys(); @@ -303,8 +344,9 @@ } } + flush_ep(sender(), 0, FAILING_EP_INDEX); EXPECT_EQ(UCS_OK, m_err_status); - + /* Since UCT/UD EP has a SW implementation of reliability on which the peer * failure mechanism is based, we should set a small UCT/UD EP timeout * for the sender's UCP EP to reduce testing time */ @@ -316,7 +358,7 @@ void test_ucp_peer_failure::do_test(size_t msg_size, int pre_msg_count, fail_receiver(); void *sreq = send_nb(failing_sender(), m_failing_rkey); + flush_ep(sender(), 0, FAILING_EP_INDEX); while (!m_err_count) { progress(); } @@ -331,7 +373,8 @@ ucs_status_t status = ucp_request_check_status(sreq); EXPECT_NE(UCS_INPROGRESS, status); if (request_must_fail) { - EXPECT_EQ(m_err_status, status); + EXPECT_TRUE((m_err_status == status) || + (UCS_ERR_CANCELED == status)); } else { EXPECT_TRUE((m_err_status == status) || (UCS_OK == status)); } @@ -339,9 +382,10 @@ /* Additional sends must fail */ - void *sreq2 = send_nb(failing_sender(), m_failing_rkey); - EXPECT_FALSE(UCS_PTR_IS_PTR(sreq2)); - EXPECT_EQ(m_err_status, UCS_PTR_STATUS(sreq2)); + void *sreq2 = send_nb(failing_sender(), m_failing_rkey); + ucs_status_t status = request_wait(sreq2); + EXPECT_TRUE(UCS_STATUS_IS_ERR(status)); + EXPECT_EQ(m_err_status, status); if (force_close) { unsigned allocd_eps_before = @@ -353,6 +397,7 @@ void *creq = ucp_ep_close_nb(ep, UCP_EP_CLOSE_MODE_FORCE); request_wait(creq); + short_progress_loop(); /* allow lane discard and EP destroy to complete */ unsigned allocd_eps_after = ucs_strided_alloc_inuse_count(&sender().worker()->ep_alloc); @@ -381,15 +426,8 @@ /* Check that TX polling is working well */ while (sender().progress()); - /* Destroy rkeys before destroying the worker (which also destroys the - * endpoints) */ + /* Destroy rkey for failing pair */ m_failing_rkey.reset(); - m_stable_rkey.reset(); - - /* When all requests on sender are done we need to prevent LOCAL_FLUSH - * in test teardown.
Receiver is killed and doesn't respond on FC requests - */ - sender().destroy_worker(); } UCS_TEST_P(test_ucp_peer_failure, basic) { @@ -427,42 +465,26 @@ UCS_TEST_P(test_ucp_peer_failure, bcopy_multi, "SEG_SIZE?=512", "RC_TM_ENABLE?=n false /* must_fail */); } -UCS_TEST_P(test_ucp_peer_failure, force_close, "RC_FC_ENABLE?=n") { +UCS_TEST_P(test_ucp_peer_failure, force_close, "RC_FC_ENABLE?=n", + /* To catch unexpected descriptors leak, for multi-fragment protocol + with TCP */ + "TCP_RX_SEG_SIZE?=1024", "TCP_TX_SEG_SIZE?=1024") +{ do_test(16000, /* msg_size */ 1000, /* pre_msg_cnt */ true, /* force_close */ false /* must_fail */); } -UCS_TEST_SKIP_COND_P(test_ucp_peer_failure, disable_sync_send, - !(get_variant_value() & TEST_TAG)) { - const size_t max_size = UCS_MBYTE; - std::vector buf(max_size, 0); - void *req; - - sender().connect(&receiver(), get_ep_params()); - - /* Make sure API is disabled for any size and data type */ - for (size_t size = 1; size <= max_size; size *= 2) { - req = ucp_tag_send_sync_nb(sender().ep(), buf.data(), size, DATATYPE, - 0x111337, NULL); - EXPECT_FALSE(UCS_PTR_IS_PTR(req)); - EXPECT_EQ(UCS_ERR_UNSUPPORTED, UCS_PTR_STATUS(req)); - - ucp::data_type_desc_t dt_desc(DATATYPE_IOV, buf.data(), size); - req = ucp_tag_send_sync_nb(sender().ep(), dt_desc.buf(), dt_desc.count(), - dt_desc.dt(), 0x111337, NULL); - EXPECT_FALSE(UCS_PTR_IS_PTR(req)); - EXPECT_EQ(UCS_ERR_UNSUPPORTED, UCS_PTR_STATUS(req)); - } -} - class test_ucp_peer_failure_keepalive : public test_ucp_peer_failure { public: test_ucp_peer_failure_keepalive() { m_sbuf.resize(1 * UCS_MBYTE); m_rbuf.resize(1 * UCS_MBYTE); + + m_env.push_back(new ucs::scoped_setenv("UCX_TCP_KEEPIDLE", "inf")); + m_env.push_back(new ucs::scoped_setenv("UCX_UD_TIMEOUT", "3s")); } void init() { @@ -472,20 +494,24 @@ class test_ucp_peer_failure_keepalive : public test_ucp_peer_failure sender().connect(&failing_receiver(), get_ep_params(), FAILING_EP_INDEX); stable_receiver().connect(&sender(), get_ep_params()); failing_receiver().connect(&sender(), get_ep_params()); + set_am_handler(failing_receiver()); + set_am_handler(stable_receiver()); } static void get_test_variants(std::vector& variants) { - add_variant_with_value(variants, UCP_FEATURE_TAG, TEST_TAG, "tag"); + add_variant_with_value(variants, UCP_FEATURE_AM, TEST_AM, "am"); } }; UCS_TEST_P(test_ucp_peer_failure_keepalive, kill_receiver, - "KEEPALIVE_TIMEOUT=0.3", "KEEPALIVE_NUM_EPS=inf") { + "KEEPALIVE_INTERVAL=0.3", "KEEPALIVE_NUM_EPS=inf") { /* TODO: wireup is not tested yet */ scoped_log_handler err_handler(wrap_errors_logger); scoped_log_handler warn_handler(hide_warns_logger); + /* initiate p2p pairing */ + ucp_ep_resolve_remote_id(failing_sender(), 0); smoke_test(true); /* allow wireup to complete */ smoke_test(false); @@ -504,7 +530,7 @@ UCS_TEST_P(test_ucp_peer_failure_keepalive, kill_receiver, /* flush all outstanding ops to allow keepalive to run */ flush_worker(sender()); - /* kill EPs */ + /* kill EPs & ifaces */ failing_receiver().close_all_eps(*this, 0, UCP_EP_CLOSE_MODE_FORCE); wait_for_flag(&m_err_count); diff --git a/test/gtest/ucp/test_ucp_perf.cc b/test/gtest/ucp/test_ucp_perf.cc index 009b763498b..e2505ce2d70 100644 --- a/test/gtest/ucp/test_ucp_perf.cc +++ b/test/gtest/ucp/test_ucp_perf.cc @@ -1,17 +1,20 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -* * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. +* Copyright (C) ARM Ltd. 2020. ALL RIGHTS RESERVED. +* * See file LICENSE for terms. 
*/ #include "ucp_test.h" -#include +#include #define MB pow(1024.0, -2) -#define UCP_ARM_PERF_TEST_MULTIPLIER 2 +#define UCT_PERF_TEST_MULTIPLIER 5 +#define UCT_ARM_PERF_TEST_MULTIPLIER 15 + class test_ucp_perf : public ucp_test, public test_perf { public: static void get_test_variants(std::vector& variants) { @@ -37,8 +40,9 @@ class test_ucp_perf : public ucp_test, public test_perf { // Ignore errors that transport cannot reach peer if (level == UCS_LOG_LEVEL_ERROR) { std::string err_str = format_message(message, ap); - if (strstr(err_str.c_str(), ucs_status_string(UCS_ERR_UNREACHABLE)) || - strstr(err_str.c_str(), ucs_status_string(UCS_ERR_UNSUPPORTED))) { + if (strstr(err_str.c_str(), ucs_status_string(UCS_ERR_UNREACHABLE)) || + strstr(err_str.c_str(), ucs_status_string(UCS_ERR_UNSUPPORTED)) || + strstr(err_str.c_str(), "no peer failure handler")) { UCS_TEST_MESSAGE << err_str; return UCS_LOG_FUNC_RC_STOP; } @@ -52,126 +56,235 @@ class test_ucp_perf : public ucp_test, public test_perf { const test_perf::test_spec test_ucp_perf::tests[] = { + { "tag 0-msg latency", "usec", + UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 0 }, 1, 100000lu, + ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 60.0, + 0 }, + { "tag latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, + ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 60.0, + 0 }, + + { "tag latency errh", "usec", + UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, + ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 60.0, + UCX_PERF_TEST_FLAG_ERR_HANDLING }, + + { "blocking tag latency", "usec", + UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_SLEEP, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 60.0, 0 }, { "tag iov latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_IOV, 8192, 3, { 1024, 1024, 1024 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 60.0, 0 }, { "tag mr", "Mpps", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 2000000lu, + ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.1, 100.0, + 0 }, + + { "blocking tag mr", "Mpps", + UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_SLEEP, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.1, 100.0, 0 }, { "tag sync mr", "Mpps", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG_SYNC, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 200000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.05, 100.0, 0}, { "tag wild mr", "Mpps", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.1, 100.0, UCX_PERF_TEST_FLAG_TAG_WILDCARD }, { "tag bw", "MB/sec", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_STREAM_UNI, + 
UCX_PERF_WAIT_MODE_POLL, + UCT_PERF_DATA_LAYOUT_LAST, 0, 1, { 2048 }, 1, 100000lu, + ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 100.0, 100000.0 }, + + { "blocking tag bw", "MB/sec", + UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_SLEEP, UCT_PERF_DATA_LAYOUT_LAST, 0, 1, { 2048 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 100.0, 100000.0 }, { "tag bw_zcopy_multi", "MB/sec", UCX_PERF_API_UCP, UCX_PERF_CMD_TAG, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_LAST, 0, 1, { 2048 }, 16, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 100.0, 100000.0 }, { "put latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, 0 }, { "put rate", "Mpps", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.5, 100.0, 0 }, { "put bw", "MB/sec", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 2048 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 200.0, 100000.0, 0 }, { "get latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, 0 }, { "get bw", "MB/sec", UCX_PERF_API_UCP, UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 16384 }, 1, 10000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 200.0, 100000.0, 0 }, { "stream latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_STREAM, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, 0 }, { "stream bw", "MB/sec", UCX_PERF_API_UCP, UCX_PERF_CMD_STREAM, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 16384 }, 1, 10000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 200.0, 100000.0, 0 }, { "stream recv-data latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_STREAM, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, UCX_PERF_TEST_FLAG_STREAM_RECV_DATA }, { "stream recv-data bw", "MB/sec", UCX_PERF_API_UCP, UCX_PERF_CMD_STREAM, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 16384 }, 1, 10000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 200.0, 100000.0, UCX_PERF_TEST_FLAG_STREAM_RECV_DATA }, { "atomic add rate", "Mpps", UCX_PERF_API_UCP, UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 1000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.1, 500.0, 0 }, { "atomic fadd latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_FADD, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, 
latency.total_average), 1e6, 0.001, 30.0, 0 }, { "atomic swap latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_SWAP, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, 0 }, { "atomic cswap latency", "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_CSWAP, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, 0 }, + { "am 0-msg latency", "usec", + UCX_PERF_API_UCP, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 0 }, 1, 100000lu, + ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 60.0, + 0 }, + + { "am latency", "usec", + UCX_PERF_API_UCP, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, + ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 60.0, + 0 }, + + { "blocking am latency", "usec", + UCX_PERF_API_UCP, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_SLEEP, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 100000lu, + ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 60.0, + 0 }, + + { "am iov latency", "usec", + UCX_PERF_API_UCP, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, + UCP_PERF_DATATYPE_IOV, 8192, 3, { 1024, 1024, 1024 }, 1, 100000lu, + ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 60.0, + 0 }, + + { "am mr", "Mpps", + UCX_PERF_API_UCP, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 2000000lu, + ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.1, 100.0, + 0 }, + + { "blocking am mr", "Mpps", + UCX_PERF_API_UCP, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_SLEEP, + UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 2000000lu, + ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.1, 100.0, + 0 }, + + { "am bw", "MB/sec", + UCX_PERF_API_UCP, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, + UCT_PERF_DATA_LAYOUT_LAST, 0, 1, { 2048 }, 1, 100000lu, + ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 100.0, 100000.0 }, + + { "blocking am bw", "MB/sec", + UCX_PERF_API_UCP, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_SLEEP, + UCT_PERF_DATA_LAYOUT_LAST, 0, 1, { 2048 }, 1, 100000lu, + ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 100.0, 100000.0 }, + { NULL } }; -UCS_TEST_P(test_ucp_perf, envelope) { +UCS_TEST_SKIP_COND_P(test_ucp_perf, envelope, has_transport("self")) +{ bool check_perf = true; size_t max_iter = std::numeric_limits::max(); @@ -187,16 +300,64 @@ UCS_TEST_P(test_ucp_perf, envelope) { ucs::scoped_setenv warn_invalid("UCX_WARN_INVALID_CONFIG", "no"); /* Run all tests */ - for (const test_spec *test_iter = tests; test_iter->title != NULL; ++test_iter) { + for (const test_spec *test_iter = tests; test_iter->title != NULL; + ++test_iter) { test_spec test = *test_iter; if (ucs_arch_get_cpu_model() == UCS_CPU_MODEL_ARM_AARCH64) { - test.max *= UCP_ARM_PERF_TEST_MULTIPLIER; - test.min /= UCP_ARM_PERF_TEST_MULTIPLIER; + test.max *= UCT_ARM_PERF_TEST_MULTIPLIER; + test.min /= UCT_ARM_PERF_TEST_MULTIPLIER; + } else { + test.max *= UCT_PERF_TEST_MULTIPLIER; + test.min /= UCT_PERF_TEST_MULTIPLIER; 
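        /* A worked example of the relaxation above (illustrative numbers,
         * not taken from the test table): a spec with min = 0.1 and
         * max = 100.0 Mpps becomes [0.02, 500.0] on generic CPUs
         * (multiplier 5) and roughly [0.0067, 1500.0] on aarch64
         * (multiplier 15):
         *
         *     test.max = 100.0 * 15; // 1500.0
         *     test.min = 0.1 / 15;   // ~0.0067
         *
         * so the envelope only flags order-of-magnitude regressions. */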
} test.iters = ucs_min(test.iters, max_iter); + run_test(test, 0, check_perf, "", ""); } } UCP_INSTANTIATE_TEST_CASE(test_ucp_perf) + + +class test_ucp_wait_mem : public test_ucp_perf {}; + +UCS_TEST_P(test_ucp_wait_mem, envelope) { + double perf_avg = 0; + double perf_iter = 0; + const int max_iter = ucs_max(ucs::perf_retry_count, 1); + int i; + + /* Run ping-pong with no WFE and get latency reference values */ + const test_spec test1 = { "put latency reference", "usec", + UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, + UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, + UCP_PERF_DATATYPE_CONTIG, + 0, 1, { 8 }, 1, 1000lu, + ucs_offsetof(ucx_perf_result_t, + latency.total_average), + 1e6, 0.001, 30.0, 0 }; + for (i = 0; i < max_iter; i++) { + perf_iter = run_test(test1, 0, false, "", ""); + perf_avg += perf_iter; + } + perf_avg /= max_iter; + + /* Run ping-pong with WFE while re-using the previous run's numbers as + * a min/max boundary. The latency of the WFE run should stay nearly + * identical, within a 200 percent margin. When WFE does not work as + * expected, the slowdown is typically 10x-100x */ + const test_spec test2 = { "put latency with ucp_worker_wait_mem()", + "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, + UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM, + UCX_PERF_WAIT_MODE_POLL, + UCP_PERF_DATATYPE_CONTIG, + 0, 1, { 8 }, 1, 1000lu, + ucs_offsetof(ucx_perf_result_t, + latency.total_average), + 1e6, perf_avg * 0.7, perf_avg * 2, 0 }; + run_test(test2, 0, true, "", ""); +} + +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wait_mem, shm, "shm") diff --git a/test/gtest/ucp/test_ucp_proto.cc b/test/gtest/ucp/test_ucp_proto.cc index e65657c76b6..417dd54bef7 100644 --- a/test/gtest/ucp/test_ucp_proto.cc +++ b/test/gtest/ucp/test_ucp_proto.cc @@ -42,11 +42,12 @@ UCS_TEST_P(test_ucp_proto, dump_protocols) { select_param.op_flags = 0; select_param.dt_class = UCP_DATATYPE_CONTIG; select_param.mem_type = UCS_MEMORY_TYPE_HOST; - select_param.sys_dev = 0; + select_param.sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; select_param.sg_count = 1; select_param.padding[0] = 0; select_param.padding[1] = 0; + ucs_string_buffer_init(&strb); ucp_proto_select_param_str(&select_param, &strb); UCS_TEST_MESSAGE << ucs_string_buffer_cstr(&strb); ucs_string_buffer_cleanup(&strb); @@ -72,11 +73,13 @@ UCS_TEST_P(test_ucp_proto, rkey_config) { /* similar configurations should return same index */ ucp_worker_cfg_index_t cfg_index1; - status = ucp_worker_get_rkey_config(worker(), &rkey_config_key, &cfg_index1); + status = ucp_worker_rkey_config_get(worker(), &rkey_config_key, NULL, + &cfg_index1); ASSERT_UCS_OK(status); ucp_worker_cfg_index_t cfg_index2; - status = ucp_worker_get_rkey_config(worker(), &rkey_config_key, &cfg_index2); + status = ucp_worker_rkey_config_get(worker(), &rkey_config_key, NULL, + &cfg_index2); ASSERT_UCS_OK(status); EXPECT_EQ(static_cast(cfg_index1), static_cast(cfg_index2)); @@ -88,10 +91,29 @@ UCS_TEST_P(test_ucp_proto, rkey_config) { /* different configuration should return different index */ ucp_worker_cfg_index_t cfg_index3; - status = ucp_worker_get_rkey_config(worker(), &rkey_config_key, &cfg_index3); + status = ucp_worker_rkey_config_get(worker(), &rkey_config_key, NULL, + &cfg_index3); ASSERT_UCS_OK(status); EXPECT_NE(static_cast(cfg_index1), static_cast(cfg_index3)); } +UCS_TEST_P(test_ucp_proto, worker_print_info_rkey) +{ + ucp_rkey_config_key_t rkey_config_key; + + rkey_config_key.ep_cfg_index = 0; + rkey_config_key.md_map = 0; + rkey_config_key.mem_type = UCS_MEMORY_TYPE_HOST; + rkey_config_key.sys_dev = 
UCS_SYS_DEVICE_ID_UNKNOWN; + + /* similar configurations should return same index */ + ucp_worker_cfg_index_t cfg_index; + ucs_status_t status = ucp_worker_rkey_config_get(worker(), &rkey_config_key, + NULL, &cfg_index); + ASSERT_UCS_OK(status); + + ucp_worker_print_info(worker(), stdout); +} + UCP_INSTANTIATE_TEST_CASE(test_ucp_proto) diff --git a/test/gtest/ucp/test_ucp_rma.cc b/test/gtest/ucp/test_ucp_rma.cc index 2bc96698847..1f2e7a85b7e 100644 --- a/test/gtest/ucp/test_ucp_rma.cc +++ b/test/gtest/ucp/test_ucp_rma.cc @@ -71,8 +71,8 @@ class test_ucp_rma : public test_ucp_memheap { /* Memory type put/get is fully supported only with new protocols */ if (!enable_proto() && - (!UCP_MEM_IS_ACCESSIBLE_FROM_CPU(pairs[i][0]) || - !UCP_MEM_IS_ACCESSIBLE_FROM_CPU(pairs[i][1]))) { + (!UCP_MEM_IS_HOST(pairs[i][0]) || + !UCP_MEM_IS_HOST(pairs[i][1]))) { continue; } diff --git a/test/gtest/ucp/test_ucp_sockaddr.cc b/test/gtest/ucp/test_ucp_sockaddr.cc index 4ca59b2e7eb..c9785f185b9 100644 --- a/test/gtest/ucp/test_ucp_sockaddr.cc +++ b/test/gtest/ucp/test_ucp_sockaddr.cc @@ -17,9 +17,9 @@ extern "C" { #include #include #include +#include +#include #include -/* TODO: remove when it is not needed anymore */ -#include } #define UCP_INSTANTIATE_ALL_TEST_CASE(_test_case) \ @@ -44,9 +44,9 @@ class test_ucp_sockaddr : public ucp_test { }; enum { - TEST_MODIFIER_MASK = UCS_MASK(16), - TEST_MODIFIER_MT = UCS_BIT(16), - TEST_MODIFIER_CM = UCS_BIT(17) + TEST_MODIFIER_MASK = UCS_MASK(16), + TEST_MODIFIER_MT = UCS_BIT(16), + TEST_MODIFIER_CM_USE_ALL_DEVICES = UCS_BIT(17) }; enum { @@ -63,9 +63,9 @@ class test_ucp_sockaddr : public ucp_test { ucs::sock_addr_storage m_test_addr; void init() { - if (get_variant_value() & TEST_MODIFIER_CM) { - modify_config("SOCKADDR_CM_ENABLE", "yes"); - } + m_err_count = 0; + modify_config("KEEPALIVE_INTERVAL", "10s"); + modify_config("CM_USE_ALL_DEVICES", cm_use_all_devices() ? 
"y" : "n"); get_sockaddr(); ucp_test::init(); skip_loopback(); @@ -79,15 +79,37 @@ class test_ucp_sockaddr : public ucp_test { name + ",mt", MULTI_THREAD_WORKER); } + static void + get_test_variants_cm_mode(std::vector& variants, uint64_t features, + int modifier, const std::string& name) + { + get_test_variants_mt(variants, features, + modifier | TEST_MODIFIER_CM_USE_ALL_DEVICES, name); + get_test_variants_mt(variants, features, modifier, name + ",not_all_devs"); + } + static void get_test_variants(std::vector& variants, uint64_t features = UCP_FEATURE_TAG | UCP_FEATURE_STREAM) { - get_test_variants_mt(variants, features, CONN_REQ_TAG, "tag"); - get_test_variants_mt(variants, features, CONN_REQ_STREAM, "stream"); - get_test_variants_mt(variants, features, CONN_REQ_TAG | TEST_MODIFIER_CM, - "tag,cm"); - get_test_variants_mt(variants, features, CONN_REQ_STREAM | TEST_MODIFIER_CM, - "stream,cm"); + get_test_variants_cm_mode(variants, features, CONN_REQ_TAG, "tag"); + get_test_variants_cm_mode(variants, features, CONN_REQ_STREAM, "stream"); + } + + static ucs_log_func_rc_t + detect_warn_logger(const char *file, unsigned line, const char *function, + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) + { + if (level == UCS_LOG_LEVEL_WARN) { + std::string err_str = format_message(message, ap); + if (err_str.find("failed to connect CM lane on device") != + std::string::npos) { + UCS_TEST_MESSAGE << err_str; + return UCS_LOG_FUNC_RC_STOP; + } + } + return UCS_LOG_FUNC_RC_CONTINUE; } static ucs_log_func_rc_t @@ -105,6 +127,7 @@ class test_ucp_sockaddr : public ucp_test { stop_list.push_back("connection request failed on listener"); /* when the "peer failure" error happens, it is followed by: */ stop_list.push_back("received event RDMA_CM_EVENT_UNREACHABLE"); + stop_list.push_back("Connection reset by remote peer"); stop_list.push_back(ucs_status_string(UCS_ERR_UNREACHABLE)); stop_list.push_back(ucs_status_string(UCS_ERR_UNSUPPORTED)); } @@ -130,14 +153,9 @@ class test_ucp_sockaddr : public ucp_test { * only IPoIB IP addresses. therefore, if the interface * isn't as such, we continue to the next one. 
*/ skip = 1; - } else if (!ucs::is_rdmacm_netdev(ifa->ifa_name) && - !(get_variant_value() & TEST_MODIFIER_CM)) { - /* old client-server API (without CM) ran only with - * IPoIB/RoCE interface */ - skip = 1; } else if ((has_transport("tcp") || has_transport("all")) && (ifa->ifa_addr->sa_family == AF_INET6)) { - /* the tcp transport (and 'all' which may fallback to tcp_sockcmm) + /* the tcp transport (and 'all' which may fallback to tcp_sockcm) * can run either on an rdma-enabled interface (IPoIB/RoCE) * or any interface with IPv4 address because IPv6 isn't supported * by the tcp transport yet */ @@ -163,7 +181,8 @@ class test_ucp_sockaddr : public ucp_test { continue; } - saddrs.push_back(ucs::sock_addr_storage()); + saddrs.push_back(ucs::sock_addr_storage( + ucs::is_rdmacm_netdev(ifa->ifa_name))); status = ucs_sockaddr_sizeof(ifa->ifa_addr, &size); ASSERT_UCS_OK(status); saddrs.back().set_sock_addr(*ifa->ifa_addr, size); @@ -199,6 +218,10 @@ class test_ucp_sockaddr : public ucp_test { status = receiver().listen(cb_type, m_test_addr.get_sock_addr_ptr(), m_test_addr.get_addr_size(), get_server_ep_params()); + if (m_test_addr.get_port() == 0) { + /* any port can't be busy */ + break; + } } while ((status == UCS_ERR_BUSY) && (ucs_get_time() < deadline)); if (status == UCS_ERR_UNREACHABLE) { @@ -217,6 +240,23 @@ class test_ucp_sockaddr : public ucp_test { UCS_TEST_MESSAGE << "server listening on " << m_test_addr.to_str(); } + ucs_status_t create_listener_wrap_err(const ucp_listener_params_t ¶ms, + ucp_listener_h &listener) + { + scoped_log_handler wrap_err(wrap_errors_logger); + return ucp_listener_create(receiver().worker(), ¶ms, &listener); + } + + static void complete_err_handling_status_verify(ucs_status_t status) + { + EXPECT_TRUE(/* was successful */ + (status == UCS_OK) || + /* completed from error handling for EP */ + (status == UCS_ERR_ENDPOINT_TIMEOUT) || + (status == UCS_ERR_CONNECTION_RESET) || + (status == UCS_ERR_CANCELED)); + } + static void scomplete_cb(void *req, ucs_status_t status) { if ((status == UCS_OK) || @@ -227,12 +267,23 @@ class test_ucp_sockaddr : public ucp_test { UCS_TEST_ABORT("Error: " << ucs_status_string(status)); } + static void scomplete_err_handling_cb(void *req, ucs_status_t status) + { + complete_err_handling_status_verify(status); + } + static void rtag_complete_cb(void *req, ucs_status_t status, ucp_tag_recv_info_t *info) { EXPECT_UCS_OK(status); } + static void rtag_complete_err_handling_cb(void *req, ucs_status_t status, + ucp_tag_recv_info_t *info) + { + complete_err_handling_status_verify(status); + } + static void rstream_complete_cb(void *req, ucs_status_t status, size_t length) { @@ -326,10 +377,10 @@ class test_ucp_sockaddr : public ucp_test { * If so, skip the test since a valid error occurred - the one expected * from the error handling flow - cases of failure to handle long worker * address or transport doesn't support the error handling requirement */ - UCS_TEST_SKIP_R("Skipping due an unreachable destination (unsupported " - "feature or too long worker address or no " - "supported transport to send partial worker " - "address)"); + UCS_TEST_SKIP_R("Skipping due to an unreachable destination" + " (unsupported feature or too long worker address or" + " no supported transport to send partial worker" + " address)"); } else if ((send_status == UCS_ERR_REJECTED) && (cb_type == ucp_test_base::entity::LISTEN_CB_REJECT)) { return; @@ -400,14 +451,14 @@ class test_ucp_sockaddr : public ucp_test { virtual ucp_ep_params_t get_ep_params() { 
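        /* A minimal sketch of an error-handling callback compatible with the
         * UCP_EP_PARAM_FIELD_ERR_HANDLER setup below; the test installs its
         * own err_handler_cb, while this illustrative variant (ours, not from
         * the patch) simply records the failure status for the caller:
         *
         *     static void my_err_cb(void *arg, ucp_ep_h ep, ucs_status_t status)
         *     {
         *         *(ucs_status_t*)arg = status; // e.g. UCS_ERR_CONNECTION_RESET
         *     }
         */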
ucp_ep_params_t ep_params = ucp_test::get_ep_params(); - ep_params.field_mask |= UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE | - UCP_EP_PARAM_FIELD_ERR_HANDLER; + ep_params.field_mask |= UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE | + UCP_EP_PARAM_FIELD_ERR_HANDLER; /* The error handling requirement is needed since we need to take * care of a case where the client gets an error. In case ucp needs to * handle a large worker address but neither ud nor ud_x are present */ - ep_params.err_mode = UCP_ERR_HANDLING_MODE_PEER; - ep_params.err_handler.cb = err_handler_cb; - ep_params.err_handler.arg = NULL; + ep_params.err_mode = UCP_ERR_HANDLING_MODE_PEER; + ep_params.err_handler.cb = err_handler_cb; + ep_params.err_handler.arg = this; return ep_params; } @@ -415,9 +466,10 @@ class test_ucp_sockaddr : public ucp_test { return get_ep_params(); } - void client_ep_connect() + void client_ep_connect_basic(const ucp_ep_params_t &base_ep_params) { - ucp_ep_params_t ep_params = get_ep_params(); + ucp_ep_params_t ep_params = base_ep_params; + ep_params.field_mask |= UCP_EP_PARAM_FIELD_FLAGS | UCP_EP_PARAM_FIELD_SOCK_ADDR | UCP_EP_PARAM_FIELD_USER_DATA; @@ -425,9 +477,15 @@ class test_ucp_sockaddr : public ucp_test { ep_params.sockaddr.addr = m_test_addr.get_sock_addr_ptr(); ep_params.sockaddr.addrlen = m_test_addr.get_addr_size(); ep_params.user_data = &sender(); + sender().connect(&receiver(), ep_params); } + void client_ep_connect() + { + client_ep_connect_basic(get_ep_params()); + } + void connect_and_send_recv(bool wakeup, uint64_t flags) { { @@ -462,25 +520,28 @@ class test_ucp_sockaddr : public ucp_test { wait_for_reject(sender(), wakeup); } - void listen_and_communicate(bool wakeup, uint64_t flags) + void listen(ucp_test_base::entity::listen_cb_type_t cb_type) { UCS_TEST_MESSAGE << "Testing " << m_test_addr.to_str(); + start_listener(cb_type); + } - start_listener(cb_type()); + void listen_and_communicate(bool wakeup, uint64_t flags) + { + listen(cb_type()); connect_and_send_recv(wakeup, flags); } void listen_and_reject(bool wakeup) { - UCS_TEST_MESSAGE << "Testing " << m_test_addr.to_str(); - - start_listener(ucp_test_base::entity::LISTEN_CB_REJECT); + listen(ucp_test_base::entity::LISTEN_CB_REJECT); connect_and_reject(wakeup); } void one_sided_disconnect(entity &e, enum ucp_ep_close_mode mode) { void *req = e.disconnect_nb(0, 0, mode); - ucs_time_t deadline = ucs_time_from_sec(10.0) + ucs_get_time(); + ucs_time_t deadline = ucs::get_deadline(); + scoped_log_handler slh(detect_error_logger); while (!is_request_completed(req) && (ucs_get_time() < deadline)) { /* TODO: replace the progress() with e().progress() when async progress is implemented. 
*/ @@ -501,6 +562,7 @@ class test_ucp_sockaddr : public ucp_test { void *receiver_ep_close_req = receiver().disconnect_nb(0, 0, mode); ucs_time_t deadline = ucs::get_deadline(); + scoped_log_handler slh(detect_error_logger); while ((!is_request_completed(sender_ep_close_req) || !is_request_completed(receiver_ep_close_req)) && (ucs_get_time() < deadline)) { @@ -511,9 +573,52 @@ class test_ucp_sockaddr : public ucp_test { receiver().close_ep_req_free(receiver_ep_close_req); } + void setup_unreachable_listener() + { + ucs::sock_addr_storage listen_addr(m_test_addr.to_ucs_sock_addr()); + ucs_status_t status = receiver().listen(cb_type(), + m_test_addr.get_sock_addr_ptr(), + m_test_addr.get_addr_size(), + get_server_ep_params()); + if (status == UCS_ERR_UNREACHABLE) { + UCS_TEST_SKIP_R("cannot listen to " + m_test_addr.to_str()); + } + + /* make the client try to connect to a non-existing port on the server + * side */ + m_test_addr.set_port(1); + } + + static ucs_log_func_rc_t + detect_fail_no_err_cb(const char *file, unsigned line, const char *function, + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) + { + if (level == UCS_LOG_LEVEL_ERROR) { + std::string err_str = format_message(message, ap); + + if (err_str.find("on CM lane will not be handled since no error" + " callback is installed") != std::string::npos) { + UCS_TEST_MESSAGE << "< " << err_str << " >"; + ++m_err_count; + return UCS_LOG_FUNC_RC_STOP; + } + } + + return UCS_LOG_FUNC_RC_CONTINUE; + } + + static void close_completion(void *request, ucs_status_t status, + void *user_data) { + *reinterpret_cast(user_data) = true; + } + static void err_handler_cb(void *arg, ucp_ep_h ep, ucs_status_t status) { ucp_test::err_handler_cb(arg, ep, status); + ++m_err_count; + /* The current expected errors are only from the err_handle test * and from transports where the worker address is too long but ud/ud_x * are not present, or ud/ud_x are present but their addresses are too @@ -525,6 +630,7 @@ class test_ucp_sockaddr : public ucp_test { case UCS_ERR_UNREACHABLE: case UCS_ERR_CONNECTION_RESET: case UCS_ERR_NOT_CONNECTED: + case UCS_ERR_ENDPOINT_TIMEOUT: UCS_TEST_MESSAGE << "ignoring error " << ucs_status_string(status) << " on endpoint " << ep; return; @@ -555,22 +661,27 @@ class test_ucp_sockaddr : public ucp_test { bool nonparameterized_test() const { return (get_variant_value() != DEFAULT_PARAM_VARIANT) && - (get_variant_value() != (CONN_REQ_TAG | TEST_MODIFIER_CM)); + (get_variant_value() != CONN_REQ_TAG); } - bool no_close_protocol() const { - return !(get_variant_value() & TEST_MODIFIER_CM); + bool cm_use_all_devices() const { + return get_variant_value() & TEST_MODIFIER_CM_USE_ALL_DEVICES; } static void cmp_cfg_lanes(ucp_ep_config_key_t *key1, ucp_lane_index_t lane1, ucp_ep_config_key_t *key2, ucp_lane_index_t lane2) { EXPECT_TRUE(((lane1 == UCP_NULL_LANE) && (lane2 == UCP_NULL_LANE)) || ((lane1 != UCP_NULL_LANE) && (lane2 != UCP_NULL_LANE) && - ucp_ep_config_lane_is_peer_equal(key1, lane1, key2, lane2))); + ucp_ep_config_lane_is_peer_match(key1, lane1, key2, lane2))); } + +protected: + static unsigned m_err_count; }; -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, listen, no_close_protocol()) { +unsigned test_ucp_sockaddr::m_err_count = 0; + +UCS_TEST_P(test_ucp_sockaddr, listen) { listen_and_communicate(false, 0); } @@ -578,7 +689,7 @@ UCS_TEST_P(test_ucp_sockaddr, listen_c2s) { listen_and_communicate(false, SEND_DIRECTION_C2S); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, listen_s2c, 
no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr, listen_s2c) { listen_and_communicate(false, SEND_DIRECTION_S2C); } @@ -586,8 +697,7 @@ UCS_TEST_P(test_ucp_sockaddr, listen_bidi) { listen_and_communicate(false, SEND_DIRECTION_BIDI); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, onesided_disconnect, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr, onesided_disconnect) { listen_and_communicate(false, 0); one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); } @@ -597,8 +707,7 @@ UCS_TEST_P(test_ucp_sockaddr, onesided_disconnect_c2s) { one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, onesided_disconnect_s2c, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr, onesided_disconnect_s2c) { listen_and_communicate(false, SEND_DIRECTION_S2C); one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); } @@ -608,20 +717,51 @@ UCS_TEST_P(test_ucp_sockaddr, onesided_disconnect_bidi) { one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, concurrent_disconnect, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr, close_callback) { + listen_and_communicate(false, SEND_DIRECTION_BIDI); + + request_wait(receiver().flush_ep_nb()); + request_wait(sender().flush_ep_nb()); + ucp_ep_h ep = receiver().revoke_ep(); + + bool user_data = false; + + ucp_request_param_t param = {0}; + param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_USER_DATA | + UCP_OP_ATTR_FLAG_NO_IMM_CMPL; + param.cb.send = close_completion; + param.user_data = &user_data; + + ucs_status_ptr_t request = ucp_ep_close_nbx(ep, ¶m); + + bool is_pointer = UCS_PTR_IS_PTR(request); + request_wait(request); + + if (is_pointer) { + ASSERT_TRUE(user_data); + } +} + +UCS_TEST_P(test_ucp_sockaddr, onesided_disconnect_bidi_wait_err_cb) { + listen_and_communicate(false, SEND_DIRECTION_BIDI); + + one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); + wait_for_flag(&m_err_count); + EXPECT_EQ(1u, m_err_count); +} + +UCS_TEST_P(test_ucp_sockaddr, concurrent_disconnect) { listen_and_communicate(false, 0); concurrent_disconnect(UCP_EP_CLOSE_MODE_FLUSH); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, concurrent_disconnect_c2s, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr, concurrent_disconnect_c2s) { listen_and_communicate(false, SEND_DIRECTION_C2S); concurrent_disconnect(UCP_EP_CLOSE_MODE_FLUSH); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, concurrent_disconnect_s2c, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr, concurrent_disconnect_s2c) { listen_and_communicate(false, SEND_DIRECTION_S2C); concurrent_disconnect(UCP_EP_CLOSE_MODE_FLUSH); } @@ -631,20 +771,17 @@ UCS_TEST_P(test_ucp_sockaddr, concurrent_disconnect_bidi) { concurrent_disconnect(UCP_EP_CLOSE_MODE_FLUSH); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, concurrent_disconnect_force, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr, concurrent_disconnect_force) { listen_and_communicate(false, 0); concurrent_disconnect(UCP_EP_CLOSE_MODE_FORCE); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, concurrent_disconnect_force_c2s, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr, concurrent_disconnect_force_c2s) { listen_and_communicate(false, SEND_DIRECTION_C2S); concurrent_disconnect(UCP_EP_CLOSE_MODE_FORCE); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, concurrent_disconnect_force_s2c, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr, concurrent_disconnect_force_s2c) { listen_and_communicate(false, SEND_DIRECTION_S2C); 
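    /* Note on UCP_OP_ATTR_FLAG_NO_IMM_CMPL in the close_callback test above:
     * without the flag, an operation that completes immediately returns no
     * request and the callback never fires; with it, a request is always
     * returned and the callback runs exactly once. Caller-side pattern, as a
     * sketch (completion_cb and done_flag are hypothetical names):
     *
     *     ucp_request_param_t param = {0};
     *     param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK |
     *                          UCP_OP_ATTR_FIELD_USER_DATA |
     *                          UCP_OP_ATTR_FLAG_NO_IMM_CMPL;
     *     param.cb.send      = completion_cb;
     *     param.user_data    = &done_flag;
     *     void *req = ucp_ep_close_nbx(ep, &param); // never NULL here
     */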
concurrent_disconnect(UCP_EP_CLOSE_MODE_FORCE); } @@ -688,19 +825,9 @@ UCS_TEST_P(test_ucp_sockaddr, listener_query) { EXPECT_EQ(m_test_addr, listener_attr.sockaddr); } -UCS_TEST_P(test_ucp_sockaddr, err_handle) { - - ucs::sock_addr_storage listen_addr(m_test_addr.to_ucs_sock_addr()); - ucs_status_t status = receiver().listen(cb_type(), - m_test_addr.get_sock_addr_ptr(), - m_test_addr.get_addr_size(), - get_server_ep_params()); - if (status == UCS_ERR_UNREACHABLE) { - UCS_TEST_SKIP_R("cannot listen to " + m_test_addr.to_str()); - } - - /* make the client try to connect to a non-existing port on the server side */ - m_test_addr.set_port(1); +UCS_TEST_P(test_ucp_sockaddr, err_handle) +{ + setup_unreachable_listener(); { scoped_log_handler slh(wrap_errors_logger); @@ -712,8 +839,97 @@ UCS_TEST_P(test_ucp_sockaddr, err_handle) { EXPECT_EQ(1u, sender().get_err_num()); } +UCS_TEST_P(test_ucp_sockaddr, err_handle_without_err_cb) +{ + setup_unreachable_listener(); + + { + scoped_log_handler slh(detect_fail_no_err_cb); + ucp_ep_params_t ep_params = ucp_test::get_ep_params(); + + ep_params.field_mask |= UCP_EP_PARAM_FIELD_ERR_HANDLING_MODE; + ep_params.err_mode = UCP_ERR_HANDLING_MODE_PEER; + + client_ep_connect_basic(ep_params); + + /* allow for the unreachable event to arrive before restoring errors */ + wait_for_flag(&m_err_count); + if (m_err_count > 0) { + sender().add_err(UCS_ERR_CONNECTION_RESET); + } + } + + EXPECT_EQ(1u, sender().get_err_num()); +} + +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, listener_invalid_params, + nonparameterized_test(), "CM_REUSEADDR?=y") +{ + ucp_listener_params_t params; + ucp_listener_h listener; + ucs_status_t status; + + params.field_mask = 0; + /* address and conn/accept handlers are not specified */ + status = create_listener_wrap_err(params, listener); + EXPECT_EQ(UCS_ERR_INVALID_PARAM, status); + + /* add listen address, use ANY addr/port to avoid BUSY error in the end */ + m_test_addr.reset_to_any(); + m_test_addr.set_port(0); + params.field_mask = UCP_LISTENER_PARAM_FIELD_SOCK_ADDR; + params.sockaddr.addr = m_test_addr.get_sock_addr_ptr(); + params.sockaddr.addrlen = m_test_addr.get_addr_size(); + /* accept handlers aren't set */ + status = create_listener_wrap_err(params, listener); + EXPECT_EQ(UCS_ERR_INVALID_PARAM, status); + + /* define conn handler flag but set to NULL */ + params.field_mask = UCP_LISTENER_PARAM_FIELD_SOCK_ADDR | + UCP_LISTENER_PARAM_FIELD_CONN_HANDLER; + params.conn_handler.cb = NULL; + params.conn_handler.arg = NULL; + status = create_listener_wrap_err(params, listener); + EXPECT_EQ(UCS_ERR_INVALID_PARAM, status); + + /* define both conn and accept handlers to NULL */ + params.field_mask = UCP_LISTENER_PARAM_FIELD_SOCK_ADDR | + UCP_LISTENER_PARAM_FIELD_CONN_HANDLER | + UCP_LISTENER_PARAM_FIELD_ACCEPT_HANDLER; + params.accept_handler.cb = NULL; + params.accept_handler.arg = NULL; + status = create_listener_wrap_err(params, listener); + EXPECT_EQ(UCS_ERR_INVALID_PARAM, status); + + /* define both conn and accept handlers to valid callbacks + * (should be only 1) */ + params.field_mask = UCP_LISTENER_PARAM_FIELD_SOCK_ADDR | + UCP_LISTENER_PARAM_FIELD_CONN_HANDLER | + UCP_LISTENER_PARAM_FIELD_ACCEPT_HANDLER; + params.conn_handler.cb = + (ucp_listener_conn_callback_t)ucs_empty_function; + params.accept_handler.cb = + (ucp_listener_accept_callback_t)ucs_empty_function; + status = create_listener_wrap_err(params, listener); + EXPECT_EQ(UCS_ERR_INVALID_PARAM, status); + + /* sockaddr and valid conn handler is OK */ + params.field_mask = 
UCP_LISTENER_PARAM_FIELD_SOCK_ADDR | + UCP_LISTENER_PARAM_FIELD_CONN_HANDLER; + status = create_listener_wrap_err(params, listener); + ASSERT_UCS_OK(status); + ucp_listener_destroy(listener); + + /* sockaddr and valid accept handler is OK */ + params.field_mask = UCP_LISTENER_PARAM_FIELD_SOCK_ADDR | + UCP_LISTENER_PARAM_FIELD_ACCEPT_HANDLER; + status = create_listener_wrap_err(params, listener); + ASSERT_UCS_OK(status); + ucp_listener_destroy(listener); +} + UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, compare_cm_and_wireup_configs, - no_close_protocol()) { + !cm_use_all_devices()) { ucp_worker_cfg_index_t cm_ep_cfg_index, wireup_ep_cfg_index; ucp_ep_config_key_t *cm_ep_cfg_key, *wireup_ep_cfg_key; @@ -721,12 +937,6 @@ UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, compare_cm_and_wireup_configs, listen_and_communicate(false, SEND_DIRECTION_C2S); cm_ep_cfg_index = sender().ep()->cfg_index; cm_ep_cfg_key = &ucp_ep_config(sender().ep())->key; - /* TODO: remove the SKIP below and include for - * header file, when CONNECT_TO_EP support is added for TCP */ - if (sender().ep()->uct_eps[ucp_ep_get_cm_lane(sender().ep())] - ->iface->ops.ep_disconnect == uct_tcp_sockcm_ep_disconnect) { - UCS_TEST_SKIP_R("don't test TCP SOCKCM"); - } EXPECT_NE(UCP_NULL_LANE, ucp_ep_get_cm_lane(sender().ep())); disconnect(sender()); disconnect(receiver()); @@ -792,15 +1002,194 @@ UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, compare_cm_and_wireup_configs, } } +UCS_TEST_SKIP_COND_P(test_ucp_sockaddr, connect_and_fail_wireup, + !cm_use_all_devices()) +{ + start_listener(cb_type()); + + scoped_log_handler slh(wrap_errors_logger); + client_ep_connect(); + if (!wait_for_server_ep(false)) { + UCS_TEST_SKIP_R("cannot connect to server"); + } + + ucp_lane_index_t am_lane = ucp_ep_get_wireup_msg_lane(sender().ep()); + uct_ep_h uct_ep = sender().ep()->uct_eps[am_lane]; + + /* Emulate failure of WIREUP MSG sending */ + uct_ep->iface->ops.ep_am_bcopy = reinterpret_cast( + ucs_empty_function_return_bc_ep_timeout); + + while (!(sender().ep()->flags & UCP_EP_FLAG_CONNECT_REQ_QUEUED)) { + progress(); + } + + concurrent_disconnect(UCP_EP_CLOSE_MODE_FORCE); +} + UCP_INSTANTIATE_ALL_TEST_CASE(test_ucp_sockaddr) + + +class test_ucp_sockaddr_different_tl_rsc : public test_ucp_sockaddr +{ +public: + static void get_test_variants(std::vector& variants) + { + uint64_t features = UCP_FEATURE_STREAM | UCP_FEATURE_TAG; + test_ucp_sockaddr::get_test_variants_cm_mode(variants, features, + UNSET_SELF_DEVICES, + "unset_self_devices"); + test_ucp_sockaddr::get_test_variants_cm_mode(variants, features, + UNSET_SHM_DEVICES, + "unset_shm_devices"); + test_ucp_sockaddr::get_test_variants_cm_mode(variants, features, + UNSET_SELF_DEVICES | + UNSET_SHM_DEVICES, + "unset_self_shm_devices"); + } + +protected: + enum { + UNSET_SELF_DEVICES = UCS_BIT(0), + UNSET_SHM_DEVICES = UCS_BIT(1) + }; + + void init() + { + m_err_count = 0; + get_sockaddr(); + test_base::init(); + // entities will be created in a test + } +}; + + +UCS_TEST_P(test_ucp_sockaddr_different_tl_rsc, unset_devices_and_communicate) +{ + int variants = get_variant_value(); + + // create entities with different sets of MDs and TL resources on a client + // and on a server to test non-homogeneous setups + if (variants & UNSET_SELF_DEVICES) { + if (is_self()) { + UCS_TEST_SKIP_R("unable to run test for self transport with unset" + " self devices"); + } + + modify_config("SELF_DEVICES", ""); + } + if (variants & UNSET_SHM_DEVICES) { + modify_config("SHM_DEVICES", ""); + } + push_config(); + + // create a client with
restrictions + create_entity(); + + pop_config(); + + // create a server without restrictions + if (!is_self()) { + create_entity(); + } + + skip_loopback(); + listen_and_communicate(false, SEND_DIRECTION_BIDI); +} + +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_sockaddr_different_tl_rsc, all, "all") + + +class test_ucp_sockaddr_cm_private_data : public test_ucp_sockaddr { +protected: + ucp_rsc_index_t get_num_cms() + { + const ucp_worker_h worker = sender().worker(); + ucp_rsc_index_t num_cm_cmpts = ucp_worker_num_cm_cmpts(worker); + ucp_rsc_index_t num_cms = 0; + + for (ucp_rsc_index_t cm_idx = 0; cm_idx < num_cm_cmpts; ++cm_idx) { + if (worker->cms[cm_idx].cm != NULL) { + num_cms++; + } + } + + return num_cms; + } + + void check_cm_fallback() + { + if (get_num_cms() < 2) { + UCS_TEST_SKIP_R("No CM for fallback to"); + } + + if (!m_test_addr.is_rdmacm_netdev()) { + UCS_TEST_SKIP_R("RDMACM isn't allowed to be used on " + + m_test_addr.to_str()); + } + } + + void check_rdmacm() + { + ucp_rsc_index_t num_cm_cmpts = receiver().ucph()->config.num_cm_cmpts; + ucp_rsc_index_t cm_idx; + + if (!m_test_addr.is_rdmacm_netdev()) { + UCS_TEST_SKIP_R("RDMACM isn't allowed to be used on " + + m_test_addr.to_str()); + } + + for (cm_idx = 0; cm_idx < num_cm_cmpts; ++cm_idx) { + if (sender().worker()->cms[cm_idx].cm == NULL) { + continue; + } + + std::string cm_name = ucp_context_cm_name(sender().ucph(), cm_idx); + if (cm_name.compare("rdmacm") == 0) { + break; + } + } + + if (cm_idx == num_cm_cmpts) { + UCS_TEST_SKIP_R("No RDMACM to check address packing"); + } + } +}; + +UCS_TEST_P(test_ucp_sockaddr_cm_private_data, + short_cm_private_data_fallback_to_next_cm, + "TCP_CM_PRIV_DATA_LEN?=16", "SOCKADDR_TLS_PRIORITY=tcp,rdmacm") +{ + check_cm_fallback(); + listen_and_communicate(false, SEND_DIRECTION_BIDI); + concurrent_disconnect(UCP_EP_CLOSE_MODE_FLUSH); +} + +UCS_TEST_P(test_ucp_sockaddr_cm_private_data, + create_multiple_lanes_no_fallback_to_next_cm, "TLS=ud,rc,sm", + "NUM_EPS=128", "SOCKADDR_TLS_PRIORITY=rdmacm") +{ + check_rdmacm(); + listen_and_communicate(false, SEND_DIRECTION_BIDI); + concurrent_disconnect(UCP_EP_CLOSE_MODE_FLUSH); +} + +UCS_TEST_P(test_ucp_sockaddr_cm_private_data, + create_multiple_lanes_have_fallback_to_next_cm, "TLS=ud,rc,sm,tcp", + "NUM_EPS=128", "SOCKADDR_TLS_PRIORITY=rdmacm,tcp") +{ + check_cm_fallback(); + listen_and_communicate(false, SEND_DIRECTION_BIDI); + concurrent_disconnect(UCP_EP_CLOSE_MODE_FLUSH); +} + +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_sockaddr_cm_private_data, all, "all") + + class test_ucp_sockaddr_destroy_ep_on_err : public test_ucp_sockaddr { public: test_ucp_sockaddr_destroy_ep_on_err() { - /* Set small TL timeouts to reduce testing time */ - m_env.push_back(new ucs::scoped_setenv("UCX_RC_TIMEOUT", "10ms")); - m_env.push_back(new ucs::scoped_setenv("UCX_RC_RNR_TIMEOUT", "10ms")); - m_env.push_back(new ucs::scoped_setenv("UCX_RC_RETRY_COUNT", "2")); + set_tl_timeouts(m_env); } virtual ucp_ep_params_t get_server_ep_params() { @@ -826,90 +1215,128 @@ class test_ucp_sockaddr_destroy_ep_on_err : public test_ucp_sockaddr { ucs::ptr_vector m_env; }; -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, empty, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_destroy_ep_on_err, empty) { listen_and_communicate(false, 0); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, s2c, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_destroy_ep_on_err, s2c) { listen_and_communicate(false, SEND_DIRECTION_S2C); } 
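/* The private-data fallback exercised by test_ucp_sockaddr_cm_private_data
 * above can be reproduced outside the gtest harness by exporting the same
 * knobs process-wide before ucp_init(). A minimal sketch, assuming the
 * UCX_-prefixed forms of the per-test variables ("TCP_CM_PRIV_DATA_LEN?=16"
 * maps to UCX_TCP_CM_PRIV_DATA_LEN, and so on; setenv is from <stdlib.h>): */
static void setup_cm_fallback_env(void)
{
    /* try the tcp CM first, keep rdmacm as the fallback */
    setenv("UCX_SOCKADDR_TLS_PRIORITY", "tcp,rdmacm", 1);
    /* 16 bytes is too small for the packed address, so the tcp CM gives
     * up and the next CM in the priority list is attempted */
    setenv("UCX_TCP_CM_PRIV_DATA_LEN", "16", 1);
}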
-UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, c2s, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_destroy_ep_on_err, c2s) { listen_and_communicate(false, SEND_DIRECTION_C2S); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, bidi, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_destroy_ep_on_err, bidi) { listen_and_communicate(false, SEND_DIRECTION_BIDI); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_client_cforce, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_client_cforce) { listen_and_communicate(false, 0); scoped_log_handler slh(wrap_errors_logger); one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FORCE); one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FLUSH); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_c2s_cforce, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_c2s_cforce) { listen_and_communicate(false, SEND_DIRECTION_C2S); scoped_log_handler slh(wrap_errors_logger); one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FORCE); one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FLUSH); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_s2c_cforce, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_s2c_cforce) { listen_and_communicate(false, SEND_DIRECTION_S2C); scoped_log_handler slh(wrap_errors_logger); one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FORCE); one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FLUSH); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_bidi_cforce, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_bidi_cforce) { listen_and_communicate(false, SEND_DIRECTION_BIDI); scoped_log_handler slh(wrap_errors_logger); one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FORCE); one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FLUSH); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_client_sforce, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_client_sforce) { listen_and_communicate(false, 0); scoped_log_handler slh(wrap_errors_logger); one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FORCE); one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_c2s_sforce, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_c2s_sforce) { listen_and_communicate(false, SEND_DIRECTION_C2S); scoped_log_handler slh(wrap_errors_logger); one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FORCE); one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_s2c_sforce, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_s2c_sforce) { listen_and_communicate(false, SEND_DIRECTION_S2C); scoped_log_handler slh(wrap_errors_logger); one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FORCE); one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_bidi_sforce, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_destroy_ep_on_err, onesided_bidi_sforce) { listen_and_communicate(false, SEND_DIRECTION_BIDI); scoped_log_handler slh(wrap_errors_logger); one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FORCE); one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FLUSH); }
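/* The FORCE/FLUSH pairs above map onto the NBX close API roughly as follows;
 * a hedged sketch (the wrapper is ours, the flag and field names are the
 * public ones from ucp.h): */
static void *close_ep_nbx(ucp_ep_h ep, int force)
{
    ucp_request_param_t param;

    param.op_attr_mask = force ? UCP_OP_ATTR_FIELD_FLAGS : 0;
    param.flags        = force ? UCP_EP_CLOSE_FLAG_FORCE : 0;
    /* FORCE discards outstanding operations and releases the endpoint
     * immediately; the default (flush) mode completes them first */
    return ucp_ep_close_nbx(ep, &param);
}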
+/* This test checks that a client disconnection works fine when the server + * has received a connection request but the connection wasn't fully + * established yet */ +UCS_TEST_P(test_ucp_sockaddr_destroy_ep_on_err, create_and_destroy_immediately) +{ + ucp_test_base::entity::listen_cb_type_t listen_cb_type = cb_type(); + + listen(listen_cb_type); + + { + scoped_log_handler warn_slh(detect_warn_logger); + scoped_log_handler error_slh(detect_error_logger); + client_ep_connect(); + + if (listen_cb_type == ucp_test_base::entity::LISTEN_CB_CONN) { + /* Wait until either the connection to the peer fails (e.g. there is + * no TL to create after the CM connection was established) or a + * connection request is delivered to UCP */ + while ((m_err_count == 0) && + receiver().is_conn_reqs_queue_empty()) { + progress(); + } + } else { + /* Wait for the EP to be created on the server side */ + ASSERT_EQ(ucp_test_base::entity::LISTEN_CB_EP, listen_cb_type); + if (!wait_for_server_ep(false)) { + UCS_TEST_SKIP_R("cannot connect to server"); + } + } + + /* Disconnect from the peer while the connection is not fully + * established yet */ + one_sided_disconnect(sender(), UCP_EP_CLOSE_MODE_FORCE); + + /* Wait until either accepting the connection fails on the server side, + * or the server detects the disconnect in case the connection was + * established successfully */ + ucs_time_t loop_end_limit = ucs_get_time() + ucs_time_from_sec(10.0); + while ((ucs_get_time() < loop_end_limit) && + (m_err_count == 0) && (receiver().get_accept_err_num() == 0)) { + progress(); + } + + EXPECT_TRUE((m_err_count != 0) || + (receiver().get_accept_err_num() != 0)); + } + + /* Disconnect from the client if a connection was established */ + one_sided_disconnect(receiver(), UCP_EP_CLOSE_MODE_FORCE); +} + UCP_INSTANTIATE_ALL_TEST_CASE(test_ucp_sockaddr_destroy_ep_on_err) class test_ucp_sockaddr_with_wakeup : public test_ucp_sockaddr { @@ -921,8 +1348,7 @@ class test_ucp_sockaddr_with_wakeup : public test_ucp_sockaddr { } }; -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_with_wakeup, wakeup, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_with_wakeup, wakeup) { listen_and_communicate(true, 0); } @@ -930,8 +1356,7 @@ UCS_TEST_P(test_ucp_sockaddr_with_wakeup, wakeup_c2s) { listen_and_communicate(true, SEND_DIRECTION_C2S); } -UCS_TEST_SKIP_COND_P(test_ucp_sockaddr_with_wakeup, wakeup_s2c, - no_close_protocol()) { +UCS_TEST_P(test_ucp_sockaddr_with_wakeup, wakeup_s2c) { listen_and_communicate(true, SEND_DIRECTION_S2C); } @@ -988,14 +1413,17 @@ UCP_INSTANTIATE_ALL_TEST_CASE(test_ucp_sockaddr_with_rma_atomic) class test_ucp_sockaddr_protocols : public test_ucp_sockaddr { public: + virtual ~test_ucp_sockaddr_protocols() { } + static void get_test_variants(std::vector& variants) { /* Atomics not supported for now because we need to emulate the case * of using a different device than the one selected by default on the * worker for atomic operations */ uint64_t features = UCP_FEATURE_TAG | UCP_FEATURE_STREAM | UCP_FEATURE_RMA | UCP_FEATURE_AM; - test_ucp_sockaddr::get_test_variants_mt(variants, features, - TEST_MODIFIER_CM, ""); + + add_variant_with_value(variants, features, TEST_MODIFIER_MT, + "mt", MULTI_THREAD_WORKER); } virtual void init() { @@ -1030,36 +1458,144 @@ class test_ucp_sockaddr_protocols : public test_ucp_sockaddr { << "recv_buf: '" << ucs::compact_string(recv_buf, 20) << "'"; } - void test_tag_send_recv(size_t size, bool is_exp, bool is_sync = false) + typedef void (*stop_cb_t)(void *arg); + + void *do_unexp_recv(std::string &recv_buf, size_t size, void *sreq, + bool send_stop, bool recv_stop) { - std::string send_buf(size, 
'x'); - std::string recv_buf(size, 'y'); + ucp_tag_recv_info_t recv_info = {}; + bool err_handling = send_stop || recv_stop; + ucp_tag_message_h message; + + do { + short_progress_loop(); + message = ucp_tag_probe_nb(receiver().worker(), + 0, 0, 1, &recv_info); + } while (message == NULL); - void *rreq = NULL, *sreq = NULL; + EXPECT_EQ(size, recv_info.length); + EXPECT_EQ(0, recv_info.sender_tag); - if (is_exp) { - rreq = ucp_tag_recv_nb(receiver().worker(), &recv_buf[0], size, - ucp_dt_make_contig(1), 0, 0, rtag_complete_cb); + if (recv_stop) { + disconnect(*this, receiver()); } - if (is_sync) { - sreq = ucp_tag_send_sync_nb(sender().ep(), &send_buf[0], size, - ucp_dt_make_contig(1), 0, scomplete_cb); - } else { - sreq = ucp_tag_send_nb(sender().ep(), &send_buf[0], size, - ucp_dt_make_contig(1), 0, scomplete_cb); + if (send_stop) { + disconnect(*this, sender()); } - if (!is_exp) { - short_progress_loop(); - rreq = ucp_tag_recv_nb(receiver().worker(), &recv_buf[0], size, - ucp_dt_make_contig(1), 0, 0, rtag_complete_cb); + ucp_request_param_t recv_param = {}; + recv_param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK; + /* TODO: remove casting when changed to using NBX API */ + recv_param.cb.recv = reinterpret_cast + ( + !err_handling ? rtag_complete_cb : + rtag_complete_err_handling_cb); + return ucp_tag_msg_recv_nbx(receiver().worker(), &recv_buf[0], size, + message, &recv_param); + } + + void sreq_release(void *sreq) { + if ((sreq == NULL) || !UCS_PTR_IS_PTR(sreq)) { + return; + } + + if (ucp_request_check_status(sreq) == UCS_INPROGRESS) { + ucp_request_t *req = (ucp_request_t*)sreq - 1; + req->flags |= UCP_REQUEST_FLAG_COMPLETED; + + ucp_request_t *req_from_id; + ucs_status_t status = ucp_send_request_get_by_id( + sender().worker(), req->id,&req_from_id, 1); + if (status == UCS_OK) { + EXPECT_EQ(req, req_from_id); + } } + ucp_request_release(sreq); + } + + void extra_send_before_disconnect(entity &e, const std::string &send_buf, + const ucp_request_param_t &send_param) + { + void *sreq = ucp_tag_send_nbx(e.ep(), &send_buf[0], send_buf.size(), 0, + &send_param); request_wait(sreq); - request_wait(rreq); - compare_buffers(send_buf, recv_buf); + e.disconnect_nb(0, 0, UCP_EP_CLOSE_MODE_FORCE); + } + + void test_tag_send_recv(size_t size, bool is_exp, bool is_sync = false, + bool send_stop = false, bool recv_stop = false) + { + bool err_handling_test = send_stop || recv_stop; + unsigned num_iters = err_handling_test ? 1 : m_num_iters; + + /* send multiple messages to test the protocol both before and after + * connection establishment */ + for (int i = 0; i < num_iters; i++) { + std::string send_buf(size, 'x'); + std::string recv_buf(size, 'y'); + + void *rreq = NULL, *sreq = NULL; + std::vector reqs; + + ucs::auto_ptr slh; + if (err_handling_test) { + slh.reset(new scoped_log_handler(wrap_errors_logger)); + } + + if (is_exp) { + rreq = ucp_tag_recv_nb(receiver().worker(), &recv_buf[0], size, + ucp_dt_make_contig(1), 0, 0, + rtag_complete_cb); + reqs.push_back(rreq); + } + + ucp_request_param_t send_param = {}; + send_param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK; + /* TODO: remove casting when changed to using NBX API */ + send_param.cb.send = reinterpret_cast + ( + !err_handling_test ? 
scomplete_cb : + scomplete_err_handling_cb); + if (is_sync) { + sreq = ucp_tag_send_sync_nbx(sender().ep(), &send_buf[0], size, 0, + &send_param); + } else { + sreq = ucp_tag_send_nbx(sender().ep(), &send_buf[0], size, 0, + &send_param); + } + reqs.push_back(sreq); + + if (!is_exp) { + rreq = do_unexp_recv(recv_buf, size, sreq, send_stop, + recv_stop); + reqs.push_back(rreq); + } + + /* Wait for completion of the send and receive requests. Each request + * may complete with one of the following statuses (see the + * tolerant-wait sketch below): + * - UCS_OK, when it was successfully sent before a peer failure was + * detected + * - UCS_ERR_CANCELED, when it was purged from a UCP EP's list of + * tracked requests + * - UCS_ERR_* (e.g. UCS_ERR_ENDPOINT_TIMEOUT), when it was + * completed by a UCT transport with an error */ + requests_wait(reqs); + + if (!err_handling_test) { + compare_buffers(send_buf, recv_buf); + } else { + wait_for_flag(&m_err_count); + + if (send_stop == false) { + extra_send_before_disconnect(sender(), send_buf, send_param); + } else if (recv_stop == false) { + extra_send_before_disconnect(receiver(), send_buf, send_param); + } + } + } } void wait_for_server_ep() @@ -1071,32 +1607,40 @@ class test_ucp_sockaddr_protocols : public test_ucp_sockaddr { void test_stream_send_recv(size_t size, bool is_exp) { - std::string send_buf(size, 'x'); - std::string recv_buf(size, 'y'); - size_t recv_length; - void *rreq, *sreq; - - if (is_exp) { - wait_for_server_ep(); - rreq = ucp_stream_recv_nb(receiver().ep(), &recv_buf[0], size, - ucp_dt_make_contig(1), rstream_complete_cb, - &recv_length, UCP_STREAM_RECV_FLAG_WAITALL); - sreq = ucp_stream_send_nb(sender().ep(), &send_buf[0], size, - ucp_dt_make_contig(1), scomplete_cb, 0); - } else { - sreq = ucp_stream_send_nb(sender().ep(), &send_buf[0], size, - ucp_dt_make_contig(1), scomplete_cb, 0); - short_progress_loop(); - wait_for_server_ep(); - rreq = ucp_stream_recv_nb(receiver().ep(), &recv_buf[0], size, - ucp_dt_make_contig(1), rstream_complete_cb, - &recv_length, UCP_STREAM_RECV_FLAG_WAITALL); - } + /* send multiple messages to test the protocol both before and after + * connection establishment */ + for (int i = 0; i < m_num_iters; i++) { + std::string send_buf(size, 'x'); + std::string recv_buf(size, 'y'); + size_t recv_length; + void *rreq, *sreq; + + if (is_exp) { + wait_for_server_ep(); + rreq = ucp_stream_recv_nb(receiver().ep(), &recv_buf[0], size, + ucp_dt_make_contig(1), + rstream_complete_cb, &recv_length, + UCP_STREAM_RECV_FLAG_WAITALL); + sreq = ucp_stream_send_nb(sender().ep(), &send_buf[0], size, + ucp_dt_make_contig(1), scomplete_cb, + 0); + } else { + sreq = ucp_stream_send_nb(sender().ep(), &send_buf[0], size, + ucp_dt_make_contig(1), scomplete_cb, + 0); + short_progress_loop(); + wait_for_server_ep(); + rreq = ucp_stream_recv_nb(receiver().ep(), &recv_buf[0], size, + ucp_dt_make_contig(1), + rstream_complete_cb, &recv_length, + UCP_STREAM_RECV_FLAG_WAITALL); + } - request_wait(sreq); - request_wait(rreq); + request_wait(sreq); + request_wait(rreq); - compare_buffers(send_buf, recv_buf); + compare_buffers(send_buf, recv_buf); + } } void register_mem(entity* initiator, entity* target, void *buffer, @@ -1125,47 +1669,56 @@ class test_ucp_sockaddr_protocols : public test_ucp_sockaddr { void test_rma(size_t size, rma_nb_func_t rma_func) { - std::string send_buf(size, 'x'); - std::string recv_buf(size, 'y'); + /* send multiple messages to test the protocol both before and after + * connection establishment */ + for (int i = 0; i < m_num_iters; i++) {
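            /* The tolerant wait referenced in test_tag_send_recv above, as a
             * sketch (helper name is ours; the semantics are assumed from
             * requests_wait): poll the worker until the request leaves
             * UCS_INPROGRESS, accepting error statuses instead of asserting
             * UCS_OK:
             *
             *     ucs_status_t wait_any_status(ucp_worker_h worker, void *req)
             *     {
             *         if (!UCS_PTR_IS_PTR(req)) {
             *             return UCS_PTR_STATUS(req); // immediate completion
             *         }
             *         ucs_status_t status;
             *         while ((status = ucp_request_check_status(req)) ==
             *                UCS_INPROGRESS) {
             *             ucp_worker_progress(worker);
             *         }
             *         ucp_request_free(req);
             *         return status; // UCS_OK, UCS_ERR_CANCELED, UCS_ERR_...
             *     }
             */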
std::string send_buf(size, 'x'); + std::string recv_buf(size, 'y'); - ucp_mem_h memh; - ucp_rkey_h rkey; + ucp_mem_h memh; + ucp_rkey_h rkey; - register_mem(&sender(), &receiver(), &recv_buf[0], size, &memh, &rkey); + register_mem(&sender(), &receiver(), &recv_buf[0], size, &memh, + &rkey); - std::vector reqs; - (this->*rma_func)(send_buf, recv_buf, rkey, reqs); + std::vector reqs; + (this->*rma_func)(send_buf, recv_buf, rkey, reqs); - while (!reqs.empty()) { - request_wait(reqs.back()); - reqs.pop_back(); - } + while (!reqs.empty()) { + request_wait(reqs.back()); + reqs.pop_back(); + } - compare_buffers(send_buf, recv_buf); + compare_buffers(send_buf, recv_buf); - ucp_rkey_destroy(rkey); - ucs_status_t status = ucp_mem_unmap(receiver().ucph(), memh); - ASSERT_UCS_OK(status); + ucp_rkey_destroy(rkey); + ucs_status_t status = ucp_mem_unmap(receiver().ucph(), memh); + ASSERT_UCS_OK(status); + } } void test_am_send_recv(size_t size, size_t hdr_size = 0ul) { - std::string sb(size, 'x'); - std::string hdr(hdr_size, 'x'); + /* send multiple messages to test the protocol both before and after + * connection establishment */ + for (int i = 0; i < m_num_iters; i++) { + std::string sb(size, 'x'); + std::string hdr(hdr_size, 'x'); - bool am_received = false; + bool am_received = false; - set_am_data_handler(receiver(), 0, rx_am_msg_cb, &am_received); + set_am_data_handler(receiver(), 0, rx_am_msg_cb, &am_received); - ucp_request_param_t param = {}; - ucs_status_ptr_t sreq = ucp_am_send_nbx(sender().ep(), 0, &hdr[0], - hdr_size, &sb[0], size, - ¶m); - request_wait(sreq); - wait_for_flag(&am_received); - EXPECT_TRUE(am_received); + ucp_request_param_t param = {}; + ucs_status_ptr_t sreq = ucp_am_send_nbx(sender().ep(), 0, + &hdr[0], hdr_size, + &sb[0], size, ¶m); + request_wait(sreq); + wait_for_flag(&am_received); + EXPECT_TRUE(am_received); - set_am_data_handler(receiver(), 0, NULL, NULL); + set_am_data_handler(receiver(), 0, NULL, NULL); + } } private: @@ -1194,8 +1747,28 @@ class test_ucp_sockaddr_protocols : public test_ucp_sockaddr { param.arg = arg; ASSERT_UCS_OK(ucp_worker_set_am_recv_handler(e.worker(), ¶m)); } + +protected: + enum { + SEND_STOP = UCS_BIT(0), + RECV_STOP = UCS_BIT(1) + }; + + static void disconnect(test_ucp_sockaddr_protocols &test, entity &e) { + test.one_sided_disconnect(e, UCP_EP_CLOSE_MODE_FORCE); + while (m_err_count == 0) { + test.short_progress_loop(); + } + } + +private: + static const unsigned m_num_iters; }; + +const unsigned test_ucp_sockaddr_protocols::m_num_iters = 10; + + UCS_TEST_P(test_ucp_sockaddr_protocols, stream_short_exp) { test_stream_send_recv(1, true); @@ -1366,16 +1939,178 @@ UCS_TEST_P(test_ucp_sockaddr_protocols, am_zcopy_64k, } - /* For DC case, allow fallback to UD if DC is not supported */ #define UCP_INSTANTIATE_CM_TEST_CASE(_test_case) \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, dcudx, "dc_x,ud") \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, ud, "ud_v") \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, udx, "ud_x") \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, rc, "rc_v") \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, rcx, "rc_x") \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, ib, "ib") \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, tcp, "tcp") \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, all, "all") + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, dcudx, "dc_x,ud") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, ud, "ud_v") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, udx, "ud_x") \ + 
UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, rc, "rc_v") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, rcx, "rc_x") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, ib, "ib") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, tcp, "tcp") \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, all, "all") UCP_INSTANTIATE_CM_TEST_CASE(test_ucp_sockaddr_protocols) + + +class test_ucp_sockaddr_protocols_diff_config : public test_ucp_sockaddr_protocols +{ +public: + void init() { + if (is_self()) { + UCS_TEST_SKIP_R("self - same config"); + } + + m_err_count = 0; + get_sockaddr(); + test_base::init(); + } + + void init_entity(const char *num_paths) { + /* coverity[tainted_string_argument] */ + ucs::scoped_setenv num_paths_env("UCX_IB_NUM_PATHS", num_paths); + create_entity(); + } + + void create_entities_and_connect(bool server_less_num_paths) { + /* coverity[tainted_string_argument] */ + ucs::scoped_setenv max_eager_lanes_env("UCX_MAX_EAGER_LANES", "2"); + + if (server_less_num_paths) { + // create the client + init_entity("2"); + // create the server + init_entity("1"); + } else { + // create the client + init_entity("1"); + // create the server + init_entity("2"); + } + + start_listener(cb_type()); + client_ep_connect(); + } +}; + + +UCS_TEST_P(test_ucp_sockaddr_protocols_diff_config, + diff_num_paths_small_msg_server_less_lanes) +{ + create_entities_and_connect(true); + test_tag_send_recv(4 * UCS_KBYTE, false, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols_diff_config, + diff_num_paths_large_msg_server_less_lanes) +{ + create_entities_and_connect(true); + test_tag_send_recv(4 * UCS_MBYTE, false, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols_diff_config, + diff_num_paths_small_msg_server_more_lanes) +{ + create_entities_and_connect(false); + test_tag_send_recv(4 * UCS_KBYTE, false, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols_diff_config, + diff_num_paths_large_msg_server_more_lanes) +{ + create_entities_and_connect(false); + test_tag_send_recv(4 * UCS_MBYTE, false, false); +} + +UCP_INSTANTIATE_CM_TEST_CASE(test_ucp_sockaddr_protocols_diff_config) + + +class test_ucp_sockaddr_protocols_err : public test_ucp_sockaddr_protocols { +public: + static void get_test_variants(std::vector& variants) { + uint64_t features = UCP_FEATURE_TAG; + test_ucp_sockaddr::get_test_variants_cm_mode(variants, features, + SEND_STOP, "send_stop"); + test_ucp_sockaddr::get_test_variants_cm_mode(variants, features, + RECV_STOP, "recv_stop"); + test_ucp_sockaddr::get_test_variants_cm_mode(variants, features, + SEND_STOP | RECV_STOP, + "bidi_stop"); + } + +protected: + test_ucp_sockaddr_protocols_err() { + set_tl_timeouts(m_env); + } + + void test_tag_send_recv(size_t size, bool is_exp, + bool is_sync = false) { + /* warmup */ + test_ucp_sockaddr_protocols::test_tag_send_recv(size, is_exp, is_sync); + + /* run error-handling test */ + int variants = get_variant_value(); + test_ucp_sockaddr_protocols::test_tag_send_recv(size, is_exp, is_sync, + variants & SEND_STOP, + variants & RECV_STOP); + } + + ucs::ptr_vector m_env; +}; + + +UCS_TEST_P(test_ucp_sockaddr_protocols_err, tag_eager_32_unexp, + "ZCOPY_THRESH=inf", "RNDV_THRESH=inf") +{ + test_tag_send_recv(32, false, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols_err, tag_zcopy_4k_unexp, + "ZCOPY_THRESH=2k", "RNDV_THRESH=inf") +{ + test_tag_send_recv(4 * UCS_KBYTE, false, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols_err, tag_zcopy_64k_unexp, + "ZCOPY_THRESH=2k", "RNDV_THRESH=inf") +{ + test_tag_send_recv(64 * 
UCS_KBYTE, false, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols_err, tag_eager_32_unexp_sync, + "ZCOPY_THRESH=inf", "RNDV_THRESH=inf") +{ + test_tag_send_recv(32, false, true); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols_err, tag_zcopy_4k_unexp_sync, + "ZCOPY_THRESH=2k", "RNDV_THRESH=inf") +{ + test_tag_send_recv(4 * UCS_KBYTE, false, true); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols_err, tag_zcopy_64k_unexp_sync, + "ZCOPY_THRESH=2k", "RNDV_THRESH=inf") +{ + test_tag_send_recv(64 * UCS_KBYTE, false, true); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols_err, tag_rndv_unexp, + "RNDV_THRESH=0", "RNDV_SCHEME=auto") +{ + test_tag_send_recv(64 * UCS_KBYTE, false, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols_err, tag_rndv_unexp_get_scheme, + "RNDV_THRESH=0", "RNDV_SCHEME=get_zcopy") +{ + test_tag_send_recv(64 * UCS_KBYTE, false, false); +} + +UCS_TEST_P(test_ucp_sockaddr_protocols_err, tag_rndv_unexp_put_scheme, + "RNDV_THRESH=0", "RNDV_SCHEME=put_zcopy") +{ + test_tag_send_recv(64 * UCS_KBYTE, false, false); +} + +UCP_INSTANTIATE_CM_TEST_CASE(test_ucp_sockaddr_protocols_err) diff --git a/test/gtest/ucp/test_ucp_tag.cc b/test/gtest/ucp/test_ucp_tag.cc index 68524c6becc..8ed12d4b929 100644 --- a/test/gtest/ucp/test_ucp_tag.cc +++ b/test/gtest/ucp/test_ucp_tag.cc @@ -14,6 +14,7 @@ extern "C" { #include #include #include +#include } #include @@ -56,7 +57,7 @@ void test_ucp_tag::enable_tag_mp_offload() m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_MP_SRQ_ENABLE", "try")); m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_MP_NUM_STRIDES", "8")); m_env.push_back(new ucs::scoped_setenv("UCX_IB_MLX5_DEVX_OBJECTS", - "dct,dcsrq,rcsrq,rcqp")); + "dct,dcsrq,rcsrq,rcqp,dci")); } void test_ucp_tag::request_init(void *request) @@ -175,7 +176,8 @@ void test_ucp_tag::wait_for_unexpected_msg(ucp_worker_h worker, double sec) void test_ucp_tag::check_offload_support(bool offload_required) { - bool offload_supported = ucp_ep_is_tag_offload_enabled(ucp_ep_config(sender().ep())); + bool offload_supported = ucp_ep_config_key_has_tag_lane( + &ucp_ep_config(sender().ep())->key); if (offload_supported != offload_required) { cleanup(); std::string reason = offload_supported ? 
"tag offload" : "no tag offload"; @@ -443,9 +445,9 @@ UCS_TEST_P(test_ucp_tag_limits, check_max_short_rndv_thresh_zero, "RNDV_THRESH=0 size_t min_rndv = ucp_ep_tag_offload_min_rndv_thresh(ucp_ep_config(sender().ep())); EXPECT_GT(min_rndv, 0ul); // min_rndv should be RTS size at least - EXPECT_GE(min_rndv, + EXPECT_LE(min_rndv, ucp_ep_config(sender().ep())->tag.rndv.am_thresh.local); - EXPECT_GE(min_rndv, + EXPECT_LE(min_rndv, ucp_ep_config(sender().ep())->tag.rndv.rma_thresh.local); } } @@ -462,28 +464,39 @@ UCS_TEST_P(test_ucp_tag_limits, check_max_short_zcopy_thresh_zero, "ZCOPY_THRESH UCP_INSTANTIATE_TEST_CASE(test_ucp_tag_limits) -class test_ucp_tag_fallback : public ucp_test { +class test_ucp_tag_nbx : public test_ucp_tag { public: void init() { /* forbid zcopy access because it will always fail due to read-only * memory pages (will fail to register memory) */ modify_config("ZCOPY_THRESH", "inf"); - ucp_test::init(); - sender().connect(&receiver(), get_ep_params()); - receiver().connect(&sender(), get_ep_params()); - } - - static void get_test_variants(std::vector& variants) { - add_variant(variants, UCP_FEATURE_TAG); + test_ucp_tag::init(); + m_completed = 0; } protected: static const size_t MSG_SIZE; + uint32_t m_completed; + + static void send_callback(void *req, ucs_status_t status, + void *user_data) + { + request_free((request*)req); + ucs_atomic_add32((volatile uint32_t*)user_data, 1); + } + + static void recv_callback(void *req, ucs_status_t status, + const ucp_tag_recv_info_t *info, + void *user_data) + { + request_free((request*)req); + ucs_atomic_add32((volatile uint32_t*)user_data, 1); + } }; -const size_t test_ucp_tag_fallback::MSG_SIZE = 4 * 1024 * ucs_get_page_size(); +const size_t test_ucp_tag_nbx::MSG_SIZE = 4 * UCS_KBYTE * ucs_get_page_size(); -UCS_TEST_P(test_ucp_tag_fallback, fallback) +UCS_TEST_P(test_ucp_tag_nbx, fallback) { ucp_request_param_t param = {0}; @@ -509,4 +522,36 @@ UCS_TEST_P(test_ucp_tag_fallback, fallback) munmap(send_buffer, MSG_SIZE); } -UCP_INSTANTIATE_TEST_CASE(test_ucp_tag_fallback) +UCS_TEST_P(test_ucp_tag_nbx, external_request_free) +{ + ucp_request_param_t send_param; + ucp_request_param_t recv_param; + + send_param.op_attr_mask = recv_param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_REQUEST | + UCP_OP_ATTR_FLAG_NO_IMM_CMPL | + UCP_OP_ATTR_FIELD_USER_DATA; + send_param.user_data = recv_param.user_data = &m_completed; + send_param.request = request_alloc(); + recv_param.request = request_alloc(); + send_param.cb.send = (ucp_send_nbx_callback_t)send_callback; + recv_param.cb.recv = (ucp_tag_recv_nbx_callback_t)recv_callback; + send_param.user_data = &m_completed; + recv_param.user_data = &m_completed; + + std::vector send_buffer(MSG_SIZE); + std::vector recv_buffer(MSG_SIZE); + + ucs_status_ptr_t recv_req = ucp_tag_recv_nbx(receiver().worker(), + &recv_buffer[0], MSG_SIZE, + 0, 0, &recv_param); + ASSERT_TRUE(UCS_PTR_IS_PTR(recv_req)); + + ucs_status_ptr_t send_req = ucp_tag_send_nbx(sender().ep(), &send_buffer[0], + MSG_SIZE, 0, &send_param); + ASSERT_TRUE(UCS_PTR_IS_PTR(send_req)); + + wait_for_value(&m_completed, 2u); +} + +UCP_INSTANTIATE_TEST_CASE(test_ucp_tag_nbx) diff --git a/test/gtest/ucp/test_ucp_tag_match.cc b/test/gtest/ucp/test_ucp_tag_match.cc index c1541063929..b1d3a72ce87 100644 --- a/test/gtest/ucp/test_ucp_tag_match.cc +++ b/test/gtest/ucp/test_ucp_tag_match.cc @@ -23,8 +23,6 @@ class test_ucp_tag_match : public test_ucp_tag { }; test_ucp_tag_match() { - // TODO: test offload and offload MP as different 
variants - enable_tag_mp_offload(); if (RUNNING_ON_VALGRIND) { m_env.push_back(new ucs::scoped_setenv("UCX_RC_TM_SEG_SIZE", "8k")); m_env.push_back(new ucs::scoped_setenv("UCX_TCP_RX_SEG_SIZE", "8k")); @@ -34,9 +32,15 @@ class test_ucp_tag_match : public test_ucp_tag { virtual void init() { modify_config("TM_THRESH", "1"); - if (get_variant_value() & ENABLE_PROTO) { + if (use_proto()) { modify_config("PROTO_ENABLE", "y"); modify_config("MAX_EAGER_LANES", "2"); + } else { + // TODO: + // 1. test offload and offload MP as different variants + // 2. Enable offload for new protocols as well when it is fully + // supported. + enable_tag_mp_offload(); } test_ucp_tag::init(); } @@ -66,6 +70,11 @@ class test_ucp_tag_match : public test_ucp_tag { m_req_status = status; } + bool use_proto() const + { + return get_variant_value() & ENABLE_PROTO; + } + static ucs_status_t m_req_status; }; @@ -475,9 +484,8 @@ class test_ucp_tag_match_rndv : public test_ucp_tag_match { static const std::string rndv_schemes[]; void init() { - ASSERT_LE(get_variant_value(), (int)RNDV_SCHEME_GET_ZCOPY); - modify_config("RNDV_SCHEME", rndv_schemes[get_variant_value()]); - + ASSERT_LE(rndv_scheme(), (int)RNDV_SCHEME_GET_ZCOPY); + modify_config("RNDV_SCHEME", rndv_schemes[rndv_scheme()]); test_ucp_tag_match::init(); } @@ -486,6 +494,22 @@ class test_ucp_tag_match_rndv : public test_ucp_tag_match { add_variant_with_value(variants, get_ctx_params(), rndv_scheme, "rndv_" + rndv_schemes[rndv_scheme]); } + + // Generate variants with new protocols enabled + add_variant_with_value(variants, get_ctx_params(), + RNDV_SCHEME_AUTO | ENABLE_PROTO, + "rndv_auto,proto"); + add_variant_with_value(variants, get_ctx_params(), + RNDV_SCHEME_GET_ZCOPY | ENABLE_PROTO, + "rndv_get_zcopy,proto"); + } + +protected: + int rndv_scheme() const + { + int mask = ucs_roundup_pow2(static_cast<int>(RNDV_SCHEME_LAST) + 1) - 1; + ucs_assert(!(mask & ENABLE_PROTO)); + return get_variant_value() & mask; } }; @@ -493,6 +517,18 @@ const std::string test_ucp_tag_match_rndv::rndv_schemes[] = { "auto", "put_zcopy", "get_zcopy" }; +UCS_TEST_P(test_ucp_tag_match_rndv, length0, "RNDV_THRESH=0") +{ + request *my_send_req = send_nb((void*)0xdeadbeef, 0, DATATYPE, 1); + ASSERT_TRUE(!UCS_PTR_IS_ERR(my_send_req)); + + ucp_tag_recv_info_t info; + ucs_status_t status = recv_b((void*)0xbadc0fee, 0, DATATYPE, 1, 0, &info); + EXPECT_EQ(UCS_OK, status); + + wait_and_validate(my_send_req); +} + UCS_TEST_P(test_ucp_tag_match_rndv, sync_send_unexp, "RNDV_THRESH=1048576") { static const size_t size = 1148576; request *my_send_req; @@ -795,3 +831,4 @@ UCS_TEST_P(test_ucp_tag_match_rndv, bidir_multi_exp_post, "RNDV_THRESH=0") { } UCP_INSTANTIATE_TEST_CASE(test_ucp_tag_match_rndv) +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_tag_match_rndv, mm_tcp, "posix,sysv,tcp") diff --git a/test/gtest/ucp/test_ucp_tag_mem_type.cc b/test/gtest/ucp/test_ucp_tag_mem_type.cc index 95a68a2c568..8303ddbe62d 100644 --- a/test/gtest/ucp/test_ucp_tag_mem_type.cc +++ b/test/gtest/ucp/test_ucp_tag_mem_type.cc @@ -76,6 +76,16 @@ class test_ucp_tag_mem_type: public test_ucp_tag { } } + void do_basic_send(void *sbuf, void *rbuf, size_t length, ucp_datatype_t type, + ucs_memory_type_t s_mem_type, ucs_memory_type_t r_mem_type) + { + mem_buffer::pattern_fill(rbuf, length, 1, r_mem_type); + mem_buffer::pattern_fill(sbuf, length, 2, s_mem_type); + size_t recvd = do_xfer(sbuf, rbuf, length, type, type, true, false, false); + ASSERT_EQ(length, recvd); + mem_buffer::pattern_check(rbuf, length, 2, r_mem_type); + } + static
std::vector > mem_type_pairs; protected: @@ -148,30 +158,35 @@ size_t test_ucp_tag_mem_type::do_xfer(const void *sendbuf, void *recvbuf, UCS_TEST_P(test_ucp_tag_mem_type, basic) { ucp_datatype_t type = ucp_dt_make_contig(1); + size_t max_length; UCS_TEST_MESSAGE << "TEST: " << ucs_memory_type_names[m_send_mem_type] << " <-> " << ucs_memory_type_names[m_recv_mem_type]; for (unsigned i = 1; i <= 7; ++i) { - size_t max = (long)pow(10.0, i); - size_t length = ucs::rand() % max + 1; + max_length = (size_t)pow(10.0, i); + size_t length = ucs::rand() % max_length + 1; mem_buffer m_recv_mem_buf(length, m_recv_mem_type); mem_buffer m_send_mem_buf(length, m_send_mem_type); - mem_buffer::pattern_fill(m_recv_mem_buf.ptr(), m_recv_mem_buf.size(), - 1, m_recv_mem_buf.mem_type()); + do_basic_send(m_send_mem_buf.ptr(),m_recv_mem_buf.ptr(), length, type, + m_send_mem_buf.mem_type(), m_recv_mem_buf.mem_type()); + } - mem_buffer::pattern_fill(m_send_mem_buf.ptr(), m_send_mem_buf.size(), - 2, m_send_mem_buf.mem_type()); + /* test with re-using the buffers */ + max_length = (size_t)pow(10.0, 7); + mem_buffer m_recv_mem_buf(max_length, m_recv_mem_type); + mem_buffer m_send_mem_buf(max_length, m_send_mem_type); - size_t recvd = do_xfer(m_send_mem_buf.ptr(), m_recv_mem_buf.ptr(), - length, type, type, true, false, false); - ASSERT_EQ(length, recvd); - mem_buffer::pattern_check(m_recv_mem_buf.ptr(), length, - 2, m_recv_mem_buf.mem_type()); + for (unsigned i = 0; i < 2; ++i) { + size_t length = ucs::rand() % max_length + 1; + + do_basic_send(m_send_mem_buf.ptr(),m_recv_mem_buf.ptr(), length, type, + m_send_mem_buf.mem_type(), m_recv_mem_buf.mem_type()); } + } UCS_TEST_P(test_ucp_tag_mem_type, xfer_mismatch_length) diff --git a/test/gtest/ucp/test_ucp_tag_offload.cc b/test/gtest/ucp/test_ucp_tag_offload.cc index f5b870f4285..5c8b7c0d6d2 100644 --- a/test/gtest/ucp/test_ucp_tag_offload.cc +++ b/test/gtest/ucp/test_ucp_tag_offload.cc @@ -311,6 +311,34 @@ UCS_TEST_P(test_ucp_tag_offload, connect) e->connect(&receiver(), get_ep_params()); } +// Send small chunk of data to be scattered to CQE on the receiver. Post bigger +// chunk of memory for receive operation, so it would be posted to the HW. +UCS_TEST_P(test_ucp_tag_offload, eager_send_less, "RNDV_THRESH=inf", + "TM_THRESH=0", "TM_MAX_BB_SIZE=0") +{ + activate_offload(sender()); + + uint8_t send_data = 0; + size_t length = 4 * UCS_KBYTE; + ucp_tag_t tag = 0x11; + std::vector recvbuf(length); + + request *rreq = recv_nb_exp(&recvbuf[0], length, ucp_dt_make_contig(1), tag, + UCP_TAG_MASK_FULL); + + request *sreq = (request*)ucp_tag_send_nb(sender().ep(), &send_data, + sizeof(send_data), + ucp_dt_make_contig(1), tag, + send_callback); + if (UCS_PTR_IS_ERR(sreq)) { + ASSERT_UCS_OK(UCS_PTR_STATUS(sreq)); + } else if (sreq != NULL) { + request_wait(sreq); + } + + request_wait(rreq); +} + UCS_TEST_P(test_ucp_tag_offload, small_rndv, "RNDV_THRESH=0", "TM_THRESH=0") { activate_offload(sender()); @@ -400,14 +428,17 @@ class test_ucp_tag_offload_multi : public test_ucp_tag_offload { { se.connect(&receiver(), get_ep_params()); // Need to send twice: - // 1. to ensure that wireup's UCT iface has been closed and - // it is not considered for num_active_iface on worker - // (message has to be less than `UCX_TM_THRESH` value) + // 1. 
to ensure that wireup's UCT iface has been closed and it is not + // considered for num_active_iface on worker (message has to be less + // than `UCX_TM_THRESH` value) + UCP workers have to be flushed first + // to ensure that UCT ifaces were deactivated at the end of auxiliary + // UCT EP discarding // 2. to activate tag offload - // (num_active_ifaces on worker is increased when any message - // is received on any iface. Tag hashing is done when we have - // more than 1 active ifaces and message has to be greater - // than `UCX_TM_THRESH` value) + // (num_active_ifaces on worker is increased when any message is + // received on any iface. Tag hashing is done when we have more than + // 1 active iface and message has to be greater than `UCX_TM_THRESH` + // value) + flush_workers(); send_recv(se, tag, 8); send_recv(se, tag, 2048); } @@ -510,12 +541,12 @@ UCS_TEST_P(test_ucp_tag_offload_selection, tag_lane) ucp_ep_config_t *ep_config = ucp_ep_config(ep); if (has_tag_offload && !has_shm_or_self) { - EXPECT_TRUE(ucp_ep_is_tag_offload_enabled(ep_config)); + EXPECT_TRUE(ucp_ep_config_key_has_tag_lane(&ep_config->key)); EXPECT_EQ(ep_config->key.tag_lane, ep_config->tag.lane); } else { // If shm or self transports exist they would be used for tag matching // rather than network offload - EXPECT_FALSE(ucp_ep_is_tag_offload_enabled(ep_config)); + EXPECT_FALSE(ucp_ep_config_key_has_tag_lane(&ep_config->key)); EXPECT_EQ(ep_config->key.am_lane, ep_config->tag.lane); } } @@ -584,8 +615,8 @@ UCS_TEST_P(test_ucp_tag_offload_gpu, rx_scatter_to_cqe, "TM_THRESH=1") wait_and_validate(sreq); } -UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_tag_offload_gpu, rc_dc_gpu, - "dc_x,rc_x," UCP_TEST_GPU_COPY_TLS) +UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_tag_offload_gpu, rc_dc_gpu, + "dc_x,rc_x") class test_ucp_tag_offload_status : public test_ucp_tag { public: @@ -836,7 +867,7 @@ UCS_TEST_P(test_ucp_tag_offload_stats_gpu, block_gpu_no_gpu_direct, req_cancel(receiver(), rreq); } -UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_tag_offload_stats_gpu, rc_dc_gpu, - "dc_x,rc_x," UCP_TEST_GPU_COPY_TLS) +UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_tag_offload_stats_gpu, + rc_dc_gpu, "dc_x,rc_x") #endif diff --git a/test/gtest/ucp/test_ucp_tag_probe.cc b/test/gtest/ucp/test_ucp_tag_probe.cc index cfb355518c3..63137bfe94b 100644 --- a/test/gtest/ucp/test_ucp_tag_probe.cc +++ b/test/gtest/ucp/test_ucp_tag_probe.cc @@ -166,9 +166,14 @@ UCS_TEST_P(test_ucp_tag_probe, send_rndv_msg_probe, "RNDV_THRESH=1048576") { EXPECT_EQ((ucp_tag_t)0x111337, info.sender_tag); /* receiver - process the rts and schedule a get operation */ - my_recv_req = (request*)ucp_tag_msg_recv_nb(receiver().worker(), &recvbuf[0], - recvbuf.size(), DATATYPE, message, - recv_callback); + ucp_request_param_t param; + param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | UCP_OP_ATTR_FIELD_DATATYPE | + UCP_OP_ATTR_FLAG_NO_IMM_CMPL; + param.cb.recv = recv_callback; + param.datatype = DATATYPE; + + my_recv_req = (request*)ucp_tag_msg_recv_nbx(receiver().worker(), &recvbuf[0], + recvbuf.size(), message, &param); ASSERT_TRUE(!UCS_PTR_IS_ERR(my_recv_req)); /* receiver - perform rndv get and send the ATS */ diff --git a/test/gtest/ucp/test_ucp_tag_xfer.cc b/test/gtest/ucp/test_ucp_tag_xfer.cc index e7a5484bc25..f095eb6bdb4 100644 --- a/test/gtest/ucp/test_ucp_tag_xfer.cc +++ b/test/gtest/ucp/test_ucp_tag_xfer.cc @@ -516,7 +516,7 @@ void test_ucp_tag_xfer::test_xfer_len_offset() const size_t buf_size = max_length + max_offset + 2; ucp_datatype_t type = ucp_dt_make_contig(1); void
*send_buf = 0; - void *recv_buf = 0;; + void *recv_buf = 0; size_t offset; size_t length; ucs::detail::message_stream *ms; diff --git a/test/gtest/ucp/test_ucp_tls.cc b/test/gtest/ucp/test_ucp_tls.cc new file mode 100644 index 00000000000..7107673f741 --- /dev/null +++ b/test/gtest/ucp/test_ucp_tls.cc @@ -0,0 +1,19 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. + */ + +#include "ucp_test.h" +#include + +class test_ucp_tl : public test_ucp_context { +}; + +UCS_TEST_P(test_ucp_tl, check_ucp_tl, "SELF_NUM_DEVICES?=50") +{ + create_entity(); + EXPECT_GE((sender().ucph())->num_tls, 50); +} + +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_tl, self, "self"); diff --git a/test/gtest/ucp/test_ucp_wireup.cc b/test/gtest/ucp/test_ucp_wireup.cc index 257d60b1768..144d4ca0ddf 100644 --- a/test/gtest/ucp/test_ucp_wireup.cc +++ b/test/gtest/ucp/test_ucp_wireup.cc @@ -56,12 +56,13 @@ class test_ucp_wireup : public ucp_test { void send_recv(ucp_ep_h send_ep, ucp_worker_h recv_worker, ucp_ep_h recv_ep, size_t vecsize, int repeat); - void waitall(std::vector reqs); - void disconnect(ucp_ep_h ep); void disconnect(ucp_test::entity &e); + static void close_completion(void *request, ucs_status_t status, + void *user_data); + static void send_completion(void *request, ucs_status_t status); static void tag_recv_completion(void *request, ucs_status_t status, @@ -268,7 +269,7 @@ void test_ucp_wireup::send_b(ucp_ep_h ep, size_t length, int repeat, { std::vector reqs; send_nb(ep, length, repeat, reqs, send_data); - waitall(reqs); + requests_wait(reqs); } void test_ucp_wireup::recv_b(ucp_worker_h worker, ucp_ep_h ep, size_t length, @@ -315,6 +316,14 @@ void test_ucp_wireup::send_completion(void *request, ucs_status_t status) { } +void test_ucp_wireup::close_completion(void *request, ucs_status_t status, + void *user_data) +{ + ASSERT_UCS_OK(status); + ASSERT_NE((test_ucp_wireup *)NULL, (test_ucp_wireup *)user_data); +} + + void test_ucp_wireup::tag_recv_completion(void *request, ucs_status_t status, ucp_tag_recv_info_t *info) { @@ -334,7 +343,7 @@ void test_ucp_wireup::send_recv(ucp_ep_h send_ep, ucp_worker_h recv_worker, send_nb(send_ep, length, repeat, send_reqs, send_data); recv_b (recv_worker, recv_ep, length, repeat, send_data); - waitall(send_reqs); + requests_wait(send_reqs); m_rkeys.clear(); } @@ -350,14 +359,6 @@ void test_ucp_wireup::disconnect(ucp_test::entity &e) { disconnect(e.revoke_ep()); } -void test_ucp_wireup::waitall(std::vector reqs) -{ - while (!reqs.empty()) { - request_wait(reqs.back()); - reqs.pop_back(); - } -} - bool test_ucp_wireup::ep_iface_has_caps(const entity& e, const std::string& tl, uint64_t caps) { @@ -399,10 +400,10 @@ UCS_TEST_P(test_ucp_wireup_1sided, address) { size_t size; void *buffer; std::set packed_dev_priorities, unpacked_dev_priorities; + std::set packed_sys_devices, unpacked_sys_devices; ucp_rsc_index_t tl; - status = ucp_address_pack(sender().worker(), NULL, - std::numeric_limits::max(), + status = ucp_address_pack(sender().worker(), NULL, &ucp_tl_bitmap_max, UCP_ADDRESS_PACK_FLAGS_ALL, m_lanes2remote, &size, &buffer); ASSERT_UCS_OK(status); @@ -410,11 +411,15 @@ UCS_TEST_P(test_ucp_wireup_1sided, address) { ASSERT_GT(size, 0ul); EXPECT_LE(size, 2048ul); /* Expect a reasonable address size */ - ucs_for_each_bit(tl, sender().worker()->context->tl_bitmap) { - if (sender().worker()->context->tl_rscs[tl].flags & UCP_TL_RSC_FLAG_SOCKADDR) { + UCS_BITMAP_FOR_EACH_BIT(sender().worker()->context->tl_bitmap, tl) { 
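+            /* walk every TL resource enabled in the context's tl_bitmap;
+             * sockaddr resources are skipped below because they are not
+             * packed into the worker address */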
+ const ucp_tl_resource_desc_t &rsc = + sender().worker()->context->tl_rscs[tl]; + if (rsc.flags & UCP_TL_RSC_FLAG_SOCKADDR) { continue; } - packed_dev_priorities.insert(ucp_worker_iface_get_attr(sender().worker(), tl)->priority); + packed_dev_priorities.insert( + ucp_worker_iface_get_attr(sender().worker(), tl)->priority); + packed_sys_devices.insert(rsc.tl_rsc.sys_device); } ucp_unpacked_address unpacked_address; @@ -425,7 +430,7 @@ UCS_TEST_P(test_ucp_wireup_1sided, address) { EXPECT_EQ(sender().worker()->uuid, unpacked_address.uuid); #if ENABLE_DEBUG_DATA - EXPECT_EQ(std::string(ucp_worker_get_name(sender().worker())), + EXPECT_EQ(std::string(ucp_worker_get_address_name(sender().worker())), std::string(unpacked_address.name)); #endif EXPECT_LE(unpacked_address.address_count, @@ -434,6 +439,7 @@ UCS_TEST_P(test_ucp_wireup_1sided, address) { const ucp_address_entry_t *ae; ucp_unpacked_address_for_each(ae, &unpacked_address) { unpacked_dev_priorities.insert(ae->iface_attr.priority); + unpacked_sys_devices.insert(ae->sys_dev); } /* TODO test addresses */ @@ -443,6 +449,7 @@ UCS_TEST_P(test_ucp_wireup_1sided, address) { /* Make sure that the packed device priorities are equal to the unpacked * device priorities */ ASSERT_TRUE(packed_dev_priorities == unpacked_dev_priorities); + ASSERT_TRUE(packed_sys_devices == unpacked_sys_devices); } UCS_TEST_P(test_ucp_wireup_1sided, ep_address, "IB_NUM_PATHS?=2") { @@ -453,9 +460,8 @@ UCS_TEST_P(test_ucp_wireup_1sided, ep_address, "IB_NUM_PATHS?=2") { sender().connect(&receiver(), get_ep_params()); status = ucp_address_pack(sender().worker(), sender().ep(), - std::numeric_limits::max(), - UCP_ADDRESS_PACK_FLAGS_ALL, m_lanes2remote, &size, - &buffer); + &ucp_tl_bitmap_max, UCP_ADDRESS_PACK_FLAGS_ALL, + m_lanes2remote, &size, &buffer); ASSERT_UCS_OK(status); ASSERT_TRUE(buffer != NULL); @@ -478,7 +484,7 @@ UCS_TEST_P(test_ucp_wireup_1sided, empty_address) { size_t size; void *buffer; - status = ucp_address_pack(sender().worker(), NULL, 0, + status = ucp_address_pack(sender().worker(), NULL, &ucp_tl_bitmap_min, UCP_ADDRESS_PACK_FLAGS_ALL, m_lanes2remote, &size, &buffer); ASSERT_UCS_OK(status); @@ -493,7 +499,7 @@ UCS_TEST_P(test_ucp_wireup_1sided, empty_address) { EXPECT_EQ(sender().worker()->uuid, unpacked_address.uuid); #if ENABLE_DEBUG_DATA - EXPECT_EQ(std::string(ucp_worker_get_name(sender().worker())), + EXPECT_EQ(std::string(ucp_worker_get_address_name(sender().worker())), std::string(unpacked_address.name)); #endif EXPECT_EQ(0u, unpacked_address.address_count); @@ -515,7 +521,12 @@ UCS_TEST_P(test_ucp_wireup_1sided, one_sided_wireup_rndv, "RNDV_THRESH=1") { /* expect the endpoint to be connected to itself */ ucp_ep_h ep = sender().ep(); ucp_worker_h worker = sender().worker(); - EXPECT_EQ(ep, ucp_worker_get_ep_by_id(worker, ucp_ep_remote_id(ep))); + ucp_ep_h ep_by_id; + ucs_status_t status = ucp_worker_get_ep_by_id(worker, + ucp_ep_remote_id(ep), + &ep_by_id); + ASSERT_EQ(UCS_OK, status); + EXPECT_EQ(ep, ep_by_id); } flush_worker(sender()); } @@ -630,7 +641,15 @@ UCS_TEST_P(test_ucp_wireup_1sided, send_disconnect_reply1) { recv_b(sender().worker(), sender().ep(), 8, 1); } -UCS_TEST_P(test_ucp_wireup_1sided, send_disconnect_reply2) { +UCS_TEST_SKIP_COND_P(test_ucp_wireup_1sided, send_disconnect_reply2, + /* skip the test for TCP, because it fails from time to + * time: the sender re-uses a socket fd from the already + * accepted connection from the receiver, but then the + * socket fd is closed, since the receiver closed the + * connection and the 
underlying TCP EP isn't able to + * receive the data on the failed socket. + * TODO: fix the bug on TCP level */ + has_transport("tcp")) { sender().connect(&receiver(), get_ep_params()); send_b(sender().ep(), 8, 1); @@ -663,7 +682,7 @@ UCS_TEST_P(test_ucp_wireup_1sided, disconnect_nb_onesided) { sender().close_ep_req_free(req); recv_b(receiver().worker(), receiver().ep(), 1000, 1000); - waitall(sreqs); + requests_wait(sreqs); } UCS_TEST_P(test_ucp_wireup_1sided, multi_ep_1sided) { @@ -784,7 +803,7 @@ UCS_TEST_SKIP_COND_P(test_ucp_wireup_2sided, async_connect, reqs.push_back(ucp_tag_recv_nb(receiver().worker(), NULL, 0, DT_U64, 1, (ucp_tag_t)-1, tag_recv_completion)); EXPECT_FALSE(UCS_PTR_IS_ERR(reqs.back())); - waitall(reqs); + requests_wait(reqs); } UCS_TEST_P(test_ucp_wireup_2sided, connect_disconnect) { @@ -798,6 +817,32 @@ } +UCS_TEST_P(test_ucp_wireup_2sided, close_nbx_callback) { + sender().connect(&receiver(), get_ep_params()); + if (!is_loopback()) { + receiver().connect(&sender(), get_ep_params()); + } + + std::vector<void*> reqs; + ucp_request_param_t param; + + param.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_USER_DATA | + UCP_OP_ATTR_FLAG_NO_IMM_CMPL; + param.cb.send = close_completion; + param.user_data = this; + + reqs.push_back(ucp_ep_close_nbx(sender().revoke_ep(), &param)); + EXPECT_FALSE(UCS_PTR_IS_ERR(reqs.back())); + + if (!is_loopback()) { + reqs.push_back(ucp_ep_close_nbx(receiver().revoke_ep(), &param)); + EXPECT_FALSE(UCS_PTR_IS_ERR(reqs.back())); + } + + requests_wait(reqs); +} + UCS_TEST_P(test_ucp_wireup_2sided, multi_ep_2sided) { const unsigned count = 10; @@ -895,7 +940,7 @@ class test_ucp_wireup_fallback : public test_ucp_wireup { bool check_scalable_tls(const ucp_worker_h worker, size_t est_num_eps) { ucp_rsc_index_t rsc_index; - ucs_for_each_bit(rsc_index, worker->context->tl_bitmap) { + UCS_BITMAP_FOR_EACH_BIT(worker->context->tl_bitmap, rsc_index) { ucp_md_index_t md_index = worker->context->tl_rscs[rsc_index].md_index; const uct_md_attr_t *md_attr = &worker->context->tl_mds[md_index].attr; @@ -907,10 +952,12 @@ } if (ucp_worker_iface_get_attr(worker, rsc_index)->max_num_eps >= est_num_eps) { - EXPECT_TRUE((worker->scalable_tl_bitmap & UCS_BIT(rsc_index)) != 0); + EXPECT_TRUE( + UCS_BITMAP_GET(worker->scalable_tl_bitmap, rsc_index)); return true; } else { - EXPECT_TRUE((worker->scalable_tl_bitmap & UCS_BIT(rsc_index)) == 0); + EXPECT_TRUE(UCS_BITMAP_GET(worker->scalable_tl_bitmap, + rsc_index) == 0); } } @@ -1137,7 +1184,8 @@ class test_ucp_wireup_fallback_amo : public test_ucp_wireup { device_atomics_cnt++; } } - bool device_atomics_supported = sender().worker()->atomic_tls != 0; + bool device_atomics_supported = !UCS_BITMAP_IS_ZERO_INPLACE( + &sender().worker()->atomic_tls); test_ucp_wireup::cleanup(); @@ -1413,10 +1461,13 @@ UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wireup_asymmetric, ib, "ib") class test_ucp_wireup_keepalive : public test_ucp_wireup { public: + test_ucp_wireup_keepalive() { + m_env.push_back(new ucs::scoped_setenv("UCX_TCP_KEEPIDLE", "inf")); + } + static void get_test_variants(std::vector<ucp_test_variant>& variants) { - test_ucp_wireup::get_test_variants(variants, - UCP_FEATURE_RMA | UCP_FEATURE_TAG); + test_ucp_wireup::get_test_variants(variants, UCP_FEATURE_TAG); } ucp_ep_params_t get_ep_params() { @@ -1437,6 +1488,9 @@ class test_ucp_wireup_keepalive : public test_ucp_wireup { sender().connect(&receiver(), get_ep_params());
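     /* connect the reverse direction as well, so both entities own an EP
      * on which keepalive can run */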
receiver().connect(&sender(), get_ep_params()); } + +protected: + ucs::ptr_vector m_env; }; /* test if EP has non-empty keepalive lanes mask */ diff --git a/test/gtest/ucp/test_ucp_worker.cc b/test/gtest/ucp/test_ucp_worker.cc index 057d8fbf43d..071fa34401c 100644 --- a/test/gtest/ucp/test_ucp_worker.cc +++ b/test/gtest/ucp/test_ucp_worker.cc @@ -10,6 +10,7 @@ extern "C" { #include +#include #include #include #include @@ -39,6 +40,7 @@ class test_ucp_worker_discard : public ucp_test { m_destroyed_ep_count = 0; m_fake_ep.flags = UCP_EP_FLAG_REMOTE_CONNECTED; + sender().connect(&receiver(), get_ep_params()); m_flush_comps.clear(); m_pending_reqs.clear(); m_ep_test_info_map.clear(); @@ -58,7 +60,7 @@ class test_ucp_worker_discard : public ucp_test { pending_reqs.push_back(req); - if (func == ucp_wireup_msg_progress) { + if (func == ucp_wireup_msg_progress) { req->send.ep = &m_fake_ep; } @@ -67,9 +69,20 @@ class test_ucp_worker_discard : public ucp_test { } } + static void + discarded_cb(void *request, ucs_status_t status, void *user_data) + { + /* Make Coverity happy */ + ucs_assert(user_data != NULL); + + unsigned *discarded_count_p = static_cast(user_data); + (*discarded_count_p)++; + } + void test_worker_discard(void *ep_flush_func, void *ep_pending_add_func, void *ep_pending_purge_func, + ucs_status_t ep_flush_comp_status = UCS_OK, bool wait_for_comp = true, unsigned ep_count = 8, unsigned wireup_ep_count = 0, @@ -77,16 +90,18 @@ class test_ucp_worker_discard : public ucp_test { uct_iface_ops_t ops = {0}; unsigned created_wireup_aux_ep_count = 0; unsigned total_ep_count = ep_count + wireup_aux_ep_count; + unsigned discarded_count = 0; + void *flush_req = NULL; + ucp_ep_h ucp_ep; uct_iface_t iface; std::vector eps(total_ep_count); std::vector wireup_eps(wireup_ep_count); - ucp_ep_t ucp_ep; ucs_status_t status; ASSERT_LE(wireup_ep_count, ep_count); ASSERT_LE(wireup_aux_ep_count, wireup_ep_count); - ucp_ep.worker = sender().worker(); + ucp_ep = sender().ep(); ops.ep_flush = (uct_ep_flush_func_t)ep_flush_func; ops.ep_pending_add = (uct_ep_pending_add_func_t)ep_pending_add_func; @@ -94,6 +109,11 @@ class test_ucp_worker_discard : public ucp_test { ops.ep_destroy = ep_destroy_func; iface.ops = ops; + ucp_rsc_index_t rsc_index = UCS_BITMAP_FFS(sender().ucph()->tl_bitmap); + ucp_worker_iface_t *wiface = ucp_worker_iface(sender().worker(), + rsc_index); + std::vector eps_to_discard; + for (unsigned i = 0; i < ep_count; i++) { uct_ep_h discard_ep; @@ -103,7 +123,7 @@ class test_ucp_worker_discard : public ucp_test { std::vector pending_reqs; if (i < wireup_ep_count) { - status = ucp_wireup_ep_create(&ucp_ep, &discard_ep); + status = ucp_wireup_ep_create(ucp_ep, &discard_ep); ASSERT_UCS_OK(status); wireup_eps.push_back(discard_ep); @@ -114,9 +134,12 @@ class test_ucp_worker_discard : public ucp_test { if (i < wireup_aux_ep_count) { eps[ep_count + created_wireup_aux_ep_count].iface = &iface; + ucp_worker_iface_progress_ep(wiface); + /* coverity[escape] */ - wireup_ep->aux_ep = &eps[ep_count + - created_wireup_aux_ep_count]; + wireup_ep->aux_ep = + &eps[ep_count + created_wireup_aux_ep_count]; + wireup_ep->aux_rsc_index = rsc_index; created_wireup_aux_ep_count++; m_created_ep_count++; @@ -145,11 +168,21 @@ class test_ucp_worker_discard : public ucp_test { pending_reqs); } + eps_to_discard.push_back(discard_ep); + } + + for (std::vector::iterator iter = eps_to_discard.begin(); + iter != eps_to_discard.end(); ++iter) { + uct_ep_h discard_ep = *iter; unsigned purged_reqs_count = 0; - 
ucp_worker_discard_uct_ep(sender().worker(), discard_ep, - UCT_FLUSH_FLAG_LOCAL, + + UCS_ASYNC_BLOCK(&sender().worker()->async); + ucp_worker_iface_progress_ep(wiface); + ucp_worker_discard_uct_ep(ucp_ep, discard_ep, UCT_FLUSH_FLAG_LOCAL, ep_pending_purge_count_reqs_cb, - &purged_reqs_count); + &purged_reqs_count, discarded_cb, + static_cast(&discarded_count)); + UCS_ASYNC_UNBLOCK(&sender().worker()->async); if (ep_pending_purge_func == (void*)ep_pending_purge_func_iter_reqs) { EXPECT_EQ(m_pending_purge_reqs_count, purged_reqs_count); @@ -159,14 +192,12 @@ class test_ucp_worker_discard : public ucp_test { } if (!wait_for_comp) { - /* destroy sender's entity here to have an access to the valid - * pointers */ - sender().cleanup(); - return; + /* to not do flush_worker() before sender's entity destroy */ + sender().add_err(UCS_ERR_ENDPOINT_TIMEOUT); + goto out; } - void *flush_req = sender().flush_worker_nb(0); - + flush_req = sender().flush_worker_nb(0); ASSERT_FALSE(flush_req == NULL); ASSERT_TRUE(UCS_PTR_IS_PTR(flush_req)); @@ -177,7 +208,7 @@ class test_ucp_worker_discard : public ucp_test { uct_completion_t *comp = m_flush_comps.back(); m_flush_comps.pop_back(); - uct_invoke_completion(comp, UCS_OK); + uct_invoke_completion(comp, ep_flush_comp_status); } if (!m_pending_reqs.empty()) { @@ -193,8 +224,15 @@ class test_ucp_worker_discard : public ucp_test { } while (ucp_request_check_status(flush_req) == UCS_INPROGRESS); EXPECT_UCS_OK(ucp_request_check_status(flush_req)); - EXPECT_EQ(m_created_ep_count, m_destroyed_ep_count); + ucp_request_release(flush_req); + + if (ep_flush_comp_status != UCS_ERR_CANCELED) { + EXPECT_EQ(m_created_ep_count, m_destroyed_ep_count); + } EXPECT_EQ(m_created_ep_count, total_ep_count); + /* discarded_cb is called only for UCT EPs passed to + * ucp_worker_discard_uct_ep() */ + EXPECT_EQ(ep_count, discarded_count); for (unsigned i = 0; i < m_created_ep_count; i++) { ep_test_info_t &test_info = ep_test_info_get(&eps[i]); @@ -216,16 +254,35 @@ class test_ucp_worker_discard : public ucp_test { EXPECT_TRUE(m_flush_comps.empty()); EXPECT_TRUE(m_pending_reqs.empty()); - ucp_request_release(flush_req); - /* check that uct_ep_destroy() was called for the all EPs that * were created in the test */ for (unsigned i = 0; i < created_wireup_aux_ep_count; i++) { EXPECT_EQ(NULL, eps[i].iface); } + + EXPECT_EQ(1u, ucp_ep->refcount); + +out: + disconnect(sender()); + sender().cleanup(); + EXPECT_EQ(m_created_ep_count, m_destroyed_ep_count); } - static void ep_destroy_func(uct_ep_h ep) { + static void ep_destroy_func(uct_ep_h ep) + { + for (std::vector::iterator iter = m_flush_comps.begin(); + iter != m_flush_comps.end(); ++iter) { + ucp_request_t *req = ucs_container_of(*iter, ucp_request_t, + send.state.uct_comp); + if (req->send.discard_uct_ep.uct_ep == ep) { + /* When UCT endpoint is destroyed, all its outstanding + * operations are completed with status UCS_ERR_CANCELED */ + uct_invoke_completion(&req->send.state.uct_comp, UCS_ERR_CANCELED); + m_flush_comps.erase(iter); + break; + } + } + ep->iface = NULL; m_destroyed_ep_count++; } @@ -349,97 +406,200 @@ std::vector test_ucp_worker_discard::m_pending_r test_ucp_worker_discard::ep_test_info_map_t test_ucp_worker_discard::m_ep_test_info_map; -UCS_TEST_P(test_ucp_worker_discard, flush_ok) { +UCS_TEST_P(test_ucp_worker_discard, flush_ok) +{ test_worker_discard((void*)ucs_empty_function_return_success /* ep_flush */, (void*)ucs_empty_function_do_assert /* ep_pending_add */, (void*)ucs_empty_function /* ep_pending_purge */); } 
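(The discard tests above never touch real hardware: test_worker_discard() builds a fake uct_iface_t whose ops table is filled with the stub functions passed in. Below is a minimal sketch of that wiring; the stub names are illustrative stand-ins for the ucs_empty_function_* helpers the tests actually cast in, and only the entry points relevant here are set.)

    #include <uct/api/uct.h>

    /* Illustrative stubs only; the real tests cast ucs_empty_function_*
     * helpers into these slots (see test_worker_discard() above) */
    static ucs_status_t stub_ep_flush_ok(uct_ep_h ep, unsigned flags,
                                         uct_completion_t *comp)
    {
        return UCS_OK; /* flush "completes" synchronously */
    }

    static ucs_status_t stub_ep_pending_add(uct_ep_h ep,
                                            uct_pending_req_t *req,
                                            unsigned flags)
    {
        return UCS_ERR_BUSY; /* refuse to queue the pending request */
    }

    static void stub_ep_pending_purge(uct_ep_h ep,
                                      uct_pending_purge_callback_t cb,
                                      void *arg)
    {
        /* nothing was queued, so nothing to purge */
    }

    static void make_fake_iface(uct_iface_t *iface)
    {
        uct_iface_ops_t ops = {0};

        ops.ep_flush         = stub_ep_flush_ok;
        ops.ep_pending_add   = stub_ep_pending_add;
        ops.ep_pending_purge = stub_ep_pending_purge;
        iface->ops           = ops;
    }

(test_worker_discard() then swaps in the specific behavior under test, e.g. a flush that returns UCS_ERR_NO_RESOURCE several times before UCS_OK, as in the *_no_resource_* variants below.)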
-UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_ok) { +UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_ok) +{ test_worker_discard((void*)ucs_empty_function_return_success /* ep_flush */, (void*)ucs_empty_function_do_assert /* ep_pending_add */, (void*)ucs_empty_function /* ep_pending_purge */, + UCS_OK /* ep_flush_comp_status */, true /* wait for the completion */, 8 /* UCT EP count */, 6 /* WIREUP EP count */, 3 /* WIREUP AUX EP count */); } -UCS_TEST_P(test_ucp_worker_discard, flush_ok_pending_purge) { +UCS_TEST_P(test_ucp_worker_discard, flush_ok_pending_purge) +{ test_worker_discard((void*)ucs_empty_function_return_success /* ep_flush */, (void*)ep_pending_add_save_req /* ep_pending_add */, (void*)ep_pending_purge_func_iter_reqs /* ep_pending_purge */); } -UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_ok_pending_purge) { +UCS_TEST_P(test_ucp_worker_discard, flush_ok_pending_purge_not_wait_comp) +{ test_worker_discard((void*)ucs_empty_function_return_success /* ep_flush */, (void*)ep_pending_add_save_req /* ep_pending_add */, (void*)ep_pending_purge_func_iter_reqs /* ep_pending_purge */, + UCS_OK /* ep_flush_comp_status */, + false /* don't wait for the completion */); +} + +UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_ok_pending_purge) +{ + test_worker_discard((void*)ucs_empty_function_return_success /* ep_flush */, + (void*)ep_pending_add_save_req /* ep_pending_add */, + (void*)ep_pending_purge_func_iter_reqs /* ep_pending_purge */, + UCS_OK /* ep_flush_comp_status */, true /* wait for the completion */, 8 /* UCT EP count */, 6 /* WIREUP EP count */, 3 /* WIREUP AUX EP count */); } -UCS_TEST_P(test_ucp_worker_discard, flush_in_progress) { +UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_ok_pending_purge_not_wait_comp) +{ + test_worker_discard((void*)ucs_empty_function_return_success /* ep_flush */, + (void*)ep_pending_add_save_req /* ep_pending_add */, + (void*)ep_pending_purge_func_iter_reqs /* ep_pending_purge */, + UCS_OK /* ep_flush_comp_status */, + false /* don't wait for the completion */, + 8 /* UCT EP count */, + 6 /* WIREUP EP count */, + 3 /* WIREUP AUX EP count */); +} + +UCS_TEST_P(test_ucp_worker_discard, flush_in_progress) +{ test_worker_discard((void*)ep_flush_func_return_in_progress /* ep_flush */, (void*)ucs_empty_function_do_assert /* ep_pending_add */, (void*)ucs_empty_function /* ep_pending_purge */); } -UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_in_progress) { +UCS_TEST_P(test_ucp_worker_discard, flush_in_progress_return_canceled) +{ test_worker_discard((void*)ep_flush_func_return_in_progress /* ep_flush */, (void*)ucs_empty_function_do_assert /* ep_pending_add */, (void*)ucs_empty_function /* ep_pending_purge */, + UCS_ERR_CANCELED /* ep_flush_comp_status */); +} + + +UCS_TEST_P(test_ucp_worker_discard, flush_in_progress_not_wait_comp) +{ + test_worker_discard((void*)ep_flush_func_return_in_progress /* ep_flush */, + (void*)ucs_empty_function_do_assert /* ep_pending_add */, + (void*)ucs_empty_function /* ep_pending_purge */, + UCS_OK /* ep_flush_comp_status */, + false /* don't wait for the completion */); +} + +UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_in_progress) +{ + test_worker_discard((void*)ep_flush_func_return_in_progress /* ep_flush */, + (void*)ucs_empty_function_do_assert /* ep_pending_add */, + (void*)ucs_empty_function /* ep_pending_purge */, + UCS_OK /* ep_flush_comp_status */, + true /* wait for the completion */, + 8 /* UCT EP count */, + 6 /* WIREUP EP count */, + 3 /* WIREUP AUX EP count */); +} + 
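(The *_in_progress variants depend on a flush stub that parks the completion instead of finishing it; the test body later fires every parked completion with the ep_flush_comp_status under test, UCS_OK or UCS_ERR_CANCELED, via uct_invoke_completion(), the internal helper the tests themselves call. A hedged sketch of that pattern, with a hypothetical pending_comps vector standing in for the test's m_flush_comps:)

    #include <uct/api/uct.h>
    #include <vector>

    static std::vector<uct_completion_t*> pending_comps; /* hypothetical store */

    static ucs_status_t stub_ep_flush_in_progress(uct_ep_h ep, unsigned flags,
                                                  uct_completion_t *comp)
    {
        pending_comps.push_back(comp); /* park the completion for later */
        return UCS_INPROGRESS;
    }

    /* called later from the test body to complete the parked flushes with
     * the status being tested */
    static void fire_parked_completions(ucs_status_t comp_status)
    {
        while (!pending_comps.empty()) {
            uct_completion_t *comp = pending_comps.back();
            pending_comps.pop_back();
            uct_invoke_completion(comp, comp_status);
        }
    }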
+UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_in_progress_return_canceled) +{ + test_worker_discard((void*)ep_flush_func_return_in_progress /* ep_flush */, + (void*)ucs_empty_function_do_assert /* ep_pending_add */, + (void*)ucs_empty_function /* ep_pending_purge */, + UCS_ERR_CANCELED /* ep_flush_comp_status */, true /* wait for the completion */, 8 /* UCT EP count */, 6 /* WIREUP EP count */, 3 /* WIREUP AUX EP count */); } -UCS_TEST_P(test_ucp_worker_discard, flush_no_resource_pending_add_busy) { +UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_in_progress_not_wait_comp) +{ + test_worker_discard((void*)ep_flush_func_return_in_progress /* ep_flush */, + (void*)ucs_empty_function_do_assert /* ep_pending_add */, + (void*)ucs_empty_function /* ep_pending_purge */, + UCS_OK /* ep_flush_comp_status */, + false /* don't wait for the completion */, + 8 /* UCT EP count */, + 6 /* WIREUP EP count */, + 3 /* WIREUP AUX EP count */); +} + +UCS_TEST_P(test_ucp_worker_discard, flush_no_resource_pending_add_busy) +{ test_worker_discard((void*)ep_flush_func_return_3_no_resource_then_ok /* ep_flush */, (void*)ucs_empty_function_return_busy /* ep_pending_add */, (void*)ucs_empty_function /* ep_pending_purge */); } -UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_no_resource_pending_add_busy) { +UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_no_resource_pending_add_busy) +{ test_worker_discard((void*)ep_flush_func_return_3_no_resource_then_ok /* ep_flush */, (void*)ucs_empty_function_return_busy /* ep_pending_add */, (void*)ucs_empty_function /* ep_pending_purge */, + UCS_OK /* ep_flush_comp_status */, true /* wait for the completion */, 8 /* UCT EP count */, 6 /* WIREUP EP count */, 3 /* WIREUP AUX EP count */); } -UCS_TEST_P(test_ucp_worker_discard, flush_no_resource_pending_add_ok_then_busy) { +UCS_TEST_P(test_ucp_worker_discard, flush_no_resource_pending_add_ok_then_busy) +{ test_worker_discard((void*)ep_flush_func_return_3_no_resource_then_ok /* ep_flush */, (void*)ep_pending_add_func_return_ok_then_busy /* ep_pending_add */, (void*)ucs_empty_function /* ep_pending_purge */); } -UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_no_resource_pending_add_ok_then_busy) { +UCS_TEST_P(test_ucp_worker_discard, flush_no_resource_pending_add_ok_then_busy_not_wait_comp) +{ + test_worker_discard((void*)ep_flush_func_return_3_no_resource_then_ok /* ep_flush */, + (void*)ep_pending_add_save_req /* ep_pending_add */, + (void*)ep_pending_purge_func_iter_reqs /* ep_pending_purge */, + UCS_OK /* ep_flush_comp_status */, + false /* don't wait for the completion */); +} + +UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_no_resource_pending_add_ok_then_busy) +{ test_worker_discard((void*)ep_flush_func_return_3_no_resource_then_ok /* ep_flush */, (void*)ep_pending_add_func_return_ok_then_busy /* ep_pending_add */, (void*)ucs_empty_function /* ep_pending_purge */, + UCS_OK /* ep_flush_comp_status */, true /* wait for the completion */, 8 /* UCT EP count */, 6 /* WIREUP EP count */, 3 /* WIREUP AUX EP count */); } -UCS_TEST_P(test_ucp_worker_discard, flush_ok_not_wait_comp) { +UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_no_resource_pending_add_ok_then_busy_not_wait_comp) +{ + test_worker_discard((void*)ep_flush_func_return_3_no_resource_then_ok /* ep_flush */, + (void*)ep_pending_add_save_req /* ep_pending_add */, + (void*)ep_pending_purge_func_iter_reqs /* ep_pending_purge */, + UCS_OK /* ep_flush_comp_status */, + false /* don't wait for the completion */, + 8 /* UCT EP count */, + 6 /* 
WIREUP EP count */, + 3 /* WIREUP AUX EP count */); +} + +UCS_TEST_P(test_ucp_worker_discard, flush_ok_not_wait_comp) +{ test_worker_discard((void*)ucs_empty_function_return_success /* ep_flush */, (void*)ucs_empty_function_do_assert /* ep_pending_add */, (void*)ucs_empty_function /* ep_pending_purge */, + UCS_OK /* ep_flush_comp_status */, false /* don't wait for the completion */); } -UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_ok_not_wait_comp) { +UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_ok_not_wait_comp) +{ test_worker_discard((void*)ucs_empty_function_return_success /* ep_flush */, (void*)ucs_empty_function_do_assert /* ep_pending_add */, (void*)ucs_empty_function /* ep_pending_purge */, + UCS_OK /* ep_flush_comp_status */, false /* don't wait for the completion */, 8 /* UCT EP count */, 6 /* WIREUP EP count */, @@ -447,3 +607,129 @@ UCS_TEST_P(test_ucp_worker_discard, wireup_ep_flush_ok_not_wait_comp) { } UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_worker_discard, all, "all") + + +class test_ucp_worker_request_leak : public ucp_test { +public: + enum { + LEAK_CHECK, + LEAK_IGNORE + }; + + static void get_test_variants(std::vector<ucp_test_variant> &variants) + { + add_variant_with_value(variants, UCP_FEATURE_TAG, LEAK_CHECK, + "leak_check"); + add_variant_with_value(variants, UCP_FEATURE_TAG, LEAK_IGNORE, + "leak_ignore"); + } + + bool ignore_leak() + { + return get_variant_value(0) == LEAK_IGNORE; + } + + /// @override + virtual ucp_worker_params_t get_worker_params() + { + ucp_worker_params_t params = ucp_test::get_worker_params(); + if (ignore_leak()) { + params.field_mask |= UCP_WORKER_PARAM_FIELD_FLAGS; + params.flags |= UCP_WORKER_FLAG_IGNORE_REQUEST_LEAK; + } + return params; + } + + /// @override + virtual void init() + { + ucp_test::init(); + sender().connect(&receiver(), get_ep_params()); + } + + /// @override + virtual void cleanup() + { + if (ignore_leak()) { + // Should not have warnings if leak check is off + ucp_test::cleanup(); + } else { + scoped_log_handler wrap_warn(wrap_warns_logger); + ucp_test::cleanup(); + check_leak_warnings(); // Leak check is enabled - expect warnings + } + } + +private: + void check_leak_warnings() + { + EXPECT_EQ(2u, m_warnings.size()); + for (size_t i = 0; i < m_warnings.size(); ++i) { + std::string::size_type pos = m_warnings[i].find( + "not returned to mpool ucp_requests"); + EXPECT_NE(std::string::npos, pos); + } + } +}; + +UCS_TEST_P(test_ucp_worker_request_leak, tag_send_recv) +{ + ucp_request_param_t param; + param.op_attr_mask = UCP_OP_ATTR_FLAG_NO_IMM_CMPL; + void *sreq = ucp_tag_send_nbx(sender().ep(), NULL, 0, 0, &param); + ASSERT_TRUE(UCS_PTR_IS_PTR(sreq)); + + void *rreq = ucp_tag_recv_nbx(receiver().worker(), NULL, 0, 0, 0, &param); + ASSERT_TRUE(UCS_PTR_IS_PTR(rreq)); + + UCS_TEST_MESSAGE << "send req: " << sreq << ", recv req: " << rreq; + while ((ucp_request_check_status(sreq) != UCS_OK) || + (ucp_request_check_status(rreq) != UCS_OK)) { + progress(); + } + + // Exit the test without releasing the requests +} + +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_worker_request_leak, all, "all") + +class test_ucp_worker_thread_mode : public ucp_test { +public: + static void get_test_variants(std::vector<ucp_test_variant> &variants) + { + add_variant_with_value(variants, UCP_FEATURE_TAG, + UCS_THREAD_MODE_SINGLE, "single"); + add_variant_with_value(variants, UCP_FEATURE_TAG, + UCS_THREAD_MODE_SERIALIZED, "serialized"); + add_variant_with_value(variants, UCP_FEATURE_TAG, UCS_THREAD_MODE_MULTI, + "multi"); + } + + /// @override + virtual ucp_worker_params_t
get_worker_params() + { + ucp_worker_params_t params = ucp_test::get_worker_params(); + + params.field_mask |= UCP_WORKER_PARAM_FIELD_THREAD_MODE; + params.thread_mode = thread_mode(); + return params; + } + +protected: + ucs_thread_mode_t thread_mode() const + { + return static_cast(get_variant_value(0)); + } +}; + +UCS_TEST_P(test_ucp_worker_thread_mode, query) +{ + ucp_worker_attr_t worker_attr = {}; + + worker_attr.field_mask = UCP_WORKER_ATTR_FIELD_THREAD_MODE; + ucs_status_t status = ucp_worker_query(sender().worker(), &worker_attr); + ASSERT_EQ(UCS_OK, status); + EXPECT_EQ(thread_mode(), worker_attr.thread_mode); +} + +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_worker_thread_mode, all, "all") diff --git a/test/gtest/ucp/ucp_test.cc b/test/gtest/ucp/ucp_test.cc index 9346cddc642..3ad6c289078 100644 --- a/test/gtest/ucp/ucp_test.cc +++ b/test/gtest/ucp/ucp_test.cc @@ -177,6 +177,17 @@ void ucp_test::flush_worker(const entity &e, int worker_index) request_wait(request, worker_index); } +void ucp_test::flush_workers() +{ + for (ucs::ptr_vector::const_iterator iter = entities().begin(); + iter != entities().end(); ++iter) { + const entity &e = **iter; + for (int i = 0; i < e.get_num_workers(); i++) { + flush_worker(e, i); + } + } +} + void ucp_test::disconnect(entity& e) { bool has_failed_entity = false; for (ucs::ptr_vector::const_iterator iter = entities().begin(); @@ -240,6 +251,23 @@ ucs_status_t ucp_test::request_wait(void *req, int worker_index) return request_process(req, worker_index, true); } +ucs_status_t ucp_test::requests_wait(std::vector &reqs, + int worker_index) +{ + ucs_status_t ret_status = UCS_OK; + + while (!reqs.empty()) { + ucs_status_t status = request_wait(reqs.back(), worker_index); + if (ret_status == UCS_OK) { + // Save the first failure + ret_status = status; + } + reqs.pop_back(); + } + + return ret_status; +} + void ucp_test::request_release(void *req) { request_process(req, 0, false); @@ -253,6 +281,14 @@ int ucp_test::max_connections() { } } +void ucp_test::set_tl_timeouts(ucs::ptr_vector &env) +{ + /* Set small TL timeouts to reduce testing time */ + env.push_back(new ucs::scoped_setenv("UCX_RC_TIMEOUT", "10ms")); + env.push_back(new ucs::scoped_setenv("UCX_RC_RNR_TIMEOUT", "10ms")); + env.push_back(new ucs::scoped_setenv("UCX_RC_RETRY_COUNT", "2")); +} + void ucp_test::set_ucp_config(ucp_config_t *config, const std::string& tls) { ucs_status_t status; @@ -465,28 +501,31 @@ ucp_test_base::entity::entity(const ucp_test_param& test_param, ucp_config_t* ucp_config, const ucp_worker_params_t& worker_params, const ucp_test_base *test_owner) - : m_err_cntr(0), m_rejected_cntr(0) + : m_err_cntr(0), m_rejected_cntr(0), m_accept_err_cntr(0) { - ucp_test_variant entity_param = test_param.variant; + const int thread_type = test_param.variant.thread_type; + ucp_params_t local_ctx_params = test_param.variant.ctx_params; ucp_worker_params_t local_worker_params = worker_params; int num_workers; - if (entity_param.thread_type == MULTI_THREAD_CONTEXT) { - num_workers = MT_TEST_NUM_THREADS; - entity_param.ctx_params.mt_workers_shared = 1; - local_worker_params.thread_mode = UCS_THREAD_MODE_SINGLE; - } else if (entity_param.thread_type == MULTI_THREAD_WORKER) { - num_workers = 1; - entity_param.ctx_params.mt_workers_shared = 0; - local_worker_params.thread_mode = UCS_THREAD_MODE_MULTI; + if (thread_type == MULTI_THREAD_CONTEXT) { + /* Test multi-threading on context level, so create multiple workers + which share the context */ + num_workers = MT_TEST_NUM_THREADS; + 
local_ctx_params.field_mask |= UCP_PARAM_FIELD_MT_WORKERS_SHARED; + local_ctx_params.mt_workers_shared = 1; } else { + /* Test multi-threading on worker level, so create a single worker */ num_workers = 1; - entity_param.ctx_params.mt_workers_shared = 0; - local_worker_params.thread_mode = UCS_THREAD_MODE_SINGLE; } - entity_param.ctx_params.field_mask |= UCP_PARAM_FIELD_MT_WORKERS_SHARED; - local_worker_params.field_mask |= UCP_WORKER_PARAM_FIELD_THREAD_MODE; + /* Set thread mode according to variant.thread_type, unless it's already set + in worker_params */ + if (!(worker_params.field_mask & UCP_WORKER_PARAM_FIELD_THREAD_MODE) && + (thread_type == MULTI_THREAD_WORKER)) { + local_worker_params.thread_mode = UCS_THREAD_MODE_MULTI; + local_worker_params.field_mask |= UCP_WORKER_PARAM_FIELD_THREAD_MODE; + } /* Set transports configuration */ std::stringstream ss; @@ -496,7 +535,7 @@ { scoped_log_handler slh(hide_errors_logger); UCS_TEST_CREATE_HANDLE_IF_SUPPORTED(ucp_context_h, m_ucph, ucp_cleanup, - ucp_init, &entity_param.ctx_params, + ucp_init, &local_ctx_params, ucp_config); } @@ -578,9 +617,10 @@ bool ucp_test_base::entity::verify_client_address(struct sockaddr_storage return false; } -ucp_ep_h ucp_test_base::entity::accept(ucp_worker_h worker, - ucp_conn_request_h conn_request) +void ucp_test_base::entity::accept(int worker_index, + ucp_conn_request_h conn_request) { + ucp_worker_h ucp_worker = worker(worker_index); ucp_ep_params_t ep_params = *m_server_ep_params; ucp_conn_request_attr_t attr; ucs_status_t status; @@ -598,14 +638,17 @@ ucp_ep_h ucp_test_base::entity::accept(ucp_worker_h worker, ep_params.user_data = reinterpret_cast<void*>(this); ep_params.conn_request = conn_request; - status = ucp_ep_create(worker, &ep_params, &ep); + status = ucp_ep_create(ucp_worker, &ep_params, &ep); if (status == UCS_ERR_UNREACHABLE) { UCS_TEST_SKIP_R("Skipping due to an unreachable destination (unsupported " "feature or no supported transport to send partial " "worker address)"); + } else if (status != UCS_OK) { + ++m_accept_err_cntr; + return; } - ASSERT_UCS_OK(status); - return ep; + + set_ep(ep, worker_index, std::numeric_limits<int>::max()); } @@ -686,7 +729,7 @@ void *ucp_test_base::entity::disconnect_nb(int worker_index, int ep_index, return req; } - ASSERT_UCS_OK(UCS_PTR_STATUS(req)); + /* the close request can complete with any status, depending on the peer state */ return NULL; } @@ -697,7 +740,8 @@ void ucp_test_base::entity::close_ep_req_free(void *close_req) { ucs_status_t status = UCS_PTR_IS_ERR(close_req) ?
UCS_PTR_STATUS(close_req) : ucp_request_check_status(close_req); - ASSERT_NE(UCS_INPROGRESS, status) << "free not completed EP close request"; + ASSERT_NE(UCS_INPROGRESS, status) << "free not completed EP close request: " + << close_req; if (status != UCS_OK) { UCS_TEST_MESSAGE << "ucp_ep_close_nb completed with status " << ucs_status_string(status); @@ -837,8 +881,7 @@ unsigned ucp_test_base::entity::progress(int worker_index) if (!m_conn_reqs.empty()) { ucp_conn_request_h conn_req = m_conn_reqs.back(); m_conn_reqs.pop(); - ucp_ep_h ep = accept(ucp_worker, conn_req); - set_ep(ep, worker_index, std::numeric_limits::max()); + accept(worker_index, conn_req); ++progress_count; } @@ -873,6 +916,10 @@ const size_t &ucp_test_base::entity::get_err_num() const { return m_err_cntr; } +const size_t &ucp_test_base::entity::get_accept_err_num() const { + return m_accept_err_cntr; +} + void ucp_test_base::entity::warn_existing_eps() const { for (size_t worker_index = 0; worker_index < m_workers.size(); ++worker_index) { for (size_t ep_index = 0; ep_index < m_workers[worker_index].second.size(); @@ -949,6 +996,11 @@ bool ucp_test_base::entity::has_lane_with_caps(uint64_t caps) const return false; } +bool ucp_test_base::entity::is_conn_reqs_queue_empty() const +{ + return m_conn_reqs.empty(); +} + bool ucp_test_base::is_request_completed(void *request) { return (request == NULL) || (ucp_request_check_status(request) != UCS_INPROGRESS); @@ -1002,3 +1054,8 @@ ucp_mem_h ucp_test::mapped_buffer::memh() const { return m_memh; } + +void test_ucp_context::get_test_variants(std::vector &variants) +{ + add_variant(variants, UCP_FEATURE_TAG | UCP_FEATURE_WAKEUP); +} diff --git a/test/gtest/ucp/ucp_test.h b/test/gtest/ucp/ucp_test.h index 36f0f4811fc..1afdb961eb0 100644 --- a/test/gtest/ucp/ucp_test.h +++ b/test/gtest/ucp/ucp_test.h @@ -93,7 +93,7 @@ class ucp_test_base : public ucs::test_base { bool verify_client_address(struct sockaddr_storage *client_address); - ucp_ep_h accept(ucp_worker_h worker, ucp_conn_request_h conn_request); + void accept(int worker_index, ucp_conn_request_h conn_request); void* modify_ep(const ucp_ep_params_t& ep_params, int worker_idx = 0, int ep_idx = 0); @@ -141,6 +141,8 @@ class ucp_test_base : public ucs::test_base { const size_t &get_err_num() const; + const size_t &get_accept_err_num() const; + void warn_existing_eps() const; double set_ib_ud_timeout(double timeout_sec); @@ -151,6 +153,8 @@ class ucp_test_base : public ucs::test_base { bool has_lane_with_caps(uint64_t caps) const; + bool is_conn_reqs_queue_empty() const; + protected: ucs::handle m_ucph; worker_vec_t m_workers; @@ -159,6 +163,7 @@ class ucp_test_base : public ucs::test_base { close_ep_reqs_t m_close_ep_reqs; size_t m_err_cntr; size_t m_rejected_cntr; + size_t m_accept_err_cntr; ucs::handle m_server_ep_params; private: @@ -227,10 +232,13 @@ class ucp_test : public ucp_test_base, void short_progress_loop(int worker_index = 0) const; void flush_ep(const entity &e, int worker_index = 0, int ep_index = 0); void flush_worker(const entity &e, int worker_index = 0); + void flush_workers(); void disconnect(entity& entity); ucs_status_t request_wait(void *req, int worker_index = 0); + ucs_status_t requests_wait(std::vector &reqs, int worker_index = 0); void request_release(void *req); int max_connections(); + void set_tl_timeouts(ucs::ptr_vector &env); // Add test variant without values, with given context params static ucp_test_variant& @@ -297,6 +305,16 @@ class ucp_test : public ucp_test_base, } } + template + void 
wait_for_value(volatile T *var, T value, double timeout = 10.0) const + { + ucs_time_t deadline = ucs_get_time() + + ucs_time_from_sec(timeout) * ucs::test_time_multiplier(); + while ((ucs_get_time() < deadline) && (*var != value)) { + short_progress_loop(); + } + } + static const ucp_datatype_t DATATYPE; static const ucp_datatype_t DATATYPE_IOV; @@ -319,6 +337,12 @@ class ucp_test : public ucp_test_base, }; +class test_ucp_context : public ucp_test { +public: + static void get_test_variants(std::vector &variants); +}; + + std::ostream& operator<<(std::ostream& os, const ucp_test_param& test_param); template @@ -335,13 +359,26 @@ std::vector enum_test_params(const std::string& tls) * * @param _test_case Test case class, derived from ucp_test. * @param _name Instantiation name. - * @param ... Transport names. + * @param _tls Transport names. */ #define UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, _name, _tls) \ INSTANTIATE_TEST_CASE_P(_name, _test_case, \ testing::ValuesIn(enum_test_params<_test_case>(_tls))); +/** + * Instantiate the parameterized test case a combination of transports with GPU + * awareness. + * + * @param _test_case Test case class, derived from ucp_test. + * @param _name Instantiation name. + * @param _tls Transport names. + */ +#define UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, _name, _tls) \ + UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, _name, \ + _tls "," UCP_TEST_GPU_COPY_TLS) + + /** * Instantiate the parameterized test case for all transport combinations. * @@ -372,15 +409,23 @@ std::vector enum_test_params(const std::string& tls) * @param _test_case Test case class, derived from ucp_test. */ #define UCP_INSTANTIATE_TEST_CASE_GPU_AWARE(_test_case) \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, dcx, "dc_x," UCP_TEST_GPU_COPY_TLS) \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, ud, "ud_v," UCP_TEST_GPU_COPY_TLS) \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, udx, "ud_x," UCP_TEST_GPU_COPY_TLS) \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, rc, "rc_v," UCP_TEST_GPU_COPY_TLS) \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, rcx, "rc_x," UCP_TEST_GPU_COPY_TLS) \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, shm_ib, "shm,ib," UCP_TEST_GPU_COPY_TLS) \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, shm_ib_ipc, "shm,ib,cuda_ipc,rocm_ipc," \ - UCP_TEST_GPU_COPY_TLS) \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, ugni, "ugni," UCP_TEST_GPU_COPY_TLS) \ - UCP_INSTANTIATE_TEST_CASE_TLS(_test_case, tcp, "tcp," UCP_TEST_GPU_COPY_TLS) + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, dcx, \ + "dc_x") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, ud, \ + "ud_v") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, udx, \ + "ud_x") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, rc, \ + "rc_v") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, rcx, \ + "rc_x") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, shm_ib, \ + "shm,ib") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, shm_ib_ipc, \ + "shm,ib,cuda_ipc,rocm_ipc") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, ugni, \ + "ugni") \ + UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(_test_case, tcp, \ + "tcp") #endif diff --git a/test/gtest/ucs/test_async.cc b/test/gtest/ucs/test_async.cc index 1c89b744148..7984dea0d52 100644 --- a/test/gtest/ucs/test_async.cc +++ b/test/gtest/ucs/test_async.cc @@ -201,6 +201,10 @@ class local : public async_poll { UCS_ASYNC_UNBLOCK(&m_async); } + bool is_blocked() const { + return ucs_async_is_blocked(&m_async); + } + void check_miss() { 
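        /* dispatch handlers for any events that may have been missed while
         * the async context was blocked */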
ucs_async_check_miss(&m_async); } @@ -293,6 +297,12 @@ public ucs::test_base { << " retries"; } + void check_is_blocked(const local *le, bool expected) + { +#if UCS_ENABLE_ASSERT + EXPECT_EQ(expected, le->is_blocked()); +#endif + } }; template @@ -313,11 +323,14 @@ class test_async_mt : public test_async { int thread_run(unsigned index) { LOCAL* le; m_ev[index] = le = new LOCAL(GetParam()); + + check_is_blocked(le, false); barrier(); while (!m_stop[index]) { le->block(); + check_is_blocked(le, true); unsigned before = le->count(); suspend_and_poll(le, 1.0); unsigned after = le->count(); @@ -328,6 +341,8 @@ class test_async_mt : public test_async { suspend_and_poll(le, 1.0); } + check_is_blocked(le, false); + int result = le->count(); delete le; m_ev[index] = NULL; @@ -359,6 +374,26 @@ class test_async_mt : public test_async { return m_thread_counts[thread]; } + void is_blocked_test() + { + spawn(); + suspend(); + + for (unsigned i = 0; i < NUM_THREADS; ++i) { + LOCAL *le = m_ev[i]; + + EXPECT_FALSE(le->is_blocked()); + le->block(); + { + EXPECT_TRUE(le->is_blocked()); + } + le->unblock(); + EXPECT_FALSE(le->is_blocked()); + } + + stop(); + } + private: void barrier() { pthread_barrier_wait(&m_barrier); @@ -603,6 +638,32 @@ UCS_TEST_P(test_async, warn_block) { } } +UCS_TEST_P(test_async, check_blocks) { + local_event le(GetParam()); + + check_is_blocked(&le, false); + + le.block(); + { + check_is_blocked(&le, true); + le.block(); + { + check_is_blocked(&le, true); + le.block(); + { + check_is_blocked(&le, true); + } + le.unblock(); + check_is_blocked(&le, true); + } + le.unblock(); + check_is_blocked(&le, true); + } + le.unblock(); + + check_is_blocked(&le, false); +} + class local_timer_long_handler : public local_timer { public: local_timer_long_handler(ucs_async_mode_t mode, int sleep_usec) : @@ -786,6 +847,15 @@ UCS_TEST_SKIP_COND_P(test_async_event_mt, multithread, EXPECT_GE(min_count, exp_min_count); } +UCS_TEST_SKIP_COND_P(test_async_event_mt, check_blocks_multithread, + // This test blocks async in two threads simultaneously - + // poll_block and signal don't allow it + (GetParam() == UCS_ASYNC_MODE_POLL) || + (GetParam() == UCS_ASYNC_MODE_SIGNAL)) +{ + is_blocked_test(); +} + UCS_TEST_P(test_async_timer_mt, multithread) { const int exp_min_count = (int)(COUNT * 0.10); int min_count = 0; diff --git a/test/gtest/ucs/test_bitmap.cc b/test/gtest/ucs/test_bitmap.cc new file mode 100644 index 00000000000..be7a6e8ef09 --- /dev/null +++ b/test/gtest/ucs/test_bitmap.cc @@ -0,0 +1,291 @@ +/** + * Copyright (C) Mellanox Technologies Ltd. 2020. ALL RIGHTS RESERVED. + * + * See file LICENSE for terms. 
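/*
 * Editorial sketch: the check_blocks test above relies on
 * ucs_async_is_blocked() staying true across nested block/unblock pairs, and
 * becoming false only after the outermost unblock. A minimal model of that
 * contract with a recursion counter; this is illustrative only -- the real
 * async context tracks this inside its lock, and the query is compiled in
 * only under UCS_ENABLE_ASSERT.
 */
#include <cassert>
#include <pthread.h>

class async_blocker {
public:
    async_blocker() : m_depth(0) { pthread_mutex_init(&m_lock, NULL); }
    ~async_blocker() { pthread_mutex_destroy(&m_lock); }

    void block() {            /* like UCS_ASYNC_BLOCK */
        pthread_mutex_lock(&m_lock);
        ++m_depth;
        pthread_mutex_unlock(&m_lock);
    }
    void unblock() {          /* like UCS_ASYNC_UNBLOCK */
        pthread_mutex_lock(&m_lock);
        assert(m_depth > 0);
        --m_depth;
        pthread_mutex_unlock(&m_lock);
    }
    bool is_blocked() const { /* true until the outermost unblock */
        return m_depth > 0;
    }

private:
    pthread_mutex_t m_lock;
    int             m_depth;
};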
+ */ + +#include +#include + +class test_ucs_bitmap : public ucs::test { +public: + virtual void init() + { + UCS_BITMAP_CLEAR(&bitmap); + } + +protected: + void copy_bitmap(ucs_bitmap_t(128) *bitmap, uint64_t *dest) + { + int i; + + UCS_BITMAP_FOR_EACH_BIT(*bitmap, i) { + dest[UCS_BITMAP_WORD_INDEX(*bitmap, i)] |= UCS_BIT( + i % UCS_BITMAP_BITS_IN_WORD); + } + } + +protected: + ucs_bitmap_t(128) bitmap; +}; + +void test_set_get_unset(ucs_bitmap_t(128) *bitmap, uint64_t offset) +{ + UCS_BITMAP_SET(*bitmap, offset); + EXPECT_EQ(UCS_BITMAP_GET(*bitmap, offset), 1); + EXPECT_EQ(bitmap->bits[offset >= UCS_BITMAP_BITS_IN_WORD], UCS_BIT(offset % 64)); + EXPECT_EQ(bitmap->bits[offset < UCS_BITMAP_BITS_IN_WORD], 0); + + UCS_BITMAP_UNSET(*bitmap, offset); + EXPECT_EQ(bitmap->bits[0], 0); + EXPECT_EQ(bitmap->bits[1], 0); + EXPECT_EQ(UCS_BITMAP_GET(*bitmap, offset), 0); +} + +UCS_TEST_F(test_ucs_bitmap, test_popcount) { + int popcount = UCS_BITMAP_POPCOUNT(bitmap); + EXPECT_EQ(popcount, 0); + UCS_BITMAP_SET(bitmap, 12); + UCS_BITMAP_SET(bitmap, 53); + UCS_BITMAP_SET(bitmap, 71); + UCS_BITMAP_SET(bitmap, 110); + popcount = UCS_BITMAP_POPCOUNT(bitmap); + EXPECT_EQ(popcount, 4); +} + +UCS_TEST_F(test_ucs_bitmap, test_popcount_upto_index) { + int popcount; + UCS_BITMAP_SET(bitmap, 17); + UCS_BITMAP_SET(bitmap, 71); + UCS_BITMAP_SET(bitmap, 121); + popcount = UCS_BITMAP_POPCOUNT_UPTO_INDEX(bitmap, 110); + EXPECT_EQ(popcount, 2); + + popcount = UCS_BITMAP_POPCOUNT_UPTO_INDEX(bitmap, 20); + EXPECT_EQ(popcount, 1); +} + +UCS_TEST_F(test_ucs_bitmap, test_mask) { + /* coverity[unsigned_compare] */ + UCS_BITMAP_MASK(&bitmap, 0); + EXPECT_EQ(UCS_BITMAP_IS_ZERO_INPLACE(&bitmap), true); + UCS_BITMAP_SET(bitmap, 64 + 42); + UCS_BITMAP_MASK(&bitmap, 64 + 42); + + EXPECT_EQ(bitmap.bits[0], UINT64_MAX); + EXPECT_EQ(bitmap.bits[1], (1ul << 42) - 1); +} + +UCS_TEST_F(test_ucs_bitmap, test_set_all) { + UCS_BITMAP_SET_ALL(bitmap); + EXPECT_EQ(bitmap.bits[0], UINT64_MAX); + EXPECT_EQ(bitmap.bits[1], UINT64_MAX); +} + +UCS_TEST_F(test_ucs_bitmap, test_ffs) { + size_t bit_index; + + bit_index = UCS_BITMAP_FFS(bitmap); + EXPECT_EQ(bit_index, 128); + + UCS_BITMAP_SET(bitmap, 90); + UCS_BITMAP_SET(bitmap, 100); + bit_index = UCS_BITMAP_FFS(bitmap); + EXPECT_EQ(bit_index, 90); + + UCS_BITMAP_CLEAR(&bitmap); + UCS_BITMAP_SET(bitmap, 0); + bit_index = UCS_BITMAP_FFS(bitmap); + EXPECT_EQ(bit_index, 0); + + UCS_BITMAP_CLEAR(&bitmap); + UCS_BITMAP_SET(bitmap, 64); + bit_index = UCS_BITMAP_FFS(bitmap); + EXPECT_EQ(bit_index, 64); +} + +UCS_TEST_F(test_ucs_bitmap, test_is_zero) { + EXPECT_TRUE(UCS_BITMAP_IS_ZERO_INPLACE(&bitmap)); + + UCS_BITMAP_SET(bitmap, 71); + EXPECT_FALSE(UCS_BITMAP_IS_ZERO_INPLACE(&bitmap)); +} + +UCS_TEST_F(test_ucs_bitmap, test_get_set_clear) +{ + const uint64_t offset = 15; + + EXPECT_EQ(bitmap.bits[0], 0); + EXPECT_EQ(bitmap.bits[1], 0); + EXPECT_EQ(UCS_BITMAP_GET(bitmap, offset), 0); + + test_set_get_unset(&bitmap, offset); + test_set_get_unset(&bitmap, offset + 64); + + UCS_BITMAP_CLEAR(&bitmap); + for (int i = 0; i < 128; i++) { + EXPECT_EQ(UCS_BITMAP_GET(bitmap, i), 0); + } +} + +UCS_TEST_F(test_ucs_bitmap, test_foreach) +{ + uint64_t bitmap_words[2] = {}; + + UCS_BITMAP_SET(bitmap, 1); + UCS_BITMAP_SET(bitmap, 25); + UCS_BITMAP_SET(bitmap, 61); + + UCS_BITMAP_SET(bitmap, UCS_BITMAP_BITS_IN_WORD + 0); + UCS_BITMAP_SET(bitmap, UCS_BITMAP_BITS_IN_WORD + 37); + UCS_BITMAP_SET(bitmap, UCS_BITMAP_BITS_IN_WORD + 58); + + copy_bitmap(&bitmap, bitmap_words); + + EXPECT_EQ(bitmap_words[0], UCS_BIT(1) | UCS_BIT(25) 
| UCS_BIT(61)); + EXPECT_EQ(bitmap_words[1], UCS_BIT(0) | UCS_BIT(37) | UCS_BIT(58)); +} + +UCS_TEST_F(test_ucs_bitmap, test_not) +{ + ucs_bitmap_t(128) bitmap2; + + UCS_BITMAP_SET(bitmap, 1); + bitmap2 = UCS_BITMAP_NOT(bitmap, 128); + UCS_BITMAP_NOT_INPLACE(&bitmap); + + EXPECT_EQ(bitmap.bits[0], -3ull); + EXPECT_EQ(bitmap.bits[1], UINT64_MAX); + EXPECT_EQ(bitmap2.bits[0], -3ull); + EXPECT_EQ(bitmap2.bits[1], UINT64_MAX); +} + +UCS_TEST_F(test_ucs_bitmap, test_and) +{ + ucs_bitmap_t(128) bitmap2, bitmap3; + + UCS_BITMAP_CLEAR(&bitmap2); + UCS_BITMAP_SET(bitmap, 1); + UCS_BITMAP_SET(bitmap, UCS_BITMAP_BITS_IN_WORD + 1); + UCS_BITMAP_SET(bitmap, UCS_BITMAP_BITS_IN_WORD + 16); + + UCS_BITMAP_SET(bitmap2, 25); + UCS_BITMAP_SET(bitmap2, UCS_BITMAP_BITS_IN_WORD + 1); + UCS_BITMAP_SET(bitmap2, UCS_BITMAP_BITS_IN_WORD + 30); + bitmap3 = UCS_BITMAP_AND(bitmap, bitmap2, 128); + UCS_BITMAP_AND_INPLACE(&bitmap, bitmap2); + + EXPECT_EQ(bitmap.bits[0], 0); + EXPECT_EQ(bitmap.bits[1], UCS_BIT(1)); + EXPECT_EQ(bitmap3.bits[0], 0); + EXPECT_EQ(bitmap3.bits[1], UCS_BIT(1)); +} + +UCS_TEST_F(test_ucs_bitmap, test_or) +{ + ucs_bitmap_t(128) bitmap2, bitmap3; + + UCS_BITMAP_CLEAR(&bitmap2); + UCS_BITMAP_SET(bitmap, 1); + UCS_BITMAP_SET(bitmap, UCS_BITMAP_BITS_IN_WORD + 1); + UCS_BITMAP_SET(bitmap, UCS_BITMAP_BITS_IN_WORD + 16); + + UCS_BITMAP_SET(bitmap2, 25); + UCS_BITMAP_SET(bitmap2, UCS_BITMAP_BITS_IN_WORD + 1); + UCS_BITMAP_SET(bitmap2, UCS_BITMAP_BITS_IN_WORD + 30); + bitmap3 = UCS_BITMAP_OR(bitmap, bitmap2, 128); + UCS_BITMAP_OR_INPLACE(&bitmap, bitmap2); + + EXPECT_EQ(bitmap.bits[0], UCS_BIT(1) | UCS_BIT(25)); + EXPECT_EQ(bitmap.bits[1], UCS_BIT(1) | UCS_BIT(16) | UCS_BIT(30)); + EXPECT_EQ(bitmap3.bits[0], UCS_BIT(1) | UCS_BIT(25)); + EXPECT_EQ(bitmap3.bits[1], UCS_BIT(1) | UCS_BIT(16) | UCS_BIT(30)); +} + + +UCS_TEST_F(test_ucs_bitmap, test_xor) +{ + ucs_bitmap_t(128) bitmap2 = UCS_BITMAP_ZERO, bitmap3 = UCS_BITMAP_ZERO; + + bitmap.bits[0] = 1; + bitmap.bits[1] = UINT64_MAX; + bitmap2.bits[0] = UINT64_MAX; + bitmap2.bits[1] = 1; + bitmap3 = UCS_BITMAP_XOR(bitmap, bitmap2, 128); + UCS_BITMAP_XOR_INPLACE(&bitmap, bitmap2); + + EXPECT_EQ(bitmap.bits[0], -2); + EXPECT_EQ(bitmap.bits[1], -2); + EXPECT_EQ(bitmap3.bits[0], -2); + EXPECT_EQ(bitmap3.bits[1], -2); +} + +UCS_TEST_F(test_ucs_bitmap, test_copy) +{ + ucs_bitmap_t(128) bitmap2 = UCS_BITMAP_ZERO; + + UCS_BITMAP_SET(bitmap, 1); + UCS_BITMAP_SET(bitmap, 25); + UCS_BITMAP_SET(bitmap, 61); + + UCS_BITMAP_SET(bitmap, UCS_BITMAP_BITS_IN_WORD + 0); + UCS_BITMAP_SET(bitmap, UCS_BITMAP_BITS_IN_WORD + 37); + UCS_BITMAP_SET(bitmap, UCS_BITMAP_BITS_IN_WORD + 58); + + UCS_BITMAP_COPY(bitmap2, bitmap); + + EXPECT_EQ(bitmap.bits[0], UCS_BIT(1) | UCS_BIT(25) | UCS_BIT(61)); + EXPECT_EQ(bitmap.bits[1], UCS_BIT(0) | UCS_BIT(37) | UCS_BIT(58)); +} + +UCS_TEST_F(test_ucs_bitmap, test_for_each_bit) +{ + int i = 0, bit_index; + int bits[128] = {0}; + + UCS_BITMAP_SET(bitmap, 0); + UCS_BITMAP_SET(bitmap, 25); + UCS_BITMAP_SET(bitmap, 64); + UCS_BITMAP_SET(bitmap, 100); + UCS_BITMAP_FOR_EACH_BIT(bitmap, bit_index) { + i++; + bits[bit_index]++; + } + + EXPECT_EQ(i, 4); + EXPECT_EQ(bits[0], 1); + EXPECT_EQ(bits[25], 1); + EXPECT_EQ(bits[64], 1); + EXPECT_EQ(bits[100], 1); + + /* Test FOREACH on an empty bitmap */ + UCS_BITMAP_CLEAR(&bitmap); + i = 0; + + UCS_BITMAP_FOR_EACH_BIT(bitmap, bit_index) { + i++; + } + EXPECT_EQ(i, 0); +} + +UCS_TEST_F(test_ucs_bitmap, test_for_each_bit_single_word) { + int i = 0; + int bits[128] = {0}; + int bit_index; + + UCS_BITMAP_SET(bitmap, 0); 
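/*
 * Editorial sketch: the bitmap tests around this point exercise a fixed-size
 * 128-bit bitmap stored as two uint64_t words, where bit N lives in word N/64
 * at offset N%64, and where FFS on an empty bitmap returns the total bit
 * count (128), as test_ffs expects. A compact model of those semantics; the
 * UCS_BITMAP_* macros generate this kind of code per bitmap type.
 */
#include <cstdint>

struct bitmap128 {
    uint64_t bits[2];
};

static inline void bitmap128_set(bitmap128 *bm, unsigned n)
{
    bm->bits[n / 64] |= UINT64_C(1) << (n % 64);
}

static inline int bitmap128_get(const bitmap128 *bm, unsigned n)
{
    return (bm->bits[n / 64] >> (n % 64)) & 1;
}

static inline unsigned bitmap128_ffs(const bitmap128 *bm)
{
    for (unsigned w = 0; w < 2; ++w) {
        if (bm->bits[w] != 0) {
            return (w * 64) + __builtin_ctzll(bm->bits[w]);
        }
    }
    return 128; /* empty bitmap: one past the last valid index */
}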
+ UCS_BITMAP_SET(bitmap, 25); + UCS_BITMAP_FOR_EACH_BIT(bitmap, bit_index) { + i++; + bits[bit_index]++; + } + + EXPECT_EQ(i, 2); + EXPECT_EQ(bits[0], 1); + EXPECT_EQ(bits[25], 1); +} + +UCS_TEST_F(test_ucs_bitmap, test_compose) { + /* The result is irrelevant, the code only needs to compile */ + UCS_BITMAP_AND(UCS_BITMAP_NOT(bitmap, 128), bitmap, 128); +} diff --git a/test/gtest/ucs/test_callbackq.cc b/test/gtest/ucs/test_callbackq.cc index 705dbdc9134..c8e840d4616 100644 --- a/test/gtest/ucs/test_callbackq.cc +++ b/test/gtest/ucs/test_callbackq.cc @@ -32,6 +32,7 @@ class test_callbackq : uint32_t count; int command; callback_ctx *to_add; + unsigned flags; int key; }; @@ -81,9 +82,11 @@ class test_callbackq : void init_ctx(callback_ctx *ctx, int key = 0) { ctx->test = this; + ctx->callback_id = UCS_CALLBACKQ_ID_NULL; ctx->count = 0; ctx->command = COMMAND_NONE; - ctx->callback_id = UCS_CALLBACKQ_ID_NULL; + ctx->to_add = NULL; + ctx->flags = 0; ctx->key = key; } @@ -95,7 +98,7 @@ class test_callbackq : { ctx->callback_id = ucs_callbackq_add(&m_cbq, callback_proxy, reinterpret_cast(ctx), - cb_flags() | flags); + ctx->flags | cb_flags() | flags); } void remove(int callback_id) @@ -215,19 +218,22 @@ UCS_TEST_P(test_callbackq, add_another) { ctx.command = COMMAND_NONE; unsigned count = ctx.count; + if (cb_flags() & UCS_CALLBACKQ_FLAG_FAST) { + count++; /* fast CBs are executed immediately after "add" */ + } dispatch(); EXPECT_EQ(2u, ctx.count); - EXPECT_EQ(count + 1, ctx2.count); + EXPECT_EQ(count, ctx2.count); remove(&ctx); dispatch(); EXPECT_EQ(2u, ctx.count); - EXPECT_EQ(count + 2, ctx2.count); + EXPECT_EQ(count + 1, ctx2.count); remove(&ctx2); dispatch(); - EXPECT_EQ(count + 2, ctx2.count); + EXPECT_EQ(count + 1, ctx2.count); } UCS_MT_TEST_P(test_callbackq, threads, 10) { @@ -337,6 +343,24 @@ UCS_TEST_F(test_callbackq_noflags, oneshot) { EXPECT_EQ(1u, ctx.count); } +UCS_TEST_F(test_callbackq_noflags, oneshot_recursive) { + callback_ctx ctx; + + init_ctx(&ctx); + ctx.command = COMMAND_ADD_ANOTHER; + ctx.flags = UCS_CALLBACKQ_FLAG_ONESHOT; + ctx.to_add = &ctx; + + add(&ctx); + + for (unsigned i = 0; i < 10; ++i) { + dispatch(1); + EXPECT_LE(i + 1, ctx.count); + } + + remove(ctx.callback_id); +} + UCS_TEST_F(test_callbackq_noflags, remove_if) { const size_t count = 1000; const int num_keys = 10; diff --git a/test/gtest/ucs/test_config.cc b/test/gtest/ucs/test_config.cc index 120e26ce1a9..3ec54fd207f 100644 --- a/test/gtest/ucs/test_config.cc +++ b/test/gtest/ucs/test_config.cc @@ -12,6 +12,7 @@ extern "C" { #include } +#define TEST_CONFIG_FILE TOP_SRCDIR "/test/gtest/ucs/ucx.conf" typedef enum { COLOR_RED, @@ -87,6 +88,11 @@ typedef struct { int air_conditioning; int abs; int transmission; + + ucs_time_t time_value; + ucs_time_t time_auto; + ucs_time_t time_inf; + ucs_config_allow_list_t allow_list; } car_opts_t; @@ -205,13 +211,32 @@ ucs_config_field_t car_opts_table[] = { {"TRANSMISSION", "auto", "Transmission mode", ucs_offsetof(car_opts_t, transmission), UCS_CONFIG_TYPE_ON_OFF_AUTO}, + {"TIME_VAL", "1s", "Time value 1 sec", + ucs_offsetof(car_opts_t, time_value), UCS_CONFIG_TYPE_TIME_UNITS}, + + {"TIME_AUTO", "auto", "Time value \"auto\"", + ucs_offsetof(car_opts_t, time_auto), UCS_CONFIG_TYPE_TIME_UNITS}, + + {"TIME_INF", "inf", "Time value \"inf\"", + ucs_offsetof(car_opts_t, time_inf), UCS_CONFIG_TYPE_TIME_UNITS}, + + {"ALLOW_LIST", "all", "Allow-list: \"all\" OR \"val1,val2\" OR \"^val1,val2\"", + ucs_offsetof(car_opts_t, allow_list), UCS_CONFIG_TYPE_ALLOW_LIST}, + {NULL} }; static 
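/*
 * Editorial sketch: the oneshot_recursive callbackq test above depends on a
 * ONESHOT callback being allowed to re-add itself from inside its own
 * dispatch. A dispatcher supports that safely by detaching pending elements
 * before invoking them, so re-adds land on the next round. Minimal sketch
 * with std::function -- the real ucs_callbackq is lock-free C, not this.
 */
#include <deque>
#include <functional>

class oneshot_queue {
public:
    void add(const std::function<void()> &cb) {
        m_pending.push_back(cb);
    }
    void dispatch() {
        std::deque<std::function<void()> > batch;
        batch.swap(m_pending);  /* detach before invoking */
        for (size_t i = 0; i < batch.size(); ++i) {
            batch[i]();         /* may call add() again without corruption */
        }
    }
private:
    std::deque<std::function<void()> > m_pending;
};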
std::vector config_err_exp_str; class test_config : public ucs::test { +public: + test_config() { + m_num_errors = 0; + } + protected: + static int m_num_errors; + static ucs_log_func_rc_t config_error_handler(const char *file, unsigned line, const char *function, ucs_log_level_t level, @@ -233,6 +258,22 @@ class test_config : public ucs::test { return UCS_LOG_FUNC_RC_CONTINUE; } + static ucs_log_func_rc_t + config_error_suppress(const char *file, unsigned line, const char *function, + ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) + { + // Ignore errors that invalid input parameters as it is expected + if (level == UCS_LOG_LEVEL_ERROR) { + m_num_errors++; + return wrap_errors_logger(file, line, function, level, comp_conf, + message, ap); + } + + return UCS_LOG_FUNC_RC_CONTINUE; + } + /* * Wrapper class for car options parser. */ @@ -286,11 +327,14 @@ class test_config : public ucs::test { static car_opts_t parse(const char *env_prefix, const char *table_prefix) { car_opts_t tmp; - ucs_status_t status = ucs_config_parser_fill_opts(&tmp, - car_opts_table, - env_prefix, - table_prefix, - 0); + ucs_status_t status = ucs_config_parse_config_file(TEST_CONFIG_FILE, 1); + ASSERT_UCS_OK(status); + + status = ucs_config_parser_fill_opts(&tmp, + car_opts_table, + env_prefix, + table_prefix, + 0); ASSERT_UCS_OK(status); return tmp; } @@ -364,6 +408,8 @@ class test_config : public ucs::test { } }; +int test_config::m_num_errors; + UCS_TEST_F(test_config, parse_default) { car_opts opts(UCS_DEFAULT_ENV_PREFIX, "TEST"); @@ -397,6 +443,12 @@ UCS_TEST_F(test_config, parse_default) { EXPECT_EQ(UCS_CONFIG_ON, opts->air_conditioning); EXPECT_EQ(UCS_CONFIG_OFF, opts->abs); EXPECT_EQ(UCS_CONFIG_AUTO, opts->transmission); + + EXPECT_EQ(ucs_time_from_sec(1.0), opts->time_value); + EXPECT_EQ(UCS_TIME_AUTO, opts->time_auto); + EXPECT_EQ(UCS_TIME_INFINITY, opts->time_inf); + EXPECT_EQ(UCS_CONFIG_ALLOW_LIST_ALLOW_ALL, opts->allow_list.mode); + EXPECT_EQ(0, opts->allow_list.array.count); } UCS_TEST_F(test_config, clone) { @@ -419,6 +471,7 @@ UCS_TEST_F(test_config, clone) { } EXPECT_EQ(COLOR_WHITE, (*opts_clone_ptr)->color); + EXPECT_EQ(UCS_CONFIG_ALLOW_LIST_ALLOW_ALL, (*opts_clone_ptr)->allow_list.mode); delete opts_clone_ptr; } @@ -440,6 +493,17 @@ UCS_TEST_F(test_config, set_get) { opts.set("VIN", "123456"); EXPECT_EQ(123456UL, opts->vin); + + /* try to set incorrect value - color should not be updated */ + { + scoped_log_handler log_handler_vars(config_error_suppress); + opts.set("COLOR", "magenta"); + } + + EXPECT_EQ(COLOR_WHITE, opts->color); + EXPECT_EQ(std::string(color_names[COLOR_WHITE]), + std::string(opts.get("COLOR"))); + EXPECT_EQ(1, m_num_errors); } UCS_TEST_F(test_config, set_get_with_table_prefix) { @@ -498,7 +562,7 @@ UCS_TEST_F(test_config, unused) { scoped_log_handler log_handler(config_error_handler); car_opts opts(UCS_DEFAULT_ENV_PREFIX, NULL); - ucs_config_parser_warn_unused_env_vars_once(UCS_DEFAULT_ENV_PREFIX); + ucs_config_parser_print_env_vars_once(UCS_DEFAULT_ENV_PREFIX); config_err_exp_str.pop_back(); } @@ -512,7 +576,7 @@ UCS_TEST_F(test_config, unused) { scoped_log_handler log_handler(config_error_handler); car_opts opts("TEST_", NULL); - ucs_config_parser_warn_unused_env_vars_once("TEST_"); + ucs_config_parser_print_env_vars_once("TEST_"); config_err_exp_str.pop_back(); } @@ -523,27 +587,23 @@ UCS_TEST_F(test_config, unused) { UCS_TEST_F(test_config, dump) { /* aliases must not be counted here */ - 
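/*
 * Editorial sketch: the allow-list option tested around here takes three
 * forms -- "all", "name1,name2", and "^name1,name2" (negation) -- and rejects
 * "all" mixed with explicit names (test_allow_list_negative). A hypothetical
 * parser for that rule; this is not the UCS config parser itself.
 */
#include <sstream>
#include <string>
#include <vector>

enum allow_list_mode { ALLOW_ALL, ALLOW, NEGATE };

static bool parse_allow_list(std::string value, allow_list_mode *mode,
                             std::vector<std::string> *names)
{
    if (value == "all") {
        *mode = ALLOW_ALL;
        return true;
    }
    if (!value.empty() && (value[0] == '^')) {
        *mode = NEGATE;
        value.erase(0, 1);
    } else {
        *mode = ALLOW;
    }
    std::stringstream ss(value);
    std::string token;
    while (std::getline(ss, token, ',')) {
        if (token == "all") {
            return false; /* "all" cannot be combined with explicit names */
        }
        names->push_back(token);
    }
    return !names->empty();
}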
test_config_print_opts(UCS_CONFIG_PRINT_CONFIG, 28u); + test_config_print_opts(UCS_CONFIG_PRINT_CONFIG, 32u); } UCS_TEST_F(test_config, dump_hidden) { /* aliases must be counted here */ - test_config_print_opts((UCS_CONFIG_PRINT_CONFIG | - UCS_CONFIG_PRINT_HIDDEN), - 35u); + test_config_print_opts(UCS_CONFIG_PRINT_CONFIG | UCS_CONFIG_PRINT_HIDDEN, 39u); } UCS_TEST_F(test_config, dump_hidden_check_alias_name) { /* aliases must be counted here */ - test_config_print_opts((UCS_CONFIG_PRINT_CONFIG | - UCS_CONFIG_PRINT_HIDDEN | - UCS_CONFIG_PRINT_DOC), - 35u); - - test_config_print_opts((UCS_CONFIG_PRINT_CONFIG | - UCS_CONFIG_PRINT_HIDDEN | - UCS_CONFIG_PRINT_DOC), - 35u, "TEST_"); + test_config_print_opts( + UCS_CONFIG_PRINT_CONFIG | UCS_CONFIG_PRINT_HIDDEN | UCS_CONFIG_PRINT_DOC, + 39u); + + test_config_print_opts( + UCS_CONFIG_PRINT_CONFIG | UCS_CONFIG_PRINT_HIDDEN | UCS_CONFIG_PRINT_DOC, + 39u, "TEST_"); } UCS_TEST_F(test_config, deprecated) { @@ -577,3 +637,50 @@ UCS_TEST_F(test_config, deprecated) { /* reset to not warn about unused env vars */ ucs_global_opts.warn_unused_env_vars = 0; } + +UCS_TEST_F(test_config, test_allow_list) { + const std::string allow_list = "UCX_ALLOW_LIST"; + + { + /* coverity[tainted_string_argument] */ + ucs::scoped_setenv env(allow_list.c_str(), "first,second"); + + car_opts opts(UCS_DEFAULT_ENV_PREFIX, NULL); + EXPECT_EQ(UCS_CONFIG_ALLOW_LIST_ALLOW, opts->allow_list.mode); + EXPECT_EQ(2, opts->allow_list.array.count); + EXPECT_EQ(std::string("first"), opts->allow_list.array.names[0]); + EXPECT_EQ(std::string("second"), opts->allow_list.array.names[1]); + } + + { + /* coverity[tainted_string_argument] */ + ucs::scoped_setenv env(allow_list.c_str(), "^first,second"); + + car_opts opts(UCS_DEFAULT_ENV_PREFIX, NULL); + EXPECT_EQ(UCS_CONFIG_ALLOW_LIST_NEGATE, opts->allow_list.mode); + EXPECT_EQ(2, opts->allow_list.array.count); + EXPECT_EQ(std::string("first"), opts->allow_list.array.names[0]); + EXPECT_EQ(std::string("second"), opts->allow_list.array.names[1]); + } +} + +UCS_TEST_F(test_config, test_allow_list_negative) +{ + ucs_config_allow_list_t field; + + EXPECT_EQ(ucs_config_sscanf_allow_list("all,all", &field, + &ucs_config_array_string), 0); +} + +UCS_TEST_F(test_config, test_config_file) { + /* coverity[tainted_string_argument] */ + ucs::scoped_setenv env1("UCX_BRAND", "Ford"); + + car_opts opts(UCS_DEFAULT_ENV_PREFIX, NULL); + + /* Option parsing from INI file */ + EXPECT_EQ(100, opts->price); + + /* Overriding INI file by env vars */ + EXPECT_EQ(std::string("Ford"), std::string(opts->brand)); +} diff --git a/test/gtest/ucs/test_datatype.cc b/test/gtest/ucs/test_datatype.cc index 19d37a51611..40b7ebfa83c 100644 --- a/test/gtest/ucs/test_datatype.cc +++ b/test/gtest/ucs/test_datatype.cc @@ -275,9 +275,11 @@ UCS_TEST_F(test_datatype, queue) { ucs_queue_push(&head, &elem0.queue); EXPECT_FALSE(ucs_queue_is_empty(&head)); EXPECT_EQ((unsigned long)1, ucs_queue_length(&head)); + EXPECT_TRUE(ucs_queue_is_tail(&head, &elem0.queue)); ucs_queue_push(&head, &elem1.queue); EXPECT_EQ((unsigned long)2, ucs_queue_length(&head)); + EXPECT_TRUE(ucs_queue_is_tail(&head, &elem1.queue)); EXPECT_EQ(&elem1, ucs_queue_tail_elem_non_empty(&head, elem_t, queue)); @@ -586,6 +588,23 @@ UCS_TEST_F(test_datatype, ptr_array_basic) { ucs_ptr_array_cleanup(&pa); } +UCS_TEST_F(test_datatype, ptr_array_set_first) { + ucs_ptr_array_t pa; + int a = 1; + + ucs_ptr_array_init(&pa, "ptr_array set-first test"); + + EXPECT_EQ(0u, pa.size); + + ucs_ptr_array_set(&pa, 0, &a); + + 
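/*
 * Editorial sketch: test_config_file above pins down a precedence order --
 * built-in defaults are overridden by values from the configuration file,
 * which in turn are overridden by environment variables (UCX_BRAND beats the
 * file, while the file supplies the price). A hypothetical lookup in that
 * order; the file format and parsing details are assumptions, not the UCS
 * implementation.
 */
#include <cstdlib>
#include <map>
#include <string>

static std::string
lookup_opt(const std::map<std::string, std::string> &file_opts,
           const std::string &name, const std::string &built_in_default)
{
    std::string value = built_in_default;   /* lowest priority */
    std::map<std::string, std::string>::const_iterator it =
            file_opts.find(name);
    if (it != file_opts.end()) {
        value = it->second;                 /* config file */
    }
    const char *env = getenv(("UCX_" + name).c_str());
    if (env != NULL) {
        value = env;                        /* environment wins */
    }
    return value;
}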
EXPECT_GT(pa.size, 0u); + + ucs_ptr_array_remove(&pa, 0); + + ucs_ptr_array_cleanup(&pa); +} + UCS_TEST_F(test_datatype, ptr_array_random) { const unsigned count = 10000 / ucs::test_time_multiplier(); ucs_ptr_array_t pa; @@ -917,14 +936,21 @@ static std::ostream& operator<<(std::ostream& os, const test_value_type_t &v) { return os << "<" << v.num1 << "," << v.num2 << ">"; } + UCS_ARRAY_DEFINE_INLINE(test_2num, unsigned, test_value_type_t); +UCS_ARRAY_DEFINE_INLINE(test_1int, size_t, int); + +class test_array : public test_datatype { +protected: + void test_fixed(ucs_array_t(test_1int) *array, size_t capacity); +}; -UCS_TEST_F(test_datatype, dynamic_array_2int_grow) { +UCS_TEST_F(test_array, dynamic_array_2int_grow) { ucs_array_t(test_2num) test_array; test_value_type_t value; ucs_status_t status; - ucs_array_init_dynamic(test_2num, &test_array); + ucs_array_init_dynamic(&test_array); EXPECT_FALSE(ucs_array_is_fixed(&test_array)); /* grow the array enough to contain 'value_index' */ @@ -949,19 +975,17 @@ UCS_TEST_F(test_datatype, dynamic_array_2int_grow) { ASSERT_UCS_OK(status); EXPECT_EQ(value, ucs_array_elem(&test_array, value_index)); - ucs_array_cleanup_dynamic(test_2num, &test_array); + ucs_array_cleanup_dynamic(&test_array); } -UCS_ARRAY_DEFINE_INLINE(test_1int, size_t, int); - -UCS_TEST_F(test_datatype, dynamic_array_int_append) { +UCS_TEST_F(test_array, dynamic_array_int_append) { static const size_t NUM_ELEMS = 1000; ucs_array_t(test_1int) test_array; std::vector vec; ucs_status_t status; - ucs_array_init_dynamic(test_1int, &test_array); + ucs_array_init_dynamic(&test_array); EXPECT_FALSE(ucs_array_is_fixed(&test_array)); /* push same elements to the array and the std::vector */ @@ -1004,31 +1028,53 @@ UCS_TEST_F(test_datatype, dynamic_array_int_append) { ucs_array_set_length(&test_array, new_length); EXPECT_EQ(new_length, ucs_array_length(&test_array)); - ucs_array_cleanup_dynamic(test_1int, &test_array); + ucs_array_cleanup_dynamic(&test_array); } -UCS_TEST_F(test_datatype, fixed_array) { - const size_t num_elems = 100; - UCS_ARRAY_DEFINE_ONSTACK(test_array, test_1int, num_elems); +void test_array::test_fixed(ucs_array_t(test_1int) *array, size_t capacity) +{ ucs_status_t status; /* check initial capacity */ - size_t initial_capacity = ucs_array_capacity(&test_array); - EXPECT_LE(initial_capacity, num_elems); - EXPECT_GE(initial_capacity, num_elems - 1); + size_t initial_capacity = ucs_array_capacity(array); + EXPECT_LE(initial_capacity, capacity); + EXPECT_GE(initial_capacity, capacity - 1); /* append one element */ - status = ucs_array_append(test_1int, &test_array); + status = ucs_array_append(test_1int, array); ASSERT_UCS_OK(status); - size_t idx = ucs_array_length(&test_array) - 1; - ucs_array_elem(&test_array, idx) = 17; + size_t idx = ucs_array_length(array) - 1; + ucs_array_elem(array, idx) = 17; EXPECT_EQ(0u, idx); - EXPECT_EQ(1u, ucs_array_length(&test_array)); + EXPECT_EQ(1u, ucs_array_length(array)); /* check end capacity */ - EXPECT_EQ(initial_capacity - 1, ucs_array_available_length(&test_array)); - EXPECT_EQ(&ucs_array_elem(&test_array, 1), ucs_array_end(&test_array)); + EXPECT_EQ(initial_capacity - 1, ucs_array_available_length(array)); + EXPECT_EQ(&ucs_array_elem(array, 1), ucs_array_end(array)); +} + +UCS_TEST_F(test_array, fixed_static) { + const size_t num_elems = 100; + int buffer[num_elems]; + ucs_array_t(test_1int) test_array = + UCS_ARRAY_FIXED_INITIALIZER(buffer, num_elems); + test_fixed(&test_array, num_elems); +} + +UCS_TEST_F(test_array, fixed_init) { 
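/*
 * Editorial sketch: the fixed_static/fixed_init array variants above bind the
 * array to caller-owned storage instead of heap growth, so append must fail
 * once the capacity is exhausted. A sketch of the underlying shape; field and
 * function names are illustrative stand-ins for the ucs_array internals.
 */
#include <cstddef>

struct int_array {
    int    *buffer;
    size_t  length;
    size_t  capacity;
    bool    fixed;   /* fixed arrays never reallocate */
};

static void array_init_fixed(int_array *a, int *buf, size_t cap)
{
    a->buffer   = buf;
    a->length   = 0;
    a->capacity = cap;
    a->fixed    = true;
}

static bool array_append(int_array *a, int value)
{
    if (a->length == a->capacity) {
        if (a->fixed) {
            return false; /* out of space; a dynamic array would grow here */
        }
        /* dynamic growth elided */
    }
    a->buffer[a->length++] = value;
    return true;
}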
+ const size_t num_elems = 100; + int buffer[num_elems]; + ucs_array_t(test_1int) test_array; + + ucs_array_init_fixed(&test_array, buffer, num_elems); + test_fixed(&test_array, num_elems); +} + +UCS_TEST_F(test_array, fixed_onstack) { + const size_t num_elems = 100; + UCS_ARRAY_DEFINE_ONSTACK(test_array, test_1int, num_elems); + test_fixed(&test_array, num_elems); } UCS_TEST_F(test_datatype, ptr_map) { @@ -1044,16 +1090,40 @@ UCS_TEST_F(test_datatype, ptr_map) { ASSERT_EQ(UCS_OK, status); for (size_t i = 0; i < N; ++i) { - char *ptr = new char; - status = ucs_ptr_map_put(&ptr_map, ptr, ucs::rand() % 2, &ptr_key); - ASSERT_EQ(UCS_OK, status); + char *ptr = new char; + bool indirect = ucs::rand() % 2; + status = ucs_ptr_map_put(&ptr_map, ptr, indirect, &ptr_key); + if (indirect) { + ASSERT_UCS_OK(status); + EXPECT_TRUE(ucs_ptr_map_key_indirect(ptr_key)); + } else { + ASSERT_EQ(UCS_ERR_NO_PROGRESS, status); + EXPECT_FALSE(ucs_ptr_map_key_indirect(ptr_key)); + } + std_map[ptr_key] = ptr; } for (std_map_t::iterator i = std_map.begin(); i != std_map.end(); ++i) { - ASSERT_EQ(i->second, ucs_ptr_map_get(&ptr_map, i->first)); - status = ucs_ptr_map_del(&ptr_map, i->first); - ASSERT_EQ(UCS_OK, status); + bool indirect = ucs_ptr_map_key_indirect(i->first); + bool extract = ucs::rand() % 2; + void *value; + status = ucs_ptr_map_get(&ptr_map, i->first, extract, &value); + if (indirect) { + ASSERT_EQ(UCS_OK, status); + } else { + ASSERT_EQ(UCS_ERR_NO_PROGRESS, status); + } + + ASSERT_EQ(i->second, value); + if (!extract) { + status = ucs_ptr_map_del(&ptr_map, i->first); + if (indirect) { + ASSERT_EQ(UCS_OK, status); + } else { + ASSERT_EQ(UCS_ERR_NO_PROGRESS, status); + } + } delete i->second; } diff --git a/test/gtest/ucs/test_debug.cc b/test/gtest/ucs/test_debug.cc index 1ddc4adb3fa..e69063dcedd 100644 --- a/test/gtest/ucs/test_debug.cc +++ b/test/gtest/ucs/test_debug.cc @@ -6,7 +6,7 @@ #include extern "C" { -#include +#include #include #include } diff --git a/test/gtest/ucs/test_log.cc b/test/gtest/ucs/test_log.cc index d0c11066988..c2f5c06af9c 100644 --- a/test/gtest/ucs/test_log.cc +++ b/test/gtest/ucs/test_log.cc @@ -59,7 +59,6 @@ class log_test : private ucs::clear_dontcopy_regions, public ucs::test { virtual void cleanup() { ucs_log_cleanup(); - m_num_log_handlers_before = 0; pop_config(); check_log_file(); unsigned files_count = log_files_foreach(&log_test::remove_file); @@ -166,17 +165,38 @@ class log_test : private ucs::clear_dontcopy_regions, public ucs::test { }; class log_test_info : public log_test { - virtual void check_log_file() { - if (!do_grep("UCX INFO hello world")) { +protected: + log_test_info() : m_spacer(" "), m_log_str("hello world") + { + } + + virtual void check_log_file() + { + std::string log_str = "UCX INFO" + m_spacer + m_log_str; + if (!do_grep(log_str)) { ADD_FAILURE() << read_logfile(); } } + + void log_info() + { + ucs_info("%s", m_log_str.c_str()); + } + + std::string m_spacer; + std::string m_log_str; }; UCS_TEST_F(log_test_info, hello) { - ucs_info("hello world"); + log_info(); } +UCS_TEST_F(log_test_info, hello_indent) { + ucs_log_indent(1); + log_info(); + ucs_log_indent(-1); + m_spacer += " "; +} class log_test_print : public log_test { virtual void check_log_file() { @@ -313,3 +333,29 @@ class log_test_backtrace : public log_test { UCS_TEST_F(log_test_backtrace, backtrace) { ucs_log_print_backtrace(UCS_LOG_LEVEL_INFO); } + +class log_demo : public ucs::test { +}; + +UCS_MT_TEST_F(log_demo, indent, 4) +{ + ucs::scoped_log_level 
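/*
 * Editorial sketch: the ptr_map test changes above distinguish "indirect"
 * keys (stored in a hash) from "direct" keys, where the pointer value itself
 * serves as the key and the map does no bookkeeping -- which is why put/get/
 * del report UCS_ERR_NO_PROGRESS for them. A sketch of that split, assuming
 * the low bit tags indirect keys (an illustrative encoding: aligned pointers
 * always have a zero low bit); unique-key allocation is simplified.
 */
#include <cstdint>
#include <map>

typedef uint64_t ptr_key_t;

static bool key_is_indirect(ptr_key_t key)
{
    return (key & 1) != 0;
}

struct ptr_map_sketch {
    std::map<ptr_key_t, void*> hash;
    ptr_key_t                  next_indirect;

    ptr_map_sketch() : next_indirect(1) {}

    ptr_key_t put(void *ptr, bool indirect) {
        if (!indirect) {
            return (ptr_key_t)(uintptr_t)ptr; /* no insertion: "no progress" */
        }
        ptr_key_t key = next_indirect;
        next_indirect += 2;                   /* keep the tag bit set */
        hash[key] = ptr;
        return key;
    }

    void *get(ptr_key_t key) {
        if (!key_is_indirect(key)) {
            return (void*)(uintptr_t)key;     /* decode the key in place */
        }
        return hash[key];                     /* real hash lookup */
    }
};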
enable_debug(UCS_LOG_LEVEL_DEBUG); + + ucs_debug("scope begin"); + + ucs_log_indent(1); + EXPECT_EQ(1, ucs_log_get_current_indent()); + ucs_debug("nested log 1"); + + ucs_log_indent(1); + EXPECT_EQ(2, ucs_log_get_current_indent()); + ucs_debug("nested log 1.1"); + ucs_log_indent(-1); + + EXPECT_EQ(1, ucs_log_get_current_indent()); + ucs_debug("nested log 2"); + ucs_log_indent(-1); + + EXPECT_EQ(0, ucs_log_get_current_indent()); + ucs_debug("done"); +} diff --git a/test/gtest/ucs/test_math.cc b/test/gtest/ucs/test_math.cc index 1cf8f7d1f22..3b7af52e9ed 100644 --- a/test/gtest/ucs/test_math.cc +++ b/test/gtest/ucs/test_math.cc @@ -177,7 +177,7 @@ UCS_TEST_F(test_math, for_each_bit) { mask = ucs_generate_uuid(0); - ucs_for_each_bit (idx, mask) { + ucs_for_each_bit(idx, mask) { EXPECT_EQ(gen_mask & UCS_BIT(idx), 0ull); gen_mask |= UCS_BIT(idx); } @@ -203,6 +203,34 @@ UCS_TEST_F(test_math, for_each_bit) { EXPECT_EQ(UCS_BIT(63), gen_mask); } +UCS_TEST_F(test_math, for_each_submask) { + /* Generate mask values to test */ + std::vector masks; + masks.push_back(0); + masks.push_back(1); + masks.push_back(65536); + for (int i = 0; i < 100; ++i) { + masks.push_back((ucs::rand() % 65536) + 2); + } + + for (std::vector::const_iterator iter = masks.begin(); + iter != masks.end(); ++iter) { + int64_t mask = *iter; + int64_t prev_submask = -1; + unsigned count = 0; + int64_t submask; + ucs_for_each_submask(submask, mask) { + EXPECT_GT(submask, prev_submask); /* expect strictly monotonic series */ + EXPECT_EQ(0u, submask & ~mask); /* sub-mask contained in the mask */ + prev_submask = submask; + ++count; + } + + /* expect to get all possible values */ + EXPECT_EQ(UCS_BIT(ucs_popcount(mask)), count); + } +} + UCS_TEST_F(test_math, linear_func) { ucs_linear_func_t func[2]; double x, y[2]; @@ -272,3 +300,11 @@ UCS_TEST_F(test_math, linear_func) { double y_added_func = ucs_linear_func_apply(added_func, x); EXPECT_NEAR(y[0] + y[1], y_added_func, 1e-6); } + +UCS_TEST_F(test_math, double_to_sizet) { + EXPECT_EQ(SIZE_MAX, ucs_double_to_sizet(1e20, SIZE_MAX)); + EXPECT_EQ(SIZE_MAX, ucs_double_to_sizet(1e30, SIZE_MAX)); + EXPECT_EQ(SIZE_MAX, ucs_double_to_sizet((double)SIZE_MAX, SIZE_MAX)); + EXPECT_EQ(10, ucs_double_to_sizet(10.0, SIZE_MAX)); + EXPECT_EQ(UCS_MBYTE, ucs_double_to_sizet(UCS_MBYTE, SIZE_MAX)); +} diff --git a/test/gtest/ucs/test_memtype_cache.cc b/test/gtest/ucs/test_memtype_cache.cc index e891db0efc3..063cdfa6dcf 100644 --- a/test/gtest/ucs/test_memtype_cache.cc +++ b/test/gtest/ucs/test_memtype_cache.cc @@ -34,27 +34,31 @@ class test_memtype_cache : public ucs::test_with_param { ucs::test_with_param::cleanup(); } - void check_lookup(const void *ptr, size_t size, - bool expect_found, - ucs_memory_type_t expected_type = UCS_MEMORY_TYPE_LAST) const { + void + check_lookup(const void *ptr, size_t size, bool expect_found, + ucs_memory_type_t expected_type = UCS_MEMORY_TYPE_UNKNOWN) const + { if (!size) { return; } - ucs_memory_type_t mem_type; + ucs_memory_info_t mem_info; ucs_status_t status = ucs_memtype_cache_lookup(m_memtype_cache, ptr, - size, &mem_type); + size, &mem_info); if (!expect_found || (expected_type == UCS_MEMORY_TYPE_HOST)) { /* memory type should be not found or unknown */ EXPECT_TRUE((status == UCS_ERR_NO_ELEM) || - ((status == UCS_OK) && (mem_type == UCS_MEMORY_TYPE_LAST))) - << "ptr=" << ptr << " size=" << size << ": " - << ucs_status_string(status) - << " memtype=" << mem_buffer::mem_type_name(mem_type); + ((status == UCS_OK) && + (mem_info.type == UCS_MEMORY_TYPE_UNKNOWN))) + << 
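/*
 * Editorial note: the for_each_submask test above checks the classic
 * submask-walk idiom. Starting from 0, `s = (s - mask) & mask` steps through
 * every subset of the set bits in strictly increasing order and wraps back to
 * 0 after the mask itself, so exactly 2^popcount(mask) values are visited --
 * precisely the properties the test asserts. Standalone demonstration:
 */
#include <cassert>
#include <cstdint>

static unsigned count_submasks(uint64_t mask)
{
    unsigned count = 0;
    uint64_t s     = 0;
    do {
        assert((s & ~mask) == 0); /* every value is contained in the mask */
        ++count;
        s = (s - mask) & mask;    /* next larger submask, 0 after the last */
    } while (s != 0);
    return count;                 /* == 1 << __builtin_popcountll(mask) */
}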
"ptr=" << ptr << " size=" << size << ": " + << ucs_status_string(status) << " memtype=" + << mem_buffer::mem_type_name( + (ucs_memory_type_t)mem_info.type); } else { EXPECT_UCS_OK(status); - EXPECT_EQ(expected_type, mem_type) << "ptr=" << ptr << " size=" << size; + EXPECT_EQ(expected_type, mem_info.type) + << "ptr=" << ptr << " size=" << size; } } @@ -282,7 +286,10 @@ class test_memtype_cache : public ucs::test_with_param { return; } - ucs_memtype_cache_update(m_memtype_cache, ptr, size, mem_type); + ucs_memory_info_t mem_info; + mem_info.type = mem_type; + mem_info.sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; + ucs_memtype_cache_update(m_memtype_cache, ptr, size, &mem_info); } void memtype_cache_update(const mem_buffer &b) { @@ -481,7 +488,7 @@ class test_memtype_cache_deferred_create : public test_memtype_cache { memtype_cache_update(b); } - /* check that able to find the entire region */ + /* check that able to find the entire region */ test_region_found(b); ptr = b.ptr(); diff --git a/test/gtest/ucs/test_mpool.cc b/test/gtest/ucs/test_mpool.cc index 5d720ab1e31..3e26b7d5056 100644 --- a/test/gtest/ucs/test_mpool.cc +++ b/test/gtest/ucs/test_mpool.cc @@ -44,6 +44,24 @@ class test_mpool : public ucs::test { return UCS_LOG_FUNC_RC_CONTINUE; } + static ucs_log_func_rc_t + mpool_log_leak_handler(const char *file, unsigned line, + const char *function, ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) + { + if (level == UCS_LOG_LEVEL_WARN) { + std::string err_str = format_message(message, ap); + + if (err_str.find("was not returned to mpool test") != + std::string::npos) { + return UCS_LOG_FUNC_RC_STOP; + } + } + + return UCS_LOG_FUNC_RC_CONTINUE; + } + static const size_t header_size = 30; static const size_t data_size = 152; static const size_t align = 128; @@ -203,3 +221,26 @@ UCS_TEST_F(test_mpool, infinite) { ucs_mpool_cleanup(&mp, 1); } + +UCS_TEST_F(test_mpool, leak_check) { + ucs_mpool_t mp; + ucs_status_t status; + + ucs_mpool_ops_t ops = { + ucs_mpool_chunk_malloc, + ucs_mpool_chunk_free, + NULL, + NULL + }; + + status = ucs_mpool_init(&mp, 0, header_size + data_size, header_size, align, + 6, 18, &ops, "test"); + ASSERT_UCS_OK(status); + + void *obj = ucs_mpool_get(&mp); + EXPECT_TRUE(obj != NULL); + // Do not release allocated object + + scoped_log_handler log_handler(mpool_log_leak_handler); + ucs_mpool_cleanup(&mp, 1); +} diff --git a/test/gtest/ucs/test_pgtable.cc b/test/gtest/ucs/test_pgtable.cc index 1a4669a641a..8b258bc6646 100644 --- a/test/gtest/ucs/test_pgtable.cc +++ b/test/gtest/ucs/test_pgtable.cc @@ -524,7 +524,9 @@ class test_pgtable_perf : public test_pgtable { private: struct region_comparator { - bool operator()(ucs_pgt_region_t* region1, ucs_pgt_region_t* region2) { + bool + operator()(ucs_pgt_region_t *region1, ucs_pgt_region_t *region2) const + { return region1->end <= region2->start; } }; diff --git a/test/gtest/ucs/test_profile.cc b/test/gtest/ucs/test_profile.cc index 4ca1ce34481..3b4937babef 100644 --- a/test/gtest/ucs/test_profile.cc +++ b/test/gtest/ucs/test_profile.cc @@ -80,13 +80,18 @@ class test_profile : public testing::TestWithParam, void test_header(const ucs_profile_header_t *hdr, unsigned exp_mode, const void **ptr); + void test_locations(const ucs_profile_location_t *locations, unsigned num_locations, const void **ptr); + void test_thread_locations(const ucs_profile_thread_header_t *thread_hdr, unsigned num_locations, uint64_t exp_count, unsigned exp_num_records, const void **ptr); - void 
do_test(unsigned int_mode, const std::string& str_mode); + void test_nesting(const ucs_profile_location_t *loc, int nesting, + const std::string &exp_name, int exp_nesting); + + void do_test(unsigned int_mode, const std::string &str_mode); }; static int sum(int a, int b) @@ -254,6 +259,15 @@ void test_profile::test_thread_locations( num_locations; } +void test_profile::test_nesting(const ucs_profile_location_t *loc, int nesting, + const std::string &exp_name, int exp_nesting) +{ + if (loc->name == exp_name) { + EXPECT_EQ(exp_nesting, nesting) + << "nesting level of " << exp_name << " is wrong"; + } +} + void test_profile::do_test(unsigned int_mode, const std::string& str_mode) { const int ITER = 5; @@ -288,8 +302,10 @@ void test_profile::do_test(unsigned int_mode, const std::string& str_mode) exp_num_records, &ptr); const ucs_profile_record_t *records = - reinterpret_cast(ptr); + reinterpret_cast(ptr); uint64_t prev_ts = records[0].timestamp; + int nesting = 0; + for (uint64_t i = 0; i < thread_hdr->num_records; ++i) { const ucs_profile_record_t *rec = &records[i]; @@ -303,12 +319,27 @@ void test_profile::do_test(unsigned int_mode, const std::string& str_mode) /* test param64 */ const ucs_profile_location_t *loc = &locations[rec->location]; - if ((loc->type == UCS_PROFILE_TYPE_REQUEST_NEW) || - (loc->type == UCS_PROFILE_TYPE_REQUEST_EVENT) || - (loc->type == UCS_PROFILE_TYPE_REQUEST_FREE)) - { + switch (loc->type) { + case UCS_PROFILE_TYPE_REQUEST_NEW: + case UCS_PROFILE_TYPE_REQUEST_EVENT: + case UCS_PROFILE_TYPE_REQUEST_FREE: EXPECT_EQ((uintptr_t)&test_request, rec->param64); - } + break; + case UCS_PROFILE_TYPE_SCOPE_BEGIN: + ++nesting; + break; + case UCS_PROFILE_TYPE_SCOPE_END: + --nesting; + break; + default: + break; + }; + + test_nesting(loc, nesting, "profile_test_func1", 0); + test_nesting(loc, nesting, "code", 1); + test_nesting(loc, nesting, "sample", 2); + test_nesting(loc, nesting, "profile_test_func2", 0); + test_nesting(loc, nesting, "sum", 1); } ptr = records + thread_hdr->num_records; diff --git a/test/gtest/ucs/test_rcache.cc b/test/gtest/ucs/test_rcache.cc index f8da479b97f..0222b31037d 100644 --- a/test/gtest/ucs/test_rcache.cc +++ b/test/gtest/ucs/test_rcache.cc @@ -16,6 +16,22 @@ extern "C" { } #include +static ucs_rcache_params_t +get_default_rcache_params(void *context, const ucs_rcache_ops_t *ops) +{ + ucs_rcache_params_t params = {sizeof(ucs_rcache_region_t), + UCS_PGT_ADDR_ALIGN, + ucs_get_page_size(), + UCM_EVENT_VM_UNMAPPED, + 1000, + ops, + context, + 0, + ULONG_MAX, + SIZE_MAX}; + + return params; +} class test_rcache_basic : public ucs::test { }; @@ -24,16 +40,9 @@ UCS_TEST_F(test_rcache_basic, create_fail) { static const ucs_rcache_ops_t ops = { NULL, NULL, NULL }; - ucs_rcache_params_t params = { - sizeof(ucs_rcache_region_t), - UCS_PGT_ADDR_ALIGN, - ucs_get_page_size(), - UCS_BIT(30), /* non-existing event */ - 1000, - &ops, - NULL, - 0 - }; + ucs_rcache_params_t params = get_default_rcache_params(this, &ops); + params.ucm_events = UCS_BIT(30); /* non-existing event */ + params.context = NULL; ucs_rcache_t *rcache; ucs_status_t status = ucs_rcache_create(¶ms, "test", @@ -59,21 +68,7 @@ class test_rcache : public ucs::test { virtual void init() { ucs::test::init(); - static const ucs_rcache_ops_t ops = { - mem_reg_cb, - mem_dereg_cb, - dump_region_cb - }; - ucs_rcache_params_t params = { - sizeof(region), - UCS_PGT_ADDR_ALIGN, - ucs_get_page_size(), - UCM_EVENT_VM_UNMAPPED, - 1000, - &ops, - reinterpret_cast(this), - 0 - }; + ucs_rcache_params params = 
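/*
 * Editorial sketch: the nesting verification added above walks the profile
 * record stream and maintains a depth counter from SCOPE_BEGIN/SCOPE_END
 * markers, then checks the depth observed at each named location. Reduced to
 * its core; the enum and struct shapes are illustrative stand-ins for the
 * profile record layout.
 */
#include <cstddef>

enum rec_type { SCOPE_BEGIN, SCOPE_END, SAMPLE };

struct record {
    rec_type    type;
    const char *name;
};

static int max_depth(const record *recs, size_t count)
{
    int depth = 0, max = 0;
    for (size_t i = 0; i < count; ++i) {
        switch (recs[i].type) {
        case SCOPE_BEGIN:
            if (++depth > max) {
                max = depth;
            }
            break;
        case SCOPE_END:
            --depth;
            break;
        default:
            break; /* SAMPLE records inherit the current depth */
        }
    }
    return max;
}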
rcache_params(); UCS_TEST_CREATE_HANDLE_IF_SUPPORTED(ucs_rcache_t*, m_rcache, ucs_rcache_destroy, ucs_rcache_create, ¶ms, "test", ucs_stats_get_root()); } @@ -84,6 +79,15 @@ class test_rcache : public ucs::test { ucs::test::cleanup(); } + virtual ucs_rcache_params_t rcache_params() + { + static const ucs_rcache_ops_t ops = {mem_reg_cb, mem_dereg_cb, + dump_region_cb}; + ucs_rcache_params_t params = get_default_rcache_params(this, &ops); + params.region_struct_size = sizeof(region); + return params; + } + region *get(void *address, size_t length, int prot = PROT_READ|PROT_WRITE) { ucs_status_t status; ucs_rcache_region_t *r; @@ -640,6 +644,101 @@ UCS_MT_TEST_F(test_rcache_no_register, merge_invalid_prot_slow, 5) munmap(mem, size1+size2); } +class test_rcache_with_limit : public test_rcache { +protected: + virtual ucs_rcache_params_t rcache_params() + { + ucs_rcache_params_t params = test_rcache::rcache_params(); + params.max_regions = 2; + params.max_size = 1000; + params.alignment = 16; + return params; + } + + uint32_t get_put(void *ptr, size_t size) + { + region *region = get(ptr, size); + uint32_t id = region->id; + put(region); + return id; + } +}; + +UCS_TEST_F(test_rcache_with_limit, by_count) { + static const size_t size = 32; + + /* First region will be added */ + void *ptr1 = malloc(size); + uint32_t region1_id = get_put(ptr1, size); + EXPECT_EQ(1, m_rcache.get()->num_regions); + + /* Second region will be added as well */ + void *ptr2 = malloc(size); + uint32_t region2_id = get_put(ptr2, size); + EXPECT_EQ(2, m_rcache.get()->num_regions); + + /* This time, something must be removed */ + void *ptr3 = malloc(size); + uint32_t region3_id = get_put(ptr3, size); + EXPECT_EQ(2, m_rcache.get()->num_regions); + + /* Second region should be kept by lru policy */ + uint32_t region2_new_id = get_put(ptr2, size); + EXPECT_EQ(region2_id, region2_new_id); + EXPECT_EQ(2, m_rcache.get()->num_regions); + + /* Third region should be also kept limit policy */ + uint32_t region3_new_id = get_put(ptr3, size); + EXPECT_EQ(region3_new_id, region3_id); + EXPECT_EQ(2, m_rcache.get()->num_regions); + + /* First region should be removed by lru policy */ + uint32_t region1_new_id = get_put(ptr1, size); + EXPECT_NE(region1_new_id, region1_id); + EXPECT_EQ(2, m_rcache.get()->num_regions); + + free(ptr3); + free(ptr2); + free(ptr1); +} + +UCS_TEST_F(test_rcache_with_limit, by_size) { + static const size_t size = 600; + + /* First region will be added */ + void *ptr1 = malloc(size); + get_put(ptr1, size); + EXPECT_EQ(1, m_rcache.get()->num_regions); + + /* Second region will cause removing of first region */ + void *ptr2 = malloc(size); + get_put(ptr2, size); + EXPECT_EQ(1, m_rcache.get()->num_regions); + + free(ptr2); + free(ptr1); +} + +UCS_TEST_F(test_rcache_with_limit, by_size_inuse) { + static const size_t size = 600; + + /* First region will be added */ + void *ptr1 = malloc(size); + region *region1 = get(ptr1, size); + EXPECT_EQ(1, m_rcache.get()->num_regions); + + /* Second region will NOT cause removing of first region since it's still in + * use */ + void *ptr2 = malloc(size); + get_put(ptr2, size); + EXPECT_EQ(2, m_rcache.get()->num_regions); + + put(region1); + + free(ptr2); + free(ptr1); +} + #ifdef ENABLE_STATS class test_rcache_stats : public test_rcache { protected: diff --git a/test/gtest/ucs/test_sock.cc b/test/gtest/ucs/test_sock.cc index 3a0c5f70126..85dcbf43dc2 100644 --- a/test/gtest/ucs/test_sock.cc +++ b/test/gtest/ucs/test_sock.cc @@ -7,6 +7,7 @@ #include extern "C" { #include +#include 
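/*
 * Editorial sketch: test_rcache_with_limit above pins down the eviction
 * contract -- when max_regions or max_size would be exceeded, least-recently-
 * used regions are dropped, but never ones still referenced (by_size_inuse).
 * A sketch of that policy over a std::list used as LRU order; illustrative
 * only, the real cache also indexes regions in a page table.
 */
#include <cstddef>
#include <list>

struct region_entry {
    size_t size;
    int    refcount; /* >0 means in use and therefore unevictable */
};

struct limited_cache {
    std::list<region_entry> lru; /* front = least recently used */
    size_t max_regions, max_size, total_size;

    void enforce_limits() {
        std::list<region_entry>::iterator it = lru.begin();
        while ((it != lru.end()) &&
               ((lru.size() > max_regions) || (total_size > max_size))) {
            if (it->refcount > 0) {
                ++it;            /* in use: skip, try the next LRU entry */
            } else {
                total_size -= it->size;
                it = lru.erase(it);
            }
        }
    }
};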
} #include @@ -39,23 +40,23 @@ class test_socket : public ucs::test { }; UCS_TEST_F(test_socket, sockaddr_sizeof) { + struct sockaddr_un sa_un = {}; struct sockaddr_in sa_in; struct sockaddr_in6 sa_in6; - struct sockaddr_un sa_un; size_t size; sa_in.sin_family = AF_INET; sa_in6.sin6_family = AF_INET6; sa_un.sun_family = AF_UNIX; - /* Check with wrong IPv4 */ + /* Check with IPv4 */ { size = 0; EXPECT_UCS_OK(ucs_sockaddr_sizeof((const struct sockaddr*)&sa_in, &size)); EXPECT_EQ(sizeof(struct sockaddr_in), size); } - /* Check with wrong IPv6 */ + /* Check with IPv6 */ { size = 0; EXPECT_UCS_OK(ucs_sockaddr_sizeof((const struct sockaddr*)&sa_in6, &size)); @@ -75,12 +76,52 @@ UCS_TEST_F(test_socket, sockaddr_sizeof) { } } +UCS_TEST_F(test_socket, sockaddr_inet_addr_sizeof) { + struct sockaddr_un sa_un = {}; + struct sockaddr_in sa_in; + struct sockaddr_in6 sa_in6; + size_t size; + + sa_in.sin_family = AF_INET; + sa_in6.sin6_family = AF_INET6; + sa_un.sun_family = AF_UNIX; + + /* Check with IPv4 */ + { + size = 0; + EXPECT_UCS_OK(ucs_sockaddr_inet_addr_sizeof((const struct sockaddr*) + &sa_in, &size)); + EXPECT_EQ(UCS_IPV4_ADDR_LEN, size); + } + + /* Check with IPv6 */ + { + size = 0; + EXPECT_UCS_OK(ucs_sockaddr_inet_addr_sizeof((const struct sockaddr*) + &sa_in6, &size)); + EXPECT_EQ(UCS_IPV6_ADDR_LEN, size); + } + + /* Check with wrong address family */ + { + socket_err_exp_str = "unknown address family:"; + scoped_log_handler log_handler(socket_error_handler); + + size = 0; + EXPECT_EQ(UCS_ERR_INVALID_PARAM, + ucs_sockaddr_inet_addr_sizeof((const struct sockaddr*)&sa_un, + &size)); + /* Check that doesn't touch provided memory in error case */ + EXPECT_EQ(0ULL, size); + } +} + UCS_TEST_F(test_socket, sockaddr_get_port) { - const uint16_t sin_port = 5555; + const uint16_t sin_port = 5555; + uint16_t port = 0; + struct sockaddr_un sa_un = {}; struct sockaddr_in sa_in; struct sockaddr_in6 sa_in6; - struct sockaddr_un sa_un; - uint16_t port = 0; sa_in.sin_family = AF_INET; sa_in.sin_port = htons(sin_port); @@ -88,14 +129,14 @@ UCS_TEST_F(test_socket, sockaddr_get_port) { sa_in6.sin6_port = htons(sin_port); sa_un.sun_family = AF_UNIX; - /* Check with wrong IPv4 */ + /* Check with IPv4 */ { port = 0; EXPECT_UCS_OK(ucs_sockaddr_get_port((const struct sockaddr*)&sa_in, &port)); EXPECT_EQ(sin_port, port); } - /* Check with wrong IPv6 */ + /* Check with IPv6 */ { port = 0; EXPECT_UCS_OK(ucs_sockaddr_get_port((const struct sockaddr*)&sa_in6, &port)); @@ -116,9 +157,9 @@ UCS_TEST_F(test_socket, sockaddr_get_port) { } UCS_TEST_F(test_socket, sockaddr_get_inet_addr) { + struct sockaddr_un sa_un = {}; struct sockaddr_in sa_in; struct sockaddr_in6 sa_in6; - struct sockaddr_un sa_un; struct in_addr sin_addr; struct in6_addr sin6_addr; @@ -154,13 +195,160 @@ UCS_TEST_F(test_socket, sockaddr_get_inet_addr) { } } -UCS_TEST_F(test_socket, sockaddr_str) { - const uint16_t port = 65534; - const char *ipv4_addr = "192.168.122.157"; - const char *ipv6_addr = "fe80::218:e7ff:fe16:fb97"; +UCS_TEST_F(test_socket, sockaddr_set_port) { + const uint16_t sin_port = 5555; + const uint16_t sin_port_net = htons(sin_port); + struct sockaddr_un sa_un = {}; + struct sockaddr_in sa_in; + struct sockaddr_in6 sa_in6; + + sa_in.sin_family = AF_INET; + sa_in6.sin6_family = AF_INET6; + sa_un.sun_family = AF_UNIX; + + /* Check with IPv4 */ + { + EXPECT_UCS_OK(ucs_sockaddr_set_port((struct sockaddr*)&sa_in, + sin_port)); + EXPECT_EQ(sin_port_net, sa_in.sin_port); + } + + /* Check with IPv6 */ + { + 
EXPECT_UCS_OK(ucs_sockaddr_set_port((struct sockaddr*)&sa_in6, + sin_port)); + EXPECT_EQ(sin_port_net, sa_in6.sin6_port); + } + + /* Check with wrong address family */ + { + socket_err_exp_str = "unknown address family:"; + scoped_log_handler log_handler(socket_error_handler); + + EXPECT_EQ(UCS_ERR_INVALID_PARAM, + ucs_sockaddr_set_port((struct sockaddr*)&sa_un, + sin_port)); + } +} + +UCS_TEST_F(test_socket, sockaddr_set_inet_addr) { + struct sockaddr_un sa_un = {}; + struct sockaddr_in sa_in; + struct sockaddr_in6 sa_in6; + struct sockaddr *sa; + struct in_addr sin_addr; + struct in6_addr sin6_addr; + + sa_in.sin_family = AF_INET; + sa_in6.sin6_family = AF_INET6; + sa_un.sun_family = AF_UNIX; + + sin_addr.s_addr = htonl(INADDR_ANY); + sin6_addr = in6addr_any; + + /* Check with IPv4 */ + { + /* To suppress Coverity warning about possible overruning in_addr */ + sa = reinterpret_cast(&sa_in); + ucs_assert(sa->sa_family == AF_INET); + + EXPECT_UCS_OK(ucs_sockaddr_set_inet_addr(sa, &sin_addr)); + EXPECT_EQ(0, memcmp(&sa_in.sin_addr, &sin_addr, + sizeof(sa_in.sin_addr))); + } + + /* Check with IPv6 */ + { + /* To suppress Coverity warning about possible overruning in_addr */ + sa = reinterpret_cast(&sa_in6); + ucs_assert(sa->sa_family == AF_INET6); + + EXPECT_UCS_OK(ucs_sockaddr_set_inet_addr(sa, &sin6_addr)); + EXPECT_EQ(0, memcmp(&sa_in6.sin6_addr, &sin6_addr, + sizeof(sa_in6.sin6_addr))); + } + + /* Check with wrong address family */ + { + socket_err_exp_str = "unknown address family:"; + scoped_log_handler log_handler(socket_error_handler); + + EXPECT_EQ(UCS_ERR_INVALID_PARAM, + ucs_sockaddr_set_inet_addr((struct sockaddr*)&sa_un, NULL)); + } +} + +UCS_TEST_F(test_socket, sockaddr_is_inaddr) { + struct sockaddr_un sa_un = {}; + struct sockaddr_in sa_in; + struct sockaddr_in6 sa_in6; + struct sockaddr *sa; + struct in_addr sin_addr_loopback, sin_addr_any; + struct in6_addr sin6_addr_loopback, sin6_addr_any; + + sa_in.sin_family = AF_INET; + sa_in6.sin6_family = AF_INET6; + sa_un.sun_family = AF_UNIX; + + sin_addr_any.s_addr = htonl(INADDR_ANY); + sin6_addr_any = in6addr_any; + + sin_addr_loopback.s_addr = htonl(INADDR_LOOPBACK); + sin6_addr_loopback = in6addr_loopback; + + /* Check with IPv4 */ + { + /* ANY is specified to address */ + + /* To suppress Coverity warning about possible overruning in_addr */ + sa = reinterpret_cast(&sa_in); + ucs_assert(sa->sa_family == AF_INET); + + ASSERT_UCS_OK(ucs_sockaddr_set_inet_addr(sa, &sin_addr_any)); + EXPECT_TRUE(ucs_sockaddr_is_inaddr_any((struct sockaddr*)&sa_in)); + EXPECT_FALSE(ucs_sockaddr_is_inaddr_loopback((struct sockaddr*)&sa_in)); + + /* LOOPBACK is specified to address */ + ASSERT_UCS_OK(ucs_sockaddr_set_inet_addr(sa, &sin_addr_loopback)); + EXPECT_FALSE(ucs_sockaddr_is_inaddr_any((struct sockaddr*)&sa_in)); + EXPECT_TRUE(ucs_sockaddr_is_inaddr_loopback((struct sockaddr*)&sa_in)); + } + + /* Check with IPv6 */ + { + /* ANY is specified to address */ + + /* To suppress Coverity warning about possible overruning in_addr */ + sa = reinterpret_cast(&sa_in6); + ucs_assert(sa->sa_family == AF_INET6); + + ASSERT_UCS_OK(ucs_sockaddr_set_inet_addr(sa, &sin6_addr_any)); + EXPECT_TRUE(ucs_sockaddr_is_inaddr_any((struct sockaddr*)&sa_in6)); + EXPECT_FALSE(ucs_sockaddr_is_inaddr_loopback((struct sockaddr*)&sa_in6)); + + /* LOOPBACK is specified to address */ + ASSERT_UCS_OK(ucs_sockaddr_set_inet_addr(sa, &sin6_addr_loopback)); + EXPECT_FALSE(ucs_sockaddr_is_inaddr_any((struct sockaddr*)&sa_in6)); + EXPECT_TRUE(ucs_sockaddr_is_inaddr_loopback((struct 
sockaddr*)&sa_in6)); + } + + /* Check with wrong address family */ + { + socket_err_exp_str = "unknown address family:"; + scoped_log_handler log_handler(socket_error_handler); + + EXPECT_FALSE(ucs_sockaddr_is_inaddr_any((struct sockaddr*)&sa_un)); + } +} + +UCS_TEST_F(test_socket, str_sockaddr_str) { + const uint16_t port = 65534; + const char *ipv4_addr = "192.168.122.157"; + const char *ipv6_addr = "fe80::218:e7ff:fe16:fb97"; struct sockaddr_in sa_in; struct sockaddr_in6 sa_in6; char ipv4_addr_out[128], ipv6_addr_out[128], *str, test_str[1024]; + ucs_status_t status; sa_in.sin_family = AF_INET; sa_in.sin_port = htons(port); @@ -170,8 +358,13 @@ UCS_TEST_F(test_socket, sockaddr_str) { sprintf(ipv4_addr_out, "%s:%d", ipv4_addr, port); sprintf(ipv6_addr_out, "%s:%d", ipv6_addr, port); - inet_pton(AF_INET, ipv4_addr, &(sa_in.sin_addr)); - inet_pton(AF_INET6, ipv6_addr, &(sa_in6.sin6_addr)); + status = ucs_sock_ipstr_to_sockaddr(ipv4_addr, + (struct sockaddr_storage *)&sa_in); + ASSERT_EQ(UCS_OK, status); + + status = ucs_sock_ipstr_to_sockaddr(ipv6_addr, + (struct sockaddr_storage *)&sa_in6); + ASSERT_EQ(UCS_OK, status); /* Check with short `str_len` to fit IP address only */ { @@ -203,7 +396,7 @@ UCS_TEST_F(test_socket, sockaddr_str) { /* Check with wrong sa_family */ { - struct sockaddr_un sa_un; + struct sockaddr_un sa_un = {}; sa_un.sun_family = AF_UNIX; /* with big enough string */ @@ -221,6 +414,22 @@ UCS_TEST_F(test_socket, sockaddr_str) { EXPECT_EQ(NULL, str); } } + + /* Check NULL sockaddr */ + { + /* with big enough string */ + { + str = (char*)ucs_sockaddr_str(NULL, test_str, 1024); + EXPECT_EQ(test_str, str); + EXPECT_EQ(0, strcmp(str, "")); + } + + /* without string */ + { + str = (char*)ucs_sockaddr_str(NULL, NULL, 0); + EXPECT_EQ(NULL, str); + } + } } UCS_TEST_F(test_socket, socket_setopt) { @@ -404,8 +613,8 @@ static void sockaddr_cmp_err_test(const struct sockaddr *sa1, UCS_TEST_F(test_socket, sockaddr_cmp_err) { // Check with wrong sa_family - struct sockaddr_un sa_un; - struct sockaddr_in sa_in; + struct sockaddr_un sa_un = {}; + struct sockaddr_in sa_in = {}; sa_un.sun_family = AF_UNIX; sa_in.sin_family = AF_INET; @@ -422,3 +631,45 @@ UCS_TEST_F(test_socket, sockaddr_cmp_err) { sockaddr_cmp_err_test((const struct sockaddr*)&sa_un, (const struct sockaddr*)&sa_in); } + +static void sockaddr_get_ipstr_check(const struct sockaddr *sockaddr, + ucs_status_t expected_ret, + const char *expected_str = NULL) +{ + const size_t max_size = 1024; + char str[max_size]; + + EXPECT_EQ(expected_ret, ucs_sockaddr_get_ipstr(sockaddr, str, max_size)); + if (expected_str != NULL) { + EXPECT_STREQ(expected_str, str); + } +} + +static void sockaddr_get_ipstr_check_ip(void *sockaddr, const char *ip_str) +{ + EXPECT_UCS_OK( + ucs_sock_ipstr_to_sockaddr(ip_str, + (struct sockaddr_storage*)sockaddr)); + sockaddr_get_ipstr_check((const struct sockaddr*)sockaddr, UCS_OK, ip_str); +} + +UCS_TEST_F(test_socket, sockaddr_get_ipstr) { + /* Check ipv4 */ + struct sockaddr_in sa_in; + sa_in.sin_family = AF_INET; + sockaddr_get_ipstr_check_ip(&sa_in, "192.168.122.157"); + + /* Check ipv6 */ + struct sockaddr_in6 sa_in6; + sa_in6.sin6_family = AF_INET6; + sockaddr_get_ipstr_check_ip(&sa_in6, "fe80::218:e7ff:fe16:fb97"); + + /* Check invalid sa_family */ + socket_err_exp_str = "unknown address family:"; + scoped_log_handler log_handler(socket_error_handler); + + struct sockaddr_un sa_un; + sa_un.sun_family = AF_UNIX; + sockaddr_get_ipstr_check((const struct sockaddr*)&sa_un, + UCS_ERR_INVALID_PARAM); +} diff 
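/*
 * Editorial sketch: all the sockaddr helpers tested above share one shape --
 * dispatch on sa_family, handle AF_INET and AF_INET6, and fail with an
 * "unknown address family" error for anything else (the AF_UNIX cases). The
 * pattern for a port setter, using only POSIX types; the error convention is
 * simplified relative to ucs_status_t.
 */
#include <cstdint>
#include <netinet/in.h>
#include <sys/socket.h>

static int sockaddr_set_port_sketch(struct sockaddr *sa, uint16_t port)
{
    switch (sa->sa_family) {
    case AF_INET:
        ((struct sockaddr_in*)sa)->sin_port = htons(port);
        return 0;
    case AF_INET6:
        ((struct sockaddr_in6*)sa)->sin6_port = htons(port);
        return 0;
    default:
        return -1; /* unknown address family, e.g. AF_UNIX */
    }
}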
--git a/test/gtest/ucs/test_string.cc b/test/gtest/ucs/test_string.cc index 15d4aa7f11a..9dff8c94d65 100644 --- a/test/gtest/ucs/test_string.cc +++ b/test/gtest/ucs/test_string.cc @@ -6,14 +6,57 @@ #include extern "C" { +#include #include #include #include } class test_string : public ucs::test { +protected: + void check_mask_str(uint64_t mask, const std::string &exp_str) const { + ucs_string_buffer_t mask_str; + ucs_string_buffer_init(&mask_str); + EXPECT_EQ(exp_str, + static_cast( + ucs_mask_str(mask, &mask_str))); + ucs_string_buffer_cleanup(&mask_str); + } }; +UCS_TEST_F(test_string, count_char) { + static const char *str1 = "/foo"; + static const char *str2 = "/foo/bar"; + size_t count; + + count = ucs_string_count_char(str1, '/'); + EXPECT_EQ(1, count); + + count = ucs_string_count_char((const char*)UCS_PTR_BYTE_OFFSET(str1, 1), + '/'); + EXPECT_EQ(0, count); + + count = ucs_string_count_char(str2, '/'); + EXPECT_EQ(2, count); + + count = ucs_string_count_char((const char*)UCS_PTR_BYTE_OFFSET(str2, 1), + '/'); + EXPECT_EQ(1, count); +} + +UCS_TEST_F(test_string, common_prefix_len) { + static const char *str1 = "/foo"; + static const char *str2 = "/foobar"; + static const char *str3 = "foo/bar"; + size_t common_length; + + common_length = ucs_string_common_prefix_len(str1, str2); + EXPECT_EQ(4, common_length); + + common_length = ucs_string_common_prefix_len(str1, str3); + EXPECT_EQ(0, common_length); +} + UCS_TEST_F(test_string, trim) { char str1[] = " foo "; EXPECT_EQ("foo", std::string(ucs_strtrim(str1))); @@ -35,7 +78,37 @@ UCS_TEST_F(test_string, snprintf_safe) { EXPECT_EQ(std::string("123"), buf); } +UCS_TEST_F(test_string, mask_str) { + const uint64_t empty_mask = 0; + + check_mask_str(empty_mask, ""); + + uint64_t mask = empty_mask; + std::string exp_str; + for (int i = 0; i < 64; ++i) { + mask |= UCS_BIT(i); + + if (!exp_str.empty()) { + exp_str += ", "; + } + exp_str += ucs::to_string(i); + + check_mask_str(mask, exp_str); + } +} + +UCS_TEST_F(test_string, range_str) { + char buf[64]; + EXPECT_EQ(std::string("1..10"), + ucs_memunits_range_str(1, 10, buf, sizeof(buf))); + EXPECT_EQ(std::string("10"), + ucs_memunits_range_str(10, 10, buf, sizeof(buf))); +} + class test_string_buffer : public ucs::test { +protected: + void test_fixed(ucs_string_buffer_t *strb, size_t capacity); + void check_extract_mem(ucs_string_buffer_t *strb); }; @@ -89,16 +162,71 @@ UCS_TEST_F(test_string_buffer, rtrim) { ucs_string_buffer_cleanup(&strb); } -UCS_TEST_F(test_string_buffer, fixed) { +void test_string_buffer::test_fixed(ucs_string_buffer_t *strb, size_t capacity) +{ + ucs_string_buffer_appendf(strb, "%s", "im"); + ucs_string_buffer_appendf(strb, "%s", "mrmeeseeks"); + ucs_string_buffer_appendf(strb, "%s", "lookatme"); + + EXPECT_LE(ucs_string_buffer_length(strb), capacity - 1); + EXPECT_EQ(std::string("immrmeeseeksloo"), ucs_string_buffer_cstr(strb)); +} + +UCS_TEST_F(test_string_buffer, fixed_static) { + char buf[17]; + UCS_STRING_BUFFER_STATIC(strb, buf); + test_fixed(&strb, sizeof(buf)); +} + +UCS_TEST_F(test_string_buffer, fixed_init) { + ucs_string_buffer_t strb; char buf[17]; - UCS_STRING_BUFFER_FIXED(strb, buf); - ucs_string_buffer_appendf(&strb, "%s", "im"); - ucs_string_buffer_appendf(&strb, "%s", "mrmeeseeks"); - ucs_string_buffer_appendf(&strb, "%s", "lookatme"); + ucs_string_buffer_init_fixed(&strb, buf, sizeof(buf)); + test_fixed(&strb, sizeof(buf)); +} + +UCS_TEST_F(test_string_buffer, fixed_onstack) { + const size_t num_elems = 17; + UCS_STRING_BUFFER_ONSTACK(strb, num_elems); + 
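/*
 * Editorial sketch: the fixed string-buffer tests below expect appends beyond
 * the caller-provided capacity to truncate rather than fail, always leaving a
 * NUL-terminated string. vsnprintf gives exactly that behavior; a sketch of
 * the bounded append (the dynamic variant would grow the buffer instead).
 */
#include <cstdarg>
#include <cstdio>

struct fixed_strbuf {
    char   *buf;
    size_t  capacity; /* includes space for the terminating NUL */
    size_t  length;
};

static void strbuf_appendf(fixed_strbuf *sb, const char *fmt, ...)
{
    if (sb->length + 1 >= sb->capacity) {
        return; /* already full */
    }

    va_list ap;
    va_start(ap, fmt);
    int n = vsnprintf(sb->buf + sb->length, sb->capacity - sb->length,
                      fmt, ap);
    va_end(ap);

    if (n > 0) {
        size_t avail = sb->capacity - sb->length - 1;
        sb->length  += ((size_t)n < avail) ? (size_t)n : avail;
    }
}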
test_fixed(&strb, num_elems); +} + +UCS_TEST_F(test_string_buffer, append_hex) { + static const uint8_t hexbytes[] = {0xde, 0xad, 0xbe, 0xef, + 0xba, 0xdc, 0xf, 0xee}; + UCS_STRING_BUFFER_ONSTACK(strb, 128); + ucs_string_buffer_append_hex(&strb, hexbytes, + ucs_static_array_size(hexbytes), SIZE_MAX); + EXPECT_EQ(std::string("deadbeef:badc0fee"), ucs_string_buffer_cstr(&strb)); +} + +UCS_TEST_F(test_string_buffer, dump) { + UCS_STRING_BUFFER_ONSTACK(strb, 128); + ucs_string_buffer_appendf(&strb, "hungry\n"); + ucs_string_buffer_appendf(&strb, "for\n"); + ucs_string_buffer_appendf(&strb, "apples\n"); + ucs_string_buffer_dump(&strb, "[ TEST ] ", stdout); +} + +void test_string_buffer::check_extract_mem(ucs_string_buffer_t *strb) +{ + char test_str[] = "test"; + ucs_string_buffer_appendf(strb, "%s", test_str); + char *c_str = ucs_string_buffer_extract_mem(strb); + EXPECT_STREQ(test_str, c_str); + ucs_free(c_str); +} + +UCS_TEST_F(test_string_buffer, extract_mem) { + ucs_string_buffer_t strb; + char buf[8]; - EXPECT_LE(ucs_string_buffer_length(&strb), sizeof(buf) - 1); - EXPECT_EQ(std::string("immrmeeseeksloo"), ucs_string_buffer_cstr(&strb)); + ucs_string_buffer_init_fixed(&strb, buf, sizeof(buf)); + check_extract_mem(&strb); + + ucs_string_buffer_init(&strb); + check_extract_mem(&strb); } class test_string_set : public ucs::test { diff --git a/test/gtest/ucs/test_sys.cc b/test/gtest/ucs/test_sys.cc index dffcfd59311..03bd31103b2 100644 --- a/test/gtest/ucs/test_sys.cc +++ b/test/gtest/ucs/test_sys.cc @@ -23,6 +23,11 @@ class test_sys : public ucs::test { return ucs_get_mem_prot((uintptr_t)address, (uintptr_t)address + size); } + void test_dirname(char *path, int num_layers, const char *expected) { + path = ucs_dirname(path, num_layers); + EXPECT_EQ(std::string(expected), path); + } + void test_memunits(size_t size, const char *expected) { char buf[256]; @@ -139,6 +144,11 @@ UCS_TEST_F(test_sys, module) { EXPECT_EQ(1, test_module_loaded); } +UCS_TEST_F(test_sys, dirname) { + char path[] = "/sys/devices/pci0000:00/0000:00:00.0"; + test_dirname(path, 3, "/sys"); +} + UCS_TEST_F(test_sys, memunits_to_str) { test_memunits(256, "256"); test_memunits(1256, "1256"); diff --git a/test/gtest/ucs/test_topo.cc b/test/gtest/ucs/test_topo.cc index 4b6aed70347..14787c5478b 100644 --- a/test/gtest/ucs/test_topo.cc +++ b/test/gtest/ucs/test_topo.cc @@ -21,16 +21,20 @@ UCS_TEST_F(test_topo, find_device_by_bus_id) { dummy_bus_id.domain = 0xffff; dummy_bus_id.bus = 0xff; dummy_bus_id.slot = 0xff; - dummy_bus_id.function = 1; + dummy_bus_id.function = 1; status = ucs_topo_find_device_by_bus_id(&dummy_bus_id, &dev1); ASSERT_UCS_OK(status); + EXPECT_LT(dev1, UCS_SYS_DEVICE_ID_MAX); - dummy_bus_id.function = 2; + dummy_bus_id.function = 2; status = ucs_topo_find_device_by_bus_id(&dummy_bus_id, &dev2); ASSERT_UCS_OK(status); - ASSERT_EQ(dev2, ((unsigned)dev1 + 1)); + EXPECT_EQ((unsigned)dev1 + 1, dev2); + EXPECT_LT(dev2, UCS_SYS_DEVICE_ID_MAX); + + EXPECT_GE(ucs_topo_num_devices(), 2); } UCS_TEST_F(test_topo, get_distance) { @@ -41,8 +45,60 @@ UCS_TEST_F(test_topo, get_distance) { UCS_SYS_DEVICE_ID_UNKNOWN, &distance); ASSERT_EQ(UCS_OK, status); EXPECT_NEAR(distance.latency, 0.0, 1e-9); + + char buf[128]; + UCS_TEST_MESSAGE << "distance: " + << ucs_topo_distance_str(&distance, buf, sizeof(buf)); } UCS_TEST_F(test_topo, print_info) { ucs_topo_print_info(NULL); } + +UCS_TEST_F(test_topo, bdf_name) { + static const char *bdf_name = "0002:8f:5c.0"; + ucs_sys_device_t sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN; + + ucs_status_t 
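/*
 * Editorial sketch: the topo bdf_name tests around this point accept
 * "domain:bus:slot.function" with hexadecimal fields, allow the zero domain
 * to be omitted, and reject malformed separators or non-hex characters. A
 * hypothetical parser with sscanf and a %n full-consumption check; the real
 * implementation also range-checks the fields.
 */
#include <cstdio>

static int parse_bdf(const char *str, unsigned *domain, unsigned *bus,
                     unsigned *slot, unsigned *func)
{
    int consumed = 0;

    if ((sscanf(str, "%x:%x:%x.%x%n", domain, bus, slot, func,
                &consumed) == 4) && (str[consumed] == '\0')) {
        return 0;
    }
    *domain = 0; /* short form: the zero domain may be omitted */
    if ((sscanf(str, "%x:%x.%x%n", bus, slot, func,
                &consumed) == 3) && (str[consumed] == '\0')) {
        return 0;
    }
    return -1;   /* e.g. "5c.0", "1:2:3", or "0000:8t:5c.0" */
}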
status = ucs_topo_find_device_by_bdf_name(bdf_name, &sys_dev);
+    ASSERT_UCS_OK(status);
+    ASSERT_NE(UCS_SYS_DEVICE_ID_UNKNOWN, sys_dev);
+
+    char name_buffer[64];
+    const char *found_name = ucs_topo_sys_device_bdf_name(sys_dev, name_buffer,
+                                                          sizeof(name_buffer));
+    ASSERT_UCS_OK(status);
+    EXPECT_EQ(std::string(bdf_name), std::string(found_name));
+}
+
+UCS_TEST_F(test_topo, bdf_name_zero_domain) {
+    static const char *bdf_name = "0000:8f:5c.0";
+    ucs_sys_device_t sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN;
+
+    const char *short_bdf = strchr(bdf_name, ':') + 1;
+    ucs_status_t status = ucs_topo_find_device_by_bdf_name(short_bdf, &sys_dev);
+    ASSERT_UCS_OK(status);
+    ASSERT_NE(UCS_SYS_DEVICE_ID_UNKNOWN, sys_dev);
+
+    char name_buffer[64];
+    const char *found_name = ucs_topo_sys_device_bdf_name(sys_dev, name_buffer,
+                                                          sizeof(name_buffer));
+    ASSERT_UCS_OK(status);
+    EXPECT_EQ(std::string(bdf_name), std::string(found_name));
+}
+
+UCS_TEST_F(test_topo, bdf_name_invalid) {
+    ucs_sys_device_t sys_dev = UCS_SYS_DEVICE_ID_UNKNOWN;
+    ucs_status_t status;
+
+    status = ucs_topo_find_device_by_bdf_name("0000:8f:5c!0", &sys_dev);
+    EXPECT_EQ(UCS_ERR_INVALID_PARAM, status);
+
+    status = ucs_topo_find_device_by_bdf_name("0000:8t:5c.0", &sys_dev);
+    EXPECT_EQ(UCS_ERR_INVALID_PARAM, status);
+
+    status = ucs_topo_find_device_by_bdf_name("5c.0", &sys_dev);
+    EXPECT_EQ(UCS_ERR_INVALID_PARAM, status);
+
+    status = ucs_topo_find_device_by_bdf_name("1:2:3", &sys_dev);
+    EXPECT_EQ(UCS_ERR_INVALID_PARAM, status);
+}
diff --git a/test/gtest/ucs/test_type.cc b/test/gtest/ucs/test_type.cc
index 333fea488fb..1e61a0c9d6b 100644
--- a/test/gtest/ucs/test_type.cc
+++ b/test/gtest/ucs/test_type.cc
@@ -8,7 +8,9 @@ extern "C" {
 #include
 #include
+#include <ucs/type/serialize.h>
 #include
+#include <ucs/type/float8.h>
 }
 #include
@@ -48,6 +50,90 @@ UCS_TEST_F(test_type, status) {
     EXPECT_NE(UCS_OK, UCS_PTR_STATUS(ptr));
 }
 
+UCS_TEST_F(test_type, serialize) {
+    std::vector<uint8_t> data(100);
+    const size_t raw_field_size = 3;
+
+    std::vector<uint64_t> values;
+    values.push_back(ucs::rand() % UINT8_MAX);
+    values.push_back(ucs::rand() % UINT32_MAX);
+    for (unsigned i = 0; i < 3; ++i) {
+        values.push_back(ucs::rand() * ucs::rand());
+    }
+    values.push_back(ucs::rand() % UCS_BIT(raw_field_size * 8));
+
+    /* Pack */
+    uint64_t *p64;
+    void *pack_ptr = &data[0];
+
+    *ucs_serialize_next(&pack_ptr, uint8_t)  = values[0];
+    *ucs_serialize_next(&pack_ptr, uint32_t) = values[1];
+    *ucs_serialize_next(&pack_ptr, uint64_t) = values[2];
+    p64  = ucs_serialize_next(&pack_ptr, uint64_t);
+    *p64 = values[3];
+    *ucs_serialize_next(&pack_ptr, uint64_t) = values[4];
+    /* Pack raw 3-byte value */
+    memcpy(ucs_serialize_next_raw(&pack_ptr, void, raw_field_size), &values[5],
+           raw_field_size);
+    EXPECT_EQ(1 + 4 + (3 * 8) + raw_field_size,
+              UCS_PTR_BYTE_DIFF(&data[0], pack_ptr));
+
+    /* Unpack */
+    const void *unpack_ptr = &data[0];
+    uint64_t value;
+    value = *ucs_serialize_next(&unpack_ptr, const uint8_t);
+    EXPECT_EQ(values[0], value);
+    value = *ucs_serialize_next(&unpack_ptr, const uint32_t);
+    EXPECT_EQ(values[1], value);
+    for (unsigned i = 0; i < 3; ++i) {
+        value = *ucs_serialize_next(&unpack_ptr, const uint64_t);
+        EXPECT_EQ(values[2 + i], value);
+    }
+    /* Unpack raw 3-byte value */
+    value = 0;
+    memcpy(&value, ucs_serialize_next_raw(&unpack_ptr, void, raw_field_size),
+           raw_field_size);
+    EXPECT_EQ(values[5], value);
+
+    EXPECT_EQ(pack_ptr, unpack_ptr);
+}
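Reviewer note on the serialize test above: ucs_serialize_next(&ptr, type) evidently hands back the current cursor as a typed pointer and advances it by sizeof(type), so consecutive fields land back to back with no alignment padding. A minimal standalone sketch of that cursor pattern under those assumed semantics (the demo_* names are hypothetical, not UCX API):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Assumed semantics, reconstructed from the assertions above: return the
     * current cursor position and advance it by 'size' bytes. */
    static void *demo_serialize_next_raw(void **ptr_p, size_t size)
    {
        void *pos = *ptr_p;

        *ptr_p = (char*)pos + size;
        return pos;
    }

    /* Typed wrapper; the resulting layout is fully packed, which is exactly
     * what the 1 + 4 + (3 * 8) + 3 byte-offset check verifies. */
    #define demo_serialize_next(_ptr_p, _type) \
        ((_type*)demo_serialize_next_raw(_ptr_p, sizeof(_type)))

    int main(void)
    {
        uint8_t buffer[16];
        void *p = buffer;
        uint32_t v32 = 0xdeadbeefu;

        *demo_serialize_next(&p, uint8_t) = 0x42;
        memcpy(demo_serialize_next_raw(&p, sizeof(v32)), &v32, sizeof(v32));
        assert((uint8_t*)p - buffer == 1 + 4); /* fields are adjacent */

        p   = buffer;
        v32 = 0;
        assert(*demo_serialize_next(&p, uint8_t) == 0x42);
        memcpy(&v32, demo_serialize_next_raw(&p, sizeof(v32)), sizeof(v32));
        assert(v32 == 0xdeadbeefu);
        return 0;
    }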
+
+/* Represents latency (in ns) */
+UCS_FP8_DECLARE_TYPE(LATENCY, UCS_BIT(7), UCS_BIT(20))
+
+UCS_TEST_F(test_type, pack_float) {
+    const std::size_t values_size = 10;
+    double values_array[values_size] = {
+        130, 135.1234, 140, 200, 400, 1000, 10000, 100000, 1000000, 1000000
+    };
+    std::vector<double> values(values_array, values_array + values_size);
+    float unpacked;
+
+    /* 0 -> 0 */
+    unpacked = UCS_FP8_UNPACK(LATENCY, UCS_FP8_PACK(LATENCY, 0));
+    EXPECT_EQ(unpacked, 0);
+
+    /* NaN -> NaN */
+    unpacked = UCS_FP8_UNPACK(LATENCY, UCS_FP8_PACK(LATENCY, NAN));
+    EXPECT_TRUE(isnan(unpacked));
+
+    /* Below min -> min */
+    EXPECT_EQ(UCS_FP8_UNPACK(LATENCY, UCS_FP8_PACK(LATENCY, UCS_BIT(7))),
+              UCS_FP8_UNPACK(LATENCY, UCS_FP8_PACK(LATENCY, 15)));
+
+    /* Precision test throughout the whole range */
+    for (std::vector<double>::const_iterator it = values.begin();
+         it < values.end(); it++) {
+        unpacked = UCS_FP8_UNPACK(LATENCY, UCS_FP8_PACK(LATENCY, *it));
+        ucs_assert((UCS_FP8_PRECISION < unpacked / *it) &&
+                   (unpacked / *it <= 1));
+    }
+
+    /* Above max -> max */
+    EXPECT_EQ(UCS_FP8_UNPACK(LATENCY, UCS_FP8_PACK(LATENCY, UCS_BIT(20))),
+              UCS_FP8_UNPACK(LATENCY, UCS_FP8_PACK(LATENCY, 200000000)));
+}
+
 class test_init_once: public test_type {
 protected:
     test_init_once() : m_once(INIT_ONCE_INIT), m_count(0) {};
diff --git a/test/gtest/ucs/test_vfs.cc b/test/gtest/ucs/test_vfs.cc
new file mode 100644
index 00000000000..5b21c7155ce
--- /dev/null
+++ b/test/gtest/ucs/test_vfs.cc
@@ -0,0 +1,259 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2020.  ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include
+extern "C" {
+#include
+#include
+#include
+}
+
+#include
+#include
+
+
+class test_vfs_sock : public ucs::test
+{
+protected:
+    virtual void init()
+    {
+        int ret = socketpair(AF_UNIX, SOCK_STREAM, 0, m_sockets);
+        if (ret < 0) {
+            UCS_TEST_ABORT("failed to create socket pair: " << strerror(errno));
+        }
+
+        /* socket[1] will always receive credentials */
+        ucs_vfs_sock_setopt_passcred(m_sockets[1]);
+    }
+
+    virtual void cleanup()
+    {
+        close(m_sockets[1]);
+        close(m_sockets[0]);
+    }
+
+protected:
+    void
+    do_send_recv(ucs_vfs_sock_action_t action, int send_sockfd, int recv_sockfd,
+                 int fd_in, ucs_vfs_sock_message_t *msg_out)
+    {
+        int ret;
+
+        ucs_vfs_sock_message_t msg_in = {};
+        msg_in.action = action;
+        msg_in.fd     = fd_in;
+        ret = ucs_vfs_sock_send(send_sockfd, &msg_in);
+        ASSERT_EQ(0, ret) << strerror(-ret);
+
+        ret = ucs_vfs_sock_recv(recv_sockfd, msg_out);
+        ASSERT_EQ(0, ret) << strerror(-ret);
+        EXPECT_EQ(action, msg_out->action);
+    }
+
+    ino_t fd_inode(int fd)
+    {
+        struct stat st;
+        int ret = fstat(fd, &st);
+        if (ret < 0) {
+            UCS_TEST_ABORT("stat() failed: " << strerror(errno));
+        }
+        return st.st_ino;
+    }
+
+    int m_sockets[2];
+};
+
+UCS_TEST_F(test_vfs_sock, send_recv_stop) {
+    /* send stop/start commands from socket[0] to socket[1] */
+    ucs_vfs_sock_message_t msg_out = {};
+    do_send_recv(UCS_VFS_SOCK_ACTION_STOP, m_sockets[0], m_sockets[1], -1,
+                 &msg_out);
+}
+
+UCS_TEST_F(test_vfs_sock, send_recv_mount) {
+    /* send mount request from socket[0] to socket[1] */
+    ucs_vfs_sock_message_t msg_out = {};
+    do_send_recv(UCS_VFS_SOCK_ACTION_MOUNT, m_sockets[0], m_sockets[1], -1,
+                 &msg_out);
+    EXPECT_EQ(getpid(), msg_out.pid);
+}
+
+UCS_TEST_F(test_vfs_sock, send_recv_mount_reply) {
+    /* open a file */
+    int fd = open("/dev/null", O_WRONLY);
+    if (fd < 0) {
+        UCS_TEST_ABORT("failed to open /dev/null: " << strerror(errno));
+    }
+
+    /* send mount reply with fd from socket[1] to socket[0] */
+    ucs_vfs_sock_message_t msg_out = {};
+    do_send_recv(UCS_VFS_SOCK_ACTION_MOUNT_REPLY, m_sockets[1], m_sockets[0],
+                 fd, &msg_out);
+
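+    /* Review note: descriptor passing over an AF_UNIX socket (SCM_RIGHTS,
+     * assumed to be what ucs_vfs_sock_send/recv use underneath) installs a
+     * new descriptor number in the receiving process that refers to the
+     * same open file - hence the "different fd, same inode" checks below. */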
<< "send fd: " << fd << " recv fd: " << msg_out.fd; + /* expect to have different fd but same inode */ + ASSERT_NE(msg_out.fd, fd); + EXPECT_EQ(fd_inode(fd), fd_inode(msg_out.fd)); + + close(msg_out.fd); + close(fd); +} + +UCS_TEST_F(test_vfs_sock, send_recv_nop) { + /* send stop/start commands from socket[0] to socket[1] */ + ucs_vfs_sock_message_t msg_out = {}; + do_send_recv(UCS_VFS_SOCK_ACTION_NOP, m_sockets[0], m_sockets[1], -1, + &msg_out); +} + +class test_vfs_obj : public ucs::test { +public: + static void file_show_cb(void *obj, ucs_string_buffer_t *strb, + void *arg_ptr, uint64_t arg_u64) + { + ucs_string_buffer_appendf(strb, "%s", file_content().c_str()); + } + + static std::string file_content() + { + return "info"; + } + + static void list_dir_cb(const char *name, void *arg) + { + char *buffer = static_cast(arg); + + strcpy(buffer, name); + } + + static void refresh_cb(void *obj) + { + ucs_vfs_obj_add_ro_file(obj, test_vfs_obj::file_show_cb, NULL, 0, + "info"); + } + + static void *create_simple_tree() + { + static char obj; + ucs_vfs_obj_add_dir(NULL, &obj, "obj"); + ucs_vfs_obj_add_ro_file(&obj, test_vfs_obj::file_show_cb, NULL, 0, + "info"); + return &obj; + } +}; + +UCS_MT_TEST_F(test_vfs_obj, simple_obj_tree, 4) { + char obj1, obj2, obj3, obj4; + + /** + * obj1 + * | + * |____obj2 + * | | + * | |____obj3 + * | + * |____obj4 + */ + + ucs_vfs_obj_add_dir(NULL, &obj1, "obj1"); + ucs_vfs_obj_add_dir(&obj1, &obj2, "obj2"); + ucs_vfs_obj_add_dir(&obj2, &obj3, "obj3"); + ucs_vfs_obj_add_dir(&obj1, &obj4, "obj4"); + ucs_vfs_obj_remove(&obj1); +} + +UCS_MT_TEST_F(test_vfs_obj, remove_middle_obj, 4) { + char obj1, obj2, obj3; + + ucs_vfs_obj_add_dir(NULL, &obj1, "obj1"); + ucs_vfs_obj_add_dir(&obj1, &obj2, "subdir/obj2"); + ucs_vfs_obj_add_dir(&obj2, &obj3, "obj3"); + ucs_vfs_obj_remove(&obj2); + ucs_vfs_obj_remove(&obj1); +} + +UCS_MT_TEST_F(test_vfs_obj, path_get_info, 4) { + void *obj = create_simple_tree(); + + ucs_vfs_path_info_t path_info; + ucs_status_t status = ucs_vfs_path_get_info("/obj", &path_info); + EXPECT_EQ(status, UCS_OK); + EXPECT_EQ(path_info.size, 1); + EXPECT_TRUE(path_info.mode & S_IFDIR); + + status = ucs_vfs_path_get_info("/obj/info", &path_info); + EXPECT_EQ(status, UCS_OK); + EXPECT_EQ(path_info.size, file_content().size()); + EXPECT_TRUE(path_info.mode & S_IFREG); + + status = ucs_vfs_path_get_info("invalid_path", &path_info); + EXPECT_EQ(status, UCS_ERR_NO_ELEM); + + barrier(); + ucs_vfs_obj_remove(obj); +} + +UCS_MT_TEST_F(test_vfs_obj, path_read_file, 4) { + void *obj = create_simple_tree(); + + ucs_string_buffer_t strb; + ucs_string_buffer_init(&strb); + ucs_status_t status = ucs_vfs_path_read_file("/obj", &strb); + EXPECT_EQ(status, UCS_ERR_NO_ELEM); + + status = ucs_vfs_path_read_file("/obj/info", &strb); + EXPECT_EQ(status, UCS_OK); + EXPECT_EQ(file_content(), ucs_string_buffer_cstr(&strb)); + + status = ucs_vfs_path_read_file("invalid_path", &strb); + EXPECT_EQ(status, UCS_ERR_NO_ELEM); + ucs_string_buffer_cleanup(&strb); + + barrier(); + ucs_vfs_obj_remove(obj); +} + +UCS_MT_TEST_F(test_vfs_obj, path_list_dir, 4) { + void *obj = create_simple_tree(); + + char buffer[32]; + ucs_status_t status = ucs_vfs_path_list_dir("/obj", + test_vfs_obj::list_dir_cb, + buffer); + EXPECT_EQ(status, UCS_OK); + EXPECT_STREQ(buffer, "info"); + + status = ucs_vfs_path_list_dir("/obj/info", test_vfs_obj::list_dir_cb, + buffer); + EXPECT_EQ(status, UCS_ERR_NO_ELEM); + + status = ucs_vfs_path_list_dir("invalid_path", test_vfs_obj::list_dir_cb, + buffer); + 
EXPECT_EQ(status, UCS_ERR_NO_ELEM); + + barrier(); + ucs_vfs_obj_remove(obj); +} + +UCS_MT_TEST_F(test_vfs_obj, set_dirty_and_refresh, 4) { + static char obj; + ucs_vfs_obj_add_dir(NULL, &obj, "obj"); + + ucs_vfs_path_info_t path_info; + ucs_status_t status = ucs_vfs_path_get_info("/obj", &path_info); + EXPECT_EQ(status, UCS_OK); + EXPECT_EQ(path_info.size, 0); + + barrier(); + ucs_vfs_obj_set_dirty(&obj, test_vfs_obj::refresh_cb); + + status = ucs_vfs_path_get_info("/obj", &path_info); + EXPECT_EQ(status, UCS_OK); + EXPECT_EQ(path_info.size, 1); + + barrier(); + ucs_vfs_obj_remove(&obj); +} diff --git a/test/gtest/ucs/ucx.conf b/test/gtest/ucs/ucx.conf new file mode 100644 index 00000000000..684e2fbad30 --- /dev/null +++ b/test/gtest/ucs/ucx.conf @@ -0,0 +1,2 @@ +UCX_PRICE=100 +UCX_BRAND=Mazda diff --git a/test/gtest/uct/ib/test_dc.cc b/test/gtest/uct/ib/test_dc.cc index 42c49099c1d..0a2d577d138 100644 --- a/test/gtest/uct/ib/test_dc.cc +++ b/test/gtest/uct/ib/test_dc.cc @@ -173,16 +173,16 @@ UCS_TEST_P(test_dc, dcs_single) { status = uct_ep_am_short(m_e1->ep(0), 0, 0, NULL, 0); EXPECT_UCS_OK(status); /* dci 0 must be assigned to the ep */ - EXPECT_EQ(iface->tx.dcis_stack[0], ep->dci); - EXPECT_EQ(1, iface->tx.stack_top); + EXPECT_EQ(iface->tx.dci_pool[0].stack[0], ep->dci); + EXPECT_EQ(1, iface->tx.dci_pool[0].stack_top); EXPECT_EQ(ep, iface->tx.dcis[ep->dci].ep); flush(); /* after the flush dci must be released */ EXPECT_EQ(UCT_DC_MLX5_EP_NO_DCI, ep->dci); - EXPECT_EQ(0, iface->tx.stack_top); - EXPECT_EQ(0, iface->tx.dcis_stack[0]); + EXPECT_EQ(0, iface->tx.dci_pool[0].stack_top); + EXPECT_EQ(0, iface->tx.dci_pool[0].stack[0]); } UCS_TEST_P(test_dc, dcs_multi) { @@ -203,8 +203,8 @@ UCS_TEST_P(test_dc, dcs_multi) { EXPECT_UCS_OK(status); /* dci on free LIFO must be assigned to the ep */ - EXPECT_EQ(iface->tx.dcis_stack[i], ep->dci); - EXPECT_EQ(i+1, iface->tx.stack_top); + EXPECT_EQ(iface->tx.dci_pool[0].stack[i], ep->dci); + EXPECT_EQ(i+1, iface->tx.dci_pool[0].stack_top); EXPECT_EQ(ep, iface->tx.dcis[ep->dci].ep); } @@ -216,7 +216,7 @@ UCS_TEST_P(test_dc, dcs_multi) { /* after the flush dci must be released */ - EXPECT_EQ(0, iface->tx.stack_top); + EXPECT_EQ(0, iface->tx.dci_pool[0].stack_top); for (i = 0; i < iface->tx.ndci; i++) { ep = dc_ep(m_e1, i); EXPECT_EQ(UCT_DC_MLX5_EP_NO_DCI, ep->dci); @@ -242,15 +242,15 @@ UCS_TEST_P(test_dc, dcs_ep_destroy) { EXPECT_EQ(UCT_DC_MLX5_EP_NO_DCI, ep->dci); send_am_messages(m_e1, 2, UCS_OK); /* dci 0 must be assigned to the ep */ - EXPECT_EQ(iface->tx.dcis_stack[0], ep->dci); - EXPECT_EQ(1, iface->tx.stack_top); + EXPECT_EQ(iface->tx.dci_pool[0].stack[0], ep->dci); + EXPECT_EQ(1, iface->tx.dci_pool[0].stack_top); EXPECT_EQ(ep, iface->tx.dcis[ep->dci].ep); m_e1->destroy_eps(); - EXPECT_EQ(1, iface->tx.stack_top); + EXPECT_EQ(1, iface->tx.dci_pool[0].stack_top); flush(); - EXPECT_EQ(0, iface->tx.stack_top); + EXPECT_EQ(0, iface->tx.dci_pool[0].stack_top); } /** @@ -270,8 +270,8 @@ UCS_TEST_P(test_dc, dcs_ep_flush_destroy) { status = uct_ep_am_short(m_e1->ep(0), 0, 0, NULL, 0); EXPECT_UCS_OK(status); - EXPECT_EQ(iface->tx.dcis_stack[0], ep->dci); - EXPECT_EQ(1, iface->tx.stack_top); + EXPECT_EQ(iface->tx.dci_pool[0].stack[0], ep->dci); + EXPECT_EQ(1, iface->tx.dci_pool[0].stack_top); EXPECT_EQ(ep, iface->tx.dcis[ep->dci].ep); comp.uct_comp.count = 1; @@ -284,7 +284,7 @@ UCS_TEST_P(test_dc, dcs_ep_flush_destroy) { progress(); } while (comp.uct_comp.count > 0); - EXPECT_EQ(0, iface->tx.stack_top); + EXPECT_EQ(0, iface->tx.dci_pool[0].stack_top); } 
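Reviewer note: the mechanical s/tx.dcis_stack/tx.dci_pool[0].stack/ substitutions in this file track a UCT-side refactor in which the single free-DCI LIFO becomes one LIFO per DCI pool; these tests always exercise pool 0. A schematic sketch of the shape change only (demo_* names are hypothetical, not the actual uct_dc_mlx5_iface_t layout):

    #include <stdint.h>

    #define NDCI 8

    /* before: one global LIFO of free DCIs */
    struct demo_tx_before {
        uint8_t dcis_stack[NDCI]; /* LIFO of free DCI indices */
        uint8_t stack_top;        /* number of DCIs currently popped (in use) */
    };

    /* after: the same LIFO, replicated once per DCI pool; the assertions in
     * this file always index pool 0 */
    struct demo_dci_pool {
        uint8_t stack[NDCI];
        uint8_t stack_top;
    };

    struct demo_tx_after {
        struct demo_dci_pool dci_pool[1];
    };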
 UCS_TEST_P(test_dc, dcs_ep_flush_pending, "DC_NUM_DCI=1") {
@@ -331,7 +331,7 @@ UCS_TEST_P(test_dc, dcs_ep_flush_pending, "DC_NUM_DCI=1") {
     flush();
 
     /* check that ep does not hold dci */
-    EXPECT_EQ(0, iface->tx.stack_top);
+    EXPECT_EQ(0, iface->tx.dci_pool[0].stack_top);
 }
 
 /* check that ep does not hold dci after purge
@@ -464,7 +464,7 @@ class test_dc_flow_control : public test_rc_flow_control {
         for (int i = 0; i < iface->tx.ndci; ++i) {
             uct_rc_txqp_available_set(&iface->tx.dcis[i].txqp, 0);
         }
-        iface->tx.stack_top = iface->tx.ndci;
+        iface->tx.dci_pool[0].stack_top = iface->tx.ndci;
     }
 
     virtual void enable_entity(entity *e, unsigned cq_num = 128) {
@@ -475,7 +475,7 @@ class test_dc_flow_control : public test_rc_flow_control {
             uct_rc_txqp_available_set(&iface->tx.dcis[i].txqp,
                                       iface->tx.dcis[i].txwq.bb_max);
         }
-        iface->tx.stack_top = 0;
+        iface->tx.dci_pool[0].stack_top = 0;
     }
 };
 
@@ -610,10 +610,10 @@ UCS_TEST_P(test_dc_flow_control, dci_leak)
     /* Make sure that ep does not hold dci when sends completed */
     uct_dc_mlx5_iface_t *iface = ucs_derived_of(m_e1->iface(), uct_dc_mlx5_iface_t);
     ucs_time_t deadline = ucs::get_deadline();
-    while (iface->tx.stack_top && (ucs_get_time() < deadline)) {
+    while (iface->tx.dci_pool[0].stack_top && (ucs_get_time() < deadline)) {
         progress();
     }
-    EXPECT_EQ(0, iface->tx.stack_top);
+    EXPECT_EQ(0, iface->tx.dci_pool[0].stack_top);
 
     /* Clean up FC and pending to avoid assertions during tear down */
     uct_ep_pending_purge(m_e1->ep(0),
@@ -694,7 +694,6 @@ UCS_TEST_P(test_dc_fc_deadlock, basic, "DC_NUM_DCI=1")
     validate_grant(m_e2);
 
     // Restore m_e1 for proper cleanup
-    ucs_derived_of(m_e1->iface(), uct_dc_mlx5_iface_t)->tx.fc_grants = 0;
     uct_ep_pending_purge(m_e1->ep(0), NULL, NULL);
 }
diff --git a/test/gtest/uct/ib/test_ib.cc b/test/gtest/uct/ib/test_ib.cc
index 5c1a3433c66..3bf927ecf0c 100644
--- a/test/gtest/uct/ib/test_ib.cc
+++ b/test/gtest/uct/ib/test_ib.cc
@@ -5,13 +5,19 @@
  */
 
 #include
+#ifdef HAVE_MLX5_HW
+extern "C" {
+#include <uct/ib/mlx5/ib_mlx5.h>
+}
+#endif
+
 test_uct_ib::test_uct_ib() : m_e1(NULL), m_e2(NULL) {
 }
 
 void test_uct_ib::create_connected_entities() {
     m_e1 = uct_test::create_entity(0);
     m_e2 = uct_test::create_entity(0);
-
+
     m_entities.push_back(m_e1);
     m_entities.push_back(m_e2);
 
@@ -93,8 +99,7 @@ class test_uct_ib_addr : public test_uct_ib {
         pack_params.gid       = gid_in;
         pack_params.lid       = lid_in;
         pack_params.roce_info = iface->gid_info.roce_info;
-        /* to suppress gcc 4.3.4 warning */
-        pack_params.path_mtu  = (enum ibv_mtu)0;
+        pack_params.path_mtu  = iface->config.path_mtu;
         pack_params.gid_index = std::numeric_limits<uint8_t>::max();
         pack_params.pkey      = iface->pkey;
         address_size = uct_ib_address_size(&pack_params);
@@ -124,8 +129,15 @@ class test_uct_ib_addr : public test_uct_ib {
             EXPECT_EQ(gid_in.global.interface_id,
                       unpack_params.gid.global.interface_id);
         }
-        EXPECT_TRUE(!(unpack_params.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU));
-        EXPECT_EQ(UCT_IB_ADDRESS_INVALID_PATH_MTU, unpack_params.path_mtu);
+        if (iface->config.path_mtu == IBV_MTU_4096) {
+            EXPECT_FALSE(unpack_params.flags &
+                         UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU);
+            EXPECT_EQ(UCT_IB_ADDRESS_INVALID_PATH_MTU, unpack_params.path_mtu);
+        } else {
+            EXPECT_TRUE(unpack_params.flags &
+                        UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU);
+            EXPECT_EQ(iface->config.path_mtu, unpack_params.path_mtu);
+        }
 
         EXPECT_TRUE(!(unpack_params.flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX));
         EXPECT_EQ(UCT_IB_ADDRESS_INVALID_GID_INDEX, unpack_params.gid_index);
@@ -174,6 +186,20 @@ UCS_TEST_P(test_uct_ib_addr, address_pack) {
     test_address_pack(0xdeadfeedbeefa880ul);
 }
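Reviewer note: the reworked assertions above, together with the new address_pack_path_mtu test that follows, pin down the packing rule - IBV_MTU_4096 is the implied default and stays off the wire, while any smaller configured MTU is packed and advertised via UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU. A sketch of that conditional-field idiom under stated assumptions (demo_* names are hypothetical, not UCX's packing code):

    #include <stddef.h>
    #include <stdint.h>

    enum {
        DEMO_FLAG_PATH_MTU = 1u << 0, /* stands in for the UCT pack flag   */
        DEMO_MTU_DEFAULT   = 5        /* verbs enum value of IBV_MTU_4096  */
    };

    /* The default MTU is implied and takes no space on the wire; any other
     * value is packed and flagged so the receiver knows it is present. */
    static size_t demo_pack_mtu(uint8_t *buf, uint8_t *flags, uint8_t mtu)
    {
        if (mtu == DEMO_MTU_DEFAULT) {
            return 0; /* receiver falls back to the default */
        }

        *flags |= DEMO_FLAG_PATH_MTU;
        buf[0]  = mtu; /* e.g. IBV_MTU_2048 has verbs enum value 4 */
        return 1;
    }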
+UCS_TEST_P(test_uct_ib_addr, address_pack_path_mtu, "IB_PATH_MTU=2048")
+{
+    uct_ib_iface_t *iface = ucs_derived_of(m_entities.front()->iface(),
+                                           uct_ib_iface_t);
+    size_t addr_len = uct_ib_iface_address_size(iface);
+    std::vector<char> buffer(addr_len);
+    uct_ib_address_t *addr = (uct_ib_address_t*)&buffer[0];
+    uct_ib_iface_address_pack(iface, addr);
+    uct_ib_address_pack_params_t params;
+    uct_ib_address_unpack(addr, &params);
+    EXPECT_TRUE(params.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU);
+    EXPECT_EQ(IBV_MTU_2048, params.path_mtu);
+}
+
 UCS_TEST_P(test_uct_ib_addr, fill_ah_attr) {
     test_fill_ah_attr(UCT_IB_LINK_LOCAL_PREFIX);
     test_fill_ah_attr(UCT_IB_SITE_LOCAL_PREFIX | htobe64(0x7200));
@@ -369,6 +395,57 @@ UCS_TEST_P(test_uct_ib_gid_idx, non_default_gid_idx, "GID_INDEX=1") {
 
 UCT_INSTANTIATE_IB_TEST_CASE(test_uct_ib_gid_idx);
 
+
+#if HAVE_DEVX
+class test_uct_ib_sl : public test_uct_ib_with_specific_port {
+public:
+    void check_port_attr() {
+        ucs_status_t status;
+        ucs::handle<uct_md_h> uct_md;
+
+        UCS_TEST_CREATE_HANDLE(uct_md_h, uct_md, uct_ib_md_close, uct_ib_md_open,
+                               &uct_ib_component,
+                               ibv_get_device_name(m_ibctx->device),
+                               m_md_config);
+
+        uct_ib_mlx5_md_t *ib_md = ucs_derived_of(uct_md, uct_ib_mlx5_md_t);
+
+        /* check if OOO SL mask can be detected for the port */
+        status = uct_ib_mlx5_devx_query_ooo_sl_mask(ib_md, m_port,
+                                                    &m_ooo_sl_mask);
+        EXPECT_TRUE((status == UCS_OK) || (status == UCS_ERR_UNSUPPORTED));
+        if (status != UCS_OK) {
+            UCS_TEST_SKIP_R("OOO SL mask couldn't be detected for " +
+                            m_dev_name + ":" + ucs::to_string(m_port));
+        }
+    }
+
+protected:
+    uint16_t m_ooo_sl_mask;
+};
+
+UCS_TEST_P(test_uct_ib_sl, check_ib_sl_config) {
+    // go over all SLs and check that UCTs can be initialized on a specific SL
+    // and are able to send/recv traffic
+    for (uint8_t sl = 0; sl < UCT_IB_SL_NUM; ++sl) {
+        if (!has_transport("rc_verbs") && !has_transport("ud_verbs")) {
+            // if AR is configured on the given SL, set AR_ENABLE to "y",
+            // otherwise - to "n", in order to test that the AR_ENABLE
+            // parameter works as expected w/o errors and warnings
+            modify_config("IB_AR_ENABLE",
+                          (m_ooo_sl_mask & UCS_BIT(sl)) ? "y" : "n");
+        }
+        modify_config("IB_SL", ucs::to_string(static_cast<int>(sl)));
+
+        test_uct_ib::init();
+        send_recv_short();
+        test_uct_ib::cleanup();
+    }
+}
+
+UCT_INSTANTIATE_IB_TEST_CASE(test_uct_ib_sl);
+#endif
+
 class test_uct_ib_utils : public ucs::test {
 };
 
@@ -455,6 +532,250 @@ UCS_TEST_F(test_uct_ib_utils, sec_to_rnr_time) {
 }
 
 
+#if HAVE_DEVX
+class test_uct_ib_sl_utils : public test_uct_ib_utils {
+protected:
+    ucs_status_t ib_select_sl(ucs_ternary_auto_value_t ar_enable,
+                              uint64_t test_ooo_sl_mask,
+                              const uct_ib_iface_config_t &config,
+                              uint8_t &sl) const {
+        uint16_t ooo_sl_mask = (test_ooo_sl_mask !=
+                                m_ooo_sl_mask_not_detected) ?
+                               static_cast<uint16_t>(test_ooo_sl_mask) : 0;
+        return uct_ib_mlx5_select_sl(&config, ar_enable, ooo_sl_mask,
+                                     (test_ooo_sl_mask !=
+                                      m_ooo_sl_mask_not_detected),
+                                     "mlx5_0", 1, &sl);
+    }
+
+    ucs_status_t select_sl_ok(ucs_ternary_auto_value_t ar_enable,
+                              unsigned long config_sl,
+                              uint64_t ooo_sl_mask,
+                              const uct_ib_iface_config_t &config) const {
+        uint16_t sls_with_ar, sls_without_ar;
+        ucs_status_t status;
+        uint8_t sl;
+
+        if (ooo_sl_mask != m_ooo_sl_mask_not_detected) {
+            sls_with_ar    = static_cast<uint16_t>(ooo_sl_mask);
+            sls_without_ar = static_cast<uint16_t>(~ooo_sl_mask);
+        } else {
+            sls_with_ar    =
+            sls_without_ar = 0;
+        }
+
+        status = ib_select_sl(ar_enable, ooo_sl_mask, config, sl);
+        if ((ooo_sl_mask == 0) || (ar_enable == UCS_AUTO)) {
+            if (config_sl == UCS_ULUNITS_AUTO) {
+                EXPECT_EQ(m_default_sl, sl);
+            } else {
+                EXPECT_EQ(static_cast<uint8_t>(config_sl), sl);
+            }
+        } else if (config_sl == UCS_ULUNITS_AUTO) {
+            if ((ar_enable == UCS_YES) ||
+                ((ar_enable == UCS_TRY) && (sls_with_ar != 0))) {
+                EXPECT_EQ(ucs_ffs64_safe(sls_with_ar), sl);
+            } else if ((ar_enable == UCS_NO) ||
+                       ((ar_enable == UCS_TRY) && (sls_without_ar != 0))) {
+                EXPECT_EQ(ucs_ffs64_safe(sls_without_ar), sl);
+            } else {
+                EXPECT_EQ(UCS_TRY, ar_enable);
+                EXPECT_EQ(m_default_sl, sl);
+            }
+        } else {
+            EXPECT_EQ(static_cast<uint8_t>(config_sl), sl);
+        }
+
+        return status;
+    }
+
+    static ucs_log_func_rc_t
+    wrap_errors_check_sl_masks_logger(const char *file, unsigned line,
+                                      const char *function,
+                                      ucs_log_level_t level,
+                                      const ucs_log_component_config_t *
+                                      comp_conf,
+                                      const char *message, va_list ap)
+    {
+        if (level == UCS_LOG_LEVEL_ERROR) {
+            std::string err_str = format_message(message, ap);
+
+            for (uint8_t sl = 0; sl < UCT_IB_SL_NUM; ++sl) {
+                std::string sl_val = ucs::to_string(static_cast<int>(sl));
+
+                if ((err_str.find(sl_val + ", ") == std::string::npos) &&
+                    (err_str.find(sl_val + " }") == std::string::npos)) {
+                    return UCS_LOG_FUNC_RC_CONTINUE;
+                }
+            }
+
+            return UCS_LOG_FUNC_RC_STOP;
+        }
+
+        return UCS_LOG_FUNC_RC_CONTINUE;
+    }
+
+    ucs_status_t select_sl_nok(ucs_ternary_auto_value_t ar_enable,
+                               unsigned long config_sl, uint64_t ooo_sl_mask,
+                               const uct_ib_iface_config_t &config) const {
+        scoped_log_handler slh(((ooo_sl_mask != m_ooo_sl_mask_not_detected) &&
+                                (config_sl == UCS_ULUNITS_AUTO)) ?
+                               wrap_errors_check_sl_masks_logger :
+                               wrap_errors_logger);
+        uint8_t sl;
+
+        EXPECT_NE(UCS_AUTO, ar_enable);
+
+        return ib_select_sl(ar_enable, ooo_sl_mask, config, sl);
+    }
+
+    void select_sl(ucs_ternary_auto_value_t ar_enable, unsigned long config_sl,
+                   uint64_t ooo_sl_mask, ucs_status_t exp_status) const {
+        uct_ib_iface_config_t config = {};
+        ucs_status_t status;
+
+        config.sl = config_sl;
+
+        if (exp_status == UCS_OK) {
+            status = select_sl_ok(ar_enable, config_sl, ooo_sl_mask, config);
+        } else {
+            status = select_sl_nok(ar_enable, config_sl, ooo_sl_mask, config);
+        }
+        EXPECT_EQ(exp_status, status);
+    }
+
+protected:
+    const static uint64_t m_ooo_sl_mask_not_detected;
+    const static uint8_t  m_default_sl;
+};
+
+const uint64_t test_uct_ib_sl_utils::m_ooo_sl_mask_not_detected =
+        std::numeric_limits<uint64_t>::max();
+const uint8_t test_uct_ib_sl_utils::m_default_sl = 0;
+
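Reviewer note: select_sl_ok above encodes the selection contract - an explicit IB_SL always wins; otherwise the first SL whose adaptive-routing property matches AR_ENABLE is chosen via ucs_ffs64_safe. A compact sketch of that rule, reduced to the strict yes/no cases (demo_select_sl is hypothetical; uct_ib_mlx5_select_sl's real signature and UCS_TRY/UCS_AUTO handling differ):

    #include <stdbool.h>
    #include <stdint.h>
    #include <strings.h> /* ffs() */

    /* One bit per SL in 'ooo_sl_mask'; a set bit means that SL supports
     * out-of-order delivery (adaptive routing). Returns the selected SL,
     * or -1 when no SL can satisfy the request. */
    static int demo_select_sl(bool want_ar, bool mask_detected,
                              uint16_t ooo_sl_mask)
    {
        uint16_t candidates;

        if (!mask_detected) {
            /* strict yes/no requests fail when AR support is unknown;
             * UCS_TRY/UCS_AUTO (omitted here) fall back to the default SL */
            return -1;
        }

        candidates = want_ar ? ooo_sl_mask : (uint16_t)~ooo_sl_mask;
        if (candidates == 0) {
            return -1;
        }

        return ffs(candidates) - 1; /* lowest matching SL, as asserted above */
    }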
+UCS_TEST_F(test_uct_ib_sl_utils, sl_selection) {
+    const ucs_status_t err_status = UCS_ERR_UNSUPPORTED;
+
+    for (unsigned i = 0; i < static_cast<unsigned>(UCS_TERNARY_LAST); i++) {
+        ucs_ternary_auto_value_t ar_enable =
+                static_cast<ucs_ternary_auto_value_t>(i);
+
+        // select the default SL, with empty OOO SL mask
+        select_sl(ar_enable, UCS_ULUNITS_AUTO, 0,
+                  (ar_enable == UCS_YES) ? err_status : UCS_OK);
+
+        // select the default SL, without OOO SL mask (not detected)
+        select_sl(ar_enable, UCS_ULUNITS_AUTO, m_ooo_sl_mask_not_detected,
+                  ((ar_enable != UCS_TRY) && (ar_enable != UCS_AUTO)) ?
+                  err_status : UCS_OK);
+
+        for (uint8_t sl = 0; sl < UCT_IB_SL_NUM; ++sl) {
+            // select the default SL, with an OOO SL mask which contains
+            // only <sl>
+            select_sl(ar_enable, UCS_ULUNITS_AUTO, UCS_BIT(sl), UCS_OK);
+
+            // select SL=<sl>, with empty OOO SL mask
+            select_sl(ar_enable, sl, 0,
+                      (ar_enable == UCS_YES) ? err_status : UCS_OK);
+
+            // select SL=<sl>, without OOO SL mask (not detected)
+            select_sl(ar_enable, sl, m_ooo_sl_mask_not_detected,
+                      ((ar_enable != UCS_TRY) && (ar_enable != UCS_AUTO)) ?
+                      err_status : UCS_OK);
+
+            /* select SL=<sl>, with an OOO SL mask which contains only <sl> */
+            select_sl(ar_enable, sl, UCS_BIT(sl),
+                      (ar_enable == UCS_NO) ? err_status : UCS_OK);
+
+            // select SL=<sl>, with an OOO SL mask which doesn't contain <sl>,
+            // but contains one other element
+            select_sl(ar_enable, sl, UCS_BIT(UCT_IB_SL_NUM - 1 - sl),
+                      (ar_enable == UCS_YES) ? err_status : UCS_OK);
+
+            // select SL=<sl>, with an OOO SL mask which contains <sl> and two
+            // other elements
+            select_sl(ar_enable, sl,
+                      UCS_BIT(sl) |
+                      UCS_BIT((sl + 1) % UCT_IB_SL_NUM) |
+                      UCS_BIT((sl + 2) % UCT_IB_SL_NUM),
+                      (ar_enable == UCS_NO) ? err_status : UCS_OK);
+
+            // select SL=<sl>, with an OOO SL mask which doesn't contain <sl>
+            select_sl(ar_enable, sl,
+                      UCS_BIT((sl + 1) % UCT_IB_SL_NUM) |
+                      UCS_BIT((sl + 2) % UCT_IB_SL_NUM) |
+                      UCS_BIT((sl + 3) % UCT_IB_SL_NUM),
+                      (ar_enable == UCS_YES) ? err_status : UCS_OK);
+
+            // select SL=<sl>, with a full OOO SL mask
+            select_sl(ar_enable, sl, UCS_MASK(UCT_IB_SL_NUM),
+                      (ar_enable == UCS_NO) ? err_status : UCS_OK);
+
+            // select SL=<sl>, with a full OOO SL mask, except <sl>
+            select_sl(ar_enable, sl, UCS_MASK(UCT_IB_SL_NUM) & ~UCS_BIT(sl),
+                      (ar_enable == UCS_YES) ?
err_status : UCS_OK); + } + } +} + +UCS_TEST_F(test_uct_ib_sl_utils, query_ooo_sl_mask) { + int num_devices; + struct ibv_device **ib_device_list; + ucs_status_t status; + + ib_device_list = ibv_get_device_list(&num_devices); + ASSERT_TRUE(ib_device_list != NULL); + + for (int i = 0; i < num_devices; ++i) { + const char *dev_name = ibv_get_device_name(ib_device_list[i]); + uct_md_config_t *md_config; + uct_ib_mlx5_md_t *ib_mlx5_md; + uct_ib_device_t *dev; + uct_md_h md; + + status = uct_md_config_read(&uct_ib_component, NULL, NULL, &md_config); + EXPECT_UCS_OK(status); + if (status != UCS_OK) { + continue; + } + + status = uct_ib_md_open(&uct_ib_component, dev_name, md_config, &md); + EXPECT_UCS_OK(status); + if (status != UCS_OK) { + goto out_md_config_release; + } + + ib_mlx5_md = ucs_derived_of(md, uct_ib_mlx5_md_t); + dev = &ib_mlx5_md->super.dev; + + for (uint8_t port_num = dev->first_port; + port_num <= dev->num_ports; ++port_num) { + uint16_t ooo_sl_mask = 0; + ucs_string_buffer_t strb; + + status = uct_ib_mlx5_devx_query_ooo_sl_mask(ib_mlx5_md, port_num, + &ooo_sl_mask); + EXPECT_TRUE((status == UCS_OK) || (status == UCS_ERR_UNSUPPORTED)); + if ((status != UCS_OK) && (status != UCS_ERR_UNSUPPORTED)) { + continue; + } + + ucs_string_buffer_init(&strb); + UCS_TEST_MESSAGE << "OOO SL mask for " << dev_name << " - { " + << ucs_mask_str(ooo_sl_mask, &strb) << " }"; + ucs_string_buffer_cleanup(&strb); + } + + uct_ib_md_close(md); +out_md_config_release: + uct_config_release(md_config); + } + + ibv_free_device_list(ib_device_list); +} +#endif + + class test_uct_event_ib : public test_uct_ib { public: test_uct_event_ib() { @@ -653,5 +974,72 @@ UCS_TEST_SKIP_COND_P(test_uct_event_ib, txrx_cq, m_e2->flush(); } - UCT_INSTANTIATE_IB_TEST_CASE(test_uct_event_ib); + +class test_uct_ib_mtu : public test_uct_ib { +public: + void create_connected_entities() + { + modify_config("IB_PATH_MTU", "4096"); + m_e1 = uct_test::create_entity(0); + modify_config("IB_PATH_MTU", "2048"); + m_e2 = uct_test::create_entity(0); + + m_entities.push_back(m_e1); + m_entities.push_back(m_e2); + + m_e1->connect(0, *m_e2, 0); + m_e2->connect(0, *m_e1, 0); + } + + static ucs_status_t + ib_am_bcopy_handler(void *arg, void *data, size_t length, unsigned flags) + { + EXPECT_EQ((size_t)arg, length); + ++test_uct_ib::m_ib_am_handler_counter; + return UCS_OK; + } + + static size_t ib_am_bcopy_pack_cb(void *dest, void *arg) + { + size_t length = (size_t)arg; + + memset(dest, 0, length); + return length; + } + + void send_recv_bcopy(uct_ep_h ep, entity *ent, size_t length) + { + size_t start_am_counter = test_uct_ib::m_ib_am_handler_counter; + uct_ib_iface_t *iface = ucs_derived_of(ep->iface, uct_ib_iface_t); + uct_iface_attr_t attr; + ucs_status_t status; + size_t len; + + status = uct_iface_query(&iface->super.super, &attr); + ASSERT_UCS_OK(status); + ASSERT_TRUE(attr.cap.flags & UCT_IFACE_FLAG_AM_BCOPY); + ASSERT_LE(length, attr.cap.am.max_bcopy); + + /* set a callback for the uct to invoke for receiving the data */ + uct_iface_set_am_handler(ent->iface(), 0, ib_am_bcopy_handler, + (void*)length, 0); + + /* send the data */ + len = uct_ep_am_bcopy(ep, 0, ib_am_bcopy_pack_cb, (void*)length, 0); + ASSERT_EQ(length, len); + + flush(); + wait_for_value(&test_uct_ib::m_ib_am_handler_counter, + start_am_counter + 1, true); + } +}; + +UCS_TEST_SKIP_COND_P(test_uct_ib_mtu, non_equal_mtu, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) +{ + send_recv_bcopy(m_e1->ep(0), m_e2, m_e1->iface_attr().cap.am.max_bcopy); + send_recv_bcopy(m_e2->ep(0), 
m_e1, m_e2->iface_attr().cap.am.max_bcopy); +} + +UCT_INSTANTIATE_RC_TEST_CASE(test_uct_ib_mtu); diff --git a/test/gtest/uct/ib/test_ib_event.cc b/test/gtest/uct/ib/test_ib_event.cc index 5800924b4bf..57fdc38fc0e 100644 --- a/test/gtest/uct/ib/test_ib_event.cc +++ b/test/gtest/uct/ib/test_ib_event.cc @@ -59,9 +59,12 @@ class uct_test_event_base : public uct_p2p_test { volatile bool got; }; - static void last_wqe_check_cb(uct_ib_async_event_wait_t *arg) { - event_ctx *event = ucs_derived_of(arg, event_ctx); + static unsigned last_wqe_check_cb(void *arg) { + event_ctx *event = (event_ctx *)arg; event->got = true; + ucs_callbackq_remove_safe(event->super.cbq, event->super.cb_id); + event->super.cb_id = UCS_CALLBACKQ_ID_NULL; + return 1; } virtual void init_qp(entity &e) = 0; @@ -354,7 +357,7 @@ class uct_qp_test_event : public uct_test_event_base { struct ibv_ah_attr ah = ah_attr(); status = uct_rc_mlx5_ep_connect_qp(m_iface, &m_txwq.super, - qp_num(), &ah, path_mtu()); + qp_num(), &ah, path_mtu(), 0); ASSERT_UCS_OK(status); } diff --git a/test/gtest/uct/ib/test_ib_pkey.cc b/test/gtest/uct/ib/test_ib_pkey.cc index 80a9264512c..a77c51bde58 100644 --- a/test/gtest/uct/ib/test_ib_pkey.cc +++ b/test/gtest/uct/ib/test_ib_pkey.cc @@ -122,7 +122,7 @@ class test_uct_ib_pkey : public test_uct_ib_with_specific_port { uct_ib_iface_address_pack(iface, ib_addr); uct_ib_address_unpack(ib_addr, ¶ms); - EXPECT_TRUE((params.flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY) != 0); + EXPECT_TRUE(params.flags & UCT_IB_ADDRESS_PACK_FLAG_PKEY); EXPECT_EQ(m_pkey[0], params.pkey); return params.pkey; diff --git a/test/gtest/uct/ib/test_rc.cc b/test/gtest/uct/ib/test_rc.cc index fcfe1b073e6..b69aca7d58f 100644 --- a/test/gtest/uct/ib/test_rc.cc +++ b/test/gtest/uct/ib/test_rc.cc @@ -99,6 +99,19 @@ UCS_TEST_P(test_rc, tx_cq_moderation) { EXPECT_EQ(init_rsc, rc_ep(m_e1)->txqp.available); } +UCS_TEST_P(test_rc, flush_fc, "FLUSH_MODE?=fc") { + send_am_messages(m_e1, 1, UCS_OK); + + ucs_status_t status; + do { + status = uct_ep_flush(m_e1->ep(0), 0, NULL); + short_progress_loop(); + if (status != UCS_ERR_NO_RESOURCE) { + ASSERT_UCS_OK_OR_INPROGRESS(status); + } + } while (status != UCS_OK); +} + UCT_INSTANTIATE_RC_TEST_CASE(test_rc) @@ -439,7 +452,7 @@ UCS_TEST_SKIP_COND_P(test_rc_get_limit, get_zcopy_purge, ASSERT_EQ(1ul, m_e1->num_eps()); status = uct_ep_flush(m_e1->ep(0), flags, NULL); progress(); - if (flags & UCT_FLUSH_FLAG_CANCEL) { + if ((flags & UCT_FLUSH_FLAG_CANCEL) && (status != UCS_ERR_NO_RESOURCE)) { ASSERT_UCS_OK_OR_INPROGRESS(status); flags = UCT_FLUSH_FLAG_LOCAL; continue; @@ -871,11 +884,6 @@ UCS_TEST_SKIP_COND_P(test_rc_keepalive, pending, { ucs_status_t status; - /* for now rc_mlx5 transport supported only */ - if (!has_transport("rc_mlx5")) { - UCS_TEST_SKIP_R("Unsupported"); - } - scoped_log_handler slh(wrap_errors_logger); flush(); /* ensure that everything works as expected */ @@ -910,3 +918,83 @@ UCS_TEST_SKIP_COND_P(test_rc_keepalive, pending, } UCT_INSTANTIATE_RC_TEST_CASE(test_rc_keepalive) + + +#ifdef HAVE_MLX5_HW + +class test_rc_srq : public test_rc { +public: + test_rc_srq() : m_buf8b(NULL), m_buf8k(NULL) + { + } + + void init() + { + test_rc::init(); + + m_buf8b = new mapped_buffer(8, 0x1, *m_e1); + m_buf8k = new mapped_buffer(8 * UCS_KBYTE, 0x2, *m_e1); + } + + void connect() + { + test_rc::connect(); + + m_e1->connect(0, *m_e2, 0); + m_e2->connect(0, *m_e1, 0); + m_e1->connect(1, *m_e2, 1); + m_e2->connect(1, *m_e1, 1); + } + + bool send(int ep, void *buf) + { + ssize_t status; + + status = 
uct_ep_am_bcopy(m_e1->ep(ep), 0, mapped_buffer::pack, buf, 0); + if (status == UCS_ERR_NO_RESOURCE) { + short_progress_loop(); + return false; + } else if (status < 0) { + ASSERT_UCS_OK((ucs_status_t)status); + } + + return true; + } + + void test_reorder() { + unsigned i = 0; + ucs_time_t deadline = ucs::get_deadline(); + while ((i < 10000) && (ucs_get_time() < deadline)) { + if (send(0, m_buf8k) && send(1, m_buf8b)) { + i++; + } + } + } + + void cleanup() { + delete m_buf8b; + delete m_buf8k; + test_rc::cleanup(); + } + +protected: + mapped_buffer *m_buf8b, *m_buf8k; +}; + +UCS_TEST_SKIP_COND_P(test_rc_srq, reorder_list, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY), + "RC_SRQ_TOPO?=list") +{ + test_reorder(); +} + +UCS_TEST_SKIP_COND_P(test_rc_srq, reorder_cyclic, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY), + "RC_SRQ_TOPO?=cyclic,cyclic_emulated") +{ + test_reorder(); +} + +UCT_INSTANTIATE_RC_DC_TEST_CASE(test_rc_srq); + +#endif diff --git a/test/gtest/uct/ib/test_rc.h b/test/gtest/uct/ib/test_rc.h index d15c87111e1..0af560988e3 100644 --- a/test/gtest/uct/ib/test_rc.h +++ b/test/gtest/uct/ib/test_rc.h @@ -27,7 +27,7 @@ class test_rc : public uct_test { } pending_send_request_t; virtual void init(); - void connect(); + virtual void connect(); uct_rc_iface_t* rc_iface(entity *e) { return ucs_derived_of(e->iface(), uct_rc_iface_t); diff --git a/test/gtest/uct/ib/test_sockaddr.cc b/test/gtest/uct/ib/test_sockaddr.cc index 30ee1c7d760..49d458cc5a0 100644 --- a/test/gtest/uct/ib/test_sockaddr.cc +++ b/test/gtest/uct/ib/test_sockaddr.cc @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2017. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2017-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -16,352 +16,14 @@ extern "C" { #include -class test_uct_sockaddr : public uct_test { -public: - test_uct_sockaddr() : server(NULL), client(NULL), err_count(0), - server_recv_req(0), delay_conn_reply(false) { - } - - void check_md_usability() { - uct_md_attr_t md_attr; - uct_md_config_t *md_config; - ucs_status_t status; - uct_md_h md; - - status = uct_md_config_read(GetParam()->component, NULL, NULL, &md_config); - EXPECT_TRUE(status == UCS_OK); - - status = uct_md_open(GetParam()->component, GetParam()->md_name.c_str(), - md_config, &md); - EXPECT_TRUE(status == UCS_OK); - uct_config_release(md_config); - - status = uct_md_query(md, &md_attr); - ASSERT_UCS_OK(status); - - uct_md_close(md); - - if (!(md_attr.cap.flags & UCT_MD_FLAG_SOCKADDR)) { - UCS_TEST_SKIP_R(GetParam()->md_name.c_str() + - std::string(" does not support client-server " - "connection establishment via sockaddr " - "without a cm")); - } - } - - void init() { - check_md_usability(); - - uct_iface_params_t server_params, client_params; - uint16_t port; - - uct_test::init(); - - /* This address is accessible, as it was tested at the resource creation */ - m_listen_addr = GetParam()->listen_sock_addr; - m_connect_addr = GetParam()->connect_sock_addr; - - port = ucs::get_port(); - m_listen_addr.set_port(port); - m_connect_addr.set_port(port); - - /* open iface for the server side */ - server_params.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | - UCT_IFACE_PARAM_FIELD_ERR_HANDLER | - UCT_IFACE_PARAM_FIELD_ERR_HANDLER_ARG | - UCT_IFACE_PARAM_FIELD_ERR_HANDLER_FLAGS | - UCT_IFACE_PARAM_FIELD_SOCKADDR; - server_params.open_mode = UCT_IFACE_OPEN_MODE_SOCKADDR_SERVER; - server_params.err_handler = err_handler; - server_params.err_handler_arg = reinterpret_cast(this); - server_params.err_handler_flags = 0; - 
server_params.mode.sockaddr.listen_sockaddr = m_listen_addr.to_ucs_sock_addr(); - server_params.mode.sockaddr.cb_flags = UCT_CB_FLAG_ASYNC; - server_params.mode.sockaddr.conn_request_cb = conn_request_cb; - server_params.mode.sockaddr.conn_request_arg = reinterpret_cast(this); - - /* if origin port is busy, create_entity will retry with another one */ - server = uct_test::create_entity(server_params); - m_entities.push_back(server); - - check_skip_test(); - - port = ucs::sock_addr_storage(server->iface_params().mode.sockaddr - .listen_sockaddr) - .get_port(); - m_listen_addr.set_port(port); - m_connect_addr.set_port(port); - - /* open iface for the client side */ - client_params.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | - UCT_IFACE_PARAM_FIELD_ERR_HANDLER | - UCT_IFACE_PARAM_FIELD_ERR_HANDLER_ARG | - UCT_IFACE_PARAM_FIELD_ERR_HANDLER_FLAGS; - client_params.open_mode = UCT_IFACE_OPEN_MODE_SOCKADDR_CLIENT; - client_params.err_handler = err_handler; - client_params.err_handler_arg = reinterpret_cast(this); - client_params.err_handler_flags = 0; - - client = uct_test::create_entity(client_params); - m_entities.push_back(client); - - /* initiate the client's private data callback argument */ - client->max_conn_priv = server->iface_attr().max_conn_priv; - - UCS_TEST_MESSAGE << "Testing " << m_listen_addr - << " Interface: " << GetParam()->dev_name; - } - - size_t iface_priv_data_do_pack(void *priv_data) - { - size_t priv_data_len; - - client_priv_data = "Client private data"; - priv_data_len = 1 + client_priv_data.length(); - - memcpy(priv_data, client_priv_data.c_str(), priv_data_len); - return priv_data_len; - } - - static ssize_t client_iface_priv_data_cb(void *arg, - const uct_cm_ep_priv_data_pack_args_t - *pack_args, void *priv_data) - { - test_uct_sockaddr *self = reinterpret_cast(arg); - size_t priv_data_len; - - priv_data_len = self->iface_priv_data_do_pack(priv_data); - EXPECT_LE(priv_data_len, self->client->max_conn_priv); - - return priv_data_len; - } - - static void conn_request_cb(uct_iface_h iface, void *arg, - uct_conn_request_h conn_request, - const void *conn_priv_data, size_t length) - { - test_uct_sockaddr *self = reinterpret_cast(arg); - - EXPECT_EQ(self->client_priv_data, - std::string(reinterpret_cast(conn_priv_data))); - - EXPECT_EQ(1 + self->client_priv_data.length(), length); - - if (self->delay_conn_reply) { - self->delayed_conn_reqs.push(conn_request); - } else { - uct_iface_accept(iface, conn_request); - } - ucs_memory_cpu_store_fence(); - self->server_recv_req++; - } - - static ucs_status_t err_handler(void *arg, uct_ep_h ep, ucs_status_t status) - { - test_uct_sockaddr *self = reinterpret_cast(arg); - ucs_atomic_add32(&self->err_count, 1); - return UCS_OK; - } +class test_uct_sockaddr : public uct_test { + friend class uct_test::entity; protected: - entity *server, *client; - ucs::sock_addr_storage m_listen_addr, m_connect_addr; - volatile uint32_t err_count; - volatile int server_recv_req; - std::queue delayed_conn_reqs; - bool delay_conn_reply; - std::string client_priv_data; -}; -UCS_TEST_P(test_uct_sockaddr, connect_client_to_server) -{ - client->connect(0, *server, 0, m_connect_addr, client_iface_priv_data_cb, - NULL, NULL, this); + class client_user_data; + typedef std::map ep_client_data_map_t; - /* wait for the server to connect */ - while (server_recv_req == 0) { - progress(); - } - ASSERT_TRUE(server_recv_req == 1); - /* since the transport may support a graceful exit in case of an error, - * make sure that the error handling flow wasn't invoked 
(there were no - * errors) */ - EXPECT_EQ(0ul, err_count); - /* the test may end before the client's ep got connected. - * it should also pass in this case as well - the client's - * ep shouldn't be accessed (for connection reply from the server) after the - * test ends and the client's ep was destroyed */ -} - -UCS_TEST_P(test_uct_sockaddr, connect_client_to_server_with_delay) -{ - delay_conn_reply = true; - client->connect(0, *server, 0, m_connect_addr, client_iface_priv_data_cb, - NULL, NULL, this); - - /* wait for the server to connect */ - while (server_recv_req == 0) { - progress(); - } - ASSERT_EQ(1, server_recv_req); - ucs_memory_cpu_load_fence(); - ASSERT_EQ(1ul, delayed_conn_reqs.size()); - EXPECT_EQ(0ul, err_count); - while (!delayed_conn_reqs.empty()) { - uct_iface_accept(server->iface(), delayed_conn_reqs.front()); - delayed_conn_reqs.pop(); - } - - uct_completion_t comp; - comp.func = (uct_completion_callback_t)ucs_empty_function; - comp.count = 1; - comp.status = UCS_OK; - - ucs_status_t status = uct_ep_flush(client->ep(0), 0, &comp); - if (status == UCS_INPROGRESS) { - do { - short_progress_loop(); - /* coverity[loop_condition] */ - } while (comp.count != 0); - EXPECT_EQ(UCS_OK, comp.status); - } else { - EXPECT_EQ(UCS_OK, status); - } - EXPECT_EQ(0ul, err_count); -} - -UCS_TEST_P(test_uct_sockaddr, connect_client_to_server_reject_with_delay) -{ - delay_conn_reply = true; - client->connect(0, *server, 0, m_connect_addr, client_iface_priv_data_cb, - NULL, NULL, this); - - /* wait for the server to connect */ - while (server_recv_req == 0) { - progress(); - } - ASSERT_EQ(1, server_recv_req); - ucs_memory_cpu_load_fence(); - ASSERT_EQ(1ul, delayed_conn_reqs.size()); - EXPECT_EQ(0ul, err_count); - while (!delayed_conn_reqs.empty()) { - uct_iface_reject(server->iface(), delayed_conn_reqs.front()); - delayed_conn_reqs.pop(); - } - while (err_count == 0) { - progress(); - } - EXPECT_EQ(1ul, err_count); -} - -UCS_TEST_P(test_uct_sockaddr, many_clients_to_one_server) -{ - int num_clients = ucs_max(2, 100 / ucs::test_time_multiplier()); - uct_iface_params_t client_params; - entity *client_test; - - /* multiple clients, each on an iface of its own, connecting to the same server */ - for (int i = 0; i < num_clients; ++i) { - /* open iface for the client side */ - client_params.field_mask = UCT_IFACE_PARAM_FIELD_OPEN_MODE | - UCT_IFACE_PARAM_FIELD_ERR_HANDLER | - UCT_IFACE_PARAM_FIELD_ERR_HANDLER_ARG | - UCT_IFACE_PARAM_FIELD_ERR_HANDLER_FLAGS; - client_params.open_mode = UCT_IFACE_OPEN_MODE_SOCKADDR_CLIENT; - client_params.err_handler = err_handler; - client_params.err_handler_arg = reinterpret_cast(this); - client_params.err_handler_flags = 0; - - client_test = uct_test::create_entity(client_params); - m_entities.push_back(client_test); - - client_test->max_conn_priv = server->iface_attr().max_conn_priv; - client_test->connect(i, *server, 0, m_connect_addr, - client_iface_priv_data_cb, NULL, NULL, this); - } - - while (server_recv_req < num_clients){ - progress(); - } - ASSERT_TRUE(server_recv_req == num_clients); - EXPECT_EQ(0ul, err_count); -} - -UCS_TEST_P(test_uct_sockaddr, many_conns_on_client) -{ - int num_conns_on_client = ucs_max(2, 100 / ucs::test_time_multiplier()); - - /* multiple clients, on the same iface, connecting to the same server */ - for (int i = 0; i < num_conns_on_client; ++i) { - client->connect(i, *server, 0, m_connect_addr, client_iface_priv_data_cb, - NULL, NULL, this); - } - - while (server_recv_req < num_conns_on_client) { - progress(); - } - 
ASSERT_TRUE(server_recv_req == num_conns_on_client); - EXPECT_EQ(0ul, err_count); -} - -UCS_TEST_SKIP_COND_P(test_uct_sockaddr, err_handle, - !check_caps(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) -{ - client->connect(0, *server, 0, m_connect_addr, client_iface_priv_data_cb, - NULL, NULL, this); - - scoped_log_handler slh(wrap_errors_logger); - /* kill the server */ - m_entities.remove(server); - - /* If the server didn't receive a connection request from the client yet, - * test error handling */ - if (server_recv_req == 0) { - wait_for_flag(&err_count); - /* Double check for server_recv_req if it's not delivered from NIC to - * host memory under hight load */ - EXPECT_TRUE((err_count == 1) || (server_recv_req == 1)); - } -} - -UCS_TEST_SKIP_COND_P(test_uct_sockaddr, conn_to_non_exist_server, - !check_caps(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE)) -{ - m_connect_addr.set_port(htons(1)); - err_count = 0; - - /* wrap errors now since the client will try to connect to a non existing port */ - { - scoped_log_handler slh(wrap_errors_logger); - /* client - try to connect to a non-existing port on the server side */ - client->connect(0, *server, 0, m_connect_addr, client_iface_priv_data_cb, - NULL, NULL, this); - - uct_completion_t comp; - comp.func = (uct_completion_callback_t)ucs_empty_function; - comp.count = 1; - comp.status = UCS_OK; - - ucs_status_t status = uct_ep_flush(client->ep(0), 0, &comp); - if (status == UCS_INPROGRESS) { - do { - short_progress_loop(); - /* coverity[loop_condition] */ - } while (comp.count != 0); - EXPECT_EQ(UCS_ERR_UNREACHABLE, comp.status); - } else { - EXPECT_EQ(UCS_ERR_UNREACHABLE, status); - } - /* destroy the client's ep. this ep shouldn't be accessed anymore */ - client->destroy_ep(0); - } -} - -UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_sockaddr) - -class test_uct_cm_sockaddr : public uct_test { - friend class uct_test::entity; -protected: enum { TEST_STATE_CONNECT_REQUESTED = UCS_BIT(0), TEST_STATE_CLIENT_CONNECTED = UCS_BIT(1), @@ -379,24 +41,98 @@ class test_uct_cm_sockaddr : public uct_test { TEST_EP_FLAG_DISCONNECT_CB_INVOKED = UCS_BIT(1) }; + class client_user_data { + public: + client_user_data(test_uct_sockaddr &test, entity &entity, + unsigned ep_index) : + m_test(test), m_entity(entity), m_ep_index(ep_index) + { + } + + test_uct_sockaddr* get_test() const + { + return &m_test; + } + + entity* get_entity() const + { + return &m_entity; + } + + unsigned get_ep_index() const + { + return m_ep_index; + } + + uct_ep_h get_ep() const + { + return get_entity()->ep(get_ep_index()); + } + + private: + test_uct_sockaddr &m_test; + entity &m_entity; + unsigned m_ep_index; + }; + public: - test_uct_cm_sockaddr() : m_state(0), m_server(NULL), m_client(NULL), - m_server_recv_req_cnt(0), m_client_connect_cb_cnt(0), - m_server_connect_cb_cnt(0), - m_server_disconnect_cnt(0), m_client_disconnect_cnt(0), - m_reject_conn_request(false), - m_server_start_disconnect(false), - m_delay_conn_reply(false), - m_short_priv_data_len(0), m_long_priv_data_len(0) { + test_uct_sockaddr() : m_state(0), m_server(NULL), m_client(NULL), + m_server_recv_req_cnt(0), m_client_connect_cb_cnt(0), + m_server_connect_cb_cnt(0), + m_server_disconnect_cnt(0), m_client_disconnect_cnt(0), + m_reject_conn_request(false), + m_server_start_disconnect(false), + m_delay_conn_reply(false), m_short_priv_data_len(0), + m_long_priv_data_len(0) { + pthread_mutex_init(&m_ep_client_data_lock, NULL); + } + + ~test_uct_sockaddr() { + pthread_mutex_destroy(&m_ep_client_data_lock); + } + + static std::vector + 
enum_cm_resources(const std::string &cm_cmpt_name) + { + static std::vector all_resources; + + if (all_resources.empty()) { + set_cm_resources(all_resources); + } + + return filter_resources(all_resources, + resource::is_equal_component_name, + cm_cmpt_name); } void init() { + struct { + bool is_set; + char cstr[UCS_SOCKADDR_STRING_LEN]; + } src_addr = { + .is_set = false, + .cstr = {0} + }; + uct_test::init(); /* This address is accessible, as it was tested at the resource creation */ m_listen_addr = GetParam()->listen_sock_addr; m_connect_addr = GetParam()->connect_sock_addr; + const ucs::sock_addr_storage &src_sock_addr = + GetParam()->source_sock_addr; + if (src_sock_addr.get_sock_addr_ptr() != NULL) { + int sa_family = src_sock_addr.get_sock_addr_ptr()->sa_family; + const char *ret = inet_ntop(sa_family, + src_sock_addr.get_sock_addr_in_buf(), + src_addr.cstr, UCS_SOCKADDR_STRING_LEN); + EXPECT_EQ(src_addr.cstr, ret); + set_config((std::string("RDMA_CM_SOURCE_ADDRESS?=") + + src_addr.cstr).c_str()); + src_addr.is_set = true; + } + uint16_t port = ucs::get_port(); m_listen_addr.set_port(port); m_connect_addr.set_port(port); @@ -418,8 +154,21 @@ class test_uct_cm_sockaddr : public uct_test { m_long_priv_data.resize(m_long_priv_data_len); ucs::fill_random(m_long_priv_data); - UCS_TEST_MESSAGE << "Testing " << m_listen_addr - << " Interface: " << GetParam()->dev_name; + UCS_TEST_MESSAGE << "Testing " << GetParam()->component_name << " on " + << m_listen_addr << " interface " + << GetParam()->dev_name + << (src_addr.is_set ? + (std::string(" with RDMA_CM_SOURCE_ADDRESS=") + + src_addr.cstr) : ""); + } + + void cleanup() { + { + ucs::scoped_mutex_lock lock(m_ep_client_data_lock); + ASSERT_TRUE(m_ep_client_data.empty()); + } + + uct_test::cleanup(); } protected: @@ -430,19 +179,31 @@ class test_uct_cm_sockaddr : public uct_test { params.field_mask = UCT_LISTENER_PARAM_FIELD_CONN_REQUEST_CB | UCT_LISTENER_PARAM_FIELD_USER_DATA; params.conn_request_cb = server_conn_req_cb; - params.user_data = static_cast(this); - /* if origin port set in init() is busy, listen() will retry with another one */ - m_server->listen(m_listen_addr, params); + params.user_data = static_cast(this); + + ucs_time_t deadline = ucs::get_deadline(); + ucs_status_t status; + do { + status = m_server->listen(m_listen_addr, params); + if (status == UCS_ERR_BUSY) { + m_listen_addr.set_port(ucs::get_port()); + } else { + break; + } + } while (ucs_get_time() < deadline); - /* the listen function may have changed the initial port on the listener's - * address. 
update this port for the address to connect to */ + ASSERT_EQ(UCS_OK, status); m_connect_addr.set_port(m_listen_addr.get_port()); } void listen_and_connect() { - start_listen(test_uct_cm_sockaddr::conn_request_cb); - m_client->connect(0, *m_server, 0, m_connect_addr, client_priv_data_cb, - client_connect_cb, client_disconnect_cb, this); + start_listen(test_uct_sockaddr::conn_request_cb); + + client_user_data *user_data = new client_user_data(*this, *m_client, 0); + m_client->connect_to_sockaddr(0, m_connect_addr, client_resolve_cb, + client_connect_cb, client_disconnect_cb, + user_data); + add_user_data(user_data); wait_for_bits(&m_state, TEST_STATE_CONNECT_REQUESTED); EXPECT_TRUE(m_state & TEST_STATE_CONNECT_REQUESTED); @@ -460,41 +221,63 @@ class test_uct_cm_sockaddr : public uct_test { } } - ssize_t common_priv_data_cb(void *arg, size_t pack_limit, void *priv_data) { - test_uct_cm_sockaddr *self = reinterpret_cast(arg); - size_t priv_data_len; + ssize_t common_priv_data_cb(size_t pack_limit, void *priv_data) { + size_t priv_data_len = priv_data_do_pack(pack_limit, priv_data); - priv_data_len = self->priv_data_do_pack(pack_limit, priv_data); EXPECT_LE(priv_data_len, pack_limit); return priv_data_len; } - static ssize_t client_priv_data_cb(void *arg, - const uct_cm_ep_priv_data_pack_args_t - *pack_args, void *priv_data) + static ucs_status_t + client_resolve_cb(void *user_data, const uct_cm_ep_resolve_args_t *args) { - test_uct_cm_sockaddr *self = reinterpret_cast(arg); - return self->common_priv_data_cb(arg, self->m_client->max_conn_priv, priv_data); + client_user_data *sa_user_data = + reinterpret_cast(user_data); + test_uct_sockaddr *self = sa_user_data->get_test(); + + std::vector priv_data_buf(self->m_client->max_conn_priv); + ssize_t packed = self->common_priv_data_cb(priv_data_buf.size(), + priv_data_buf.data()); + if (packed < 0) { + self->del_user_data(sa_user_data); + return ucs_status_t(packed); + } + + uct_ep_connect_params_t params; + params.field_mask = UCT_EP_CONNECT_PARAM_FIELD_PRIVATE_DATA | + UCT_EP_CONNECT_PARAM_FIELD_PRIVATE_DATA_LENGTH; + params.private_data = priv_data_buf.data(); + params.private_data_length = packed; + return uct_ep_connect(sa_user_data->get_ep(), ¶ms); } - static ssize_t server_priv_data_cb(void *arg, - const uct_cm_ep_priv_data_pack_args_t - *pack_args, void *priv_data) + static void check_connection_status(ucs_status_t status, bool can_fail) { - test_uct_cm_sockaddr *self = reinterpret_cast(arg); - return self->common_priv_data_cb(arg, self->m_server->max_conn_priv, priv_data); + if (can_fail) { + ASSERT_TRUE((status == UCS_OK) || + (status == UCS_ERR_CONNECTION_RESET) || + (status == UCS_ERR_NOT_CONNECTED)); + } else { + ASSERT_UCS_OK(status); + } } - void accept(uct_cm_h cm, uct_conn_request_h conn_request, - uct_cm_ep_server_conn_notify_callback_t notify_cb, - uct_ep_disconnect_cb_t disconnect_cb, - void *user_data) + virtual void accept(uct_cm_h cm, uct_conn_request_h conn_request, + uct_cm_ep_server_conn_notify_callback_t notify_cb, + uct_ep_disconnect_cb_t disconnect_cb, + void *user_data, bool can_fail) { + std::vector priv_data_buf(m_server->max_conn_priv); uct_ep_params_t ep_params; ucs_status_t status; uct_ep_h ep; ASSERT_TRUE(m_server->listener()); + + ssize_t packed = common_priv_data_cb(priv_data_buf.size(), + priv_data_buf.data()); + ASSERT_GT(packed, 0); + m_server->reserve_ep(m_server->num_eps()); ep_params.field_mask = UCT_EP_PARAM_FIELD_CM | @@ -503,32 +286,40 @@ class test_uct_cm_sockaddr : public uct_test { 
UCT_EP_PARAM_FIELD_SOCKADDR_NOTIFY_CB_SERVER | UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB | UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS | - UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB; - - ep_params.cm = cm; - ep_params.conn_request = conn_request; - ep_params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; - ep_params.sockaddr_pack_cb = server_priv_data_cb; - ep_params.sockaddr_cb_server = notify_cb; - ep_params.disconnect_cb = disconnect_cb; - ep_params.user_data = user_data; + UCT_EP_PARAM_FIELD_PRIV_DATA | + UCT_EP_PARAM_FIELD_PRIV_DATA_LENGTH; + + ep_params.cm = cm; + ep_params.conn_request = conn_request; + ep_params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; + ep_params.sockaddr_cb_server = notify_cb; + ep_params.disconnect_cb = disconnect_cb; + ep_params.user_data = user_data; + ep_params.private_data = priv_data_buf.data(); + ep_params.private_data_length = packed; status = uct_ep_create(&ep_params, &ep); - ASSERT_UCS_OK(status); - m_server->eps().back().reset(ep, uct_ep_destroy); + check_connection_status(status, can_fail); + if (status == UCS_OK) { + m_server->eps().back().reset(ep, uct_ep_destroy); + } } virtual void server_accept(entity *server, uct_conn_request_h conn_request, uct_cm_ep_server_conn_notify_callback_t notify_cb, uct_ep_disconnect_cb_t disconnect_cb, - void *user_data) + void *user_data, bool can_fail) { - accept(server->cm(), conn_request, notify_cb, disconnect_cb, user_data); + ucs::scoped_async_lock listen_lock(m_server->async()); + ucs::scoped_async_lock accept_lock(server->async()); + accept(server->cm(), conn_request, notify_cb, disconnect_cb, + user_data, can_fail); } void verify_remote_data(const void *remote_data, size_t remote_length) { - std::vector r_data((char*)(remote_data), (char*)(remote_data) + remote_length); + std::vector r_data((char*)(remote_data), + (char*)(remote_data) + remote_length); if (remote_length == m_short_priv_data_len) { EXPECT_EQ(m_short_priv_data, r_data); @@ -550,7 +341,7 @@ class test_uct_cm_sockaddr : public uct_test { static bool common_conn_request(uct_listener_h listener, void *arg, const uct_cm_listener_conn_request_args_t *conn_req_args) { - test_uct_cm_sockaddr *self = reinterpret_cast(arg); + test_uct_sockaddr *self = reinterpret_cast(arg); ucs_sock_addr_t m_connect_addr_sock_addr = self->m_connect_addr.to_ucs_sock_addr(); uct_conn_request_h conn_request; @@ -595,13 +386,14 @@ class test_uct_cm_sockaddr : public uct_test { static void conn_request_cb(uct_listener_h listener, void *arg, const uct_cm_listener_conn_request_args_t *conn_req_args) { - test_uct_cm_sockaddr *self = reinterpret_cast(arg); + test_uct_sockaddr *self = reinterpret_cast(arg); if (self->common_conn_request(listener, arg, conn_req_args)) { EXPECT_TRUE(conn_req_args->field_mask & UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CONN_REQUEST); self->server_accept(self->m_server, conn_req_args->conn_request, - server_connect_cb, server_disconnect_cb, self); + server_connect_cb, server_disconnect_cb, self, + false); } ucs_memory_cpu_store_fence(); @@ -612,20 +404,25 @@ class test_uct_cm_sockaddr : public uct_test { static void server_connect_cb(uct_ep_h ep, void *arg, const uct_cm_ep_server_conn_notify_args_t *notify_args) { - test_uct_cm_sockaddr *self = reinterpret_cast(arg); + test_uct_sockaddr *self = reinterpret_cast(arg); - if (notify_args->field_mask & UCT_CM_EP_SERVER_CONN_NOTIFY_ARGS_FIELD_STATUS) { - EXPECT_EQ(UCS_OK, notify_args->status); + self->m_server_connect_cb_cnt++; + if ((notify_args->field_mask & + UCT_CM_EP_SERVER_CONN_NOTIFY_ARGS_FIELD_STATUS) && + 
(notify_args->status != UCS_OK)) { + return; } self->m_state |= TEST_STATE_SERVER_CONNECTED; - self->m_server_connect_cb_cnt++; } static void client_connect_cb(uct_ep_h ep, void *arg, const uct_cm_ep_client_connect_args_t *connect_args) { - test_uct_cm_sockaddr *self = reinterpret_cast(arg); + client_user_data *sa_user_data = + reinterpret_cast(arg); + test_uct_sockaddr *self = sa_user_data->get_test(); + const uct_cm_remote_data_t *remote_data; ucs_status_t status; @@ -638,7 +435,9 @@ class test_uct_cm_sockaddr : public uct_test { if (status == UCS_ERR_REJECTED) { self->m_state |= TEST_STATE_CLIENT_GOT_REJECT; - } else if ((status == UCS_ERR_UNREACHABLE) || (status == UCS_ERR_NOT_CONNECTED)) { + } else if ((status == UCS_ERR_UNREACHABLE) || + (status == UCS_ERR_NOT_CONNECTED) || + (status == UCS_ERR_CONNECTION_RESET)) { self->m_state |= TEST_STATE_CLIENT_GOT_SERVER_UNAVAILABLE; } else if (status != UCS_OK) { self->m_state |= TEST_STATE_CLIENT_GOT_ERROR; @@ -647,19 +446,25 @@ class test_uct_cm_sockaddr : public uct_test { (UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA_LENGTH | UCT_CM_REMOTE_DATA_FIELD_CONN_PRIV_DATA))); - self->verify_remote_data(remote_data->conn_priv_data, remote_data->conn_priv_data_length); + self->verify_remote_data(remote_data->conn_priv_data, + remote_data->conn_priv_data_length); + ASSERT_EQ(sa_user_data->get_ep(), ep); status = uct_cm_client_ep_conn_notify(ep); ASSERT_UCS_OK(status); self->m_state |= TEST_STATE_CLIENT_CONNECTED; self->m_client_connect_cb_cnt++; } + + if (status != UCS_OK) { + self->del_user_data(sa_user_data); + } } static void server_disconnect_cb(uct_ep_h ep, void *arg) { - test_uct_cm_sockaddr *self = reinterpret_cast(arg); + test_uct_sockaddr *self = reinterpret_cast(arg); if (!(self->m_server_start_disconnect)) { self->m_server->disconnect(ep); @@ -670,7 +475,9 @@ class test_uct_cm_sockaddr : public uct_test { } static void client_disconnect_cb(uct_ep_h ep, void *arg) { - test_uct_cm_sockaddr *self = reinterpret_cast(arg); + client_user_data *sa_user_data = + reinterpret_cast(arg); + test_uct_sockaddr *self = sa_user_data->get_test(); if (self->m_server_start_disconnect) { /* if the server was the one who initiated the disconnect flow, @@ -681,6 +488,7 @@ class test_uct_cm_sockaddr : public uct_test { self->m_state |= TEST_STATE_CLIENT_DISCONNECTED; self->m_client_disconnect_cnt++; + self->del_user_data(sa_user_data); } void cm_disconnect(entity *ent) { @@ -711,7 +519,7 @@ class test_uct_cm_sockaddr : public uct_test { } } - void test_delayed_server_response(bool reject) + void test_delayed_server_response(bool reject, bool early_destroy) { ucs_status_t status; ucs_time_t deadline; @@ -725,6 +533,17 @@ class test_uct_cm_sockaddr : public uct_test { TEST_STATE_CLIENT_GOT_REJECT | TEST_STATE_CLIENT_GOT_ERROR | TEST_STATE_CLIENT_GOT_SERVER_UNAVAILABLE)); + if (early_destroy) { + { + ucs::scoped_mutex_lock lock(m_ep_client_data_lock); + for (int i = 0; i < m_client->num_eps(); ++i) { + del_user_data_no_lock(m_client->ep(i)); + } + } + + m_client->destroy_eps(); + } + deadline = ucs_get_time() + ucs_time_from_sec(DEFAULT_TIMEOUT_SEC) * ucs::test_time_multiplier(); @@ -740,18 +559,22 @@ class test_uct_cm_sockaddr : public uct_test { status = uct_listener_reject(m_server->listener(), m_delayed_conn_reqs.front()); - ASSERT_UCS_OK(status); - - wait_for_bits(&m_state, TEST_STATE_CLIENT_GOT_REJECT); - EXPECT_TRUE(m_state & TEST_STATE_CLIENT_GOT_REJECT); + check_connection_status(status, early_destroy); + if (!early_destroy) { + wait_for_bits(&m_state, 
TEST_STATE_CLIENT_GOT_REJECT); + EXPECT_TRUE(m_state & TEST_STATE_CLIENT_GOT_REJECT); + } } else { server_accept(m_server, m_delayed_conn_reqs.front(), - server_connect_cb, server_disconnect_cb, this); - - wait_for_bits(&m_state, TEST_STATE_SERVER_CONNECTED | - TEST_STATE_CLIENT_CONNECTED); - EXPECT_TRUE(ucs_test_all_flags(m_state, TEST_STATE_SERVER_CONNECTED | - TEST_STATE_CLIENT_CONNECTED)); + server_connect_cb, server_disconnect_cb, this, + early_destroy); + if (!early_destroy) { + wait_for_bits(&m_state, TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED); + EXPECT_TRUE(ucs_test_all_flags(m_state, TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED)); + cm_disconnect(m_client); + } } m_delayed_conn_reqs.pop(); @@ -820,6 +643,35 @@ class test_uct_cm_sockaddr : public uct_test { cm_disconnect(m_client); } + void add_user_data(client_user_data *user_data) { + ucs::scoped_mutex_lock lock(m_ep_client_data_lock); + + EXPECT_NE(uct_ep_h(NULL), user_data->get_ep()); + EXPECT_TRUE(m_ep_client_data.insert(std::make_pair(user_data->get_ep(), + user_data)).second); + } + + void del_user_data(client_user_data *user_data) { + ucs::scoped_mutex_lock lock(m_ep_client_data_lock); + del_user_data_no_lock(user_data->get_ep()); + } + + void release_user_data() { + ucs::scoped_mutex_lock lock(m_ep_client_data_lock); + while (!m_ep_client_data.empty()) { + del_user_data_no_lock(m_ep_client_data.begin()->first); + } + } + +private: + void del_user_data_no_lock(uct_ep_h ep) { + ep_client_data_map_t::iterator it = m_ep_client_data.find(ep); + + EXPECT_NE(m_ep_client_data.end(), it) << "ep: " << ep; + delete it->second; + m_ep_client_data.erase(it); + } + protected: ucs::sock_addr_storage m_listen_addr, m_connect_addr; uint64_t m_state; @@ -835,10 +687,12 @@ class test_uct_cm_sockaddr : public uct_test { size_t m_short_priv_data_len, m_long_priv_data_len; std::vector m_short_priv_data; std::vector m_long_priv_data; + pthread_mutex_t m_ep_client_data_lock; + ep_client_data_map_t m_ep_client_data; }; -UCS_TEST_P(test_uct_cm_sockaddr, cm_query) +UCS_TEST_P(test_uct_sockaddr, cm_query) { ucs_status_t status; size_t i; @@ -852,7 +706,7 @@ UCS_TEST_P(test_uct_cm_sockaddr, cm_query) } } -UCS_TEST_P(test_uct_cm_sockaddr, listener_query) +UCS_TEST_P(test_uct_sockaddr, listener_query) { uct_listener_attr_t attr; ucs_status_t status; @@ -860,7 +714,7 @@ UCS_TEST_P(test_uct_cm_sockaddr, listener_query) char m_listener_ip_port_str[UCS_SOCKADDR_STRING_LEN]; char attr_addr_ip_port_str[UCS_SOCKADDR_STRING_LEN]; - start_listen(test_uct_cm_sockaddr::conn_request_cb); + start_listen(test_uct_sockaddr::conn_request_cb); attr.field_mask = UCT_LISTENER_ATTR_FIELD_SOCKADDR; status = uct_listener_query(m_server->listener(), &attr); @@ -878,12 +732,12 @@ UCS_TEST_P(test_uct_cm_sockaddr, listener_query) EXPECT_EQ(m_listen_addr.get_port(), port); } -UCS_TEST_P(test_uct_cm_sockaddr, cm_open_listen_close) +UCS_TEST_P(test_uct_sockaddr, cm_open_listen_close) { basic_listen_connect_disconnect(); } -UCS_TEST_P(test_uct_cm_sockaddr, cm_open_listen_close_large_priv_data) +UCS_TEST_P(test_uct_sockaddr, cm_open_listen_close_large_priv_data) { m_entities.clear(); @@ -906,7 +760,7 @@ UCS_TEST_P(test_uct_cm_sockaddr, cm_open_listen_close_large_priv_data) basic_listen_connect_disconnect(); } -UCS_TEST_P(test_uct_cm_sockaddr, cm_open_listen_kill_server) +UCS_TEST_P(test_uct_sockaddr, cm_open_listen_kill_server) { listen_and_connect(); @@ -922,7 +776,7 @@ UCS_TEST_P(test_uct_cm_sockaddr, cm_open_listen_kill_server) EXPECT_TRUE(m_state 
& TEST_STATE_CLIENT_DISCONNECTED); } -UCS_TEST_P(test_uct_cm_sockaddr, cm_server_reject) +UCS_TEST_P(test_uct_sockaddr, cm_server_reject) { m_reject_conn_request = true; @@ -940,7 +794,7 @@ UCS_TEST_P(test_uct_cm_sockaddr, cm_server_reject) (TEST_STATE_SERVER_CONNECTED | TEST_STATE_CLIENT_CONNECTED))); } -UCS_TEST_P(test_uct_cm_sockaddr, many_conns_on_client) +UCS_TEST_P(test_uct_sockaddr, many_conns_on_client) { int num_conns_on_client = ucs_max(2, 100 / ucs::test_time_multiplier()); @@ -952,8 +806,11 @@ UCS_TEST_P(test_uct_cm_sockaddr, many_conns_on_client) /* Connect */ /* multiple clients, on the same cm, connecting to the same server */ for (int i = 0; i < num_conns_on_client; ++i) { - m_client->connect(i, *m_server, 0, m_connect_addr, client_priv_data_cb, - client_connect_cb, client_disconnect_cb, this); + client_user_data *user_data = new client_user_data(*this, *m_client, i); + m_client->connect_to_sockaddr(i, m_connect_addr, client_resolve_cb, + client_connect_cb, client_disconnect_cb, + user_data); + add_user_data(user_data); } /* wait for the server to connect to all the endpoints on the cm */ @@ -979,11 +836,14 @@ UCS_TEST_P(test_uct_cm_sockaddr, many_conns_on_client) EXPECT_EQ(num_conns_on_client, m_client_disconnect_cnt); } -UCS_TEST_P(test_uct_cm_sockaddr, err_handle) +UCS_TEST_P(test_uct_sockaddr, err_handle) { /* client - try to connect to a server that isn't listening */ - m_client->connect(0, *m_server, 0, m_connect_addr, client_priv_data_cb, - client_connect_cb, client_disconnect_cb, this); + client_user_data *user_data = new client_user_data(*this, *m_client, 0); + m_client->connect_to_sockaddr(0, m_connect_addr, client_resolve_cb, + client_connect_cb, client_disconnect_cb, + user_data); + add_user_data(user_data); EXPECT_FALSE(m_state & TEST_STATE_CONNECT_REQUESTED); @@ -997,10 +857,10 @@ UCS_TEST_P(test_uct_cm_sockaddr, err_handle) EXPECT_TRUE(ucs_test_all_flags(m_state, TEST_STATE_CLIENT_GOT_SERVER_UNAVAILABLE)); } -UCS_TEST_P(test_uct_cm_sockaddr, conn_to_non_exist_server_port) +UCS_TEST_P(test_uct_sockaddr, conn_to_non_exist_server_port) { /* Listen */ - start_listen(test_uct_cm_sockaddr::conn_request_cb); + start_listen(test_uct_sockaddr::conn_request_cb); m_connect_addr.set_port(htons(1)); @@ -1008,8 +868,11 @@ UCS_TEST_P(test_uct_cm_sockaddr, conn_to_non_exist_server_port) scoped_log_handler slh(detect_reject_error_logger); /* client - try to connect to a non-existing port on the server side. 
*/ - m_client->connect(0, *m_server, 0, m_connect_addr, client_priv_data_cb, - client_connect_cb, client_disconnect_cb, this); + client_user_data *user_data = new client_user_data(*this, *m_client, 0); + m_client->connect_to_sockaddr(0, m_connect_addr, client_resolve_cb, + client_connect_cb, client_disconnect_cb, + user_data); + add_user_data(user_data); /* with the TCP port space (which is currently tested with rdmacm), * a REJECT event will be generated on the client side and since it's a @@ -1021,26 +884,34 @@ UCS_TEST_P(test_uct_cm_sockaddr, conn_to_non_exist_server_port) EXPECT_TRUE(ucs_test_all_flags(m_state, TEST_STATE_CLIENT_GOT_SERVER_UNAVAILABLE)); } -UCS_TEST_P(test_uct_cm_sockaddr, connect_client_to_server_with_delay) +UCS_TEST_P(test_uct_sockaddr, connect_client_to_server_with_delay) { - test_delayed_server_response(false); + test_delayed_server_response(false, false); +} - cm_disconnect(m_client); +UCS_TEST_P(test_uct_sockaddr, destroy_client_before_accept) +{ + test_delayed_server_response(false, true); } -UCS_TEST_P(test_uct_cm_sockaddr, connect_client_to_server_reject_with_delay) +UCS_TEST_P(test_uct_sockaddr, connect_client_to_server_reject_with_delay) { - test_delayed_server_response(true); + test_delayed_server_response(true, false); } -UCS_TEST_P(test_uct_cm_sockaddr, ep_disconnect_err_codes) +UCS_TEST_P(test_uct_sockaddr, destroy_client_before_reject) +{ + test_delayed_server_response(true, true); +} + +UCS_TEST_P(test_uct_sockaddr, ep_disconnect_err_codes) { bool disconnecting = false; listen_and_connect(); { - entity::scoped_async_lock lock(*m_client); + ucs::scoped_async_lock lock(m_client->async()); if (m_state & TEST_STATE_CLIENT_CONNECTED) { UCS_TEST_MESSAGE << "EXP: " << ucs_status_string(UCS_OK); EXPECT_EQ(UCS_OK, uct_ep_disconnect(m_client->ep(0), 0)); @@ -1057,7 +928,7 @@ UCS_TEST_P(test_uct_cm_sockaddr, ep_disconnect_err_codes) TEST_STATE_CLIENT_CONNECTED))); { - entity::scoped_async_lock lock(*m_client); + ucs::scoped_async_lock lock(m_client->async()); if (disconnecting) { scoped_log_handler slh(detect_double_disconnect_error_logger); if (m_state & TEST_STATE_CLIENT_DISCONNECTED) { @@ -1091,10 +962,10 @@ UCS_TEST_P(test_uct_cm_sockaddr, ep_disconnect_err_codes) } } -UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_cm_sockaddr) +UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_sockaddr) -class test_uct_cm_sockaddr_err_handle_non_exist_ip : public test_uct_cm_sockaddr { +class test_uct_sockaddr_err_handle_non_exist_ip : public test_uct_sockaddr { public: void init() { /* tcp_sockcm requires setting this parameter to shorten the time of waiting @@ -1103,18 +974,18 @@ class test_uct_cm_sockaddr_err_handle_non_exist_ip : public test_uct_cm_sockaddr * will have no effect. */ modify_config("SYN_CNT", "1", SETENV_IF_NOT_EXIST); - test_uct_cm_sockaddr::init(); + test_uct_sockaddr::init(); } }; -UCS_TEST_P(test_uct_cm_sockaddr_err_handle_non_exist_ip, conn_to_non_exist_ip) +UCS_TEST_P(test_uct_sockaddr_err_handle_non_exist_ip, conn_to_non_exist_ip) { struct sockaddr_in addr; ucs_status_t status; size_t size; /* Listen */ - start_listen(test_uct_cm_sockaddr::conn_request_cb); + start_listen(test_uct_sockaddr::conn_request_cb); /* 240.0.0.0/4 - This block, formerly known as the Class E address space, is reserved for future use; see [RFC1112], Section 4. 
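/*
 * A minimal sketch of the client-side flow that connect_to_sockaddr() above
 * is expected to drive, assuming the uct_ep_params::cm_resolve_cb +
 * uct_ep_connect() API that the legacy-class comment further below refers
 * to. The conn_ctx_t type and the way the private data is filled are
 * hypothetical illustrations, not part of this patch:
 */
typedef struct {
    uct_ep_h ep;          /* filled in right after uct_ep_create() */
    char     priv[64];    /* private data to send to the server */
    size_t   priv_length;
} conn_ctx_t;

static ucs_status_t example_resolve_cb(void *user_data,
                                       const uct_cm_ep_resolve_args_t *args)
{
    conn_ctx_t *ctx = (conn_ctx_t*)user_data;
    uct_ep_connect_params_t params;

    /* once the transport is resolved, push the private data to the server */
    params.field_mask          = UCT_EP_CONNECT_PARAM_FIELD_PRIVATE_DATA |
                                 UCT_EP_CONNECT_PARAM_FIELD_PRIVATE_DATA_LENGTH;
    params.private_data        = ctx->priv;
    params.private_data_length = ctx->priv_length;
    return uct_ep_connect(ctx->ep, &params);
}

/* EP creation passes the sockaddr and the resolve callback instead of a
 * sockaddr_pack_cb; the server-side counterpart supplies its private data
 * up front via UCT_EP_PARAM_FIELD_PRIV_DATA, as in the accept() code above. */
ep_params.field_mask    = UCT_EP_PARAM_FIELD_CM |
                          UCT_EP_PARAM_FIELD_SOCKADDR |
                          UCT_EP_PARAM_FIELD_CM_RESOLVE_CB |
                          UCT_EP_PARAM_FIELD_USER_DATA;
ep_params.cm            = cm;
ep_params.sockaddr      = &remote_addr;
ep_params.cm_resolve_cb = example_resolve_cb;
ep_params.user_data     = &ctx;
status = uct_ep_create(&ep_params, &ctx.ep);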
@@ -1133,8 +1004,11 @@ UCS_TEST_P(test_uct_cm_sockaddr_err_handle_non_exist_ip, conn_to_non_exist_ip) { scoped_log_handler slh(detect_addr_route_error_logger); /* client - try to connect to a non-existing IP */ - m_client->connect(0, *m_server, 0, m_connect_addr, client_priv_data_cb, - client_connect_cb, client_disconnect_cb, this); + client_user_data *user_data = new client_user_data(*this, *m_client, 0); + m_client->connect_to_sockaddr(0, m_connect_addr, client_resolve_cb, + client_connect_cb, client_disconnect_cb, + user_data); + add_user_data(user_data); wait_for_bits(&m_state, TEST_STATE_CLIENT_GOT_SERVER_UNAVAILABLE, 300); EXPECT_TRUE(m_state & TEST_STATE_CLIENT_GOT_SERVER_UNAVAILABLE); @@ -1145,13 +1019,12 @@ UCS_TEST_P(test_uct_cm_sockaddr_err_handle_non_exist_ip, conn_to_non_exist_ip) } } -UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_cm_sockaddr_err_handle_non_exist_ip) +UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_sockaddr_err_handle_non_exist_ip) -class test_uct_cm_sockaddr_stress : public test_uct_cm_sockaddr { +class test_uct_sockaddr_stress : public test_uct_sockaddr { public: - test_uct_cm_sockaddr_stress() : m_clients_num(0), - m_ep_init_disconnect_cnt(0) { + test_uct_sockaddr_stress() : m_clients_num(0), m_ep_init_disconnect_cnt(0) { } typedef struct { @@ -1160,7 +1033,7 @@ class test_uct_cm_sockaddr_stress : public test_uct_cm_sockaddr { } ep_state_t; void init() { - test_uct_cm_sockaddr::init(); + test_uct_sockaddr::init(); m_clients_num = ucs_max(2, 100 / ucs::test_time_multiplier()); pthread_mutex_init(&m_lock, NULL); @@ -1168,7 +1041,7 @@ class test_uct_cm_sockaddr_stress : public test_uct_cm_sockaddr { void cleanup() { pthread_mutex_destroy(&m_lock); - test_uct_cm_sockaddr::cleanup(); + test_uct_sockaddr::cleanup(); } int get_ep_index(uct_ep_h ep) { @@ -1185,7 +1058,7 @@ class test_uct_cm_sockaddr_stress : public test_uct_cm_sockaddr { int index; index = get_ep_index(ep); - ASSERT_GE(index, 0); + ASSERT_TRUE(index >= 0); EXPECT_LT(index, (2 * m_clients_num)); pthread_mutex_lock(&m_lock); @@ -1201,46 +1074,53 @@ class test_uct_cm_sockaddr_stress : public test_uct_cm_sockaddr { } void disconnect_cnt_increment(volatile int *cnt) { - pthread_mutex_lock(&m_lock); + ucs::scoped_mutex_lock lock(m_lock); (*cnt)++; - pthread_mutex_unlock(&m_lock); } static void server_disconnect_cb(uct_ep_h ep, void *arg) { - test_uct_cm_sockaddr_stress *self = - reinterpret_cast(arg); + test_uct_sockaddr_stress *self = + reinterpret_cast(arg); self->common_test_disconnect(ep); self->disconnect_cnt_increment(&self->m_server_disconnect_cnt); } static void client_disconnect_cb(uct_ep_h ep, void *arg) { - test_uct_cm_sockaddr_stress *self = - reinterpret_cast(arg); + client_user_data *sa_user_data = + reinterpret_cast(arg); + test_uct_sockaddr_stress *self = + static_cast(sa_user_data->get_test()); + EXPECT_EQ(sa_user_data->get_ep(), ep); self->common_test_disconnect(ep); self->disconnect_cnt_increment(&self->m_client_disconnect_cnt); + self->del_user_data(sa_user_data); } void server_accept(entity *server, uct_conn_request_h conn_request, uct_cm_ep_server_conn_notify_callback_t notify_cb, uct_ep_disconnect_cb_t disconnect_cb, - void *user_data) { - test_uct_cm_sockaddr::accept(server->cm(), conn_request, notify_cb, - disconnect_cb, user_data); + void *user_data, bool can_fail) { + ucs::scoped_async_lock listen_lock(m_server->async()); + ucs::scoped_async_lock accept_lock(server->async()); + test_uct_sockaddr::accept(server->cm(), conn_request, notify_cb, + disconnect_cb, user_data, can_fail); } 
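/*
 * check_connection_status() itself lies outside the hunks shown here; below
 * is a plausible sketch of the contract the calls above assume. The exact
 * set of tolerated error codes is an assumption, not taken from this patch:
 */
static void check_connection_status(ucs_status_t status, bool can_fail)
{
    if (can_fail) {
        /* the client may already have destroyed its EP, so a failed
         * accept is a legal outcome */
        EXPECT_TRUE((status == UCS_OK) ||
                    (status == UCS_ERR_CONNECTION_RESET) ||
                    (status == UCS_ERR_NOT_CONNECTED));
    } else {
        ASSERT_UCS_OK(status);
    }
}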
static void conn_request_cb(uct_listener_h listener, void *arg, const uct_cm_listener_conn_request_args_t *conn_req_args) { - test_uct_cm_sockaddr_stress *self = - reinterpret_cast(arg); + test_uct_sockaddr_stress *self = + reinterpret_cast(arg); - if (test_uct_cm_sockaddr::common_conn_request(listener, arg, conn_req_args)) { + if (test_uct_sockaddr::common_conn_request(listener, arg, + conn_req_args)) { EXPECT_TRUE(conn_req_args->field_mask & UCT_CM_LISTENER_CONN_REQUEST_ARGS_FIELD_CONN_REQUEST); self->server_accept(self->m_server, conn_req_args->conn_request, - server_connect_cb, server_disconnect_cb, self); + server_connect_cb, server_disconnect_cb, self, + false); } ucs_memory_cpu_store_fence(); @@ -1254,7 +1134,7 @@ class test_uct_cm_sockaddr_stress : public test_uct_cm_sockaddr { pthread_mutex_t m_lock; }; -UCS_TEST_P(test_uct_cm_sockaddr_stress, many_clients_to_one_server) +UCS_TEST_P(test_uct_sockaddr_stress, many_clients_to_one_server) { int i, disconnected_eps_on_each_side, no_disconnect_eps_cnt = 0; entity *client_test; @@ -1262,7 +1142,7 @@ UCS_TEST_P(test_uct_cm_sockaddr_stress, many_clients_to_one_server) ucs_time_t deadline; /* Listen */ - start_listen(test_uct_cm_sockaddr_stress::conn_request_cb); + start_listen(test_uct_sockaddr_stress::conn_request_cb); /* Connect */ /* multiple clients, each on a cm of its own, connecting to the same server */ @@ -1270,9 +1150,13 @@ UCS_TEST_P(test_uct_cm_sockaddr_stress, many_clients_to_one_server) client_test = uct_test::create_entity(); m_entities.push_back(client_test); - client_test->max_conn_priv = client_test->cm_attr().max_conn_priv; - client_test->connect(0, *m_server, 0, m_connect_addr, client_priv_data_cb, - client_connect_cb, client_disconnect_cb, this); + client_test->max_conn_priv = client_test->cm_attr().max_conn_priv; + client_user_data *user_data = new client_user_data(*this, *client_test, + 0); + client_test->connect_to_sockaddr(0, m_connect_addr, client_resolve_cb, + client_connect_cb, + client_disconnect_cb, user_data); + add_user_data(user_data); } /* wait for the server to connect to all the clients */ @@ -1362,17 +1246,20 @@ UCS_TEST_P(test_uct_cm_sockaddr_stress, many_clients_to_one_server) /* destroy all the eps here (and not in the test's destruction flow) so that * no disconnect callbacks are invoked after the test ends */ m_entities.clear(); + + /* destroyed EPs don't invoke CBs, need to clean up user data manually */ + release_user_data(); } -UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_cm_sockaddr_stress) +UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_sockaddr_stress) -class test_uct_cm_sockaddr_multiple_cms : public test_uct_cm_sockaddr { +class test_uct_sockaddr_multiple_cms : public test_uct_sockaddr { public: void init() { ucs_status_t status; - test_uct_cm_sockaddr::init(); + test_uct_sockaddr::init(); status = ucs_async_context_create(UCS_ASYNC_MODE_THREAD_SPINLOCK, &m_test_async); @@ -1395,15 +1282,18 @@ class test_uct_cm_sockaddr_multiple_cms : public test_uct_cm_sockaddr { uct_config_release(m_test_config); m_test_worker.reset(); ucs_async_context_destroy(m_test_async); - test_uct_cm_sockaddr::cleanup(); + test_uct_sockaddr::cleanup(); } void server_accept(entity *server, uct_conn_request_h conn_request, uct_cm_ep_server_conn_notify_callback_t notify_cb, uct_ep_disconnect_cb_t disconnect_cb, - void *user_data) + void *user_data, bool can_fail) { - accept(m_test_cm, conn_request, notify_cb, disconnect_cb, user_data); + ucs::scoped_async_lock listen_lock(m_server->async()); + ucs::scoped_async_lock 
accept_lock(*m_test_async); + accept(m_test_cm, conn_request, notify_cb, disconnect_cb, user_data, + can_fail); } protected: @@ -1413,7 +1303,7 @@ class test_uct_cm_sockaddr_multiple_cms : public test_uct_cm_sockaddr { uct_cm_config_t *m_test_config; }; -UCS_TEST_P(test_uct_cm_sockaddr_multiple_cms, server_switch_cm) +UCS_TEST_P(test_uct_sockaddr_multiple_cms, server_switch_cm) { listen_and_connect(); @@ -1429,4 +1319,116 @@ UCS_TEST_P(test_uct_cm_sockaddr_multiple_cms, server_switch_cm) m_server->destroy_ep(0); } -UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_cm_sockaddr_multiple_cms) +UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_sockaddr_multiple_cms) + +/** + * This class tests the "legacy" API @ref uct_ep_params::sockaddr_pack_cb, + * which can be replaced with the more flexible API: + * - @ref uct_ep_params::cm_resolve_cb + @ref uct_ep_connect on the client side + * - @ref uct_ep_params::private_data + @ref uct_ep_params::private_data_length + * on the server side + * as is done in @ref test_uct_sockaddr. + */ +class test_uct_sockaddr_legacy : public test_uct_sockaddr +{ +public: + static ssize_t client_priv_data_cb(void *arg, + const uct_cm_ep_priv_data_pack_args_t + *pack_args, void *priv_data) + { + client_user_data *sa_user_data = + reinterpret_cast<client_user_data*>(arg); + test_uct_sockaddr_legacy *self = + static_cast<test_uct_sockaddr_legacy*>(sa_user_data->get_test()); + + return self->common_priv_data_cb(self->m_client->max_conn_priv, + priv_data); + } + + static ssize_t server_priv_data_cb(void *arg, + const uct_cm_ep_priv_data_pack_args_t + *pack_args, void *priv_data) + { + test_uct_sockaddr_legacy *self = + reinterpret_cast<test_uct_sockaddr_legacy*>(arg); + + return self->common_priv_data_cb(self->m_server->max_conn_priv, + priv_data); + } + + virtual void accept(uct_cm_h cm, uct_conn_request_h conn_request, + uct_cm_ep_server_conn_notify_callback_t notify_cb, + uct_ep_disconnect_cb_t disconnect_cb, + void *user_data, bool can_fail) + { + uct_ep_params_t ep_params; + ucs_status_t status; + uct_ep_h ep; + + ASSERT_FALSE(can_fail); + ASSERT_TRUE(m_server->listener()); + m_server->reserve_ep(m_server->num_eps()); + + ep_params.field_mask = UCT_EP_PARAM_FIELD_CM | + UCT_EP_PARAM_FIELD_CONN_REQUEST | + UCT_EP_PARAM_FIELD_USER_DATA | + UCT_EP_PARAM_FIELD_SOCKADDR_NOTIFY_CB_SERVER | + UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB | + UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS | + UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB; + + ep_params.cm = cm; + ep_params.conn_request = conn_request; + ep_params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; + ep_params.sockaddr_pack_cb = server_priv_data_cb; + ep_params.sockaddr_cb_server = notify_cb; + ep_params.disconnect_cb = disconnect_cb; + ep_params.user_data = user_data; + + status = uct_ep_create(&ep_params, &ep); + check_connection_status(status, can_fail); + m_server->eps().back().reset(ep, uct_ep_destroy); + } +}; + +UCS_TEST_P(test_uct_sockaddr_legacy, cm_open_listen_close) +{ + start_listen(conn_request_cb); + + ucs_sock_addr_t ucs_remote_addr = m_connect_addr.to_ucs_sock_addr(); + + m_client->reserve_ep(0); + ASSERT_EQ(NULL, m_client->ep(0)); + + client_user_data *user_data = new client_user_data(*this, *m_client, 0); + + /* Connect to the server */ + uct_ep_h ep; + uct_ep_params_t params; + params.field_mask = UCT_EP_PARAM_FIELD_CM | + UCT_EP_PARAM_FIELD_SOCKADDR_CONNECT_CB_CLIENT | + UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB | + UCT_EP_PARAM_FIELD_USER_DATA | + UCT_EP_PARAM_FIELD_SOCKADDR | + UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS | + UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB; + params.cm = m_client->cm(); + params.sockaddr_cb_client 
= client_connect_cb; + params.disconnect_cb = client_disconnect_cb; + params.user_data = user_data; + params.sockaddr = &ucs_remote_addr; + params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; + params.sockaddr_pack_cb = client_priv_data_cb; + + ucs_status_t status = uct_ep_create(&params, &ep); + ASSERT_UCS_OK(status); + m_client->eps().at(0).reset(ep, uct_ep_destroy); + add_user_data(user_data); + wait_for_bits(&m_state, TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED); + EXPECT_TRUE(ucs_test_all_flags(m_state, (TEST_STATE_SERVER_CONNECTED | + TEST_STATE_CLIENT_CONNECTED))); + cm_disconnect(m_client); +} + +UCT_INSTANTIATE_SOCKADDR_TEST_CASE(test_uct_sockaddr_legacy) diff --git a/test/gtest/uct/ib/test_ud_ds.cc index a06b6d43c0b..ad712da525a 100644 --- a/test/gtest/uct/ib/test_ud_ds.cc +++ b/test/gtest/uct/ib/test_ud_ds.cc @@ -67,6 +67,16 @@ class test_ud_ds : public uct_test { uct_ud_iface_addr_t *if_addr, uct_ud_ep_conn_sn_t conn_sn, uct_ud_ep_t *ep); + void check_mtu(uct_ib_address_pack_params_t *unpack_params) + { + if (unpack_params->flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) { + EXPECT_NE(UCT_IB_ADDRESS_INVALID_PATH_MTU, unpack_params->path_mtu); + EXPECT_NE(IBV_MTU_4096, unpack_params->path_mtu); + } else { + EXPECT_EQ(UCT_IB_ADDRESS_INVALID_PATH_MTU, unpack_params->path_mtu); + } + } + protected: entity *m_e1, *m_e2; uct_ib_address_t *ib_adr1, *ib_adr2; @@ -89,10 +99,8 @@ UCS_TEST_P(test_ud_ds, if_addr) { EXPECT_NE(uct_ib_unpack_uint24(if_adr1.qp_num), uct_ib_unpack_uint24(if_adr2.qp_num)); - EXPECT_TRUE(!(unpack_params1.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU)); - EXPECT_EQ(UCT_IB_ADDRESS_INVALID_PATH_MTU, unpack_params1.path_mtu); - EXPECT_TRUE(!(unpack_params2.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU)); - EXPECT_EQ(UCT_IB_ADDRESS_INVALID_PATH_MTU, unpack_params2.path_mtu); + check_mtu(&unpack_params1); + check_mtu(&unpack_params2); EXPECT_TRUE(!(unpack_params1.flags & UCT_IB_ADDRESS_PACK_FLAG_GID_INDEX)); EXPECT_EQ(UCT_IB_ADDRESS_INVALID_GID_INDEX, unpack_params1.gid_index); diff --git a/test/gtest/uct/tcp/test_tcp.cc index 25ef65eaf91..c9b77f50875 100644 --- a/test/gtest/uct/tcp/test_tcp.cc +++ b/test/gtest/uct/tcp/test_tcp.cc @@ -179,17 +179,14 @@ class test_uct_tcp : public uct_test { status = uct_iface_get_address(to.iface(), iface_addr); ASSERT_UCS_OK(status); - struct sockaddr_in dest_addr; - dest_addr.sin_family = AF_INET; - dest_addr.sin_port = *(in_port_t*)iface_addr; - dest_addr.sin_addr = *(const struct in_addr*)ucs_sockaddr_get_inet_addr - ((struct sockaddr*)dev_addr); + struct sockaddr dest_addr; + uct_tcp_ep_set_dest_addr(dev_addr, iface_addr, &dest_addr); int fd; status = ucs_socket_create(AF_INET, SOCK_STREAM, &fd); ASSERT_UCS_OK(status); - status = ucs_socket_connect(fd, (const struct sockaddr*)&dest_addr); + status = ucs_socket_connect(fd, &dest_addr); ASSERT_UCS_OK(status); status = ucs_sys_fcntl_modfl(fd, O_NONBLOCK, 0); @@ -259,4 +256,32 @@ UCS_TEST_P(test_uct_tcp, listener_flood_connect_and_close) { test_listener_flood(*m_ent, max_conn, 0); } +UCS_TEST_P(test_uct_tcp, check_addr_len) +{ + uct_iface_attr_t iface_attr; + + ucs_status_t status = uct_iface_query(m_ent->iface(), &iface_attr); + ASSERT_UCS_OK(status); + + UCS_TEST_MESSAGE << m_ent->md()->component->name; + if (!GetParam()->dev_name.compare("lo")) { + EXPECT_EQ(sizeof(uct_tcp_device_addr_t) + + sizeof(uct_iface_local_addr_ns_t), + iface_attr.device_addr_len); + } else { + struct sockaddr *saddr = reinterpret_cast<struct sockaddr*>( 
&m_tcp_iface->config.ifaddr); + size_t in_addr_len; + status = ucs_sockaddr_inet_addr_sizeof(saddr, &in_addr_len); + ASSERT_UCS_OK(status); + + EXPECT_EQ(sizeof(uct_tcp_device_addr_t) + in_addr_len, + iface_attr.device_addr_len); + } + + EXPECT_EQ(sizeof(uct_tcp_iface_addr_t), iface_attr.iface_addr_len); + EXPECT_EQ(sizeof(uct_tcp_ep_addr_t), iface_attr.ep_addr_len); +} + + _UCT_INSTANTIATE_TEST_CASE(test_uct_tcp, tcp) diff --git a/test/gtest/uct/test_flush.cc b/test/gtest/uct/test_flush.cc index c65812a29e5..1a5dc4162e0 100644 --- a/test/gtest/uct/test_flush.cc +++ b/test/gtest/uct/test_flush.cc @@ -122,6 +122,12 @@ class uct_flush_test : public uct_test { return am_req->test->am_send_pending(am_req); } + static void purge_cb(uct_pending_req_t *self, void *arg) + { + test_req_t *req = ucs_container_of(self, test_req_t, uct); + --req->comp.count; + } + static ucs_status_t flush_progress(uct_pending_req_t *req) { test_req_t *flush_req = ucs_container_of(req, test_req_t, uct); @@ -368,6 +374,10 @@ void uct_flush_test::test_flush_am_pending(flush_func_t flush, bool destroy_ep) ASSERT_UCS_OK(status); } + if (is_flush_cancel()) { + uct_ep_pending_purge(sender().ep(0), purge_cb, NULL); + } + /* Try to start a flush */ test_req_t flush_req; flush_req.comp.count = 2; @@ -399,11 +409,7 @@ void uct_flush_test::test_flush_am_pending(flush_func_t flush, bool destroy_ep) EXPECT_EQ(1, flush_req.comp.count); while (!reqs.empty()) { - if (is_flush_cancel()) { - EXPECT_EQ(2, reqs.back().comp.count); - } else { - EXPECT_EQ(1, reqs.back().comp.count); - } + EXPECT_EQ(1, reqs.back().comp.count); reqs.pop_back(); } @@ -546,7 +552,8 @@ UCT_INSTANTIATE_TEST_CASE(uct_flush_test) class uct_cancel_test : public uct_test { public: - static const size_t BUF_SIZE = 8 * 1024; + static const size_t BUF_SIZE = 8 * 1024; + static const size_t BUF_SIZE_DC = 1 * 1024; class peer { public: @@ -563,6 +570,8 @@ class uct_cancel_test : public uct_test { } void connect() { + m_e->destroy_eps(); + m_peer->m_e->destroy_eps(); m_e->connect(0, *m_peer->m_e, 0); m_peer->m_e->connect(0, *m_e, 0); } @@ -601,6 +610,10 @@ class uct_cancel_test : public uct_test { size_t header_length = 0; uct_iov_t iov; + if (has_transport("dc_mlx5")) { + size = ucs_min(BUF_SIZE_DC, size); + } + iov.buffer = (char*)sendbuf.ptr() + header_length; iov.count = 1; iov.length = size - header_length; @@ -614,6 +627,10 @@ class uct_cancel_test : public uct_test { mapped_buffer &sendbuf = *s->m_buf; mapped_buffer &recvbuf = *s->m_peer->m_buf; + if (has_transport("dc_mlx5")) { + size = ucs_min(BUF_SIZE_DC, size); + } + UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, sendbuf.ptr(), size, sendbuf.memh(), s->m_e->iface_attr().cap.get.max_iov); @@ -641,7 +658,7 @@ class uct_cancel_test : public uct_test { done.count = flushing.size() + 1; done.status = UCS_OK; done.func = NULL; - ucs_time_t loop_end_limit = ucs_get_time() + ucs_time_from_sec(50.0); + ucs_time_t loop_end_limit = ucs_get_time() + ucs_time_from_sec(200.0); while (!flushing.empty() && (ucs_get_time() < loop_end_limit)) { std::list::iterator iter = flushing.begin(); while (iter != flushing.end()) { @@ -660,16 +677,21 @@ class uct_cancel_test : public uct_test { short_progress_loop(); } ASSERT_UCS_OK_OR_INPROGRESS(status); + double holdup = 200.0 - ucs_time_to_sec(loop_end_limit - ucs_get_time()); + if (holdup > 10.0) { + UCS_TEST_MESSAGE << "flush took " << holdup << " sec"; + } /* coverity[loop_condition] */ while (done.count != 1) { progress(); } - m_s1->m_e->destroy_eps(); - m_s1->m_e->connect(0, *m_s0->m_e, 0); 
+ m_s0->connect(); - ASSERT_EQ(2, m_err_count); + /* there is a chance that one side gets a disconnect error before + * calling flush(CANCEL) */ + EXPECT_LE(m_err_count, 1); } typedef ucs_status_t (uct_cancel_test::* send_func_t)(peer *s); @@ -699,8 +721,9 @@ class uct_cancel_test : public uct_test { } void do_test(send_func_t send) { - for (int i = 0; i < count(); ++i) { - fill(&uct_cancel_test::am_zcopy); + ucs_time_t loop_end_limit = ucs_get_time() + ucs_time_from_sec(300.0); + for (int i = 0; (i < count()) && (ucs_get_time() < loop_end_limit); ++i) { + fill(send); flush_and_reconnect(); } } @@ -746,7 +769,7 @@ class uct_cancel_test : public uct_test { } ucs_status_t error_handler(uct_ep_h ep, ucs_status_t status) { - EXPECT_EQ(UCS_ERR_CANCELED, status); + EXPECT_EQ(UCS_ERR_ENDPOINT_TIMEOUT, status); m_err_count++; return UCS_OK; } @@ -756,9 +779,9 @@ class uct_cancel_test : public uct_test { } void check_skip_test_tl() { - const resource *r = dynamic_cast<const resource*>(GetParam()); - - if ((r->tl_name != "rc_mlx5") && (r->tl_name != "rc_verbs")) { + if ((GetParam()->tl_name != "dc_mlx5") && + (GetParam()->tl_name != "rc_verbs") && + (GetParam()->tl_name != "rc_mlx5")) { UCS_TEST_SKIP_R("not supported yet"); } diff --git a/test/gtest/uct/test_md.cc index 6746b6bd0a6..098be37402e 100644 --- a/test/gtest/uct/test_md.cc +++ b/test/gtest/uct/test_md.cc @@ -82,6 +82,8 @@ test_md::test_md() void test_md::init() { + const std::vector<ucs_memory_type_t> + supported_mem_types = mem_buffer::supported_mem_types(); ucs::test_base::init(); UCS_TEST_CREATE_HANDLE(uct_md_h, m_md, uct_md_close, uct_md_open, GetParam().component, GetParam().md_name.c_str(), @@ -141,6 +143,12 @@ void test_md::free_memory(void *address, ucs_memory_type_t mem_type) mem_buffer::release(address, mem_type); } +bool test_md::is_device_detected(ucs_memory_type_t mem_type) +{ + return (mem_type != UCS_MEMORY_TYPE_ROCM) && + (mem_type != UCS_MEMORY_TYPE_ROCM_MANAGED); +} + UCS_TEST_SKIP_COND_P(test_md, rkey_ptr, !check_caps(UCT_MD_FLAG_ALLOC | UCT_MD_FLAG_RKEY_PTR)) { @@ -231,6 +239,7 @@ UCS_TEST_SKIP_COND_P(test_md, alloc, size_t size, orig_size; ucs_status_t status; void *address; + unsigned mem_type; uct_allocated_memory_t mem; uct_mem_alloc_params_t params; @@ -241,35 +250,43 @@ UCS_TEST_SKIP_COND_P(test_md, alloc, UCT_MEM_ALLOC_PARAM_FIELD_NAME; params.flags = UCT_MD_MEM_ACCESS_ALL; params.name = "test"; - params.mem_type = UCS_MEMORY_TYPE_HOST; params.mds.mds = &md_ref; params.mds.count = 1; - for (unsigned i = 0; i < 300; ++i) { - size = orig_size = ucs::rand() % 65536; - if (size == 0) { - continue; - } + ucs_for_each_bit(mem_type, md_attr().cap.alloc_mem_types) { + for (unsigned i = 0; i < 300; ++i) { + size = orig_size = ucs::rand() % 65536; + if (size == 0) { + continue; + } - address = NULL; - params.address = address; - status = uct_mem_alloc(size, &method, 1, &params, &mem); - EXPECT_GT(mem.length, 0ul); - address = mem.address; - size = mem.length; + address = NULL; + params.address = address; + params.mem_type = (ucs_memory_type_t)mem_type; - ASSERT_UCS_OK(status); - EXPECT_GE(size, orig_size); - EXPECT_TRUE(address != NULL); - EXPECT_TRUE(mem.memh != UCT_MEM_HANDLE_NULL); + status = uct_mem_alloc(size, &method, 1, &params, &mem); - memset(address, 0xBB, size); - uct_mem_free(&mem); + EXPECT_GT(mem.length, 0ul); + address = mem.address; + size = mem.length; + + ASSERT_UCS_OK(status); + EXPECT_GE(size, orig_size); + EXPECT_TRUE(address != NULL); + EXPECT_TRUE(mem.memh != UCT_MEM_HANDLE_NULL); + + if (mem_type == 
UCS_MEMORY_TYPE_HOST) { + memset(address, 0xBB, size); + } + uct_mem_free(&mem); + } } } UCS_TEST_P(test_md, mem_type_detect_mds) { const size_t buffer_size = 1024; + size_t slice_offset; + size_t slice_length; ucs_status_t status; int alloc_mem_type; void *address; @@ -293,19 +310,48 @@ UCS_TEST_P(test_md, mem_type_detect_mds) { /* test mem_query API */ uct_md_mem_attr_t mem_attr; - mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE | - UCT_MD_MEM_ATTR_FIELD_SYS_DEV; - status = uct_md_mem_query(md(), address, buffer_size, &mem_attr); - ASSERT_UCS_OK(status); - EXPECT_EQ(alloc_mem_type, mem_attr.mem_type); + mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_MEM_TYPE | + UCT_MD_MEM_ATTR_FIELD_SYS_DEV | + UCT_MD_MEM_ATTR_FIELD_BASE_ADDRESS | + UCT_MD_MEM_ATTR_FIELD_ALLOC_LENGTH; + + for (unsigned i = 0; i < 300; i++) { + slice_offset = ucs::rand() % buffer_size; + slice_length = ucs::rand() % buffer_size; + + if (slice_length == 0) { + continue; + } + + status = uct_md_mem_query(md(), + UCS_PTR_BYTE_OFFSET(address, + slice_offset), + slice_length, &mem_attr); + ASSERT_UCS_OK(status); + EXPECT_EQ(alloc_mem_type, mem_attr.mem_type); + if ((alloc_mem_type == UCS_MEMORY_TYPE_CUDA) || + (alloc_mem_type == UCS_MEMORY_TYPE_CUDA_MANAGED)) { + EXPECT_EQ(buffer_size, mem_attr.alloc_length); + EXPECT_EQ(address, mem_attr.base_address); + } else { + EXPECT_EQ(slice_length, mem_attr.alloc_length); + EXPECT_EQ(UCS_PTR_BYTE_OFFSET(address, slice_offset), + mem_attr.base_address); + } + } /* print memory type and dev name */ char sys_dev_name[128]; + mem_attr.field_mask = UCT_MD_MEM_ATTR_FIELD_SYS_DEV; + + status = uct_md_mem_query(md(), address, buffer_size, &mem_attr); + ASSERT_UCS_OK(status); + ucs_topo_sys_device_bdf_name(mem_attr.sys_dev, sys_dev_name, sizeof(sys_dev_name)); UCS_TEST_MESSAGE << ucs_memory_type_names[alloc_mem_type] << ": " - << "sys_dev[" << mem_attr.sys_dev << "] " - << "(" << sys_dev_name << ")"; + << "sys_dev[" << static_cast(mem_attr.sys_dev) + << "] (" << sys_dev_name << ")"; } } @@ -325,7 +371,9 @@ UCS_TEST_P(test_md, mem_query) { mem_buf.size(), &mem_attr); ASSERT_UCS_OK(status); EXPECT_EQ(mem_type, mem_attr.mem_type); - EXPECT_NE(UCS_SYS_DEVICE_ID_UNKNOWN, mem_attr.sys_dev); + if (is_device_detected(mem_attr.mem_type)) { + EXPECT_NE(UCS_SYS_DEVICE_ID_UNKNOWN, mem_attr.sys_dev); + } char bdf_buf[32]; UCS_TEST_MESSAGE << ucs_memory_type_names[mem_type] << ": " @@ -376,7 +424,6 @@ UCS_TEST_SKIP_COND_P(test_md, reg, for (unsigned mem_type_id = 0; mem_type_id < UCS_MEMORY_TYPE_LAST; mem_type_id++) { ucs_memory_type_t mem_type = static_cast(mem_type_id); - if (!(md_attr().cap.reg_mem_types & UCS_BIT(mem_type_id))) { UCS_TEST_MESSAGE << mem_buffer::mem_type_name(mem_type) << " memory " << "registration is not supported by " @@ -566,53 +613,28 @@ UCS_TEST_SKIP_COND_P(test_md, reg_multi_thread, pthread_join(thread_id, NULL); } -UCS_TEST_SKIP_COND_P(test_md, sockaddr_accessibility, - !check_caps(UCT_MD_FLAG_SOCKADDR)) { +UCS_TEST_P(test_md, sockaddr_accessibility) { ucs_sock_addr_t sock_addr; struct ifaddrs *ifaddr, *ifa; - bool found_rdma = false; - bool found_ip = false; + /* currently we don't have MDs with deprecated capability */ + ASSERT_FALSE(check_caps(UCT_MD_FLAG_SOCKADDR)); + ASSERT_NE(NULL, uintptr_t(md())); ASSERT_TRUE(getifaddrs(&ifaddr) != -1); - /* go through a linked list of available interfaces */ for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { if (ucs::is_inet_addr(ifa->ifa_addr) && ucs_netif_flags_is_active(ifa->ifa_flags)) { sock_addr.addr = ifa->ifa_addr; - 
found_ip = true; - - if (GetParam().md_name == "rdmacm") { - if (ucs::is_rdmacm_netdev(ifa->ifa_name)) { - UCS_TEST_MESSAGE << "Testing " << ifa->ifa_name << " with " << - ucs::sockaddr_to_str(ifa->ifa_addr); - ASSERT_TRUE(uct_md_is_sockaddr_accessible(md(), &sock_addr, - UCT_SOCKADDR_ACC_LOCAL)); - ASSERT_TRUE(uct_md_is_sockaddr_accessible(md(), &sock_addr, - UCT_SOCKADDR_ACC_REMOTE)); - found_rdma = true; - } - } else { - UCS_TEST_MESSAGE << "Testing " << ifa->ifa_name << " with " << - ucs::sockaddr_to_str(ifa->ifa_addr); - ASSERT_TRUE(uct_md_is_sockaddr_accessible(md(), &sock_addr, - UCT_SOCKADDR_ACC_LOCAL)); - ASSERT_TRUE(uct_md_is_sockaddr_accessible(md(), &sock_addr, - UCT_SOCKADDR_ACC_REMOTE)); - } + UCS_TEST_MESSAGE << "Testing " << ifa->ifa_name << " with " + << ucs::sockaddr_to_str(ifa->ifa_addr); + ASSERT_FALSE(uct_md_is_sockaddr_accessible(md(), &sock_addr, + UCT_SOCKADDR_ACC_LOCAL)); + ASSERT_FALSE(uct_md_is_sockaddr_accessible(md(), &sock_addr, + UCT_SOCKADDR_ACC_REMOTE)); } } - - if (GetParam().md_name == "rdmacm") { - if (!found_rdma) { - UCS_TEST_MESSAGE << - "Cannot find an IPoIB/RoCE interface with an IPv4 address on the host"; - } - } else if (!found_ip) { - UCS_TEST_MESSAGE << "Cannot find an IPv4/IPv6 interface on the host"; - } - freeifaddrs(ifaddr); } diff --git a/test/gtest/uct/test_md.h b/test/gtest/uct/test_md.h index 2ecaa4368ad..eb1cf4556f0 100644 --- a/test/gtest/uct/test_md.h +++ b/test/gtest/uct/test_md.h @@ -42,8 +42,9 @@ class test_md : public testing::TestWithParam, void check_memory(void *address, void *expect, size_t size, ucs_memory_type_t mem_type); void free_memory(void *address, ucs_memory_type_t mem_type); - void test_registration(); + static bool is_device_detected(ucs_memory_type_t mem_type); + static void* alloc_thread(void *arg); uct_md_h md() const { return m_md; @@ -53,9 +54,6 @@ class test_md : public testing::TestWithParam, return m_md_attr; } - - static void* alloc_thread(void *arg); - private: ucs::handle m_md_config; ucs::handle m_md; diff --git a/test/gtest/uct/test_mem.cc b/test/gtest/uct/test_mem.cc index aa7c292f848..33d47c41f05 100644 --- a/test/gtest/uct/test_mem.cc +++ b/test/gtest/uct/test_mem.cc @@ -70,6 +70,7 @@ UCS_TEST_P(test_mem, md_alloc) { size_t length = min_length; uct_alloc_method_t methods[3]; uct_allocated_memory mem; + unsigned mem_type; std::vector md_resources; uct_md_attr_t md_attr; ucs_status_t status; @@ -106,25 +107,30 @@ UCS_TEST_P(test_mem, md_alloc) { status = uct_md_query(md, &md_attr); ASSERT_UCS_OK(status); - for (nonblock = 0; nonblock <= 1; ++nonblock) { - int flags = nonblock ? UCT_MD_MEM_FLAG_NONBLOCK : 0; + ucs_for_each_bit(mem_type, md_attr.cap.alloc_mem_types) { + params.mem_type = (ucs_memory_type_t)mem_type; + for (nonblock = 0; nonblock <= 1; ++nonblock) { + if (nonblock && (mem_type != UCS_MEMORY_TYPE_HOST)) { + continue; + } - flags |= UCT_MD_MEM_ACCESS_ALL; - params.flags = flags; - params.mds.mds = &md; + params.flags = nonblock ? 
UCT_MD_MEM_FLAG_NONBLOCK : 0; + params.flags |= UCT_MD_MEM_ACCESS_ALL; + params.mds.mds = &md; - status = uct_mem_alloc(length, methods, 3, &params, &mem); - ASSERT_UCS_OK(status); + status = uct_mem_alloc(length, methods, 3, &params, &mem); + ASSERT_UCS_OK(status); - if (md_attr.cap.flags & UCT_MD_FLAG_ALLOC) { - EXPECT_EQ(UCT_ALLOC_METHOD_MD, mem.method); - } else { - EXPECT_NE(UCT_ALLOC_METHOD_MD, mem.method); - } + if (md_attr.cap.flags & UCT_MD_FLAG_ALLOC) { + EXPECT_EQ(UCT_ALLOC_METHOD_MD, mem.method); + } else { + EXPECT_NE(UCT_ALLOC_METHOD_MD, mem.method); + } - check_mem(mem, min_length); + check_mem(mem, min_length); - uct_mem_free(&mem); + uct_mem_free(&mem); + } } uct_md_close(md); diff --git a/test/gtest/uct/test_mm.cc index c664f030b1b..eac6a1720c4 100644 --- a/test/gtest/uct/test_mm.cc +++ b/test/gtest/uct/test_mm.cc @@ -21,8 +21,8 @@ class test_uct_mm : public uct_test { std::string shm_dir; mm_resource(const resource& res, const std::string& shm_dir = "") : - resource(res.component, res.md_name, res.local_cpus, res.tl_name, - res.dev_name, res.dev_type), + resource(res.component, res.component_name, res.md_name, + res.local_cpus, res.tl_name, res.dev_name, res.dev_type), shm_dir(shm_dir) { } @@ -56,7 +56,8 @@ class test_uct_mm : public uct_test { } } - return filter_resources(all_resources, tl_name); + return filter_resources(all_resources, resource::is_equal_tl_name, + tl_name); } test_uct_mm() : m_e1(NULL), m_e2(NULL) { diff --git a/test/gtest/uct/test_p2p_am.cc index 303b3b6a78a..a925958dcd7 100644 --- a/test/gtest/uct/test_p2p_am.cc +++ b/test/gtest/uct/test_p2p_am.cc @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -182,6 +182,16 @@ class uct_p2p_am_test : public uct_p2p_test sendbuf.length() - sizeof(hdr)); } + ucs_status_t am_short_iov(uct_ep_h ep, const mapped_buffer &sendbuf, + const mapped_buffer &recvbuf) + { + UCS_TEST_GET_BUFFER_IOV( + iov, iovcnt, (char*)sendbuf.ptr(), sendbuf.length(), sendbuf.memh(), + ucs_min(sendbuf.length(), sender().iface_attr().cap.am.max_iov)); + + return uct_ep_am_short_iov(ep, AM_ID, iov, iovcnt); + } + ucs_status_t am_bcopy(uct_ep_h ep, const mapped_buffer& sendbuf, const mapped_buffer& recvbuf) { @@ -337,6 +347,11 @@ UCS_TEST_SKIP_COND_P(uct_p2p_am_test, am_sync, blocking_send(static_cast(&uct_p2p_am_test::am_short), sender_ep(), sendbuf_short, recvbuf, false); am_sync_finish(am_count); + + am_count = m_am_count; + blocking_send(static_cast(&uct_p2p_am_test::am_short_iov), + sender_ep(), sendbuf_short, recvbuf, false); + am_sync_finish(am_count); } if (receiver().iface_attr().cap.flags & UCT_IFACE_FLAG_AM_BCOPY) { @@ -380,6 +395,11 @@ UCS_TEST_SKIP_COND_P(uct_p2p_am_test, am_async, blocking_send(static_cast(&uct_p2p_am_test::am_short), sender_ep(), sendbuf_short, recvbuf, false); am_async_finish(am_count); + + am_count = m_am_count; + blocking_send(static_cast(&uct_p2p_am_test::am_short_iov), + sender_ep(), sendbuf_short, recvbuf, false); + am_async_finish(am_count); } if (receiver().iface_attr().cap.flags & UCT_IFACE_FLAG_AM_BCOPY) { @@ -511,6 +531,42 @@ class uct_p2p_am_misc : public uct_p2p_am_test return UCS_LOG_FUNC_RC_CONTINUE; } + void am_max_multi(send_func_t send) + { + ucs_status_t status; + + mapped_buffer small_sendbuf(sizeof(SEED1), SEED1, sender()); + mapped_buffer sendbuf(ucs_min(sender().iface_attr().cap.am.max_short, 8192ul), + SEED1, sender()); + mapped_buffer recvbuf(0, 0, sender()); /* dummy */ + + m_am_count = 0; + set_keep_data(false); + + status = uct_iface_set_am_handler(receiver().iface(), AM_ID, am_handler, + this, UCT_CB_FLAG_ASYNC); + ASSERT_UCS_OK(status); + + /* exhaust all resources or time out 1sec */ + ucs_time_t loop_end_limit = ucs_get_time() + ucs_time_from_sec(1.0); + do { + status = (this->*send)(sender_ep(), sendbuf, recvbuf); + } while ((ucs_get_time() < loop_end_limit) && (status == UCS_OK)); + if (status != UCS_ERR_NO_RESOURCE) { + ASSERT_UCS_OK(status); + } + + /* should be able to send again after a while */ + ucs_time_t deadline = ucs_get_time() + + (ucs::test_time_multiplier() * + ucs_time_from_sec(DEFAULT_TIMEOUT_SEC)); + do { + progress(); + status = (this->*send)(sender_ep(), small_sendbuf, recvbuf); + } while ((status == UCS_ERR_NO_RESOURCE) && (ucs_get_time() < deadline)); + EXPECT_EQ(UCS_OK, status); + } + bool m_rx_buf_limit_failed; }; @@ -533,6 +589,15 @@ UCS_TEST_SKIP_COND_P(uct_p2p_am_test, am_short_keep_data, TEST_UCT_FLAG_DIR_SEND_TO_RECV); } +UCS_TEST_SKIP_COND_P(uct_p2p_am_test, am_short_iov_keep_data, + !check_caps(UCT_IFACE_FLAG_AM_SHORT, UCT_IFACE_FLAG_AM_DUP)) +{ + set_keep_data(true); + test_xfer_multi(static_cast(&uct_p2p_am_test::am_short_iov), + sizeof(uint64_t), sender().iface_attr().cap.am.max_short, + TEST_UCT_FLAG_DIR_SEND_TO_RECV); +} + UCS_TEST_SKIP_COND_P(uct_p2p_am_test, am_bcopy_keep_data, !check_caps(UCT_IFACE_FLAG_AM_BCOPY, UCT_IFACE_FLAG_AM_DUP)) { @@ -600,41 +665,15 @@ UCS_TEST_SKIP_COND_P(uct_p2p_am_misc, no_rx_buffs, } UCS_TEST_SKIP_COND_P(uct_p2p_am_misc, am_max_short_multi, - !check_caps(UCT_IFACE_FLAG_AM_SHORT)) { - ucs_status_t status; - - m_am_count = 0; - set_keep_data(false); - - status = uct_iface_set_am_handler(receiver().iface(), AM_ID, am_handler, - this, 
UCT_CB_FLAG_ASYNC); - ASSERT_UCS_OK(status); - - size_t size = ucs_min(sender().iface_attr().cap.am.max_short, 8192ul); - std::string sendbuf(size, 0); - mem_buffer::pattern_fill(&sendbuf[0], sendbuf.size(), SEED1); - ucs_assert(SEED1 == *(uint64_t*)&sendbuf[0]); - - /* exhaust all resources or time out 1sec */ - ucs_time_t loop_end_limit = ucs_get_time() + ucs_time_from_sec(1.0); - do { - status = uct_ep_am_short(sender_ep(), AM_ID, SEED1, - ((uint64_t*)&sendbuf[0]) + 1, - sendbuf.size() - sizeof(uint64_t)); - } while ((ucs_get_time() < loop_end_limit) && (status == UCS_OK)); - if (status != UCS_ERR_NO_RESOURCE) { - ASSERT_UCS_OK(status); - } + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) +{ + am_max_multi(static_cast(&uct_p2p_am_test::am_short)); +} - /* should be able to send again after a while */ - ucs_time_t deadline = ucs_get_time() + - (ucs::test_time_multiplier() * - ucs_time_from_sec(DEFAULT_TIMEOUT_SEC)); - do { - progress(); - status = uct_ep_am_short(sender_ep(), AM_ID, SEED1, NULL, 0); - } while ((status == UCS_ERR_NO_RESOURCE) && (ucs_get_time() < deadline)); - EXPECT_EQ(UCS_OK, status); +UCS_TEST_SKIP_COND_P(uct_p2p_am_misc, am_max_short_iov_multi, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) +{ + am_max_multi(static_cast(&uct_p2p_am_test::am_short_iov)); } UCT_INSTANTIATE_TEST_CASE(uct_p2p_am_misc) @@ -707,3 +746,181 @@ UCS_TEST_P(uct_p2p_am_tx_bufs, am_tx_max_bufs) { } UCT_INSTANTIATE_TEST_CASE(uct_p2p_am_tx_bufs) + +class uct_p2p_am_alignment : public uct_p2p_am_test { +public: + uct_p2p_am_alignment() + : m_am_received(false), m_alignment(1), m_align_offset(0) + { + } + + void init() + { + if (has_ugni() || has_gpu() || has_transport("tcp") || + has_transport("cma") || has_transport("knem") || + has_transport("xpmem")) { + UCS_TEST_SKIP_R(GetParam()->tl_name + + " does not support AM alignment"); + } + // Do not call init method of uct_p2p_am_test and others to avoid + // creating UCT instances with basic iface params + uct_test::init(); + } + + void test_align(send_func_t send_f, bool set_offset) + { + size_t data_length = SIZE_MAX; + + m_alignment = pow(2, ucs::rand() % 13); + + if (set_offset) { + // Offset has to be: + // 1. Smaller than alignment. + // 2. Smaller than UCT iface element size. For now assume element + // size is always bigger than 128. TODO: Fix this when new + // interface for querying maximal alignment offset is defined. + m_align_offset = ucs_min(128, ucs::rand() % m_alignment); + } else { + m_align_offset = 0; + } + + UCS_TEST_MESSAGE << "alignment: " << m_alignment << ", offset: " + << (set_offset ? 
ucs::to_string(m_align_offset) : "none"); + + create_connected_entities(0ul, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + m_alignment, m_align_offset); + check_skip_test(); + + if (sender().iface_attr().cap.flags & UCT_IFACE_FLAG_AM_SHORT) { + data_length = sender().iface_attr().cap.am.max_short; + } + + if (sender().iface_attr().cap.flags & UCT_IFACE_FLAG_AM_BCOPY) { + data_length = ucs_min(data_length, + sender().iface_attr().cap.am.max_bcopy); + } + + if (sender().iface_attr().cap.flags & UCT_IFACE_FLAG_AM_ZCOPY) { + data_length = ucs_min(data_length, + sender().iface_attr().cap.am.max_zcopy); + } + + m_am_received = false; + disable_comp(); + + ASSERT_UCS_OK(uct_iface_set_am_handler(receiver().iface(), AM_ID, + am_handler, this, 0)); + + mapped_buffer sendbuf(data_length, 0, sender()); + ucs_status_t status = (this->*send_f)(sender_ep(), sendbuf, sendbuf); + ASSERT_UCS_OK_OR_INPROGRESS(status); + wait_for_flag(&m_am_received); + } + + void test_invalid_alignment(size_t alignment, size_t align_offset, + uint64_t field_mask) + { + entity *dummy = uct_test::create_entity(0); + m_entities.push_back(dummy); + uct_iface_params_t params = dummy->iface_params(); + + check_skip_test(); + + if (field_mask & UCT_IFACE_PARAM_FIELD_AM_ALIGNMENT) { + params.am_alignment = alignment; + } + + if (field_mask & UCT_IFACE_PARAM_FIELD_AM_ALIGN_OFFSET) { + params.am_align_offset = align_offset; + } + + params.field_mask |= field_mask; + + scoped_log_handler wrap_err(wrap_errors_logger); + uct_iface_h iface; + ucs_status_t status = uct_iface_open(dummy->md(), dummy->worker(), + &params, m_iface_config, &iface); + EXPECT_EQ(UCS_ERR_INVALID_PARAM, status) << "alignment " << alignment; + } + + static ucs_status_t + am_handler(void *arg, void *data, size_t length, unsigned flags) + { + uct_p2p_am_alignment *self = reinterpret_cast<uct_p2p_am_alignment*>( + arg); + EXPECT_FALSE(self->m_am_received); + + if (flags & UCT_CB_PARAM_FLAG_DESC) { + void *aligned_data = UCS_PTR_BYTE_OFFSET(data, + self->m_align_offset); + EXPECT_EQ(0u, ((uintptr_t)aligned_data) % self->m_alignment) + << "aligned data ptr " << aligned_data; + } else { + UCS_TEST_MESSAGE << "Alignment is not supported for inlined data"; + } + + self->m_am_received = true; + + return UCS_OK; + } + +private: + bool m_am_received; + size_t m_alignment; + size_t m_align_offset; +}; + + +UCS_TEST_P(uct_p2p_am_alignment, invalid_align) +{ + test_invalid_alignment(0, 0, UCT_IFACE_PARAM_FIELD_AM_ALIGNMENT); + test_invalid_alignment(3, 1, UCT_IFACE_PARAM_FIELD_AM_ALIGNMENT); +} + +UCS_TEST_P(uct_p2p_am_alignment, invalid_offset) +{ + // Alignment offset has no meaning if alignment is not requested + test_invalid_alignment(0, 11, UCT_IFACE_PARAM_FIELD_AM_ALIGN_OFFSET); + + // Align offset must be less than alignment itself + test_invalid_alignment(8, 8, UCT_IFACE_PARAM_FIELD_AM_ALIGN_OFFSET); + test_invalid_alignment(8, 11, UCT_IFACE_PARAM_FIELD_AM_ALIGN_OFFSET); +} + +UCS_TEST_SKIP_COND_P(uct_p2p_am_alignment, align_short, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) +{ + test_align(static_cast<send_func_t>(&uct_p2p_am_test::am_short), false); +} + +UCS_TEST_SKIP_COND_P(uct_p2p_am_alignment, align_short_with_offset, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) +{ + test_align(static_cast<send_func_t>(&uct_p2p_am_test::am_short), true); +} + +UCS_TEST_SKIP_COND_P(uct_p2p_am_alignment, align_bcopy, + !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) +{ 
test_align(static_cast(&uct_p2p_am_test::am_bcopy), true); +} + +UCS_TEST_SKIP_COND_P(uct_p2p_am_alignment, align_zcopy, + !check_caps(UCT_IFACE_FLAG_AM_ZCOPY)) +{ + test_align(static_cast(&uct_p2p_am_test::am_zcopy), false); +} + +UCS_TEST_SKIP_COND_P(uct_p2p_am_alignment, align_zcopy_with_offset, + !check_caps(UCT_IFACE_FLAG_AM_ZCOPY)) +{ + test_align(static_cast(&uct_p2p_am_test::am_zcopy), true); +} + +UCT_INSTANTIATE_TEST_CASE(uct_p2p_am_alignment) diff --git a/test/gtest/uct/test_p2p_err.cc b/test/gtest/uct/test_p2p_err.cc index e1d9e7f8ad6..362422ea7d3 100644 --- a/test/gtest/uct/test_p2p_err.cc +++ b/test/gtest/uct/test_p2p_err.cc @@ -16,6 +16,7 @@ class uct_p2p_err_test : public uct_p2p_test { OP_PUT_BCOPY, OP_PUT_ZCOPY, OP_AM_SHORT, + OP_AM_SHORT_IOV, OP_AM_BCOPY, OP_AM_ZCOPY }; @@ -71,6 +72,11 @@ class uct_p2p_err_test : public uct_p2p_test { case OP_AM_SHORT: status = uct_ep_am_short(sender_ep(), am_id, 0, buffer, length); break; + case OP_AM_SHORT_IOV: { + UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, buffer, length, memh, 1); + status = uct_ep_am_short_iov(sender_ep(), am_id, iov, iovcnt); + break; + } case OP_AM_BCOPY: arg.buffer = buffer; arg.length = length; @@ -231,6 +237,23 @@ UCS_TEST_SKIP_COND_P(uct_p2p_err_test, invalid_am_short_length, recvbuf.pattern_check(2); } +UCS_TEST_SKIP_COND_P(uct_p2p_err_test, invalid_am_short_iov_length, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) +{ + size_t max_short = sender().iface_attr().cap.am.max_short; + if (max_short > (2 * UCS_MBYTE)) { + UCS_TEST_SKIP_R("max_short too large"); + } + + mapped_buffer sendbuf(max_short + 1, 1, sender()); + mapped_buffer recvbuf(max_short + 1, 2, receiver()); + + test_error_run(OP_AM_SHORT_IOV, 0, sendbuf.ptr(), sendbuf.length(), + UCT_MEM_HANDLE_NULL, recvbuf.addr(), recvbuf.rkey(), "length"); + + recvbuf.pattern_check(2); +} + UCS_TEST_SKIP_COND_P(uct_p2p_err_test, invalid_am_bcopy_length, !check_caps(UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_ERRHANDLE_BCOPY_LEN)) { @@ -279,6 +302,15 @@ UCS_TEST_SKIP_COND_P(uct_p2p_err_test, short_invalid_am_id, "active message id"); } +UCS_TEST_SKIP_COND_P(uct_p2p_err_test, short_iov_invalid_am_id, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) +{ + mapped_buffer sendbuf(4, 2, sender()); + + test_error_run(OP_AM_SHORT_IOV, UCT_AM_ID_MAX, sendbuf.ptr(), sendbuf.length(), + UCT_MEM_HANDLE_NULL, 0, UCT_INVALID_RKEY, "active message id"); +} + UCS_TEST_SKIP_COND_P(uct_p2p_err_test, bcopy_invalid_am_id, !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) { mapped_buffer sendbuf(4, 2, sender()); diff --git a/test/gtest/uct/test_p2p_mix.cc b/test/gtest/uct/test_p2p_mix.cc index 4d6266e3fc3..2d1f84f02a0 100644 --- a/test/gtest/uct/test_p2p_mix.cc +++ b/test/gtest/uct/test_p2p_mix.cc @@ -88,6 +88,26 @@ ucs_status_t uct_p2p_mix_test::am_short(const mapped_buffer &sendbuf, return status; } +ucs_status_t uct_p2p_mix_test::am_short_iov(const mapped_buffer &sendbuf, + const mapped_buffer &recvbuf, + uct_completion_t *comp) +{ + ucs_status_t status; + uct_iov_t iov; + + iov.buffer = sendbuf.ptr(); + iov.length = sendbuf.length(); + iov.count = 1; + iov.stride = 0; + iov.memh = sendbuf.memh(); + + status = uct_ep_am_short_iov(sender().ep(0), AM_ID, &iov, 1); + if (status == UCS_OK) { + ucs_atomic_add32(&am_pending, +1); + } + return status; +} + ucs_status_t uct_p2p_mix_test::am_zcopy(const mapped_buffer &sendbuf, const mapped_buffer &recvbuf, uct_completion_t *comp) @@ -170,6 +190,9 @@ void uct_p2p_mix_test::init() { if (sender().iface_attr().cap.flags & UCT_IFACE_FLAG_AM_SHORT) { 
m_avail_send_funcs.push_back(&uct_p2p_mix_test::am_short); m_send_size = ucs_min(m_send_size, sender().iface_attr().cap.am.max_short); + + m_avail_send_funcs.push_back(&uct_p2p_mix_test::am_short_iov); + m_send_size = ucs_min(m_send_size, sender().iface_attr().cap.am.max_short); } if (sender().iface_attr().cap.flags & UCT_IFACE_FLAG_AM_ZCOPY) { m_avail_send_funcs.push_back(&uct_p2p_mix_test::am_zcopy); diff --git a/test/gtest/uct/test_p2p_mix.h b/test/gtest/uct/test_p2p_mix.h index 51d9bdc834e..24be23ae9a4 100644 --- a/test/gtest/uct/test_p2p_mix.h +++ b/test/gtest/uct/test_p2p_mix.h @@ -49,6 +49,9 @@ class uct_p2p_mix_test : public uct_p2p_test { const mapped_buffer &recvbuf, uct_completion_t *comp); + ucs_status_t am_short_iov(const mapped_buffer &sendbuf, + const mapped_buffer &recvbuf, uct_completion_t *comp); + ucs_status_t am_zcopy(const mapped_buffer &sendbuf, const mapped_buffer &recvbuf, uct_completion_t *comp); diff --git a/test/gtest/uct/test_peer_failure.cc b/test/gtest/uct/test_peer_failure.cc index 84cf85a760e..06bc260232c 100644 --- a/test/gtest/uct/test_peer_failure.cc +++ b/test/gtest/uct/test_peer_failure.cc @@ -8,6 +8,12 @@ #include "test_peer_failure.h" +#if HAVE_CUDA +extern "C" { +#include +} +#endif + size_t test_uct_peer_failure::m_req_purge_count = 0ul; const uint64_t test_uct_peer_failure::m_required_caps = UCT_IFACE_FLAG_AM_SHORT | @@ -84,9 +90,20 @@ void test_uct_peer_failure::purge_cb(uct_pending_req_t *self, void *arg) ucs_status_t test_uct_peer_failure::err_cb(void *arg, uct_ep_h ep, ucs_status_t status) { - EXPECT_EQ(UCS_ERR_ENDPOINT_TIMEOUT, status); - reinterpret_cast(arg)->m_err_count++; - return UCS_OK; + test_uct_peer_failure *self = reinterpret_cast(arg); + + self->m_err_count++; + + switch (status) { + case UCS_ERR_ENDPOINT_TIMEOUT: + case UCS_ERR_CONNECTION_RESET: + case UCS_ERR_CANCELED: /* goes from ib flushed QP */ + return UCS_OK; + default: + EXPECT_TRUE(false) << "unexpected error status: " + << ucs_status_string(status); + return status; + } } void test_uct_peer_failure::kill_receiver() @@ -236,36 +253,6 @@ UCS_TEST_SKIP_COND_P(test_uct_peer_failure, peer_failure, flush(); } - UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, NULL, 0, NULL, 1); - - /* Check that all ep operations return pre-defined error code */ - EXPECT_EQ(uct_ep_am_short(ep0(), 0, 0, NULL, 0), UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_am_bcopy(ep0(), 0, NULL, NULL, 0), UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_am_zcopy(ep0(), 0, NULL, 0, iov, iovcnt, 0, NULL), - UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_put_short(ep0(), NULL, 0, 0, 0), UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_put_bcopy(ep0(), NULL, NULL, 0, 0), UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_put_zcopy(ep0(), iov, iovcnt, 0, 0, NULL), - UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_get_bcopy(ep0(), NULL, NULL, 0, 0, 0, NULL), - UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_get_zcopy(ep0(), iov, iovcnt, 0, 0, NULL), - UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_atomic64_post(ep0(), UCT_ATOMIC_OP_ADD, 0, 0, 0), UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_atomic32_post(ep0(), UCT_ATOMIC_OP_ADD, 0, 0, 0), UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_atomic64_fetch(ep0(), UCT_ATOMIC_OP_ADD, 0, NULL, 0, 0, NULL), - UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_atomic32_fetch(ep0(), UCT_ATOMIC_OP_ADD, 0, NULL, 0, 0, NULL), - UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_atomic_cswap64(ep0(), 0, 0, 0, 0, NULL, NULL), - UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_atomic_cswap32(ep0(), 0, 0, 0, 0, NULL, NULL), 
- UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_flush(ep0(), 0, NULL), UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_get_address(ep0(), NULL), UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_EQ(uct_ep_pending_add(ep0(), NULL, 0), UCS_ERR_BUSY); - EXPECT_EQ(uct_ep_connect_to_ep(ep0(), NULL, NULL), UCS_ERR_ENDPOINT_TIMEOUT); - EXPECT_GT(m_err_count, 0ul); } @@ -308,7 +295,9 @@ UCS_TEST_SKIP_COND_P(test_uct_peer_failure, purge_failed_peer, flush(); } - EXPECT_EQ(UCS_ERR_ENDPOINT_TIMEOUT, uct_ep_am_short(ep0(), 0, 0, NULL, 0)); + EXPECT_GE(m_err_count, 0ul); + + /* the outcome of any new operation is undefined */ uct_ep_pending_purge(ep0(), purge_cb, NULL); EXPECT_EQ(num_pend_sends, m_req_purge_count); @@ -335,13 +324,11 @@ UCS_TEST_SKIP_COND_P(test_uct_peer_failure, two_pairs_send, } /* test flushing one operations */ - send_recv_am(0, UCS_ERR_ENDPOINT_TIMEOUT); send_recv_am(1, UCS_OK); flush(); /* test flushing many operations */ for (size_t i = 0; i < (m_tx_window * 10 / ucs::test_time_multiplier()); ++i) { - send_recv_am(0, UCS_ERR_ENDPOINT_TIMEOUT); send_recv_am(1, UCS_OK); } flush(); @@ -356,14 +343,13 @@ UCS_TEST_SKIP_COND_P(test_uct_peer_failure, two_pairs_send_after, { scoped_log_handler slh(wrap_errors_logger); kill_receiver(); - for (int i = 0; i < 100; ++i) { + for (int i = 0; (i < 100) && (m_err_count == 0); ++i) { send_am(0); } flush(); } - send_recv_am(0, UCS_ERR_ENDPOINT_TIMEOUT); - + wait_for_value(&m_err_count, size_t(1), true); m_am_count = 0; send_am(1); ucs_debug("flushing"); @@ -375,32 +361,6 @@ UCS_TEST_SKIP_COND_P(test_uct_peer_failure, two_pairs_send_after, UCT_INSTANTIATE_TEST_CASE(test_uct_peer_failure) -class test_uct_peer_failure_cb : public test_uct_peer_failure { -public: - virtual uct_error_handler_t get_err_handler() const { - return err_cb_ep_destroy; - } - - static ucs_status_t err_cb_ep_destroy(void *arg, uct_ep_h ep, ucs_status_t status) { - test_uct_peer_failure_cb *self(reinterpret_cast<test_uct_peer_failure_cb*>(arg)); - EXPECT_EQ(self->ep0(), ep); - self->m_sender->destroy_ep(0); - return UCS_OK; - } -}; - -UCS_TEST_SKIP_COND_P(test_uct_peer_failure_cb, desproy_ep_cb, - !check_caps(UCT_IFACE_FLAG_PUT_SHORT | - m_required_caps)) -{ - scoped_log_handler slh(wrap_errors_logger); - kill_receiver(); - EXPECT_EQ(uct_ep_put_short(ep0(), NULL, 0, 0, 0), UCS_OK); - flush(); -} - -UCT_INSTANTIATE_TEST_CASE(test_uct_peer_failure_cb) - class test_uct_peer_failure_multiple : public test_uct_peer_failure { public: @@ -475,8 +435,9 @@ UCS_TEST_SKIP_COND_P(test_uct_peer_failure_multiple, test, !check_caps(m_required_caps)), "RC_TM_ENABLE?=n") { + /* with DC, peer failure may leave peer DCRs stuck and cause a very long + * DCT close */ ucs_time_t timeout = ucs_get_time() + - ucs_time_from_sec(200 * ucs::test_time_multiplier()); + ucs_time_from_sec(300 * ucs::test_time_multiplier()); { scoped_log_handler slh(wrap_errors_logger); @@ -489,19 +450,18 @@ UCS_TEST_SKIP_COND_P(test_uct_peer_failure_multiple, test, flush(timeout); /* if EPs are not failed yet, these ops should trigger that */ - for (size_t idx = 0; idx < m_nreceivers - 1; ++idx) { + for (size_t idx = 0; (idx < m_nreceivers - 1) && + (m_err_count == 0); ++idx) { for (size_t i = 0; i < m_tx_window; ++i) { - send_am(idx); + if (UCS_STATUS_IS_ERR(send_am(idx))) { + break; + } } } flush(timeout); } - for (size_t idx = 0; idx < m_nreceivers - 1; ++idx) { - send_recv_am(idx, UCS_ERR_ENDPOINT_TIMEOUT); - } - m_am_count = 0; send_am(m_nreceivers - 1); ucs_debug("flushing"); @@ -513,24 +473,109 @@ UCS_TEST_SKIP_COND_P(test_uct_peer_failure_multiple, test, 
@@ -513,24 +473,109 @@ UCT_INSTANTIATE_TEST_CASE(test_uct_peer_failure_multiple) +class test_uct_keepalive : public ucs::test { +public: + test_uct_keepalive() + { + m_ka = NULL; + m_pid = getpid(); + } + + void init() + { + m_err_handler_count = 0; + + ASSERT_UCS_OK(uct_ep_keepalive_create(m_pid, &m_ka)); + } + + void cleanup() + { + ucs_free(m_ka); + } + + static ucs_status_t + err_handler_cb(void *arg, uct_ep_h ep, ucs_status_t status) + { + m_err_handler_count++; + return status; + } + +protected: + uct_keepalive_info_t *m_ka; + pid_t m_pid; + static unsigned m_err_handler_count; +}; + + +unsigned test_uct_keepalive::m_err_handler_count = 0; + + +UCS_TEST_F(test_uct_keepalive, ep_check) +{ + uct_base_iface_t iface = {}; + uct_ep_t ep = {}; + + iface.err_handler = err_handler_cb; + iface.err_handler_arg = &m_err_handler_count; + ep.iface = &iface.super; + + for (unsigned i = 0; i < 10; ++i) { + ucs_status_t status = uct_ep_keepalive_check(&ep, &m_ka, m_pid, 0, + NULL); + EXPECT_UCS_OK(status); + } + + /* change start time saved in KA to force an error from EP check */ + m_ka->start_time--; + + ucs_status_t status = uct_ep_keepalive_check(&ep, &m_ka, m_pid, 0, NULL); + EXPECT_EQ(UCS_ERR_ENDPOINT_TIMEOUT, status); + EXPECT_EQ(1u, m_err_handler_count); +} + + class test_uct_peer_failure_keepalive : public test_uct_peer_failure { public: + test_uct_peer_failure_keepalive() + { + m_env.push_back(new ucs::scoped_setenv("UCX_TCP_KEEPIDLE", "inf")); + } + void kill_receiver() { /* Hack: for SHM-based transports we can't really terminate the * peer EP; instead we slightly change the process owner info to force * an ep_check failure, simulating the case when the peer process is * terminated and its PID is immediately reused by another process */ - uct_ep_h tl_ep = ep0(); + uct_ep_h tl_ep = ep0(); + uct_keepalive_info_t *ka_info = NULL; + if (has_mm()) { uct_mm_ep_t *ep = ucs_derived_of(tl_ep, uct_mm_ep_t); - ASSERT_NE((void*)NULL, ep->keepalive); - ep->keepalive->starttime--; + ka_info = ep->keepalive; + ASSERT_TRUE(ka_info != NULL); + } else if (has_cuda_ipc()) { +#if HAVE_CUDA + uct_cuda_ipc_ep_t *ep = ucs_derived_of(tl_ep, uct_cuda_ipc_ep_t); + ka_info = ep->keepalive; + ASSERT_TRUE(ka_info != NULL); +#endif + } else if (has_cma()) { + uct_cma_ep_t *ep = ucs_derived_of(tl_ep, uct_cma_ep_t); + ka_info = ep->keepalive; + ASSERT_TRUE(ka_info != NULL); + } + + if (ka_info != NULL) { + ka_info->start_time--; } test_uct_peer_failure::kill_receiver(); } + +protected: + ucs::ptr_vector<ucs::scoped_setenv> m_env; }; UCS_TEST_SKIP_COND_P(test_uct_peer_failure_keepalive, killed, @@ -546,13 +591,81 @@ UCS_TEST_SKIP_COND_P(test_uct_peer_failure_keepalive, killed, ASSERT_UCS_OK(status); flush(); + /* allow keepalive requests to complete */ + short_progress_loop(); + + /* we are still alive */ + EXPECT_EQ(0, m_err_count); + kill_receiver(); status = uct_ep_check(ep0(), 0, NULL); ASSERT_UCS_OK(status); flush(); + wait_for_flag(&m_err_count); EXPECT_EQ(1, m_err_count); } UCT_INSTANTIATE_NO_SELF_TEST_CASE(test_uct_peer_failure_keepalive) +_UCT_INSTANTIATE_TEST_CASE(test_uct_peer_failure_keepalive, cuda_ipc); + +
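The two keepalive tests above follow a create-then-poll pattern. A condensed sketch of the flow they rely on (editor's illustration, using only calls that appear in this patch; 'ep' is assumed to be an EP whose iface has an error handler installed):

uct_keepalive_info_t *ka = NULL;
pid_t pid = getpid();

ASSERT_UCS_OK(uct_ep_keepalive_create(pid, &ka));

/* while the watched process exists and its recorded start time matches,
 * the check succeeds */
ucs_status_t status = uct_ep_keepalive_check(ep, &ka, pid, 0, NULL);
/* status == UCS_OK */

/* simulate PID reuse: the stored start time no longer matches, so the
 * check fails and the iface error handler fires */
ka->start_time--;
status = uct_ep_keepalive_check(ep, &ka, pid, 0, NULL);
/* status == UCS_ERR_ENDPOINT_TIMEOUT */

ucs_free(ka);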
+class test_uct_peer_failure_rma_zcopy : public test_uct_peer_failure +{ +public: + static const uint64_t INVALID_ADDRESS = UINT64_MAX; + + test_uct_peer_failure_rma_zcopy() + { + m_dummy_comp.func = NULL; + m_dummy_comp.count = INT_MAX; + m_dummy_comp.status = UCS_OK; + } + + void test_rma_zcopy_peer_failure(bool is_put_op) + { + { + scoped_log_handler slh(wrap_errors_logger); + const size_t size = 128; + mapped_buffer sendbuf(size, 0, *m_sender); + ucs_status_t status; + + // Simulate a peer failure by using an invalid remote address + if (is_put_op) { + status = uct_ep_put_zcopy(m_sender->ep(0), + sendbuf.iov(), 1, + INVALID_ADDRESS, 0ul, &m_dummy_comp); + } else { + status = uct_ep_get_zcopy(m_sender->ep(0), + sendbuf.iov(), 1, + INVALID_ADDRESS, 0ul, &m_dummy_comp); + } + EXPECT_FALSE(UCS_STATUS_IS_ERR(status)) + << ucs_status_string(status); + + flush(); + } + + EXPECT_GT(m_err_count, 0ul); + } + + uct_completion_t m_dummy_comp; +}; + +UCS_TEST_SKIP_COND_P(test_uct_peer_failure_rma_zcopy, put, + !check_caps(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE | + UCT_IFACE_FLAG_PUT_ZCOPY)) +{ + test_rma_zcopy_peer_failure(true); +} + +UCS_TEST_SKIP_COND_P(test_uct_peer_failure_rma_zcopy, get, + !check_caps(UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE | + UCT_IFACE_FLAG_GET_ZCOPY)) +{ + test_rma_zcopy_peer_failure(false); +} + +_UCT_INSTANTIATE_TEST_CASE(test_uct_peer_failure_rma_zcopy, cma) + diff --git a/test/gtest/uct/test_stats.cc b/test/gtest/uct/test_stats.cc index eda38295296..f9d9ddf45f5 100644 --- a/test/gtest/uct/test_stats.cc +++ b/test/gtest/uct/test_stats.cc @@ -220,6 +220,30 @@ UCS_TEST_SKIP_COND_P(test_uct_stats, am_short, check_am_rx_counters(sizeof(hdr) + sizeof(send_data)); } +UCS_TEST_SKIP_COND_P(test_uct_stats, am_short_iov, + !check_caps(UCT_IFACE_FLAG_AM_SHORT)) +{ + ucs_status_t status; + + init_bufs(0, sender().iface_attr().cap.am.max_short); + + status = uct_iface_set_am_handler(receiver().iface(), 0, am_handler, 0, + UCT_CB_FLAG_ASYNC); + EXPECT_UCS_OK(status); + + UCS_TEST_GET_BUFFER_IOV( + iov, iovcnt, lbuf->ptr(), lbuf->length(), lbuf->memh(), + ucs_min(lbuf->length(), sender().iface_attr().cap.am.max_iov)); + + UCT_TEST_CALL_AND_TRY_AGAIN(uct_ep_am_short_iov(sender_ep(), 0, iov, iovcnt), + status); + EXPECT_UCS_OK(status); + + EXPECT_STAT(sender, uct_ep, UCT_EP_STAT_AM, 1UL); + EXPECT_STAT(sender, uct_ep, UCT_EP_STAT_BYTES_SHORT, lbuf->length()); + check_am_rx_counters(lbuf->length()); +} + UCS_TEST_SKIP_COND_P(test_uct_stats, am_bcopy, !check_caps(UCT_IFACE_FLAG_AM_BCOPY)) {
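The stats test above exercises the new short-IOV active-message entry point. A usage sketch (editor's illustration; the payload buffers are hypothetical, and the UCT_TEST_CALL_AND_TRY_AGAIN wrapper in the test presumably retries while the transport returns UCS_ERR_NO_RESOURCE):

/* Sketch: sending a two-entry IOV with uct_ep_am_short_iov() */
char part1[4] = "abc";
char part2[4] = "def";
uct_iov_t iov[2];

iov[0].buffer = part1;
iov[0].length = sizeof(part1);
iov[0].count  = 1;
iov[0].stride = 0;
iov[0].memh   = UCT_MEM_HANDLE_NULL;
iov[1]        = iov[0];
iov[1].buffer = part2;

/* total length must fit cap.am.max_short; iovcnt must fit cap.am.max_iov */
ucs_status_t status = uct_ep_am_short_iov(ep, 0 /* am_id */, iov, 2);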
diff --git a/test/gtest/uct/test_tag.cc b/test/gtest/uct/test_tag.cc index d6ffe5a1a97..3f74a432663 100644 --- a/test/gtest/uct/test_tag.cc +++ b/test/gtest/uct/test_tag.cc @@ -63,25 +63,10 @@ class test_tag : public uct_test { uct_test::init(); - entity *sender = uct_test::create_entity(0ul, NULL, unexp_eager, - unexp_rndv, - reinterpret_cast<void*>(this), - reinterpret_cast<void*>(this)); - m_entities.push_back(sender); - + uct_test::create_connected_entities(0ul, NULL, unexp_eager, unexp_rndv, + reinterpret_cast<void*>(this), + reinterpret_cast<void*>(this)); check_skip_test(); - - if (UCT_DEVICE_TYPE_SELF == GetParam()->dev_type) { - sender->connect(0, *sender, 0); - } else { - entity *receiver = uct_test::create_entity(0ul, NULL, unexp_eager, - unexp_rndv, - reinterpret_cast<void*>(this), - reinterpret_cast<void*>(this)); - m_entities.push_back(receiver); - - sender->connect(0, *receiver, 0); - } } void init_send_ctx(send_ctx &s,mapped_buffer *b, uct_tag_t t, uint64_t i, @@ -371,25 +356,32 @@ class test_tag : public uct_test { user_ctx->consumed = true; } + static void verify_completed(recv_ctx *user_ctx, uct_tag_t stag, size_t length) + { + EXPECT_EQ(user_ctx->tag, (stag & user_ctx->tmask)); + EXPECT_EQ(user_ctx->mbuf->length(), length); + } + static void completed(uct_tag_context_t *self, uct_tag_t stag, uint64_t imm, - size_t length, ucs_status_t status) + size_t length, void *inline_data, ucs_status_t status) { recv_ctx *user_ctx = ucs_container_of(self, recv_ctx, uct_ctx); user_ctx->comp = true; user_ctx->status = status; - EXPECT_EQ(user_ctx->tag, (stag & user_ctx->tmask)); - EXPECT_EQ(user_ctx->mbuf->length(), length); + verify_completed(user_ctx, stag, length); } static void sw_rndv_completed(uct_tag_context_t *self, uct_tag_t stag, const void *header, unsigned header_length, - ucs_status_t status) + ucs_status_t status, unsigned flags) { recv_ctx *user_ctx = ucs_container_of(self, recv_ctx, uct_ctx); user_ctx->sw_rndv = true; user_ctx->status = status; - EXPECT_EQ(user_ctx->tag, (stag & user_ctx->tmask)); - EXPECT_EQ(user_ctx->mbuf->length(), header_length); + if (flags & UCT_TAG_RECV_CB_INLINE_DATA) { + memcpy(user_ctx->mbuf->ptr(), header, header_length); + } + verify_completed(user_ctx, stag, header_length); } static ucs_status_t unexp_eager(void *arg, void *data, size_t length, diff --git a/test/gtest/uct/test_uct_ep.cc b/test/gtest/uct/test_uct_ep.cc index 81ba4079e7b..5411eba71d7 100644 --- a/test/gtest/uct/test_uct_ep.cc +++ b/test/gtest/uct/test_uct_ep.cc @@ -15,26 +15,38 @@ class test_uct_ep : public uct_test { void init() { uct_test::init(); - m_sender = uct_test::create_entity(0); - m_entities.push_back(m_sender); - - check_skip_test(); + m_sender = NULL; m_receiver = uct_test::create_entity(0); m_entities.push_back(m_receiver); + check_skip_test(); + uct_iface_set_am_handler(m_receiver->iface(), 1, (uct_am_callback_t)ucs_empty_function_return_success, NULL, UCT_CB_FLAG_ASYNC); } - void connect() { - m_sender->connect(0, *m_receiver, 0); - short_progress_loop(10); /* Some transports need time to become ready */ + void create_sender() + { + m_sender = uct_test::create_entity(0); + m_entities.push_back(m_sender); } - void disconnect() { + void connect() + { + m_sender->connect(0, *m_receiver, 0); + + /* Some transports need time to become ready */ flush(); + } + + void disconnect(bool should_flush = true) + { + if (should_flush) { + flush(); + } + if (m_receiver->iface_attr().cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) { m_receiver->destroy_ep(0); } @@ -49,6 +61,46 @@ class test_uct_ep : public uct_test { #endif } + struct test_ep_comp_t { + uct_completion_t comp; + test_uct_ep *test; + uct_ep_h ep; + }; + + static void completion_cb(uct_completion_t *comp) + { + test_ep_comp_t *ep_comp = ucs_container_of(comp, test_ep_comp_t, comp); + + EXPECT_TRUE(ep_comp->ep != NULL); + /* Check that the completion callback is not invoked after EP destroy */ + EXPECT_EQ(ep_comp->test->m_sender->ep(0), ep_comp->ep); + } + + static ucs_log_func_rc_t + detect_uncomp_op_logger(const char *file, unsigned line, + const char *function, ucs_log_level_t level, + const ucs_log_component_config_t *comp_conf, + const char *message, va_list ap) + { + if (level == UCS_LOG_LEVEL_WARN) { + std::string err_str = format_message(message, ap); + if (err_str.find("with uncompleted operation") != + std::string::npos) { + return UCS_LOG_FUNC_RC_STOP; + } + } + return UCS_LOG_FUNC_RC_CONTINUE; + } + + void handle_status(ucs_status_t status, test_ep_comp_t &comp) + { + if (status == UCS_INPROGRESS) { + ++comp.comp.count; + } else if (status != UCS_OK) { + EXPECT_EQ(UCS_ERR_NO_RESOURCE, status); + } + } + entity * m_sender; entity * m_receiver; };
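The handle_status helper above encodes the canonical UCT send-status convention that the following tests rely on. A condensed sketch of the usual retry loop (editor's illustration; 'worker', 'iov' and 'comp' are assumed to exist):

/* UCS_OK              - completed immediately
 * UCS_INPROGRESS      - 'comp' will be invoked later; count it
 * UCS_ERR_NO_RESOURCE - progress the worker and retry */
ucs_status_t status;
do {
    status = uct_ep_am_zcopy(ep, 1, NULL, 0, iov, iovcnt, 0, &comp);
    if (status == UCS_ERR_NO_RESOURCE) {
        uct_worker_progress(worker);
    }
} while (status == UCS_ERR_NO_RESOURCE);

if (status == UCS_INPROGRESS) {
    ++comp.count; /* one more completion callback to wait for */
} else {
    ASSERT_UCS_OK(status);
}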
@@ -58,14 +110,21 @@ UCS_TEST_SKIP_COND_P(test_uct_ep, disconnect_after_send, skip_on_ib_dc())) { ucs_status_t status; + create_sender(); + mapped_buffer buffer(256, 0, *m_sender); UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, buffer.ptr(), - (ucs_min(buffer.length(), m_sender->iface_attr().cap.am.max_zcopy)), + (ucs_min(buffer.length(), + m_sender->iface_attr().cap.am.max_zcopy)), buffer.memh(), m_sender->iface_attr().cap.am.max_iov); unsigned max_iter = 300 / ucs::test_time_multiplier(); - for (unsigned i = 0; i < max_iter; ++i) { + + /* FIXME: need to investigate an RC/VERBS hang after ~200 iterations when the + * sender entity is created after the receiver one */ + unsigned max_retry_iter = has_transport("rc_verbs") ? 1 : max_iter; + for (unsigned i = 0; i < max_retry_iter; ++i) { connect(); for (unsigned count = 0; count < max_iter; ) { status = uct_ep_am_zcopy(m_sender->ep(0), 1, NULL, 0, iov, iovcnt, @@ -85,4 +144,64 @@ UCS_TEST_SKIP_COND_P(test_uct_ep, disconnect_after_send, } } +UCS_TEST_SKIP_COND_P(test_uct_ep, destroy_entity_after_send, + !check_caps(UCT_IFACE_FLAG_AM_ZCOPY)) +{ + const unsigned max_iter = 300 / ucs::test_time_multiplier(); + + for (unsigned i = 0; i < max_iter; ++i) { + create_sender(); + connect(); + + const uct_iface_attr &iface_attr = m_sender->iface_attr(); + const size_t msg_length = 256 * UCS_KBYTE; + ucs::auto_ptr<mapped_buffer> buffer( + new mapped_buffer(msg_length, 0, *m_sender)); + + UCS_TEST_GET_BUFFER_IOV(iov, iovcnt, buffer->ptr(), + ucs_min(buffer->length(), + iface_attr.cap.am.max_zcopy), + buffer->memh(), iface_attr.cap.am.max_iov); + + test_ep_comp_t comp; + + comp.comp.status = UCS_OK; + comp.comp.count = 0; + comp.comp.func = completion_cb; + comp.test = this; + comp.ep = m_sender->ep(0); + + for (unsigned count = 0; count < max_iter;) { + ucs_status_t status = uct_ep_am_zcopy(m_sender->ep(0), 1, NULL, 0, + iov, iovcnt, 0, &comp.comp); + handle_status(status, comp); + if (status == UCS_ERR_NO_RESOURCE) { + if (count == 0) { + progress(); + } else { + status = uct_ep_flush(m_sender->ep(0), UCT_FLUSH_FLAG_LOCAL, + &comp.comp); + handle_status(status, comp); + break; + } + } + ++count; + } + + if (comp.comp.count != 0) { + scoped_log_handler slh(detect_uncomp_op_logger); + /* Destroy the EP without flushing, so that outstanding AM Zcopy and + * flush operations are not completed during progress() */ + disconnect(false); + /* All outstanding operations must be completed by EP destroy */ + EXPECT_EQ(0, comp.comp.count); + } + + /* The mapped buffer has to be released before destroying the sender entity */ + buffer.reset(); + + m_entities.remove(m_sender); + } +} + UCT_INSTANTIATE_NO_SELF_TEST_CASE(test_uct_ep) diff --git a/test/gtest/uct/test_uct_perf.cc b/test/gtest/uct/test_uct_perf.cc index 81e8dde05e6..2ec9df00451 100644 --- a/test/gtest/uct/test_uct_perf.cc +++ b/test/gtest/uct/test_uct_perf.cc @@ -24,109 +24,148 @@ class test_uct_perf : public uct_test, public test_perf { const test_perf::test_spec test_uct_perf::tests[] = { - { "am latency", "usec", + { "am short latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_AM,
UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, + UCT_PERF_DATA_LAYOUT_SHORT_IOV, 0, 2, { 4, 4 }, 1, 100000lu, + ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 2.5, + 0 }, + + { "am short iov rate", "Mpps", + UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, + UCT_PERF_DATA_LAYOUT_SHORT_IOV, 0, 2, { 4, 4 }, 1, 2000000lu, + ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.8, 80.0, + 0 }, + + { "am short iov rate64", "Mpps", + UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, + UCT_PERF_DATA_LAYOUT_SHORT_IOV, 0, 2, { 32, 32 }, 1, 2000000lu, + ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.8, 80.0, + 0 }, + { "am bcopy latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 2.5}, { "am bcopy bw", "MB/sec", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 1000 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 620.0, 15000.0, 0 }, { "am zcopy bw", "MB/sec", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 1000 }, 32, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 600.0, 15000.0, 0 }, { "am zcopy bw flush ep", "MB/sec", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 1000 }, 32, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 600.0, 15000.0, UCX_PERF_TEST_FLAG_FLUSH_EP }, { "put latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 1.5, 0 }, { "put rate", "Mpps", UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 2000000lu, ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.8, 80.0, 0 }, { "put bcopy bw", "MB/sec", UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 2048 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 620.0, 50000.0, 0 }, { "put zcopy bw", "MB/sec", UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 2048 }, 32, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 620.0, 50000.0, 0 }, { "get latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5, 0 }, { "atomic add latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_PINGPONG, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5, 0 }, { "atomic add rate", "Mpps", UCX_PERF_API_UCT, UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 2000000lu, 
ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.5, 50.0, 0 }, { "atomic fadd latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_FADD, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5, 0 }, { "atomic cswap latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_CSWAP, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5, 0 }, { "atomic swap latency", "usec", UCX_PERF_API_UCT, UCX_PERF_CMD_SWAP, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu, ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5, 0 }, { "am iov bw", "MB/sec", UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI, + UCX_PERF_WAIT_MODE_POLL, UCT_PERF_DATA_LAYOUT_ZCOPY, 8192, 3, { 256, 256, 512 }, 32, 100000lu, ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 600.0, 15000.0, 0 }, diff --git a/test/gtest/uct/uct_p2p_test.cc b/test/gtest/uct/uct_p2p_test.cc index 17e856f400a..eddff8abb99 100644 --- a/test/gtest/uct/uct_p2p_test.cc +++ b/test/gtest/uct/uct_p2p_test.cc @@ -43,7 +43,8 @@ std::vector uct_p2p_test::enum_resources(const std::string& tl_ } } - return filter_resources(all_resources, tl_name); + return filter_resources(all_resources, + resource::is_equal_tl_name, tl_name); } uct_p2p_test::uct_p2p_test(size_t rx_headroom, @@ -147,8 +148,8 @@ void uct_p2p_test::test_xfer_print(O& os, send_func_t send, size_t length, void uct_p2p_test::test_xfer_multi(send_func_t send, size_t min_length, size_t max_length, unsigned flags) { - - for (int mem_type = 0; mem_type < UCS_MEMORY_TYPE_LAST; mem_type++) { + for (size_t i = 0; i < mem_buffer::supported_mem_types().size(); ++i) { + ucs_memory_type_t mem_type = mem_buffer::supported_mem_types()[i]; /* test mem type if md supports mem type * (or) if HOST MD can register mem type */ @@ -321,6 +322,11 @@ uct_completion_t *uct_p2p_test::comp() { } } +void uct_p2p_test::disable_comp() +{ + m_null_completion = true; +} + void uct_p2p_test::completion_cb(uct_completion_t *self) { completion *comp = ucs_container_of(self, completion, uct); ++comp->self->m_completion_count; diff --git a/test/gtest/uct/uct_p2p_test.h b/test/gtest/uct/uct_p2p_test.h index 700ce6d83af..7d22c5e9d0c 100644 --- a/test/gtest/uct/uct_p2p_test.h +++ b/test/gtest/uct/uct_p2p_test.h @@ -43,9 +43,10 @@ class uct_p2p_test : public uct_test { bool loopback; p2p_resource(const resource& res) : - resource(res.component, res.md_name, res.local_cpus, - res.tl_name, res.dev_name, res.dev_type), - loopback(false) { } + resource(res.component, res.component_name, res.md_name, + res.local_cpus, res.tl_name, res.dev_name, + res.dev_type), loopback(false) { + } }; virtual void test_xfer(send_func_t send, size_t length, unsigned flags, @@ -61,6 +62,7 @@ class uct_p2p_test : public uct_test { uct_ep_h sender_ep(); entity& receiver(); uct_completion_t *comp(); + void disable_comp(); private: template diff --git a/test/gtest/uct/uct_test.cc b/test/gtest/uct/uct_test.cc index 28890bc026d..789b0b5cafd 100644 --- a/test/gtest/uct/uct_test.cc +++ b/test/gtest/uct/uct_test.cc @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2021. ALL RIGHTS RESERVED. * * See file LICENSE for terms. 
*/ @@ -27,42 +27,58 @@ std::string resource::name() const { return ss.str(); } -resource::resource() : component(NULL), md_name(""), tl_name(""), dev_name(""), - variant_name(""), dev_type(UCT_DEVICE_TYPE_LAST), +resource::resource() : component(NULL), dev_type(UCT_DEVICE_TYPE_LAST), variant(DEFAULT_VARIANT) { CPU_ZERO(&local_cpus); } -resource::resource(uct_component_h component, const std::string& md_name, - const ucs_cpu_set_t& local_cpus, const std::string& tl_name, - const std::string& dev_name, uct_device_type_t dev_type) : - component(component), md_name(md_name), local_cpus(local_cpus), - tl_name(tl_name), dev_name(dev_name), variant_name(""), - dev_type(dev_type), variant(DEFAULT_VARIANT) +resource::resource(uct_component_h component, const std::string& component_name, + const std::string& md_name, const ucs_cpu_set_t& local_cpus, + const std::string& tl_name, const std::string& dev_name, + uct_device_type_t dev_type) : + component(component), component_name(component_name), + md_name(md_name), local_cpus(local_cpus), tl_name(tl_name), + dev_name(dev_name), dev_type(dev_type), + variant(DEFAULT_VARIANT) { } -resource::resource(uct_component_h component, const uct_md_attr_t& md_attr, +resource::resource(uct_component_h component, + const uct_component_attr& cmpt_attr, + const uct_md_attr_t& md_attr, const uct_md_resource_desc_t& md_resource, const uct_tl_resource_desc_t& tl_resource) : component(component), + component_name(cmpt_attr.name), md_name(md_resource.md_name), local_cpus(md_attr.local_cpus), tl_name(tl_resource.tl_name), dev_name(tl_resource.dev_name), - variant_name(""), dev_type(tl_resource.dev_type), variant(DEFAULT_VARIANT) { } -resource_speed::resource_speed(uct_component_h component, const uct_worker_h& worker, - const uct_md_h& md, const uct_md_attr_t& md_attr, +bool resource::is_equal_tl_name(const resource &rsc, const std::string &name) +{ + return rsc.tl_name == name; +} + +bool resource::is_equal_component_name(const resource &rsc, + const std::string &name) +{ + return rsc.component_name == name; +} + +resource_speed::resource_speed(uct_component_h component, + const uct_component_attr& cmpt_attr, + const uct_worker_h& worker, const uct_md_h& md, + const uct_md_attr_t& md_attr, const uct_md_resource_desc_t& md_resource, const uct_tl_resource_desc_t& tl_resource) : - resource(component, md_attr, md_resource, - tl_resource) { + resource(component, cmpt_attr, md_attr, + md_resource, tl_resource) { ucs_status_t status; uct_iface_params_t iface_params = { 0 }; uct_iface_config_t *iface_config; @@ -142,107 +158,131 @@ std::vector uct_test_base::enum_md_resources() { return all_md_resources; } -uct_test::uct_test() { +uct_test::uct_test() : m_iface_config(NULL), m_md_config(NULL), + m_cm_config(NULL) { uct_component_attr_t component_attr = {0}; ucs_status_t status; uct_md_attr_t md_attr; uct_md_h md; - status = uct_md_config_read(GetParam()->component, NULL, NULL, &m_md_config); + component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | + UCT_COMPONENT_ATTR_FIELD_FLAGS | + UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT; + /* coverity[var_deref_model] */ + status = uct_component_query(GetParam()->component, &component_attr); + ASSERT_UCS_OK(status); + + if (component_attr.flags & UCT_COMPONENT_FLAG_CM) { + status = uct_cm_config_read(GetParam()->component, NULL, NULL, + &m_cm_config); + ASSERT_UCS_OK(status); + } + + if (component_attr.md_resource_count == 0) { + return; + } + + status = uct_md_config_read(GetParam()->component, NULL, NULL, + &m_md_config); 
ASSERT_UCS_OK(status); status = uct_md_open(GetParam()->component, GetParam()->md_name.c_str(), m_md_config, &md); + if (status == UCS_ERR_UNSUPPORTED) { + return; + } + ASSERT_UCS_OK(status); status = uct_md_query(md, &md_attr); ASSERT_UCS_OK(status); - if (md_attr.cap.flags & UCT_MD_FLAG_SOCKADDR) { status = uct_md_iface_config_read(md, NULL, NULL, NULL, &m_iface_config); - } else if (!strcmp(GetParam()->tl_name.c_str(), "sockaddr")) { - m_iface_config = NULL; - } else { + } else if (!GetParam()->tl_name.empty()) { status = uct_md_iface_config_read(md, GetParam()->tl_name.c_str(), NULL, NULL, &m_iface_config); } ASSERT_UCS_OK(status); uct_md_close(md); - - component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | - UCT_COMPONENT_ATTR_FIELD_FLAGS; - /* coverity[var_deref_model] */ - status = uct_component_query(GetParam()->component, &component_attr); - ASSERT_UCS_OK(status); - - UCS_TEST_MESSAGE << "Testing component: " << component_attr.name; - - if (component_attr.flags & UCT_COMPONENT_FLAG_CM) { - status = uct_cm_config_read(GetParam()->component, NULL, NULL, &m_cm_config); - ASSERT_UCS_OK(status); - } else { - m_cm_config = NULL; - } } uct_test::~uct_test() { if (m_cm_config != NULL) { uct_config_release(m_cm_config); } + if (m_iface_config != NULL) { uct_config_release(m_iface_config); } - uct_config_release(m_md_config); + + if (m_md_config != NULL) { + uct_config_release(m_md_config); + } } void uct_test::init_sockaddr_rsc(resource *rsc, struct sockaddr *listen_addr, - struct sockaddr *connect_addr, size_t size) + struct sockaddr *connect_addr, size_t size, + bool init_src) { rsc->listen_sock_addr.set_sock_addr(*listen_addr, size); rsc->connect_sock_addr.set_sock_addr(*connect_addr, size); + if (init_src) { + /* src_addr == dst_addr to ensure they are reachable */ + rsc->source_sock_addr.set_sock_addr(*connect_addr, size); + } } -void uct_test::set_interface_rscs(uct_component_h cmpt, const char *name, - ucs_cpu_set_t local_cpus, struct ifaddrs *ifa, +void uct_test::set_interface_rscs(uct_component_h cmpt, const char *cmpt_name, + const char *md_name, ucs_cpu_set_t local_cpus, + struct ifaddrs *ifa, std::vector<resource>& all_resources) { int i; - /* Create two resources on the same interface.
the first one will have the - * ip of the interface and the second one will have INADDR_ANY */ - for (i = 0; i < 2; i++) { - resource rsc(cmpt, std::string(name), local_cpus, "sockaddr", - std::string(ifa->ifa_name), UCT_DEVICE_TYPE_NET); - - if (i == 0) { - /* first rsc */ + /* Create three resources on the same interface: + * 0 - has the ip of the dst interface + * 1 - has the ip of the dst interface and IP of src interface + * 2 - has INADDR_ANY + */ + for (i = 0; i < 3; i++) { + resource rsc(cmpt, std::string(cmpt_name), std::string(md_name), + local_cpus, "", std::string(ifa->ifa_name), + UCT_DEVICE_TYPE_NET); + bool init_src_addr = (i == 1); + + if (i < 2) { if (ifa->ifa_addr->sa_family == AF_INET) { uct_test::init_sockaddr_rsc(&rsc, ifa->ifa_addr, ifa->ifa_addr, - sizeof(struct sockaddr_in)); + sizeof(struct sockaddr_in), + init_src_addr); } else if (ifa->ifa_addr->sa_family == AF_INET6) { uct_test::init_sockaddr_rsc(&rsc, ifa->ifa_addr, ifa->ifa_addr, - sizeof(struct sockaddr_in6)); + sizeof(struct sockaddr_in6), + init_src_addr); } else { UCS_TEST_ABORT("Unknown sa_family " << ifa->ifa_addr->sa_family); } all_resources.push_back(rsc); } else { - /* second rsc */ if (ifa->ifa_addr->sa_family == AF_INET) { struct sockaddr_in sin; memset(&sin, 0, sizeof(struct sockaddr_in)); sin.sin_family = AF_INET; sin.sin_addr.s_addr = INADDR_ANY; uct_test::init_sockaddr_rsc(&rsc, (struct sockaddr*)&sin, - ifa->ifa_addr, sizeof(struct sockaddr_in)); + ifa->ifa_addr, + sizeof(struct sockaddr_in), + init_src_addr); } else if (ifa->ifa_addr->sa_family == AF_INET6) { struct sockaddr_in6 sin; memset(&sin, 0, sizeof(struct sockaddr_in6)); sin.sin6_family = AF_INET6; sin.sin6_addr = in6addr_any; uct_test::init_sockaddr_rsc(&rsc, (struct sockaddr*)&sin, - ifa->ifa_addr, sizeof(struct sockaddr_in6)); + ifa->ifa_addr, + sizeof(struct sockaddr_in6), + init_src_addr); } else { UCS_TEST_ABORT("Unknown sa_family " << ifa->ifa_addr->sa_family); } @@ -284,8 +324,9 @@ void uct_test::set_md_sockaddr_resources(const md_resource& md_rsc, uct_md_h md, if (uct_md_is_sockaddr_accessible(md, &sock_addr, UCT_SOCKADDR_ACC_LOCAL) && uct_md_is_sockaddr_accessible(md, &sock_addr, UCT_SOCKADDR_ACC_REMOTE)) { - uct_test::set_interface_rscs(md_rsc.cmpt, md_rsc.rsc_desc.md_name, - local_cpus, ifa, all_resources); + uct_test::set_interface_rscs(md_rsc.cmpt, md_rsc.cmpt_attr.name, + md_rsc.rsc_desc.md_name, local_cpus, + ifa, all_resources); } } @@ -305,7 +346,8 @@ void uct_test::set_cm_sockaddr_resources(uct_component_h cmpt, const char *cmpt_ continue; } - uct_test::set_interface_rscs(cmpt, cmpt_name, local_cpus, ifa, all_resources); + uct_test::set_interface_rscs(cmpt, cmpt_name, "", local_cpus, ifa, + all_resources); } freeifaddrs(ifaddr); @@ -388,15 +430,16 @@ std::vector uct_test::enum_resources(const std::string& tl_name resource_speed tcp_fastest_rsc; for (unsigned j = 0; j < num_tl_resources; ++j) { - if (tcp_fastest_dev && (std::string("tcp") == tl_resources[j].tl_name)) { - resource_speed rsc(iter->cmpt, worker, md, md_attr, - iter->rsc_desc, tl_resources[j]); + if (tcp_fastest_dev && (std::string("tcp") == + tl_resources[j].tl_name)) { + resource_speed rsc(iter->cmpt, iter->cmpt_attr, worker, md, + md_attr, iter->rsc_desc, tl_resources[j]); if (!tcp_fastest_rsc.bw || (rsc.bw > tcp_fastest_rsc.bw)) { tcp_fastest_rsc = rsc; } } else { - resource rsc(iter->cmpt, md_attr, iter->rsc_desc, - tl_resources[j]); + resource rsc(iter->cmpt, iter->cmpt_attr, md_attr, + iter->rsc_desc, tl_resources[j]); all_resources.push_back(rsc); 
} } @@ -417,31 +460,9 @@ std::vector uct_test::enum_resources(const std::string& tl_name uct_worker_destroy(worker); ucs_async_context_destroy(async); - - set_cm_resources(all_resources); } - return filter_resources(all_resources, tl_name); -} - -void uct_test::generate_test_variant(int variant, - const std::string &variant_name, - std::vector& test_res, - const std::string &tl_name) -{ - std::vector r = uct_test::enum_resources(""); - - for (std::vector::iterator iter = r.begin(); - iter != r.end(); ++iter) { - if (tl_name.empty() || ((*iter)->tl_name == tl_name)) { - resource rsc((*iter)->component, (*iter)->md_name, - (*iter)->local_cpus, (*iter)->tl_name, - (*iter)->dev_name, (*iter)->dev_type); - rsc.variant = variant; - rsc.variant_name = variant_name; - test_res.push_back(rsc); - } - } + return filter_resources(all_resources, resource::is_equal_tl_name, tl_name); } void uct_test::init() { @@ -522,15 +543,17 @@ void uct_test::modify_config(const std::string& name, const std::string& value, } } - status = uct_config_modify(m_md_config, name.c_str(), value.c_str()); - if (status == UCS_OK) { - mode = IGNORE_IF_NOT_EXIST; - } - if ((status == UCS_OK) || (status == UCS_ERR_NO_ELEM)) { - test_base::modify_config(name, value, mode); - } else if (status != UCS_OK) { - UCS_TEST_ABORT("Couldn't modify md config parameter: " << name.c_str() << - " to " << value.c_str() << ": " << ucs_status_string(status)); + if (m_md_config != NULL) { + status = uct_config_modify(m_md_config, name.c_str(), value.c_str()); + if (status == UCS_OK) { + mode = IGNORE_IF_NOT_EXIST; + } + if ((status == UCS_OK) || (status == UCS_ERR_NO_ELEM)) { + test_base::modify_config(name, value, mode); + } else if (status != UCS_OK) { + UCS_TEST_ABORT("Couldn't modify md config parameter: " << name.c_str() << + " to " << value.c_str() << ": " << ucs_status_string(status)); + } } } @@ -592,10 +615,24 @@ bool uct_test::has_mm() const { has_transport("xpmem")); } +bool uct_test::has_cuda_ipc() const { + return has_transport("cuda_ipc"); +} + bool uct_test::has_cma() const { return has_transport("cma"); } +bool uct_test::has_ugni() const { + return (has_transport("ugni_rdma") || has_transport("ugni_udt") || + has_transport("ugni_smsg")); +} + +bool uct_test::has_gpu() const { + return (has_transport("cuda_copy") || has_transport("gdr_copy") || + has_transport("rocm_copy")); +} + void uct_test::stats_activate() { ucs_stats_cleanup(); @@ -613,13 +650,14 @@ void uct_test::stats_restore() ucs_stats_init(); } -uct_test::entity* uct_test::create_entity(size_t rx_headroom, - uct_error_handler_t err_handler, - uct_tag_unexp_eager_cb_t eager_cb, - uct_tag_unexp_rndv_cb_t rndv_cb, - void *eager_arg, void *rndv_arg, - uct_async_event_cb_t async_event_cb, - void *async_event_arg) { +uct_test::entity * +uct_test::create_entity(size_t rx_headroom, uct_error_handler_t err_handler, + uct_tag_unexp_eager_cb_t eager_cb, + uct_tag_unexp_rndv_cb_t rndv_cb, void *eager_arg, + void *rndv_arg, uct_async_event_cb_t async_event_cb, + void *async_event_arg, size_t am_alignment, + size_t am_align_offset) +{ uct_iface_params_t iface_params; iface_params.field_mask = UCT_IFACE_PARAM_FIELD_RX_HEADROOM | @@ -651,9 +689,50 @@ uct_test::entity* uct_test::create_entity(size_t rx_headroom, iface_params.async_event_cb = async_event_cb; iface_params.async_event_arg = async_event_arg; + if (am_alignment != 0) { + iface_params.field_mask |= UCT_IFACE_PARAM_FIELD_AM_ALIGNMENT; + iface_params.am_alignment = am_alignment; + } + + if (am_align_offset != 0) { + 
iface_params.field_mask |= UCT_IFACE_PARAM_FIELD_AM_ALIGN_OFFSET; + iface_params.am_align_offset = am_align_offset; + } + return new entity(*GetParam(), m_iface_config, &iface_params, m_md_config); } +void +uct_test::create_connected_entities(size_t rx_headroom, + uct_error_handler_t err_handler, + uct_tag_unexp_eager_cb_t eager_cb, + uct_tag_unexp_rndv_cb_t rndv_cb, + void *eager_arg, void *rndv_arg, + uct_async_event_cb_t async_event_cb, + void *async_event_arg, size_t am_alignment, + size_t am_align_offset) +{ + entity *sender = uct_test::create_entity(rx_headroom, err_handler, eager_cb, + rndv_cb, eager_arg, rndv_arg, + async_event_cb, async_event_arg, + am_alignment, am_align_offset); + m_entities.push_back(sender); + + if (UCT_DEVICE_TYPE_SELF == GetParam()->dev_type) { + sender->connect(0, *sender, 0); + } else { + entity *receiver = uct_test::create_entity(rx_headroom, err_handler, + eager_cb, rndv_cb, eager_arg, + rndv_arg, async_event_cb, + async_event_arg, am_alignment, + am_align_offset); + m_entities.push_back(receiver); + + sender->connect(0, *receiver, 0); + } + +} + uct_test::entity* uct_test::create_entity(uct_iface_params_t ¶ms) { entity *new_ent = new entity(*GetParam(), m_iface_config, ¶ms, m_md_config); @@ -825,13 +904,15 @@ uct_test::entity::entity(const resource& resource, uct_md_config_t *md_config, uct_worker_create, &m_async.m_async, UCS_THREAD_MODE_SINGLE); - UCS_TEST_CREATE_HANDLE(uct_md_h, m_md, uct_md_close, - uct_md_open, resource.component, - resource.md_name.c_str(), md_config); - - status = uct_md_query(m_md, &m_md_attr); - ASSERT_UCS_OK(status); - + if (md_config != NULL) { + UCS_TEST_CREATE_HANDLE(uct_md_h, m_md, uct_md_close, uct_md_open, + resource.component, resource.md_name.c_str(), + md_config); + status = uct_md_query(m_md, &m_md_attr); + ASSERT_UCS_OK(status); + } else { + memset(&m_md_attr, 0, sizeof(m_md_attr)); + } comp_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_NAME | UCT_COMPONENT_ATTR_FIELD_FLAGS; @@ -1119,9 +1200,9 @@ void uct_test::entity::destroy_eps() { } void -uct_test::entity::connect_to_sockaddr(unsigned index, entity& other, +uct_test::entity::connect_to_sockaddr(unsigned index, const ucs::sock_addr_storage &remote_addr, - uct_cm_ep_priv_data_pack_callback_t pack_cb, + uct_cm_ep_resolve_callback_t resolve_cb, uct_cm_ep_client_connect_callback_t connect_cb, uct_ep_disconnect_cb_t disconnect_cb, void *user_data) @@ -1131,34 +1212,28 @@ uct_test::entity::connect_to_sockaddr(unsigned index, entity& other, uct_ep_h ep; ucs_status_t status; + ucs::scoped_async_lock lock(async()); + reserve_ep(index); - if (m_eps[index]) { - return; /* Already connected */ - } + ASSERT_FALSE(m_eps[index]) << "Already connected"; /* Connect to the server */ - if (m_cm) { - params.field_mask = UCT_EP_PARAM_FIELD_CM | - UCT_EP_PARAM_FIELD_SOCKADDR_CONNECT_CB_CLIENT | - UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB | - UCT_EP_PARAM_FIELD_USER_DATA; - params.cm = m_cm; - params.sockaddr_cb_client = connect_cb; - params.disconnect_cb = disconnect_cb; - } else { - params.field_mask = UCT_EP_PARAM_FIELD_IFACE; - params.iface = m_iface; - } - - params.field_mask |= UCT_EP_PARAM_FIELD_USER_DATA | - UCT_EP_PARAM_FIELD_SOCKADDR | - UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS | - UCT_EP_PARAM_FIELD_SOCKADDR_PACK_CB; - params.user_data = user_data; - params.sockaddr = &ucs_remote_addr; - params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; - params.sockaddr_pack_cb = pack_cb; - status = uct_ep_create(¶ms, &ep); + params.field_mask = UCT_EP_PARAM_FIELD_USER_DATA | + UCT_EP_PARAM_FIELD_CM | 
+ UCT_EP_PARAM_FIELD_SOCKADDR | + UCT_EP_PARAM_FIELD_SOCKADDR_CB_FLAGS | + UCT_EP_PARAM_FIELD_CM_RESOLVE_CB | + UCT_EP_PARAM_FIELD_SOCKADDR_CONNECT_CB_CLIENT | + UCT_EP_PARAM_FIELD_SOCKADDR_DISCONNECT_CB; + + params.user_data = user_data; + params.cm = m_cm; + params.sockaddr = &ucs_remote_addr; + params.sockaddr_cb_flags = UCT_CB_FLAG_ASYNC; + params.cm_resolve_cb = resolve_cb; + params.sockaddr_cb_client = connect_cb; + params.disconnect_cb = disconnect_cb; + status = uct_ep_create(¶ms, &ep); ASSERT_UCS_OK(status); m_eps[index].reset(ep, uct_ep_destroy); @@ -1234,23 +1309,6 @@ void uct_test::entity::connect_to_iface(unsigned index, entity& other) { free(dev_addr); } -void uct_test::entity::connect(unsigned index, entity& other, - unsigned other_index, - const ucs::sock_addr_storage &remote_addr, - uct_cm_ep_priv_data_pack_callback_t pack_cb, - uct_cm_ep_client_connect_callback_t connect_cb, - uct_ep_disconnect_cb_t disconnect_cb, - void *user_data) -{ - if (m_cm || - iface_attr().cap.flags & UCT_IFACE_FLAG_CONNECT_TO_SOCKADDR) { - connect_to_sockaddr(index, other, remote_addr, pack_cb, connect_cb, - disconnect_cb, user_data); - } else { - UCS_TEST_SKIP_R("cannot connect"); - } -} - void uct_test::entity::connect(unsigned index, entity& other, unsigned other_index) { if (iface_attr().cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) { @@ -1262,38 +1320,15 @@ void uct_test::entity::connect(unsigned index, entity& other, unsigned other_ind } } -void uct_test::entity::listen(const ucs::sock_addr_storage &listen_addr, - const uct_listener_params_t ¶ms) +ucs_status_t uct_test::entity::listen(const ucs::sock_addr_storage &listen_addr, + const uct_listener_params_t ¶ms) { - ucs_status_t status; - - for (;;) { - { - scoped_log_handler slh(wrap_errors_logger); - status = UCS_TEST_TRY_CREATE_HANDLE(uct_listener_h, m_listener, - uct_listener_destroy, - uct_listener_create, m_cm, - listen_addr.get_sock_addr_ptr(), - listen_addr.get_addr_size(), - ¶ms); - if (status == UCS_OK) { - break; - } - } - EXPECT_EQ(UCS_ERR_BUSY, status); + scoped_log_handler slh(wrap_errors_logger); - const struct sockaddr* c_ifa_addr = listen_addr.get_sock_addr_ptr(); - struct sockaddr* ifa_addr = const_cast(c_ifa_addr); - if (ifa_addr->sa_family == AF_INET) { - struct sockaddr_in *addr = - reinterpret_cast(ifa_addr); - addr->sin_port = ntohs(ucs::get_port()); - } else { - struct sockaddr_in6 *addr = - reinterpret_cast(ifa_addr); - addr->sin6_port = ntohs(ucs::get_port()); - } - } + return UCS_TEST_TRY_CREATE_HANDLE(uct_listener_h, m_listener, + uct_listener_destroy, uct_listener_create, + m_cm, listen_addr.get_sock_addr_ptr(), + listen_addr.get_addr_size(), ¶ms); } void uct_test::entity::disconnect(uct_ep_h ep) { @@ -1431,12 +1466,8 @@ void uct_test::entity::async_wrapper::check_miss() ucs_async_check_miss(&m_async); } -uct_test::entity::scoped_async_lock::scoped_async_lock(entity &e) : m_entity(e) { - UCS_ASYNC_BLOCK(&m_entity.m_async.m_async); -} - -uct_test::entity::scoped_async_lock::~scoped_async_lock() { - UCS_ASYNC_UNBLOCK(&m_entity.m_async.m_async); +ucs_async_context_t &uct_test::entity::async() const { + return m_async.m_async; } ucs_status_t uct_test::send_am_message(entity *e, uint8_t am_id, int ep_idx) diff --git a/test/gtest/uct/uct_test.h b/test/gtest/uct/uct_test.h index 4c4817672a2..7f75d6ba6b1 100644 --- a/test/gtest/uct/uct_test.h +++ b/test/gtest/uct/uct_test.h @@ -1,5 +1,5 @@ /** -* Copyright (C) Mellanox Technologies Ltd. 2001-2019. ALL RIGHTS RESERVED. +* Copyright (C) Mellanox Technologies Ltd. 2001-2021. 
ALL RIGHTS RESERVED. * * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. * Copyright (C) ARM Ltd. 2017. ALL RIGHTS RESERVED @@ -46,6 +46,7 @@ struct resource { virtual ~resource() {}; virtual std::string name() const; uct_component_h component; + std::string component_name; std::string md_name; ucs_cpu_set_t local_cpus; std::string tl_name; @@ -54,23 +55,31 @@ struct resource { uct_device_type_t dev_type; ucs::sock_addr_storage listen_sock_addr; /* sockaddr to listen on */ ucs::sock_addr_storage connect_sock_addr; /* sockaddr to connect to */ + ucs::sock_addr_storage source_sock_addr; /* sockaddr to connect from */ int variant; resource(); - resource(uct_component_h component, const std::string& md_name, - const ucs_cpu_set_t& local_cpus, const std::string& tl_name, - const std::string& dev_name, uct_device_type_t dev_type); - resource(uct_component_h component, const uct_md_attr_t& md_attr, + resource(uct_component_h component, const std::string& component_name, + const std::string& md_name, const ucs_cpu_set_t& local_cpus, + const std::string& tl_name, const std::string& dev_name, + uct_device_type_t dev_type); + resource(uct_component_h component, const uct_component_attr& cmpnt_attr, + const uct_md_attr_t& md_attr, const uct_md_resource_desc_t& md_resource, const uct_tl_resource_desc_t& tl_resource); + static bool is_equal_tl_name(const resource &rsc, const std::string &name); + static bool + is_equal_component_name(const resource &rsc, const std::string &name); }; struct resource_speed : public resource { double bw; resource_speed() : resource(), bw(0) { } - resource_speed(uct_component_h component, const uct_worker_h& worker, - const uct_md_h& md, const uct_md_attr_t& md_attr, + resource_speed(uct_component_h component, + const uct_component_attr& cmpnt_attr, + const uct_worker_h& worker, const uct_md_h& md, + const uct_md_attr_t& md_attr, const uct_md_resource_desc_t& md_resource, const uct_tl_resource_desc_t& tl_resource); }; @@ -104,12 +113,6 @@ class uct_test : public testing::TestWithParam, */ static std::vector enum_resources(const std::string& tl_name); - /* By default generate test variant for all tls. 
If variant is specific to - * the particular transport tl_name need to be specified accordingly */ - static void generate_test_variant(int variant, - const std::string &variant_name, - std::vector<resource>& test_res, - const std::string &tl_name=""); uct_test(); virtual ~uct_test(); @@ -182,37 +185,25 @@ class uct_test : public testing::TestWithParam<const resource*>, void revoke_ep(unsigned index); void destroy_eps(); void connect(unsigned index, entity& other, unsigned other_index); - void connect(unsigned index, entity& other, unsigned other_index, - const ucs::sock_addr_storage &remote_addr, - uct_cm_ep_priv_data_pack_callback_t pack_cb, - uct_cm_ep_client_connect_callback_t connect_cb, - uct_ep_disconnect_cb_t disconnect_cb, - void *user_data); void connect_to_iface(unsigned index, entity& other); void connect_to_ep(unsigned index, entity& other, unsigned other_index); - void connect_to_sockaddr(unsigned index, entity& other, + void connect_to_sockaddr(unsigned index, const ucs::sock_addr_storage &remote_addr, - uct_cm_ep_priv_data_pack_callback_t pack_cb, + uct_cm_ep_resolve_callback_t resolve_cb, uct_cm_ep_client_connect_callback_t connect_cb, uct_ep_disconnect_cb_t disconnect_cb, - void *user_sata); + void *user_data); - void listen(const ucs::sock_addr_storage &listen_addr, - const uct_listener_params_t &params); + ucs_status_t listen(const ucs::sock_addr_storage &listen_addr, + const uct_listener_params_t &params); void disconnect(uct_ep_h ep); void flush() const; - size_t max_conn_priv; + ucs_async_context_t &async() const; - class scoped_async_lock { - public: - scoped_async_lock(entity &e); - ~scoped_async_lock(); - private: - entity &m_entity; - }; + size_t max_conn_priv; private: class async_wrapper { @@ -225,8 +216,6 @@ class uct_test : public testing::TestWithParam<const resource*>, async_wrapper(const async_wrapper &); }; - entity(const entity&); - void connect_p2p_ep(uct_ep_h from, uct_ep_h to); void cuda_mem_alloc(size_t length, uct_allocated_memory_t *mem) const; @@ -303,16 +292,16 @@ class uct_test : public testing::TestWithParam<const resource*>, bool aux_pipe_init; }; - template <typename T> - static std::vector<const T*> filter_resources(const std::vector<T>& resources, - const std::string& tl_name) + template <typename T> + static std::vector<const T*> + filter_resources(const std::vector<T> &resources, + bool is_equal(const resource&, const std::string&), + const std::string &filter) { std::vector<const T*> result; - for (typename std::vector<T>::const_iterator iter = resources.begin(); - iter != resources.end(); ++iter) - { - if (tl_name.empty() || (iter->tl_name == tl_name)) { - result.push_back(&*iter); + for (size_t i = 0; i < resources.size(); ++i) { + if (filter.empty() || is_equal(resources[i], filter)) { + result.push_back(&resources[i]); } } return result; } @@ -369,7 +358,10 @@ class uct_test : public testing::TestWithParam<const resource*>, virtual bool has_rc_or_dc() const; virtual bool has_ib() const; virtual bool has_mm() const; + virtual bool has_cuda_ipc() const; virtual bool has_cma() const; + virtual bool has_ugni() const; + virtual bool has_gpu() const; bool is_caps_supported(uint64_t required_flags); bool check_caps(uint64_t required_flags, uint64_t invalid_flags = 0); @@ -389,19 +381,31 @@ class uct_test : public testing::TestWithParam<const resource*>, static void set_cm_sockaddr_resources(uct_component_h cmpt, const char *cmpt_name, ucs_cpu_set_t local_cpus, std::vector<resource>& all_resources); - static void set_interface_rscs(uct_component_h comt, const char * name, - ucs_cpu_set_t local_cpus, struct ifaddrs *ifa, + static void set_interface_rscs(uct_component_h cmpt, const char *cmpt_name, + const char *md_name,
ucs_cpu_set_t local_cpus, + struct ifaddrs *ifa, std::vector<resource>& all_resources); static void init_sockaddr_rsc(resource *rsc, struct sockaddr *listen_addr, - struct sockaddr *connect_addr, size_t size); - uct_test::entity* create_entity(size_t rx_headroom, - uct_error_handler_t err_handler = NULL, - uct_tag_unexp_eager_cb_t eager_cb = NULL, - uct_tag_unexp_rndv_cb_t rndv_cb = NULL, - void *eager_arg = NULL, - void *rndv_arg = NULL, - uct_async_event_cb_t async_event_cb = NULL, - void *async_event_arg = NULL); + struct sockaddr *connect_addr, size_t size, + bool init_src); + uct_test::entity * + create_entity(size_t rx_headroom, uct_error_handler_t err_handler = NULL, + uct_tag_unexp_eager_cb_t eager_cb = NULL, + uct_tag_unexp_rndv_cb_t rndv_cb = NULL, + void *eager_arg = NULL, void *rndv_arg = NULL, + uct_async_event_cb_t async_event_cb = NULL, + void *async_event_arg = NULL, size_t am_alignment = 0ul, + size_t am_align_offset = 0ul); + void + create_connected_entities(size_t rx_headroom, + uct_error_handler_t err_handler = NULL, + uct_tag_unexp_eager_cb_t eager_cb = NULL, + uct_tag_unexp_rndv_cb_t rndv_cb = NULL, + void *eager_arg = NULL, void *rndv_arg = NULL, + uct_async_event_cb_t async_event_cb = NULL, + void *async_event_arg = NULL, + size_t am_alignment = 0ul, + size_t am_align_offset = 0ul); uct_test::entity* create_entity(uct_iface_params_t &params); uct_test::entity* create_entity(); int max_connections(); @@ -441,8 +445,9 @@ class test_uct_iface_attrs : public uct_test { ud_mlx5, \ cm -#define UCT_TEST_SOCKADDR_TLS \ - sockaddr + +#define UCT_TEST_CMS rdmacm, tcp + #define UCT_TEST_NO_SELF_TLS \ UCT_TEST_IB_TLS, \ @@ -497,8 +502,21 @@ class test_uct_iface_attrs : public uct_test { #define UCT_INSTANTIATE_NO_SELF_TEST_CASE(_test_case) \ UCS_PP_FOREACH(_UCT_INSTANTIATE_TEST_CASE, _test_case, UCT_TEST_NO_SELF_TLS) + +/** + * Instantiate the parametrized test case for all sockaddr CMs. + * + * @param _test_case Test case class, derived from @ref test_uct_sockaddr. + */ #define UCT_INSTANTIATE_SOCKADDR_TEST_CASE(_test_case) \ - UCS_PP_FOREACH(_UCT_INSTANTIATE_TEST_CASE, _test_case, UCT_TEST_SOCKADDR_TLS) + UCS_PP_FOREACH(_UCT_INSTANTIATE_CM_TEST_CASE, _test_case, UCT_TEST_CMS) + + +#define _UCT_INSTANTIATE_CM_TEST_CASE(_test_case, _cm_name) \ + INSTANTIATE_TEST_CASE_P(_cm_name, _test_case, \ + testing::ValuesIn(_test_case::enum_cm_resources( \ + UCS_PP_QUOTE(_cm_name)))); + /** * Instantiate the parametrized test case for the RC/DC transports.
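The predicate-based filter_resources above replaces the old tl_name-only filtering. A short usage sketch (editor's illustration; both predicates are defined earlier in this patch, and 'all_resources' is assumed to be a populated std::vector<resource>):

std::vector<const resource*> by_tl =
    filter_resources(all_resources, resource::is_equal_tl_name, "rc_verbs");
std::vector<const resource*> by_cmpt =
    filter_resources(all_resources, resource::is_equal_component_name, "rdmacm");
/* an empty filter string matches every resource */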
+ */ + +extern "C" { +#include +#include +} + +#include + +class test_uct_query : public uct_p2p_test { +public: + test_uct_query() : uct_p2p_test(0) + { + } +}; + +UCS_TEST_P(test_uct_query, query_perf) +{ + uct_perf_attr_t perf_attr; + ucs_status_t status; + + perf_attr.field_mask = UCT_PERF_ATTR_FIELD_OPERATION | + UCT_PERF_ATTR_FIELD_LOCAL_MEMORY_TYPE | + UCT_PERF_ATTR_FIELD_REMOTE_MEMORY_TYPE | + UCT_PERF_ATTR_FIELD_OVERHEAD | + UCT_PERF_ATTR_FIELD_BANDWIDTH; + perf_attr.operation = UCT_OP_AM_SHORT; + perf_attr.local_memory_type = UCS_MEMORY_TYPE_HOST; + perf_attr.remote_memory_type = UCS_MEMORY_TYPE_HOST; + status = uct_iface_estimate_perf(sender().iface(), + &perf_attr); + EXPECT_EQ(status, UCS_OK); + + perf_attr.remote_memory_type = UCS_MEMORY_TYPE_CUDA; + perf_attr.operation = UCT_OP_PUT_SHORT; + status = uct_iface_estimate_perf(sender().iface(), + &perf_attr); + + /* At least one type of bandwidth must be non-zero */ + EXPECT_NE(0, perf_attr.bandwidth.shared + perf_attr.bandwidth.dedicated); + + if (has_transport("cuda_copy") || has_transport("gdr_copy")) { + uct_perf_attr_t perf_attr_get; + perf_attr_get.field_mask = UCT_PERF_ATTR_FIELD_OPERATION | + UCT_PERF_ATTR_FIELD_BANDWIDTH; + perf_attr_get.operation = UCT_OP_GET_SHORT; + status = uct_iface_estimate_perf(sender().iface(), &perf_attr_get); + EXPECT_EQ(status, UCS_OK); + + /* Put and get operations have different bandwidth in cuda_copy + and gdr_copy transports */ + EXPECT_NE(perf_attr.bandwidth.shared, perf_attr_get.bandwidth.shared); + } +} + +UCT_INSTANTIATE_TEST_CASE(test_uct_query) diff --git a/test/mpi/test_memhooks.c b/test/mpi/test_memhooks.c index c60ad01ec5e..25c90105b77 100644 --- a/test/mpi/test_memhooks.c +++ b/test/mpi/test_memhooks.c @@ -45,22 +45,22 @@ #define SHMAT_FAILED ((void*)-1) -void* open_dyn_lib(const char *lib_path); -void* flag_no_install_init(const char *path); +void *event_init(const char *path, ucm_mmap_hook_mode_t mmap_mode); +void *ext_event_init(const char *path, ucm_mmap_hook_mode_t mmap_mode); +void* flag_no_install_init(const char *path, ucm_mmap_hook_mode_t mmap_mode); int malloc_hooks_run_all(void *dl); int malloc_hooks_run_unmapped(void *dl); int ext_event_run(void *dl); -void *ext_event_init(const char *path); typedef struct memtest_type { const char *name; - void* (*init)(const char *path); + void* (*init)(const char *path, ucm_mmap_hook_mode_t mmap_mode); int (*run) (void *arg); } memtest_type_t; memtest_type_t tests[] = { - {"malloc_hooks", open_dyn_lib, malloc_hooks_run_all}, - {"malloc_hooks_unmapped", open_dyn_lib, malloc_hooks_run_unmapped}, + {"malloc_hooks", event_init, malloc_hooks_run_all}, + {"malloc_hooks_unmapped", event_init, malloc_hooks_run_unmapped}, {"external_events", ext_event_init, ext_event_run}, {"flag_no_install", flag_no_install_init, ext_event_run}, {NULL} @@ -78,6 +78,9 @@ static void usage() { printf(" malloc_hooks_unmapped : Test VM_UNMAPPED event only\n"); printf(" external_events : Test of ucm_set_external_event() API\n"); printf(" flag_no_install : Test of UCM_EVENT_FLAG_NO_INSTALL flag\n"); + printf(" -m Memory hooks mode (bistro)\n"); + printf(" reloc : Change .plt/.got tables\n"); + printf(" bistro : Binary code patching\n"); printf("\n"); } @@ -102,10 +105,27 @@ static ucs_status_t set_event_handler(void *dl, int events) return set_handler(events, 0, event_callback, NULL); } -static ucs_status_t disable_memory_hooks(void *dl) +static ucs_status_t init_ucm_config(void *dl_ucm, int enable_hooks, + ucm_mmap_hook_mode_t mmap_mode) { - 
setenv("UCX_MEM_MALLOC_HOOKS", "n", 1); - setenv("UCX_MEM_MMAP_RELOC", "n", 1); + void (*library_init)(const ucm_global_config_t *ucm_opts); + ucm_global_config_t *ucm_opts; + + DL_FIND_FUNC(dl_ucm, "ucm_library_init", library_init, + return UCS_ERR_NO_ELEM); + DL_FIND_FUNC(dl_ucm, "ucm_global_opts", ucm_opts, + return UCS_ERR_NO_ELEM); + + if (enable_hooks) { + ucm_opts->mmap_hook_mode = mmap_mode; + } else { + ucm_opts->enable_malloc_hooks = 0; + ucm_opts->enable_malloc_reloc = 0; + ucm_opts->mmap_hook_mode = UCM_MMAP_HOOK_NONE; + } + + library_init(NULL); + return UCS_OK; } @@ -122,8 +142,27 @@ void* open_dyn_lib(const char *lib_path) return dl; } +void *event_init(const char *path, ucm_mmap_hook_mode_t mmap_mode) +{ + ucs_status_t status; + void *dl_ucm; + + dl_ucm = open_dyn_lib(path); + if (dl_ucm == NULL) { + return NULL; + } + + status = init_ucm_config(dl_ucm, 1, mmap_mode); + CHKERR_JUMP(status != UCS_OK, "Failed to initialize UCM", fail); + + return dl_ucm; -void *ext_event_init(const char *path) +fail: + dlclose(dl_ucm); + return NULL; +} + +void *ext_event_init(const char *path, ucm_mmap_hook_mode_t mmap_mode) { void (*set_ext_event)(int events); ucs_status_t status; @@ -134,8 +173,8 @@ void *ext_event_init(const char *path) return NULL; } - status = disable_memory_hooks(dl_ucm); - CHKERR_JUMP(status != UCS_OK, "Failed to disable memory hooks", fail); + status = init_ucm_config(dl_ucm, 0, mmap_mode); + CHKERR_JUMP(status != UCS_OK, "Failed to initialize UCM", fail); DL_FIND_FUNC(dl_ucm, "ucm_set_external_event", set_ext_event, goto fail); set_ext_event(UCM_EVENT_VM_MAPPED | UCM_EVENT_VM_UNMAPPED); @@ -151,7 +190,7 @@ void *ext_event_init(const char *path) return NULL; } -void* flag_no_install_init(const char *path) +void* flag_no_install_init(const char *path, ucm_mmap_hook_mode_t mmap_mode) { void *dl_ucm; ucs_status_t status; @@ -161,8 +200,8 @@ void* flag_no_install_init(const char *path) return NULL; } - status = disable_memory_hooks(dl_ucm); - CHKERR_JUMP(status != UCS_OK, "Failed to disable memory hooks", fail); + status = init_ucm_config(dl_ucm, 0, mmap_mode); + CHKERR_JUMP(status != UCS_OK, "Failed to initialize UCM", fail); status = set_event_handler(dl_ucm, UCM_EVENT_VM_MAPPED | UCM_EVENT_VM_UNMAPPED | @@ -387,14 +426,14 @@ int ext_event_run(void *dl) total_mapped = 0; ptr_direct_mmap = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); - printf("totmapped %lu\n", total_mapped); + printf("total_mapped=%lu\n", total_mapped); /* No callback should be called as we registered events to be external */ CHKERR_JUMP(total_mapped != 0, "Callback for mmap invoked, while hooks were not set", fail); DL_FIND_FUNC(dl, "ucm_vm_mmap", ucm_event, goto fail); ucm_event(ptr_direct_mmap, size); CHKERR_JUMP(total_mapped == 0, "Callback for mmap is not called", fail); - printf("After ucm_vm_mmap called: mapped=%zu\n", total_mapped); + printf("After ucm_vm_mmap called: total_mapped=%zu\n", total_mapped); /* Call munmap directly */ total_unmapped = 0; @@ -405,7 +444,7 @@ int ext_event_run(void *dl) DL_FIND_FUNC(dl, "ucm_vm_munmap", ucm_event, goto fail); ucm_event(ptr_direct_mmap, size); CHKERR_JUMP(total_unmapped == 0, "Callback for mmap is not called", fail); - printf("After ucm_vm_munmap: unmapped=%zu\n", total_unmapped); + printf("After ucm_vm_munmap: total_unmapped=%zu\n", total_unmapped); ret = 0; @@ -416,13 +455,14 @@ int ext_event_run(void *dl) int main(int argc, char **argv) { - const char *ucm_path = UCS_PP_MAKE_STRING(UCM_LIB_DIR) "/" "libucm.so"; - 
memtest_type_t *test = tests; + const char *ucm_path = UCS_PP_MAKE_STRING(UCM_LIB_DIR)"/libucm.so"; + memtest_type_t *test = tests; + ucm_mmap_hook_mode_t mmap_mode = UCM_MMAP_HOOK_BISTRO; void *dl; int ret; int c; - while ((c = getopt(argc, argv, "t:h")) != -1) { + while ((c = getopt(argc, argv, "t:m:h")) != -1) { switch (c) { case 't': for (test = tests; test->name != NULL; ++test) { @@ -435,6 +475,16 @@ int main(int argc, char **argv) return -1; } break; + case 'm': + if (!strcasecmp(optarg, "bistro")) { + mmap_mode = UCM_MMAP_HOOK_BISTRO; + } else if (!strcasecmp(optarg, "reloc")) { + mmap_mode = UCM_MMAP_HOOK_RELOC; + } else { + fprintf(stderr, "Wrong mmap mode %s\n", optarg); + return -1; + } + break; case 'h': default: usage(); @@ -444,7 +494,7 @@ int main(int argc, char **argv) /* Some tests need to modify UCM config before to call ucp_init, * which may be called by MPI_Init */ - dl = test->init(ucm_path); + dl = test->init(ucm_path, mmap_mode); if (dl == NULL) { return -1; } diff --git a/ucx.spec.in b/ucx.spec.in index de3827fef3c..021aef48d30 100644 --- a/ucx.spec.in +++ b/ucx.spec.in @@ -3,16 +3,12 @@ %bcond_with cuda %bcond_with gdrcopy %bcond_without ib -%if 0%{?fedora} >= 30 || 0%{?rhel} >= 7 -%bcond_with ib_cm -%else -%bcond_without ib_cm -%endif %bcond_with knem %bcond_without rdmacm %bcond_with rocm %bcond_with ugni %bcond_with xpmem +%bcond_with vfs Name: ucx Version: @VERSION@ @@ -25,6 +21,7 @@ URL: http://www.openucx.org Source: https://github.com/openucx/%{name}/releases/download/v@MAJOR_VERSION@.@MINOR_VERSION@.@PATCH_VERSION@/ucx-@MAJOR_VERSION@.@MINOR_VERSION@.@PATCH_VERSION@.tar.gz BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) +Prefix: %{_prefix} # UCX currently supports only the following architectures ExclusiveArch: aarch64 ppc64le x86_64 @@ -48,9 +45,6 @@ BuildRequires: gdrcopy %if %{with ib} BuildRequires: libibverbs-devel %endif -%if %{with ib_cm} -BuildRequires: libibcm-devel -%endif %if %{with knem} BuildRequires: knem %endif @@ -63,6 +57,9 @@ BuildRequires: hsa-rocr-dev %if %{with xpmem} BuildRequires: xpmem-devel %endif +%if %{with vfs} +BuildRequires: fuse3-devel +%endif %description UCX is an optimized communication framework for high-performance distributed @@ -104,11 +101,11 @@ Provides header files and examples for developing with UCX. %_with_arg cuda cuda \ %_with_arg gdrcopy gdrcopy \ %_with_arg ib verbs \ - %_with_arg ib_cm cm \ %_with_arg knem knem \ %_with_arg rdmacm rdmacm \ %_with_arg rocm rocm \ %_with_arg xpmem xpmem \ + %_with_arg vfs fuse3 \ %_with_arg ugni ugni \ %{?configure_options} make %{?_smp_mflags} V=1 @@ -123,7 +120,9 @@ rm -f %{buildroot}%{_libdir}/ucx/lib*.a %files %{_libdir}/lib*.so.* -%{_bindir}/uc* +%{_bindir}/ucx_info +%{_bindir}/ucx_perftest +%{_bindir}/ucx_read_profile @HAVE_GLIBCXX_NOTHROW_TRUE@%{_bindir}/io_demo %{_datadir}/ucx %exclude %{_datadir}/ucx/examples @@ -137,7 +136,12 @@ rm -f %{buildroot}%{_libdir}/ucx/lib*.a %{_libdir}/pkgconfig/ucx.pc %{_datadir}/ucx/examples -%post -p /sbin/ldconfig +%post +/sbin/ldconfig +rm -f %{_sysconfdir}/ucx/ucx.conf +mkdir -p %{_sysconfdir}/ucx +ucx_info -fC > %{_sysconfdir}/ucx/ucx.conf + %postun -p /sbin/ldconfig %if %{with cma} @@ -202,19 +206,6 @@ hardware-offloaded data transfer. 
%{_libdir}/ucx/libuct_ib.so.* %endif -%if %{with ib_cm} -%package ib-cm -Requires: %{name}-ib%{?_isa} = %{version}-%{release} -Summary: UCX InfiniBand connection-manager support -Group: System Environment/Libraries - -%description ib-cm -Provides Infiniband Connection Manager (also known as ibcm) support for UCX. - -%files ib-cm -%{_libdir}/ucx/libuct_ib_cm.so.* -%endif - %if %{with knem} %package knem Requires: %{name}%{?_isa} = %{version}-%{release} @@ -299,8 +290,30 @@ process to map the memory of another process into its virtual address space. %{_libdir}/ucx/libuct_xpmem.so.* %endif +%if %{with vfs} +%package vfs +Requires: %{name}%{?_isa} = %{version}-%{release} +Summary: UCX Virtual Filesystem support. +Group: System Environment/Libraries + +%description vfs +Provides a virtual filesystem over FUSE which allows real-time monitoring of UCX +library internals, protocol objects, transports status, and more. + +%files vfs +%{_libdir}/ucx/libucs_fuse.so.* +%{_bindir}/ucx_vfs +%endif %changelog +* Tue Apr 27 2021 Leonid Genkin 1.11.0-1 +- Remove obsolete ib/cm code +* Wed Dec 16 2020 Yossi Itigin 1.11.0-1 +- Add VFS sub-package +* Wed Dec 16 2020 Yossi Itigin 1.11.0-1 +- Bump version to 1.11.0 +* Wed Nov 11 2020 Yossi Itigin 1.10.0-1 +- Make the RPM relocatable * Tue Jul 07 2020 Yossi Itigin 1.10.0-1 - Bump version to 1.10.0 * Mon Feb 10 2020 Yossi Itigin 1.9.0-1