diff --git a/docs/extended_api/synchronization_primitives/atomic/fetch_max.md b/docs/extended_api/synchronization_primitives/atomic/fetch_max.md
index d90e0dcad4..7ec86f3194 100644
--- a/docs/extended_api/synchronization_primitives/atomic/fetch_max.md
+++ b/docs/extended_api/synchronization_primitives/atomic/fetch_max.md
@@ -27,7 +27,7 @@ __global__ void example_kernel() {
   cuda::atomic<int> a(0);
   auto x = a.fetch_max(1);
   auto y = a.load();
-  assert(x == 1 && y == 0);
+  assert(x == 0 && y == 1);
 }
 ```
 
diff --git a/examples/nv_target.cu b/examples/nv_target.cu
new file mode 100644
index 0000000000..a1fb3c8cb2
--- /dev/null
+++ b/examples/nv_target.cu
@@ -0,0 +1,145 @@
+// This file demonstrates how to use <nv/target> and how to avoid common
+// pitfalls regarding compiler/dialect compatibility.
+
+//=======================================================================================================================
+#include <nv/target>
+
+// The below are part of libcu++ and are exposed for users that would like a simpler method of targeting host/device code
+// on NVCC, NVC++ or GCC/Clang/MSVC even when the NVCC compiler isn't present.
+
+// These macros are to be used in lieu of common #if defined(__CUDA_ARCH__) statements and
+// are only to be used inside of function scopes
+
+/* Macros defined when including <nv/target> or virtually any libcu++ header
+  NV_IF_TARGET(query, true, ...)        | Queries compilation mode and emits code if true                |
+  NV_IF_ELSE_TARGET(query, true, false) | As above, but can also emit different code when false          |
+  NV_DISPATCH_TARGET(...)               | Similar to a switch statement emitting code for multiple modes |
+*/
+
+/* Queryable properties defined by <nv/target>
+Can be imagined as __CUDA_ARCH__ >= SM_XX
+  NV_PROVIDES_SM_35
+  NV_PROVIDES_SM_37
+  NV_PROVIDES_SM_50
+  NV_PROVIDES_SM_52
+  NV_PROVIDES_SM_53
+  NV_PROVIDES_SM_60
+  NV_PROVIDES_SM_61
+  NV_PROVIDES_SM_62
+  NV_PROVIDES_SM_70
+  NV_PROVIDES_SM_72
+  NV_PROVIDES_SM_75
+  NV_PROVIDES_SM_80
+  NV_PROVIDES_SM_86
+  NV_PROVIDES_SM_87
+
+Similar to above, but instead __CUDA_ARCH__ == SM_XX
+  NV_IS_EXACTLY_SM_35
+  NV_IS_EXACTLY_SM_37
+  NV_IS_EXACTLY_SM_50
+  NV_IS_EXACTLY_SM_52
+  NV_IS_EXACTLY_SM_53
+  NV_IS_EXACTLY_SM_60
+  NV_IS_EXACTLY_SM_61
+  NV_IS_EXACTLY_SM_62
+  NV_IS_EXACTLY_SM_70
+  NV_IS_EXACTLY_SM_72
+  NV_IS_EXACTLY_SM_75
+  NV_IS_EXACTLY_SM_80
+  NV_IS_EXACTLY_SM_86
+  NV_IS_EXACTLY_SM_87
+
+Queries whether if host or device code is being compiled
+  NV_IS_HOST
+  NV_IS_DEVICE
+
+Static true/false values for fallbacks or user manipulation
+  NV_ANY_TARGET
+  NV_NO_TARGET
+*/
+
+//=======================================================================================================================
+// NV_IF_ELSE_TARGET(query, true statement, false statement)
+__host__ __device__ int my_popc(unsigned int v) {
+  // NV_IF_ELSE_TARGET accepts three arguments, a query and two statement.
+  // Here we check if we're compiling for device code. This function acts as a backend for both CUDA and host CPU popc.
+  NV_IF_ELSE_TARGET(
+    NV_IS_DEVICE,
+      return __popc(v);,        // Is false, use CUDA intrinsic
+      return __builtin_popc(v); // Is host, use GCC builtin
+  )
+}
+// Note the commas seperating statements, if preprocessed code is written out the macro will be preprocessed into the below:
+/*
+  my_popc(unsigned int v) {
+    {return __popc(v);}
+  }
+*/
+
+//=======================================================================================================================
+// NV_IF_TARGET(query, true statement) OR NV_IF_TARGET(q, t, ...)
+__host__ __device__ void some_algorithm() {
+  // NV_IF_TARGET accepts two arguments, a query and a statement. (and an optional false statement in >=C++11)
+  NV_IF_TARGET(
+    NV_IS_DEVICE,
+      do_device_specific_work(); // Code only emitted if compiling for device
+  )
+}
+
+//=======================================================================================================================
+// NV_DISPATCH_TARGET(...) - Available only in C++11 and up due to variadic macros
+__host__ __device__ void my_memset(void *p, uint8_t v, uint64_t c) {
+  // Target dispatch accepts pairs of queries and statements.
+  // The first postive query encountered will be emitted while others are ignored.
+  NV_DISPATCH_TARGET(
+    NV_PROVIDES_SM_80,
+      // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
+      if (v == 0) zero_fill(p, 0, c); // zero fill using cp.async available on SM_80
+      else memset(p, v, c);, // Notice comma signifying end of block
+    NV_ANY_TARGET, // Uncoditionally use memset in other cases
+      memset(p, v, c);
+  )
+}
+
+//***********************************************************************************************************************
+
+// # Common pitfalls:
+
+//=======================================================================================================================
+// Embedding preprocessor statements as an argument. Perform textual manipulation outside of the macro.
+/*
+NV_IF_TARGET(
+  NV_IS_DEVICE,
+// This will break immediately on most compilers
+#  if defined(ENABLE_SM_80_FEATURE)
+    sm80_function();
+#  else
+    device_function();
+#  endif
+)
+
+// Instead one could write the above as:
+#if defined(ENABLE_SM_80_FEATURE)
+# define OPTIMAL_DEVICE_FUNCTION() sm80_function()
+#else
+# define OPTIMAL_DEVICE_FUNCTION() device_function()
+#endif
+
+NV_IF_TARGET(
+  NV_IS_DEVICE,
+  OPTIMAL_DEVICE_FUNCTION();
+)
+*/
+
+//=======================================================================================================================
+// Some statements may have unguarded commas, e.g. lambdas or aggregate assignment
+// Supported with C++11 and up ONLY, as it requires variadic macro processing
+/*
+NV_IF_TARGET(
+  NV_IS_DEVICE,
+    ( // You may wrap a statement or series of statements with a parenthesis to guard commas from any macro machinery
+      int input[] = {x, y, z...};
+      my_algorithm(input);
+    )
+)
+*/
\ No newline at end of file