Skip to content

Commit

Permalink
Docs preview for PR #660.
Browse files Browse the repository at this point in the history
  • Loading branch information
cuda-quantum-bot committed Sep 29, 2023
1 parent 2a11cc7 commit f91c514
Show file tree
Hide file tree
Showing 68 changed files with 1,370 additions and 163 deletions.
65 changes: 55 additions & 10 deletions pr-660/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,29 @@
# the terms of the Apache License 2.0 which accompanies this distribution. #
# ============================================================================ #

# Add nvq++ compile + execution test of code examples
# Args:
#   TEST_NAME: name of the test executable. Test name is prefixed with "nvqpp"
#   SOURCE_LOCATION: location of the source file (relative to 'sphinx/examples/cpp' directory by default)
# Optional keyword args:
#   TARGET <TARGET_NAME>: name of the target passed to nvq++ via --target
#   SOURCE_DIR <DIR>: the directory that SOURCE_LOCATION is relative to (if not the default)
#   LAUNCH_COMMAND <COMMAND>: the command to launch the test (e.g., mpirun)
function(add_nvqpp_test TEST_NAME SOURCE_LOCATION)
  cmake_parse_arguments(PARSED_ARGS "" "TARGET;SOURCE_DIR;LAUNCH_COMMAND" "" ${ARGN})
  set(NVQPP_COMPILE_ARGS "")
  if(PARSED_ARGS_TARGET)
    # Compile args are a single shell string, so append as text (not a CMake list).
    string(APPEND NVQPP_COMPILE_ARGS " --target ${PARSED_ARGS_TARGET}")
  endif()
  if(NOT PARSED_ARGS_SOURCE_DIR)
    set(PARSED_ARGS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/sphinx/examples/cpp")
  endif()
  # Compile with nvq++ and then run the produced executable in a single shell
  # command; a compile failure propagates because the run step then fails.
  add_test(
    NAME
      nvqpp_${TEST_NAME}
    COMMAND
      bash -c "${CMAKE_BINARY_DIR}/bin/nvq++ ${NVQPP_COMPILE_ARGS} ${PARSED_ARGS_SOURCE_DIR}/${SOURCE_LOCATION} -o ${TEST_NAME} ;\
      ${PARSED_ARGS_LAUNCH_COMMAND} ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}"
  )
endfunction()

Expand All @@ -34,13 +50,29 @@ add_nvqpp_test(IterativePhaseEstimation other/iterative_qpe.cpp)
add_nvqpp_test(RandomWalkPhaseEstimation other/random_walk_qpe.cpp)

# The cuQuantum-backed example needs the custatevec library and a CUDA toolkit.
if (CUSTATEVEC_ROOT AND CUDA_FOUND)
  add_nvqpp_test(CuQuantumBackend basics/cuquantum_backends.cpp TARGET nvidia)
endif()

# mqpu code snippets, needs custatevec backend and (optionally) MPI
# NGPUS defaults to 0 and is also read by the Python mqpu test section below.
set(NGPUS 0)
if (CUSTATEVEC_ROOT AND CUDA_FOUND)
  add_nvqpp_test(SampleAsync using/cudaq/platform/sample_async.cpp TARGET nvidia-mqpu SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sphinx/snippets/cpp)
  add_nvqpp_test(ObserveMQPU using/cudaq/platform/observe_mqpu.cpp TARGET nvidia-mqpu SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sphinx/snippets/cpp)
  # Add the MPI test if MPI was found and there are at least 2 GPUs
  if (MPI_CXX_FOUND)
    # Count the number of GPUs
    find_program(NVIDIA_SMI "nvidia-smi")
    if(NVIDIA_SMI)
      # Strip the trailing newline from `wc -l`, otherwise the numeric
      # comparison below would see e.g. "4\n" instead of "4".
      execute_process(COMMAND bash -c "nvidia-smi --list-gpus | wc -l"
                      OUTPUT_VARIABLE NGPUS
                      OUTPUT_STRIP_TRAILING_WHITESPACE)
      # Only add this test if we have at least 2 GPUs (the test runs with -np 2)
      if (${NGPUS} GREATER_EQUAL 2)
        add_nvqpp_test(ObserveMQPU_MPI using/cudaq/platform/observe_mqpu_mpi.cpp
                        TARGET nvidia-mqpu
                        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sphinx/snippets/cpp
                        LAUNCH_COMMAND "${MPIEXEC} --allow-run-as-root -np 2")
      endif()
    endif()
  endif()
endif()

# Only add the python tests if we built the python API
Expand All @@ -55,11 +87,15 @@ if (NOT Python_FOUND)
endif()

# Add a Python-interpreter test of a documentation code example.
# Args:
#   TEST_NAME: name of the test. Test name is prefixed with "pycudaq"
#   SOURCE_LOCATION: location of the Python file (relative to 'sphinx/examples/python' by default)
# Optional keyword args:
#   SOURCE_DIR <DIR>: the directory that SOURCE_LOCATION is relative to (if not the default)
#   LAUNCH_COMMAND <COMMAND>: the command to launch the test (e.g., mpirun)
function(add_pycudaq_test TEST_NAME SOURCE_LOCATION)
  cmake_parse_arguments(PARSED_ARGS "" "SOURCE_DIR;LAUNCH_COMMAND" "" ${ARGN})
  if(NOT PARSED_ARGS_SOURCE_DIR)
    set(PARSED_ARGS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/sphinx/examples/python")
  endif()
  # Run through bash so an optional launcher (e.g. mpirun) can prefix the
  # interpreter invocation.
  add_test(
    NAME
      pycudaq_${TEST_NAME}
    COMMAND
      bash -c "${PARSED_ARGS_LAUNCH_COMMAND} ${Python_EXECUTABLE} ${PARSED_ARGS_SOURCE_DIR}/${SOURCE_LOCATION}"
  )
endfunction()

Expand All @@ -69,3 +105,12 @@ add_pycudaq_test(QAOA qaoa_maxcut.py)
add_pycudaq_test(VQE simple_vqe.py)
add_pycudaq_test(VQEAdvanced advanced_vqe.py)

# Python mqpu snippets: need the custatevec backend (and MPI for the
# multi-process variant).
if (CUSTATEVEC_ROOT AND CUDA_FOUND)
  # Directory holding the documentation's Python snippets.
  set(py_snippet_dir "${CMAKE_CURRENT_SOURCE_DIR}/sphinx/snippets/python")
  add_pycudaq_test(SampleAsync using/cudaq/platform/sample_async.py
    SOURCE_DIR ${py_snippet_dir})
  add_pycudaq_test(ObserveMQPU using/cudaq/platform/observe_mqpu.py
    SOURCE_DIR ${py_snippet_dir})
  # MPI variant: only added when MPI is available and at least 2 GPUs were
  # detected by the C++ section above (NGPUS).
  if (MPI_CXX_FOUND AND ${NGPUS} GREATER_EQUAL 2)
    add_pycudaq_test(ObserveMQPU_MPI using/cudaq/platform/observe_mqpu_mpi.py
      SOURCE_DIR ${py_snippet_dir}
      LAUNCH_COMMAND "${MPIEXEC} --allow-run-as-root -np 2")
  endif()
endif()
39 changes: 39 additions & 0 deletions pr-660/_sources/install.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,45 @@ we refer to the `CUDA Quantum GitHub repository`_.

.. _CUDA Quantum GitHub repository: https://github.com/NVIDIA/cuda-quantum/blob/main/Building.md


CUDA Quantum Dependencies
-------------------------

CUDA Quantum can be used to simulate quantum programs (see :doc:`using/simulators`) on a CPU-only system, but a GPU is highly recommended.

The supported CPUs include x86_64 (x86-64-v3 architecture and newer) and ARM64 architectures.

.. note::

The CUDA Quantum Python wheels depend on an existing CUDA installation on your system. For more information about installing the CUDA Quantum Python wheels, take a look at :ref:`this page <install-python-wheels>`.

The following table summarizes the required components.

.. list-table:: Supported Systems
:widths: 30 50
:header-rows: 0

* - CPU architectures
- x86_64, ARM64
* - Operating System
- Linux
* - Tested Distributions
 - CentOS 8; Debian 11, 12; Fedora 38; OpenSUSE/SLED/SLES 15.5; RHEL 8, 9; Rocky 8, 9; Ubuntu 22.04

.. list-table:: Requirements for GPU Simulation
:widths: 30 50
:header-rows: 0

* - GPU Architectures
- Volta, Turing, Ampere, Ada, Hopper
* - NVIDIA GPU with Compute Capability
- 7.0+
* - CUDA
- 11.x (Driver 470.57.02+), 12.x (Driver 525.60.13+)

Detailed information about supported drivers for different CUDA versions can be found `here <https://docs.nvidia.com/deploy/cuda-compatibility/>`__.


Next Steps
----------

Expand Down
2 changes: 1 addition & 1 deletion pr-660/_sources/using/cudaq.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ application codes.
Generic Library Functions <cudaq/generic_functions>
Creating Kernels at Runtime <cudaq/builder.rst>
Variational Algorithms <cudaq/variational.rst>
Asynchronous Execution <cudaq/platform.rst>
Multi-processor Platforms <cudaq/platform.rst>
Debugging and Verbose Logging <cudaq/verbose_out.rst>
146 changes: 114 additions & 32 deletions pr-660/_sources/using/cudaq/platform.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,37 +17,119 @@ specific asynchronous function invocations targeting a desired QPU.

Here is a simple example demonstrating this

.. code-block:: cpp
auto kernelToBeSampled = [](int runtimeParam) __qpu__ {
cudaq::qreg q(runtimeParam);
h(q);
mz(q);
};
// Get the quantum_platform singleton
auto& platform = cudaq::get_platform();
// Query the number of QPUs in the system
auto num_qpus = platform.num_qpus();
// We will launch asynchronous sampling tasks
// and will store the results immediately as a future
// we can query at some later point
std::vector<cudaq::async_sample_result> countFutures;
for (std::size_t i = 0; i < num_qpus; i++) {
countFutures.emplace_back(cudaq::sample_async(i, kernelToBeSampled, 5 /*runtimeParam*/));
}
//
// Go do other work, asynchronous execution of sample tasks on-going
//
// Get the results, note future::get() will kick off a wait
// if the results are not yet available.
for (auto& counts : countsFutures) {
counts.get().dump();
}
.. literalinclude:: ../../snippets/cpp/using/cudaq/platform/sample_async.cpp
:language: cpp
:start-after: [Begin Documentation]
:end-before: [End Documentation]

CUDA Quantum exposes asynchronous versions of the default :code:`cudaq::` algorithmic
primitive functions like :code:`sample` and :code:`observe`.
primitive functions like :code:`sample` and :code:`observe` (e.g., :code:`cudaq::sample_async` function in the above code snippet).

One can then specify the target multi-QPU architecture (:code:`nvidia-mqpu`) with the :code:`--target` flag:

.. code-block:: console
nvq++ sample_async.cpp -target nvidia-mqpu
./a.out
Depending on the number of GPUs available on the system, the :code:`nvidia-mqpu` platform will create the same number of virtual QPU instances.
For example, on a system with 4 GPUs, the above code will distribute the four sampling tasks among those :code:`GPUEmulatedQPU` instances.

The results might look like the following (4 different random samplings).

.. code-block:: console
Number of QPUs: 4
{ 10011:28 01100:28 ... }
{ 10011:37 01100:25 ... }
{ 10011:29 01100:25 ... }
{ 10011:33 01100:30 ... }
.. note::

By default, the :code:`nvidia-mqpu` platform will utilize all available GPUs (number of QPUs instances is equal to the number of GPUs).
 To specify the number of QPUs to be instantiated, one can set the :code:`CUDAQ_MQPU_NGPUS` environment variable.
For example, :code:`export CUDAQ_MQPU_NGPUS=2` to specify that only 2 QPUs (GPUs) are needed.


An equivalent example in Python is as follows.

.. literalinclude:: ../../snippets/python/using/cudaq/platform/sample_async.py
:language: python
:start-after: [Begin Documentation]

Asynchronous expectation value computations
+++++++++++++++++++++++++++++++++++++++++++

One typical use case of the :code:`nvidia-mqpu` platform is to distribute the
expectation value computations of a multi-term Hamiltonian across multiple virtual QPUs (:code:`GPUEmulatedQPU`).

Here is an example.

.. literalinclude:: ../../snippets/cpp/using/cudaq/platform/observe_mqpu.cpp
:language: cpp
:start-after: [Begin Documentation]
:end-before: [End Documentation]


One can then target the :code:`nvidia-mqpu` platform by:

.. code-block:: console
nvq++ observe_mqpu.cpp -target nvidia-mqpu
./a.out
Equivalently, in Python

.. literalinclude:: ../../snippets/python/using/cudaq/platform/observe_mqpu.py
:language: python
:start-after: [Begin Documentation]

In the above code snippet, since the Hamiltonian contains four non-identity terms, there are four quantum circuits that need to be executed
in order to compute the expectation value of that Hamiltonian and given the quantum state prepared by the ansatz kernel. When the :code:`nvidia-mqpu` platform
is selected, these circuits will be distributed across all available QPUs. The final expectation value result is computed from all QPU execution results.

Parallel distribution mode
++++++++++++++++++++++++++

The CUDA Quantum :code:`nvidia-mqpu` platform supports two modes of parallel distribution of expectation value computation:

* MPI: distribute the expectation value computations across available MPI ranks and GPUs for each Hamiltonian term.
* Thread: distribute the expectation value computations among available GPUs via standard C++ threads (each thread handles one GPU).

For instance, if all GPUs are available on a single node, thread-based parallel distribution
(:code:`cudaq::parallel::thread` in C++ or :code:`cudaq.parallel.thread` in Python, as shown in the above example) is sufficient.
On the other hand, if one wants to distribute the tasks across GPUs on multiple nodes, e.g., on a compute cluster, MPI distribution mode
should be used.

An example of MPI distribution mode usage is as follows:

C++
^^^

.. literalinclude:: ../../snippets/cpp/using/cudaq/platform/observe_mqpu_mpi.cpp
:language: cpp
:start-after: [Begin Documentation]
:end-before: [End Documentation]

.. code-block:: console
nvq++ observe_mqpu_mpi.cpp -target nvidia-mqpu
mpirun -np <N> a.out
Python
^^^^^^

.. literalinclude:: ../../snippets/python/using/cudaq/platform/observe_mqpu_mpi.py
:language: python
:start-after: [Begin Documentation]

.. code-block:: console
 mpirun -np <N> python3 observe_mqpu_mpi.py
In the above examples, the parallel distribution mode was set to :code:`mpi` using :code:`cudaq::parallel::mpi` in C++ or :code:`cudaq.parallel.mpi` in Python.
CUDA Quantum provides MPI utility functions to initialize, finalize, or query (rank, size, etc.) the MPI runtime.
Last but not least, the compiled executable (C++) or Python script needs to be launched with an appropriate MPI command,
e.g., :code:`mpirun`, :code:`mpiexec`, :code:`srun`, etc.
11 changes: 10 additions & 1 deletion pr-660/api/api.html
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
</li>
<li class="toctree-l2"><a class="reference internal" href="../install.html#python-wheels">Python wheels</a></li>
<li class="toctree-l2"><a class="reference internal" href="../install.html#build-cuda-quantum-from-source">Build CUDA Quantum from Source</a></li>
<li class="toctree-l2"><a class="reference internal" href="../install.html#cuda-quantum-dependencies">CUDA Quantum Dependencies</a></li>
<li class="toctree-l2"><a class="reference internal" href="../install.html#next-steps">Next Steps</a></li>
</ul>
</li>
Expand All @@ -117,7 +118,15 @@
<li class="toctree-l2"><a class="reference internal" href="../using/cudaq/generic_functions.html">Generic Library Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="../using/cudaq/builder.html">Creating Kernels at Runtime</a></li>
<li class="toctree-l2"><a class="reference internal" href="../using/cudaq/variational.html">Variational Algorithms</a></li>
<li class="toctree-l2"><a class="reference internal" href="../using/cudaq/platform.html">Asynchronous Execution</a></li>
<li class="toctree-l2"><a class="reference internal" href="../using/cudaq/platform.html">Multi-processor Platforms</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../using/cudaq/platform.html#asynchronous-expectation-value-computations">Asynchronous expectation value computations</a></li>
<li class="toctree-l3"><a class="reference internal" href="../using/cudaq/platform.html#parallel-distribution-mode">Parallel distribution mode</a><ul>
<li class="toctree-l4"><a class="reference internal" href="../using/cudaq/platform.html#c">C++</a></li>
<li class="toctree-l4"><a class="reference internal" href="../using/cudaq/platform.html#python">Python</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../using/cudaq/verbose_out.html">Debugging and Verbose Logging</a></li>
</ul>
</li>
Expand Down
11 changes: 10 additions & 1 deletion pr-660/api/languages/cpp_api.html
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
</li>
<li class="toctree-l2"><a class="reference internal" href="../../install.html#python-wheels">Python wheels</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../install.html#build-cuda-quantum-from-source">Build CUDA Quantum from Source</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../install.html#cuda-quantum-dependencies">CUDA Quantum Dependencies</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../install.html#next-steps">Next Steps</a></li>
</ul>
</li>
Expand All @@ -117,7 +118,15 @@
<li class="toctree-l2"><a class="reference internal" href="../../using/cudaq/generic_functions.html">Generic Library Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../using/cudaq/builder.html">Creating Kernels at Runtime</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../using/cudaq/variational.html">Variational Algorithms</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../using/cudaq/platform.html">Asynchronous Execution</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../using/cudaq/platform.html">Multi-processor Platforms</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../using/cudaq/platform.html#asynchronous-expectation-value-computations">Asynchronous expectation value computations</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../using/cudaq/platform.html#parallel-distribution-mode">Parallel distribution mode</a><ul>
<li class="toctree-l4"><a class="reference internal" href="../../using/cudaq/platform.html#c">C++</a></li>
<li class="toctree-l4"><a class="reference internal" href="../../using/cudaq/platform.html#python">Python</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../using/cudaq/verbose_out.html">Debugging and Verbose Logging</a></li>
</ul>
</li>
Expand Down
Loading

0 comments on commit f91c514

Please sign in to comment.