Skip to content

Commit

Permalink
Docs preview for PR #660.
Browse files Browse the repository at this point in the history
  • Loading branch information
cuda-quantum-bot committed Sep 29, 2023
1 parent 2a11cc7 commit f91c514
Show file tree
Hide file tree
Showing 68 changed files with 1,370 additions and 163 deletions.
65 changes: 55 additions & 10 deletions pr-660/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,29 @@
# the terms of the Apache License 2.0 which accompanies this distribution. #
# ============================================================================ #

# Add nvq++ compile + execution test of code examples
# Args:
#   TEST_NAME: name of the test executable. Test name is prefixed with "nvqpp"
#   SOURCE_LOCATION: location of the source file (relative to 'sphinx/examples/cpp' directory by default)
# Optional keyword args:
#   TARGET <TARGET_NAME>: name of the target passed to nvq++ via --target
#   SOURCE_DIR <DIR>: the directory that SOURCE_LOCATION is relative to (if not the default)
#   LAUNCH_COMMAND <COMMAND>: the command to launch the test (e.g., mpirun)
function(add_nvqpp_test TEST_NAME SOURCE_LOCATION)
  cmake_parse_arguments(PARSED_ARGS "" "TARGET;SOURCE_DIR;LAUNCH_COMMAND" "" ${ARGN})
  set(NVQPP_COMPILE_ARGS "")
  if(PARSED_ARGS_TARGET)
    # Compile args are a single shell string, so append as text (not a CMake list).
    string(APPEND NVQPP_COMPILE_ARGS " --target ${PARSED_ARGS_TARGET}")
  endif()
  if(NOT PARSED_ARGS_SOURCE_DIR)
    set(PARSED_ARGS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/sphinx/examples/cpp")
  endif()
  # Compile with nvq++ and then run the produced executable in a single shell
  # command; a compile failure propagates because the run step then fails.
  add_test(
    NAME
      nvqpp_${TEST_NAME}
    COMMAND
      bash -c "${CMAKE_BINARY_DIR}/bin/nvq++ ${NVQPP_COMPILE_ARGS} ${PARSED_ARGS_SOURCE_DIR}/${SOURCE_LOCATION} -o ${TEST_NAME} ;\
      ${PARSED_ARGS_LAUNCH_COMMAND} ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}"
  )
endfunction()

Expand All @@ -34,13 +50,29 @@ add_nvqpp_test(IterativePhaseEstimation other/iterative_qpe.cpp)
add_nvqpp_test(RandomWalkPhaseEstimation other/random_walk_qpe.cpp)

# The cuQuantum-backed example needs the custatevec library and a CUDA toolkit.
if (CUSTATEVEC_ROOT AND CUDA_FOUND)
  add_nvqpp_test(CuQuantumBackend basics/cuquantum_backends.cpp TARGET nvidia)
endif()

# mqpu code snippets, needs custatevec backend and (optionally) MPI
# NGPUS defaults to 0 and is also read by the Python mqpu test section below.
set(NGPUS 0)
if (CUSTATEVEC_ROOT AND CUDA_FOUND)
  add_nvqpp_test(SampleAsync using/cudaq/platform/sample_async.cpp TARGET nvidia-mqpu SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sphinx/snippets/cpp)
  add_nvqpp_test(ObserveMQPU using/cudaq/platform/observe_mqpu.cpp TARGET nvidia-mqpu SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sphinx/snippets/cpp)
  # Add the MPI test if MPI was found and there are at least 2 GPUs
  if (MPI_CXX_FOUND)
    # Count the number of GPUs
    find_program(NVIDIA_SMI "nvidia-smi")
    if(NVIDIA_SMI)
      # Strip the trailing newline from `wc -l`, otherwise the numeric
      # comparison below would see e.g. "4\n" instead of "4".
      execute_process(COMMAND bash -c "nvidia-smi --list-gpus | wc -l"
                      OUTPUT_VARIABLE NGPUS
                      OUTPUT_STRIP_TRAILING_WHITESPACE)
      # Only add this test if we have at least 2 GPUs (the test runs with -np 2)
      if (${NGPUS} GREATER_EQUAL 2)
        add_nvqpp_test(ObserveMQPU_MPI using/cudaq/platform/observe_mqpu_mpi.cpp
                        TARGET nvidia-mqpu
                        SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/sphinx/snippets/cpp
                        LAUNCH_COMMAND "${MPIEXEC} --allow-run-as-root -np 2")
      endif()
    endif()
  endif()
endif()

# Only add the python tests if we built the python API
Expand All @@ -55,11 +87,15 @@ if (NOT Python_FOUND)
endif()

# Add a Python-interpreter test of a documentation code example.
# Args:
#   TEST_NAME: name of the test. Test name is prefixed with "pycudaq"
#   SOURCE_LOCATION: location of the Python file (relative to 'sphinx/examples/python' by default)
# Optional keyword args:
#   SOURCE_DIR <DIR>: the directory that SOURCE_LOCATION is relative to (if not the default)
#   LAUNCH_COMMAND <COMMAND>: the command to launch the test (e.g., mpirun)
function(add_pycudaq_test TEST_NAME SOURCE_LOCATION)
  cmake_parse_arguments(PARSED_ARGS "" "SOURCE_DIR;LAUNCH_COMMAND" "" ${ARGN})
  if(NOT PARSED_ARGS_SOURCE_DIR)
    set(PARSED_ARGS_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/sphinx/examples/python")
  endif()
  # Run through bash so an optional launcher (e.g. mpirun) can prefix the
  # interpreter invocation.
  add_test(
    NAME
      pycudaq_${TEST_NAME}
    COMMAND
      bash -c "${PARSED_ARGS_LAUNCH_COMMAND} ${Python_EXECUTABLE} ${PARSED_ARGS_SOURCE_DIR}/${SOURCE_LOCATION}"
  )
endfunction()

Expand All @@ -69,3 +105,12 @@ add_pycudaq_test(QAOA qaoa_maxcut.py)
add_pycudaq_test(VQE simple_vqe.py)
add_pycudaq_test(VQEAdvanced advanced_vqe.py)

# Python mqpu snippets: need the custatevec backend (and MPI for the
# multi-process variant).
if (CUSTATEVEC_ROOT AND CUDA_FOUND)
  # Directory holding the documentation's Python snippets.
  set(py_snippet_dir "${CMAKE_CURRENT_SOURCE_DIR}/sphinx/snippets/python")
  add_pycudaq_test(SampleAsync using/cudaq/platform/sample_async.py
    SOURCE_DIR ${py_snippet_dir})
  add_pycudaq_test(ObserveMQPU using/cudaq/platform/observe_mqpu.py
    SOURCE_DIR ${py_snippet_dir})
  # MPI variant: only added when MPI is available and at least 2 GPUs were
  # detected by the C++ section above (NGPUS).
  if (MPI_CXX_FOUND AND ${NGPUS} GREATER_EQUAL 2)
    add_pycudaq_test(ObserveMQPU_MPI using/cudaq/platform/observe_mqpu_mpi.py
      SOURCE_DIR ${py_snippet_dir}
      LAUNCH_COMMAND "${MPIEXEC} --allow-run-as-root -np 2")
  endif()
endif()
39 changes: 39 additions & 0 deletions pr-660/_sources/install.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,45 @@ we refer to the `CUDA Quantum GitHub repository`_.

.. _CUDA Quantum GitHub repository: https://github.com/NVIDIA/cuda-quantum/blob/main/Building.md


CUDA Quantum Dependencies
-------------------------

CUDA Quantum can be used to simulate quantum programs (see :doc:`using/simulators`) on a CPU-only system, but a GPU is highly recommended.

The supported CPUs include x86_64 (x86-64-v3 architecture and newer) and ARM64 architectures.

.. note::

The CUDA Quantum Python wheels depend on an existing CUDA installation on your system. For more information about installing the CUDA Quantum Python wheels, take a look at :ref:`this page <install-python-wheels>`.

The following table summarizes the required components.

.. list-table:: Supported Systems
:widths: 30 50
:header-rows: 0

* - CPU architectures
- x86_64, ARM64
* - Operating System
- Linux
* - Tested Distributions
 - CentOS 8; Debian 11, 12; Fedora 38; OpenSUSE/SLED/SLES 15.5; RHEL 8, 9; Rocky 8, 9; Ubuntu 22.04

.. list-table:: Requirements for GPU Simulation
:widths: 30 50
:header-rows: 0

* - GPU Architectures
- Volta, Turing, Ampere, Ada, Hopper
* - NVIDIA GPU with Compute Capability
- 7.0+
* - CUDA
- 11.x (Driver 470.57.02+), 12.x (Driver 525.60.13+)

Detailed information about supported drivers for different CUDA versions can be found `here <https://docs.nvidia.com/deploy/cuda-compatibility/>`__.


Next Steps
----------

Expand Down
2 changes: 1 addition & 1 deletion pr-660/_sources/using/cudaq.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ application codes.
Generic Library Functions <cudaq/generic_functions>
Creating Kernels at Runtime <cudaq/builder.rst>
Variational Algorithms <cudaq/variational.rst>
Asynchronous Execution <cudaq/platform.rst>
Multi-processor Platforms <cudaq/platform.rst>
Debugging and Verbose Logging <cudaq/verbose_out.rst>
146 changes: 114 additions & 32 deletions pr-660/_sources/using/cudaq/platform.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,37 +17,119 @@ specific asynchronous function invocations targeting a desired QPU.

Here is a simple example demonstrating this

.. code-block:: cpp
auto kernelToBeSampled = [](int runtimeParam) __qpu__ {
cudaq::qreg q(runtimeParam);
h(q);
mz(q);
};
// Get the quantum_platform singleton
auto& platform = cudaq::get_platform();
// Query the number of QPUs in the system
auto num_qpus = platform.num_qpus();
// We will launch asynchronous sampling tasks
// and will store the results immediately as a future
// we can query at some later point
std::vector<cudaq::async_sample_result> countFutures;
for (std::size_t i = 0; i < num_qpus; i++) {
countFutures.emplace_back(cudaq::sample_async(i, kernelToBeSampled, 5 /*runtimeParam*/));
}
//
// Go do other work, asynchronous execution of sample tasks on-going
//
// Get the results, note future::get() will kick off a wait
// if the results are not yet available.
for (auto& counts : countsFutures) {
counts.get().dump();
}
.. literalinclude:: ../../snippets/cpp/using/cudaq/platform/sample_async.cpp
:language: cpp
:start-after: [Begin Documentation]
:end-before: [End Documentation]

CUDA Quantum exposes asynchronous versions of the default :code:`cudaq::` algorithmic
primitive functions like :code:`sample` and :code:`observe`.
primitive functions like :code:`sample` and :code:`observe` (e.g., :code:`cudaq::sample_async` function in the above code snippet).

One can then specify the target multi-QPU architecture (:code:`nvidia-mqpu`) with the :code:`--target` flag:

.. code-block:: console
nvq++ sample_async.cpp -target nvidia-mqpu
./a.out
Depending on the number of GPUs available on the system, the :code:`nvidia-mqpu` platform will create the same number of virtual QPU instances.
For example, on a system with 4 GPUs, the above code will distribute the four sampling tasks among those :code:`GPUEmulatedQPU` instances.

The results might look like the following (4 different random samplings).

.. code-block:: console
Number of QPUs: 4
{ 10011:28 01100:28 ... }
{ 10011:37 01100:25 ... }
{ 10011:29 01100:25 ... }
{ 10011:33 01100:30 ... }
.. note::

By default, the :code:`nvidia-mqpu` platform will utilize all available GPUs (number of QPUs instances is equal to the number of GPUs).
 To specify the number of QPUs to be instantiated, one can set the :code:`CUDAQ_MQPU_NGPUS` environment variable.
For example, :code:`export CUDAQ_MQPU_NGPUS=2` to specify that only 2 QPUs (GPUs) are needed.


An equivalent example in Python is as follows.

.. literalinclude:: ../../snippets/python/using/cudaq/platform/sample_async.py
:language: python
:start-after: [Begin Documentation]

Asynchronous expectation value computations
+++++++++++++++++++++++++++++++++++++++++++

One typical use case of the :code:`nvidia-mqpu` platform is to distribute the
expectation value computations of a multi-term Hamiltonian across multiple virtual QPUs (:code:`GPUEmulatedQPU`).

Here is an example.

.. literalinclude:: ../../snippets/cpp/using/cudaq/platform/observe_mqpu.cpp
:language: cpp
:start-after: [Begin Documentation]
:end-before: [End Documentation]


One can then target the :code:`nvidia-mqpu` platform by:

.. code-block:: console
nvq++ observe_mqpu.cpp -target nvidia-mqpu
./a.out
Equivalently, in Python

.. literalinclude:: ../../snippets/python/using/cudaq/platform/observe_mqpu.py
:language: python
:start-after: [Begin Documentation]

In the above code snippet, since the Hamiltonian contains four non-identity terms, there are four quantum circuits that need to be executed
in order to compute the expectation value of that Hamiltonian and given the quantum state prepared by the ansatz kernel. When the :code:`nvidia-mqpu` platform
is selected, these circuits will be distributed across all available QPUs. The final expectation value result is computed from all QPU execution results.

Parallel distribution mode
++++++++++++++++++++++++++

The CUDA Quantum :code:`nvidia-mqpu` platform supports two modes of parallel distribution of expectation value computation:

* MPI: distribute the expectation value computations across available MPI ranks and GPUs for each Hamiltonian term.
* Thread: distribute the expectation value computations among available GPUs via standard C++ threads (each thread handles one GPU).

For instance, if all GPUs are available on a single node, thread-based parallel distribution
(:code:`cudaq::parallel::thread` in C++ or :code:`cudaq.parallel.thread` in Python, as shown in the above example) is sufficient.
On the other hand, if one wants to distribute the tasks across GPUs on multiple nodes, e.g., on a compute cluster, MPI distribution mode
should be used.

An example of MPI distribution mode usage is as follows:

C++
^^^

.. literalinclude:: ../../snippets/cpp/using/cudaq/platform/observe_mqpu_mpi.cpp
:language: cpp
:start-after: [Begin Documentation]
:end-before: [End Documentation]

.. code-block:: console
nvq++ observe_mqpu_mpi.cpp -target nvidia-mqpu
mpirun -np <N> a.out
Python
^^^^^^

.. literalinclude:: ../../snippets/python/using/cudaq/platform/observe_mqpu_mpi.py
:language: python
:start-after: [Begin Documentation]

.. code-block:: console
 mpirun -np <N> python3 observe_mqpu_mpi.py
In the above examples, the parallel distribution mode was set to :code:`mpi` using :code:`cudaq::parallel::mpi` in C++ or :code:`cudaq.parallel.mpi` in Python.
CUDA Quantum provides MPI utility functions to initialize, finalize, or query (rank, size, etc.) the MPI runtime.
Last but not least, the compiled executable (C++) or Python script needs to be launched with an appropriate MPI command,
e.g., :code:`mpirun`, :code:`mpiexec`, :code:`srun`, etc.
11 changes: 10 additions & 1 deletion pr-660/api/api.html
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
</li>
<li class="toctree-l2"><a class="reference internal" href="../install.html#python-wheels">Python wheels</a></li>
<li class="toctree-l2"><a class="reference internal" href="../install.html#build-cuda-quantum-from-source">Build CUDA Quantum from Source</a></li>
<li class="toctree-l2"><a class="reference internal" href="../install.html#cuda-quantum-dependencies">CUDA Quantum Dependencies</a></li>
<li class="toctree-l2"><a class="reference internal" href="../install.html#next-steps">Next Steps</a></li>
</ul>
</li>
Expand All @@ -117,7 +118,15 @@
<li class="toctree-l2"><a class="reference internal" href="../using/cudaq/generic_functions.html">Generic Library Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="../using/cudaq/builder.html">Creating Kernels at Runtime</a></li>
<li class="toctree-l2"><a class="reference internal" href="../using/cudaq/variational.html">Variational Algorithms</a></li>
<li class="toctree-l2"><a class="reference internal" href="../using/cudaq/platform.html">Asynchronous Execution</a></li>
<li class="toctree-l2"><a class="reference internal" href="../using/cudaq/platform.html">Multi-processor Platforms</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../using/cudaq/platform.html#asynchronous-expectation-value-computations">Asynchronous expectation value computations</a></li>
<li class="toctree-l3"><a class="reference internal" href="../using/cudaq/platform.html#parallel-distribution-mode">Parallel distribution mode</a><ul>
<li class="toctree-l4"><a class="reference internal" href="../using/cudaq/platform.html#c">C++</a></li>
<li class="toctree-l4"><a class="reference internal" href="../using/cudaq/platform.html#python">Python</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../using/cudaq/verbose_out.html">Debugging and Verbose Logging</a></li>
</ul>
</li>
Expand Down
11 changes: 10 additions & 1 deletion pr-660/api/languages/cpp_api.html
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
</li>
<li class="toctree-l2"><a class="reference internal" href="../../install.html#python-wheels">Python wheels</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../install.html#build-cuda-quantum-from-source">Build CUDA Quantum from Source</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../install.html#cuda-quantum-dependencies">CUDA Quantum Dependencies</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../install.html#next-steps">Next Steps</a></li>
</ul>
</li>
Expand All @@ -117,7 +118,15 @@
<li class="toctree-l2"><a class="reference internal" href="../../using/cudaq/generic_functions.html">Generic Library Functions</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../using/cudaq/builder.html">Creating Kernels at Runtime</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../using/cudaq/variational.html">Variational Algorithms</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../using/cudaq/platform.html">Asynchronous Execution</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../using/cudaq/platform.html">Multi-processor Platforms</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../using/cudaq/platform.html#asynchronous-expectation-value-computations">Asynchronous expectation value computations</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../using/cudaq/platform.html#parallel-distribution-mode">Parallel distribution mode</a><ul>
<li class="toctree-l4"><a class="reference internal" href="../../using/cudaq/platform.html#c">C++</a></li>
<li class="toctree-l4"><a class="reference internal" href="../../using/cudaq/platform.html#python">Python</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../using/cudaq/verbose_out.html">Debugging and Verbose Logging</a></li>
</ul>
</li>
Expand Down
Loading

0 comments on commit f91c514

Please sign in to comment.