Add "How to determine the most efficient device." #1616

Merged · 5 commits · Sep 5, 2023
4 changes: 3 additions & 1 deletion setup.cfg
@@ -25,9 +25,11 @@ ignore =
# do not require docstrings in unit test files
# F401 ignore unused imports in __init__.py files (these are for users)
# D214 ignore overindented sections in Trigger - this is Google napoleon formatting
# N816 ignore mixed case kT variables
# D10* howto guides do not need docstrings
per-file-ignores =
*/pytest/*.py:D100,D101,D102,D103,D104,D105,D106
sphinx-doc/howto/*.py:D100,D101,D102,D103,D104,D105,D106
sphinx-doc/howto/*.py:D100,D101,D102,D103,D104,D105,D106,N816
*/__init__.py: F401
hoomd/version.py: F401
hoomd/trigger.py: D214
136 changes: 136 additions & 0 deletions sphinx-doc/figures/determine-the-most-efficient-device.ipynb
@@ -0,0 +1,136 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "eeeb56f5-3a6b-4f6e-8873-ef30b0f7fa66",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib\n",
"import numpy\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "68b9ef5f-bec7-4eaf-aeba-ced28a10f44d",
"metadata": {},
"outputs": [],
"source": [
"matplotlib.style.use('ggplot')\n",
"matplotlib.rcParams.update({'font.size': 12})\n",
"matplotlib.rcParams.update({'xtick.labelsize': 'x-large'})\n",
"matplotlib.rcParams.update({'xtick.major.size': '0'})\n",
"matplotlib.rcParams.update({'ytick.labelsize': 'x-large'})\n",
"matplotlib.rcParams.update({'ytick.major.size': '0'})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b9b1d3d-1b22-4535-bcd0-1f2558ceeb9c",
"metadata": {},
"outputs": [],
"source": [
"CPU_P = [1, 2, 4, 8, 16, 32, 64]\n",
"CPU_TPS = [2699, 4868, 8043, 12585, 18168, 22394, 25031]\n",
"GPU_TPS = 15955\n",
"CPU_eta = [CPU_TPS[i] / (CPU_TPS[0] * CPU_P[i]) for i in range(len(CPU_TPS))]\n",
"\n",
"fig = matplotlib.figure.Figure(figsize=(7, 4.32624056*2), dpi=100)\n",
"ax = fig.add_subplot(2, 1, 1)\n",
"ax.plot(CPU_P, CPU_TPS, 's', color='C0', label='CPU')\n",
"ax.hlines(y=GPU_TPS, xmin=1, xmax=64, color='C1', label='GPU')\n",
"ax.set_xlabel('P')\n",
"ax.set_ylabel('TPS')\n",
"ax.legend()\n",
"\n",
"ax = fig.add_subplot(2, 1, 2)\n",
"ax.plot(CPU_P, CPU_eta, 's', color='C0')\n",
"ax.hlines(y=GPU_TPS / (CPU_TPS[0] * 64), xmin=1, xmax=64, color='C1')\n",
"ax.set_xlabel('P')\n",
"ax.set_ylabel('$\\eta$')\n",
"fig"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "55211aa0-0eea-4522-b529-63111d25d007",
"metadata": {},
"outputs": [],
"source": [
"fig.savefig('../howto/wca-efficiency-2048.svg', bbox_inches='tight', facecolor=(1, 1, 1, 1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b937ea0-8ca0-4803-ad29-b7b57a945d03",
"metadata": {},
"outputs": [],
"source": [
"CPU_P = [1, 2, 4, 8, 16, 32, 64, 128, 256]\n",
"CPU_TPS = [36.072, 61.988, 143.25, 281.35, 502.48, 910.58, 1451.5, 2216.1, 2706.8]\n",
"GPU_TPS = 7276.5\n",
"CPU_eta = [CPU_TPS[i] / (CPU_TPS[0] * CPU_P[i]) for i in range(len(CPU_TPS))]\n",
"\n",
"fig = matplotlib.figure.Figure(figsize=(7, 4.32624056*2), dpi=100)\n",
"ax = fig.add_subplot(2, 1, 1)\n",
"ax.plot(CPU_P, CPU_TPS, 's', color='C0', label='CPU')\n",
"ax.hlines(y=GPU_TPS, xmin=1, xmax=256, color='C1', label='GPU')\n",
"ax.set_xlabel('P')\n",
"ax.set_ylabel('TPS')\n",
"ax.legend()\n",
"\n",
"ax = fig.add_subplot(2, 1, 2)\n",
"ax.plot(CPU_P, CPU_eta, 's', color='C0')\n",
"ax.hlines(y=GPU_TPS / (CPU_TPS[0] * 64), xmin=1, xmax=256, color='C1')\n",
"ax.set_xlabel('P')\n",
"ax.set_ylabel('$\\eta$')\n",
"fig"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c03bf456-f13e-43c0-9513-e31a31523443",
"metadata": {},
"outputs": [],
"source": [
"fig.savefig('../howto/wca-efficiency-131072.svg', bbox_inches='tight', facecolor=(1, 1, 1, 1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7371b729-f9ee-481a-b3af-82b358ef55d2",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
1 change: 1 addition & 0 deletions sphinx-doc/how-to.rst
@@ -7,6 +7,7 @@ How-to
.. toctree::
:maxdepth: 1

howto/determine-the-most-efficient-device
howto/molecular
howto/cpppotential
howto/custom-md-potential
6 changes: 2 additions & 4 deletions sphinx-doc/howto/custom-md-potential.rst
@@ -8,15 +8,13 @@ There are several different methods to apply arbitrary forces on particles in MD:

1. Implement a C++ :doc:`component <../components>` that evaluates the force/potential. Use one of
the `example plugins`_ as a reference.

1. Use the appropriate tabulated potential:
2. Use the appropriate tabulated potential:

* `hoomd.md.pair.Table`
* `hoomd.md.bond.Table`
* `hoomd.md.angle.Table`
* `hoomd.md.dihedral.Table`

1. Implement a Python subclass `hoomd.md.force.Custom` that evaluates the force/potential.
3. Implement a Python subclass `hoomd.md.force.Custom` that evaluates the force/potential.

C++ components provide the highest performance, accuracy, and flexibility. Tabulated potentials
provide moderate performance and accuracy is limited by the interpolation. The performance and
47 changes: 47 additions & 0 deletions sphinx-doc/howto/determine-the-most-efficient-device.py
@@ -0,0 +1,47 @@
import hoomd
import argparse

kT = 1.2

# Parse command line arguments.
parser = argparse.ArgumentParser()
parser.add_argument('--device', default='CPU')
parser.add_argument('--replicate', default=1, type=int)
parser.add_argument('--steps', default=10_000, type=int)
args = parser.parse_args()

# Create WCA MD simulation
device = getattr(hoomd.device, args.device)()
simulation = hoomd.Simulation(device=device, seed=1)
simulation.create_state_from_gsd(filename='spheres.gsd')
simulation.state.replicate(
nx=args.replicate,
ny=args.replicate,
nz=args.replicate,
)
simulation.state.thermalize_particle_momenta(filter=hoomd.filter.All(), kT=kT)

cell = hoomd.md.nlist.Cell(buffer=0.2)
lj = hoomd.md.pair.LJ(nlist=cell)
lj.params[('A', 'A')] = dict(sigma=1, epsilon=1)
lj.r_cut[('A', 'A')] = 2**(1 / 6)

constant_volume = hoomd.md.methods.ConstantVolume(
filter=hoomd.filter.All(),
thermostat=hoomd.md.methods.thermostats.Bussi(kT=kT))

simulation.operations.integrator = hoomd.md.Integrator(
dt=0.001, methods=[constant_volume], forces=[lj])

# Wait until GPU kernel parameter autotuning is complete.
if args.device == 'GPU':
simulation.run(100)
while not simulation.operations.is_tuning_complete:
simulation.run(100)

# Warm up memory caches and pre-computed quantities.
simulation.run(args.steps)

# Run the benchmark and print the performance.
simulation.run(args.steps)
device.notice(f'TPS: {simulation.tps:0.5g}')
139 changes: 139 additions & 0 deletions sphinx-doc/howto/determine-the-most-efficient-device.rst
@@ -0,0 +1,139 @@
.. Copyright (c) 2009-2023 The Regents of the University of Michigan.
.. Part of HOOMD-blue, released under the BSD 3-Clause License.

How to determine the most efficient device
==========================================

Execute benchmarks of your simulation on a variety of device configurations, then compare the
results to determine which is the most efficient. Your simulation model, parameters, system size,
and available hardware all impact the resulting performance. When benchmarking, ensure that all GPU
kernels have completed autotuning and that the memory caches are warm before measuring
performance.

For example:

.. literalinclude:: determine-the-most-efficient-device.py
:language: python

Example Results (N=2048)
------------------------

On AMD EPYC 7742 (PSC Bridges-2) and NVIDIA A100 (NCSA Delta), this script reports
(``$ mpirun -n $P python3 determine-the-most-efficient-device.py --device $PROCESSOR``):

.. list-table::
:header-rows: 1

* - Processor
- P
- TPS
* - CPU
- 1
- 2699
* - CPU
- 2
- 4868
* - CPU
- 4
- 8043
* - CPU
- 8
- 12585
* - CPU
- 16
- 18168
* - CPU
- 32
- 22394
* - CPU
- 64
- 25031
* - GPU
- 1
- 15955

The optimal device selection depends on the metric. When the metric is wall clock time alone,
choose the configuration with the highest measured performance. When the metric is cost, choose
based on the efficiency of each device configuration.

One cost metric is compute time. Most HPC resources assign a cost by CPU core hours.
Some HPC resources may assign an effective cost to GPUs. When this is not the case, use the ratio of
available GPU hours to CPU core hours as a substitute. This example will assign a relative cost of
1 GPU hour to 64 CPU core hours. The efficiency is:

.. math::

\eta =
\begin{cases}
\frac{S_\mathrm{P\ CPUs}}{S_\mathrm{1\ CPU}} \cdot \frac{1}{P} & \mathrm{CPU} \\
\frac{S_\mathrm{P\ GPUs}}{S_\mathrm{1\ CPU}} \cdot \frac{1}{64 P} & \mathrm{GPU} \\
\end{cases}

where :math:`S` is the relevant performance metric.
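
As a quick sanity check (not part of the diff), the efficiency definition above can be evaluated
directly from the N=2048 table. The numbers below are copied from the benchmark results in this PR,
and the 64x GPU cost ratio is the assumption stated in the text:

```python
# Evaluate the efficiency formula above for the N=2048 benchmark data.
CPU_P = [1, 2, 4, 8, 16, 32, 64]
CPU_TPS = [2699, 4868, 8043, 12585, 18168, 22394, 25031]
GPU_TPS = 15955

# CPU: eta = S_(P CPUs) / (S_(1 CPU) * P)
cpu_eta = [tps / (CPU_TPS[0] * p) for p, tps in zip(CPU_P, CPU_TPS)]

# GPU (P=1): eta = S_(P GPUs) / (S_(1 CPU) * 64 * P),
# using the assumed cost of 1 GPU hour = 64 CPU core hours.
gpu_eta = GPU_TPS / (CPU_TPS[0] * 64 * 1)

print(f'CPU eta at P=8: {cpu_eta[3]:.2f}')  # the "middle ground" quoted in the text
print(f'GPU eta: {gpu_eta:.3f}')
```

Running this shows every ``cpu_eta`` entry exceeds ``gpu_eta``, confirming the conclusion below
that the CPU is the more efficient choice at this system size.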

.. image:: wca-efficiency-2048.svg
:alt: Performance and efficiency of 2048 particle WCA simulations.

With 2048 particles in this example, the CPU is always more efficient than the GPU and the CPU is
faster than the GPU when :math:`P \ge 16`. Therefore, the CPU is always the optimal choice. Choose a
number of ranks :math:`P` depending on project needs and budgets. Larger values of :math:`P` will
provide results with lower latency at the cost of more CPU core hours. In this example, :math:`P=8`
(:math:`\eta \sim 0.6`) is a middle ground providing a significant reduction in time to solution at
a moderate extra cost in CPU core hours.

Example Results (N=131,072)
---------------------------

The results are very different with 131,072 particles
(``$ mpirun -n $P python3 determine-the-most-efficient-device.py --device $PROCESSOR --replicate=4``):

.. list-table::
:header-rows: 1

* - Processor
- P
- TPS
* - CPU
- 1
- 36.072
* - CPU
- 2
- 61.988
* - CPU
- 4
- 143.25
* - CPU
- 8
- 281.35
* - CPU
- 16
- 502.48
* - CPU
- 32
- 910.58
* - CPU
- 64
- 1451.5
* - CPU
- 128
- 2216.1
* - CPU
- 256
- 2706.8
* - GPU
- 1
- 7276.5

.. image:: wca-efficiency-131072.svg
:alt: Performance and efficiency of 131,072 particle WCA simulations.

At this system size, the GPU is always both faster and more efficient than the CPU.

Compare the two examples and notice that the TPS achieved by the GPU is only cut in half when the
system size is increased by a factor of 64. This signals that the smaller system size was not able
to utilize all the parallel processing units on the GPU.
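
The scaling observation in the paragraph above can be checked with simple arithmetic on the
reported numbers (a back-of-the-envelope sketch, not part of the diff):

```python
# GPU TPS at the two system sizes reported in this how-to.
small_n, large_n = 2048, 131_072
small_tps, large_tps = 15955, 7276.5

size_ratio = large_n / small_n      # 64x more particles
tps_ratio = small_tps / large_tps   # TPS drops by only ~2.2x

# Effective throughput in particle-steps per second grows ~29x with the
# larger system, so the 2048-particle run leaves most GPU units idle.
throughput_gain = (large_n * large_tps) / (small_n * small_tps)
print(f'{size_ratio:.0f}x particles, {tps_ratio:.1f}x lower TPS, '
      f'{throughput_gain:.0f}x more particle-steps/s')
```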

.. note::

Use trial moves per second (`hoomd.hpmc.integrate.HPMCIntegrator.mps`) as the performance
metric when benchmarking HPMC simulations.
Binary file added sphinx-doc/howto/spheres.gsd
Binary file not shown.