Force a gc between sphinx-gallery items to reclaim GPU memory. (apache#8722)

GPU memory is only released once the PackedFunc used to evaluate the model is
garbage-collected by Python. In CI we're noticing intermittent 'CUDA: Out of memory'
failures while processing the tutorials, and tracing showed that no gc was happening
between items. Not confident this will solve the problem, but worth a try.
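The failure mode described above can be reproduced in miniature: an object caught in a reference cycle is not freed by reference counting alone, so whatever it holds on to lingers until the collector runs. A minimal sketch, with a hypothetical `FakeDeviceBuffer` standing in for a TVM PackedFunc closure that pins device memory:

```python
import gc


class FakeDeviceBuffer:
    """Hypothetical stand-in for a closure holding GPU memory."""

    freed = []  # records which buffers have been released

    def __init__(self, name):
        self.name = name
        self.self_ref = self  # reference cycle: refcounting alone won't free this

    def __del__(self):
        # Pretend this releases device memory.
        FakeDeviceBuffer.freed.append(self.name)


buf = FakeDeviceBuffer("tutorial-model")
del buf        # the cycle keeps the object alive; __del__ is NOT called yet
gc.collect()   # the cycle collector breaks the cycle and finalizes the object
print(FakeDeviceBuffer.freed)  # prints ['tutorial-model']
```

This is why dropping the last visible reference to a model is not enough on a GPU-constrained machine: until the collector happens to run, the device memory stays allocated, hence the explicit `gc.collect()` between examples.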
mbs-octoml authored and ylc committed Sep 29, 2021
1 parent 05c9dfe commit c60afba
Showing 1 changed file with 12 additions and 0 deletions: docs/conf.py
@@ -29,6 +29,7 @@
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import gc
import sys
import inspect
import os, subprocess
@@ -300,6 +301,16 @@ def __call__(self, filename):
return filename


# When running the tutorials on GPUs we are dependent on the Python garbage collector
# collecting TVM packed function closures for any device memory to also be released. This
# is not a good setup for machines with lots of CPU ram but constrained GPU ram, so force
# a gc after each example.
def force_gc(gallery_conf, fname):
print("(Forcing Python gc after '{}' to avoid lag in reclaiming CUDA memory)".format(fname))
gc.collect()
print("(Remaining garbage: {})".format(gc.garbage))


sphinx_gallery_conf = {
"backreferences_dir": "gen_modules/backreferences",
"doc_module": ("tvm", "numpy"),
@@ -317,6 +328,7 @@ def __call__(self, filename):
"download_all_examples": False,
"min_reported_time": 60,
"expected_failing_examples": [],
"reset_modules": (force_gc, "matplotlib", "seaborn"),
}

autodoc_default_options = {
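The hook being added works because sphinx-gallery calls each callable in `reset_modules` with `(gallery_conf, fname)` around every example; string entries such as `"matplotlib"` name built-in resetters. A simplified sketch of that dispatch (the loop, `run_example`, and the config here are illustrative, not sphinx-gallery's actual internals):

```python
import gc

calls = []  # record of which examples triggered a reset


def force_gc(gallery_conf, fname):
    # Same shape as the resetter in the diff above: sphinx-gallery
    # passes the gallery config dict and the example filename.
    calls.append(fname)
    gc.collect()


# Illustrative config; real entries may also be strings naming
# built-in resetters like "matplotlib" or "seaborn".
sphinx_gallery_conf = {"reset_modules": (force_gc,)}


def run_example(fname, conf):
    # Simplified stand-in for sphinx-gallery's dispatch: invoke each
    # callable resetter for the example being processed.
    for resetter in conf["reset_modules"]:
        if callable(resetter):
            resetter(conf, fname)


run_example("tutorial.py", sphinx_gallery_conf)
print(calls)  # prints ['tutorial.py']
```

Registering `force_gc` this way means the collector runs once per tutorial, bounding how long stale PackedFunc closures can pin CUDA memory.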
