# When running the tutorials on GPUs we are dependent on the Python garbage collector
# collecting TVM packed function closures for any device memory to also be released. This
# is not a good setup for machines with lots of CPU ram but constrained GPU ram, so force
# a gc after each example.
def force_gc(gallery_conf, fname):
    """Force a Python garbage collection after a sphinx-gallery example runs.

    Device (GPU) memory held by TVM packed-function closures is only released
    once the closures are garbage collected, so collect eagerly after every
    example to avoid intermittent 'CUDA: Out of memory' failures in CI.

    Parameters
    ----------
    gallery_conf : dict
        The sphinx-gallery configuration. Unused here, but required by the
        ``reset_modules`` callback signature (fixes the ``gallery_cong`` typo).
    fname : str
        Filename of the example that just finished executing.
    """
    print("(Forcing Python gc after '{}' to avoid lag in reclaiming CUDA memory)".format(fname))
    gc.collect()
    # gc.garbage lists objects the collector found unreachable but could not
    # free; printing it helps diagnose lingering reference cycles in CI logs.
    print("(Remaining garbage: {})".format(gc.garbage))