From 7447264275f551fd758c8a61556a6a62ae5932e7 Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Tue, 12 Mar 2024 12:16:38 +0100 Subject: [PATCH 1/2] Make TF CUDA unit tests dependent on tf_cuda_support tool --- PhysicsTools/TensorFlow/test/BuildFile.xml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/PhysicsTools/TensorFlow/test/BuildFile.xml b/PhysicsTools/TensorFlow/test/BuildFile.xml index 03ca557c61619..c51046ba68ac9 100644 --- a/PhysicsTools/TensorFlow/test/BuildFile.xml +++ b/PhysicsTools/TensorFlow/test/BuildFile.xml @@ -5,7 +5,7 @@ - + @@ -29,7 +29,7 @@ - + @@ -52,7 +52,7 @@ - + @@ -75,7 +75,8 @@ - + + @@ -98,7 +99,7 @@ - + @@ -121,7 +122,8 @@ - + + @@ -145,7 +147,7 @@ - + From b9b41e4c8166dca668644f4aef63429fb9c4aebf Mon Sep 17 00:00:00 2001 From: Davide Valsecchi Date: Wed, 13 Mar 2024 00:09:09 +0100 Subject: [PATCH 2/2] Checking if Nvidia GPUs are visible to TF when visible to cmssw --- PhysicsTools/TensorFlow/src/TensorFlow.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/PhysicsTools/TensorFlow/src/TensorFlow.cc b/PhysicsTools/TensorFlow/src/TensorFlow.cc index fcb09e2e9c449..786166ef407a9 100644 --- a/PhysicsTools/TensorFlow/src/TensorFlow.cc +++ b/PhysicsTools/TensorFlow/src/TensorFlow.cc @@ -42,6 +42,15 @@ namespace tensorflow { // NVidia GPU else if (backend == Backend::cuda) { if (not ri->nvidiaDriverVersion().empty()) { + // Check if one GPU device is visible to TF + // If not, an exception is raised --> this can happen in case of driver version mismatch + // or missing CUDA support in TF compilation + if ((*_options.config.mutable_device_count())["GPU"] == 0) { + edm::Exception ex(edm::errors::UnavailableAccelerator); + ex << "Cuda backend requested, NVIDIA GPU visible to cmssw, but not visible to TensorFlow in the job"; + ex.addContext("Calling tensorflow::setBackend()"); + throw ex; + } // Take only the first GPU in the CUDA_VISIBLE_DEVICE list (*_options.config.mutable_device_count())["GPU"] = 1; _options.config.mutable_gpu_options()->set_visible_device_list("0");