Replies: 2 comments
-
Hi @jverpeut, unfortunately this is something that is known about the Single Instance model. In terms of memory usage, top-down requires the least and single instance requires the most. One thing you could try is to set the input scaling a bit lower. Any input scaling less than 1 will downsample the images and essentially throw pixels out. This is fine when the features/body parts you are tracking are at least a few pixels large, but exercise caution when tracking smaller features such as antennae/insect legs, as you might accidentally throw out the entire body part. I am surprised that the GPU with greater memory cannot run the same model (assuming all the hyperparameters were the same across both machines?). Thanks,
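If it helps, here is a minimal sketch of what that change looks like if you edit the exported training profile JSON directly; it only touches the `data` → `preprocessing` → `input_scaling` setting that also appears in the config dump below, and the file name/path is a placeholder, not something from your run. At the 1024 x 1280 input shape reported in your log, an input scaling of 0.5 would feed the network 512 x 640 frames, roughly a 4x reduction in activation memory.

```python
import json

# Hypothetical path -- point this at the training profile you exported from the GUI.
profile_path = "single_instance_profile.json"

with open(profile_path) as f:
    cfg = json.load(f)

# Downsample inputs to half resolution before they reach the network.
cfg["data"]["preprocessing"]["input_scaling"] = 0.5

with open(profile_path, "w") as f:
    json.dump(cfg, f, indent=4)
```

You would then retrain with the same `sleap-train <profile.json> <labels.slp>` invocation shown in your log; if I remember correctly, the training dialog in the GUI also exposes the same input scaling setting, so you can change it there instead of editing the JSON.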
-
Thank you!
-
I have a new Dell computer with an NVIDIA T1000 8GB GPU. If we run the single instance model (see below), we run out of memory (it maxes out at 6.8 of 8 GB), but I can run top-down or bottom-up without that problem. Is this a bug or something known about these models? I have not updated SLEAP on my laptop to the newest version, but I haven't had memory problems with its GeForce GTX 1050 with 4 GB. I had to delete some of this comment as it was too long.
Start training single_instance...
['sleap-train', 'C:\Users\VERPEU~1\AppData\Local\Temp\tmpip_i6s8v\230725_132458_training_job.json', 'C:/Users/verpeutlab/labels.7.14.23 copy6 copy copy copy copy.slp', '--zmq', '--save_viz']
INFO:sleap.nn.training:Versions:
SLEAP: 1.3.1
TensorFlow: 2.8.4
Numpy: 1.21.5
Python: 3.7.16
OS: Windows-10-10.0.19041-SP0
INFO:sleap.nn.training:Training labels file: C:/Users/verpeutlab/labels.7.14.23 copy6 copy copy copy copy.slp
INFO:sleap.nn.training:Training profile: C:\Users\VERPEU~1\AppData\Local\Temp\tmpip_i6s8v\230725_132458_training_job.json
INFO:sleap.nn.training:
INFO:sleap.nn.training:Arguments:
INFO:sleap.nn.training:{
"training_job_path": "C:\Users\VERPEU
1\AppData\Local\Temp\tmpip_i6s8v\230725_132458_training_job.json",1\AppData\Local\Temp\tmpip_i6s8v\230725_132458_training_job.json""labels_path": "C:/Users/verpeutlab/labels.7.14.23 copy6 copy copy copy copy.slp",
"video_paths": [
""
],
"val_labels": null,
"test_labels": null,
"base_checkpoint": null,
"tensorboard": false,
"save_viz": true,
"zmq": true,
"run_name": "",
"prefix": "",
"suffix": "",
"cpu": false,
"first_gpu": false,
"last_gpu": false,
"gpu": "auto"
}
INFO:sleap.nn.training:
INFO:sleap.nn.training:Training job:
INFO:sleap.nn.training:{
"data": {
"labels": {
"training_labels": null,
"validation_labels": null,
"validation_fraction": 0.1,
"test_labels": null,
"split_by_inds": false,
"training_inds": null,
"validation_inds": null,
"test_inds": null,
"search_path_hints": [],
"skeletons": []
},
"preprocessing": {
"ensure_rgb": false,
"ensure_grayscale": false,
"imagenet_mode": null,
"input_scaling": 1.0,
"pad_to_stride": null,
"resize_and_pad_to_target": true,
"target_height": null,
"target_width": null
},
"instance_cropping": {
"center_on_part": null,
"crop_size": null,
"crop_size_detection_padding": 16
}
},
"model": {
"backbone": {
"leap": null,
"unet": {
"stem_stride": null,
"max_stride": 64,
"output_stride": 2,
"filters": 64,
"filters_rate": 2.0,
"middle_block": false,
"up_interpolate": false,
"stacks": 1
},
"hourglass": null,
"resnet": null,
"pretrained_encoder": null
},
"heads": {
"single_instance": {
"part_names": null,
"sigma": 2.5,
"output_stride": 2,
"loss_weight": 1.0,
"offset_refinement": false
},
"centroid": null,
"centered_instance": null,
"multi_instance": null,
"multi_class_bottomup": null,
"multi_class_topdown": null
},
"base_checkpoint": null
},
"optimization": {
"preload_data": true,
"augmentation_config": {
"rotate": true,
"rotation_min_angle": -15.0,
"rotation_max_angle": 15.0,
"translate": false,
"translate_min": -5,
"translate_max": 5,
"scale": false,
"scale_min": 0.9,
"scale_max": 1.1,
"uniform_noise": false,
"uniform_noise_min_val": 0.0,
"uniform_noise_max_val": 10.0,
"gaussian_noise": false,
"gaussian_noise_mean": 5.0,
"gaussian_noise_stddev": 1.0,
"contrast": false,
"contrast_min_gamma": 0.5,
"contrast_max_gamma": 2.0,
"brightness": false,
"brightness_min_val": 0.0,
"brightness_max_val": 10.0,
"random_crop": false,
"random_crop_height": 256,
"random_crop_width": 256,
"random_flip": true,
"flip_horizontal": false
},
"online_shuffling": true,
"shuffle_buffer_size": 128,
"prefetch": true,
"batch_size": 2,
"batches_per_epoch": null,
"min_batches_per_epoch": 200,
"val_batches_per_epoch": null,
"min_val_batches_per_epoch": 10,
"epochs": 200,
"optimizer": "adam",
"initial_learning_rate": 0.0001,
"learning_rate_schedule": {
"reduce_on_plateau": true,
"reduction_factor": 0.5,
"plateau_min_delta": 1e-06,
"plateau_patience": 5,
"plateau_cooldown": 3,
"min_learning_rate": 1e-08
},
"hard_keypoint_mining": {
"online_mining": false,
"hard_to_easy_ratio": 2.0,
"min_hard_keypoints": 2,
"max_hard_keypoints": null,
"loss_scale": 5.0
},
"early_stopping": {
"stop_training_on_plateau": true,
"plateau_min_delta": 1e-08,
"plateau_patience": 10
}
},
"outputs": {
"save_outputs": true,
"run_name": "230725_132458.single_instance.n=60",
"run_name_prefix": "",
"run_name_suffix": "",
"runs_folder": "C:/Users/verpeutlab\models",
"tags": [
""
],
"save_visualizations": true,
"delete_viz_images": true,
"zip_outputs": false,
"log_to_csv": true,
"checkpointing": {
"initial_model": false,
"best_model": true,
"every_epoch": false,
"latest_model": false,
"final_model": false
},
"tensorboard": {
"write_logs": false,
"loss_frequency": "epoch",
"architecture_graph": false,
"profile_graph": false,
"visualizations": true
},
"zmq": {
"subscribe_to_controller": true,
"controller_address": "tcp://127.0.0.1:9000",
"controller_polling_timeout": 10,
"publish_updates": true,
"publish_address": "tcp://127.0.0.1:9001"
}
},
"name": "",
"description": "",
"sleap_version": "1.3.1",
"filename": "C:\Users\VERPEU
}
INFO:sleap.nn.training:
INFO:sleap.nn.training:Auto-selected GPU 0 with 7827 MiB of free memory.
INFO:sleap.nn.training:Using GPU 0 for acceleration.
INFO:sleap.nn.training:Disabled GPU memory pre-allocation.
INFO:sleap.nn.training:System:
GPUs: 1/1 available
Device: /physical_device:GPU:0
Available: True
Initalized: False
Memory growth: True
INFO:sleap.nn.training:
INFO:sleap.nn.training:Initializing trainer...
INFO:sleap.nn.training:Loading training labels from: C:/Users/verpeutlab/labels.7.14.23 copy6 copy copy copy copy.slp
INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1
INFO:sleap.nn.training: Splits: Training = 54 / Validation = 6.
INFO:sleap.nn.training:Setting up for training...
INFO:sleap.nn.training:Setting up pipeline builders...
INFO:sleap.nn.training:Setting up model...
INFO:sleap.nn.training:Building test pipeline...
2023-07-25 13:25:01.753650: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-25 13:25:02.074669: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6043 MB memory: -> device: 0, name: NVIDIA T1000 8GB, pci bus id: 0000:01:00.0, compute capability: 7.5
INFO:sleap.nn.training:Loaded test example. [1.089s]
INFO:sleap.nn.training: Input shape: (1024, 1280, 1)
INFO:sleap.nn.training:Created Keras model.
INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=64, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=6, middle_block=False, up_blocks=5, up_interpolate=False, block_contraction=False)
INFO:sleap.nn.training: Max stride: 64
INFO:sleap.nn.training: Parameters: 289,146,702
INFO:sleap.nn.training: Heads:
INFO:sleap.nn.training: [0] = SingleInstanceConfmapsHead(part_names=['Thumb_L', 'Thumb_R', 'Index_L', 'Index_R', 'Middle_L', 'Middle_R', 'Ring_L', 'Ring_R', 'Pinky_L', 'Pinky_R', 'Palm_L', 'Palm_R', 'Wrist_L', 'Wrist_R'], sigma=2.5, output_stride=2, loss_weight=1.0)
INFO:sleap.nn.training: Outputs:
INFO:sleap.nn.training: [0] = KerasTensor(type_spec=TensorSpec(shape=(None, 512, 640, 14), dtype=tf.float32, name=None), name='SingleInstanceConfmapsHead/BiasAdd:0', description="created by layer 'SingleInstanceConfmapsHead'")
INFO:sleap.nn.training:Training from scratch
INFO:sleap.nn.training:Setting up data pipelines...
INFO:sleap.nn.training:Training set: n = 54
INFO:sleap.nn.training:Validation set: n = 6
INFO:sleap.nn.training:Setting up optimization...
INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=5, plateau_cooldown=3, min_learning_rate=1e-08)
INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-08, plateau_patience=10)
INFO:sleap.nn.training:Setting up outputs...
INFO:sleap.nn.callbacks:Training controller subscribed to: tcp://127.0.0.1:9000 (topic: )
INFO:sleap.nn.training: ZMQ controller subcribed to: tcp://127.0.0.1:9000
INFO:sleap.nn.callbacks:Progress reporter publishing on: tcp://127.0.0.1:9001 for: not_set
INFO:sleap.nn.training: ZMQ progress reporter publish on: tcp://127.0.0.1:9001
INFO:sleap.nn.training:Created run path: C:/Users/verpeutlab\models\230725_132458.single_instance.n=60
INFO:sleap.nn.training:Setting up visualization...
INFO:sleap.nn.training:Finished trainer set up. [1.6s]
INFO:sleap.nn.training:Creating tf.data.Datasets for training data generation...
INFO:sleap.nn.training:Finished creating training datasets. [2.5s]
INFO:sleap.nn.training:Starting training loop...
Epoch 1/200
2023-07-25 13:25:06.915347: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 656.00MiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2023-07-25 13:25:06.915699: W tensorflow/core/kernels/gpu_utils.cc:50] Failed to allocate memory for convolution redzone checking; skipping this check. This is benign and only means that we won't check cudnn for out-of-bounds reads and writes. This message will only be printed once.
2023-07-25 13:25:07.386817: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8201
2023-07-25 13:25:18.498065: W tensorflow/core/common_runtime/bfc_allocator.cc:462] Allocator (GPU_0_bfc) ran out of memory trying to allocate 640.00MiB (rounded to 671088640)requested by op model/stack0_enc0_conv1/Conv2D
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation.
2023-07-25 13:25:18.568569: I tensorflow/core/common_runtime/bfc_allocator.cc:1078] Sum Total of in-use chunks: 5.22GiB
2023-07-25 13:25:18.568697: I tensorflow/core/common_runtime/bfc_allocator.cc:1080] total_region_allocated_bytes_: 6336544768 memory_limit_: 6336544768 available bytes: 0 curr_region_allocation_bytes_: 8589934592
2023-07-25 13:25:18.568844: I tensorflow/core/common_runtime/bfc_allocator.cc:1086] Stats:
Limit: 6336544768
InUse: 5600551680
MaxInUse: 5711703040
NumAllocs: 770
MaxAllocSize: 671088640
Reserved: 0
PeakReserved: 0
LargestFreeBlock: 0
2023-07-25 13:25:18.568997: W tensorflow/core/common_runtime/bfc_allocator.cc:474] ******************************************************************************************__________
2023-07-25 13:25:18.569151: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at conv_ops.cc:685 : RESOURCE_EXHAUSTED: OOM when allocating tensor with shape[2,64,1024,1280] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
Traceback (most recent call last):
File "C:\ProgramData\anaconda3\envs\sleap\Scripts\sleap-train-script.py", line 33, in
sys.exit(load_entry_point('sleap', 'console_scripts', 'sleap-train')())
File "c:\windows\system32\sleap\sleap\nn\training.py", line 2016, in main
trainer.train()
File "c:\windows\system32\sleap\sleap\nn\training.py", line 943, in train
verbose=2,
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler raise e.with_traceback(filtered_tb) from None
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\tensorflow\python\eager\execute.py", line 55, in quick_execute
inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: Graph execution error:
Detected at node 'model/stack0_enc0_conv1/Conv2D' defined at (most recent call last):
File "C:\ProgramData\anaconda3\envs\sleap\Scripts\sleap-train-script.py", line 33, in
sys.exit(load_entry_point('sleap', 'console_scripts', 'sleap-train')())
File "c:\windows\system32\sleap\sleap\nn\training.py", line 2016, in main
trainer.train()
File "c:\windows\system32\sleap\sleap\nn\training.py", line 943, in train
verbose=2,
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\engine\training.py", line 1384, in fit
tmp_logs = self.train_function(iterator)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\engine\training.py", line 1021, in train_function
return step_function(self, iterator)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\engine\training.py", line 1010, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\engine\training.py", line 1000, in run_step
outputs = model.train_step(data)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\engine\training.py", line 859, in train_step
y_pred = self(x, training=True)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\engine\base_layer.py", line 1096, in call
outputs = call_fn(inputs, *args, **kwargs)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\engine\functional.py", line 452, in call
inputs, training=training, mask=mask)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\engine\functional.py", line 589, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\engine\base_layer.py", line 1096, in call
outputs = call_fn(inputs, *args, **kwargs)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\utils\traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\layers\convolutional.py", line 248, in call
outputs = self.convolution_op(inputs, self.kernel)
File "C:\ProgramData\anaconda3\envs\sleap\lib\site-packages\keras\layers\convolutional.py", line 240, in convolution_op
name=self.__class__.__name__)
Node: 'model/stack0_enc0_conv1/Conv2D'
OOM when allocating tensor with shape[2,64,1024,1280] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[{{node model/stack0_enc0_conv1/Conv2D}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
[Op:__inference_train_function_8742]
INFO:sleap.nn.callbacks:Closing the reporter controller/context.
INFO:sleap.nn.callbacks:Closing the training controller socket/context.
Run Path: C:/Users/verpeutlab\models\230725_132458.single_instance.n=60