From 544fe8dca37ea0ff5923dd8f2c90dddbda3ad44b Mon Sep 17 00:00:00 2001 From: Shalin Mehta Date: Wed, 7 Aug 2024 21:52:59 -0700 Subject: [PATCH] version lighting CLI example (#128) * version lighting CLI example * add some documentation * ignore slurm output * add more tips --- .gitignore | 3 + .../contrastive_phenotyping/demo_cli_fit.yml | 115 ++++++++++++++++++ .../demo_cli_fit_slurm.sh | 44 +++++++ 3 files changed, 162 insertions(+) create mode 100644 applications/contrastive_phenotyping/demo_cli_fit.yml create mode 100644 applications/contrastive_phenotyping/demo_cli_fit_slurm.sh diff --git a/.gitignore b/.gitignore index ac64f851..d84853e4 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,9 @@ __pycache__/ # written by setuptools_scm */_version.py +# slurm output files +slurm-* + # Distribution / packaging .Python build/ diff --git a/applications/contrastive_phenotyping/demo_cli_fit.yml b/applications/contrastive_phenotyping/demo_cli_fit.yml new file mode 100644 index 00000000..19ebe0e7 --- /dev/null +++ b/applications/contrastive_phenotyping/demo_cli_fit.yml @@ -0,0 +1,115 @@ +# See help here on how to configure hyper-parameters with config files: https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_advanced.html +seed_everything: 42 +trainer: + accelerator: gpu + strategy: ddp + devices: 4 + num_nodes: 1 + precision: 32-true + logger: + class_path: lightning.pytorch.loggers.TensorBoardLogger + init_args: + save_dir: /hpc/projects/intracellular_dashboard/viral-sensor/infection_classification/models/contrastive_tune_augmentations + version: chocolate # this is the name of the experiment. The logs will be saved in save_dir/lightning_logs/version + log_graph: True + # Nesting the logger config like this is equivalent to supplying the following argument to lightning.pytorch.Trainer + # logger=TensorBoardLogger( + # "/hpc/projects/intracellular_dashboard/viral-sensor/infection_classification/models/contrastive_tune_augmentations", + # log_graph=True, + # version="vanilla", + # ) + callbacks: + - class_path: lightning.pytorch.callbacks.LearningRateMonitor + init_args: + logging_interval: step + - class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + monitor: loss/val + every_n_epochs: 1 + save_top_k: 4 + save_last: true + fast_dev_run: false + max_epochs: 100 + log_every_n_steps: 10 + enable_checkpointing: true + inference_mode: true + use_distributed_sampler: true +model: + backbone: convnext_tiny + in_channels: 2 + log_batches_per_epoch: 3 + log_samples_per_batch: 3 + lr: 0.0002 +data: + data_path: /hpc/projects/virtual_staining/2024_02_04_A549_DENV_ZIKV_timelapse/registered_chunked.zarr + tracks_path: /hpc/projects/intracellular_dashboard/viral-sensor/2024_02_04_A549_DENV_ZIKV_timelapse/7.1-seg_track/tracking_v1.zarr + source_channel: + - Phase3D + - RFP + z_range: [25, 40] + batch_size: 32 + num_workers: 12 + initial_yx_patch_size: [384, 384] + final_yx_patch_size: [192, 192] + normalizations: + - class_path: viscy.transforms.NormalizeSampled + init_args: + keys: [Phase3D] + level: fov_statistics + subtrahend: mean + divisor: std + - class_path: viscy.transforms.ScaleIntensityRangePercentilesd + init_args: + keys: [RFP] + lower: 50 + upper: 99 + b_min: 0.0 + b_max: 1.0 + augmentations: + - class_path: viscy.transforms.RandAffined + init_args: + keys: [Phase3D, RFP] + prob: 0.8 + scale_range: [0, 0.2, 0.2] + rotate_range: [3.14, 0.0, 0.0] + shear_range: [0.0, 0.01, 0.01] + padding_mode: zeros + - class_path: viscy.transforms.RandAdjustContrastd + init_args: + keys: [RFP] + prob: 0.5 + gamma: [0.7, 1.3] + - class_path: viscy.transforms.RandAdjustContrastd + init_args: + keys: [Phase3D] + prob: 0.5 + gamma: [0.8, 1.2] + - class_path: viscy.transforms.RandScaleIntensityd + init_args: + keys: [RFP] + prob: 0.7 + factors: 0.5 + - class_path: viscy.transforms.RandScaleIntensityd + init_args: + keys: [Phase3D] + prob: 0.5 + factors: 0.5 + - class_path: viscy.transforms.RandGaussianSmoothd + init_args: + keys: [Phase3D, RFP] + prob: 0.5 + sigma_x: [0.25, 0.75] + sigma_y: [0.25, 0.75] + sigma_z: [0.0, 0.0] + - class_path: viscy.transforms.RandGaussianNoised + init_args: + keys: [RFP] + prob: 0.5 + mean: 0.0 + std: 0.5 + - class_path: viscy.transforms.RandGaussianNoised + init_args: + keys: [Phase3D] + prob: 0.5 + mean: 0.0 + std: 0.2 diff --git a/applications/contrastive_phenotyping/demo_cli_fit_slurm.sh b/applications/contrastive_phenotyping/demo_cli_fit_slurm.sh new file mode 100644 index 00000000..220e9837 --- /dev/null +++ b/applications/contrastive_phenotyping/demo_cli_fit_slurm.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +#SBATCH --job-name=contrastive_origin +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=4 +#SBATCH --gres=gpu:4 +#SBATCH --partition=gpu +#SBATCH --cpus-per-task=14 +#SBATCH --mem-per-cpu=15G +#SBATCH --time=0-20:00:00 + +# debugging flags (optional) +# https://lightning.ai/docs/pytorch/stable/clouds/cluster_advanced.html +export NCCL_DEBUG=INFO +export PYTHONFAULTHANDLER=1 + + +# Cleanup function to remove the temporary files +function cleanup() { + rm -rf /tmp/$SLURM_JOB_ID/*.zarr + echo "Cleanup Completed." +} + +trap cleanup EXIT +# trap the EXIT signal sent to the process and invoke the cleanup. + +# Activate the conda environment - specfic to your installation! +module load anaconda/2022.05 +# You'll need to replace this path with path to your own conda environment. +conda activate /hpc/mydata/$USER/envs/viscy + +config=./demo_cli_fit.yml + +# Printing this to the stdout lets us connect the job id to config. +scontrol show job $SLURM_JOB_ID +cat $config + +# Run the training CLI +srun python -m viscy.cli.contrastive_triplet fit -c $config + +# Tips: +# 1. run this script with `sbatch demo_cli_fit_slurm.sh` +# 2. check the status of the job with `squeue -u $USER` +# 3. use turm to monitor the job with `turm -u first.last`. Use module load turm to load the turm module.