From 92d2f97d088deb403d969fadccec5e3cb702dba1 Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Mon, 16 Sep 2024 15:09:46 +0200 Subject: [PATCH 01/43] Spellchecker action --- .github/workflows/spellchecker.yml | 165 +++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 .github/workflows/spellchecker.yml diff --git a/.github/workflows/spellchecker.yml b/.github/workflows/spellchecker.yml new file mode 100644 index 000000000..ae23f4923 --- /dev/null +++ b/.github/workflows/spellchecker.yml @@ -0,0 +1,165 @@ +name: Check Spelling + +# Comment management is handled through a secondary job, for details see: +# https://github.com/check-spelling/check-spelling/wiki/Feature%3A-Restricted-Permissions +# +# `jobs.comment-push` runs when a push is made to a repository and the `jobs.spelling` job needs to make a comment +# (in odd cases, it might actually run just to collapse a comment, but that's fairly rare) +# it needs `contents: write` in order to add a comment. +# +# `jobs.comment-pr` runs when a pull_request is made to a repository and the `jobs.spelling` job needs to make a comment +# or collapse a comment (in the case where it had previously made a comment and now no longer needs to show a comment) +# it needs `pull-requests: write` in order to manipulate those comments. + +# Updating pull request branches is managed via comment handling. +# For details, see: https://github.com/check-spelling/check-spelling/wiki/Feature:-Update-expect-list +# +# These elements work together to make it happen: +# +# `on.issue_comment` +# This event listens to comments by users asking to update the metadata. +# +# `jobs.update` +# This job runs in response to an issue_comment and will push a new commit +# to update the spelling metadata. +# +# `with.experimental_apply_changes_via_bot` +# Tells the action to support and generate messages that enable it +# to make a commit to update the spelling metadata. +# +# `with.ssh_key` +# In order to trigger workflows when the commit is made, you can provide a +# secret (typically, a write-enabled github deploy key). +# +# For background, see: https://github.com/check-spelling/check-spelling/wiki/Feature:-Update-with-deploy-key + +# Sarif reporting +# +# Access to Sarif reports is generally restricted (by GitHub) to members of the repository. +# +# Requires enabling `security-events: write` +# and configuring the action with `use_sarif: 1` +# +# For information on the feature, see: https://github.com/check-spelling/check-spelling/wiki/Feature:-Sarif-output + +# Minimal workflow structure: +# +# on: +# push: +# ... +# pull_request_target: +# ... +# jobs: +# # you only want the spelling job, all others should be omitted +# spelling: +# # remove `security-events: write` and `use_sarif: 1` +# # remove `experimental_apply_changes_via_bot: 1` +# ... 
otherwise adjust the `with:` as you wish

on: push: branches: - "**" tags-ignore: - "**" pull_request_target: branches: - "**" types: - 'opened' - 'reopened' - 'synchronize' issue_comment: types: - 'created'

jobs: spelling: name: Check Spelling permissions: contents: read pull-requests: read actions: read security-events: write outputs: followup: ${{ steps.spelling.outputs.followup }} runs-on: ubuntu-latest if: ${{ contains(github.event_name, 'pull_request') || github.event_name == 'push' }} concurrency: group: spelling-${{ github.event.pull_request.number || github.ref }} # note: If you use only_check_changed_files, you do not want cancel-in-progress cancel-in-progress: true steps: - name: check-spelling id: spelling uses: check-spelling/check-spelling@main with: suppress_push_for_open_pull_request: ${{ github.actor != 'dependabot[bot]' && 1 }} checkout: true check_file_names: 1 spell_check_this: check-spelling/spell-check-this@prerelease post_comment: 0 use_magic_file: 1 report-timing: 1 warnings: bad-regex,binary-file,deprecated-feature,large-file,limited-references,no-newline-at-eof,noisy-file,non-alpha-in-dictionary,token-is-substring,unexpected-line-ending,whitespace-in-dictionary,minified-file,unsupported-configuration,no-files-to-check experimental_apply_changes_via_bot: 1 use_sarif: ${{ (!github.event.pull_request || (github.event.pull_request.head.repo.full_name == github.repository)) && 1 }} extra_dictionary_limit: 20 extra_dictionaries: cspell:software-terms/dict/softwareTerms.txt

 comment-push: name: Report (Push) # If your workflow isn't running on push, you can remove this job runs-on: ubuntu-latest needs: spelling permissions: contents: write if: (success() || failure()) && needs.spelling.outputs.followup && github.event_name == 'push' steps: - name: comment uses: check-spelling/check-spelling@main with: checkout: true spell_check_this: check-spelling/spell-check-this@prerelease task: ${{ needs.spelling.outputs.followup }}

 comment-pr: name: Report (PR) # If your workflow isn't running on pull_request*, you can remove this job runs-on: ubuntu-latest needs: spelling permissions: contents: read pull-requests: write if: (success() || failure()) && needs.spelling.outputs.followup && contains(github.event_name, 'pull_request') steps: - name: comment uses: check-spelling/check-spelling@main with: checkout: true spell_check_this: check-spelling/spell-check-this@prerelease task: ${{ needs.spelling.outputs.followup }} experimental_apply_changes_via_bot: 1

 update: name: Update PR permissions: contents: write pull-requests: write actions: read runs-on: ubuntu-latest if: ${{ github.event_name == 'issue_comment' && github.event.issue.pull_request && contains(github.event.comment.body, '@check-spelling-bot apply')}} concurrency: group: spelling-update-${{ github.event.issue.number }} cancel-in-progress: false steps: - name: apply spelling updates uses: check-spelling/check-spelling@main with: experimental_apply_changes_via_bot: 1 checkout: true ssh_key: "${{ secrets.CHECK_SPELLING }}" \ No newline at end of file From d453ca07774cb3598d89d7c41d04b800a8a58da2 Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Mon, 16 Sep 2024 15:40:03 +0200 Subject: [PATCH 02/43] Testing spellchekcer --- GANDLF/some_misspled_file_nmae.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create
mode 100644 GANDLF/some_misspled_file_nmae.py diff --git a/GANDLF/some_misspled_file_nmae.py b/GANDLF/some_misspled_file_nmae.py new file mode 100644 index 000000000..bd3bdf297 --- /dev/null +++ b/GANDLF/some_misspled_file_nmae.py @@ -0,0 +1,13 @@ +# Testing the spellckecker workflow + + +def some_function(): + print("Hello World!") + print("This is a test of the spellchecker workflow") + print("Here it is properly spelled") + + +def some_misspled_function(): + print("This is a test of the spellchecker workflow") + print("Here it is not properly spelled") + print("misspled!") From d790350b7ee83c46478fe319c976d62f4ceba9ab Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Mon, 16 Sep 2024 15:46:36 +0200 Subject: [PATCH 03/43] specllcheck jobs update --- .github/workflows/spellchecker.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/spellchecker.yml b/.github/workflows/spellchecker.yml index ae23f4923..e3f298cf8 100644 --- a/.github/workflows/spellchecker.yml +++ b/.github/workflows/spellchecker.yml @@ -59,12 +59,12 @@ name: Check Spelling on: push: branches: - - "**" - tags-ignore: - - "**" + - "master" + # tags-ignore: + # - "**" pull_request_target: branches: - - "**" + - "master" types: - 'opened' - 'reopened' From 134b1a0f8f2483cc07149cf0229c2f0c0bf4676a Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Mon, 16 Sep 2024 16:30:18 +0200 Subject: [PATCH 04/43] Adding new dicts for spellchecker --- .github/workflows/spellchecker.yml | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/.github/workflows/spellchecker.yml b/.github/workflows/spellchecker.yml index e3f298cf8..504641cbc 100644 --- a/.github/workflows/spellchecker.yml +++ b/.github/workflows/spellchecker.yml @@ -59,12 +59,12 @@ name: Check Spelling on: push: branches: - - "master" - # tags-ignore: - # - "**" + - "**" + tags-ignore: + - "**" pull_request_target: branches: - - "master" + - "**" types: - 'opened' - 'reopened' @@ -107,6 +107,22 @@ jobs: extra_dictionary_limit: 20 extra_dictionaries: cspell:software-terms/dict/softwareTerms.txt + cspell:python/src/python/python-lib.txt + cspell:python/src/python/python.txt + cspell:python/src/common/extra.txt + cspell:php/dict/php.txt + cspell:r/src/r.txt + cspell:aws/aws.txt + cspell:django/dict/django.txt + cspell:filetypes/filetypes.txt + cspell:node/dict/node.txt + cspell:golang/dict/go.txt + cspell:fullstack/dict/fullstack.txt + cspell:java/src/java.txt + cspell:k8s/dict/k8s.txt + cspell:css/dict/css.txt + cspell:npm/dict/npm.txt + cspell:latex/dict/latex.txt comment-push: name: Report (Push) From 320aab3eb964e5d361e724e44cda7fba5a3a8d90 Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Mon, 16 Sep 2024 16:43:01 +0200 Subject: [PATCH 05/43] Spellchecker config --- .github/workflows/spellchecker.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/spellchecker.yml b/.github/workflows/spellchecker.yml index 504641cbc..940aa2fe4 100644 --- a/.github/workflows/spellchecker.yml +++ b/.github/workflows/spellchecker.yml @@ -94,6 +94,7 @@ jobs: id: spelling uses: check-spelling/check-spelling@main with: + config: .spelling suppress_push_for_open_pull_request: ${{ github.actor != 'dependabot[bot]' && 1 }} checkout: true check_file_names: 1 From 94e73e8c6db2d6772750629dd1dc8a509400c5a7 Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Mon, 16 Sep 2024 16:58:12 +0200 Subject: [PATCH 06/43] Remove check supression on push --- .github/workflows/spellchecker.yml | 1 - 1 file changed, 
1 deletion(-) diff --git a/.github/workflows/spellchecker.yml b/.github/workflows/spellchecker.yml index 940aa2fe4..bf9926d2f 100644 --- a/.github/workflows/spellchecker.yml +++ b/.github/workflows/spellchecker.yml @@ -95,7 +95,6 @@ jobs: uses: check-spelling/check-spelling@main with: config: .spelling - suppress_push_for_open_pull_request: ${{ github.actor != 'dependabot[bot]' && 1 }} checkout: true check_file_names: 1 spell_check_this: check-spelling/spell-check-this@prerelease From b98d76d95732ed08e5717a3629120b1b18494de2 Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Mon, 16 Sep 2024 17:07:26 +0200 Subject: [PATCH 07/43] Config spellchecker --- .github/workflows/spellchecker.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/spellchecker.yml b/.github/workflows/spellchecker.yml index bf9926d2f..2b0309af4 100644 --- a/.github/workflows/spellchecker.yml +++ b/.github/workflows/spellchecker.yml @@ -123,6 +123,11 @@ jobs: cspell:css/dict/css.txt cspell:npm/dict/npm.txt cspell:latex/dict/latex.txt + cspell:latex/samples/sample-words.txt + cspell:html-symbol-entities/entities.txt + cspell:html/dict/html.txt + cspell:cpp/src/ecosystem.txt + cspell:mnemonics/src/mnemonics.txt comment-push: name: Report (Push) From 16c8f77d4e806cba84b6dee03a8a996ec90d4034 Mon Sep 17 00:00:00 2001 From: Sarthak Pati Date: Thu, 17 Oct 2024 09:58:40 -0400 Subject: [PATCH 08/43] Delete GANDLF/some_misspled_file_nmae.py --- GANDLF/some_misspled_file_nmae.py | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 GANDLF/some_misspled_file_nmae.py diff --git a/GANDLF/some_misspled_file_nmae.py b/GANDLF/some_misspled_file_nmae.py deleted file mode 100644 index bd3bdf297..000000000 --- a/GANDLF/some_misspled_file_nmae.py +++ /dev/null @@ -1,13 +0,0 @@ -# Testing the spellckecker workflow - - -def some_function(): - print("Hello World!") - print("This is a test of the spellchecker workflow") - print("Here it is properly spelled") - - -def some_misspled_function(): - print("This is a test of the spellchecker workflow") - print("Here it is not properly spelled") - print("misspled!") From f44273eded2717f4c7d0985d22f9538bf59ffc9c Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Sat, 19 Oct 2024 10:15:44 +0200 Subject: [PATCH 09/43] Spellchecker configuration and code corrections --- .gitignore | 1 + .spelling/.spelling/expect.txt | 726 ++++++++++++++++++++++++++ GANDLF/cli/huggingface_hub_handler.py | 2 +- GANDLF/compute/forward_pass.py | 2 +- GANDLF/compute/step.py | 2 +- GANDLF/data/patch_miner/opm/utils.py | 2 +- GANDLF/metrics/segmentation.py | 2 +- docs/extending.md | 2 +- docs/usage.md | 2 +- testing/test_full.py | 16 +- 10 files changed, 742 insertions(+), 15 deletions(-) create mode 100644 .spelling/.spelling/expect.txt diff --git a/.gitignore b/.gitignore index d40b4a5df..33a03d0ed 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ testing/config_segmentation_temp.yaml testing/failures.log coverage.xml mlcube/workspace/* +!.spelling/.spelling/expect.txt !mlcube/workspace/config.yml !mlcube/workspace/channelIDs.yml tutorials/classification_medmnist_notebook/dataset/pathmnist diff --git a/.spelling/.spelling/expect.txt b/.spelling/.spelling/expect.txt new file mode 100644 index 000000000..795ac46eb --- /dev/null +++ b/.spelling/.spelling/expect.txt @@ -0,0 +1,726 @@ +Abhishek +Abousamra +acdfbac +acsconv +adadelta +adagrad +adamax +adamw +addcdiv +addcmul +addgroup +addoption +ademamix +agc +agni +Aimilia +aimiliag +albumentations +allclose +allcontributors 
+allsigned +amsgrad +Anirban +anonymization +applyaugs +apptainer +Aqubvel +arange +archs +arcname +argmax +argwhere +Arnout +arxiv +asarray +astype +atleast +augs +auroc +autobuild +autocast +autodetermined +auxilary +avgs +awaa +Babak +bacf +backprop +backpropagate +backpropagation +Baheti +Baid +Bakas +Bashyam +batchnorm +bdfc +beggining +bgr +Bhalerao +bibtex +bincount +biomedicalimaging +Bjoern +blabla +brahim +brainage +Brox +btw +Buildx +BVd +BVpye +capsys +cbica +cbig +cca +CCE +cdc +cdir +cel +cencoder +centercrop +cff +Chitalia +christos +Chunrui +Chv +cla +classif +classitk +codacy +codecov +CODEOWNERS +codeql +codereview +codespace +Colab +colorconv +colorjitter +colorlog +commandline +configfile +configgenerator +convs +cooldown +cosineannealing +cosineannealinglr +cosinesimilarity +croporpad +cropzero +ctc +CUBLAS +cudnn +cycliclr +datacenter +dataframe +dataprep +datestampnow +Davatzikos +dcce +dclog +dcm +dcmtk +deac +deadsnakes +DEBIAN +deconv +deepresunet +Deepthi +deepunet +denseblock +denselayer +densenet +depthconv +Despina +despinak +devcontainer +dfu +dicelog +dicom +dicomanonymizer +digestpath +disaggregating +discussioncomment +distilbert +DLF +DNN +dockerfiles +dockerized +dockertag +doi +Dokania +dotnet +downsamp +downsample +downsampling +doxygen +dpn +dqbm +dropna +dtype +dynunet +edac +edt +edu +eep +efc +efficientnet +efficientnetb +EIuqemz +elementwise +embeddings +Emre +ener +entrypoints +Ethem +excepthook +exctype +exponentiallr +fcn +Fdocker +fepegar +figsize +filenaming +filereader +fillna +finetuned +flaim +fnull +frgfm +fromarray +fromiter +Fsegmentation +Fulcio +Fworkflows +gandlf +Gastounioti +gbp +gcam +gcampp +GDCM +gdown +gdp +gelu +geometricanirban +Getka +getoption +getsizeof +ggcam +ghc +gle +glx +Gonz +Gotkowski +gpu +Grenko +gridaggregator +gridsampler +groundtruth +Guley +gumbel +Haghighi +Hamamc +Hamamci +hausdorff +healper +heatmaps +hexops +hft +histo +histopath +holocron +hookwrapper +HOUNSFIELD +hpc +hstack +HTR +huggingface +hyperparameters +idxs +ihc +iloc +imagenet +imbalanced +imread +imsave +imshow +imwrite +indeces +inlinehilite +inp +inputdata +instancenorm +interp +ISBI +issn +itcr +iterrows +itk +itkimage +itksnap +jaccard +JAX +JBHI +JDu +JSTARS +Junwen +jupyter +Jupyterlab +kaiming +kakumarabhishek +Karargyris +Karkada +keepdim +keleshev +kenshohara +KFold +kickstart +kld +Kontos +ksel +kspace +Kullback +Kurc +labelsample +labelsampler +lambd +layerchange +Lbtnaq +ldir +leakyrelu +Leibler +levelname +levelno +libgl +libjpeg +libpython +libsm +libvips +libxext +lightresunet +lightunet +linalg +linenums +lineplot +linspace +linting +lly +logit +logpt +logsoftmax +logvar +longreprtext +lps +lrelu +LROn +lstsq +lucidrains +macenko +mainrun +makereport +mathews +matplotlib +matthews +maxpool +mbergman +mcc +mcclog +MCD +mcr +MCT +mde +mdmc +medcam +medmnist +medperf +medpy +Megh +mencoder +menze +metr +miccai +missingprediction +mkdocs +mlco +mlcommons +mlcube +mlcubedir +mlp +modeified +modelbase +modelcard +modeldir +modelio +monai +Mouchtaris +moveaxis +mpp +mps +mri +msa +mscadocs +msdnet +mse +msle +Mukhopadhyay +multiclass +multidim +multilabel +mytagname +nadam +nans +naveenk +ncbi +ncc +ndarray +ndexbio +ndim +ndimage +ndlf +nesterov +neuroimage +nfnets +nibabel +nicl +NIf +nifti +nih +nii +nlabel +nnf +nonroot +normtype +notsigned +novograd +nsd +nuitka +numel +numlay +nvidia +octicons +offis +OFWCPDRE +ohif +onefile +onlatest +onnx +openfl +openslide +opensource +openvino +opm +opset +Orhun +ossar +outconv +outputdir 
+outputfile +palletsprojects +Panchumarthy +pathmnist +pati +pbar +pchs +Pdocker +pearson +Phenomics +pkl +plt +pmwiki +pnas +Prashant +prcomment +predmask +preds +probs +Prunner +prv +psnr +psutil +pth +PTk +pubmed +purelib +pwadry +pydantic +pydicom +pyinstaller +pymdownx +pypa +pyplot +pytorch +pyversion +qsub +qubvel +radam +Radeon +radiomics +radxtools +ramework +randomaffine +randomanisotropy +randombiasfield +randomblur +randomelasticdeformation +randomflip +randommotion +randomnoise +randomswap +rdp +reco +recoverconfig +reducelronplateau +reduceonplateau +reencoded +refering +Rekor +relativized +relu +rensen +Reparameterization +reparameterize +rescaler +residualunet +resnet +resunet +rgbatorgb +rgbtorgba +rigourous +Ritesh +rmsprop +rocm +rocmdocs +Ronneberger +rowvar +ruifrok +runnning +runtest +Saltz +samplewise +Sarthak +sarthakpati +savefig +sbakas +sbia +scikit +scipy +screenshots +scse +sdata +sdnet +seaborn +Seac +sebastianffx +securefederatedai +segmap +segmask +segmentor +Sens +sessionstart +setbiasranges +setcutoffrange +setsigmaranges +Sezgin +sge +Shahira +shubham +siddhesh +sigstore +silu +Simonyan +simpleitk +sitk +skimage +sklearn +slurm +smi +socio +Soham +Sotirios +sparseadam +spellchecker +Sprop +Spyridon +ssim +stackexchange +stainextract +stainlib +steplr +stepsize +subjectid +sume +superfences +sustainability +swapaxes +Tahsin +tcia +tempconvs +tensorboard +tgz +thresholded +thresholding +Thu +tiatoolbox +tiffslide +timepoints +timm +tio +tioq +tiosd +TLDR +tmi +TOOLSDIRECTORY +torchaudio +torchinfo +torchio +torchmetrics +torchvision +towardsdatascience +TPAMI +tqdm +traininginference +transunet +triaged +tryfirst +tsaftaris +TUDA +tversky +uanced +uinc +Ujjwal +Umeton +unet +unetr +uniformsample +uniformsampler +unittests +unitwise +unsqueeze +upenn +Uploaing +Uploded +upsample +upsampled +upsampling +utm +uzh +vahadane +validing +valuetopredict +vgg +Vinayak +vios +visualstudiomagazine +vmem +voxel +VRAM +vtk +vvv +WACV +warmupcosineschedule +Wauplin +wcs +weightedsample +weightedsampler +whl +WORKDIR +wsi +wsl +xavier +xdim +XDl +XEI +xkq +xlabel +xlim +xnat +XResolution +XTools +yamlchecker +yamlvalidator +ydim +ylabel +YResolution +Yrv +Yuemeng +zarr +Zeroplanes +zicat +znorm +ZNormalization +Zou +abebbed +adipocytes +afcc +AMNIST +autorefs +avq +baf +bdf +bfa +bjoh +cadf +cbcb +cdbe +cividis +cmap +CMNIST +colorectal +colormaps +cxbdfkig +dbdf +Deconvolutional +Dermatoscope +dff +eaik +edc +eeaf +eee +enu +faa +fdb +fdc +hdwlu +hjzwmjxvamrxotxu +ifqqtrs +ihd +Ingnore +ipykernel +ipynb +ipynbcheckpoints +jjzoifpdly +Kather +kernelspec +Krisam +lexer +lkrvyj +luap +mkdocstrings +MNIST +mpimg +mtdpzx +nbconvert +nbformat +nhfspc +ocflopa +OCTMNIST +pmkdaguy +pvqg +qfzk +qwuvqx +redownload +rsmff +rxtzgrcaq +SMNIST +swp +torchtext +uzbklab +uzsc +viridis +xaburhd +xso +ystore +Zisserman +zsuokb +zwezggl +zzokqk \ No newline at end of file diff --git a/GANDLF/cli/huggingface_hub_handler.py b/GANDLF/cli/huggingface_hub_handler.py index 72e2f35b0..b09fa9743 100644 --- a/GANDLF/cli/huggingface_hub_handler.py +++ b/GANDLF/cli/huggingface_hub_handler.py @@ -121,7 +121,7 @@ def push_to_model_hub( ignore_patterns=ignore_patterns, delete_patterns=delete_patterns, ) - print("Model Sucessfully Uploded") + print("Model Successfully Uploded") def download_from_hub( diff --git a/GANDLF/compute/forward_pass.py b/GANDLF/compute/forward_pass.py index 69efa15a9..2135ca0ee 100644 --- a/GANDLF/compute/forward_pass.py +++ b/GANDLF/compute/forward_pass.py @@ -89,7 +89,7 @@ def 
validate_network( # # putting stuff in individual arrays for correlation analysis # all_targets = [] - # all_predics = [] + # all_predicts = [] if params["medcam_enabled"] and params["model"]["type"] == "torch": model.enable_medcam() params["medcam_enabled"] = True diff --git a/GANDLF/compute/step.py b/GANDLF/compute/step.py index 148d206cf..1d9db8c12 100644 --- a/GANDLF/compute/step.py +++ b/GANDLF/compute/step.py @@ -98,7 +98,7 @@ def step( f"Model output is not a Tensor: {type(output)}. Say, `deep_resunet` and `deep_unet` may return " f"list of tensors on different scales instead of just one prediction Tensor. However due to " f"GaNDLF architecture it is expected that models return only one tensor. For deep_* models " - f"only the biggeest scale is processed. Use these models with caution till fix is implemented." + f"only the biggest scale is processed. Use these models with caution till fix is implemented." ) output = output[0] diff --git a/GANDLF/data/patch_miner/opm/utils.py b/GANDLF/data/patch_miner/opm/utils.py index 1bee9b1f1..ed2e57258 100644 --- a/GANDLF/data/patch_miner/opm/utils.py +++ b/GANDLF/data/patch_miner/opm/utils.py @@ -431,7 +431,7 @@ def generate_initial_mask(slide_path: str, scale: int) -> Tuple[np.ndarray, tupl slide = tiffslide.open_slide(slide_path) slide_dims = slide.dimensions - # Call thumbnail for effiency, calculate scale relative to whole slide + # Call thumbnail for efficiency, calculate scale relative to whole slide slide_thumbnail = np.asarray( slide.get_thumbnail((slide_dims[0] // scale, slide_dims[1] // scale)) ) diff --git a/GANDLF/metrics/segmentation.py b/GANDLF/metrics/segmentation.py index 82254079f..cd1a7637b 100644 --- a/GANDLF/metrics/segmentation.py +++ b/GANDLF/metrics/segmentation.py @@ -226,7 +226,7 @@ def _calculator_sensitivity_specificity( float, float: The sensitivity and specificity between the object(s) in ```inp``` and the object(s) in ```target```. """ # inMask is mask of input array equal to a certain tissue (ie. all one's in tumor core) - # Ref mask is mask of certain tissue in ground truth (ie. all one's in refernce core ) + # Ref mask is mask of certain tissue in ground truth (ie. all one's in reference core ) # overlap is mask where the two equal each other # They are of the total number of voxels of the ground truth brain mask diff --git a/docs/extending.md b/docs/extending.md index 929efa515..897bf1e9b 100644 --- a/docs/extending.md +++ b/docs/extending.md @@ -80,7 +80,7 @@ To update/change/add a dependency in [setup](https://github.com/mlcommons/GaNDLF ## Adding new CLI command Example: `gandlf config-generator` [CLI command](https://github.com/mlcommons/GaNDLF/blob/master/GANDLF/entrypoints/config_generator.py) - Implement function and wrap it with `@click.command()` + `@click.option()` -- Add it to `cli_subommands` [dict](https://github.com/mlcommons/GaNDLF/blob/master/GANDLF/entrypoints/subcommands.py) +- Add it to `cli_subcommands` [dict](https://github.com/mlcommons/GaNDLF/blob/master/GANDLF/entrypoints/subcommands.py) The command would be available under `gandlf your-subcommand-name` CLI command. 
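For illustration, a minimal sketch of such a subcommand follows; the command name, option, and help text are hypothetical and only demonstrate the pattern described above, not actual GaNDLF entrypoints:

```python
# Hypothetical sketch of a new CLI subcommand; `my-subcommand` and
# `--input-dir` are illustrative names, not part of the actual codebase.
import click


@click.command()
@click.option("--input-dir", "-i", required=True, help="Directory containing the input data.")
def my_subcommand(input_dir):
    """Help text shown by `gandlf my-subcommand --help`."""
    click.echo(f"Running on {input_dir}")
```

Registering it would then be a one-line addition to the `cli_subcommands` dict (e.g. `"my-subcommand": my_subcommand`), after which it can be invoked as `gandlf my-subcommand`.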
## Update parameters diff --git a/docs/usage.md b/docs/usage.md index 1f56947c9..24738d056 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -603,7 +603,7 @@ To upload to a dataset or a Space, use the --repo-type option: ### Huggingface Template For Upload #### Design and Modify Template -To design the huggingface template use the hugging_face.md file change the medatory field +To design the huggingface template use the hugging_face.md file change the mandatory field [REQUIRED_FOR_GANDLF] to it's respective name don't leave it blank other wise it may through error, other field can be modeified by the user as per his convenience ```bash diff --git a/testing/test_full.py b/testing/test_full.py index 50b628e76..eccf0b3c8 100644 --- a/testing/test_full.py +++ b/testing/test_full.py @@ -2110,7 +2110,7 @@ def test_generic_one_hot_logic(): def test_generic_anonymizer(): - print("33: Starting anomymizer tests") + print("33: Starting anonymizer tests") input_file = get_testdata_file("MR_small.dcm") output_file = os.path.join(outputDir, "MR_small_anonymized.dcm") @@ -3292,14 +3292,14 @@ def test_generic_logging(capsys): os.remove(log_file) - # test the stout info level. The stout must show only INFO messages - message = "Testing stout logging" + # test the stdout info level. The stdout must show only INFO messages + message = "Testing stdout logging" logging.info(message) capture = capsys.readouterr() assert message in capture.out - # Test the stout not showing other messages - message = "Testing stout logging" + # Test the stdout not showing other messages + message = "Testing stdout logging" logging.debug(message) logging.warning(message) logging.error(message) @@ -3307,14 +3307,14 @@ def test_generic_logging(capsys): capture = capsys.readouterr() assert message not in capture.out - # test sterr must NOT show these messages. - message = "Testing sterr logging" + # test stderr must NOT show these messages. + message = "Testing stderr logging" logging.info(message) logging.debug(message) capture = capsys.readouterr() assert message not in capture.err - # test sterr must show these messages. + # test stderr must show these messages. logging.error(message) logging.warning(message) logging.critical(message) From 4dbec03b19797c20964dafec28faee9afdf5f90b Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:28:38 -0500 Subject: [PATCH 10/43] added faq point for the version mismatch --- docs/faq.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/faq.md b/docs/faq.md index dfe0c2234..0bb98239d 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -53,6 +53,10 @@ Please see https://mlcommons.github.io/GaNDLF/usage/#federating-your-model-evalu Please read the [migration guide](https://mlcommons.github.io/GaNDLF/migration_guide) to understand the changes that have been made to GaNDLF. If you have any questions, please feel free to [post a support request](https://github.com/mlcommons/GaNDLF/issues/new?assignees=&labels=&template=--questions-help-support.md&title=). +### I am getting an error related to version mismatch (greater or smaller) between the configuration and GaNDLF version. What should I do? + +This is a safety feature to ensure a tight integration between the configuration used to define a model and the code version used to perform the training. Ensure that you have all requirements satisfied, and then check the ``version`` key in the configuration, and ensure it appropriately matches the output of ``gandlf run --version``.
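For illustration, the ``version`` block of a GaNDLF configuration typically uses ``minimum``/``maximum`` sub-keys, as in the sketch below; the version numbers are placeholders and should match your installed version:

```yaml
version:
  minimum: 0.1.0 # placeholder: oldest GaNDLF version this configuration supports
  maximum: 0.1.0 # placeholder: newest GaNDLF version this configuration supports
```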
+ ### What if I have another question? Please [post a support request](https://github.com/mlcommons/GaNDLF/issues/new?assignees=&labels=&template=--questions-help-support.md&title=). From a09dfc57a2b965a4f81096f2a3e7b3934234af0c Mon Sep 17 00:00:00 2001 From: vmalefioudakis Date: Sun, 10 Nov 2024 12:22:35 +0200 Subject: [PATCH 11/43] change the default logfile from tmp->home dir --- GANDLF/utils/gandlf_logging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GANDLF/utils/gandlf_logging.py b/GANDLF/utils/gandlf_logging.py index 576df868e..c43d7679d 100644 --- a/GANDLF/utils/gandlf_logging.py +++ b/GANDLF/utils/gandlf_logging.py @@ -8,7 +8,7 @@ def _create_tmp_log_file(): - tmp_dir = Path(tempfile.gettempdir()) + tmp_dir = Path(Path.home()) log_dir = Path.joinpath(tmp_dir, ".gandlf") log_dir.mkdir(parents=True, exist_ok=True) log_file = Path.joinpath(log_dir, get_unique_timestamp() + ".log") From 415e9e23c6fae5208374bcd6c5a5d956ba142b6e Mon Sep 17 00:00:00 2001 From: vmalefioudakis Date: Sun, 10 Nov 2024 12:57:12 +0200 Subject: [PATCH 12/43] add doc for --log-file --- docs/usage.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 1f56947c9..a3d7f0278 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -176,6 +176,14 @@ To split the data CSV into training, validation, and testing CSVs, the `gandlf s -o $output_dir # the output directory to save the split data ``` +### Using the `--log-file` parameter +By default, only the `info` and `error` logs will be **displayed** in the console and +the log file will be **saved** in `$(home)/.gandlf/<timestamp>.log`. + +Also, you can use the `--log-file` parameter to provide the file in which you want to save the logs: +```bash +(venv_gandlf) $> gandlf --log-file +``` ## Customize the Training From 836e77143f731eed5ed3d66fe7ebcc8a90f0c7c4 Mon Sep 17 00:00:00 2001 From: szmazurek Date: Sat, 9 Nov 2024 21:17:22 +0100 Subject: [PATCH 13/43] Interface and example implementation of the loss class --- GANDLF/losses/loss_interface.py | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 GANDLF/losses/loss_interface.py diff --git a/GANDLF/losses/loss_interface.py b/GANDLF/losses/loss_interface.py new file mode 100644 index 000000000..86cba7c75 --- /dev/null +++ b/GANDLF/losses/loss_interface.py @@ -0,0 +1,41 @@ +import torch +from torch import nn +from abc import ABC, abstractmethod + + +class AbstractLossFunction(ABC, nn.Module): + def __init__(self, params: dict): + super().__init__() + self.params = params + + @abstractmethod + def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + pass + + +class WeightedCE(AbstractLossFunction): + def __init__(self, params: dict): + """ + Cross entropy loss using class weights if provided.
+ """ + super().__init__(params) + + def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + if len(target.shape) > 1 and target.shape[-1] == 1: + target = torch.squeeze(target, -1) + + weights = None + if self.params.get("penalty_weights") is not None: + num_classes = len(self.params["penalty_weights"]) + assert ( + prediction.shape[-1] == num_classes + ), f"Number of classes {num_classes} does not match prediction shape {prediction.shape[-1]}" + + weights = torch.tensor( + list(self.params["penalty_weights"].values()), + dtype=torch.float32, + device=target.device, + ) + + cel = nn.CrossEntropyLoss(weight=weights) + return cel(prediction, target) From 227a86f3d119676db155ae7d1b5d52e88db00e60 Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Sat, 16 Nov 2024 15:13:47 +0100 Subject: [PATCH 14/43] Porting losses to new interface WIP --- GANDLF/losses/loss_interface.py | 75 ++++++++++++----- GANDLF/losses/segmentation.py | 144 ++++++++++++++++++++++++++++---- 2 files changed, 181 insertions(+), 38 deletions(-) diff --git a/GANDLF/losses/loss_interface.py b/GANDLF/losses/loss_interface.py index 86cba7c75..49d5b5031 100644 --- a/GANDLF/losses/loss_interface.py +++ b/GANDLF/losses/loss_interface.py @@ -3,39 +3,68 @@ from abc import ABC, abstractmethod -class AbstractLossFunction(ABC, nn.Module): +class AbstractLossFunction(nn.Module, ABC): def __init__(self, params: dict): - super().__init__() + nn.Module.__init__(self) self.params = params @abstractmethod - def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + def forward( + self, prediction: torch.Tensor, target: torch.Tensor, *args + ) -> torch.Tensor: pass -class WeightedCE(AbstractLossFunction): +class AbstractSegmentationMultiClassLoss(AbstractLossFunction): + """ + Base class for loss functions that are used for multi-class segmentation tasks. + """ + def __init__(self, params: dict): + super().__init__(params) + self.num_classes = len(params["model"]["class_list"]) + self.penalty_weights = params["penalty_weights"] + + def _compute_single_class_loss( + self, prediction: torch.Tensor, target: torch.Tensor, class_idx: int + ) -> torch.Tensor: + """Compute loss for a single class.""" + loss_value = self._single_class_loss_calculator( + prediction[:, class_idx, ...], target[:, class_idx, ...] + ) + return 1 - loss_value + + def _optional_loss_operations(self, loss: torch.Tensor) -> torch.Tensor: """ - Cross entropy loss using class weights if provided. + Perform additional operations on the loss value. Defaults to identity operation. + If needed, child classes can override this method. Useful in cases where, + for example, the loss value needs to be log-transformed or clipped. """ - super().__init__(params) + return loss - def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + @abstractmethod + def _single_class_loss_calculator( + self, prediction: torch.Tensor, target: torch.Tensor + ) -> torch.Tensor: + """Compute loss for a pair of prediction and target tensors.
To be implemented by child classes.""" + pass + + def forward( + self, prediction: torch.Tensor, target: torch.Tensor, *args + ) -> torch.Tensor: + accumulated_loss = torch.tensor(0.0, device=prediction.device) + + for class_idx in range(self.num_classes): + current_loss = self._compute_single_class_loss( + prediction, target, class_idx ) + current_loss = self._optional_loss_operations(current_loss) + + if self.penalty_weights is not None: + current_loss = current_loss * self.penalty_weights[class_idx] + accumulated_loss += current_loss + + if self.penalty_weights is None: + accumulated_loss /= self.num_classes - cel = nn.CrossEntropyLoss(weight=weights) - return cel(prediction, target) + return accumulated_loss diff --git a/GANDLF/losses/segmentation.py b/GANDLF/losses/segmentation.py index 32e43bc25..35feb3c25 100644 --- a/GANDLF/losses/segmentation.py +++ b/GANDLF/losses/segmentation.py @@ -1,29 +1,127 @@ import sys from typing import List, Optional import torch +from .loss_interface import AbstractSegmentationMultiClassLoss, AbstractLossFunction -# Dice scores and dice losses -def dice(predicted: torch.Tensor, target: torch.Tensor) -> torch.Tensor: +class MulticlassDiceLoss(AbstractSegmentationMultiClassLoss): + """ + This class computes the Dice loss between two tensors. """ - This function computes a dice score between two tensors. - Args: - predicted (torch.Tensor): Predicted value by the network. - target (torch.Tensor): Required target label to match the predicted with + def _single_class_loss_calculator( + self, prediction: torch.Tensor, target: torch.Tensor + ) -> torch.Tensor: + """ + Compute Dice score for a single class. - Returns: - torch.Tensor: The computed dice score. + Args: + prediction (torch.Tensor): Network's predicted segmentation mask + target (torch.Tensor): Target segmentation mask + + Returns: + torch.Tensor: The computed dice score. + """ + predicted_flat = prediction.flatten() + label_flat = target.flatten() + intersection = (predicted_flat * label_flat).sum() + + dice_score = (2.0 * intersection + sys.float_info.min) / ( + predicted_flat.sum() + label_flat.sum() + sys.float_info.min + ) + + return dice_score + + +class MulticlassDiceLogLoss(MulticlassDiceLoss): + def _optional_loss_operations(self, loss): + return -torch.log( + loss + torch.finfo(torch.float32).eps + ) # epsilon for numerical stability + + +class MulticlassMCCLoss(AbstractSegmentationMultiClassLoss): + """ + This class computes the Matthews Correlation Coefficient (MCC) loss between two tensors. """ - predicted_flat = predicted.flatten() - label_flat = target.flatten() - intersection = (predicted_flat * label_flat).sum() - dice_score = (2.0 * intersection + sys.float_info.min) / ( - predicted_flat.sum() + label_flat.sum() + sys.float_info.min - ) + def _single_class_loss_calculator( + self, prediction: torch.Tensor, target: torch.Tensor + ) -> torch.Tensor: + """ + Compute MCC score for a single class. + + Args: + prediction (torch.Tensor): Network's predicted segmentation mask + target (torch.Tensor): Target segmentation mask + + Returns: + torch.Tensor: The computed MCC score. + """ + tp = torch.sum(torch.mul(prediction, target)) + tn = torch.sum(torch.mul((1 - prediction), (1 - target))) + fp = torch.sum(torch.mul(prediction, (1 - target))) + fn = torch.sum(torch.mul((1 - prediction), target)) + + numerator = torch.mul(tp, tn) - torch.mul(fp, fn) + # Adding epsilon to the denominator to avoid divide-by-zero errors. 
+ denominator = ( + torch.sqrt( + torch.add(tp, 1, fp) + * torch.add(tp, 1, fn) + * torch.add(tn, 1, fp) + * torch.add(tn, 1, fn) + ) + + torch.finfo(torch.float32).eps + ) - return dice_score + return torch.div(numerator.sum(), denominator.sum()) + + +class MulticlassMCLLogLoss(MulticlassMCCLoss): + def _optional_loss_operations(self, loss): + return -torch.log( + loss + torch.finfo(torch.float32).eps + ) # epsilon for numerical stability + + +class MulticlassTverskyLoss(AbstractSegmentationMultiClassLoss): + """ + This class computes the Tversky loss between two tensors. + """ + + def __init__(self, params: dict): + super().__init__(params) + self.alpha = params.get("alpha", 0.5) + self.beta = params.get("beta", 0.5) + + def _single_class_loss_calculator( + self, prediction: torch.Tensor, target: torch.Tensor + ) -> torch.Tensor: + """ + Compute Tversky score for a single class. + + Args: + prediction (torch.Tensor): Network's predicted segmentation mask + target (torch.Tensor): Target segmentation mask + + Returns: + torch.Tensor: The computed Tversky score. + """ + predicted_flat = prediction.contiguous().view(-1) + target_flat = target.contiguous().view(-1) + + true_positives = (predicted_flat * target_flat).sum() + false_positives = ((1 - target_flat) * predicted_flat).sum() + false_negatives = (target_flat * (1 - predicted_flat)).sum() + + numerator = true_positives + denominator = ( + true_positives + self.alpha * false_positives + self.beta * false_negatives + ) + loss = (numerator + sys.float_info.min) / (denominator + sys.float_info.min) + + return loss def mcc(predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor: @@ -114,6 +212,22 @@ def generic_loss_calculator( return accumulated_loss +class KullbackLeiblerDivergence(AbstractLossFunction): + def forward(self, mu: torch.Tensor, logvar: torch.Tensor, *args) -> torch.Tensor: + """ + Calculates the Kullback-Leibler divergence between two Gaussian distributions. + + Args: + mu (torch.Tensor): The mean of the first Gaussian distribution. + logvar (torch.Tensor): The logarithm of the variance of the first Gaussian distribution. + + Returns: + torch.Tensor: The computed Kullback-Leibler divergence + """ + loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=-1) + return loss.mean() + + def MCD_loss( predicted: torch.Tensor, target: torch.Tensor, params: dict ) -> torch.Tensor: From f7e168bde3cecbac5b8efe1c34ccaa4e7feeebd4 Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Sat, 16 Nov 2024 22:37:50 +0100 Subject: [PATCH 15/43] Segmentation losses refactored --- GANDLF/losses/loss_interface.py | 9 +-- GANDLF/losses/segmentation.py | 105 +++++++++++++++++++++++++++----- 2 files changed, 92 insertions(+), 22 deletions(-) diff --git a/GANDLF/losses/loss_interface.py b/GANDLF/losses/loss_interface.py index 49d5b5031..53c5a9325 100644 --- a/GANDLF/losses/loss_interface.py +++ b/GANDLF/losses/loss_interface.py @@ -9,9 +9,7 @@ def __init__(self, params: dict): self.params = params @abstractmethod - def forward( - self, prediction: torch.Tensor, target: torch.Tensor, *args - ) -> torch.Tensor: + def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor: pass @@ -49,9 +47,7 @@ def _single_class_loss_calculator( """Compute loss for a pair of prediction and target tensors. 
To be implemented by child classes.""" pass - def forward( - self, prediction: torch.Tensor, target: torch.Tensor, *args - ) -> torch.Tensor: + def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor: accumulated_loss = torch.tensor(0.0, device=prediction.device) for class_idx in range(self.num_classes): @@ -64,6 +60,7 @@ def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tenso current_loss = current_loss * self.penalty_weights[class_idx] accumulated_loss += current_loss + # TODO shouldn't we always divide by the number of classes? if self.penalty_weights is None: accumulated_loss /= self.num_classes diff --git a/GANDLF/losses/segmentation.py b/GANDLF/losses/segmentation.py index 35feb3c25..675dab74c 100644 --- a/GANDLF/losses/segmentation.py +++ b/GANDLF/losses/segmentation.py @@ -124,6 +124,95 @@ def _single_class_loss_calculator( return loss +class MulticlassFocalLoss(AbstractSegmentationMultiClassLoss): + """ + This class computes the Focal loss between two tensors. + """ + + def __init__(self, params: dict): + super().__init__(params) + + self.ce_loss_helper = torch.nn.CrossEntropyLoss(reduction="none") + loss_params = params["loss_function"] + self.alpha = 1.0 + self.gamma = 2.0 + self.output_aggregation = "sum" + if isinstance(loss_params, dict): + self.alpha = loss_params.get("alpha", self.alpha) + self.gamma = loss_params.get("gamma", self.gamma) + self.output_aggregation = loss_params.get( + "size_average", + self.output_aggregation, # naming mismatch of key due to keeping API consistent with config format + ) + assert self.output_aggregation in [ + "sum", + "mean", + ], f"Invalid output aggregation method defined for Focal Loss: {self.output_aggregation}. Valid options are ['sum', 'mean']" + + def _single_class_loss_calculator( + self, prediction: torch.Tensor, target: torch.Tensor + ) -> torch.Tensor: + """ + Compute focal loss for a single class.
+ target (torch.Tensor): Required target label to match the predicted with + + Returns: + torch.Tensor: The computed dice score. + """ + predicted_flat = predicted.flatten() + label_flat = target.flatten() + intersection = (predicted_flat * label_flat).sum() + + dice_score = (2.0 * intersection + sys.float_info.min) / ( + predicted_flat.sum() + label_flat.sum() + sys.float_info.min + ) + + return dice_score + + def mcc(predictions: torch.Tensor, targets: torch.Tensor) -> torch.Tensor: """ This function computes the Matthews Correlation Coefficient (MCC) between two tensors. Adapted from https://github.com/kakumarabhishek/MCC-Loss/blob/main/loss.py. @@ -212,22 +301,6 @@ def generic_loss_calculator( return accumulated_loss -class KullbackLeiblerDivergence(AbstractLossFunction): - def forward(self, mu: torch.Tensor, logvar: torch.Tensor, *args) -> torch.Tensor: - """ - Calculates the Kullback-Leibler divergence between two Gaussian distributions. - - Args: - mu (torch.Tensor): The mean of the first Gaussian distribution. - logvar (torch.Tensor): The logarithm of the variance of the first Gaussian distribution. - - Returns: - torch.Tensor: The computed Kullback-Leibler divergence - """ - loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=-1) - return loss.mean() - - def MCD_loss( predicted: torch.Tensor, target: torch.Tensor, params: dict ) -> torch.Tensor: From 572ea5e94724458865ce106a1f7d204b24ae57ed Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Sat, 16 Nov 2024 22:39:48 +0100 Subject: [PATCH 16/43] Move losses temporairly to a new file --- GANDLF/losses/segmentation.py | 187 ----------------------------- GANDLF/losses/segmentation_new.py | 189 ++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+), 187 deletions(-) create mode 100644 GANDLF/losses/segmentation_new.py diff --git a/GANDLF/losses/segmentation.py b/GANDLF/losses/segmentation.py index 675dab74c..32e43bc25 100644 --- a/GANDLF/losses/segmentation.py +++ b/GANDLF/losses/segmentation.py @@ -1,193 +1,6 @@ import sys from typing import List, Optional import torch -from .loss_interface import AbstractSegmentationMultiClassLoss, AbstractLossFunction - - -class MulticlassDiceLoss(AbstractSegmentationMultiClassLoss): - """ - This class computes the Dice loss between two tensors. - """ - - def _single_class_loss_calculator( - self, prediction: torch.Tensor, target: torch.Tensor - ) -> torch.Tensor: - """ - Compute Dice score for a single class. - - Args: - prediction (torch.Tensor): Network's predicted segmentation mask - target (torch.Tensor): Target segmentation mask - - Returns: - torch.Tensor: The computed dice score. - """ - predicted_flat = prediction.flatten() - label_flat = target.flatten() - intersection = (predicted_flat * label_flat).sum() - - dice_score = (2.0 * intersection + sys.float_info.min) / ( - predicted_flat.sum() + label_flat.sum() + sys.float_info.min - ) - - return dice_score - - -class MulticlassDiceLogLoss(MulticlassDiceLoss): - def _optional_loss_operations(self, loss): - return -torch.log( - loss + torch.finfo(torch.float32).eps - ) # epsilon for numerical stability - - -class MulticlassMCCLoss(AbstractSegmentationMultiClassLoss): - """ - This class computes the Matthews Correlation Coefficient (MCC) loss between two tensors. - """ - - def _single_class_loss_calculator( - self, prediction: torch.Tensor, target: torch.Tensor - ) -> torch.Tensor: - """ - Compute MCC score for a single class. 
- - Args: - prediction (torch.Tensor): Network's predicted segmentation mask - target (torch.Tensor): Target segmentation mask - - Returns: - torch.Tensor: The computed MCC score. - """ - tp = torch.sum(torch.mul(prediction, target)) - tn = torch.sum(torch.mul((1 - prediction), (1 - target))) - fp = torch.sum(torch.mul(prediction, (1 - target))) - fn = torch.sum(torch.mul((1 - prediction), target)) - - numerator = torch.mul(tp, tn) - torch.mul(fp, fn) - # Adding epsilon to the denominator to avoid divide-by-zero errors. - denominator = ( - torch.sqrt( - torch.add(tp, 1, fp) - * torch.add(tp, 1, fn) - * torch.add(tn, 1, fp) - * torch.add(tn, 1, fn) - ) - + torch.finfo(torch.float32).eps - ) - - return torch.div(numerator.sum(), denominator.sum()) - - -class MulticlassMCLLogLoss(MulticlassMCCLoss): - def _optional_loss_operations(self, loss): - return -torch.log( - loss + torch.finfo(torch.float32).eps - ) # epsilon for numerical stability - - -class MulticlassTverskyLoss(AbstractSegmentationMultiClassLoss): - """ - This class computes the Tversky loss between two tensors. - """ - - def __init__(self, params: dict): - super().__init__(params) - self.alpha = params.get("alpha", 0.5) - self.beta = params.get("beta", 0.5) - - def _single_class_loss_calculator( - self, prediction: torch.Tensor, target: torch.Tensor - ) -> torch.Tensor: - """ - Compute Tversky score for a single class. - - Args: - prediction (torch.Tensor): Network's predicted segmentation mask - target (torch.Tensor): Target segmentation mask - - Returns: - torch.Tensor: The computed Tversky score. - """ - predicted_flat = prediction.contiguous().view(-1) - target_flat = target.contiguous().view(-1) - - true_positives = (predicted_flat * target_flat).sum() - false_positives = ((1 - target_flat) * predicted_flat).sum() - false_negatives = (target_flat * (1 - predicted_flat)).sum() - - numerator = true_positives - denominator = ( - true_positives + self.alpha * false_positives + self.beta * false_negatives - ) - loss = (numerator + sys.float_info.min) / (denominator + sys.float_info.min) - - return loss - - -class MulticlassFocalLoss(AbstractSegmentationMultiClassLoss): - """ - This class computes the Focal loss between two tensors. - """ - - def __init__(self, params: dict): - super().__init__(params) - - self.ce_loss_helper = torch.nn.CrossEntropyLoss(reduction="none") - loss_params = params["loss_function"] - self.alpha = 1.0 - self.gamma = 2.0 - self.output_aggregation = "sum" - if isinstance(loss_params, dict): - self.alpha = loss_params.get("alpha", self.alpha) - self.gamma = loss_params.get("gamma", self.gamma) - self.output_aggregation = loss_params.get( - "size_average", - self.output_aggregation, # naming mismatch of key due to keeping API consistent with config format - ) - assert self.output_aggregation in [ - "sum", - "mean", - ], f"Invalid output aggregation method defined for Foal Loss: {self.output_aggregation}. Valid options are ['sum', 'mean']" - - def _single_class_loss_calculator( - self, prediction: torch.Tensor, target: torch.Tensor - ) -> torch.Tensor: - """ - Compute focal loss for a single class. 
It is based on the following formulas: - FocalLoss(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t) - CrossEntropy(pred, target) = -log(pred) if target = 1 else -log(1 - pred) - CrossEntropy(p_t) = CrossEntropy(pred, target) = -log(p_t) - p_t = p if target = 1 else 1 - p - """ - ce_loss = self.ce_loss_helper(prediction, target) - p_t = torch.exp(-ce_loss) - loss = -self.alpha * (1 - p_t) ** self.gamma * ce_loss - return loss.sum() if self.output_aggregation == "sum" else loss.mean() - - def _compute_single_class_loss( - self, prediction: torch.Tensor, target: torch.Tensor, class_idx: int - ) -> torch.Tensor: - """Compute loss for a single class.""" - loss_value = self._single_class_loss_calculator( - prediction[:, class_idx, ...], target[:, class_idx, ...] - ) - return loss_value # no need to subtract from 1 in this case, hence the override - - -class KullbackLeiblerDivergence(AbstractLossFunction): - def forward(self, mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor: - """ - Calculates the Kullback-Leibler divergence between two Gaussian distributions. - - Args: - mu (torch.Tensor): The mean of the first Gaussian distribution. - logvar (torch.Tensor): The logarithm of the variance of the first Gaussian distribution. - - Returns: - torch.Tensor: The computed Kullback-Leibler divergence - """ - loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=-1) - return loss.mean() # Dice scores and dice losses diff --git a/GANDLF/losses/segmentation_new.py b/GANDLF/losses/segmentation_new.py new file mode 100644 index 000000000..e68965848 --- /dev/null +++ b/GANDLF/losses/segmentation_new.py @@ -0,0 +1,189 @@ +import sys +import torch +from .loss_interface import AbstractSegmentationMultiClassLoss, AbstractLossFunction + + +class MulticlassDiceLoss(AbstractSegmentationMultiClassLoss): + """ + This class computes the Dice loss between two tensors. + """ + + def _single_class_loss_calculator( + self, prediction: torch.Tensor, target: torch.Tensor + ) -> torch.Tensor: + """ + Compute Dice score for a single class. + + Args: + prediction (torch.Tensor): Network's predicted segmentation mask + target (torch.Tensor): Target segmentation mask + + Returns: + torch.Tensor: The computed dice score. + """ + predicted_flat = prediction.flatten() + label_flat = target.flatten() + intersection = (predicted_flat * label_flat).sum() + + dice_score = (2.0 * intersection + sys.float_info.min) / ( + predicted_flat.sum() + label_flat.sum() + sys.float_info.min + ) + + return dice_score + + +class MulticlassDiceLogLoss(MulticlassDiceLoss): + def _optional_loss_operations(self, loss): + return -torch.log( + loss + torch.finfo(torch.float32).eps + ) # epsilon for numerical stability + + +class MulticlassMCCLoss(AbstractSegmentationMultiClassLoss): + """ + This class computes the Matthews Correlation Coefficient (MCC) loss between two tensors. + """ + + def _single_class_loss_calculator( + self, prediction: torch.Tensor, target: torch.Tensor + ) -> torch.Tensor: + """ + Compute MCC score for a single class. + + Args: + prediction (torch.Tensor): Network's predicted segmentation mask + target (torch.Tensor): Target segmentation mask + + Returns: + torch.Tensor: The computed MCC score. 
+ """ + tp = torch.sum(torch.mul(prediction, target)) + tn = torch.sum(torch.mul((1 - prediction), (1 - target))) + fp = torch.sum(torch.mul(prediction, (1 - target))) + fn = torch.sum(torch.mul((1 - prediction), target)) + + numerator = torch.mul(tp, tn) - torch.mul(fp, fn) + # Adding epsilon to the denominator to avoid divide-by-zero errors. + denominator = ( + torch.sqrt( + torch.add(tp, 1, fp) + * torch.add(tp, 1, fn) + * torch.add(tn, 1, fp) + * torch.add(tn, 1, fn) + ) + + torch.finfo(torch.float32).eps + ) + + return torch.div(numerator.sum(), denominator.sum()) + + +class MulticlassMCLLogLoss(MulticlassMCCLoss): + def _optional_loss_operations(self, loss): + return -torch.log( + loss + torch.finfo(torch.float32).eps + ) # epsilon for numerical stability + + +class MulticlassTverskyLoss(AbstractSegmentationMultiClassLoss): + """ + This class computes the Tversky loss between two tensors. + """ + + def __init__(self, params: dict): + super().__init__(params) + self.alpha = params.get("alpha", 0.5) + self.beta = params.get("beta", 0.5) + + def _single_class_loss_calculator( + self, prediction: torch.Tensor, target: torch.Tensor + ) -> torch.Tensor: + """ + Compute Tversky score for a single class. + + Args: + prediction (torch.Tensor): Network's predicted segmentation mask + target (torch.Tensor): Target segmentation mask + + Returns: + torch.Tensor: The computed Tversky score. + """ + predicted_flat = prediction.contiguous().view(-1) + target_flat = target.contiguous().view(-1) + + true_positives = (predicted_flat * target_flat).sum() + false_positives = ((1 - target_flat) * predicted_flat).sum() + false_negatives = (target_flat * (1 - predicted_flat)).sum() + + numerator = true_positives + denominator = ( + true_positives + self.alpha * false_positives + self.beta * false_negatives + ) + loss = (numerator + sys.float_info.min) / (denominator + sys.float_info.min) + + return loss + + +class MulticlassFocalLoss(AbstractSegmentationMultiClassLoss): + """ + This class computes the Focal loss between two tensors. + """ + + def __init__(self, params: dict): + super().__init__(params) + + self.ce_loss_helper = torch.nn.CrossEntropyLoss(reduction="none") + loss_params = params["loss_function"] + self.alpha = 1.0 + self.gamma = 2.0 + self.output_aggregation = "sum" + if isinstance(loss_params, dict): + self.alpha = loss_params.get("alpha", self.alpha) + self.gamma = loss_params.get("gamma", self.gamma) + self.output_aggregation = loss_params.get( + "size_average", + self.output_aggregation, # naming mismatch of key due to keeping API consistent with config format + ) + assert self.output_aggregation in [ + "sum", + "mean", + ], f"Invalid output aggregation method defined for Focal Loss: {self.output_aggregation}. Valid options are ['sum', 'mean']" + + def _single_class_loss_calculator( + self, prediction: torch.Tensor, target: torch.Tensor + ) -> torch.Tensor: + """ + Compute focal loss for a single class.
It is based on the following formulas: + FocalLoss(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t) + CrossEntropy(pred, target) = -log(pred) if target = 1 else -log(1 - pred) + CrossEntropy(p_t) = CrossEntropy(pred, target) = -log(p_t) + p_t = p if target = 1 else 1 - p + """ + ce_loss = self.ce_loss_helper(prediction, target) + p_t = torch.exp(-ce_loss) + loss = -self.alpha * (1 - p_t) ** self.gamma * ce_loss + return loss.sum() if self.output_aggregation == "sum" else loss.mean() + + def _compute_single_class_loss( + self, prediction: torch.Tensor, target: torch.Tensor, class_idx: int + ) -> torch.Tensor: + """Compute loss for a single class.""" + loss_value = self._single_class_loss_calculator( + prediction[:, class_idx, ...], target[:, class_idx, ...] + ) + return loss_value # no need to subtract from 1 in this case, hence the override + + +class KullbackLeiblerDivergence(AbstractLossFunction): + def forward(self, mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor: + """ + Calculates the Kullback-Leibler divergence between two Gaussian distributions. + + Args: + mu (torch.Tensor): The mean of the first Gaussian distribution. + logvar (torch.Tensor): The logarithm of the variance of the first Gaussian distribution. + + Returns: + torch.Tensor: The computed Kullback-Leibler divergence + """ + loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=-1) + return loss.mean() From d80ff6ace2b1b8bff34cc2fa6dfdd3a6fef43293 Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Mon, 18 Nov 2024 14:17:22 +0100 Subject: [PATCH 17/43] Refactored regression losses WIP --- GANDLF/losses/loss_interface.py | 75 ++++++++++++++++++++++++++++--- GANDLF/losses/regression.py | 64 +++++++++++++++++++++++++- GANDLF/losses/segmentation_new.py | 8 +++- 3 files changed, 137 insertions(+), 10 deletions(-) diff --git a/GANDLF/losses/loss_interface.py b/GANDLF/losses/loss_interface.py index 53c5a9325..4aeeedb51 100644 --- a/GANDLF/losses/loss_interface.py +++ b/GANDLF/losses/loss_interface.py @@ -7,6 +7,14 @@ class AbstractLossFunction(nn.Module, ABC): def __init__(self, params: dict): nn.Module.__init__(self) self.params = params + self.num_classes = len(params["model"]["class_list"]) + self._initialize_penalty_weights() + + def _initialize_penalty_weights(self): + default_penalty_weights = torch.ones(self.num_classes) + self.penalty_weights = self.params.get( + "penalty_weights", default_penalty_weights + ) @abstractmethod def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor: @@ -20,8 +28,6 @@ class AbstractSegmentationMultiClassLoss(AbstractLossFunction): def __init__(self, params: dict): super().__init__(params) - self.num_classes = len(params["model"]["class_list"]) - self.penalty_weights = params["penalty_weights"] def _compute_single_class_loss( self, prediction: torch.Tensor, target: torch.Tensor, class_idx: int @@ -54,14 +60,69 @@ def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tenso current_loss = self._compute_single_class_loss( prediction, target, class_idx ) - current_loss = self._optional_loss_operations(current_loss) - - if self.penalty_weights is not None: - current_loss = current_loss * self.penalty_weights[class_idx] - accumulated_loss += current_loss + accumulated_loss += ( + self._optional_loss_operations(current_loss) + * self.penalty_weights[class_idx] + ) # TODO shouldn't we always divide by the number of classes? 
if self.penalty_weights is None: accumulated_loss /= self.num_classes return accumulated_loss + + +class AbstractRegressionLoss(AbstractLossFunction): + """ + Base class for loss functions that are used for regression and classification tasks. + """ + + def __init__(self, params: dict): + super().__init__(params) + self.loss_calculator = self._initialize_loss_function_object() + self.reduction_method = self._initialize_reduction_method() + + def _initialize_reduction_method(self) -> str: + """ + Initialize the reduction method for the loss function. Defaults to 'mean'. + """ + loss_params = self.params["loss_function"] + reduction_method = "mean" + if isinstance(loss_params, dict): + reduction_method = loss_params.get("reduction", reduction_method) + assert reduction_method in [ + "mean", + "sum", + ], f"Invalid reduction method defined for loss function: {reduction_method}. Valid options are ['mean', 'sum']" + return reduction_method + + def _calculate_loss_for_single_class( + self, prediction: torch.Tensor, target: torch.Tensor + ) -> torch.Tensor: + """ + Calculate loss for a single class. To be implemented by child classes. + """ + return self.loss_calculator(prediction, target) + + @abstractmethod + def _initialize_loss_function_object(self) -> nn.modules.loss._Loss: + """ + Initialize the loss function object used in the forward method. Has to return + callable pytorch loss function object. + """ + pass + + def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + accumulated_loss = torch.tensor(0.0, device=prediction.device) + for class_idx in range(self.num_classes): + accumulated_loss += ( + self._calculate_loss_for_single_class( + prediction[:, class_idx, ...], target[:, class_idx, ...] + ) + * self.penalty_weights[class_idx] + ) + + # TODO I Believe this is how it should be, also for segmentation - take average from all classes, despite weights being present or no + accumulated_loss /= self.num_classes + + return accumulated_loss diff --git a/GANDLF/losses/regression.py b/GANDLF/losses/regression.py index 6d74a33a2..593090319 100644 --- a/GANDLF/losses/regression.py +++ b/GANDLF/losses/regression.py @@ -1,8 +1,70 @@ from typing import Optional import torch +from torch import nn import torch.nn.functional as F -from torch.nn import CrossEntropyLoss from GANDLF.utils import one_hot +from GANDLF.losses.loss_interface import AbstractRegressionLoss + + +class CrossEntropyLoss(AbstractRegressionLoss): + """ + This class computes the cross entropy loss between two tensors. + """ + + def _initialize_loss_function_object(self): + return nn.CrossEntropyLoss(reduction=self.reduction_method) + + +class BinaryCrossEntropyLoss(AbstractRegressionLoss): + """ + This class computes the binary cross entropy loss between two tensors. + """ + + def _initialize_loss_function_object(self): + return nn.BCELoss(reduction=self.reduction_method) + + +class BinaryCrossEntropyWithLogitsLoss(AbstractRegressionLoss): + """ + This class computes the binary cross entropy loss with logits between two tensors. + """ + + def _initialize_loss_function_object(self): + return nn.BCEWithLogitsLoss(reduction=self.reduction_method) + + +class BaseLossWithScaledTarget(AbstractRegressionLoss): + """ + General interface for the loss functions requiring scaling of the target tensor. 
+ """ + + def _initialize_scaling_factor(self): + loss_params: dict = self.params["loss_function"] + self.scaling_factor = loss_params.get("scaling_factor", 1.0) + if isinstance(loss_params, dict): + self.scaling_factor = loss_params.get("scaling_factor", self.scaling_factor) + return self.scaling_factor + + def _calculate_loss(self, prediction: torch.Tensor, target: torch.Tensor): + return self.loss_calculator(prediction, target * self.scaling_factor) + + +class L1Loss(BaseLossWithScaledTarget): + """ + This class computes the L1 loss between two tensors. + """ + + def _initialize_loss_function_object(self): + return nn.L1Loss(reduction=self.reduction_method) + + +class MSELoss(BaseLossWithScaledTarget): + """ + This class computes the mean squared error loss between two tensors. + """ + + def _initialize_loss_function_object(self): + return nn.MSELoss(reduction=self.reduction_method) def CEL( diff --git a/GANDLF/losses/segmentation_new.py b/GANDLF/losses/segmentation_new.py index e68965848..10133196d 100644 --- a/GANDLF/losses/segmentation_new.py +++ b/GANDLF/losses/segmentation_new.py @@ -91,8 +91,12 @@ class MulticlassTverskyLoss(AbstractSegmentationMultiClassLoss): def __init__(self, params: dict): super().__init__(params) - self.alpha = params.get("alpha", 0.5) - self.beta = params.get("beta", 0.5) + loss_params = params["loss_function"] + self.alpha = 0.5 + self.beta = 0.5 + if isinstance(loss_params, dict): + self.alpha = loss_params.get("alpha", self.alpha) + self.beta = loss_params.get("beta", self.beta) def _single_class_loss_calculator( self, prediction: torch.Tensor, target: torch.Tensor From d2578ad88ecc49ef192477334c30cb1338a39918 Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Mon, 18 Nov 2024 14:20:41 +0100 Subject: [PATCH 18/43] Move losses to separate file --- GANDLF/losses/regression.py | 63 +------------------------------------ 1 file changed, 1 insertion(+), 62 deletions(-) diff --git a/GANDLF/losses/regression.py b/GANDLF/losses/regression.py index 593090319..62f80bffd 100644 --- a/GANDLF/losses/regression.py +++ b/GANDLF/losses/regression.py @@ -3,68 +3,7 @@ from torch import nn import torch.nn.functional as F from GANDLF.utils import one_hot -from GANDLF.losses.loss_interface import AbstractRegressionLoss - - -class CrossEntropyLoss(AbstractRegressionLoss): - """ - This class computes the cross entropy loss between two tensors. - """ - - def _initialize_loss_function_object(self): - return nn.CrossEntropyLoss(reduction=self.reduction_method) - - -class BinaryCrossEntropyLoss(AbstractRegressionLoss): - """ - This class computes the binary cross entropy loss between two tensors. - """ - - def _initialize_loss_function_object(self): - return nn.BCELoss(reduction=self.reduction_method) - - -class BinaryCrossEntropyWithLogitsLoss(AbstractRegressionLoss): - """ - This class computes the binary cross entropy loss with logits between two tensors. - """ - - def _initialize_loss_function_object(self): - return nn.BCEWithLogitsLoss(reduction=self.reduction_method) - - -class BaseLossWithScaledTarget(AbstractRegressionLoss): - """ - General interface for the loss functions requiring scaling of the target tensor. 
- """ - - def _initialize_scaling_factor(self): - loss_params: dict = self.params["loss_function"] - self.scaling_factor = loss_params.get("scaling_factor", 1.0) - if isinstance(loss_params, dict): - self.scaling_factor = loss_params.get("scaling_factor", self.scaling_factor) - return self.scaling_factor - - def _calculate_loss(self, prediction: torch.Tensor, target: torch.Tensor): - return self.loss_calculator(prediction, target * self.scaling_factor) - - -class L1Loss(BaseLossWithScaledTarget): - """ - This class computes the L1 loss between two tensors. - """ - - def _initialize_loss_function_object(self): - return nn.L1Loss(reduction=self.reduction_method) - - -class MSELoss(BaseLossWithScaledTarget): - """ - This class computes the mean squared error loss between two tensors. - """ - - def _initialize_loss_function_object(self): - return nn.MSELoss(reduction=self.reduction_method) +from torch.nn import CrossEntropyLoss def CEL( From 2d661b9dd1c668c4211e380edaf89b49088baea7 Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Mon, 18 Nov 2024 14:35:09 +0100 Subject: [PATCH 19/43] Hybrid losses implementation --- GANDLF/losses/hybrid.py | 1 - GANDLF/losses/hybrid_new.py | 21 +++++++++++ GANDLF/losses/loss_interface.py | 22 ++++++++++++ GANDLF/losses/regression_new.py | 64 +++++++++++++++++++++++++++++++++ 4 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 GANDLF/losses/hybrid_new.py create mode 100644 GANDLF/losses/regression_new.py diff --git a/GANDLF/losses/hybrid.py b/GANDLF/losses/hybrid.py index ddf62fa01..f4c862606 100644 --- a/GANDLF/losses/hybrid.py +++ b/GANDLF/losses/hybrid.py @@ -1,5 +1,4 @@ import torch - from .segmentation import MCD_loss, FocalLoss from .regression import CCE_Generic, CE, CE_Logits diff --git a/GANDLF/losses/hybrid_new.py b/GANDLF/losses/hybrid_new.py new file mode 100644 index 000000000..4fa7edfcc --- /dev/null +++ b/GANDLF/losses/hybrid_new.py @@ -0,0 +1,21 @@ +from .regression_new import BinaryCrossEntropyLoss, BinaryCrossEntropyWithLogitsLoss +from .segmentation_new import MulticlassDiceLoss, MulticlassFocalLoss +from .loss_interface import AbstractHybridLoss + + +class DiceCrossEntropyLoss(AbstractHybridLoss): + def _initialize_all_loss_calculators(self): + return [MulticlassDiceLoss(self.params), BinaryCrossEntropyLoss(self.params)] + + +class DiceCrossEntropyLossLogits(AbstractHybridLoss): + def _initialize_all_loss_calculators(self): + return [ + MulticlassDiceLoss(self.params), + BinaryCrossEntropyWithLogitsLoss(self.params), + ] + + +class DiceFocalLoss(AbstractHybridLoss): + def _initialize_all_loss_calculators(self): + return [MulticlassDiceLoss(self.params), MulticlassFocalLoss(self.params)] diff --git a/GANDLF/losses/loss_interface.py b/GANDLF/losses/loss_interface.py index 4aeeedb51..bb983f2ed 100644 --- a/GANDLF/losses/loss_interface.py +++ b/GANDLF/losses/loss_interface.py @@ -1,6 +1,7 @@ import torch from torch import nn from abc import ABC, abstractmethod +from typing import List class AbstractLossFunction(nn.Module, ABC): @@ -126,3 +127,24 @@ def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tenso accumulated_loss /= self.num_classes return accumulated_loss + + +class AbstractHybridLoss(AbstractLossFunction): + def __init__(self, params: dict): + super().__init__(params) + self.loss_calculators = self._initialize_all_loss_calculators() + + @abstractmethod + def _initialize_all_loss_calculators(self) -> List[AbstractLossFunction]: + """ + Each hybrid loss should implement this method, creating 
+        all loss functions as a list that
+        will be used during the forward pass.
+        """
+        pass
+
+    def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+        accumulated_loss = torch.tensor(0.0, device=prediction.device)
+        # Iterate over the calculators built once in __init__, rather than
+        # re-instantiating every component loss on each forward pass.
+        for loss_calculator in self.loss_calculators:
+            accumulated_loss += loss_calculator(prediction, target)
+
+        return accumulated_loss
diff --git a/GANDLF/losses/regression_new.py b/GANDLF/losses/regression_new.py
new file mode 100644
index 000000000..e9e0d5db0
--- /dev/null
+++ b/GANDLF/losses/regression_new.py
@@ -0,0 +1,64 @@
+import torch
+from torch import nn
+from .loss_interface import AbstractRegressionLoss
+
+
+class CrossEntropyLoss(AbstractRegressionLoss):
+    """
+    This class computes the cross entropy loss between two tensors.
+    """
+
+    def _initialize_loss_function_object(self):
+        return nn.CrossEntropyLoss(reduction=self.reduction_method)
+
+
+class BinaryCrossEntropyLoss(AbstractRegressionLoss):
+    """
+    This class computes the binary cross entropy loss between two tensors.
+    """
+
+    def _initialize_loss_function_object(self):
+        return nn.BCELoss(reduction=self.reduction_method)
+
+
+class BinaryCrossEntropyWithLogitsLoss(AbstractRegressionLoss):
+    """
+    This class computes the binary cross entropy loss with logits between two tensors.
+    """
+
+    def _initialize_loss_function_object(self):
+        return nn.BCEWithLogitsLoss(reduction=self.reduction_method)
+
+
+class BaseLossWithScaledTarget(AbstractRegressionLoss):
+    """
+    General interface for the loss functions requiring scaling of the target tensor.
+    """
+
+    def _initialize_scaling_factor(self):
+        loss_params = self.params["loss_function"]
+        # the loss may be configured as a plain string, so only read the
+        # scaling factor when a dict of options is present
+        self.scaling_factor = 1.0
+        if isinstance(loss_params, dict):
+            self.scaling_factor = loss_params.get("scaling_factor", self.scaling_factor)
+        return self.scaling_factor
+
+    def _calculate_loss(self, prediction: torch.Tensor, target: torch.Tensor):
+        return self.loss_calculator(prediction, target * self.scaling_factor)
+
+
+class L1Loss(BaseLossWithScaledTarget):
+    """
+    This class computes the L1 loss between two tensors.
+    """
+
+    def _initialize_loss_function_object(self):
+        return nn.L1Loss(reduction=self.reduction_method)
+
+
+class MSELoss(BaseLossWithScaledTarget):
+    """
+    This class computes the mean squared error loss between two tensors.
+    """
+
+    def _initialize_loss_function_object(self):
+        return nn.MSELoss(reduction=self.reduction_method)
From 9ac300c24ca6f3151437bd47405fe1b5658f48a8 Mon Sep 17 00:00:00 2001
From: Szymon Mazurek
Date: Mon, 18 Nov 2024 22:01:59 +0100
Subject: [PATCH 20/43] Fix docstrings, remove todos

---
 GANDLF/losses/loss_interface.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/GANDLF/losses/loss_interface.py b/GANDLF/losses/loss_interface.py
index bb983f2ed..69dc4360b 100644
--- a/GANDLF/losses/loss_interface.py
+++ b/GANDLF/losses/loss_interface.py
@@ -41,8 +41,8 @@ def _compute_single_class_loss(
 
     def _optional_loss_operations(self, loss: torch.Tensor) -> torch.Tensor:
         """
-        Perform addtional operations of the loss value. Defaults to identity operation.
-        If needed, child classes can override this method. Useful in the cases where
+        Perform additional operations on the loss value. Defaults to identity operation.
+        If needed, child classes can override this method. Useful in cases where
         for example, the loss value needs to be log-transformed or clipped.
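+
+        A minimal override (mirroring MulticlassMCCLogLoss in segmentation_new.py,
+        shown purely as an illustration):
+
+            def _optional_loss_operations(self, loss):
+                return -torch.log(loss + torch.finfo(torch.float32).eps)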
""" return loss @@ -66,9 +66,7 @@ def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tenso * self.penalty_weights[class_idx] ) - # TODO shouldn't we always divide by the number of classes? - if self.penalty_weights is None: - accumulated_loss /= self.num_classes + accumulated_loss /= self.num_classes return accumulated_loss @@ -123,7 +121,6 @@ def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tenso * self.penalty_weights[class_idx] ) - # TODO I Believe this is how it should be, also for segmentation - take average from all classes, despite weights being present or no accumulated_loss /= self.num_classes return accumulated_loss From 96b64e461af5409991de18c04389e919c867c8a8 Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Tue, 19 Nov 2024 06:41:31 +0100 Subject: [PATCH 21/43] Cleaning up --- GANDLF/losses/loss_interface.py | 18 ++++++++++++------ GANDLF/losses/regression.py | 1 - GANDLF/losses/segmentation_new.py | 10 +++++----- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/GANDLF/losses/loss_interface.py b/GANDLF/losses/loss_interface.py index 69dc4360b..e8459f41d 100644 --- a/GANDLF/losses/loss_interface.py +++ b/GANDLF/losses/loss_interface.py @@ -19,12 +19,14 @@ def _initialize_penalty_weights(self): @abstractmethod def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor: - pass + """ + Forward pass of the loss function. To be implemented by child classes. + """ -class AbstractSegmentationMultiClassLoss(AbstractLossFunction): +class AbstractSegmentationLoss(AbstractLossFunction): """ - Base class for loss funcions that are used for multi-class segmentation tasks. + Base class for loss funcions that are used for segmentation tasks. """ def __init__(self, params: dict): @@ -51,8 +53,9 @@ def _optional_loss_operations(self, loss: torch.Tensor) -> torch.Tensor: def _single_class_loss_calculator( self, prediction: torch.Tensor, target: torch.Tensor ) -> torch.Tensor: - """Compute loss for a pair of prediction and target tensors. To be implemented by child classes.""" - pass + """ + Compute loss for a pair of prediction and target tensors. To be implemented by child classes. + """ def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor: accumulated_loss = torch.tensor(0.0, device=prediction.device) @@ -109,7 +112,6 @@ def _initialize_loss_function_object(self) -> nn.modules.loss._Loss: Initialize the loss function object used in the forward method. Has to return callable pytorch loss function object. """ - pass def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tensor: accumulated_loss = torch.tensor(0.0, device=prediction.device) @@ -127,6 +129,10 @@ def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tenso class AbstractHybridLoss(AbstractLossFunction): + """ + Base class for hybrid loss functions that are used for segmentation tasks. 
+ """ + def __init__(self, params: dict): super().__init__(params) self.loss_calculators = self._initialize_all_loss_calculators() diff --git a/GANDLF/losses/regression.py b/GANDLF/losses/regression.py index 62f80bffd..4949bd9d2 100644 --- a/GANDLF/losses/regression.py +++ b/GANDLF/losses/regression.py @@ -1,6 +1,5 @@ from typing import Optional import torch -from torch import nn import torch.nn.functional as F from GANDLF.utils import one_hot from torch.nn import CrossEntropyLoss diff --git a/GANDLF/losses/segmentation_new.py b/GANDLF/losses/segmentation_new.py index 10133196d..4999686fe 100644 --- a/GANDLF/losses/segmentation_new.py +++ b/GANDLF/losses/segmentation_new.py @@ -1,9 +1,9 @@ import sys import torch -from .loss_interface import AbstractSegmentationMultiClassLoss, AbstractLossFunction +from .loss_interface import AbstractSegmentationLoss, AbstractLossFunction -class MulticlassDiceLoss(AbstractSegmentationMultiClassLoss): +class MulticlassDiceLoss(AbstractSegmentationLoss): """ This class computes the Dice loss between two tensors. """ @@ -39,7 +39,7 @@ def _optional_loss_operations(self, loss): ) # epsilon for numerical stability -class MulticlassMCCLoss(AbstractSegmentationMultiClassLoss): +class MulticlassMCCLoss(AbstractSegmentationLoss): """ This class computes the Matthews Correlation Coefficient (MCC) loss between two tensors. """ @@ -84,7 +84,7 @@ def _optional_loss_operations(self, loss): ) # epsilon for numerical stability -class MulticlassTverskyLoss(AbstractSegmentationMultiClassLoss): +class MulticlassTverskyLoss(AbstractSegmentationLoss): """ This class computes the Tversky loss between two tensors. """ @@ -127,7 +127,7 @@ def _single_class_loss_calculator( return loss -class MulticlassFocalLoss(AbstractSegmentationMultiClassLoss): +class MulticlassFocalLoss(AbstractSegmentationLoss): """ This class computes the Focal loss between two tensors. """ From aae55cbaa177375d90b1c69a7f22530987808fd8 Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:25:55 -0500 Subject: [PATCH 22/43] moved ademamix and lion to a new `thirdparty` module under optimizers --- GANDLF/optimizers/thirdparty/__init__.py | 3 + GANDLF/optimizers/thirdparty/ademamix.py | 204 +++++++++++++++++++++++ GANDLF/optimizers/thirdparty/lion.py | 22 +++ 3 files changed, 229 insertions(+) create mode 100644 GANDLF/optimizers/thirdparty/__init__.py create mode 100644 GANDLF/optimizers/thirdparty/ademamix.py create mode 100644 GANDLF/optimizers/thirdparty/lion.py diff --git a/GANDLF/optimizers/thirdparty/__init__.py b/GANDLF/optimizers/thirdparty/__init__.py new file mode 100644 index 000000000..110b32e2f --- /dev/null +++ b/GANDLF/optimizers/thirdparty/__init__.py @@ -0,0 +1,3 @@ +from .ademamix import ademamix_wrapper + +from .lion import lion_wrapper diff --git a/GANDLF/optimizers/thirdparty/ademamix.py b/GANDLF/optimizers/thirdparty/ademamix.py new file mode 100644 index 000000000..63f68d9f9 --- /dev/null +++ b/GANDLF/optimizers/thirdparty/ademamix.py @@ -0,0 +1,204 @@ +import math +from typing import Callable, Iterable, List, Optional, Tuple + +import torch +from torch import Tensor +from torch.optim import Optimizer + + +class AdEMAMix(Optimizer): + r"""Adapted from https://github.com/frgfm/Holocron/blob/main/holocron/optim/ademamix.py + + Implements the AdEMAMix optimizer from `"The AdEMAMix Optimizer: Better, Faster, Older" `_. + + The estimation of momentums is described as follows, :math:`\forall t \geq 1`: + + .. 
math:: + m_{1,t} \leftarrow \beta_1 m_{1, t-1} + (1 - \beta_1) g_t \\ + m_{2,t} \leftarrow \beta_3 m_{2, t-1} + (1 - \beta_3) g_t \\ + s_t \leftarrow \beta_2 s_{t-1} + (1 - \beta_2) (g_t - m_t)^2 + \epsilon + + where :math:`g_t` is the gradient of :math:`\theta_t`, + :math:`\beta_1, \beta_2, \beta_3 \in [0, 1]^3` are the exponential average smoothing coefficients, + :math:`m_{1,0} = 0,\ m_{2,0} = 0,\ s_0 = 0`, :math:`\epsilon > 0`. + + Then we correct their biases using: + + .. math:: + \hat{m_{1,t}} \leftarrow \frac{m_{1,t}}{1 - \beta_1^t} \\ + \hat{s_t} \leftarrow \frac{s_t}{1 - \beta_2^t} + + And finally the update step is performed using the following rule: + + .. math:: + \theta_t \leftarrow \theta_{t-1} - \eta \frac{\hat{m_{1,t}} + \alpha m_{2,t}}{\sqrt{\hat{s_t}} + \epsilon} + + where :math:`\theta_t` is the parameter value at step :math:`t` (:math:`\theta_0` being the initialization value), + :math:`\eta` is the learning rate, :math:`\alpha > 0` :math:`\epsilon > 0`. + + Args: + params (iterable): iterable of parameters to optimize or dicts defining parameter groups + lr (float, optional): learning rate + betas (Tuple[float, float, float], optional): coefficients used for running averages (default: (0.9, 0.999, 0.9999)) + alpha (float, optional): the exponential decay rate of the second moment estimates (default: 5.0) + eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (bool, optional): whether to use the AMSGrad variant (default: False) + """ + + def __init__( + self, + params: Iterable[torch.nn.Parameter], + lr: float = 1e-3, + betas: Tuple[float, float, float] = (0.9, 0.999, 0.9999), + alpha: float = 5.0, + eps: float = 1e-8, + weight_decay: float = 0.0, + ) -> None: + assert lr >= 0.0, f"Invalid learning rate: {lr}" + assert eps >= 0.0, f"Invalid epsilon value: {eps}" + assert all( + 0.0 <= beta < 1.0 for beta in betas + ), f"Invalid beta parameters: {betas}" + defaults = { + "lr": lr, + "betas": betas, + "alpha": alpha, + "eps": eps, + "weight_decay": weight_decay, + } + super().__init__(params, defaults) + + @torch.no_grad() + def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]: # type: ignore[override] + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avgs_slow = [] + exp_avg_sqs = [] + state_steps = [] + + for p in group["params"]: + if p.grad is not None: + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError( + f"{self.__class__.__name__} does not support sparse gradients" + ) + grads.append(p.grad) + + state = self.state[p] + # Lazy state initialization + if len(state) == 0: + state["step"] = 0 + # Exponential moving average of gradient values + state["exp_avg"] = torch.zeros_like( + p, memory_format=torch.preserve_format + ) + state["exp_avg_slow"] = torch.zeros_like( + p, memory_format=torch.preserve_format + ) + # Exponential moving average of squared gradient values + state["exp_avg_sq"] = torch.zeros_like( + p, memory_format=torch.preserve_format + ) + + exp_avgs.append(state["exp_avg"]) + exp_avgs_slow.append(state["exp_avg_slow"]) + exp_avg_sqs.append(state["exp_avg_sq"]) + + # update the steps for each param group update + state["step"] += 1 + # record the step after step update + state_steps.append(state["step"]) + + beta1, beta2, beta3 = group["betas"] + _update_ademamix( + params_with_grad, + grads, + exp_avgs, + exp_avgs_slow, + exp_avg_sqs, + state_steps, + beta1, + beta2, + beta3, + group["alpha"], + group["lr"], + group["weight_decay"], + group["eps"], + ) + return loss + + +def _update_ademamix( + params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + exp_avgs_slow: List[Tensor], + exp_avg_sqs: List[Tensor], + state_steps: List[int], + beta1: float, + beta2: float, + beta3: float, + alpha: float, + lr: float, + weight_decay: float, + eps: float, +) -> None: + r"""Functional API that performs AdaBelief algorithm computation. + See :class:`~holocron.optim.AdaBelief` for details. + """ + for i, param in enumerate(params): + grad = grads[i] + m1 = exp_avgs[i] + m2 = exp_avgs_slow[i] + nu = exp_avg_sqs[i] + step = state_steps[i] + + bias_correction1 = 1 - beta1**step + bias_correction2 = 1 - beta2**step + + if weight_decay != 0: + grad = grad.add(param, alpha=weight_decay) + + # Decay the first and second moment running average coefficient + m1.mul_(beta1).add_(grad, alpha=1 - beta1) + nu.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + m2.mul_(beta3).add_(grad, alpha=1 - beta3) + + denom = (nu.sqrt() / math.sqrt(bias_correction2)).add_(eps) + + param.addcdiv_(m1 / bias_correction1 + alpha * m2, denom, value=-lr) + + +def ademamix_wrapper(parameters: dict) -> torch.optim.Optimizer: + """ + Creates an AdEMAMix optimizer from the PyTorch `torch.optim` module using the input parameters. + + Args: + parameters (dict): A dictionary containing the input parameters for the optimizer. + + Returns: + torch.optim.Optimizer: An AdEMAMix optimizer. 
+ """ + + return AdEMAMix( + params=parameters["model_parameters"], + lr=parameters.get("learning_rate", 1e-3), + betas=parameters.get("betas", (0.9, 0.999, 0.9999)), + alpha=parameters.get("alpha", 5.0), + eps=parameters.get("eps", 1e-8), + weight_decay=parameters.get("weight_decay", 0.0), + ) diff --git a/GANDLF/optimizers/thirdparty/lion.py b/GANDLF/optimizers/thirdparty/lion.py new file mode 100644 index 000000000..a1b871d2e --- /dev/null +++ b/GANDLF/optimizers/thirdparty/lion.py @@ -0,0 +1,22 @@ +from torch.optim.optimizer import Optimizer +from lion_pytorch import Lion + + +def lion_wrapper(parameters: dict) -> Optimizer: + """ + Creates an instance of the Lion optimizer from the `lion_pytorch` package using the input parameters. + + Args: + parameters (dict): A dictionary containing the input parameters for the optimizer. + + Returns: + Optimizer: An instance of the Lion optimizer. + """ + return Lion( + parameters["model_parameters"], + lr=parameters.get("learning_rate", 1e-4), + betas=parameters["optimizer"].get("betas", (0.9, 0.999)), + weight_decay=parameters["optimizer"].get("weight_decay", 0.0), + decoupled_weight_decay=parameters["optimizer"].get("decoupled_weight_decay", False), + use_triton=False, # as of 20241120, triton is not generally available for all platforms + ) \ No newline at end of file From df1e96cddb0a04d161bd5b9dfe14138090074750 Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:26:06 -0500 Subject: [PATCH 23/43] added the new submodule to ignore for testing coverage --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 67a73e9dd..c65e14363 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,4 +16,5 @@ omit = [ "./setup.py", "./testing/conftest.py", "./tutorials/*", + "./GANDLF/optimizers/thirdparty/*", ] From 527a8f65f5b8602e7572c7f1012de77859299e0c Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:26:15 -0500 Subject: [PATCH 24/43] added lion as dependency --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 908459ccd..ecd209966 100644 --- a/setup.py +++ b/setup.py @@ -84,6 +84,7 @@ "huggingface-hub==0.25.1", "openslide-bin", "openslide-python==1.4.1", + "lion-pytorch==0.2.2", ] if __name__ == "__main__": From 76838ff6e7f544b945976876914d915b7eac3e1d Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:26:26 -0500 Subject: [PATCH 25/43] moved this to new submodule --- GANDLF/optimizers/ademamix.py | 204 ---------------------------------- 1 file changed, 204 deletions(-) delete mode 100644 GANDLF/optimizers/ademamix.py diff --git a/GANDLF/optimizers/ademamix.py b/GANDLF/optimizers/ademamix.py deleted file mode 100644 index 63f68d9f9..000000000 --- a/GANDLF/optimizers/ademamix.py +++ /dev/null @@ -1,204 +0,0 @@ -import math -from typing import Callable, Iterable, List, Optional, Tuple - -import torch -from torch import Tensor -from torch.optim import Optimizer - - -class AdEMAMix(Optimizer): - r"""Adapted from https://github.com/frgfm/Holocron/blob/main/holocron/optim/ademamix.py - - Implements the AdEMAMix optimizer from `"The AdEMAMix Optimizer: Better, Faster, Older" `_. - - The estimation of momentums is described as follows, :math:`\forall t \geq 1`: - - .. 
math:: - m_{1,t} \leftarrow \beta_1 m_{1, t-1} + (1 - \beta_1) g_t \\ - m_{2,t} \leftarrow \beta_3 m_{2, t-1} + (1 - \beta_3) g_t \\ - s_t \leftarrow \beta_2 s_{t-1} + (1 - \beta_2) (g_t - m_t)^2 + \epsilon - - where :math:`g_t` is the gradient of :math:`\theta_t`, - :math:`\beta_1, \beta_2, \beta_3 \in [0, 1]^3` are the exponential average smoothing coefficients, - :math:`m_{1,0} = 0,\ m_{2,0} = 0,\ s_0 = 0`, :math:`\epsilon > 0`. - - Then we correct their biases using: - - .. math:: - \hat{m_{1,t}} \leftarrow \frac{m_{1,t}}{1 - \beta_1^t} \\ - \hat{s_t} \leftarrow \frac{s_t}{1 - \beta_2^t} - - And finally the update step is performed using the following rule: - - .. math:: - \theta_t \leftarrow \theta_{t-1} - \eta \frac{\hat{m_{1,t}} + \alpha m_{2,t}}{\sqrt{\hat{s_t}} + \epsilon} - - where :math:`\theta_t` is the parameter value at step :math:`t` (:math:`\theta_0` being the initialization value), - :math:`\eta` is the learning rate, :math:`\alpha > 0` :math:`\epsilon > 0`. - - Args: - params (iterable): iterable of parameters to optimize or dicts defining parameter groups - lr (float, optional): learning rate - betas (Tuple[float, float, float], optional): coefficients used for running averages (default: (0.9, 0.999, 0.9999)) - alpha (float, optional): the exponential decay rate of the second moment estimates (default: 5.0) - eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - amsgrad (bool, optional): whether to use the AMSGrad variant (default: False) - """ - - def __init__( - self, - params: Iterable[torch.nn.Parameter], - lr: float = 1e-3, - betas: Tuple[float, float, float] = (0.9, 0.999, 0.9999), - alpha: float = 5.0, - eps: float = 1e-8, - weight_decay: float = 0.0, - ) -> None: - assert lr >= 0.0, f"Invalid learning rate: {lr}" - assert eps >= 0.0, f"Invalid epsilon value: {eps}" - assert all( - 0.0 <= beta < 1.0 for beta in betas - ), f"Invalid beta parameters: {betas}" - defaults = { - "lr": lr, - "betas": betas, - "alpha": alpha, - "eps": eps, - "weight_decay": weight_decay, - } - super().__init__(params, defaults) - - @torch.no_grad() - def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]: # type: ignore[override] - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
- """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - for group in self.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avgs_slow = [] - exp_avg_sqs = [] - state_steps = [] - - for p in group["params"]: - if p.grad is not None: - params_with_grad.append(p) - if p.grad.is_sparse: - raise RuntimeError( - f"{self.__class__.__name__} does not support sparse gradients" - ) - grads.append(p.grad) - - state = self.state[p] - # Lazy state initialization - if len(state) == 0: - state["step"] = 0 - # Exponential moving average of gradient values - state["exp_avg"] = torch.zeros_like( - p, memory_format=torch.preserve_format - ) - state["exp_avg_slow"] = torch.zeros_like( - p, memory_format=torch.preserve_format - ) - # Exponential moving average of squared gradient values - state["exp_avg_sq"] = torch.zeros_like( - p, memory_format=torch.preserve_format - ) - - exp_avgs.append(state["exp_avg"]) - exp_avgs_slow.append(state["exp_avg_slow"]) - exp_avg_sqs.append(state["exp_avg_sq"]) - - # update the steps for each param group update - state["step"] += 1 - # record the step after step update - state_steps.append(state["step"]) - - beta1, beta2, beta3 = group["betas"] - _update_ademamix( - params_with_grad, - grads, - exp_avgs, - exp_avgs_slow, - exp_avg_sqs, - state_steps, - beta1, - beta2, - beta3, - group["alpha"], - group["lr"], - group["weight_decay"], - group["eps"], - ) - return loss - - -def _update_ademamix( - params: List[Tensor], - grads: List[Tensor], - exp_avgs: List[Tensor], - exp_avgs_slow: List[Tensor], - exp_avg_sqs: List[Tensor], - state_steps: List[int], - beta1: float, - beta2: float, - beta3: float, - alpha: float, - lr: float, - weight_decay: float, - eps: float, -) -> None: - r"""Functional API that performs AdaBelief algorithm computation. - See :class:`~holocron.optim.AdaBelief` for details. - """ - for i, param in enumerate(params): - grad = grads[i] - m1 = exp_avgs[i] - m2 = exp_avgs_slow[i] - nu = exp_avg_sqs[i] - step = state_steps[i] - - bias_correction1 = 1 - beta1**step - bias_correction2 = 1 - beta2**step - - if weight_decay != 0: - grad = grad.add(param, alpha=weight_decay) - - # Decay the first and second moment running average coefficient - m1.mul_(beta1).add_(grad, alpha=1 - beta1) - nu.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) - m2.mul_(beta3).add_(grad, alpha=1 - beta3) - - denom = (nu.sqrt() / math.sqrt(bias_correction2)).add_(eps) - - param.addcdiv_(m1 / bias_correction1 + alpha * m2, denom, value=-lr) - - -def ademamix_wrapper(parameters: dict) -> torch.optim.Optimizer: - """ - Creates an AdEMAMix optimizer from the PyTorch `torch.optim` module using the input parameters. - - Args: - parameters (dict): A dictionary containing the input parameters for the optimizer. - - Returns: - torch.optim.Optimizer: An AdEMAMix optimizer. 
- """ - - return AdEMAMix( - params=parameters["model_parameters"], - lr=parameters.get("learning_rate", 1e-3), - betas=parameters.get("betas", (0.9, 0.999, 0.9999)), - alpha=parameters.get("alpha", 5.0), - eps=parameters.get("eps", 1e-8), - weight_decay=parameters.get("weight_decay", 0.0), - ) From 4a80fb6e02f7c682ee15ccfcaabc437d5665d632 Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:26:31 -0500 Subject: [PATCH 26/43] updated init --- GANDLF/optimizers/__init__.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/GANDLF/optimizers/__init__.py b/GANDLF/optimizers/__init__.py index b59afb22f..e15df328a 100644 --- a/GANDLF/optimizers/__init__.py +++ b/GANDLF/optimizers/__init__.py @@ -15,7 +15,7 @@ from .wrap_monai import novograd_wrapper -from .ademamix import ademamix_wrapper +from .thirdparty import ademamix_wrapper, lion_wrapper global_optimizer_dict = { "sgd": sgd, @@ -32,6 +32,7 @@ "novograd": novograd_wrapper, "nadam": nadam, "ademamix": ademamix_wrapper, + "lion": lion_wrapper, } @@ -49,9 +50,10 @@ def get_optimizer(params): # Retrieve the optimizer type from the input parameters optimizer_type = params["optimizer"]["type"] + assert ( + optimizer_type in global_optimizer_dict + ), f"Optimizer type {optimizer_type} not found" + # Create the optimizer instance using the specified type and input parameters - if optimizer_type in global_optimizer_dict: - optimizer_function = global_optimizer_dict[optimizer_type] - return optimizer_function(params) - else: - raise ValueError("Optimizer type %s not found" % optimizer_type) + optimizer_function = global_optimizer_dict[optimizer_type] + return optimizer_function(params) From 43036ab52f894773aded71488b03c786c6cdb3bd Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:27:42 -0500 Subject: [PATCH 27/43] added comments, typing help and assert to improve code coverage --- GANDLF/optimizers/wrap_monai.py | 16 +++++++++++++--- GANDLF/optimizers/wrap_torch.py | 32 ++++++++++++++++---------------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/GANDLF/optimizers/wrap_monai.py b/GANDLF/optimizers/wrap_monai.py index 221ba57bd..031d43279 100644 --- a/GANDLF/optimizers/wrap_monai.py +++ b/GANDLF/optimizers/wrap_monai.py @@ -1,11 +1,21 @@ -import monai +from torch.optim.optimizer import Optimizer + from monai.optimizers import Novograd -def novograd_wrapper(parameters: dict) -> monai.optimizers.Novograd: +def novograd_wrapper(parameters) -> Optimizer: + """ + Creates an instance of the Novograd optimizer from the `monai` package using the input parameters. + + Args: + parameters (dict): A dictionary containing the input parameters for the optimizer. + + Returns: + Optimizer: An instance of the Novograd optimizer. 
+ """ return Novograd( parameters["model_parameters"], - lr=parameters.get("learning_rate", 1e-3), + lr=parameters.get("learning_rate"), betas=parameters["optimizer"].get("betas", (0.9, 0.999)), eps=parameters["optimizer"].get("eps", 1e-8), weight_decay=parameters["optimizer"].get("weight_decay", 3e-05), diff --git a/GANDLF/optimizers/wrap_torch.py b/GANDLF/optimizers/wrap_torch.py index 2f4650bdb..16fad536c 100644 --- a/GANDLF/optimizers/wrap_torch.py +++ b/GANDLF/optimizers/wrap_torch.py @@ -1,4 +1,5 @@ -import torch +from torch.optim.optimizer import Optimizer + from torch.optim import ( SGD, ASGD, @@ -15,7 +16,7 @@ ) -def sgd(parameters: dict) -> torch.optim.SGD: +def sgd(parameters) -> Optimizer: """ Creates a Stochastic Gradient Descent optimizer from the PyTorch `torch.optim` module using the input parameters. @@ -37,7 +38,7 @@ def sgd(parameters: dict) -> torch.optim.SGD: ) -def asgd(parameters: dict) -> torch.optim.ASGD: +def asgd(parameters) -> Optimizer: """ Creates an Averaged Stochastic Gradient Descent optimizer from the PyTorch `torch.optim` module using the input parameters. @@ -59,7 +60,7 @@ def asgd(parameters: dict) -> torch.optim.ASGD: ) -def adam(parameters: dict, opt_type: str = "normal") -> torch.optim.Adam: +def adam(parameters, opt_type="normal") -> Optimizer: """ Creates an Adam or AdamW optimizer from the PyTorch `torch.optim` module using the input parameters. @@ -72,12 +73,11 @@ def adam(parameters: dict, opt_type: str = "normal") -> torch.optim.Adam: """ # Determine which optimizer to create based on opt_type + assert opt_type in ["normal", "AdamW"], f"Invalid optimizer type: {opt_type}" + optimizer_fn = AdamW + if opt_type == "normal": optimizer_fn = Adam - elif opt_type == "AdamW": - optimizer_fn = AdamW - else: - raise ValueError(f"Invalid optimizer type: {opt_type}") # Create the optimizer using the input parameters return optimizer_fn( @@ -90,7 +90,7 @@ def adam(parameters: dict, opt_type: str = "normal") -> torch.optim.Adam: ) -def adamw(parameters: dict) -> torch.optim.AdamW: +def adamw(parameters) -> Optimizer: """ Creates an AdamW optimizer from the PyTorch `torch.optim` module using the input parameters. @@ -104,7 +104,7 @@ def adamw(parameters: dict) -> torch.optim.AdamW: return adam(parameters, opt_type="AdamW") -def adamax(parameters: dict) -> torch.optim.Adamax: +def adamax(parameters) -> Optimizer: """ Creates an Adamax optimizer from the PyTorch `torch.optim` module using the input parameters. @@ -140,7 +140,7 @@ def adamax(parameters: dict) -> torch.optim.Adamax: # ) -def rprop(parameters: dict) -> torch.optim.Rprop: +def rprop(parameters) -> Optimizer: """ Creates a Resilient Backpropagation optimizer from the PyTorch `torch.optim` module using the input parameters. @@ -160,7 +160,7 @@ def rprop(parameters: dict) -> torch.optim.Rprop: ) -def adadelta(parameters: dict) -> torch.optim.Adadelta: +def adadelta(parameters) -> Optimizer: """ Creates an Adadelta optimizer from the PyTorch `torch.optim` module using the input parameters. @@ -181,7 +181,7 @@ def adadelta(parameters: dict) -> torch.optim.Adadelta: ) -def adagrad(parameters: dict) -> torch.optim.Adagrad: +def adagrad(parameters) -> Optimizer: """ Creates an Adagrad optimizer from the PyTorch `torch.optim` module using the input parameters. 
@@ -203,7 +203,7 @@ def adagrad(parameters: dict) -> torch.optim.Adagrad: ) -def rmsprop(parameters: dict) -> torch.optim.RMSprop: +def rmsprop(parameters) -> Optimizer: """ Creates an RMSprop optimizer from the PyTorch `torch.optim` module using the input parameters. @@ -226,7 +226,7 @@ def rmsprop(parameters: dict) -> torch.optim.RMSprop: ) -def radam(parameters: dict) -> torch.optim.RAdam: +def radam(parameters) -> Optimizer: """ Creates a RAdam optimizer from the PyTorch `torch.optim` module using the input parameters. @@ -247,7 +247,7 @@ def radam(parameters: dict) -> torch.optim.RAdam: ) -def nadam(parameters: dict) -> torch.optim.NAdam: +def nadam(parameters) -> Optimizer: """ Creates a NAdam optimizer from the PyTorch `torch.optim` module using the input parameters. From 64e15cfb3cc66fd5172821337baa21b15eede98a Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Wed, 20 Nov 2024 11:34:44 -0500 Subject: [PATCH 28/43] added the `adopt` optimizer in thirdpart --- GANDLF/optimizers/thirdparty/__init__.py | 2 + GANDLF/optimizers/thirdparty/adopt.py | 525 +++++++++++++++++++++++ 2 files changed, 527 insertions(+) create mode 100644 GANDLF/optimizers/thirdparty/adopt.py diff --git a/GANDLF/optimizers/thirdparty/__init__.py b/GANDLF/optimizers/thirdparty/__init__.py index 110b32e2f..7b47ed60c 100644 --- a/GANDLF/optimizers/thirdparty/__init__.py +++ b/GANDLF/optimizers/thirdparty/__init__.py @@ -1,3 +1,5 @@ from .ademamix import ademamix_wrapper from .lion import lion_wrapper + +from .adopt import adopt_wrapper diff --git a/GANDLF/optimizers/thirdparty/adopt.py b/GANDLF/optimizers/thirdparty/adopt.py new file mode 100644 index 000000000..21b022005 --- /dev/null +++ b/GANDLF/optimizers/thirdparty/adopt.py @@ -0,0 +1,525 @@ +# mypy: allow-untyped-decorators +# mypy: allow-untyped-defs +from typing import cast, List, Optional, Tuple, Union + +import torch +from torch import Tensor + +from torch.optim.optimizer import ( + _capturable_doc, + _default_to_fused_or_foreach, + _device_dtype_check_for_fused, + _differentiable_doc, + _disable_dynamo_if_unsupported, + _foreach_doc, + _fused_doc, + _get_capturable_supported_devices, + _get_scalar_dtype, + _get_value, + _maximize_doc, + _stack_if_compiling, + _use_grad_for_differentiable, + _view_as_real, + DeviceDict, + Optimizer, + ParamsT, +) + + +__all__ = ["ADOPT", "adopt"] + + +class ADOPT(Optimizer): + ### "adapted" from https://github.com/iShohei220/adopt/blob/main/adopt.py + def __init__( + self, + params: ParamsT, + lr: Union[float, Tensor] = 1e-3, + betas: Tuple[float, float] = (0.9, 0.9999), + eps: float = 1e-6, + weight_decay: float = 0.0, + decoupled: bool = False, + *, + foreach: Optional[bool] = None, + maximize: bool = False, + capturable: bool = False, + differentiable: bool = False, + fused: Optional[bool] = None, + ): + if isinstance(lr, Tensor): + if foreach and not capturable: + raise ValueError( + "lr as a Tensor is not supported for capturable=False and foreach=True" + ) + if lr.numel() != 1: + raise ValueError("Tensor lr must be 1-element") + if not 0.0 <= lr: + raise ValueError(f"Invalid learning rate: {lr}") + if not 0.0 <= eps: + raise ValueError(f"Invalid epsilon value: {eps}") + if not 0.0 <= betas[0] < 1.0: + raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}") + if not 0.0 <= betas[1] < 1.0: + raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}") + if not 0.0 <= weight_decay: + raise ValueError(f"Invalid weight_decay value: {weight_decay}") + + defaults = 
dict( + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + decoupled=decoupled, + maximize=maximize, + foreach=foreach, + capturable=capturable, + differentiable=differentiable, + fused=fused, + ) + super().__init__(params, defaults) + + if fused: + # TODO: support fused + raise RuntimeError("`fused` is not currently supported") + + if differentiable: + raise RuntimeError("`fused` does not support `differentiable`") + self._step_supports_amp_scaling = True + # TODO(crcrpar): [low prec params & their higher prec copy] + # Support AMP with FP16/BF16 model params which would need + # higher prec copy of params to do update math in higher prec to + # alleviate the loss of information. + if foreach: + raise RuntimeError("`fused` and `foreach` cannot be `True` together.") + + def __setstate__(self, state): + super().__setstate__(state) + for group in self.param_groups: + group.setdefault("maximize", False) + group.setdefault("foreach", None) + group.setdefault("capturable", False) + group.setdefault("differentiable", False) + fused = group.setdefault("fused", None) + for p in group["params"]: + p_state = self.state.get(p, []) + if len(p_state) != 0 and not torch.is_tensor(p_state["step"]): + step_val = float(p_state["step"]) + p_state["step"] = ( + torch.tensor( + step_val, + dtype=_get_scalar_dtype(is_fused=fused), + device=p.device, + ) + if group["capturable"] or group["fused"] + else torch.tensor(step_val, dtype=_get_scalar_dtype()) + ) + + def _init_group( + self, group, params_with_grad, grads, exp_avgs, exp_avg_sqs, state_steps + ): + has_complex = False + for p in group["params"]: + if p.grad is not None: + has_complex |= torch.is_complex(p) + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError("ADOPT does not support sparse gradients") + grads.append(p.grad) + + state = self.state[p] + # Lazy state initialization + if len(state) == 0: + if group["fused"]: + _device_dtype_check_for_fused(p) + # note(crcrpar): [special device hosting for step] + # Deliberately host `step` on CPU if both capturable and fused are off. + # This is because kernel launches are costly on CUDA and XLA. + state["step"] = ( + torch.zeros( + (), + dtype=_get_scalar_dtype(is_fused=group["fused"]), + device=p.device, + ) + if group["capturable"] or group["fused"] + else torch.tensor(0.0, dtype=_get_scalar_dtype()) + ) + # Exponential moving average of gradient values + state["exp_avg"] = torch.zeros_like( + p, memory_format=torch.preserve_format + ) + # Exponential moving average of squared gradient values + state["exp_avg_sq"] = torch.zeros_like( + p, memory_format=torch.preserve_format + ) + + exp_avgs.append(state["exp_avg"]) + exp_avg_sqs.append(state["exp_avg_sq"]) + + if group["differentiable"] and state["step"].requires_grad: + raise RuntimeError( + "`requires_grad` is not supported for `step` in differentiable mode" + ) + + # Foreach without capturable does not support a tensor lr + if ( + group["foreach"] + and torch.is_tensor(group["lr"]) + and not group["capturable"] + ): + raise RuntimeError( + "lr as a Tensor is not supported for capturable=False and foreach=True" + ) + + state_steps.append(state["step"]) + return has_complex + + @_use_grad_for_differentiable + def step(self, closure=None): + """Perform a single optimization step. + + Args: + closure (Callable, optional): A closure that reevaluates the model + and returns the loss. 
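+
+        Example (illustrative only; ``model``, ``loss_fn``, ``inputs`` and
+        ``targets`` are assumed to exist):
+
+            >>> optimizer = ADOPT(model.parameters(), lr=1e-3)
+            >>> optimizer.zero_grad()
+            >>> loss_fn(model(inputs), targets).backward()
+            >>> optimizer.step()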
+ """ + self._cuda_graph_capture_health_check() + + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad: List[Tensor] = [] + grads: List[Tensor] = [] + exp_avgs: List[Tensor] = [] + exp_avg_sqs: List[Tensor] = [] + state_steps: List[Tensor] = [] + beta1, beta2 = group["betas"] + + has_complex = self._init_group( + group, params_with_grad, grads, exp_avgs, exp_avg_sqs, state_steps + ) + + adopt( + params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + state_steps, + has_complex=has_complex, + beta1=beta1, + beta2=beta2, + lr=group["lr"], + weight_decay=group["weight_decay"], + decoupled=group["decoupled"], + eps=group["eps"], + maximize=group["maximize"], + foreach=group["foreach"], + capturable=group["capturable"], + differentiable=group["differentiable"], + fused=group["fused"], + grad_scale=getattr(self, "grad_scale", None), + found_inf=getattr(self, "found_inf", None), + ) + + return loss + + +def _single_tensor_adopt( + params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + exp_avg_sqs: List[Tensor], + state_steps: List[Tensor], + grad_scale: Optional[Tensor], + found_inf: Optional[Tensor], + *, + has_complex: bool, + beta1: float, + beta2: float, + lr: Union[float, Tensor], + weight_decay: float, + decoupled: bool, + eps: float, + maximize: bool, + capturable: bool, + differentiable: bool, +): + assert grad_scale is None and found_inf is None + + if torch.jit.is_scripting(): + # this assert is due to JIT being dumb and not realizing that the ops below + # have overloads to handle both float and Tensor lrs, so we just assert it's + # a float since most people using JIT are using floats + assert isinstance(lr, float) + + for i, param in enumerate(params): + grad = grads[i] if not maximize else -grads[i] + exp_avg = exp_avgs[i] + exp_avg_sq = exp_avg_sqs[i] + step_t = state_steps[i] + + # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] + if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices() + assert ( + param.device.type == step_t.device.type + and param.device.type in capturable_supported_devices + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." 
+ + # update step + step_t += 1 + + if weight_decay != 0: + if decoupled: + param.add_(param, alpha=-lr * weight_decay) + else: + grad = grad.add(param, alpha=weight_decay) + + if torch.is_complex(param): + grad = torch.view_as_real(grad) + if exp_avg is not None: + exp_avg = torch.view_as_real(exp_avg) + if exp_avg_sq is not None: + exp_avg_sq = torch.view_as_real(exp_avg_sq) + param = torch.view_as_real(param) + + step = step_t if capturable or differentiable else _get_value(step_t) + if step == 1: + exp_avg_sq.addcmul_(grad, grad.conj()) + continue + + denom = torch.clamp(exp_avg_sq.sqrt(), eps) + if step == 2: + exp_avg.addcdiv_(grad, denom) + else: + exp_avg.mul_(beta1).addcdiv_(grad, denom, value=1 - beta1) + + param.add_(exp_avg, alpha=-lr) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2) + + +def _multi_tensor_adopt( + params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + exp_avg_sqs: List[Tensor], + state_steps: List[Tensor], + grad_scale: Optional[Tensor], + found_inf: Optional[Tensor], + *, + has_complex: bool, + beta1: float, + beta2: float, + lr: Union[float, Tensor], + weight_decay: float, + decoupled: bool, + eps: float, + maximize: bool, + capturable: bool, + differentiable: bool, +): + if len(params) == 0: + return + + if isinstance(lr, Tensor) and not capturable: + raise RuntimeError( + "lr as a Tensor is not supported for capturable=False and foreach=True" + ) + + # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable] + if not torch._utils.is_compiling() and capturable: + capturable_supported_devices = _get_capturable_supported_devices( + supports_xla=False + ) + assert all( + p.device.type == step.device.type + and p.device.type in capturable_supported_devices + for p, step in zip(params, state_steps) + ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}." + + assert grad_scale is None and found_inf is None + + assert not differentiable, "_foreach ops don't support autograd" + + grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( + [params, grads, exp_avgs, exp_avg_sqs, state_steps] # type: ignore[list-item] + ) + for ( + device_params_, + device_grads_, + device_exp_avgs_, + device_exp_avg_sqs_, + device_state_steps_, + ), _ in grouped_tensors.values(): + device_params = cast(List[Tensor], device_params_) + device_grads = cast(List[Tensor], device_grads_) + device_exp_avgs = cast(List[Tensor], device_exp_avgs_) + device_exp_avg_sqs = cast(List[Tensor], device_exp_avg_sqs_) + device_state_steps = cast(List[Tensor], device_state_steps_) + + # Handle complex parameters + if has_complex: + _view_as_real( + device_params, device_grads, device_exp_avgs, device_exp_avg_sqs + ) + + if maximize: + device_grads = torch._foreach_neg(device_grads) # type: ignore[assignment] + + # Update steps + # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over + # and over. 1 will then be wrapped into a Tensor over and over again, which is slower than if we just + # wrapped it once now. The alpha is required to assure we go to the right overload. 
+ if not torch._utils.is_compiling() and device_state_steps[0].is_cpu: + torch._foreach_add_( + device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0 + ) + else: + torch._foreach_add_(device_state_steps, 1) + + if weight_decay != 0: + if decoupled: + torch._foreach_add_( + device_params, device_params, alpha=-lr * weight_decay + ) + else: + # Re-use the intermediate memory (device_grads) already allocated for maximize + if maximize: + torch._foreach_add_(device_grads, device_params, alpha=weight_decay) + else: + device_grads = torch._foreach_add( # type: ignore[assignment] + device_grads, device_params, alpha=weight_decay + ) + + if device_state_steps[0] == 1: + torch._foreach_addcmul_(device_exp_avg_sqs, device_grads, device_grads) + continue + + exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs) + exp_avg_sq_sqrt = torch._foreach_maximum(exp_avg_sq_sqrt, eps) + + if device_state_steps[0] == 2: + torch._foreach_addcdiv_(device_exp_avgs, device_grads, exp_avg_sq_sqrt) + else: + torch._foreach_mul_(device_exp_avgs, beta1) + torch._foreach_addcdiv_( + device_exp_avgs, device_grads, exp_avg_sq_sqrt, value=1 - beta1 + ) + + torch._foreach_add_(device_params, device_exp_avgs, alpha=-lr) + torch._foreach_mul_(device_exp_avg_sqs, beta2) + torch._foreach_addcmul_( + device_exp_avg_sqs, device_grads, device_grads, value=1 - beta2 + ) + + +@_disable_dynamo_if_unsupported(single_tensor_fn=_single_tensor_adopt) +def adopt( + params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + exp_avg_sqs: List[Tensor], + state_steps: List[Tensor], + # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627 + # setting this as kwarg for now as functional API is compiled by torch/distributed/optim + foreach: Optional[bool] = None, + capturable: bool = False, + differentiable: bool = False, + fused: Optional[bool] = None, + grad_scale: Optional[Tensor] = None, + found_inf: Optional[Tensor] = None, + has_complex: bool = False, + *, + beta1: float, + beta2: float, + lr: Union[float, Tensor], + weight_decay: float, + decoupled: bool, + eps: float, + maximize: bool, +): + r"""Functional API that performs ADOPT algorithm computation.""" + # Respect when the user inputs False/True for foreach or fused. We only want to change + # the default when neither have been user-specified. Note that we default to foreach + # and pass False to use_fused. This is not a mistake--we want to give the fused impl + # bake-in time before making it the default, even if it is typically faster. + if fused is None and foreach is None: + _, foreach = _default_to_fused_or_foreach( + params, differentiable, use_fused=False + ) + # Do not flip on foreach for the unsupported case where lr is a Tensor and capturable=False. 
+    if foreach and isinstance(lr, Tensor) and not capturable:
+        foreach = False
+    if fused is None:
+        fused = False
+    if foreach is None:
+        foreach = False
+
+    # this check is slow during compilation, so we skip it
+    # if it's strictly needed we can add this check back in dynamo
+    if not torch._utils.is_compiling() and not all(
+        isinstance(t, torch.Tensor) for t in state_steps
+    ):
+        raise RuntimeError(
+            "API has changed, `state_steps` argument must contain a list of singleton tensors"
+        )
+
+    if foreach and torch.jit.is_scripting():
+        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
+    if fused and torch.jit.is_scripting():
+        raise RuntimeError("torch.jit.script not supported with fused optimizers")
+
+    if fused and not torch.jit.is_scripting():
+        func = _fused_adopt
+    elif foreach and not torch.jit.is_scripting():
+        func = _multi_tensor_adopt
+    else:
+        func = _single_tensor_adopt
+
+    func(
+        params,
+        grads,
+        exp_avgs,
+        exp_avg_sqs,
+        state_steps,
+        has_complex=has_complex,
+        beta1=beta1,
+        beta2=beta2,
+        lr=lr,
+        weight_decay=weight_decay,
+        decoupled=decoupled,
+        eps=eps,
+        maximize=maximize,
+        capturable=capturable,
+        differentiable=differentiable,
+        grad_scale=grad_scale,
+        found_inf=found_inf,
+    )
+
+
+def adopt_wrapper(parameters: dict) -> torch.optim.Optimizer:
+    """
+    Creates an ADOPT optimizer instance using the input parameters.
+
+    Args:
+        parameters (dict): A dictionary containing the input parameters for the optimizer.
+
+    Returns:
+        torch.optim.Optimizer: An ADOPT optimizer.
+    """
+
+    return ADOPT(
+        params=parameters["model_parameters"],
+        lr=parameters.get("learning_rate", 1e-3),
+        # ADOPT takes two betas and, unlike AdEMAMix, has no `alpha` parameter
+        betas=parameters.get("betas", (0.9, 0.9999)),
+        eps=parameters.get("eps", 1e-6),
+        weight_decay=parameters.get("weight_decay", 0.0),
+        decoupled=parameters["optimizer"].get("decoupled", False),
+        foreach=parameters.get("foreach", None),
+        maximize=parameters.get("maximize", False),
+        capturable=parameters.get("capturable", False),
+        differentiable=parameters.get("differentiable", False),
+        fused=parameters.get("fused", None),
+    )
From 48f9e0b3822a8fcc8609f1875f1b3bc56283a9f6 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Wed, 20 Nov 2024 11:35:05 -0500
Subject: [PATCH 29/43] added the call and updated docs

---
 GANDLF/optimizers/README.md   | 6 ++++--
 GANDLF/optimizers/__init__.py | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/GANDLF/optimizers/README.md b/GANDLF/optimizers/README.md
index b12a61f12..7db5f7c7b 100644
--- a/GANDLF/optimizers/README.md
+++ b/GANDLF/optimizers/README.md
@@ -3,10 +3,12 @@
 ## Adding a new algorithm
 
 - For an optimizer defined in PyTorch [[ref](https://pytorch.org/docs/stable/optim.html#algorithms)], update the `GANDLF.optimizers.wrap_torch.py` submodule.
-- For a custom optimizer, create a new submodule called `GANDLF.optimizers.${awesome_optimizer}.py`. Ensure that it inherits from PyTorch's base optimizer class [[ref](https://pytorch.org/docs/stable/optim.html#base-class)]
+- For a custom optimizer, create a new submodule called `GANDLF.optimizers.${awesome_optimizer}.py`.
+- For a third-party optimizer (where the code is available from an external repository), add the relevant code under the `GANDLF.optimizers.thirdparty` submodule.
 - If a new dependency needs to be used, update GaNDLF's [`setup.py`](https://github.com/mlcommons/GaNDLF/blob/master/setup.py) with the new requirement.
 - Define a new submodule under `GANDLF.optimizers` as `GANDLF.optimizers.wrap_${package_name}.py`.
 - Ensure that the new algorithm is wrapped in a function which returns an object with the PyTorch optimizer type. Use any of the optimizers in `GANDLF.optimizers.wrap_torch.py` as an example.
 - Add the algorithm's identifier to `GANDLF.optimizers.__init__.global_optimizer_dict` with an appropriate key.
 - Call the new algorithm from the config using the `optimizer` key.
-- [Update the tests!](https://mlcommons.github.io/GaNDLF/extending/#update-tests)https://mlcommons.github.io/GaNDLF/extending/#update-tests
+- [If appropriate, please update the tests!](https://mlcommons.github.io/GaNDLF/extending/#update-tests)
+- All wrappers should return an object of type `torch.optim.Optimizer`.
\ No newline at end of file
diff --git a/GANDLF/optimizers/__init__.py b/GANDLF/optimizers/__init__.py
index e15df328a..4df3d0ec6 100644
--- a/GANDLF/optimizers/__init__.py
+++ b/GANDLF/optimizers/__init__.py
@@ -15,7 +15,7 @@
 from .wrap_monai import novograd_wrapper
 
-from .thirdparty import ademamix_wrapper, lion_wrapper
+from .thirdparty import ademamix_wrapper, lion_wrapper, adopt_wrapper
 
 global_optimizer_dict = {
     "sgd": sgd,
@@ -33,6 +33,7 @@
     "nadam": nadam,
     "ademamix": ademamix_wrapper,
     "lion": lion_wrapper,
+    "adopt": adopt_wrapper,
 }
From bf06c87a4020cc64787387b459f340acc5d955a4 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Wed, 20 Nov 2024 17:21:59 -0500
Subject: [PATCH 30/43] lint should be fixed

---
 GANDLF/optimizers/thirdparty/lion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GANDLF/optimizers/thirdparty/lion.py b/GANDLF/optimizers/thirdparty/lion.py
index a1b871d2e..75a8534a2 100644
--- a/GANDLF/optimizers/thirdparty/lion.py
+++ b/GANDLF/optimizers/thirdparty/lion.py
@@ -19,4 +19,4 @@ def lion_wrapper(parameters: dict) -> Optimizer:
         weight_decay=parameters["optimizer"].get("weight_decay", 0.0),
         decoupled_weight_decay=parameters["optimizer"].get("decoupled_weight_decay", False),
         use_triton=False,  # as of 20241120, triton is not generally available for all platforms
-    )
\ No newline at end of file
+    )
From 2aeed0be619c936f74c751b052a1eb430ee97a72 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Wed, 20 Nov 2024 17:23:25 -0500
Subject: [PATCH 31/43] fixed lint

---
 GANDLF/optimizers/thirdparty/lion.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/GANDLF/optimizers/thirdparty/lion.py b/GANDLF/optimizers/thirdparty/lion.py
index 75a8534a2..0a6116e21 100644
--- a/GANDLF/optimizers/thirdparty/lion.py
+++ b/GANDLF/optimizers/thirdparty/lion.py
@@ -17,6 +17,8 @@ def lion_wrapper(parameters: dict) -> Optimizer:
         lr=parameters.get("learning_rate", 1e-4),
         betas=parameters["optimizer"].get("betas", (0.9, 0.999)),
         weight_decay=parameters["optimizer"].get("weight_decay", 0.0),
-        decoupled_weight_decay=parameters["optimizer"].get("decoupled_weight_decay", False),
+        decoupled_weight_decay=parameters["optimizer"].get(
+            "decoupled_weight_decay", False
+        ),
         use_triton=False,  # as of 20241120, triton is not generally available for all platforms
     )
From 820df1c94434afed2c86c3700792ab1fd3d403b9 Mon Sep 17 00:00:00 2001
From: scap3yvt <149599669+scap3yvt@users.noreply.github.com>
Date: Wed, 20 Nov 2024 17:33:27 -0500
Subject: [PATCH 32/43] updated pytorch from `2.3.1` to `2.4.1`

---
 .devcontainer/onCreateCommand.sh | 2 +-
.devcontainer/postCreateCommand.sh | 2 +- .github/workflows/dependencies/action.yml | 2 +- Dockerfile-CPU | 2 +- Dockerfile-CUDA11.8 | 2 +- Dockerfile-CUDA12.1 | 2 +- Dockerfile-ROCm | 4 ++-- docs/setup.md | 2 +- setup.py | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.devcontainer/onCreateCommand.sh b/.devcontainer/onCreateCommand.sh index b2fb94354..0a789a6a6 100755 --- a/.devcontainer/onCreateCommand.sh +++ b/.devcontainer/onCreateCommand.sh @@ -6,4 +6,4 @@ pip install wheel pip install openvino-dev==2023.0.1 # [OPTIONAL] to generate optimized models for inference pip install mlcube_docker # [OPTIONAL] to deploy GaNDLF models as MLCube-compliant Docker containers pip install medmnist==2.1.0 -pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cpu +pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cpu diff --git a/.devcontainer/postCreateCommand.sh b/.devcontainer/postCreateCommand.sh index 8428eb5d7..4943931e1 100755 --- a/.devcontainer/postCreateCommand.sh +++ b/.devcontainer/postCreateCommand.sh @@ -6,7 +6,7 @@ # if runnning on a GPU machine, install the GPU version of pytorch if command -v nvidia-smi &> /dev/null then - pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121 + pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121 fi pip install -e . diff --git a/.github/workflows/dependencies/action.yml b/.github/workflows/dependencies/action.yml index 32f862b54..2969903a2 100644 --- a/.github/workflows/dependencies/action.yml +++ b/.github/workflows/dependencies/action.yml @@ -100,5 +100,5 @@ runs: python -m pip install --upgrade pip==24.0 python -m pip install wheel python -m pip install openvino-dev==2023.0.1 mlcube_docker - pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cpu + pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cpu pip install -e . diff --git a/Dockerfile-CPU b/Dockerfile-CPU index 96a27a67c..907e1dc87 100644 --- a/Dockerfile-CPU +++ b/Dockerfile-CPU @@ -9,7 +9,7 @@ RUN add-apt-repository ppa:deadsnakes/ppa RUN apt-get update && apt-get install -y python3.9 python3-pip libjpeg8-dev zlib1g-dev python3-dev libpython3.9-dev libffi-dev libgl1 RUN python3.9 -m pip install --upgrade pip==24.0 # EXPLICITLY install cpu versions of torch/torchvision (not all versions have +cpu modes on PyPI...) 
-RUN python3.9 -m pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cpu +RUN python3.9 -m pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cpu RUN python3.9 -m pip install openvino-dev==2023.0.1 opencv-python-headless mlcube_docker # Do some dependency installation separately here to make layer caching more efficient diff --git a/Dockerfile-CUDA11.8 b/Dockerfile-CUDA11.8 index 6b06fcda5..008ce3d3b 100644 --- a/Dockerfile-CUDA11.8 +++ b/Dockerfile-CUDA11.8 @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y software-properties-common RUN add-apt-repository ppa:deadsnakes/ppa RUN apt-get update && apt-get install -y python3.9 python3-pip libjpeg8-dev zlib1g-dev python3-dev libpython3.9-dev libffi-dev libgl1 RUN python3.9 -m pip install --upgrade pip==24.0 -RUN python3.9 -m pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118 +RUN python3.9 -m pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu118 RUN python3.9 -m pip install openvino-dev==2023.0.1 opencv-python-headless mlcube_docker # Do some dependency installation separately here to make layer caching more efficient diff --git a/Dockerfile-CUDA12.1 b/Dockerfile-CUDA12.1 index 4da63a335..838d47e4b 100644 --- a/Dockerfile-CUDA12.1 +++ b/Dockerfile-CUDA12.1 @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y software-properties-common RUN add-apt-repository ppa:deadsnakes/ppa RUN apt-get update && apt-get install -y python3.9 python3-pip libjpeg8-dev zlib1g-dev python3-dev libpython3.9-dev libffi-dev libgl1 RUN python3.9 -m pip install --upgrade pip==24.0 -RUN python3.9 -m pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121 +RUN python3.9 -m pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121 RUN python3.9 -m pip install openvino-dev==2023.0.1 opencv-python-headless mlcube_docker # Do some dependency installation separately here to make layer caching more efficient diff --git a/Dockerfile-ROCm b/Dockerfile-ROCm index 508af28ce..95b1fa5f4 100644 --- a/Dockerfile-ROCm +++ b/Dockerfile-ROCm @@ -1,4 +1,4 @@ -FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch +FROM rocm/pytorch:rocm6.1_ubuntu20.04_py3.9_pytorch LABEL github="https://github.com/mlcommons/GaNDLF" LABEL docs="https://mlcommons.github.io/GaNDLF/" LABEL version=1.0 @@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y software-properties-common RUN add-apt-repository ppa:deadsnakes/ppa RUN apt-get update && apt-get install -y python3.9 python3-pip libjpeg8-dev zlib1g-dev python3-dev libpython3.9-dev libffi-dev libgl1 RUN python3.9 -m pip install --upgrade pip==24.0 -RUN python3.9 -m pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/rocm6.0 +RUN python3.9 -m pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/rocm6.0 RUN python3.9 -m pip install --upgrade pip && python3.9 -m pip install openvino-dev==2023.0.1 opencv-python-headless mlcube_docker RUN apt-get update && apt-get install -y libgl1 diff --git a/docs/setup.md b/docs/setup.md index 9f9cb5397..be77f71b9 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -36,7 +36,7 @@ You may install pytorch to be compatible with CUDA, ROCm, or CPU-only. 
An exhaus Use one of the following depending on your needs: - CUDA 12.1 ```bash -(venv_gandlf) $> pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121 +(venv_gandlf) $> pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121 ``` ### Optional Dependencies diff --git a/setup.py b/setup.py index 908459ccd..3f1cf5227 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ # specifying version for `black` separately because it is also used to [check for lint](https://github.com/mlcommons/GaNDLF/blob/master/.github/workflows/black.yml) black_version = "23.11.0" requirements = [ - "torch==2.3.1", + "torch==2.4.1", f"black=={black_version}", "numpy==1.25.0", "scipy", From 042e26222306c95755ec79168277682ca0afdcee Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Wed, 20 Nov 2024 17:43:06 -0500 Subject: [PATCH 33/43] succinct import --- GANDLF/optimizers/wrap_monai.py | 2 +- GANDLF/optimizers/wrap_torch.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/GANDLF/optimizers/wrap_monai.py b/GANDLF/optimizers/wrap_monai.py index 031d43279..cdc53515e 100644 --- a/GANDLF/optimizers/wrap_monai.py +++ b/GANDLF/optimizers/wrap_monai.py @@ -1,4 +1,4 @@ -from torch.optim.optimizer import Optimizer +from torch.optim import Optimizer from monai.optimizers import Novograd diff --git a/GANDLF/optimizers/wrap_torch.py b/GANDLF/optimizers/wrap_torch.py index 16fad536c..d6deff477 100644 --- a/GANDLF/optimizers/wrap_torch.py +++ b/GANDLF/optimizers/wrap_torch.py @@ -1,6 +1,5 @@ -from torch.optim.optimizer import Optimizer - from torch.optim import ( + Optimizer, SGD, ASGD, Rprop, From 820df1c94434afed2c86c3700792ab1fd3d403b9 Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Wed, 20 Nov 2024 19:23:22 -0500 Subject: [PATCH 34/43] Update action.yml --- .github/workflows/dependencies/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dependencies/action.yml b/.github/workflows/dependencies/action.yml index 2969903a2..dce85bc5b 100644 --- a/.github/workflows/dependencies/action.yml +++ b/.github/workflows/dependencies/action.yml @@ -100,5 +100,5 @@ runs: python -m pip install --upgrade pip==24.0 python -m pip install wheel python -m pip install openvino-dev==2023.0.1 mlcube_docker - pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cpu + pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cpu pip install -e . 
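The README changes in PATCH 29 reduce "add a third-party optimizer" to a small recipe: a wrapper from GaNDLF's parameter dictionary to a `torch.optim.Optimizer`, plus one registry entry. A minimal sketch of that pattern follows; the `my_optimizer` key and `my_optimizer_wrapper` name are hypothetical placeholders, and `SGD` merely stands in for a real third-party optimizer class:

```python
# Hypothetical wrapper following the pattern of lion_wrapper/adopt_wrapper above;
# SGD stands in for the external optimizer class being wrapped.
from torch.optim import SGD, Optimizer


def my_optimizer_wrapper(parameters: dict) -> Optimizer:
    # GaNDLF hands over its full parameter dictionary; optimizer-specific
    # options live under parameters["optimizer"], as in lion_wrapper
    return SGD(
        params=parameters["model_parameters"],
        lr=parameters.get("learning_rate", 1e-3),
        weight_decay=parameters["optimizer"].get("weight_decay", 0.0),
    )
```

Registration then mirrors the `adopt` entry added to `GANDLF/optimizers/__init__.py` in PATCH 29 (`global_optimizer_dict["my_optimizer"] = my_optimizer_wrapper`), after which the config's `optimizer` key can select it.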
From c699a0a40b63401b7ea83c9318b450dcc8bd41c6 Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Wed, 20 Nov 2024 19:23:30 -0500 Subject: [PATCH 35/43] Update action.yml --- .github/workflows/dependencies/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dependencies/action.yml b/.github/workflows/dependencies/action.yml index dce85bc5b..2969903a2 100644 --- a/.github/workflows/dependencies/action.yml +++ b/.github/workflows/dependencies/action.yml @@ -100,5 +100,5 @@ runs: python -m pip install --upgrade pip==24.0 python -m pip install wheel python -m pip install openvino-dev==2023.0.1 mlcube_docker - pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cpu + pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cpu pip install -e . From 6a5e1517f9fc45d8b5beeb93cd341ac029f94419 Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Wed, 20 Nov 2024 21:32:40 -0500 Subject: [PATCH 36/43] update pytorch from `2.4.1` to `2.5.0`, which is the previous stable release --- .devcontainer/onCreateCommand.sh | 2 +- .devcontainer/postCreateCommand.sh | 2 +- .github/workflows/dependencies/action.yml | 2 +- Dockerfile-CPU | 2 +- Dockerfile-CUDA11.8 | 2 +- Dockerfile-CUDA12.1 | 2 +- Dockerfile-ROCm | 2 +- docs/setup.md | 2 +- setup.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.devcontainer/onCreateCommand.sh b/.devcontainer/onCreateCommand.sh index 0a789a6a6..d2f104a51 100755 --- a/.devcontainer/onCreateCommand.sh +++ b/.devcontainer/onCreateCommand.sh @@ -6,4 +6,4 @@ pip install wheel pip install openvino-dev==2023.0.1 # [OPTIONAL] to generate optimized models for inference pip install mlcube_docker # [OPTIONAL] to deploy GaNDLF models as MLCube-compliant Docker containers pip install medmnist==2.1.0 -pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cpu +pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cpu diff --git a/.devcontainer/postCreateCommand.sh b/.devcontainer/postCreateCommand.sh index 4943931e1..163341712 100755 --- a/.devcontainer/postCreateCommand.sh +++ b/.devcontainer/postCreateCommand.sh @@ -6,7 +6,7 @@ # if runnning on a GPU machine, install the GPU version of pytorch if command -v nvidia-smi &> /dev/null then - pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121 + pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cu121 fi pip install -e . diff --git a/.github/workflows/dependencies/action.yml b/.github/workflows/dependencies/action.yml index 2969903a2..9aa2d2dd2 100644 --- a/.github/workflows/dependencies/action.yml +++ b/.github/workflows/dependencies/action.yml @@ -100,5 +100,5 @@ runs: python -m pip install --upgrade pip==24.0 python -m pip install wheel python -m pip install openvino-dev==2023.0.1 mlcube_docker - pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cpu + pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cpu pip install -e . 
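Each of these bumps moves torch, torchvision, and torchaudio as a matched triplet across nine files, so a stray stale pin is easy to miss. A minimal post-install sanity check, assuming the 2.5.0/0.20.0/2.5.0 triplet this patch pins:

```python
# Sketch of a post-install check that the pinned triplet actually landed;
# the expected versions mirror this patch.
import torch
import torchaudio
import torchvision

expected = {"torch": "2.5.0", "torchvision": "0.20.0", "torchaudio": "2.5.0"}
installed = {
    "torch": torch.__version__,
    "torchvision": torchvision.__version__,
    "torchaudio": torchaudio.__version__,
}
for name, version in installed.items():
    # wheels from the cpu/cu118/cu121/rocm indexes append a "+<tag>" local suffix
    assert version.split("+")[0] == expected[name], f"{name} is {version}"
```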
diff --git a/Dockerfile-CPU b/Dockerfile-CPU index 907e1dc87..be93294c1 100644 --- a/Dockerfile-CPU +++ b/Dockerfile-CPU @@ -9,7 +9,7 @@ RUN add-apt-repository ppa:deadsnakes/ppa RUN apt-get update && apt-get install -y python3.9 python3-pip libjpeg8-dev zlib1g-dev python3-dev libpython3.9-dev libffi-dev libgl1 RUN python3.9 -m pip install --upgrade pip==24.0 # EXPLICITLY install cpu versions of torch/torchvision (not all versions have +cpu modes on PyPI...) -RUN python3.9 -m pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cpu +RUN python3.9 -m pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cpu RUN python3.9 -m pip install openvino-dev==2023.0.1 opencv-python-headless mlcube_docker # Do some dependency installation separately here to make layer caching more efficient diff --git a/Dockerfile-CUDA11.8 b/Dockerfile-CUDA11.8 index 008ce3d3b..84ecd3ab9 100644 --- a/Dockerfile-CUDA11.8 +++ b/Dockerfile-CUDA11.8 @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y software-properties-common RUN add-apt-repository ppa:deadsnakes/ppa RUN apt-get update && apt-get install -y python3.9 python3-pip libjpeg8-dev zlib1g-dev python3-dev libpython3.9-dev libffi-dev libgl1 RUN python3.9 -m pip install --upgrade pip==24.0 -RUN python3.9 -m pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu118 +RUN python3.9 -m pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cu118 RUN python3.9 -m pip install openvino-dev==2023.0.1 opencv-python-headless mlcube_docker # Do some dependency installation separately here to make layer caching more efficient diff --git a/Dockerfile-CUDA12.1 b/Dockerfile-CUDA12.1 index 838d47e4b..1807b5562 100644 --- a/Dockerfile-CUDA12.1 +++ b/Dockerfile-CUDA12.1 @@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y software-properties-common RUN add-apt-repository ppa:deadsnakes/ppa RUN apt-get update && apt-get install -y python3.9 python3-pip libjpeg8-dev zlib1g-dev python3-dev libpython3.9-dev libffi-dev libgl1 RUN python3.9 -m pip install --upgrade pip==24.0 -RUN python3.9 -m pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121 +RUN python3.9 -m pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cu121 RUN python3.9 -m pip install openvino-dev==2023.0.1 opencv-python-headless mlcube_docker # Do some dependency installation separately here to make layer caching more efficient diff --git a/Dockerfile-ROCm b/Dockerfile-ROCm index 95b1fa5f4..60cd7f5a3 100644 --- a/Dockerfile-ROCm +++ b/Dockerfile-ROCm @@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y software-properties-common RUN add-apt-repository ppa:deadsnakes/ppa RUN apt-get update && apt-get install -y python3.9 python3-pip libjpeg8-dev zlib1g-dev python3-dev libpython3.9-dev libffi-dev libgl1 RUN python3.9 -m pip install --upgrade pip==24.0 -RUN python3.9 -m pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/rocm6.0 +RUN python3.9 -m pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/rocm6.0 RUN python3.9 -m pip install --upgrade pip && python3.9 -m pip install openvino-dev==2023.0.1 opencv-python-headless mlcube_docker RUN apt-get update && apt-get 
install -y libgl1 diff --git a/docs/setup.md b/docs/setup.md index be77f71b9..c4ee2fc18 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -36,7 +36,7 @@ You may install pytorch to be compatible with CUDA, ROCm, or CPU-only. An exhaus Use one of the following depending on your needs: - CUDA 12.1 ```bash -(venv_gandlf) $> pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121 +(venv_gandlf) $> pip install torch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 --index-url https://download.pytorch.org/whl/cu121 ``` ### Optional Dependencies diff --git a/setup.py b/setup.py index 3f1cf5227..b4ffc600e 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ # specifying version for `black` separately because it is also used to [check for lint](https://github.com/mlcommons/GaNDLF/blob/master/.github/workflows/black.yml) black_version = "23.11.0" requirements = [ - "torch==2.4.1", + "torch==2.5.0", f"black=={black_version}", "numpy==1.25.0", "scipy", From 7ed664932fb0544bfb0bb4e3aaa581558ae0f620 Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Thu, 21 Nov 2024 09:04:41 -0500 Subject: [PATCH 37/43] `alpha` is not needed for adopt (calculated internally using `lr`) --- GANDLF/optimizers/thirdparty/adopt.py | 1 - 1 file changed, 1 deletion(-) diff --git a/GANDLF/optimizers/thirdparty/adopt.py b/GANDLF/optimizers/thirdparty/adopt.py index 21b022005..e7f32117d 100644 --- a/GANDLF/optimizers/thirdparty/adopt.py +++ b/GANDLF/optimizers/thirdparty/adopt.py @@ -514,7 +514,6 @@ def adopt_wrapper(parameters: dict) -> torch.optim.Optimizer: lr=parameters.get("learning_rate", 1e-3), betas=parameters.get("betas", (0.9, 0.999, 0.9999)), eps=parameters.get("eps", 1e-8), - alpha=parameters.get("alpha", 5.0), weight_decay=parameters.get("weight_decay", 0.0), decoupled=parameters["optimizer"].get("decoupled", False), foreach=parameters.get("foreach", None), From d5c3e8466208374d3121d2e5057fe135aae35552 Mon Sep 17 00:00:00 2001 From: Sarthak Pati Date: Thu, 21 Nov 2024 09:28:15 -0500 Subject: [PATCH 38/43] Update GANDLF/optimizers/README.md --- GANDLF/optimizers/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/GANDLF/optimizers/README.md b/GANDLF/optimizers/README.md index 7db5f7c7b..8d1499fb3 100644 --- a/GANDLF/optimizers/README.md +++ b/GANDLF/optimizers/README.md @@ -4,7 +4,11 @@ - For an optimizer defined in PyTorch [[ref](https://pytorch.org/docs/stable/optim.html#algorithms)], update the `GANDLF.optimizers.wrap_torch.py` submodule. - For a custom optimizer, create a new submodule called `GANDLF.optimizers.${awesome_optimizer}.py`. -- For a third-party optimizer (where the code is available from an external repository), add the relevant code under the `GANDLF.optimizers.thirdparty` submodule. +- For a third-party optimizer (i.e., where the code is available from an external source/repository): + - Add the relevant code under the `GANDLF.optimizers.thirdparty` submodule. + - Add a wrapper which takes in GaNDLF's `parameter` dictionary as input and creates a `torch.optim.Optimizer` object as output. + - Add the wrapper to the `GANDLF.optimizers.thirdparty.__init__.py` so that it can be called from `GANDLF.optimizers.__init__.py`. + - See `GANDLF.optimizers.thirdparty.adopy.py` as an example. - If a new dependency needs to be used, update GaNDLF's [`setup.py`](https://github.com/mlcommons/GaNDLF/blob/master/setup.py) with the new requirement. 
- Define a new submodule under `GANDLF.optimizers` as `GANDLF.optimizers.wrap_${package_name}.py`. - Ensure that the new algorithm is wrapped in a function which returns an object with the PyTorch optimizer type. Use any of the optimizers in `GANDLF.optimizers.wrap_torch.py` as an example. From 882fd51962674295541609bafcb52f7961a23e44 Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Thu, 21 Nov 2024 09:30:28 -0500 Subject: [PATCH 39/43] minor adjustments --- GANDLF/optimizers/thirdparty/adopt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GANDLF/optimizers/thirdparty/adopt.py b/GANDLF/optimizers/thirdparty/adopt.py index e7f32117d..115c61c00 100644 --- a/GANDLF/optimizers/thirdparty/adopt.py +++ b/GANDLF/optimizers/thirdparty/adopt.py @@ -49,7 +49,7 @@ def __init__( if isinstance(lr, Tensor): if foreach and not capturable: raise ValueError( - "lr as a Tensor is not supported for capturable=False and foreach=True" + "lr as a Tensor is not supported for `capturable=False` and `foreach=True`" ) if lr.numel() != 1: raise ValueError("Tensor lr must be 1-element") @@ -512,8 +512,8 @@ def adopt_wrapper(parameters: dict) -> torch.optim.Optimizer: return ADOPT( params=parameters["model_parameters"], lr=parameters.get("learning_rate", 1e-3), - betas=parameters.get("betas", (0.9, 0.999, 0.9999)), - eps=parameters.get("eps", 1e-8), + betas=parameters.get("betas", (0.9, 0.9999)), + eps=parameters.get("eps", 1e-6), weight_decay=parameters.get("weight_decay", 0.0), decoupled=parameters["optimizer"].get("decoupled", False), foreach=parameters.get("foreach", None), From bbfadbbe8a9c0a0a4cf5b81bf3f1f7f53c8c4f37 Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Thu, 21 Nov 2024 10:20:58 -0500 Subject: [PATCH 40/43] version updated --- GANDLF/version.py | 2 +- samples/config_all_options.yaml | 4 ++-- samples/config_classification.yaml | 4 ++-- samples/config_getting_started_classification_histo2d.yaml | 2 +- samples/config_getting_started_classification_rad3d.yaml | 2 +- samples/config_getting_started_regression_histo2d.yaml | 2 +- samples/config_getting_started_regression_rad3d.yaml | 2 +- samples/config_getting_started_segmentation_histo2d.yaml | 2 +- samples/config_getting_started_segmentation_rad3d.yaml | 4 ++-- samples/config_regression.yaml | 4 ++-- samples/config_segmentation_brats.yaml | 4 ++-- samples/config_segmentation_histology.yaml | 4 ++-- testing/config_classification.yaml | 2 +- testing/config_regression.yaml | 2 +- testing/config_segmentation.yaml | 2 +- tutorials/classification_medmnist_notebook/config.yaml | 2 +- 16 files changed, 22 insertions(+), 22 deletions(-) diff --git a/GANDLF/version.py b/GANDLF/version.py index 5e5047feb..20135e762 100644 --- a/GANDLF/version.py +++ b/GANDLF/version.py @@ -2,4 +2,4 @@ # -*- coding: UTF-8 -*- # check GaNDLF wiki for versioning and release guidelines: https://github.com/mlcommons/GaNDLF/wiki -__version__ = "0.1.2-dev" +__version__ = "0.1.2" diff --git a/samples/config_all_options.yaml b/samples/config_all_options.yaml index 872d65c44..bbeba97dc 100644 --- a/samples/config_all_options.yaml +++ b/samples/config_all_options.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.1.2-dev, - maximum: 0.1.2-dev # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.1.2, + maximum: 0.1.2 # this should NOT be made a variable, but should be tested after every tag is created } ## Choose the model 
parameters here model: diff --git a/samples/config_classification.yaml b/samples/config_classification.yaml index e8b720520..1c388c727 100644 --- a/samples/config_classification.yaml +++ b/samples/config_classification.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.1.2-dev, - maximum: 0.1.2-dev # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.1.2, + maximum: 0.1.2 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/samples/config_getting_started_classification_histo2d.yaml b/samples/config_getting_started_classification_histo2d.yaml index f824fbd92..5b207ff80 100644 --- a/samples/config_getting_started_classification_histo2d.yaml +++ b/samples/config_getting_started_classification_histo2d.yaml @@ -94,6 +94,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.1.2-dev + maximum: 0.1.2 minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_classification_rad3d.yaml b/samples/config_getting_started_classification_rad3d.yaml index 109e001e6..218a2eb8e 100644 --- a/samples/config_getting_started_classification_rad3d.yaml +++ b/samples/config_getting_started_classification_rad3d.yaml @@ -99,6 +99,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.1.2-dev + maximum: 0.1.2 minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_regression_histo2d.yaml b/samples/config_getting_started_regression_histo2d.yaml index fa2a41e2f..bcfd895dc 100644 --- a/samples/config_getting_started_regression_histo2d.yaml +++ b/samples/config_getting_started_regression_histo2d.yaml @@ -59,6 +59,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.1.2-dev + maximum: 0.1.2 minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_regression_rad3d.yaml b/samples/config_getting_started_regression_rad3d.yaml index 8ce80e1d1..fc28692fd 100644 --- a/samples/config_getting_started_regression_rad3d.yaml +++ b/samples/config_getting_started_regression_rad3d.yaml @@ -62,6 +62,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.1.2-dev + maximum: 0.1.2 minimum: 0.0.14 weighted_loss: false diff --git a/samples/config_getting_started_segmentation_histo2d.yaml b/samples/config_getting_started_segmentation_histo2d.yaml index 13ca80436..92b52891b 100644 --- a/samples/config_getting_started_segmentation_histo2d.yaml +++ b/samples/config_getting_started_segmentation_histo2d.yaml @@ -66,6 +66,6 @@ scheduler: track_memory_usage: false verbose: true version: - maximum: 0.1.2-dev + maximum: 0.1.2 minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_segmentation_rad3d.yaml b/samples/config_getting_started_segmentation_rad3d.yaml index 758163ff6..b9937efce 100644 --- a/samples/config_getting_started_segmentation_rad3d.yaml +++ b/samples/config_getting_started_segmentation_rad3d.yaml @@ -89,6 +89,6 @@ scheduler: track_memory_usage: false verbose: true version: - maximum: 0.1.2-dev - minimum: 0.1.2-dev + maximum: 0.1.2 + minimum: 0.1.2 weighted_loss: true diff --git a/samples/config_regression.yaml b/samples/config_regression.yaml index 0f4b91737..b50a69895 100644 --- a/samples/config_regression.yaml +++ b/samples/config_regression.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.1.2-dev, - maximum: 0.1.2-dev # this should NOT be made a variable, but should be tested after every 
tag is created + minimum: 0.1.2, + maximum: 0.1.2 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/samples/config_segmentation_brats.yaml b/samples/config_segmentation_brats.yaml index c8a5ac005..54954e6c1 100644 --- a/samples/config_segmentation_brats.yaml +++ b/samples/config_segmentation_brats.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.1.2-dev, - maximum: 0.1.2-dev # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.1.2, + maximum: 0.1.2 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/samples/config_segmentation_histology.yaml b/samples/config_segmentation_histology.yaml index 889ee9a98..5862369e7 100644 --- a/samples/config_segmentation_histology.yaml +++ b/samples/config_segmentation_histology.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.1.2-dev, - maximum: 0.1.2-dev # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.1.2, + maximum: 0.1.2 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/testing/config_classification.yaml b/testing/config_classification.yaml index 79dfb5feb..3d38bde04 100644 --- a/testing/config_classification.yaml +++ b/testing/config_classification.yaml @@ -55,7 +55,7 @@ save_output: false scaling_factor: 1 scheduler: triangle version: - maximum: 0.1.2-dev + maximum: 0.1.2 minimum: 0.0.14 weighted_loss: True diff --git a/testing/config_regression.yaml b/testing/config_regression.yaml index 47b9e2aab..83a68f93b 100644 --- a/testing/config_regression.yaml +++ b/testing/config_regression.yaml @@ -38,7 +38,7 @@ save_output: false scaling_factor: 1 scheduler: triangle version: - maximum: 0.1.2-dev + maximum: 0.1.2 minimum: 0.0.14 weighted_loss: false diff --git a/testing/config_segmentation.yaml b/testing/config_segmentation.yaml index a275a6b8d..defed728d 100644 --- a/testing/config_segmentation.yaml +++ b/testing/config_segmentation.yaml @@ -3,7 +3,7 @@ version: { minimum: 0.0.14, - maximum: 0.1.2-dev + maximum: 0.1.2 } model: { diff --git a/tutorials/classification_medmnist_notebook/config.yaml b/tutorials/classification_medmnist_notebook/config.yaml index f1035dc7d..03591735f 100644 --- a/tutorials/classification_medmnist_notebook/config.yaml +++ b/tutorials/classification_medmnist_notebook/config.yaml @@ -2,7 +2,7 @@ version: { minimum: 0.0.14, - maximum: 0.1.2-dev # this should NOT be made a variable, but should be tested after every tag is created + maximum: 0.1.2 # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: From f057ab870ed89b78e5b11d20024d24f1d10c53d6 Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Thu, 21 Nov 2024 17:50:47 +0100 Subject: [PATCH 41/43] Fix spellchecker action and errors --- .spelling/.spelling/expect.txt | 11 ++++++++++- GANDLF/losses/loss_interface.py | 4 ++-- GANDLF/optimizers/README.md | 2 +- docs/faq.md | 4 ++-- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/.spelling/.spelling/expect.txt b/.spelling/.spelling/expect.txt index 795ac46eb..fe6a792b1 100644 --- a/.spelling/.spelling/expect.txt +++ b/.spelling/.spelling/expect.txt @@ -723,4 +723,13 @@ ystore Zisserman zsuokb zwezggl -zzokqk \ No newline at end of file +zzokqk +thirdparty 
+adopy +Shohei +crcrpar +lrs +autograd +cudagraph +kwonly +torchscript \ No newline at end of file diff --git a/GANDLF/losses/loss_interface.py b/GANDLF/losses/loss_interface.py index e8459f41d..90d29154a 100644 --- a/GANDLF/losses/loss_interface.py +++ b/GANDLF/losses/loss_interface.py @@ -26,7 +26,7 @@ def forward(self, prediction: torch.Tensor, target: torch.Tensor) -> torch.Tenso class AbstractSegmentationLoss(AbstractLossFunction): """ - Base class for loss funcions that are used for segmentation tasks. + Base class for loss functions that are used for segmentation tasks. """ def __init__(self, params: dict): @@ -43,7 +43,7 @@ def _compute_single_class_loss( def _optional_loss_operations(self, loss: torch.Tensor) -> torch.Tensor: """ - Perform addtional operations on the loss value. Defaults to identity operation. + Perform additional operations on the loss value. Defaults to identity operation. If needed, child classes can override this method. Useful in cases where for example, the loss value needs to log-transformed or clipped. """ diff --git a/GANDLF/optimizers/README.md b/GANDLF/optimizers/README.md index 8d1499fb3..fe2e8d917 100644 --- a/GANDLF/optimizers/README.md +++ b/GANDLF/optimizers/README.md @@ -8,7 +8,7 @@ - Add the relevant code under the `GANDLF.optimizers.thirdparty` submodule. - Add a wrapper which takes in GaNDLF's `parameter` dictionary as input and creates a `torch.optim.Optimizer` object as output. - Add the wrapper to the `GANDLF.optimizers.thirdparty.__init__.py` so that it can be called from `GANDLF.optimizers.__init__.py`. - - See `GANDLF.optimizers.thirdparty.adopy.py` as an example. + - See `GANDLF.optimizers.thirdparty.adopt.py` as an example. - If a new dependency needs to be used, update GaNDLF's [`setup.py`](https://github.com/mlcommons/GaNDLF/blob/master/setup.py) with the new requirement. - Define a new submodule under `GANDLF.optimizers` as `GANDLF.optimizers.wrap_${package_name}.py`. - Ensure that the new algorithm is wrapped in a function which returns an object with the PyTorch optimizer type. Use any of the optimizers in `GANDLF.optimizers.wrap_torch.py` as an example. diff --git a/docs/faq.md b/docs/faq.md index 0bb98239d..62f8ae109 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -53,9 +53,9 @@ Please see https://mlcommons.github.io/GaNDLF/usage/#federating-your-model-evalu Please read the [migration guide](https://mlcommons.github.io/GaNDLF/migration_guide) to understand the changes that have been made to GaNDLF. If you have any questions, please feel free to [post a support request](https://github.com/mlcommons/GaNDLF/issues/new?assignees=&labels=&template=--questions-help-support.md&title=). -### I am getting an error realted to version mismatch (greater or smaller) between the configuration and GaNDLF version. What should I do? +### I am getting an error related to version mismatch (greater or smaller) between the configuration and GaNDLF version. What should I do? -This is a safety feature to ensure a tight integartion between the configuration used to define a model and the code version used to perform the training. Ensure that you have all requirements satisfied, and then check the ``version`` key in the configration, and ensure it appropriately matches the output of ``gandlf run --version``. +This is a safety feature to ensure a tight integration between the configuration used to define a model and the code version used to perform the training. 
Ensure that you have all requirements satisfied, and then check the ``version`` key in the configuration, and ensure it appropriately matches the output of ``gandlf run --version``. ### What if I have another question? From 2f51c04dc4c99a8609d0d71a8d5c42b8f31097b2 Mon Sep 17 00:00:00 2001 From: Szymon Mazurek Date: Thu, 21 Nov 2024 17:54:56 +0100 Subject: [PATCH 42/43] CLI spelling correction --- GANDLF/cli/huggingface_hub_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GANDLF/cli/huggingface_hub_handler.py b/GANDLF/cli/huggingface_hub_handler.py index b09fa9743..a582e9bf7 100644 --- a/GANDLF/cli/huggingface_hub_handler.py +++ b/GANDLF/cli/huggingface_hub_handler.py @@ -121,7 +121,7 @@ def push_to_model_hub( ignore_patterns=ignore_patterns, delete_patterns=delete_patterns, ) - print("Model Successfully Uploded") + print("Model Successfully Uploaded") def download_from_hub( From 522a6478176aba4b34a52c361c5afa10f900e357 Mon Sep 17 00:00:00 2001 From: scap3yvt <149599669+scap3yvt@users.noreply.github.com> Date: Thu, 21 Nov 2024 12:37:18 -0500 Subject: [PATCH 43/43] version updated --- GANDLF/version.py | 2 +- samples/config_all_options.yaml | 4 ++-- samples/config_classification.yaml | 4 ++-- samples/config_getting_started_classification_histo2d.yaml | 2 +- samples/config_getting_started_classification_rad3d.yaml | 2 +- samples/config_getting_started_regression_histo2d.yaml | 2 +- samples/config_getting_started_regression_rad3d.yaml | 2 +- samples/config_getting_started_segmentation_histo2d.yaml | 2 +- samples/config_getting_started_segmentation_rad3d.yaml | 4 ++-- samples/config_regression.yaml | 4 ++-- samples/config_segmentation_brats.yaml | 4 ++-- samples/config_segmentation_histology.yaml | 4 ++-- testing/config_classification.yaml | 2 +- testing/config_regression.yaml | 2 +- testing/config_segmentation.yaml | 2 +- tutorials/classification_medmnist_notebook/config.yaml | 2 +- 16 files changed, 22 insertions(+), 22 deletions(-) diff --git a/GANDLF/version.py b/GANDLF/version.py index 20135e762..e51579b04 100644 --- a/GANDLF/version.py +++ b/GANDLF/version.py @@ -2,4 +2,4 @@ # -*- coding: UTF-8 -*- # check GaNDLF wiki for versioning and release guidelines: https://github.com/mlcommons/GaNDLF/wiki -__version__ = "0.1.2" +__version__ = "0.1.3-dev" diff --git a/samples/config_all_options.yaml b/samples/config_all_options.yaml index bbeba97dc..c3769f8d1 100644 --- a/samples/config_all_options.yaml +++ b/samples/config_all_options.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.1.2, - maximum: 0.1.2 # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.1.3-dev, + maximum: 0.1.3-dev # this should NOT be made a variable, but should be tested after every tag is created } ## Choose the model parameters here model: diff --git a/samples/config_classification.yaml b/samples/config_classification.yaml index 1c388c727..828b903df 100644 --- a/samples/config_classification.yaml +++ b/samples/config_classification.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.1.2, - maximum: 0.1.2 # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.1.3-dev, + maximum: 0.1.3-dev # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/samples/config_getting_started_classification_histo2d.yaml b/samples/config_getting_started_classification_histo2d.yaml index 5b207ff80..e37e255a0 100644 --- 
a/samples/config_getting_started_classification_histo2d.yaml +++ b/samples/config_getting_started_classification_histo2d.yaml @@ -94,6 +94,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.1.2 + maximum: 0.1.3-dev minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_classification_rad3d.yaml b/samples/config_getting_started_classification_rad3d.yaml index 218a2eb8e..92752d94b 100644 --- a/samples/config_getting_started_classification_rad3d.yaml +++ b/samples/config_getting_started_classification_rad3d.yaml @@ -99,6 +99,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.1.2 + maximum: 0.1.3-dev minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_regression_histo2d.yaml b/samples/config_getting_started_regression_histo2d.yaml index bcfd895dc..6202d5ec7 100644 --- a/samples/config_getting_started_regression_histo2d.yaml +++ b/samples/config_getting_started_regression_histo2d.yaml @@ -59,6 +59,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.1.2 + maximum: 0.1.3-dev minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_regression_rad3d.yaml b/samples/config_getting_started_regression_rad3d.yaml index fc28692fd..0b82acdf1 100644 --- a/samples/config_getting_started_regression_rad3d.yaml +++ b/samples/config_getting_started_regression_rad3d.yaml @@ -62,6 +62,6 @@ scheduler: track_memory_usage: false verbose: false version: - maximum: 0.1.2 + maximum: 0.1.3-dev minimum: 0.0.14 weighted_loss: false diff --git a/samples/config_getting_started_segmentation_histo2d.yaml b/samples/config_getting_started_segmentation_histo2d.yaml index 92b52891b..39830fea1 100644 --- a/samples/config_getting_started_segmentation_histo2d.yaml +++ b/samples/config_getting_started_segmentation_histo2d.yaml @@ -66,6 +66,6 @@ scheduler: track_memory_usage: false verbose: true version: - maximum: 0.1.2 + maximum: 0.1.3-dev minimum: 0.0.14 weighted_loss: true diff --git a/samples/config_getting_started_segmentation_rad3d.yaml b/samples/config_getting_started_segmentation_rad3d.yaml index b9937efce..e6121ee52 100644 --- a/samples/config_getting_started_segmentation_rad3d.yaml +++ b/samples/config_getting_started_segmentation_rad3d.yaml @@ -89,6 +89,6 @@ scheduler: track_memory_usage: false verbose: true version: - maximum: 0.1.2 - minimum: 0.1.2 + maximum: 0.1.3-dev + minimum: 0.1.3-dev weighted_loss: true diff --git a/samples/config_regression.yaml b/samples/config_regression.yaml index b50a69895..20d22efed 100644 --- a/samples/config_regression.yaml +++ b/samples/config_regression.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.1.2, - maximum: 0.1.2 # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.1.3-dev, + maximum: 0.1.3-dev # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/samples/config_segmentation_brats.yaml b/samples/config_segmentation_brats.yaml index 54954e6c1..acb0d1841 100644 --- a/samples/config_segmentation_brats.yaml +++ b/samples/config_segmentation_brats.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.1.2, - maximum: 0.1.2 # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.1.3-dev, + maximum: 0.1.3-dev # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: 
diff --git a/samples/config_segmentation_histology.yaml b/samples/config_segmentation_histology.yaml index 5862369e7..2ac3030b6 100644 --- a/samples/config_segmentation_histology.yaml +++ b/samples/config_segmentation_histology.yaml @@ -1,8 +1,8 @@ # affix version version: { - minimum: 0.1.2, - maximum: 0.1.2 # this should NOT be made a variable, but should be tested after every tag is created + minimum: 0.1.3-dev, + maximum: 0.1.3-dev # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model: diff --git a/testing/config_classification.yaml b/testing/config_classification.yaml index 3d38bde04..a332855be 100644 --- a/testing/config_classification.yaml +++ b/testing/config_classification.yaml @@ -55,7 +55,7 @@ save_output: false scaling_factor: 1 scheduler: triangle version: - maximum: 0.1.2 + maximum: 0.1.3-dev minimum: 0.0.14 weighted_loss: True diff --git a/testing/config_regression.yaml b/testing/config_regression.yaml index 83a68f93b..faaf1ee04 100644 --- a/testing/config_regression.yaml +++ b/testing/config_regression.yaml @@ -38,7 +38,7 @@ save_output: false scaling_factor: 1 scheduler: triangle version: - maximum: 0.1.2 + maximum: 0.1.3-dev minimum: 0.0.14 weighted_loss: false diff --git a/testing/config_segmentation.yaml b/testing/config_segmentation.yaml index defed728d..7365f737c 100644 --- a/testing/config_segmentation.yaml +++ b/testing/config_segmentation.yaml @@ -3,7 +3,7 @@ version: { minimum: 0.0.14, - maximum: 0.1.2 + maximum: 0.1.3-dev } model: { diff --git a/tutorials/classification_medmnist_notebook/config.yaml b/tutorials/classification_medmnist_notebook/config.yaml index 03591735f..878430b80 100644 --- a/tutorials/classification_medmnist_notebook/config.yaml +++ b/tutorials/classification_medmnist_notebook/config.yaml @@ -2,7 +2,7 @@ version: { minimum: 0.0.14, - maximum: 0.1.2 # this should NOT be made a variable, but should be tested after every tag is created + maximum: 0.1.3-dev # this should NOT be made a variable, but should be tested after every tag is created } # Choose the model parameters here model:
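The `version` windows advanced throughout this patch are what the compatibility gate described in `docs/faq.md` (PATCH 41) enforces: a config is only accepted when the running GaNDLF version falls between its `minimum` and `maximum`. A minimal sketch of such a gate, assuming PEP 440-style parsing via `packaging`; the helper is hypothetical, not GaNDLF's actual implementation:

```python
# Hypothetical min/max version gate in the spirit of the FAQ's description;
# GaNDLF's real check may differ in details.
from packaging.version import Version


def version_in_window(version_cfg: dict, running: str) -> bool:
    # "0.1.3-dev" normalizes to the PEP 440 pre-release 0.1.3.dev0, so a -dev
    # build sorts below its eventual tagged release
    v = Version(running)
    return Version(version_cfg["minimum"]) <= v <= Version(version_cfg["maximum"])


# For the updated sample configs:
# version_in_window({"minimum": "0.0.14", "maximum": "0.1.3-dev"}, "0.1.3-dev")  # True
# version_in_window({"minimum": "0.0.14", "maximum": "0.1.2"}, "0.1.3-dev")      # False
```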