Commit 5602871: Merge branch 'master' into newmetric/logauc

SkafteNicki authored Oct 25, 2024
2 parents 9ae3ee3 + 4c75369
Showing 245 changed files with 3,715 additions and 1,440 deletions.
10 changes: 5 additions & 5 deletions .azure/gpu-integrations.yml
@@ -17,13 +17,13 @@ jobs:
- job: integrate_GPU
strategy:
matrix:
"torch | 1.x":
docker-image: "pytorchlightning/torchmetrics:ubuntu22.04-cuda11.8.0-py3.9-torch1.13"
torch-ver: "1.13"
"torch | 2.0":
docker-image: "pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime"
torch-ver: "2.0"
requires: "oldest"
"torch | 2.x":
docker-image: "pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime"
torch-ver: "2.3"
docker-image: "pytorch/pytorch:2.5.0-cuda12.1-cudnn9-runtime"
torch-ver: "2.5"
# how long to run the job before automatically cancelling
timeoutInMinutes: "40"
# how much time to give 'run always even if cancelled tasks' before stopping them
56 changes: 56 additions & 0 deletions .azure/gpu-nuke-cache.yml
@@ -0,0 +1,56 @@
trigger:
tags:
include:
- "*"
# run every month to sanitize the dev environment
schedules:
- cron: "0 0 1 * *"
displayName: Monthly nuke caches
branches:
include:
- master
# run on PR changing only this file
pr:
branches:
include:
- master
paths:
include:
- .azure/gpu-nuke-cache.yml

jobs:
- job: nuke_caches
# how long to run the job before automatically cancelling
timeoutInMinutes: "10"
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"

pool: "lit-rtx-3090"

variables:
# these cache locations assume the jobs run repeatedly on the same set of machines
# see: https://github.com/microsoft/azure-pipelines-agent/issues/4113#issuecomment-1439241481
TORCH_HOME: "/var/tmp/torch"
TRANSFORMERS_CACHE: "/var/tmp/hf/transformers"
HF_HOME: "/var/tmp/hf/home"
HF_HUB_CACHE: "/var/tmp/hf/hub"
PIP_CACHE_DIR: "/var/tmp/pip"
CACHED_REFERENCES: "/var/tmp/cached-references.zip"

container:
image: "ubuntu:22.04"
options: "-v /var/tmp:/var/tmp"

steps:
- bash: |
set -ex
rm -rf $(TORCH_HOME)
rm -rf $(TRANSFORMERS_CACHE)
rm -rf $(HF_HOME)
rm -rf $(HF_HUB_CACHE)
rm -rf $(PIP_CACHE_DIR)
rm -rf $(CACHED_REFERENCES)
displayName: "delete all caches"
- bash: |
ls -lh /var/tmp
displayName: "show tmp/ folder"
67 changes: 38 additions & 29 deletions .azure/gpu-unittests.yml
@@ -9,6 +9,13 @@ trigger:
- master
- release/*
- refs/tags/*
# run every month to populate caches
schedules:
- cron: "0 1 1 * *"
displayName: Monthly re-build caches
branches:
include:
- master
pr:
- master
- release/*
@@ -17,19 +24,13 @@ jobs:
- job: unitest_GPU
strategy:
matrix:
"PyTorch | 1.10 oldest":
"PyTorch | 2.0 oldest":
# Torch does not have built wheels of old Torch versions for newer CUDA
docker-image: "ubuntu20.04-cuda11.3.1-py3.9-torch1.10"
torch-ver: "1.10"
"PyTorch | 1.X LTS":
docker-image: "ubuntu22.04-cuda11.8.0-py3.9-torch1.13"
torch-ver: "1.13"
docker-image: "ubuntu22.04-cuda11.8.0-py3.10-torch2.0"
torch-ver: "2.0"
"PyTorch | 2.X stable":
docker-image: "ubuntu22.04-cuda12.1.1-py3.11-torch2.3"
torch-ver: "2.3"
"PyTorch | 2.X future":
docker-image: "ubuntu22.04-cuda12.1.1-py3.11-torch2.4"
torch-ver: "2.4"
docker-image: "ubuntu22.04-cuda12.1.1-py3.11-torch2.5"
torch-ver: "2.5"
# how long to run the job before automatically cancelling
timeoutInMinutes: "180"
# how much time to give 'run always even if cancelled tasks' before stopping them
@@ -70,6 +71,11 @@ jobs:
CUDA_version_mm="${CUDA_version//'.'/''}"
echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$CUDA_version_mm"
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${CUDA_version_mm}/torch_stable.html"
mkdir -p $(TORCH_HOME)
mkdir -p $(TRANSFORMERS_CACHE)
mkdir -p $(HF_HOME)
mkdir -p $(HF_HUB_CACHE)
mkdir -p $(PIP_CACHE_DIR)
displayName: "set Env. vars"
- bash: |
echo "##vso[task.setvariable variable=ALLOW_SKIP_IF_OUT_OF_MEMORY]1"
@@ -114,7 +120,7 @@ jobs:
- bash: |
python .github/assistant.py set-oldest-versions
condition: eq(variables['torch-ver'], '1.10.2')
condition: eq(variables['torch-ver'], '2.0')
displayName: "Setting oldest versions"
- bash: |
@@ -135,6 +141,21 @@ jobs:
displayName: "Show caches"
- bash: |
python -m pytest torchmetrics --cov=torchmetrics \
--timeout=240 --durations=50 \
--reruns 2 --reruns-delay 1
# --numprocesses=5 --dist=loadfile
env:
DOCTEST_DOWNLOAD_TIMEOUT: "180"
SKIP_SLOW_DOCTEST: "1"
workingDirectory: "src/"
timeoutInMinutes: "40"
displayName: "DocTesting"
- bash: |
df -h .
ls -lh $(CACHED_REFERENCES)
ls -lh tests/
# Check if the cached references file exists
if [ -f $(CACHED_REFERENCES) ]; then
# Create a directory if it doesn't already exist
@@ -145,25 +166,12 @@ jobs:
else
echo "The file '$(CACHED_REFERENCES)' does not exist."
fi
du -h --max-depth=1 tests/
timeoutInMinutes: "5"
# if pull request, copy the cache to the tests folder to be used in the next steps
condition: eq(variables['Build.Reason'], 'PullRequest')
continueOnError: "true"
displayName: "Copy/Unzip cached refs"
- bash: |
python -m pytest torchmetrics --cov=torchmetrics \
--timeout=240 --durations=50 \
--reruns 2 --reruns-delay 1
# --numprocesses=5 --dist=loadfile
env:
DOCTEST_DOWNLOAD_TIMEOUT: "180"
SKIP_SLOW_DOCTEST: "1"
workingDirectory: "src/"
timeoutInMinutes: "40"
displayName: "DocTesting"
- bash: |
wget https://pl-public-data.s3.amazonaws.com/metrics/data.zip
unzip -o data.zip
@@ -172,14 +180,15 @@ jobs:
displayName: "Pull testing data from S3"
- bash: |
du -h --max-depth=1 .
python -m pytest $(TEST_DIRS) \
-m "not DDP" --numprocesses=5 --dist=loadfile \
--cov=torchmetrics --timeout=240 --durations=100 \
--reruns 3 --reruns-delay 1
workingDirectory: "tests/"
# skip for PRs if there is nothing to test; note that outside PRs the default is 'unittests'
condition: and(succeeded(), ne(variables['TEST_DIRS'], ''))
timeoutInMinutes: "90"
timeoutInMinutes: "95"
displayName: "UnitTesting common"
- bash: |
@@ -191,13 +200,14 @@ jobs:
workingDirectory: "tests/"
# skip for PRs if there is nothing to test; note that outside PRs the default is 'unittests'
condition: and(succeeded(), ne(variables['TEST_DIRS'], ''))
timeoutInMinutes: "90"
timeoutInMinutes: "95"
displayName: "UnitTesting DDP"
- bash: |
du -h --max-depth=1 tests/
# archive the potentially updated cache to the machine filesystem so it can be reused by subsequent jobs
zip -q -r $(CACHED_REFERENCES) tests/_cache-references
du -h --max-depth=1 tests/
ls -lh $(CACHED_REFERENCES)
# set as an extra step to not pollute the general cache when a job fails or crashes
# so do this update only with successful jobs on master
condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'))
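
Together with the earlier "Copy/Unzip cached refs" step, this forms a zip round-trip for `tests/_cache-references`. A rough Python sketch of both halves, assuming the same paths as the pipeline (parts of the original bash are truncated above, so some details here are assumptions):

```python
import os
import zipfile

CACHED = "/var/tmp/cached-references.zip"  # $(CACHED_REFERENCES)

def restore_refs() -> None:
    """Before testing: unpack the machine-local cache, if present."""
    if os.path.isfile(CACHED):
        with zipfile.ZipFile(CACHED) as zf:
            zf.extractall(".")  # members already carry the tests/ prefix
    else:
        print(f"The file '{CACHED}' does not exist.")

def archive_refs(src: str = "tests/_cache-references") -> None:
    """After a successful non-PR run: re-archive the possibly updated refs."""
    with zipfile.ZipFile(CACHED, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _dirs, files in os.walk(src):
            for name in files:
                path = os.path.join(root, name)
                zf.write(path, arcname=path)
```
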
@@ -212,7 +222,6 @@ jobs:
python -m coverage xml
python -m codecov --token=$(CODECOV_TOKEN) --name="GPU-coverage" \
--commit=$(Build.SourceVersion) --flags=gpu,unittest --env=linux,azure
ls -l
workingDirectory: "tests/"
# skip for PRs if there is nothing to test; note that outside PRs the default is 'unittests'
condition: and(succeeded(), ne(variables['TEST_DIRS'], ''))
12 changes: 6 additions & 6 deletions .github/CONTRIBUTING.md
@@ -16,13 +16,13 @@ We are always looking for help implementing new features or fixing bugs.
- Add details on how to reproduce the issue - a minimal test case is always best, colab is also great.
Note that the sample code should be minimal and, if needed, use publicly available data.

1. Try to fix it or recommend a solution. We highly recommend using a test-driven approach:
2. Try to fix it or recommend a solution. We highly recommend using a test-driven approach:

- Convert your minimal code example to a unit/integration test with assert on expected results.
- Start by debugging the issue... You can run just this particular test in your IDE and draft a fix.
- Verify that your test case fails on the master branch and only passes with the fix applied.

1. Submit a PR!
3. Submit a PR!

_**Note**, even if you do not find the solution, sending a PR with a test covering the issue is a valid contribution and we can
help you or finish it with you :\]_
@@ -31,14 +31,14 @@ help you or finish it with you :\]_

1. Submit a github issue - describe what is the motivation of such feature (adding the use case or an example is helpful).

1. Let's discuss to determine the feature scope.
2. Let's discuss to determine the feature scope.

1. Submit a PR! We recommend a test-driven approach to adding new features as well:
3. Submit a PR! We recommend a test-driven approach to adding new features as well:

- Write a test for the functionality you want to add.
- Write the functional code until the test passes.

1. Add/update the relevant tests!
4. Add/update the relevant tests!

- [This PR](https://github.com/Lightning-AI/torchmetrics/pull/98) is a good example for adding a new metric

@@ -71,7 +71,7 @@ In case you adding new dependencies, make sure that they are compatible with the
### Coding Style

1. Use f-strings for output formatting (except logging, where we stay with lazy `logging.info("Hello %s!", name)`).
1. You can use `pre-commit` to make sure your code style is correct.
2. You can use `pre-commit` to make sure your code style is correct.
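
A two-line illustration of the f-string/lazy-logging rule (minimal sketch, not repo code):

```python
import logging

name = "world"
print(f"Hello {name}!")          # f-string for regular output formatting
logging.info("Hello %s!", name)  # lazy placeholder: only formatted if the record is emitted
```
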

### Documentation

12 changes: 7 additions & 5 deletions .github/ISSUE_TEMPLATE/bug_report.md
@@ -19,8 +19,10 @@ Steps to reproduce the behavior...
<details>
<summary>Code sample</summary>

<!-- Ideally attach a minimal code sample to reproduce the described issue.
Minimal means having the shortest code but still preserving the bug. -->
```python
# Ideally attach a minimal code sample to reproduce the described issue.
# Minimal means having the shortest code but still preserving the bug.
```

</details>

@@ -30,9 +32,9 @@

### Environment

- TorchMetrics version (and how you installed TM, e.g. `conda`, `pip`, build from source):
- Python & PyTorch Version (e.g., 1.0):
- Any other relevant information such as OS (e.g., Linux):
- TorchMetrics version (if built from source, add commit SHA): ???
- Python & PyTorch Version (e.g., 1.0): ???
- Any other relevant information such as OS (e.g., Linux): ???

### Additional context

6 changes: 3 additions & 3 deletions .github/ISSUE_TEMPLATE/config.yml
@@ -3,6 +3,6 @@ contact_links:
- name: Ask a Question
url: https://github.com/Lightning-AI/torchmetrics/discussions/new
about: Ask and answer TorchMetrics related questions
- name: 💬 Slack
url: https://app.slack.com/client/TR9DVT48M/CQXV8BRH9/thread/CQXV8BRH9-1591382895.254600
about: Chat with our community
- name: 💬 Chat with us
url: https://discord.gg/VptPCZkGNa
about: Live chat with experts, engineers, and users in our Discord community.
28 changes: 21 additions & 7 deletions .github/assistant.py
@@ -31,8 +31,8 @@
"3.10": "1.11",
"3.11": "1.13",
}
_path = lambda *ds: os.path.join(_PATH_ROOT, *ds)
REQUIREMENTS_FILES = (*glob.glob(_path("requirements", "*.txt")), _path("requirements.txt"))
_path_root = lambda *ds: os.path.join(_PATH_ROOT, *ds)
REQUIREMENTS_FILES = (*glob.glob(_path_root("requirements", "*.txt")), _path_root("requirements.txt"))


class AssistantCLI:
@@ -73,21 +73,35 @@ def set_min_torch_by_python(fpath: str = "requirements/base.txt") -> None:
fp.write(requires)

@staticmethod
def replace_min_requirements(fpath: str) -> None:
"""Replace all `>=` by `==` in given file."""
logging.info(f"processing: {fpath}")
def _replace_requirement(fpath: str, old_str: str = "", new_str: str = "") -> None:
"""Replace all strings given file."""
logging.info(f"processing '{old_str}' -> '{new_str}': {fpath}")
with open(fpath, encoding="utf-8") as fp:
req = fp.read()
req = req.replace(">=", "==")
req = req.replace(old_str, new_str)
with open(fpath, "w", encoding="utf-8") as fp:
fp.write(req)

@staticmethod
def replace_str_requirements(old_str: str, new_str: str, req_files: List[str] = REQUIREMENTS_FILES) -> None:
"""Replace a particular string in all requirements files."""
if isinstance(req_files, str):
req_files = [req_files]
for fpath in req_files:
AssistantCLI._replace_requirement(fpath, old_str=old_str, new_str=new_str)

@staticmethod
def replace_min_requirements(fpath: str) -> None:
"""Replace all `>=` by `==` in given file."""
AssistantCLI._replace_requirement(fpath, old_str=">=", new_str="==")

@staticmethod
def set_oldest_versions(req_files: List[str] = REQUIREMENTS_FILES) -> None:
"""Set the oldest version for requirements."""
AssistantCLI.set_min_torch_by_python()
if isinstance(req_files, str):
req_files = [req_files]
for fpath in req_files:
logging.info(f"processing req: `{fpath}`")
AssistantCLI.replace_min_requirements(fpath)

@staticmethod
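
For context, the refactor above routes both `replace_min_requirements` and the new `replace_str_requirements` through the shared `_replace_requirement` helper. A usage sketch with made-up argument values (the import assumes `.github/` is importable; the CLI form elsewhere in this diff is `python .github/assistant.py <command>`):

```python
# Illustrative direct use of the refactored helpers (values are made up).
from assistant import AssistantCLI  # assumes .github/ is on sys.path

# pin every '>=' spec to '==' in a single file
AssistantCLI.replace_min_requirements("requirements/base.txt")

# swap one dependency spec across all requirements files
AssistantCLI.replace_str_requirements(old_str="torch >=1.10.0", new_str="torch >=2.0.0")
```
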
12 changes: 6 additions & 6 deletions .github/workflows/ci-checks.yml
@@ -13,19 +13,19 @@ concurrency:

jobs:
check-code:
uses: Lightning-AI/utilities/.github/workflows/[email protected].3.post0
uses: Lightning-AI/utilities/.github/workflows/[email protected].7
with:
actions-ref: v0.11.3.post0
actions-ref: v0.11.7
extra-typing: "typing"

check-schema:
uses: Lightning-AI/utilities/.github/workflows/[email protected].3.post0
uses: Lightning-AI/utilities/.github/workflows/[email protected].7

check-package:
if: github.event.pull_request.draft == false
uses: Lightning-AI/utilities/.github/workflows/[email protected].3.post0
uses: Lightning-AI/utilities/.github/workflows/[email protected].7
with:
actions-ref: v0.11.3.post0
actions-ref: v0.11.7
artifact-name: dist-packages-${{ github.sha }}
import-name: "torchmetrics"
testing-matrix: |
@@ -35,7 +35,7 @@ jobs:
}
check-md-links:
uses: Lightning-AI/utilities/.github/workflows/[email protected].3.post0
uses: Lightning-AI/utilities/.github/workflows/[email protected].7
with:
base-branch: master
config-file: ".github/markdown-links-config.json"
2 changes: 2 additions & 0 deletions .github/workflows/ci-integrate.yml
@@ -53,6 +53,8 @@ jobs:

- name: source caching
uses: ./.github/actions/pull-caches
with:
requires: ${{ matrix.requires }}
- name: set oldest if/only for integrations
if: matrix.requires == 'oldest'
run: python .github/assistant.py set-oldest-versions --req_files='["requirements/_integrate.txt"]'
(The remaining changed files of this commit are not shown.)