Commit 5602871: Merge branch 'master' into newmetric/logauc

SkafteNicki authored Oct 25, 2024
2 parents 9ae3ee3 + 4c75369
Showing 245 changed files with 3,715 additions and 1,440 deletions.
10 changes: 5 additions & 5 deletions .azure/gpu-integrations.yml
@@ -17,13 +17,13 @@ jobs:
- job: integrate_GPU
strategy:
matrix:
"torch | 1.x":
docker-image: "pytorchlightning/torchmetrics:ubuntu22.04-cuda11.8.0-py3.9-torch1.13"
torch-ver: "1.13"
"torch | 2.0":
docker-image: "pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime"
torch-ver: "2.0"
requires: "oldest"
"torch | 2.x":
docker-image: "pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime"
torch-ver: "2.3"
docker-image: "pytorch/pytorch:2.5.0-cuda12.1-cudnn9-runtime"
torch-ver: "2.5"
# how long to run the job before automatically cancelling
timeoutInMinutes: "40"
# how much time to give 'run always even if cancelled tasks' before stopping them
56 changes: 56 additions & 0 deletions .azure/gpu-nuke-cache.yml
@@ -0,0 +1,56 @@
trigger:
tags:
include:
- "*"
# run every month to sanitize the dev environment
schedules:
- cron: "0 0 1 * *"
displayName: Monthly nuke caches
branches:
include:
- master
# run on PR changing only this file
pr:
branches:
include:
- master
paths:
include:
- .azure/gpu-nuke-cache.yml

jobs:
- job: nuke_caches
# how long to run the job before automatically cancelling
timeoutInMinutes: "10"
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"

pool: "lit-rtx-3090"

variables:
# these cache locations assume the jobs run repeatedly on the same set of machines
# see: https://github.com/microsoft/azure-pipelines-agent/issues/4113#issuecomment-1439241481
TORCH_HOME: "/var/tmp/torch"
TRANSFORMERS_CACHE: "/var/tmp/hf/transformers"
HF_HOME: "/var/tmp/hf/home"
HF_HUB_CACHE: "/var/tmp/hf/hub"
PIP_CACHE_DIR: "/var/tmp/pip"
CACHED_REFERENCES: "/var/tmp/cached-references.zip"

container:
image: "ubuntu:22.04"
options: "-v /var/tmp:/var/tmp"

steps:
- bash: |
set -ex
rm -rf $(TORCH_HOME)
rm -rf $(TRANSFORMERS_CACHE)
rm -rf $(HF_HOME)
rm -rf $(HF_HUB_CACHE)
rm -rf $(PIP_CACHE_DIR)
rm -rf $(CACHED_REFERENCES)
displayName: "delete all caches"
- bash: |
ls -lh /var/tmp
displayName: "show tmp/ folder"
67 changes: 38 additions & 29 deletions .azure/gpu-unittests.yml
@@ -9,6 +9,13 @@ trigger:
- master
- release/*
- refs/tags/*
# run every month to populate caches
schedules:
- cron: "0 1 1 * *"
displayName: Monthly re-build caches
branches:
include:
- master
pr:
- master
- release/*
@@ -17,19 +24,13 @@ jobs:
- job: unitest_GPU
strategy:
matrix:
"PyTorch | 1.10 oldest":
"PyTorch | 2.0 oldest":
# Torch does not have built wheels of old Torch versions for newer CUDA
docker-image: "ubuntu20.04-cuda11.3.1-py3.9-torch1.10"
torch-ver: "1.10"
"PyTorch | 1.X LTS":
docker-image: "ubuntu22.04-cuda11.8.0-py3.9-torch1.13"
torch-ver: "1.13"
docker-image: "ubuntu22.04-cuda11.8.0-py3.10-torch2.0"
torch-ver: "2.0"
"PyTorch | 2.X stable":
docker-image: "ubuntu22.04-cuda12.1.1-py3.11-torch2.3"
torch-ver: "2.3"
"PyTorch | 2.X future":
docker-image: "ubuntu22.04-cuda12.1.1-py3.11-torch2.4"
torch-ver: "2.4"
docker-image: "ubuntu22.04-cuda12.1.1-py3.11-torch2.5"
torch-ver: "2.5"
# how long to run the job before automatically cancelling
timeoutInMinutes: "180"
# how much time to give 'run always even if cancelled tasks' before stopping them
@@ -70,6 +71,11 @@ jobs:
CUDA_version_mm="${CUDA_version//'.'/''}"
echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$CUDA_version_mm"
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${CUDA_version_mm}/torch_stable.html"
mkdir -p $(TORCH_HOME)
mkdir -p $(TRANSFORMERS_CACHE)
mkdir -p $(HF_HOME)
mkdir -p $(HF_HUB_CACHE)
mkdir -p $(PIP_CACHE_DIR)
displayName: "set Env. vars"
- bash: |
echo "##vso[task.setvariable variable=ALLOW_SKIP_IF_OUT_OF_MEMORY]1"
@@ -114,7 +120,7 @@ jobs:
- bash: |
python .github/assistant.py set-oldest-versions
condition: eq(variables['torch-ver'], '1.10.2')
condition: eq(variables['torch-ver'], '2.0')
displayName: "Setting oldest versions"
- bash: |
@@ -135,6 +141,21 @@ jobs:
displayName: "Show caches"
- bash: |
python -m pytest torchmetrics --cov=torchmetrics \
--timeout=240 --durations=50 \
--reruns 2 --reruns-delay 1
# --numprocesses=5 --dist=loadfile
env:
DOCTEST_DOWNLOAD_TIMEOUT: "180"
SKIP_SLOW_DOCTEST: "1"
workingDirectory: "src/"
timeoutInMinutes: "40"
displayName: "DocTesting"
- bash: |
df -h .
ls -lh $(CACHED_REFERENCES)
ls -lh tests/
# Check if the cached references file exists
if [ -f $(CACHED_REFERENCES) ]; then
# Create a directory if it doesn't already exist
@@ -145,25 +166,12 @@ jobs:
else
echo "The file '$(CACHED_REFERENCES)' does not exist."
fi
du -h --max-depth=1 tests/
timeoutInMinutes: "5"
# if pull request, copy the cache to the tests folder to be used in the next steps
condition: eq(variables['Build.Reason'], 'PullRequest')
continueOnError: "true"
displayName: "Copy/Unzip cached refs"
- bash: |
python -m pytest torchmetrics --cov=torchmetrics \
--timeout=240 --durations=50 \
--reruns 2 --reruns-delay 1
# --numprocesses=5 --dist=loadfile
env:
DOCTEST_DOWNLOAD_TIMEOUT: "180"
SKIP_SLOW_DOCTEST: "1"
workingDirectory: "src/"
timeoutInMinutes: "40"
displayName: "DocTesting"
- bash: |
wget https://pl-public-data.s3.amazonaws.com/metrics/data.zip
unzip -o data.zip
@@ -172,14 +180,15 @@ jobs:
displayName: "Pull testing data from S3"
- bash: |
du -h --max-depth=1 .
python -m pytest $(TEST_DIRS) \
-m "not DDP" --numprocesses=5 --dist=loadfile \
--cov=torchmetrics --timeout=240 --durations=100 \
--reruns 3 --reruns-delay 1
workingDirectory: "tests/"
# skip for PRs if there is nothing to test; note that outside PRs the default is 'unittests'
condition: and(succeeded(), ne(variables['TEST_DIRS'], ''))
timeoutInMinutes: "90"
timeoutInMinutes: "95"
displayName: "UnitTesting common"
- bash: |
@@ -191,13 +200,14 @@ jobs:
workingDirectory: "tests/"
# skip for PRs if there is nothing to test; note that outside PRs the default is 'unittests'
condition: and(succeeded(), ne(variables['TEST_DIRS'], ''))
timeoutInMinutes: "90"
timeoutInMinutes: "95"
displayName: "UnitTesting DDP"
- bash: |
du -h --max-depth=1 tests/
# archive the potentially updated cache to the machine filesystem so it can be reused by subsequent jobs
zip -q -r $(CACHED_REFERENCES) tests/_cache-references
du -h --max-depth=1 tests/
ls -lh $(CACHED_REFERENCES)
# set as an extra step to not pollute the general cache when a job fails or crashes
# so do this update only with successful jobs on master
condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'))
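
Together with the earlier "Copy/Unzip cached refs" step, this forms a zip round-trip for `tests/_cache-references`. A rough Python sketch of both halves, assuming the same paths as the pipeline (parts of the original bash are truncated above, so some details here are assumptions):

```python
import os
import zipfile

CACHED = "/var/tmp/cached-references.zip"  # $(CACHED_REFERENCES)

def restore_refs() -> None:
    """Before testing: unpack the machine-local cache, if present."""
    if os.path.isfile(CACHED):
        with zipfile.ZipFile(CACHED) as zf:
            zf.extractall(".")  # members already carry the tests/ prefix
    else:
        print(f"The file '{CACHED}' does not exist.")

def archive_refs(src: str = "tests/_cache-references") -> None:
    """After a successful non-PR run: re-archive the possibly updated refs."""
    with zipfile.ZipFile(CACHED, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, _dirs, files in os.walk(src):
            for name in files:
                path = os.path.join(root, name)
                zf.write(path, arcname=path)
```
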
@@ -212,7 +222,6 @@ jobs:
python -m coverage xml
python -m codecov --token=$(CODECOV_TOKEN) --name="GPU-coverage" \
--commit=$(Build.SourceVersion) --flags=gpu,unittest --env=linux,azure
ls -l
workingDirectory: "tests/"
# skip for PRs if there is nothing to test; note that outside PRs the default is 'unittests'
condition: and(succeeded(), ne(variables['TEST_DIRS'], ''))
12 changes: 6 additions & 6 deletions .github/CONTRIBUTING.md
@@ -16,13 +16,13 @@ We are always looking for help implementing new features or fixing bugs.
- Add details on how to reproduce the issue - a minimal test case is always best, colab is also great.
Note that the sample code should be minimal and, if needed, use publicly available data.

1. Try to fix it or recommend a solution. We highly recommend using a test-driven approach:
2. Try to fix it or recommend a solution. We highly recommend using a test-driven approach:

- Convert your minimal code example to a unit/integration test with assert on expected results.
- Start by debugging the issue... You can run just this particular test in your IDE and draft a fix.
- Verify that your test case fails on the master branch and only passes with the fix applied.

1. Submit a PR!
3. Submit a PR!

_**Note**, even if you do not find the solution, sending a PR with a test covering the issue is a valid contribution and we can
help you or finish it with you :\]_
@@ -31,14 +31,14 @@ help you or finish it with you :\]_

1. Submit a github issue - describe what is the motivation of such feature (adding the use case or an example is helpful).

1. Let's discuss to determine the feature scope.
2. Let's discuss to determine the feature scope.

1. Submit a PR! We recommend a test-driven approach to adding new features as well:
3. Submit a PR! We recommend a test-driven approach to adding new features as well:

- Write a test for the functionality you want to add.
- Write the functional code until the test passes.

1. Add/update the relevant tests!
4. Add/update the relevant tests!

- [This PR](https://github.com/Lightning-AI/torchmetrics/pull/98) is a good example for adding a new metric

@@ -71,7 +71,7 @@ In case you adding new dependencies, make sure that they are compatible with the
### Coding Style

1. Use f-strings for output formatting (except logging, where we stay with lazy `logging.info("Hello %s!", name)`).
1. You can use `pre-commit` to make sure your code style is correct.
2. You can use `pre-commit` to make sure your code style is correct.
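
A two-line illustration of the f-string/lazy-logging rule (minimal sketch, not repo code):

```python
import logging

name = "world"
print(f"Hello {name}!")          # f-string for regular output formatting
logging.info("Hello %s!", name)  # lazy placeholder: only formatted if the record is emitted
```
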

### Documentation

12 changes: 7 additions & 5 deletions .github/ISSUE_TEMPLATE/bug_report.md
@@ -19,8 +19,10 @@ Steps to reproduce the behavior...
<details>
<summary>Code sample</summary>

<!-- Ideally attach a minimal code sample to reproduce the described issue.
Minimal means having the shortest code but still preserving the bug. -->
```python
# Ideally attach a minimal code sample to reproduce the described issue.
# Minimal means having the shortest code but still preserving the bug.
```

</details>

@@ -30,9 +32,9 @@

### Environment

- TorchMetrics version (and how you installed TM, e.g. `conda`, `pip`, build from source):
- Python & PyTorch Version (e.g., 1.0):
- Any other relevant information such as OS (e.g., Linux):
- TorchMetrics version (if built from source, add commit SHA): ???
- Python & PyTorch Version (e.g., 1.0): ???
- Any other relevant information such as OS (e.g., Linux): ???

### Additional context

6 changes: 3 additions & 3 deletions .github/ISSUE_TEMPLATE/config.yml
@@ -3,6 +3,6 @@ contact_links:
- name: Ask a Question
url: https://github.com/Lightning-AI/torchmetrics/discussions/new
about: Ask and answer TorchMetrics related questions
- name: 💬 Slack
url: https://app.slack.com/client/TR9DVT48M/CQXV8BRH9/thread/CQXV8BRH9-1591382895.254600
about: Chat with our community
- name: 💬 Chat with us
url: https://discord.gg/VptPCZkGNa
about: Live chat with experts, engineers, and users in our Discord community.
28 changes: 21 additions & 7 deletions .github/assistant.py
@@ -31,8 +31,8 @@
"3.10": "1.11",
"3.11": "1.13",
}
_path = lambda *ds: os.path.join(_PATH_ROOT, *ds)
REQUIREMENTS_FILES = (*glob.glob(_path("requirements", "*.txt")), _path("requirements.txt"))
_path_root = lambda *ds: os.path.join(_PATH_ROOT, *ds)
REQUIREMENTS_FILES = (*glob.glob(_path_root("requirements", "*.txt")), _path_root("requirements.txt"))


class AssistantCLI:
@@ -73,21 +73,35 @@ def set_min_torch_by_python(fpath: str = "requirements/base.txt") -> None:
fp.write(requires)

@staticmethod
def replace_min_requirements(fpath: str) -> None:
"""Replace all `>=` by `==` in given file."""
logging.info(f"processing: {fpath}")
def _replace_requirement(fpath: str, old_str: str = "", new_str: str = "") -> None:
"""Replace all strings given file."""
logging.info(f"processing '{old_str}' -> '{new_str}': {fpath}")
with open(fpath, encoding="utf-8") as fp:
req = fp.read()
req = req.replace(">=", "==")
req = req.replace(old_str, new_str)
with open(fpath, "w", encoding="utf-8") as fp:
fp.write(req)

@staticmethod
def replace_str_requirements(old_str: str, new_str: str, req_files: List[str] = REQUIREMENTS_FILES) -> None:
"""Replace a particular string in all requirements files."""
if isinstance(req_files, str):
req_files = [req_files]
for fpath in req_files:
AssistantCLI._replace_requirement(fpath, old_str=old_str, new_str=new_str)

@staticmethod
def replace_min_requirements(fpath: str) -> None:
"""Replace all `>=` by `==` in given file."""
AssistantCLI._replace_requirement(fpath, old_str=">=", new_str="==")

@staticmethod
def set_oldest_versions(req_files: List[str] = REQUIREMENTS_FILES) -> None:
"""Set the oldest version for requirements."""
AssistantCLI.set_min_torch_by_python()
if isinstance(req_files, str):
req_files = [req_files]
for fpath in req_files:
logging.info(f"processing req: `{fpath}`")
AssistantCLI.replace_min_requirements(fpath)

@staticmethod
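
For context, the refactor above routes both `replace_min_requirements` and the new `replace_str_requirements` through the shared `_replace_requirement` helper. A usage sketch with made-up argument values (the import assumes `.github/` is importable; the CLI form elsewhere in this diff is `python .github/assistant.py <command>`):

```python
# Illustrative direct use of the refactored helpers (values are made up).
from assistant import AssistantCLI  # assumes .github/ is on sys.path

# pin every '>=' spec to '==' in a single file
AssistantCLI.replace_min_requirements("requirements/base.txt")

# swap one dependency spec across all requirements files
AssistantCLI.replace_str_requirements(old_str="torch >=1.10.0", new_str="torch >=2.0.0")
```
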
12 changes: 6 additions & 6 deletions .github/workflows/ci-checks.yml
@@ -13,19 +13,19 @@ concurrency:

jobs:
check-code:
uses: Lightning-AI/utilities/.github/workflows/[email protected].3.post0
uses: Lightning-AI/utilities/.github/workflows/[email protected].7
with:
actions-ref: v0.11.3.post0
actions-ref: v0.11.7
extra-typing: "typing"

check-schema:
uses: Lightning-AI/utilities/.github/workflows/[email protected].3.post0
uses: Lightning-AI/utilities/.github/workflows/[email protected].7

check-package:
if: github.event.pull_request.draft == false
uses: Lightning-AI/utilities/.github/workflows/[email protected].3.post0
uses: Lightning-AI/utilities/.github/workflows/[email protected].7
with:
actions-ref: v0.11.3.post0
actions-ref: v0.11.7
artifact-name: dist-packages-${{ github.sha }}
import-name: "torchmetrics"
testing-matrix: |
@@ -35,7 +35,7 @@ jobs:
}
check-md-links:
uses: Lightning-AI/utilities/.github/workflows/[email protected].3.post0
uses: Lightning-AI/utilities/.github/workflows/[email protected].7
with:
base-branch: master
config-file: ".github/markdown-links-config.json"
2 changes: 2 additions & 0 deletions .github/workflows/ci-integrate.yml
@@ -53,6 +53,8 @@ jobs:

- name: source caching
uses: ./.github/actions/pull-caches
with:
requires: ${{ matrix.requires }}
- name: set oldest if/only for integrations
if: matrix.requires == 'oldest'
run: python .github/assistant.py set-oldest-versions --req_files='["requirements/_integrate.txt"]'
(The remaining changed files of this commit are not shown.)