diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py index 5088be2020738..a76e81246798c 100644 --- a/.actions/setup_tools.py +++ b/.actions/setup_tools.py @@ -94,11 +94,10 @@ def load_readme_description(path_dir: str, homepage: str, version: str) -> str: text = text.replace("pytorch-lightning.readthedocs.io/en/stable/", f"pytorch-lightning.readthedocs.io/en/{version}") # codecov badge text = text.replace("/branch/master/graph/badge.svg", f"/release/{version}/graph/badge.svg") - # replace github badges for release ones + # github actions badge text = text.replace("badge.svg?branch=master&event=push", f"badge.svg?tag={version}") - # Azure... + # azure pipelines badge text = text.replace("?branchName=master", f"?branchName=refs%2Ftags%2F{version}") - text = re.sub(r"\?definitionId=\d+&branchName=master", f"?definitionId=2&branchName=refs%2Ftags%2F{version}", text) skip_begin = r"" skip_end = r"" diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index ac5ca6f60a6b4..0de590f2c54a6 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -28,7 +28,7 @@ jobs: cancelTimeoutInMinutes: "2" pool: azure-jirka-spot container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index f37c17613affc..683212cd55d4b 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -26,7 +26,7 @@ jobs: strategy: matrix: 'PyTorch - stable': - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" # how long to run the job before automatically cancelling timeoutInMinutes: "80" # how much time to give 'run always even if cancelled tasks' before stopping them @@ -44,7 +44,7 @@ jobs: - bash: | 
CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' echo $CHANGED_FILES > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES @@ -75,7 +75,7 @@ jobs: CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0" pip install -e .[strategies] - pip install deepspeed>0.6.4 # TODO: remove when docker images are upgraded + pip install -U deepspeed # TODO: remove when docker images are upgraded pip install --requirement requirements/pytorch/devel.txt pip list env: @@ -119,15 +119,6 @@ jobs: timeoutInMinutes: "35" condition: eq(variables['continue'], '1') - - bash: bash run_standalone_tasks.sh - workingDirectory: tests/tests_pytorch - env: - PL_USE_MOCKED_MNIST: "1" - PL_RUN_CUDA_TESTS: "1" - displayName: 'Testing: PyTorch standalone tasks' - timeoutInMinutes: "10" - condition: eq(variables['continue'], '1') - - bash: | python -m coverage report python -m coverage xml diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 0fe790310f247..f71844e9664fe 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,14 @@ blank_issues_enabled: false contact_links: - - name: Ask a Question + - name: ❓ Ask a Question url: https://github.com/Lightning-AI/lightning/discussions/new - about: Ask and answer Lightning related questions - - name: 💬 Slack + about: Ask and answer Lightning related questions. + - name: 💬 Chat with us url: https://www.pytorchlightning.ai/community - about: Chat with our community + about: Live chat with experts, engineers, and users in our Slack community. 
+ - name: 📖 Read the documentation + url: https://lightning.ai/lightning-docs/ + about: Please consult the documentation before opening any issues! + - name: 🙋 Contact us about professional services + url: https://lightning.ai + about: Contact the Lightning.ai sales team for paid support. diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml new file mode 100644 index 0000000000000..c2654eddd7ca1 --- /dev/null +++ b/.github/checkgroup.yml @@ -0,0 +1,173 @@ +custom_service_name: "Lightning CI required checker" +subprojects: + - id: "CI: CircleCI" + paths: + - ".circleci/**" + checks: + - "test-on-tpus" + + - id: "CI: Azure" + paths: + - ".azure/**" + checks: + - "pytorch-lightning (GPUs)" + - "pytorch-lightning (GPUs) (testing PyTorch - stable)" + - "pytorch-lightning (HPUs)" + - "pytorch-lightning (IPUs)" + + - id: "pytorch_lightning" + paths: + # all examples don't need to be added because they aren't used in CI, but these are + - "examples/run_ddp_examples.sh" + - "examples/convert_from_pt_to_pl/**" + - "examples/run_pl_examples.sh" + - "examples/pl_basics/backbone_image_classifier.py" + - "examples/pl_basics/autoencoder.py" + - "examples/pl_loops/mnist_lite.py" + - "examples/pl_fault_tolerant/automatic.py" + - "examples/test_pl_examples.py" + - "examples/pl_integrations/dali_image_classifier.py" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - ".github/workflows/ci-pytorch*.yml" + - ".github/workflows/docs-*.yml" + checks: + - "conda (3.8, 1.10)" + - "conda (3.8, 1.9)" + - "conda (3.9, 1.11)" + - "conda (3.9, 1.12)" + - "cpu (macOS-11, 3.10, latest, stable)" + - "cpu (macOS-11, 3.7, latest, stable)" + - "cpu (macOS-11, 3.7, oldest, stable)" + - "cpu (ubuntu-20.04, 3.10, latest, stable)" + - "cpu (ubuntu-20.04, 3.7, latest, stable)" + - "cpu (ubuntu-20.04, 3.7, oldest, stable)" + - "cpu (windows-2022, 3.10, latest, stable)" + - "cpu (windows-2022, 3.7, latest, stable)" + 
- "cpu (windows-2022, 3.7, oldest, stable)" + - "doctest (pytorch)" + - "make-docs (pytorch)" + - "mypy" + - "PR Gatekeeper (pytorch)" + - "pytorch-lightning (GPUs)" + - "pytorch-lightning (GPUs) (testing PyTorch - stable)" + - "pytorch-lightning (HPUs)" + - "pytorch-lightning (IPUs)" + - "slow (macOS-11, 3.7, 1.11)" + - "slow (ubuntu-20.04, 3.7, 1.11)" + - "slow (windows-2022, 3.7, 1.11)" + - "test-on-tpus" + + - id: "pytorch_lightning: Docs" + paths: + - "docs/source-pytorch/**" + - ".github/workflows/docs-*.yml" + - "requirements/docs.txt" + - "requirements/pytorch/**" + checks: + - "doctest (pytorch)" + - "make-docs (pytorch)" + + - id: "pytorch_lightning: Docker" + paths: + - "dockers/**" + - "!dockers/README.md" + - "requirements.txt" + - "requirements/*.txt" + - "requirements/pytorch/*" + - "environment.yml" + - ".github/workflows/*docker*.yml" + - "setup.py" + checks: + - "build-conda (3.8, 1.10)" + - "build-conda (3.8, 1.9)" + - "build-conda (3.9, 1.11)" + - "build-conda (3.9, 1.12)" + - "build-cuda (3.8, 1.9, 11.1.1)" + - "build-cuda (3.9, 1.10, 11.3.1)" + - "build-cuda (3.9, 1.11, 11.3.1)" + - "build-cuda (3.9, 1.12, 11.3.1)" + - "build-cuda (3.9, 1.9, 11.1.1)" + - "build-hpu (1.5.0, 1.11.0)" + - "build-ipu (3.9, 1.9)" + - "build-NGC" + - "build-pl (3.9, 1.10, 11.3.1)" + - "build-pl (3.9, 1.11, 11.3.1)" + - "build-pl (3.9, 1.12, 11.3.1)" + - "build-pl (3.9, 1.9, 11.1.1)" + - "build-xla (3.7, 1.12)" + + - id: "pytorch_lightning: mypy" + paths: + - ".github/workflows/code-checks.yml" + - "pyproject.toml" # includes mypy config + checks: + - "mypy" + + - id: "lightning_app" + paths: + - ".github/workflows/ci-app*.yml" + - "requirements/app/**" + - "src/lightning_app/**" + - "tests/tests_app/**" + - "tests/tests_app_examples/**" + - "tests/tests_clusters/**" + # the examples are used in the app CI + - "examples/app_*" + checks: + - "Cloud Test (boring_app)" + - "Cloud Test (collect_failures)" + - "Cloud Test (commands_and_api)" + - "Cloud Test 
(custom_work_dependencies)" + - "Cloud Test (drive)" + - "Cloud Test (idle_timeout)" + - "Cloud Test (payload)" + - "Cloud Test (template_jupyterlab)" + - "Cloud Test (template_react_ui)" + - "Cloud Test (template_streamlit_ui)" + - "Cloud Test (v0_app)" + - "doctest (app)" + - "make-docs (app)" + - "pytest (macOS-11, 3.8, latest)" + - "pytest (macOS-11, 3.8, oldest)" + - "pytest (ubuntu-20.04, 3.8, latest)" + - "pytest (ubuntu-20.04, 3.8, oldest)" + - "pytest (windows-2022, 3.8, latest)" + - "pytest (windows-2022, 3.8, oldest)" + + - id: "lightning_app: Docs" + paths: + - "docs/source-app/**" + - ".github/workflows/docs-*.yml" + - "requirements/docs.txt" + - "requirements/app/**" + checks: + - "doctest (app)" + - "make-docs (app)" + + - id: "install" + paths: + - ".actions/setup_tools.py" + - ".github/workflows/ci-pkg-install.yml" + - "setup.py" + - "src/lightning/**" + # all __about__, __version__, __setup__ + - "src/*/__*.py" + checks: + - "install-meta-pypi (macOS-11, 3.8)" + - "install-meta-pypi (ubuntu-20.04, 3.8)" + - "install-meta-pypi (windows-2022, 3.8)" + - "install-meta-src (macOS-11, 3.8)" + - "install-meta-src (macOS-11, lightning, 3.8)" + - "install-meta-src (ubuntu-20.04, 3.8)" + - "install-meta-src (ubuntu-20.04, lightning, 3.8)" + - "install-meta-src (windows-2022, 3.8)" + - "install-meta-src (windows-2022, lightning, 3.8)" + - "install-standalone (macOS-11, app, 3.8)" + - "install-standalone (macOS-11, pytorch, 3.8)" + - "install-standalone (ubuntu-20.04, app, 3.8)" + - "install-standalone (ubuntu-20.04, pytorch, 3.8)" + - "install-standalone (windows-2022, app, 3.8)" + - "install-standalone (windows-2022, pytorch, 3.8)" diff --git a/.github/workflows/README.md b/.github/workflows/README.md index f559551e1237f..4ed903c0f3a93 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -4,16 +4,16 @@ ## Unit and Integration Testing -| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | -| 
-------------------------- | ----------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------ | ------------------- | -| Test full | .github/workflows/ci_test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.9), (3.7, 1.12), (3.10, 1.12) | linux, mac, windows | -| Test with Conda | .github/workflows/ci_test-conda.yml | Same as ci_test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.8), (3.8, 1.9), (3.8, 1.10), (3.9, 1.12) | linux | -| Test slow | .github/workflows/ci_test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.8) | linux, mac, windows | -| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux | -| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux | -| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux | -| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux | -| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. 
| TPU | (3.7, 1.12) | linux | +| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | +| -------------------------- | ------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------- | ------------------- | +| Test PyTorch full | .github/workflows/ci-pytorch-test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.9), (3.7, 1.12), (3.9, 1.9), (3.9, 1.12) | linux, mac, windows | +| Test PyTorch with Conda | .github/workflows/ci-pytorch-test-conda.yml | Same as ci-pytorch-test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.9), (3.8, 1.10), (3.8, 1.11), (3.9, 1.12) | linux | +| Test slow | .github/workflows/ci-pytorch-test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.11) | linux, mac, windows | +| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux | +| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux | +| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux | +| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux | +| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. 
| TPU | (3.7, 1.12) | linux | - \*Accelerators used in CI - GPU: 2 x NVIDIA Tesla V100 @@ -33,15 +33,15 @@ | --------------------------------- | ----------------------------------------------------------------------------------------- | | .codecov.yml | Measure test coverage with [codecov.io](https://app.codecov.io/gh/Lightning-AI/lightning) | | .github/workflows/code-checks.yml | Check Python typing with [MyPy](https://mypy.readthedocs.io/en/stable/). | -| .github/workflows/ci_schema.yml | Validate the syntax of workflow files. | +| .github/workflows/ci-schema.yml | Validate the syntax of workflow files. | ## Others -| workflow file | action | -| -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| .github/workflows/ci_dockers.yml | Build docker images used for testing in CI without pushing to the [Docker Hub](https://hub.docker.com/r/pytorchlightning/pytorch_lightning). Publishing these built images takes place in `.github/workflows/release-docker.yml` which only runs in master. | -| .github/workflows/ci_pkg-install.yml | Test if pytorch-lightning is successfully installed using pip. | -| .github/workflows/events-recurrent.yml | Terminate TPU jobs that live more than one hour to avoid possible resource exhaustion due to hangs. | +| workflow file | action | +| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| .github/workflows/cicd-pytorch-dockers.yml | Build docker images used for testing in CI. If run on nightly schedule, push to the [Docker Hub](https://hub.docker.com/r/pytorchlightning/pytorch_lightning). 
| +| .github/workflows/ci-pkg-install.yml | Test if pytorch-lightning is successfully installed using pip. | +| .github/workflows/events-recurrent.yml | Terminate TPU jobs that live more than one hour to avoid possible resource exhaustion due to hangs. | ## Deployment @@ -60,4 +60,4 @@ | .github/stale.yml | Close inactive issues/PRs sometimes after adding the "won't fix" label to them. | | .github/workflows/probot-auto-cc.yml, .github/lightning-probot.yml | Notify maintainers of interest depending on labels added to an issue We utilize lightning-probot forked from PyTorch’s probot. | | .pre-commit-config.yaml | pre-commit.ci runs a set of linters and formatters, such as black, flake8 and isort. When formatting is applied, the bot pushes a commit with its change. This configuration is also used for running pre-commit locally. | -| .github/workflows/ci_pr-gatekeeper.yml | Prevent PRs from merging into master without any Grid.ai employees’ approval. | +| .github/workflows/ci-pr-gatekeeper.yml | Prevent PRs from merging into master without any Grid.ai employees’ approval. 
| diff --git a/.github/workflows/ci-app_cloud_e2e_test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml similarity index 99% rename from .github/workflows/ci-app_cloud_e2e_test.yml rename to .github/workflows/ci-app-cloud-e2e-test.yml index 3ad455650a117..81d5e70441771 100644 --- a/.github/workflows/ci-app_cloud_e2e_test.yml +++ b/.github/workflows/ci-app-cloud-e2e-test.yml @@ -25,7 +25,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: Set up Python 3.8 - uses: actions/setup-python@v4 + uses: actions/setup-python@v2 with: python-version: "3.8" diff --git a/.github/workflows/ci-app_examples.yml b/.github/workflows/ci-app-examples.yml similarity index 98% rename from .github/workflows/ci-app_examples.yml rename to .github/workflows/ci-app-examples.yml index ec8becd5f70d1..01570f59c2c77 100644 --- a/.github/workflows/ci-app_examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] python-version: [3.8] requires: ["oldest", "latest"] diff --git a/.github/workflows/ci-app_tests.yml b/.github/workflows/ci-app-tests.yml similarity index 96% rename from .github/workflows/ci-app_tests.yml rename to .github/workflows/ci-app-tests.yml index 1678dab257301..fe3cc36dc16d3 100644 --- a/.github/workflows/ci-app_tests.yml +++ b/.github/workflows/ci-app-tests.yml @@ -21,7 +21,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] python-version: [3.8] requires: ["oldest", "latest"] @@ -126,7 +126,7 @@ jobs: # - name: Clone Quick Start Example Repo # uses: actions/checkout@v3 # # TODO: this needs to be git submodule -# if: matrix.os == 'windows-2019' # because the install doesn't work on windows +# if: matrix.os == 'windows-2022' # because the install doesn't work on windows # with: # repository: Lightning-AI/lightning-quick-start # ref: 'main' @@ -134,6 
+134,6 @@ jobs: # # - name: Lightning Install quick-start # shell: bash -# if: matrix.os != 'windows-2019' # because the install doesn't work on windows +# if: matrix.os != 'windows-2022' # because the install doesn't work on windows # run: | # python -m lightning install app lightning/quick-start -y diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci-pkg-install.yml similarity index 95% rename from .github/workflows/ci_pkg-install.yml rename to .github/workflows/ci-pkg-install.yml index 342e027b07cfe..a9fdd36693a67 100644 --- a/.github/workflows/ci_pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -33,7 +33,7 @@ jobs: fail-fast: true max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] pkg: ["app", "pytorch"] python-version: [3.8] # , 3.9 @@ -67,7 +67,7 @@ jobs: fail-fast: false # max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] pkg: ["", "lightning"] python-version: [3.8] # , 3.9 @@ -100,7 +100,7 @@ jobs: fail-fast: false # max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] python-version: [3.8] # , 3.9 steps: diff --git a/.github/workflows/ci_pr-gatekeeper.yml b/.github/workflows/ci-pr-gatekeeper.yml similarity index 100% rename from .github/workflows/ci_pr-gatekeeper.yml rename to .github/workflows/ci-pr-gatekeeper.yml diff --git a/.github/workflows/cicd-pytorch_dockers.yml b/.github/workflows/ci-pytorch-dockers.yml similarity index 81% rename from .github/workflows/cicd-pytorch_dockers.yml rename to .github/workflows/ci-pytorch-dockers.yml index a6ba2ac4aa5f4..a05dbbb5bc8ef 100644 --- a/.github/workflows/cicd-pytorch_dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -8,8 +8,9 @@ on: paths: - "dockers/**" - "!dockers/README.md" - - "requirements/**" - "requirements.txt" + - "requirements/*.txt" + - "requirements/pytorch/*" - 
"environment.yml" - ".github/workflows/*docker*.yml" - "setup.py" @@ -29,17 +30,22 @@ jobs: strategy: fail-fast: false matrix: - # the config used in '.azure-pipelines/gpu-tests.yml' since the Dockerfile uses the cuda image - python_version: ["3.9"] - pytorch_version: ["1.12"] + include: + # We only release one docker image per PyTorch version. + # The matrix here is the same as the one in release-docker.yml. + - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/release/Dockerfile push: false # pushed in release-docker.yml only when PL is released timeout-minutes: 50 @@ -53,14 +59,14 @@ jobs: python_version: ["3.7"] xla_version: ["1.12"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -85,30 +91,31 @@ jobs: fail-fast: false matrix: include: - # the config used in '.azure-pipelines/gpu-tests.yml' - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1", ubuntu_version: "20.04"} - # latest (used in Tutorials) - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1", 
ubuntu_version: "20.04"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1.1", ubuntu_version: "20.04"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"} + # These are the base images for PL release docker images, + # so include at least all of the combinations in release-dockers.yml. + - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} + # Used in Lightning-AI/tutorials + - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} CUDA_VERSION=${{ matrix.cuda_version }} - UBUNTU_VERSION=${{ matrix.ubuntu_version }} file: dockers/base-cuda/Dockerfile push: ${{ env.PUSH_TO_HUB }} - tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} timeout-minutes: 95 - uses: ravsamhq/notify-slack-action@v1 if: failure() && env.PUSH_TO_HUB == 'true' @@ -126,25 +133,23 @@ jobs: fail-fast: false matrix: include: - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.1.1"} - - 
{python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - # nightly: add when there's a release candidate - # - {python_version: "3.9", pytorch_version: "1.12"} + - {python_version: "3.8", pytorch_version: "1.9"} + - {python_version: "3.8", pytorch_version: "1.10"} + - {python_version: "3.9", pytorch_version: "1.11"} + - {python_version: "3.9", pytorch_version: "1.12"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/base-conda/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} @@ -168,14 +173,14 @@ jobs: # the config used in 'dockers/ci-runner-ipu/Dockerfile' - {python_version: "3.9", pytorch_version: "1.9"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -184,7 +189,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} timeout-minutes: 100 - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -199,7 
+204,7 @@ jobs: status: ${{ job.status }} token: ${{ secrets.GITHUB_TOKEN }} notification_title: ${{ format('IPU; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }} - message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01BULUS2BG>' # SeanNaren + message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>' # kaushikb11 env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} @@ -212,14 +217,14 @@ jobs: # the config used in 'dockers/ci-runner-hpu/Dockerfile' - {gaudi_version: "1.5.0", pytorch_version: "1.11.0"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | DIST=latest @@ -243,10 +248,10 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Build Conda Docker # publish master/release - uses: docker/build-push-action@v2 + uses: docker/build-push-action@v3 with: file: dockers/nvidia/Dockerfile push: false diff --git a/.github/workflows/ci-pytorch_test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml similarity index 97% rename from .github/workflows/ci-pytorch_test-conda.yml rename to .github/workflows/ci-pytorch-test-conda.yml index 777ec2af759a0..3498f087ef0aa 100644 --- a/.github/workflows/ci-pytorch_test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -22,13 +22,11 @@ jobs: strategy: fail-fast: false matrix: - # nightly: add when there's a release candidate include: - {python-version: "3.8", pytorch-version: "1.9"} - {python-version: "3.8", pytorch-version: "1.10"} - {python-version: "3.9", pytorch-version: "1.11"} - {python-version: "3.9", 
pytorch-version: "1.12"} - timeout-minutes: 30 steps: @@ -39,7 +37,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v24 + uses: tj-actions/changed-files@v23.1 - name: Decide if the test should be skipped id: skip diff --git a/.github/workflows/ci-pytorch_test-full.yml b/.github/workflows/ci-pytorch-test-full.yml similarity index 97% rename from .github/workflows/ci-pytorch_test-full.yml rename to .github/workflows/ci-pytorch-test-full.yml index 445707d340c4b..173e2a44a61f4 100644 --- a/.github/workflows/ci-pytorch_test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -20,7 +20,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2019, macOS-11] + os: [ubuntu-20.04, windows-2022, macOS-11] python-version: ["3.7", "3.10"] # minimum, maximum requires: ["oldest", "latest"] release: ["stable"] @@ -39,13 +39,13 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v24 + uses: tj-actions/changed-files@v23.1 - name: Decide if the test should be skipped id: skip shell: bash -l {0} run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch_test-full.yml' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES @@ -59,7 +59,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} if: ${{ (steps.skip.outputs.continue == '1') }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ci-pytorch_test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml similarity index 95% rename from .github/workflows/ci-pytorch_test-slow.yml rename to .github/workflows/ci-pytorch-test-slow.yml index b3756bbe8c2f7..0bb9916ee302a 100644 --- 
a/.github/workflows/ci-pytorch_test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2019, macOS-11] + os: [ubuntu-20.04, windows-2022, macOS-11] # same config as '.azure-pipelines/gpu-tests.yml' python-version: ["3.7"] pytorch-version: ["1.11"] @@ -30,13 +30,13 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v24 + uses: tj-actions/changed-files@v23.1 - name: Decide if the test should be skipped id: skip shell: bash -l {0} run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch_test-slow.yml' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES @@ -48,7 +48,7 @@ jobs: echo "::set-output name=continue::1" fi - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v2 if: ${{ (steps.skip.outputs.continue == '1') }} with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci-schema.yml similarity index 100% rename from .github/workflows/ci_schema.yml rename to .github/workflows/ci-schema.yml diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 977118b644ef3..5b5a9aec778be 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -42,13 +42,13 @@ jobs: - name: Install dependencies env: FREEZE_REQUIREMENTS: 1 + PACKAGE_NAME: ${{ matrix.pkg }} run: | sudo apt-get update sudo apt-get install -y cmake pandoc pip --version - pip install -q fire # python -m pip install --upgrade --user pip - pip install -e . 
--quiet -r requirements/${{ matrix.pkg }}/base.txt -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + pip install -e . --quiet -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html pip install -r requirements/${{ matrix.pkg }}/devel.txt pip list shell: bash @@ -91,11 +91,12 @@ jobs: - name: Install dependencies env: FREEZE_REQUIREMENTS: 1 + PACKAGE_NAME: ${{ matrix.pkg }} run: | sudo apt-get update sudo apt-get install -y cmake pandoc pip --version - pip install -e . --quiet -r requirements/${{ matrix.pkg }}/base.txt -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + pip install -e . --quiet -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html # install Texlive, see https://linuxconfig.org/how-to-install-latex-on-ubuntu-20-04-focal-fossa-linux sudo apt-get update && sudo apt-get install -y texlive-latex-extra dvipng texlive-pictures pip list diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 9d87f1a582fb1..6901a24204683 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -1,6 +1,5 @@ name: Docker -# https://www.docker.com/blog/first-docker-github-action-is-here -# https://github.com/docker/build-push-action + on: push: branches: [master, "release/*"] @@ -15,8 +14,12 @@ jobs: strategy: fail-fast: false matrix: - python_version: ["3.7", "3.8", "3.9"] - pytorch_version: ["1.9", "1.10"] + include: + # We only release one docker image per PyTorch version. 
+ - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} steps: - name: Checkout uses: actions/checkout@v2 @@ -32,19 +35,29 @@ jobs: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: dockers/release/Dockerfile - build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} - tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" + build_args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} + LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} + tags: | + ${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} + latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} timeout-minutes: 55 - name: Publish Latest to Docker uses: docker/build-push-action@v1.1.0 - # only on releases and latest Python and PyTorch - if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.10' + # Only latest Python and PyTorch + if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12' with: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: dockers/release/Dockerfile - build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ 
steps.get_version.outputs.RELEASE_VERSION }} + build_args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} + LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} tags: "latest" timeout-minutes: 55 diff --git a/.gitignore b/.gitignore index 719f291a492ca..259d9f271189c 100644 --- a/.gitignore +++ b/.gitignore @@ -165,3 +165,9 @@ hars* artifacts/* *docs/examples* *docs/source-app/api* + +# tutorials +our_model.tar +test.png +saved_models +data/ diff --git a/README.md b/README.md index 2fef343425f17..9c03e3707ec24 100644 --- a/README.md +++ b/README.md @@ -80,21 +80,24 @@ ______________________________________________________________________ ## Continuous Integration -Lightning is rigorously tested across multiple GPUs, TPUs CPUs and against major Python and PyTorch versions. +Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs and against major Python and PyTorch versions.
Current build statuses
-| System / PyTorch ver. | 1.8 (LTS, min. req.) | 1.9 | 1.10 (latest) | -| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Linux py3.7 \[GPUs\*\*\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | -| Linux py3.8 (with Conda | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | -| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| OSX py3.{7,9} | - | - | 
[![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | +| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | +| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Linux py3.7 \[GPUs\*\*\] | - | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | +| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | +| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | +| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | 
[![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | - | +| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | +| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. 
TPU py3.7 means we support Colab and Kaggle env._ @@ -136,8 +139,8 @@ conda install pytorch-lightning -c conda-forge The actual status of 1.7 \[stable\] is the following: -[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml?query=branch%3Arelease%2Fpytorch) -[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml?query=branch%3Arelease%2Fpytorch) +[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml?query=branch%3Arelease%2Fpytorch) +[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml?query=branch%3Arelease%2Fpytorch) [![TPU tests](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=shield)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch) [![Check Docs](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml?query=branch%3Arelease%2Fpytorch) diff --git a/dockers/README.md b/dockers/README.md index 533c85739f528..b1ff9826b6c1f 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -1,36 +1,17 @@ # Docker images -## Builds images form attached Dockerfiles +## Build images from 
Dockerfiles You can build it on your own, note it takes lots of time, be prepared. ```bash -git clone -docker image build -t pytorch-lightning:latest -f dockers/conda/Dockerfile . -``` - -or with specific arguments - -```bash -git clone -docker image build \ - -t pytorch-lightning:base-cuda-py3.9-pt1.10 \ - -f dockers/base-cuda/Dockerfile \ - --build-arg PYTHON_VERSION=3.9 \ - --build-arg PYTORCH_VERSION=1.10 \ - . -``` +git clone https://github.com/Lightning-AI/lightning.git -or nightly version from Conda +# build with the default arguments +docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile . -```bash -git clone -docker image build \ - -t pytorch-lightning:base-conda-py3.9-pt1.11 \ - -f dockers/base-conda/Dockerfile \ - --build-arg PYTHON_VERSION=3.9 \ - --build-arg PYTORCH_VERSION=1.11 \ - . +# build with specific arguments +docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 . ``` To run your docker use @@ -49,7 +30,7 @@ docker image rm pytorch-lightning:latest ## Run docker image with GPUs -To run docker image with access to you GPUs you need to install +To run docker image with access to your GPUs, you need to install ```bash # Add the package repositories @@ -61,10 +42,10 @@ sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit sudo systemctl restart docker ``` -and later run the docker image with `--gpus all` so for example +and later run the docker image with `--gpus all`. For example, ``` -docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.10 +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 ``` ## Run Jupyter server @@ -73,15 +54,11 @@ Inspiration comes from https://u.group/thinking/how-to-put-jupyter-notebooks-in- 1. 
Build the docker image: ```bash - docker image build \ - -t pytorch-lightning:v1.3.1 \ - -f dockers/nvidia/Dockerfile \ - --build-arg LIGHTNING_VERSION=1.3.1 \ - . + docker image build -t pytorch-lightning:v1.6.5 -f dockers/nvidia/Dockerfile --build-arg LIGHTNING_VERSION=1.6.5 . ``` 1. start the server and map ports: ```bash - docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.3.1 + docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.6.5 ``` 1. Connect in local browser: - copy the generated path e.g. `http://hostname:8888/?token=0719fa7e1729778b0cec363541a608d5003e26d4910983c6` diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index cb393c91dfbe0..c39e66509188c 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -14,8 +14,9 @@ ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.11 +ARG CUDA_VERSION=11.3.1 -FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} +FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION} LABEL maintainer="Lightning-AI " diff --git a/docs/source-pytorch/api_references.rst b/docs/source-pytorch/api_references.rst index db4fc1e2c4cf8..ce7723e418e77 100644 --- a/docs/source-pytorch/api_references.rst +++ b/docs/source-pytorch/api_references.rst @@ -173,6 +173,7 @@ precision DeepSpeedPrecisionPlugin DoublePrecisionPlugin FullyShardedNativeMixedPrecisionPlugin + FullyShardedNativeNativeMixedPrecisionPlugin HPUPrecisionPlugin IPUPrecisionPlugin MixedPrecisionPlugin diff --git a/docs/source-pytorch/extensions/plugins.rst b/docs/source-pytorch/extensions/plugins.rst index a0dbefd141464..27aff0c11fdcb 100644 --- a/docs/source-pytorch/extensions/plugins.rst +++ b/docs/source-pytorch/extensions/plugins.rst @@ -56,6 +56,7 @@ The full list of built-in precision plugins is listed below. 
DeepSpeedPrecisionPlugin DoublePrecisionPlugin FullyShardedNativeMixedPrecisionPlugin + FullyShardedNativeNativeMixedPrecisionPlugin HPUPrecisionPlugin IPUPrecisionPlugin MixedPrecisionPlugin diff --git a/requirements/app/docs.txt b/requirements/app/docs.txt index 63ac1f289331f..c189d6034ab28 100644 --- a/requirements/app/docs.txt +++ b/requirements/app/docs.txt @@ -1,17 +1,8 @@ -sphinx>=4.0,<5.0 -myst-parser>=0.15,<0.17 -nbsphinx>=0.8.5, <=0.8.9 +-r ../docs.txt + ipython[notebook] ipython_genutils -pandoc>=1.0, <=2.2 -docutils>=0.16, <0.19 -sphinxcontrib-fulltoc>=1.0, <=1.2.0 -sphinxcontrib-mockautodoc +pytorch-lightning -https://storage.googleapis.com/grid-packages/lightning-ai-sphinx-theme/build-31-rc1.zip -sphinx-autodoc-typehints>=1.0,<1.15 # v1.15 failing on master (#11405) -sphinx-paramlinks>=0.5.1, <=0.5.4 -sphinx-togglebutton>=0.2, <=0.3.2 -sphinx-copybutton>=0.3, <=0.5.0 sphinx-autobuild -jinja2>=3.0.0,<3.1.0 +https://storage.googleapis.com/grid-packages/lightning-ai-sphinx-theme/build-31.3.zip diff --git a/requirements/docs.txt b/requirements/docs.txt new file mode 100644 index 0000000000000..1b00471602c60 --- /dev/null +++ b/requirements/docs.txt @@ -0,0 +1,13 @@ +sphinx>=4.0, <5.0 +myst-parser>=0.15, <0.17 +nbsphinx>=0.8.5, <=0.8.9 +pandoc>=1.0, <=2.2 +docutils>=0.16, <0.19 +sphinxcontrib-fulltoc>=1.0, <=1.2.0 +sphinxcontrib-mockautodoc +sphinx-autodoc-typehints>=1.11, <1.15 # strict; v1.15 failing on master (#11405) +sphinx-paramlinks>=0.5.1, <=0.5.4 +sphinx-togglebutton>=0.2, <=0.3.2 +sphinx-copybutton>=0.3, <=0.5.0 +sphinx-multiproject +jinja2>=3.0.0,<3.1.0 diff --git a/requirements/pytorch/docs.txt b/requirements/pytorch/docs.txt index 50e7c2049f6f6..474620b1e74b8 100644 --- a/requirements/pytorch/docs.txt +++ b/requirements/pytorch/docs.txt @@ -1,17 +1,6 @@ -sphinx>=4.0,<5.0 -myst-parser>=0.15,<0.17 -nbsphinx>=0.8.5, <=0.8.9 +-r ../docs.txt + ipython[notebook] -pandoc>=1.0, <=2.2 -docutils>=0.16, <0.19 -sphinxcontrib-fulltoc>=1.0, <=1.2.0 
-sphinxcontrib-mockautodoc pt-lightning-sphinx-theme @ https://github.com/Lightning-AI/lightning_sphinx_theme/archive/master.zip -sphinx-autodoc-typehints>=1.11,<1.15 # strict; v1.15 failing on master (#11405) -sphinx-paramlinks>=0.5.1, <=0.5.4 -sphinx-togglebutton>=0.2, <=0.3.2 -sphinx-copybutton>=0.3, <=0.5.0 -typing-extensions # already in `requirements.txt` but the docs CI job does not install it -jinja2>=3.0.0,<3.1.0 -r ../../_notebooks/.actions/requirements.txt diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index c386c5581cc42..20b6c1b8dbc12 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -7,5 +7,5 @@ torchtext>=0.10.*, <0.14.0 omegaconf>=2.0.5, <2.3.0 hydra-core>=1.0.5, <1.3.0 jsonargparse[signatures]>=4.12.0, <=4.12.0 -gcsfs>=2021.5.0, <2022.6.0 +gcsfs>=2021.5.0, <2022.8.0 rich>=10.14.0, !=10.15.0.a, <13.0.0 diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 4e916fbc6c61f..c5fc92a67a837 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment fairscale>=0.4.5, <=0.4.6 -deepspeed>=0.6.0, <0.7.0 +deepspeed>=0.6.0, <=0.7.0 # no need to install with [pytorch] as pytorch is already installed horovod>=0.21.2, !=0.24.0, <0.25.1 hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux' diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index c155400a3d35f..f8bd5793a0af6 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -10,7 +10,7 @@ mypy==0.971 # needed in tests cloudpickle>=1.3, <=2.1.0 scikit-learn>0.22.1, <=1.1.1 -onnxruntime<=1.12.0 +onnxruntime<1.13.0 psutil<=5.9.1 # for `DeviceStatsMonitor` pandas>1.0, <=1.4.3 # needed in benchmarks fastapi<=0.79.0 diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 
835838342e610..92913fcdf760f 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -4,6 +4,36 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.7.2] - 2022-08-16 + +### Added + +- Added `FullyShardedNativeNativeMixedPrecisionPlugin` to handle precision for `DDPFullyShardedNativeStrategy` ([#14092](https://github.com/Lightning-AI/lightning/pull/14092)) +- Added profiling to these hooks: `on_before_batch_transfer`, `transfer_batch_to_device`, `on_after_batch_transfer`, `configure_gradient_clipping`, `clip_gradients` ([#14069](https://github.com/Lightning-AI/lightning/pull/14069)) + +### Changed + +- Updated compatibility for LightningLite to run with the latest DeepSpeed 0.7.0 ([#13967](https://github.com/Lightning-AI/lightning/pull/13967)) +- Raised a `MisconfigurationException` if batch transfer hooks are overridden with `IPUAccelerator` ([#13961](https://github.com/Lightning-AI/lightning/pull/13961)) +- The default project name in `WandbLogger` is now "lightning_logs" ([#14145](https://github.com/Lightning-AI/lightning/pull/14145)) +- The `WandbLogger.name` property no longer returns the name of the experiment, and instead returns the project's name ([#14145](https://github.com/Lightning-AI/lightning/pull/14145)) + +### Fixed + +- Fixed a bug that caused spurious `AttributeError` when multiple `DataLoader` classes are imported ([#14117](https://github.com/Lightning-AI/lightning/pull/14117)) +- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) +- Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151)) +- Fixed epoch-end logging results not being reset after the end of the epoch
([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) +- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) +- Avoided false positive warning about using `sync_dist` when using torchmetrics ([#14143](https://github.com/Lightning-AI/lightning/pull/14143)) +- Avoided `metadata.entry_points` deprecation warning on Python 3.10 ([#14052](https://github.com/Lightning-AI/lightning/pull/14052)) +- Avoided raising the sampler warning if `num_replicas=1` ([#14097](https://github.com/Lightning-AI/lightning/pull/14097)) +- Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938)) +- Avoided requiring the FairScale package to use precision with the fsdp native strategy ([#14092](https://github.com/Lightning-AI/lightning/pull/14092)) +- Fixed an issue in which the default name for a run in `WandbLogger` would be set to the project name instead of a randomly generated string ([#14145](https://github.com/Lightning-AI/lightning/pull/14145)) +- Fixed not preserving set attributes on `DataLoader` and `BatchSampler` when instantiated inside `*_dataloader` hooks ([#14212](https://github.com/Lightning-AI/lightning/pull/14212)) + + ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md index eb1a42730b5f0..b57aea6fae147 100644 --- a/src/pytorch_lightning/README.md +++ b/src/pytorch_lightning/README.md @@ -78,17 +78,17 @@ Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs
-| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | -| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Linux py3.7 \[GPUs\*\*\] | - | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | -| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | - | -| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | -| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | - | -| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | -| Linux py3.{7,9} | - | - | 
[![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | +| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | +| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Linux py3.7 \[GPUs\*\*\] | - | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | +| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | +| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | +| Linux py3.8 (with Conda) | 
[![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | - | +| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | +| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. 
TPU py3.7 means we support Colab and Kaggle env._ @@ -130,8 +130,8 @@ conda install pytorch-lightning -c conda-forge The actual status of stable is the following: -[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml) -[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml) +[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) +[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) [![GPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=24&branchName=release%2Fpytorch) [![TPU](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch) [![IPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=release%2Fpytorch) diff --git a/src/pytorch_lightning/__version__.py b/src/pytorch_lightning/__version__.py index 116d5667841f3..2196826f840ed 100644 --- a/src/pytorch_lightning/__version__.py +++ b/src/pytorch_lightning/__version__.py @@ -1 +1 @@ -version = "1.7.1" +version = "1.7.2" diff --git a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py index 
20a3dcc3f0f26..6650bb3f0c479 100644 --- a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py +++ b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py @@ -16,7 +16,7 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ """ from copy import deepcopy -from typing import Any, Callable, cast, List, Optional, Union +from typing import Any, Callable, cast, Dict, List, Optional, Union import torch from torch import nn, Tensor @@ -24,6 +24,7 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks.callback import Callback +from pytorch_lightning.strategies import DDPFullyShardedStrategy, DeepSpeedStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.types import _LRScheduler, LRSchedulerConfig @@ -112,15 +113,22 @@ def __init__( if device is not None and not isinstance(device, (torch.device, str)): raise MisconfigurationException(f"device is expected to be a torch.device or a str. 
Found {device}") + self.n_averaged: Optional[torch.Tensor] = None self._swa_epoch_start = swa_epoch_start self._swa_lrs = swa_lrs self._annealing_epochs = annealing_epochs self._annealing_strategy = annealing_strategy self._avg_fn = avg_fn or self.avg_fn self._device = device - self._max_epochs: int - self._model_contains_batch_norm: bool + self._model_contains_batch_norm: Optional[bool] = None self._average_model: "pl.LightningModule" + self._initialized = False + self._swa_scheduler: Optional[_LRScheduler] = None + self._scheduler_state: Optional[Dict] = None + self._init_n_averaged = 0 + self._latest_update_epoch = -1 + self.momenta: Optional[Dict[nn.modules.batchnorm._BatchNorm, float]] = None + self._max_epochs: int @property def swa_start(self) -> int: @@ -147,6 +155,9 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - if len(trainer.lr_scheduler_configs) > 1: raise MisconfigurationException("SWA currently not supported for more than 1 `lr_scheduler`.") + if isinstance(trainer.strategy, (DDPFullyShardedStrategy, DeepSpeedStrategy)): + raise MisconfigurationException("SWA does not currently support sharded models.") + if isinstance(self._swa_epoch_start, float): self._swa_epoch_start = int(trainer.max_epochs * self._swa_epoch_start) @@ -158,8 +169,13 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - assert trainer.fit_loop.max_epochs is not None trainer.fit_loop.max_epochs += 1 + if self._scheduler_state is not None: + self._clear_schedulers(trainer) + def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - if trainer.current_epoch == self.swa_start: + if (not self._initialized) and (self.swa_start <= trainer.current_epoch <= self.swa_end): + self._initialized = True + # move average model to request device. 
self._average_model = self._average_model.to(self._device or pl_module.device) @@ -180,6 +196,17 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1, ), ) + if self._scheduler_state is not None: + # Restore scheduler state from checkpoint + self._swa_scheduler.load_state_dict(self._scheduler_state) + elif trainer.current_epoch != self.swa_start: + # Log a warning if we're initializing after start without any checkpoint data, + # as behaviour will be different compared to having checkpoint data. + rank_zero_warn( + "SWA is initializing after swa_start without any checkpoint data. " + "This may be caused by loading a checkpoint from an older version of PyTorch Lightning." + ) + # We assert that there is only one optimizer on fit start, so know opt_idx is always 0 default_scheduler_cfg = LRSchedulerConfig(self._swa_scheduler, opt_idx=0) assert default_scheduler_cfg.interval == "epoch" and default_scheduler_cfg.frequency == 1 @@ -196,14 +223,18 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo else: trainer.lr_scheduler_configs.append(default_scheduler_cfg) - self.n_averaged = torch.tensor(0, dtype=torch.long, device=pl_module.device) + if self.n_averaged is None: + self.n_averaged = torch.tensor(self._init_n_averaged, dtype=torch.long, device=pl_module.device) - if self.swa_start <= trainer.current_epoch <= self.swa_end: + if (self.swa_start <= trainer.current_epoch <= self.swa_end) and ( + trainer.current_epoch > self._latest_update_epoch + ): + assert self.n_averaged is not None self.update_parameters(self._average_model, pl_module, self.n_averaged, self._avg_fn) + self._latest_update_epoch = trainer.current_epoch # Note: No > here in case the callback is saved with the model and training continues if trainer.current_epoch == self.swa_end + 1: - # Transfer weights from average model to pl_module 
self.transfer_weights(self._average_model, pl_module) @@ -265,6 +296,7 @@ def reset_batch_norm_and_save_state(self, pl_module: "pl.LightningModule") -> No def reset_momenta(self) -> None: """Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L164-L165.""" + assert self.momenta is not None for bn_module in self.momenta: bn_module.momentum = self.momenta[bn_module] @@ -285,3 +317,35 @@ def update_parameters( def avg_fn(averaged_model_parameter: Tensor, model_parameter: Tensor, num_averaged: Tensor) -> Tensor: """Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L95-L97.""" return averaged_model_parameter + (model_parameter - averaged_model_parameter) / (num_averaged + 1) + + def state_dict(self) -> Dict[str, Any]: + return { + "n_averaged": 0 if self.n_averaged is None else self.n_averaged.item(), + "latest_update_epoch": self._latest_update_epoch, + "scheduler_state": None if self._swa_scheduler is None else self._swa_scheduler.state_dict(), + "average_model_state": None if self._average_model is None else self._average_model.state_dict(), + } + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + self._init_n_averaged = state_dict["n_averaged"] + self._latest_update_epoch = state_dict["latest_update_epoch"] + self._scheduler_state = state_dict["scheduler_state"] + self._load_average_model_state(state_dict["average_model_state"]) + + @staticmethod + def _clear_schedulers(trainer: "pl.Trainer") -> None: + # If we have scheduler state saved, clear the scheduler configs so that we don't try to + # load state into the wrong type of schedulers when restoring scheduler checkpoint state. + # We'll configure the scheduler and re-load its state in on_train_epoch_start. 
+ # Note that this relies on the callback state being restored before the scheduler state is + # restored, and doesn't work if restore_checkpoint_after_setup is True, but at the time of + # writing that is only True for deepspeed which is already not supported by SWA. + # See https://github.com/PyTorchLightning/pytorch-lightning/issues/11665 for background. + if trainer.lr_scheduler_configs: + assert len(trainer.lr_scheduler_configs) == 1 + trainer.lr_scheduler_configs.clear() + + def _load_average_model_state(self, model_state: Any) -> None: + if self._average_model is None: + return + self._average_model.load_state_dict(model_state) diff --git a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py index b12e1cf042a1f..98fd9c7074c28 100644 --- a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py +++ b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py @@ -118,14 +118,16 @@ def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Self: # ty while being optimized. Arguments: - device: if specified, all parameters will be - copied to that device + device: If specified, all parameters will be copied to that device. If `None`, the current CUDA device + index will be used. 
Returns: Module: self """ - if device is None or isinstance(device, int): - device = torch.device("cuda", index=(device or 0)) + if device is None: + device = torch.device("cuda", torch.cuda.current_device()) + elif isinstance(device, int): + device = torch.device("cuda", index=device) self.__update_properties(device=device) return super().cuda(device=device) diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index b8cc1d91cde18..30e3562067ba7 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -38,7 +38,6 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.core.saving import ModelIO from pytorch_lightning.loggers import Logger, LoggerCollection -from pytorch_lightning.trainer.connectors.data_connector import _DataHookSelector from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.utilities import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, GradClipAlgorithmType, warnings from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors @@ -293,16 +292,24 @@ def _apply_batch_transfer_handler( self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0 ) -> Any: device = device or self.device - datahook_selector = ( - _DataHookSelector(self, None) if self._trainer is None else self.trainer._data_connector._datahook_selector - ) - hook = datahook_selector.get_hook("on_before_batch_transfer") - batch = hook(batch, dataloader_idx) - hook = datahook_selector.get_hook("transfer_batch_to_device") - batch = hook(batch, device, dataloader_idx) - hook = datahook_selector.get_hook("on_after_batch_transfer") - batch = hook(batch, dataloader_idx) + def call_hook(hook_name, *args): + if self._trainer: + datahook_selector = self._trainer._data_connector._datahook_selector + obj = datahook_selector.get_instance(hook_name) + trainer_method = ( + 
self._trainer._call_lightning_module_hook + if isinstance(obj, self.__class__) + else self._trainer._call_lightning_datamodule_hook + ) + return trainer_method(hook_name, *args) + else: + hook = getattr(self, hook_name) + return hook(*args) + + batch = call_hook("on_before_batch_transfer", batch, dataloader_idx) + batch = call_hook("transfer_batch_to_device", batch, device, dataloader_idx) + batch = call_hook("on_after_batch_transfer", batch, dataloader_idx) return batch def print(self, *args, **kwargs) -> None: diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 5125bf4486a9d..ca45a4011fcdd 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -35,12 +35,11 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors from pytorch_lightning.utilities.data import ( _auto_add_worker_init_fn, - _replace_init_method, + _replace_dunder_methods, _update_dataloader, has_iterable_dataset, ) from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.seed import seed_everything @@ -106,8 +105,6 @@ def __init__( self._precision_plugin = self._strategy.precision_plugin self._models_setup: int = 0 - self._check_deepspeed_support() - # wrap the run method so we can inject setup logic or spawn processes for the user setattr(self, "run", partial(self._run_impl, self.run)) @@ -406,9 +403,9 @@ def _run_impl(self, run_method: Callable, *args: Any, **kwargs: Any) -> Any: def _run_with_strategy_setup(self, run_method: Callable, *args: Any, **kwargs: Any) -> Any: self._strategy.setup_environment() - with self._strategy.model_sharded_context(), _replace_init_method(DataLoader, "dataset"), _replace_init_method( - BatchSampler - ): + with self._strategy.model_sharded_context(), _replace_dunder_methods( + DataLoader, "dataset" + ), 
_replace_dunder_methods(BatchSampler): return run_method(*args, **kwargs) def _move_model_to_device(self, model: nn.Module, optimizers: List[Optimizer]) -> nn.Module: @@ -459,18 +456,6 @@ def _check_strategy_support(self, strategy: Optional[Union[str, Strategy]]) -> N f" Choose one of {supported} or pass in a `Strategy` instance." ) - def _check_deepspeed_support(self) -> None: - if ( - isinstance(self._strategy, DeepSpeedStrategy) - and self._strategy.zero_stage_3 - and _RequirementAvailable("deepspeed>=0.6.5") - ): - # https://github.com/microsoft/DeepSpeed/issues/2139 - raise RuntimeError( - "DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite and `deepspeed>=0.6.5`." - " Please downgrade deepspeed to 0.6.4 or check if a newer version of Lightning is available." - ) - @staticmethod def _supported_device_types() -> Sequence[_AcceleratorType]: return ( diff --git a/src/pytorch_lightning/loggers/wandb.py b/src/pytorch_lightning/loggers/wandb.py index 530fb58fabe5e..baf4bc9092774 100644 --- a/src/pytorch_lightning/loggers/wandb.py +++ b/src/pytorch_lightning/loggers/wandb.py @@ -260,7 +260,7 @@ def __init__( id: Optional[str] = None, anonymous: Optional[bool] = None, version: Optional[str] = None, - project: Optional[str] = None, + project: str = "lightning_logs", log_model: Union[str, bool] = False, experiment: Union[Run, RunDisabled, None] = None, prefix: str = "", @@ -297,7 +297,7 @@ def __init__( self._checkpoint_callback: Optional["ReferenceType[Checkpoint]"] = None # set wandb init arguments self._wandb_init: Dict[str, Any] = dict( - name=name or project, + name=name, project=project, id=version or id, dir=save_dir, @@ -306,6 +306,7 @@ def __init__( ) self._wandb_init.update(**kwargs) # extract parameters + self._project = self._wandb_init.get("project") self._save_dir = self._wandb_init.get("dir") self._name = self._wandb_init.get("name") self._id = self._wandb_init.get("id") @@ -450,13 +451,13 @@ def save_dir(self) -> Optional[str]: 
@property def name(self) -> Optional[str]: - """Gets the name of the experiment. + """The project name of this experiment. Returns: - The name of the experiment if the experiment exists else the name given to the constructor. + The name of the project the current experiment belongs to. This name is not the same as `wandb.Run`'s + name. To access wandb's internal experiment name, use ``logger.experiment.name`` instead. """ - # don't create an experiment if we don't have one - return self._experiment.name if self._experiment else self._name + return self._project @property def version(self) -> Optional[str]: diff --git a/src/pytorch_lightning/plugins/__init__.py b/src/pytorch_lightning/plugins/__init__.py index afd10c88c951d..50d83ee708cbe 100644 --- a/src/pytorch_lightning/plugins/__init__.py +++ b/src/pytorch_lightning/plugins/__init__.py @@ -10,6 +10,7 @@ from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin @@ -63,6 +64,7 @@ "FullyShardedNativeMixedPrecisionPlugin", "SingleDevicePlugin", "SingleTPUPlugin", + "FullyShardedNativeNativeMixedPrecisionPlugin", "TPUPrecisionPlugin", "TPUBf16PrecisionPlugin", "TPUSpawnPlugin", diff --git a/src/pytorch_lightning/plugins/precision/__init__.py b/src/pytorch_lightning/plugins/precision/__init__.py index 4bc29c1be1864..5206aed62c497 100644 --- a/src/pytorch_lightning/plugins/precision/__init__.py +++ b/src/pytorch_lightning/plugins/precision/__init__.py @@ -11,17 +11,32 @@ # 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.fully_sharded_native_amp import ( # noqa: F401 - FullyShardedNativeMixedPrecisionPlugin, -) -from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.mixed import MixedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.tpu import TPUPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.tpu_bf16 import TPUBf16PrecisionPlugin # noqa: F401 +from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin +from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin +from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin +from 
pytorch_lightning.plugins.precision.mixed import MixedPrecisionPlugin +from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.tpu import TPUPrecisionPlugin +from pytorch_lightning.plugins.precision.tpu_bf16 import TPUBf16PrecisionPlugin + +__all__ = [ + "ApexMixedPrecisionPlugin", + "DeepSpeedPrecisionPlugin", + "DoublePrecisionPlugin", + "FullyShardedNativeNativeMixedPrecisionPlugin", + "FullyShardedNativeMixedPrecisionPlugin", + "HPUPrecisionPlugin", + "IPUPrecisionPlugin", + "MixedPrecisionPlugin", + "NativeMixedPrecisionPlugin", + "PrecisionPlugin", + "ShardedNativeMixedPrecisionPlugin", + "TPUPrecisionPlugin", + "TPUBf16PrecisionPlugin", +] diff --git a/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py new file mode 100644 index 0000000000000..38ec381fe5485 --- /dev/null +++ b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py @@ -0,0 +1,65 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Any, Optional, Union + +import torch + +from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.utilities.enums import PrecisionType +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 + +if _TORCH_GREATER_EQUAL_1_12: + from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision + from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler +else: + MixedPrecision = None + + +class FullyShardedNativeNativeMixedPrecisionPlugin(NativeMixedPrecisionPlugin): + """Native AMP for Fully Sharded Native Training.""" + + def __init__( + self, precision: Union[str, int], device: str, scaler: Optional[torch.cuda.amp.GradScaler] = None + ) -> None: + if not _TORCH_GREATER_EQUAL_1_12: + raise MisconfigurationException( + "`FullyShardedNativeNativeMixedPrecisionPlugin` is supported from PyTorch v1.12.0 onwards." + ) + super().__init__(precision, device, scaler=ShardedGradScaler() if scaler is None and precision == 16 else None) + + def clip_grad_by_norm(self, *_: Any, **__: Any) -> None: + # see https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.FullyShardedDataParallel.clip_grad_norm_ + # section `Gradient Clipping`, using `torch.nn.utils.clip_grad_norm_` is incorrect + # for FSDP module. To overcome this, needs to call sharded_module.clip_grad_norm(clip_val) + # however we rely on LightningModule's configure_sharded_model to wrap FSDP, it would be hard to + # trace back the root FSDP. Now we only support clip by value. 
+ raise MisconfigurationException( + f"`gradient_clip_algorithm='norm'` is currently not supported for `{self.__class__.__name__}`" + ) + + @property + def mixed_precision_config(self) -> Optional[MixedPrecision]: + assert MixedPrecision is not None + if self.precision == PrecisionType.HALF: + dtype = torch.float16 + elif self.precision == PrecisionType.BFLOAT: + dtype = torch.bfloat16 + else: + raise MisconfigurationException(f"Was unable to infer precision type, received {self.precision!r}.") + return MixedPrecision( + param_dtype=dtype, + reduce_dtype=dtype, + buffer_dtype=dtype, + ) diff --git a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py index 8c693f2975bbd..870e658bfc9c3 100644 --- a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py @@ -11,19 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Optional - -import torch +from typing import Any from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 - -if _TORCH_GREATER_EQUAL_1_12: - from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision -else: - MixedPrecision = None class FullyShardedNativeMixedPrecisionPlugin(ShardedNativeMixedPrecisionPlugin): @@ -38,18 +29,3 @@ def clip_grad_by_norm(self, *_: Any, **__: Any) -> None: raise MisconfigurationException( f"`gradient_clip_algorithm='norm'` is currently not supported for `{self.__class__.__name__}`" ) - - @property - def mixed_precision_config(self) -> Optional[MixedPrecision]: - assert MixedPrecision is not None - if self.precision == PrecisionType.HALF: - dtype = torch.float16 - elif self.precision == PrecisionType.BFLOAT: - dtype = torch.bfloat16 - else: - raise MisconfigurationException(f"Was unable to infer precision type, received {self.precision!r}.") - return MixedPrecision( - param_dtype=dtype, - reduce_dtype=dtype, - buffer_dtype=dtype, - ) diff --git a/src/pytorch_lightning/plugins/precision/precision_plugin.py b/src/pytorch_lightning/plugins/precision/precision_plugin.py index 02d343a0876b4..b529568d1a04e 100644 --- a/src/pytorch_lightning/plugins/precision/precision_plugin.py +++ b/src/pytorch_lightning/plugins/precision/precision_plugin.py @@ -178,7 +178,9 @@ def _clip_gradients( if not isinstance(model, pl.LightningModule) or not model.automatic_optimization: # the configuration validator disallows clipping on manual return - model.configure_gradient_clipping( + + model.trainer._call_lightning_module_hook( + "configure_gradient_clipping", optimizer, optimizer_idx, gradient_clip_val=clip_val, diff --git 
a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 4c351f26fa3b9..9b927aa757d17 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -23,7 +23,7 @@ from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin -from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast @@ -158,7 +158,7 @@ def mixed_precision_config(self) -> Optional[MixedPrecision]: if self.mixed_precision: return self.mixed_precision plugin = self.precision_plugin - if isinstance(plugin, FullyShardedNativeMixedPrecisionPlugin): + if isinstance(plugin, FullyShardedNativeNativeMixedPrecisionPlugin): return plugin.mixed_precision_config @property diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 3c630403dafce..7dec5ba4bffe0 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -30,7 +30,7 @@ from pytorch_lightning.utilities import _IPU_AVAILABLE, _POPTORCH_AVAILABLE, rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.cloud_io import get_filesystem -from pytorch_lightning.utilities.data import _get_dataloader_init_args_and_kwargs +from pytorch_lightning.utilities.data import _get_dataloader_init_args_and_kwargs, _reinstantiate_wrapped_cls 
from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden @@ -239,7 +239,9 @@ def _convert_to_poptorch_loader( dataloader, sampler, mode, self.replication_factor > 1 ) opts = self.training_opts if mode == RunningStage.TRAINING else self.inference_opts - dataloader = poptorch.DataLoader(opts, *dl_args, **dl_kwargs) + dataloader = _reinstantiate_wrapped_cls( + dataloader, opts, *dl_args, explicit_cls=poptorch.DataLoader, **dl_kwargs + ) return dataloader def _handle_gradient_accumulation_steps(self) -> None: diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index bd879cf85ff7a..44c3b3ec7540a 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -53,6 +53,7 @@ TorchElasticEnvironment, ) from pytorch_lightning.plugins.layer_sync import LayerSync, NativeSyncBatchNorm +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.strategies import ( DDP2Strategy, DDPFullyShardedNativeStrategy, @@ -727,7 +728,9 @@ def _check_and_init_precision(self) -> PrecisionPlugin: if isinstance(self.strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)): return ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) - if isinstance(self.strategy, (DDPFullyShardedStrategy, DDPFullyShardedNativeStrategy)): + if isinstance(self.strategy, DDPFullyShardedNativeStrategy): + return FullyShardedNativeNativeMixedPrecisionPlugin(self._precision_flag, device) + if isinstance(self.strategy, DDPFullyShardedStrategy): return FullyShardedNativeMixedPrecisionPlugin(self._precision_flag, device) return NativeMixedPrecisionPlugin(self._precision_flag, device) diff --git 
a/src/pytorch_lightning/trainer/connectors/callback_connector.py b/src/pytorch_lightning/trainer/connectors/callback_connector.py index 83881905beeb1..3c76e734db189 100644 --- a/src/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/src/pytorch_lightning/trainer/connectors/callback_connector.py @@ -30,7 +30,7 @@ from pytorch_lightning.callbacks.rich_model_summary import RichModelSummary from pytorch_lightning.callbacks.timer import Timer from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0 +from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0, _PYTHON_GREATER_EQUAL_3_10_0 from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info _log = logging.getLogger(__name__) @@ -256,14 +256,19 @@ def _configure_external_callbacks() -> List[Callback]: Return: A list of all callbacks collected from external factories. """ + group = "pytorch_lightning.callbacks_factory" + if _PYTHON_GREATER_EQUAL_3_8_0: from importlib.metadata import entry_points - factories = entry_points().get("pytorch_lightning.callbacks_factory", ()) + if _PYTHON_GREATER_EQUAL_3_10_0: + factories = entry_points(group=group) # type: ignore[call-arg] + else: + factories = entry_points().get(group, {}) # type: ignore[assignment] else: from pkg_resources import iter_entry_points - factories = iter_entry_points("pytorch_lightning.callbacks_factory") + factories = iter_entry_points(group) # type: ignore[assignment] external_callbacks = [] for factory in factories: diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index e1aca404722db..e20eac2ffae57 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -14,7 +14,7 @@ import multiprocessing import os from dataclasses import dataclass, 
field -from typing import Any, Callable, Collection, List, Optional, Tuple, Union +from typing import Any, Collection, List, Optional, Tuple, Union from weakref import proxy from torch.utils.data import BatchSampler, DataLoader, Sampler, SequentialSampler @@ -31,7 +31,7 @@ from pytorch_lightning.utilities.data import ( _auto_add_worker_init_fn, _is_dataloader_shuffled, - _replace_init_method, + _replace_dunder_methods, _update_dataloader, has_iterable_dataset, has_len_all_ranks, @@ -298,10 +298,14 @@ def _resolve_sampler(self, dataloader: DataLoader, shuffle: bool, mode: Optional # update docs too once this is resolved trainer_fn = self.trainer.state.fn - if isinstance(sampler, DistributedSampler) and trainer_fn in (TrainerFn.VALIDATING, TrainerFn.TESTING): + if ( + isinstance(sampler, DistributedSampler) + and sampler.num_replicas > 1 + and trainer_fn in (TrainerFn.VALIDATING, TrainerFn.TESTING) + ): rank_zero_warn( - f"Using `DistributedSampler` with the dataloaders. During `trainer.{trainer_fn.value}()`," - " it is recommended to use `Trainer(devices=1)` to ensure each sample/batch gets evaluated" + f"Using `DistributedSampler` with the dataloaders. During `trainer.{trainer_fn.value}()`, it is" + " recommended to use `Trainer(devices=1, num_nodes=1)` to ensure each sample/batch gets evaluated" " exactly once. 
Otherwise, multi-device settings use `DistributedSampler` that replicates" " some samples to make sure all devices have same batch size in case of uneven inputs.", category=PossibleUserWarning, @@ -424,9 +428,11 @@ def _request_dataloader(self, stage: RunningStage) -> Union[DataLoader, List[Dat """ source = getattr(self, f"_{stage.dataloader_prefix}_dataloader_source") - with _replace_init_method(DataLoader, "dataset"), _replace_init_method(BatchSampler): + with _replace_dunder_methods(DataLoader, "dataset"), _replace_dunder_methods(BatchSampler): # under this context manager, the arguments passed to `DataLoader.__init__` will be captured and saved as - # attributes on the instance in case the dataloader needs to be re-instantiated later by Lightning + # attributes on the instance in case the dataloader needs to be re-instantiated later by Lightning. + # Also, it records all attribute setting and deletion using patched `__setattr__` and `__delattr__` + # methods so that the re-instantiated object is as close to the original as possible. dataloader = source.dataloader() if isinstance(dataloader, tuple): dataloader = list(dataloader) @@ -527,16 +533,16 @@ def is_module(self) -> bool: @dataclass class _DataHookSelector: - """Stores the info about the shared DataHooks within LightningModule and LightningDataModule. + """Stores the info about the shared DataHooks within ``LightningModule`` and ``LightningDataModule``. - The hook source can be + The hook source can be: - 1. a method from the :class:`~pytorch_lightning.core.module.LightningModule`, - 2. a method from the :class:`~pytorch_lightning.core.datamodule.LightningDataModule`, + 1. the :class:`~pytorch_lightning.core.module.LightningModule`, + 2. 
the :class:`~pytorch_lightning.core.datamodule.LightningDataModule`, Arguments: - model: A LightningModule - datamodule: A LightningDataModule + model: A ``LightningModule`` + datamodule: A ``LightningDataModule`` """ model: "pl.LightningModule" @@ -545,7 +551,7 @@ class _DataHookSelector: default=("on_before_batch_transfer", "transfer_batch_to_device", "on_after_batch_transfer") ) - def get_hook(self, hook_name: str) -> Callable: + def get_instance(self, hook_name: str) -> Union["pl.LightningModule", "pl.LightningDataModule"]: if hook_name not in self._valid_hooks: raise ValueError( f"`{hook_name}` is not a shared hook within `LightningModule` and `LightningDataModule`." @@ -553,7 +559,7 @@ def get_hook(self, hook_name: str) -> Callable: ) if self.datamodule is None: - return getattr(self.model, hook_name) + return self.model if is_overridden(hook_name, self.datamodule): if is_overridden(hook_name, self.model): @@ -561,11 +567,11 @@ def get_hook(self, hook_name: str) -> Callable: f"You have overridden `{hook_name}` in both `LightningModule` and `LightningDataModule`." " It will use the implementation from `LightningDataModule` instance." ) - return getattr(self.datamodule, hook_name) + return self.datamodule if is_overridden(hook_name, self.model): warning_cache.warn( f"You have overridden `{hook_name}` in `LightningModule` but have passed in a" " `LightningDataModule`. It will use the implementation from `LightningModule` instance." 
) - return getattr(self.model, hook_name) + return self.model diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py b/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py index 6f60ba6f1aa2f..56ad53ef4ba04 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py @@ -44,6 +44,8 @@ class _LogOptions(TypedDict): allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False ), "lr_scheduler_step": None, + "configure_gradient_clipping": None, + "clip_gradients": None, "on_before_zero_grad": _LogOptions( allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False ), @@ -98,6 +100,9 @@ class _LogOptions(TypedDict): "on_epoch_end": _LogOptions( allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True ), + "on_before_batch_transfer": None, + "transfer_batch_to_device": None, + "on_after_batch_transfer": None, "on_batch_start": _LogOptions( allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False ), diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index ff882912625d0..02e17a8d93494 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -163,8 +163,7 @@ def update_train_epoch_metrics(self) -> None: self.log_metrics(self.metrics["log"]) # reset result collection for next epoch - assert self.trainer._results is not None - self.trainer._results.reset(metrics=True) + self.reset_results() """ Utilities and properties diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py 
b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 9eb88fda4891e..a28599b5f20be 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -525,7 +525,7 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]: elif not on_step and result_metric.meta.on_epoch: if result_metric._computed is None: should = result_metric.meta.sync.should - if not result_metric.meta.sync.should and distributed_available(): + if not should and distributed_available() and result_metric.is_tensor: # ensure sync happens for FT since during a failure, the metrics are synced and saved to the # checkpoint, so during restart, metrics on rank 0 are from the accumulated ones from the previous # run, and on other ranks, they are 0. So we need to make sure they are synced in further training diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index 862c7f2de905b..b4d9d4dec5817 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -37,7 +37,7 @@ from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.apply_func import _is_dataclass_instance from pytorch_lightning.utilities.auto_restart import CaptureIterableDataset, CaptureMapDataset, FastForwardSampler -from pytorch_lightning.utilities.enums import _FaultTolerantMode +from pytorch_lightning.utilities.enums import _FaultTolerantMode, LightningEnum from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.seed import pl_worker_init_function @@ -48,6 +48,18 @@ warning_cache = WarningCache() +class _WrapAttrTag(LightningEnum): + SET = "set" + DEL = "del" + + def __call__(self, *args): + if self == self.SET: + fn = setattr + else: + fn = delattr + return fn(*args) + 
+ def _extract_batch_size(batch: BType) -> Generator[int, None, None]: if isinstance(batch, Tensor): if batch.ndim == 0: @@ -188,27 +200,7 @@ def _update_dataloader( dataloader: DataLoader, sampler: Union[Sampler, Iterable], mode: Optional[RunningStage] = None ) -> DataLoader: dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs(dataloader, sampler, mode) - dl_cls = type(dataloader) - try: - dataloader = dl_cls(*dl_args, **dl_kwargs) - except TypeError as e: - # improve exception message due to an incorrect implementation of the `DataLoader` where multiple subclass - # `__init__` arguments map to one `DataLoader.__init__` argument - import re - - match = re.match(r".*__init__\(\) got multiple values .* '(\w+)'", str(e)) - if not match: - # an unexpected `TypeError`, continue failure - raise - argument = match.groups()[0] - message = ( - f"The {dl_cls.__name__} `DataLoader` implementation has an error where more than one `__init__` argument" - f" can be passed to its parent's `{argument}=...` `__init__` argument. This is likely caused by allowing" - f" passing both a custom argument that will map to the `{argument}` argument as well as `**kwargs`." - f" `kwargs` should be filtered to make sure they don't contain the `{argument}` key." - " This argument was automatically passed to your DataLoader by PyTorch Lightning." - ) - raise MisconfigurationException(message) from e + dataloader = _reinstantiate_wrapped_cls(dataloader, *dl_args, **dl_kwargs) return dataloader @@ -374,7 +366,7 @@ def _dataloader_init_kwargs_resolve_sampler( "this, expose an argument `sampler` in the `__init__` method of your custom class." 
) - batch_sampler = batch_sampler_cls(*args, **kwargs) + batch_sampler = _reinstantiate_wrapped_cls(batch_sampler, *args, **kwargs) else: try: batch_sampler = batch_sampler_cls( @@ -449,6 +441,37 @@ def _auto_add_worker_init_fn(dataloader: DataLoader, rank: int) -> None: dataloader.worker_init_fn = partial(pl_worker_init_function, rank=rank) +def _reinstantiate_wrapped_cls(orig_object: Any, *args: Any, explicit_cls: Optional[Type] = None, **kwargs: Any) -> Any: + constructor = type(orig_object) if explicit_cls is None else explicit_cls + + try: + result = constructor(*args, **kwargs) + except TypeError as e: + # improve exception message due to an incorrect implementation of the `DataLoader` where multiple subclass + # `__init__` arguments map to one `DataLoader.__init__` argument + import re + + match = re.match(r".*__init__\(\) got multiple values .* '(\w+)'", str(e)) + if not match: + # an unexpected `TypeError`, continue failure + raise + argument = match.groups()[0] + message = ( + f"The {constructor.__name__} implementation has an error where more than one `__init__` argument" + f" can be passed to its parent's `{argument}=...` `__init__` argument. This is likely caused by allowing" + f" passing both a custom argument that will map to the `{argument}` argument as well as `**kwargs`." + f" `kwargs` should be filtered to make sure they don't contain the `{argument}` key." + " This argument was automatically passed to your object by PyTorch Lightning." 
+ ) + raise MisconfigurationException(message) from e + + attrs_record = getattr(orig_object, "__pl_attrs_record", list()) + for args, fn in attrs_record: + fn(result, *args) + + return result + + def _wrap_init_method(init: Callable, store_explicit_arg: Optional[str] = None) -> Callable: """Wraps the ``__init__`` method of classes (currently :class:`~torch.utils.data.DataLoader` and :class:`~torch.utils.data.BatchSampler`) in order to enable re-instantiation of custom subclasses.""" @@ -457,6 +480,8 @@ def _wrap_init_method(init: Callable, store_explicit_arg: Optional[str] = None) def wrapper(obj: Any, *args: Any, **kwargs: Any) -> None: # We need to inspect `init`, as inspecting `obj.__init__` # can lead to inspecting the wrong function with multiple inheritance + old_inside_init = getattr(obj, "__pl_inside_init", False) + object.__setattr__(obj, "__pl_inside_init", True) params = inspect.signature(init).parameters parameters_defaults = OrderedDict( @@ -474,21 +499,49 @@ def wrapper(obj: Any, *args: Any, **kwargs: Any) -> None: } if not hasattr(obj, "__pl_saved_args"): - obj.__pl_saved_args = args - obj.__pl_saved_kwargs = kwargs - obj.__pl_saved_arg_names = param_names - obj.__pl_saved_default_kwargs = default_kwargs + object.__setattr__(obj, "__pl_saved_args", args) + object.__setattr__(obj, "__pl_saved_kwargs", kwargs) + object.__setattr__(obj, "__pl_saved_arg_names", param_names) + object.__setattr__(obj, "__pl_saved_default_kwargs", default_kwargs) # We want to use the latest possible value for explicit argument (i.e. ideally what gets passed to base class) # so that we can be sure, that it will not get changed anymore. 
# That is why we are setting this in every `__init__` if store_explicit_arg is not None: if store_explicit_arg in param_names: - setattr(obj, f"__{store_explicit_arg}", args[param_names.index(store_explicit_arg)]) + object.__setattr__(obj, f"__{store_explicit_arg}", args[param_names.index(store_explicit_arg)]) elif store_explicit_arg in kwargs: - setattr(obj, f"__{store_explicit_arg}", kwargs[store_explicit_arg]) + object.__setattr__(obj, f"__{store_explicit_arg}", kwargs[store_explicit_arg]) init(obj, *args, **kwargs) + object.__setattr__(obj, "__pl_inside_init", old_inside_init) + + return wrapper + + +def _wrap_attr_method(method: Callable, tag: _WrapAttrTag) -> Callable: + """Wraps the ``__setattr__`` or ``__delattr__`` method of classes (currently :class:`~torch.utils.data.DataLoader` and + :class:`~torch.utils.data.BatchSampler`) in order to enable re-instantiation of custom subclasses.""" + + @functools.wraps(method) + def wrapper(obj: Any, *args: Any): + # First, let's find out if we're the first in inheritance chain calling the patched method. 
+ name, *_ = args + prev_call_name, prev_call_method = getattr(obj, "__pl_current_call", (None, "method")) + first_call = not (prev_call_name == name and prev_call_method == tag) + + # Then mark the current called method + object.__setattr__(obj, "__pl_current_call", (name, tag)) + + # call original method + method(obj, *args) + if first_call and not getattr(obj, "__pl_inside_init", True): + # and save the value it was called with to the internal list, + # if we're outside of __init__ and the original call did not fail and we're the first call + attrs_record = getattr(obj, "__pl_attrs_record", list()) + attrs_record.append((args, tag)) + object.__setattr__(obj, "__pl_attrs_record", attrs_record) + object.__setattr__(obj, "__pl_current_call", (prev_call_name, prev_call_method)) return wrapper @@ -508,23 +561,34 @@ def recurse(cl: Type[Any]) -> None: @contextmanager -def _replace_init_method(base_cls: Type, store_explicit_arg: Optional[str] = None) -> Generator[None, None, None]: +def _replace_dunder_methods(base_cls: Type, store_explicit_arg: Optional[str] = None) -> Generator[None, None, None]: """This context manager is used to add support for re-instantiation of custom (subclasses) of `base_cls`. - It patches the ``__init__`` method. + It patches the ``__init__``, ``__setattr__`` and ``__delattr__`` methods. """ classes = _get_all_subclasses(base_cls) | {base_cls} - wrapped = set() for cls in classes: - if cls.__init__ not in wrapped: - cls._old_init = cls.__init__ + # Check that __init__ belongs to the class + # https://stackoverflow.com/a/5253424 + if "__init__" in cls.__dict__: + cls.__old__init__ = cls.__init__ cls.__init__ = _wrap_init_method(cls.__init__, store_explicit_arg) - wrapped.add(cls.__init__) + + # we want at least one setattr/delattr in the chain to be patched and it can happen, that none of the subclasses + # implement `__setattr__`/`__delattr__`. 
Therefore, we are always patching the `base_cls` + for patch_fn_name, tag in (("__setattr__", _WrapAttrTag.SET), ("__delattr__", _WrapAttrTag.DEL)): + if patch_fn_name in cls.__dict__ or cls is base_cls: + saved_name = f"__old{patch_fn_name}" + setattr(cls, saved_name, getattr(cls, patch_fn_name)) + setattr(cls, patch_fn_name, _wrap_attr_method(getattr(cls, patch_fn_name), tag)) yield for cls in classes: - if hasattr(cls, "_old_init"): - cls.__init__ = cls._old_init - del cls._old_init + for patched_name in ("__setattr__", "__delattr__", "__init__"): + # Check that __old__{init,setattr,delattr} belongs to the class + # https://stackoverflow.com/a/5253424 + if f"__old{patched_name}" in cls.__dict__: + setattr(cls, patched_name, getattr(cls, f"__old{patched_name}")) + delattr(cls, f"__old{patched_name}") def _wrap_with_capture_dataset(dataset: Dataset) -> Dataset: diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py index 7784741ca87c1..96dd62982439a 100644 --- a/src/pytorch_lightning/utilities/imports.py +++ b/src/pytorch_lightning/utilities/imports.py @@ -124,6 +124,7 @@ def __repr__(self) -> str: _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 _PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) +_PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10) _TORCH_GREATER_EQUAL_1_9_1 = _compare_version("torch", operator.ge, "1.9.1") _TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0") _TORCH_LESSER_EQUAL_1_10_2 = _compare_version("torch", operator.le, "1.10.2") diff --git a/src/pytorch_lightning/utilities/parsing.py b/src/pytorch_lightning/utilities/parsing.py index 9f5fe2d6b6841..b619c5cb698b0 100644 --- a/src/pytorch_lightning/utilities/parsing.py +++ b/src/pytorch_lightning/utilities/parsing.py @@ -160,7 +160,10 @@ def get_init_args(frame: 
types.FrameType) -> Dict[str, Any]: def collect_init_args( - frame: types.FrameType, path_args: List[Dict[str, Any]], inside: bool = False + frame: types.FrameType, + path_args: List[Dict[str, Any]], + inside: bool = False, + classes: Tuple[Type, ...] = (), ) -> List[Dict[str, Any]]: """Recursively collects the arguments passed to the child constructors in the inheritance tree. @@ -168,6 +171,7 @@ def collect_init_args( frame: the current stack frame path_args: a list of dictionaries containing the constructor args in all parent classes inside: track if we are inside inheritance path, avoid terminating too soon + classes: the classes in which to inspect the frames Return: A list of dictionaries where each dictionary contains the arguments passed to the @@ -179,13 +183,13 @@ def collect_init_args( if not isinstance(frame.f_back, types.FrameType): return path_args - if "__class__" in local_vars: + if "__class__" in local_vars and (not classes or issubclass(local_vars["__class__"], classes)): local_args = get_init_args(frame) # recursive update path_args.append(local_args) - return collect_init_args(frame.f_back, path_args, inside=True) + return collect_init_args(frame.f_back, path_args, inside=True, classes=classes) if not inside: - return collect_init_args(frame.f_back, path_args, inside) + return collect_init_args(frame.f_back, path_args, inside, classes=classes) return path_args @@ -223,7 +227,10 @@ def save_hyperparameters( init_args = {f.name: getattr(obj, f.name) for f in fields(obj)} else: init_args = {} - for local_args in collect_init_args(frame, []): + + from pytorch_lightning.core.mixins import HyperparametersMixin + + for local_args in collect_init_args(frame, [], classes=(HyperparametersMixin,)): init_args.update(local_args) if ignore is None: diff --git a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py index 859cf2fa98c0c..65a0fea2fb4a5 100644 --- 
a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py +++ b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging +import os +from pathlib import Path +from typing import ContextManager, Optional from unittest import mock import pytest import torch from torch import nn +from torch.optim.lr_scheduler import LambdaLR from torch.optim.swa_utils import SWALR from torch.utils.data import DataLoader @@ -30,7 +34,9 @@ class SwaTestModel(BoringModel): - def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False): + def __init__( + self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False, crash_on_epoch=None + ): super().__init__() layers = [nn.Linear(32, 32)] if batchnorm: @@ -39,17 +45,18 @@ def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dat self.layer = nn.Sequential(*layers) self.interval = interval self.iterable_dataset = iterable_dataset + self.crash_on_epoch = crash_on_epoch def training_step(self, batch, batch_idx): + if self.crash_on_epoch and self.trainer.current_epoch >= self.crash_on_epoch: + raise Exception("SWA crash test") output = self.forward(batch) loss = self.loss(batch, output) return {"loss": loss} def train_dataloader(self): - dset_cls = RandomIterableDataset if self.iterable_dataset else RandomDataset dset = dset_cls(32, 64) - return DataLoader(dset, batch_size=2) def configure_optimizers(self): @@ -66,6 +73,8 @@ def configure_optimizers(self): class SwaTestCallback(StochasticWeightAveraging): update_parameters_calls: int = 0 transfer_weights_calls: int = 0 + # Record the first epoch, as if we are resuming from a checkpoint this may not be equal to 0 + first_epoch: Optional[int] = None def update_parameters(self, *args, **kwargs): self.update_parameters_calls += 1 @@ -77,6 +86,11 @@ def 
transfer_weights(self, *args, **kwargs): def on_train_epoch_start(self, trainer, *args): super().on_train_epoch_start(trainer, *args) + if self.first_epoch is None and not trainer.fit_loop.restarting: + # since the checkpoint loaded was saved `on_train_epoch_end`, the first `FitLoop` iteration will + # not update the model and just call the epoch-level hooks, for that reason, we check that we are not + # restarting before choosing the first epoch + self.first_epoch = trainer.current_epoch assert trainer.fit_loop._skip_backward == (trainer.current_epoch > self.swa_end) if self.swa_start <= trainer.current_epoch: assert isinstance(trainer.lr_scheduler_configs[0].scheduler, SWALR) @@ -88,6 +102,7 @@ def on_train_epoch_end(self, trainer, *args): if self.swa_start <= trainer.current_epoch <= self.swa_end: swa_epoch = trainer.current_epoch - self.swa_start assert self.n_averaged == swa_epoch + 1 + assert self._swa_scheduler is not None # Scheduler is stepped once on initialization and then at the end of each epoch assert self._swa_scheduler._step_count == swa_epoch + 2 elif trainer.current_epoch > self.swa_end: @@ -103,10 +118,13 @@ def on_train_end(self, trainer, pl_module): if not isinstance(trainer.strategy, DDPSpawnStrategy): # check backward call count. 
the batchnorm update epoch should not backward - assert trainer.strategy.backward.call_count == trainer.max_epochs * trainer.limit_train_batches + assert trainer.strategy.backward.call_count == ( + (trainer.max_epochs - self.first_epoch) * trainer.limit_train_batches + ) # check call counts - assert self.update_parameters_calls == trainer.max_epochs - (self._swa_epoch_start - 1) + first_swa_epoch = max(self.first_epoch, self.swa_start) + assert self.update_parameters_calls == trainer.max_epochs - first_swa_epoch assert self.transfer_weights_calls == 1 @@ -140,7 +158,7 @@ def train_with_swa( devices=devices, ) - with mock.patch.object(Strategy, "backward", wraps=trainer.strategy.backward): + with _backward_patch(trainer): trainer.fit(model) # check the model is the expected @@ -226,9 +244,10 @@ def test_swa_multiple_lrs(tmpdir): class TestModel(BoringModel): def __init__(self): - super(BoringModel, self).__init__() + super().__init__() self.layer1 = torch.nn.Linear(32, 32) self.layer2 = torch.nn.Linear(32, 2) + self.on_train_epoch_start_called = False def forward(self, x): x = self.layer1(x) @@ -255,3 +274,98 @@ def on_train_epoch_start(self): ) trainer.fit(model) assert model.on_train_epoch_start_called + + +def _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=False): + swa_start = 3 + trainer_kwargs = { + "default_root_dir": tmpdir, + "max_epochs": 5, + "accelerator": "cpu", + "strategy": "ddp_spawn_find_unused_parameters_false" if ddp else None, + "devices": 2 if ddp else 1, + "limit_train_batches": 5, + "limit_val_batches": 0, + "accumulate_grad_batches": 2, + "enable_progress_bar": False, + } + trainer = Trainer(callbacks=SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1), **trainer_kwargs) + + with _backward_patch(trainer), pytest.raises(Exception, match="SWA crash test"): + trainer.fit(model) + + checkpoint_dir = Path(tmpdir) / "lightning_logs" / "version_0" / "checkpoints" + checkpoint_files = os.listdir(checkpoint_dir) + assert 
len(checkpoint_files) == 1 + ckpt_path = str(checkpoint_dir / checkpoint_files[0]) + + trainer = Trainer(callbacks=SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1), **trainer_kwargs) + + with _backward_patch(trainer): + trainer.fit(resume_model, ckpt_path=ckpt_path) + + +class CustomSchedulerModel(SwaTestModel): + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + + def lr_lambda(current_step: int): + return 0.1 + + scheduler = LambdaLR(optimizer, lr_lambda, -1) + return { + "optimizer": optimizer, + "lr_scheduler": { + "scheduler": scheduler, + "interval": self.interval, + }, + } + + +@pytest.mark.parametrize("crash_on_epoch", [1, 3]) +def test_swa_resume_training_from_checkpoint(tmpdir, crash_on_epoch): + model = SwaTestModel(crash_on_epoch=crash_on_epoch) + resume_model = SwaTestModel() + _swa_resume_training_from_checkpoint(tmpdir, model, resume_model) + + +@pytest.mark.parametrize("crash_on_epoch", [1, 3]) +def test_swa_resume_training_from_checkpoint_custom_scheduler(tmpdir, crash_on_epoch): + # Reproduces the bug reported in https://github.com/PyTorchLightning/pytorch-lightning/issues/11665 + model = CustomSchedulerModel(crash_on_epoch=crash_on_epoch) + resume_model = CustomSchedulerModel() + _swa_resume_training_from_checkpoint(tmpdir, model, resume_model) + + +@RunIf(skip_windows=True) +def test_swa_resume_training_from_checkpoint_ddp(tmpdir): + model = SwaTestModel(crash_on_epoch=3) + resume_model = SwaTestModel() + _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=True) + + +@pytest.mark.parametrize( + "strategy", + [ + pytest.param("fsdp", marks=RunIf(fairscale_fully_sharded=True, min_cuda_gpus=1)), + pytest.param("deepspeed", marks=RunIf(deepspeed=True, min_cuda_gpus=1)), + ], +) +def test_misconfiguration_error_with_sharded_model(tmpdir, strategy: str): + model = SwaTestModel() + swa_callback = SwaTestCallback(swa_epoch_start=2, swa_lrs=0.1) + trainer = Trainer( + 
default_root_dir=tmpdir, + enable_progress_bar=False, + max_epochs=5, + callbacks=[swa_callback], + strategy=strategy, + accelerator="gpu", + devices=1, + ) + with pytest.raises(MisconfigurationException, match="SWA does not currently support sharded models"): + trainer.fit(model) + + +def _backward_patch(trainer: Trainer) -> ContextManager: + return mock.patch.object(Strategy, "backward", wraps=trainer.strategy.backward) diff --git a/tests/tests_pytorch/core/test_metric_result_integration.py b/tests/tests_pytorch/core/test_metric_result_integration.py index cb8a51c5bf9ba..9672bb75b51f1 100644 --- a/tests/tests_pytorch/core/test_metric_result_integration.py +++ b/tests/tests_pytorch/core/test_metric_result_integration.py @@ -21,9 +21,11 @@ import torch import torch.distributed as dist import torch.multiprocessing as mp +import torchmetrics from torch.nn import ModuleDict, ModuleList from torchmetrics import Metric, MetricCollection +import pytorch_lightning as pl import tests_pytorch.helpers.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint @@ -666,14 +668,22 @@ def on_train_start(self): @pytest.mark.parametrize("distributed_env", [True, False]) -def test_logger_sync_dist(distributed_env): - # self.log('bar', 7, ..., sync_dist=False) +@pytest.mark.parametrize("log_val", [torch.tensor(0.5), torchmetrics.Accuracy()]) +def test_logger_sync_dist(distributed_env, log_val): + pl.trainer.connectors.logger_connector.result.warning_cache.clear() + + # self.log('bar', 0.5, ..., sync_dist=False) meta = _Metadata("foo", "bar") meta.sync = _Sync(_should=False) - result_metric = _ResultMetric(metadata=meta, is_tensor=True) - result_metric.update(torch.tensor(7.0), 10) + is_tensor = isinstance(log_val, torch.Tensor) + + if not is_tensor: + log_val.update(torch.tensor([0, 1]), torch.tensor([0, 0], dtype=torch.long)) + + result_metric = _ResultMetric(metadata=meta, is_tensor=is_tensor) + result_metric.update(log_val, 10) - 
warning_ctx = pytest.warns if distributed_env else no_warning_call + warning_ctx = pytest.warns if distributed_env and is_tensor else no_warning_call with mock.patch( "pytorch_lightning.trainer.connectors.logger_connector.result.distributed_available", @@ -681,4 +691,4 @@ def test_logger_sync_dist(distributed_env): ): with warning_ctx(PossibleUserWarning, match=r"recommended to use `self.log\('bar', ..., sync_dist=True\)`"): value = _ResultCollection._get_cache(result_metric, on_step=False) - assert value == 7.0 + assert value == 0.5 diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index 2215ab3129780..d45046f249d54 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import contextlib import os from copy import deepcopy from unittest import mock @@ -30,7 +29,6 @@ from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy from pytorch_lightning.utilities import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.seed import pl_worker_init_function from tests_pytorch.helpers.runif import RunIf @@ -179,7 +177,7 @@ def test_setup_dataloaders_return_type(): assert lite_dataloader1.dataset is dataset1 -@mock.patch("pytorch_lightning.lite.lite._replace_init_method") +@mock.patch("pytorch_lightning.lite.lite._replace_dunder_methods") def test_setup_dataloaders_captures_dataloader_arguments(ctx_manager): """Test that Lite intercepts the DataLoader constructor arguments with a context manager in its run method.""" @@ -480,13 +478,4 @@ def run(self): assert self.broadcast(True) assert self.is_global_zero == (self.local_rank == 0) - if 
_RequirementAvailable("deepspeed>=0.6.5"): - # https://github.com/microsoft/DeepSpeed/issues/2139 - raise_if_deepspeed_incompatible = pytest.raises( - RuntimeError, match="DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite" - ) - else: - raise_if_deepspeed_incompatible = contextlib.suppress() - - with raise_if_deepspeed_incompatible: - Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run() + Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run() diff --git a/tests/tests_pytorch/loggers/test_all.py b/tests/tests_pytorch/loggers/test_all.py index d613296abccf5..612d7bf035c2f 100644 --- a/tests/tests_pytorch/loggers/test_all.py +++ b/tests/tests_pytorch/loggers/test_all.py @@ -300,7 +300,7 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): @pytest.mark.parametrize("logger_class", ALL_LOGGER_CLASSES_WO_NEPTUNE_WANDB) -@RunIf(skip_windows=True, skip_hanging_spawn=True) +@RunIf(skip_windows=True) def test_logger_created_on_rank_zero_only(tmpdir, monkeypatch, logger_class): """Test that loggers get replaced by dummy loggers on global rank > 0.""" _patch_comet_atexit(monkeypatch) diff --git a/tests/tests_pytorch/loggers/test_wandb.py b/tests/tests_pytorch/loggers/test_wandb.py index fbc1d5e189637..648e1a8f38ec8 100644 --- a/tests/tests_pytorch/loggers/test_wandb.py +++ b/tests/tests_pytorch/loggers/test_wandb.py @@ -25,6 +25,16 @@ from tests_pytorch.helpers.utils import no_warning_call +@mock.patch("pytorch_lightning.loggers.wandb.Run", new=mock.Mock) +@mock.patch("pytorch_lightning.loggers.wandb.wandb") +def test_wandb_project_name(*_): + logger = WandbLogger() + assert logger.name == "lightning_logs" + + logger = WandbLogger(project="project") + assert logger.name == "project" + + @mock.patch("pytorch_lightning.loggers.wandb.Run", new=mock.Mock) @mock.patch("pytorch_lightning.loggers.wandb.wandb") def 
test_wandb_logger_init(wandb, monkeypatch): @@ -48,7 +58,7 @@ def test_wandb_logger_init(wandb, monkeypatch): wandb.init.reset_mock() WandbLogger(project="test_project").experiment wandb.init.assert_called_once_with( - name="test_project", dir=None, id=None, project="test_project", resume="allow", anonymous=None + name=None, dir=None, id=None, project="test_project", resume="allow", anonymous=None ) # test wandb.init and setting logger experiment externally @@ -91,7 +101,6 @@ def test_wandb_logger_init(wandb, monkeypatch): logger.watch("model", "log", 10, False) wandb.init().watch.assert_called_once_with("model", log="log", log_freq=10, log_graph=False) - assert logger.name == wandb.init().name assert logger.version == wandb.init().id @@ -140,10 +149,9 @@ def test_wandb_logger_dirs_creation(wandb, monkeypatch, tmpdir): """Test that the logger creates the folders and files in the right place.""" monkeypatch.setattr(pytorch_lightning.loggers.wandb, "_WANDB_GREATER_EQUAL_0_12_10", True) wandb.run = None - logger = WandbLogger(save_dir=str(tmpdir), offline=True) + logger = WandbLogger(project="project", save_dir=str(tmpdir), offline=True) # the logger get initialized assert logger.version == wandb.init().id - assert logger.name == wandb.init().name # mock return values of experiment wandb.run = None @@ -154,7 +162,7 @@ def test_wandb_logger_dirs_creation(wandb, monkeypatch, tmpdir): _ = logger.experiment assert logger.version == "1" - assert logger.name == "run_name" + assert logger.name == "project" assert str(tmpdir) == logger.save_dir assert not os.listdir(tmpdir) @@ -164,7 +172,7 @@ def test_wandb_logger_dirs_creation(wandb, monkeypatch, tmpdir): assert trainer.log_dir == logger.save_dir trainer.fit(model) - assert trainer.checkpoint_callback.dirpath == str(tmpdir / "run_name" / version / "checkpoints") + assert trainer.checkpoint_callback.dirpath == str(tmpdir / "project" / version / "checkpoints") assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == 
{"epoch=0-step=3.ckpt"} assert trainer.log_dir == logger.save_dir diff --git a/tests/tests_pytorch/models/test_hparams.py b/tests/tests_pytorch/models/test_hparams.py index c064d0f8c055e..90d9d1eb0e902 100644 --- a/tests/tests_pytorch/models/test_hparams.py +++ b/tests/tests_pytorch/models/test_hparams.py @@ -29,6 +29,7 @@ from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.datamodule import LightningDataModule +from pytorch_lightning.core.mixins import HyperparametersMixin from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel, RandomDataset from pytorch_lightning.utilities import _HYDRA_EXPERIMENTAL_AVAILABLE, _OMEGACONF_AVAILABLE, AttributeDict, is_picklable @@ -401,6 +402,24 @@ def _raw_checkpoint_path(trainer) -> str: return raw_checkpoint_path +@pytest.mark.parametrize("base_class", (HyperparametersMixin, LightningModule, LightningDataModule)) +def test_save_hyperparameters_under_composition(base_class): + """Test that in a composition where the parent is not a Lightning-like module, the parent's arguments don't get + collected.""" + + class ChildInComposition(base_class): + def __init__(self, same_arg): + super().__init__() + self.save_hyperparameters() + + class NotPLSubclass: # intentionally not subclassing LightningModule/LightningDataModule + def __init__(self, same_arg="parent_default", other_arg="other"): + self.child = ChildInComposition(same_arg="cocofruit") + + parent = NotPLSubclass() + assert parent.child.hparams == dict(same_arg="cocofruit") + + class LocalVariableModelSuperLast(BoringModel): """This model has the super().__init__() call at the end.""" diff --git a/tests/tests_pytorch/run_standalone_tasks.sh b/tests/tests_pytorch/run_standalone_tasks.sh index 960bd867ceaa4..698ed7863ab96 100644 --- a/tests/tests_pytorch/run_standalone_tasks.sh +++ 
b/tests/tests_pytorch/run_standalone_tasks.sh @@ -34,6 +34,10 @@ fi # test that a user can manually launch individual processes echo "Running manual ddp launch test" export PYTHONPATH="${PYTHONPATH}:$(pwd)" -args="--trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1" -MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} & -MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} +args="fit --trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1" +MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python strategies/scripts/cli_script.py ${args} & +MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python strategies/scripts/cli_script.py ${args} + +# test that ddp can be launched as a module (-m option) +echo "Running ddp example as module" +python -m strategies.scripts.cli_script ${args} diff --git a/tests/tests_pytorch/serve/__init__.py b/tests/tests_pytorch/serve/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_pytorch/strategies/ddp_model.py b/tests/tests_pytorch/strategies/ddp_model.py deleted file mode 100644 index 76d1f3f2f6866..0000000000000 --- a/tests/tests_pytorch/strategies/ddp_model.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Runs either `.fit()` or `.test()` on a single node across multiple gpus.""" -import os -from argparse import ArgumentParser - -import torch - -from pytorch_lightning import seed_everything, Trainer -from tests_pytorch.helpers.datamodules import ClassifDataModule -from tests_pytorch.helpers.simple_models import ClassificationModel - - -def main(): - seed_everything(4321) - - parser = ArgumentParser(add_help=False) - parser = Trainer.add_argparse_args(parser) - parser.add_argument("--trainer_method", default="fit") - parser.add_argument("--tmpdir") - parser.add_argument("--workdir") - parser.set_defaults(accelerator="gpu", devices=2) - parser.set_defaults(strategy="ddp") - args = parser.parse_args() - - dm = ClassifDataModule() - model = ClassificationModel() - trainer = Trainer.from_argparse_args(args) - - if args.trainer_method == "fit": - trainer.fit(model, datamodule=dm) - result = None - elif args.trainer_method == "test": - result = trainer.test(model, datamodule=dm) - elif args.trainer_method == "fit_test": - trainer.fit(model, datamodule=dm) - result = trainer.test(model, datamodule=dm) - else: - raise ValueError(f"Unsupported: {args.trainer_method}") - - result_ext = {"status": "complete", "method": args.trainer_method, "result": result} - file_path = os.path.join(args.tmpdir, "ddp.result") - torch.save(result_ext, file_path) - - -if __name__ == "__main__": - main() diff --git a/tests/tests_pytorch/strategies/scripts/__init__.py b/tests/tests_pytorch/strategies/scripts/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git 
a/tests/tests_pytorch/strategies/scripts/cli_script.py b/tests/tests_pytorch/strategies/scripts/cli_script.py new file mode 100644 index 0000000000000..17f0d29392eb9 --- /dev/null +++ b/tests/tests_pytorch/strategies/scripts/cli_script.py @@ -0,0 +1,24 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A trivial script that wraps a LightningCLI around the BoringModel and BoringDataModule.""" +from pytorch_lightning.cli import LightningCLI +from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel + +if __name__ == "__main__": + LightningCLI( + BoringModel, + BoringDataModule, + seed_everything_default=42, + save_config_overwrite=True, + ) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 1a2a0475e7ed6..9b196f3e2a97f 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -21,60 +21,41 @@ from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl -from pytorch_lightning import Trainer +from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.callbacks import Callback from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.strategies import DDPStrategy +from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.runif import RunIf -from tests_pytorch.strategies import ddp_model 
-from tests_pytorch.utilities.distributed import call_training_script +from tests_pytorch.helpers.simple_models import ClassificationModel -CLI_ARGS = "--max_epochs 1 --accelerator gpu --devices 2 --strategy ddp" +@RunIf(min_cuda_gpus=2, standalone=True) +def test_multi_gpu_model_ddp_fit_only(tmpdir): + dm = ClassifDataModule() + model = ClassificationModel() + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp") + trainer.fit(model, datamodule=dm) -@RunIf(min_cuda_gpus=2) -@pytest.mark.parametrize("as_module", [True, False]) -def test_multi_gpu_model_ddp_fit_only(tmpdir, as_module): - # call the script - call_training_script(ddp_model, CLI_ARGS, "fit", tmpdir, timeout=120, as_module=as_module) - # load the results of the script - result_path = os.path.join(tmpdir, "ddp.result") - result = torch.load(result_path) +@RunIf(min_cuda_gpus=2, standalone=True) +def test_multi_gpu_model_ddp_test_only(tmpdir): + dm = ClassifDataModule() + model = ClassificationModel() + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp") + trainer.test(model, datamodule=dm) - # verify the file wrote the expected outputs - assert result["status"] == "complete" +@RunIf(min_cuda_gpus=2, standalone=True) +def test_multi_gpu_model_ddp_fit_test(tmpdir): + seed_everything(4321) + dm = ClassifDataModule() + model = ClassificationModel() + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp") + trainer.fit(model, datamodule=dm) + result = trainer.test(model, datamodule=dm) -@RunIf(min_cuda_gpus=2) -@pytest.mark.parametrize("as_module", [True, False]) -def test_multi_gpu_model_ddp_test_only(tmpdir, as_module): - # call the script - call_training_script(ddp_model, CLI_ARGS, "test", tmpdir, as_module=as_module) - - # load the results of the script - result_path = os.path.join(tmpdir, "ddp.result") - result = torch.load(result_path) - - # verify the file 
wrote the expected outputs - assert result["status"] == "complete" - - -@RunIf(min_cuda_gpus=2) -@pytest.mark.parametrize("as_module", [True, False]) -def test_multi_gpu_model_ddp_fit_test(tmpdir, as_module): - # call the script - call_training_script(ddp_model, CLI_ARGS, "fit_test", tmpdir, timeout=20, as_module=as_module) - - # load the results of the script - result_path = os.path.join(tmpdir, "ddp.result") - result = torch.load(result_path) - - # verify the file wrote the expected outputs - assert result["status"] == "complete" - - model_outs = result["result"] - for out in model_outs: + for out in result: assert out["test_acc"] > 0.7 diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index 74f9534c47ce3..ede201da1f68f 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -7,7 +7,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 @@ -35,7 +35,7 @@ def test_invalid_on_cpu(tmpdir): @RunIf(min_torch="1.12", min_cuda_gpus=1) @pytest.mark.parametrize("precision, expected", [(16, torch.float16), ("bf16", torch.bfloat16)]) def test_precision_plugin_config(precision, expected): - plugin = FullyShardedNativeMixedPrecisionPlugin(precision=precision, device="cuda") + plugin = 
FullyShardedNativeNativeMixedPrecisionPlugin(precision=precision, device="cuda") config = plugin.mixed_precision_config assert config.param_dtype == expected assert config.buffer_dtype == expected @@ -96,6 +96,7 @@ def on_predict_batch_end(self, outputs: Optional[Any], batch: Any, batch_idx: in def _assert_layer_fsdp_instance(self) -> None: assert isinstance(self.layer, FullyShardedDataParallel) + assert isinstance(self.trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin) assert isinstance(self.layer.module[0], FullyShardedDataParallel) assert isinstance(self.layer.module[2], FullyShardedDataParallel) # root should not be resharding diff --git a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py index f485060833320..7fb22206c45c6 100644 --- a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py +++ b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py @@ -184,11 +184,17 @@ def test_ddp_spawn_strategy_set_timeout(mock_init_process_group): "strategy_name,expected_ddp_kwargs", [ ("ddp_spawn", {}), - ("ddp_fork", {}), - ("ddp_notebook", {}), + pytest.param("ddp_fork", {}, marks=RunIf(skip_windows=True)), + pytest.param("ddp_notebook", {}, marks=RunIf(skip_windows=True)), ("ddp_spawn_find_unused_parameters_false", {"find_unused_parameters": False}), - ("ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}), - ("ddp_notebook_find_unused_parameters_false", {"find_unused_parameters": False}), + pytest.param( + "ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}, marks=RunIf(skip_windows=True) + ), + pytest.param( + "ddp_notebook_find_unused_parameters_false", + {"find_unused_parameters": False}, + marks=RunIf(skip_windows=True), + ), ], ) def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): diff --git a/tests/tests_pytorch/trainer/connectors/test_callback_connector.py 
b/tests/tests_pytorch/trainer/connectors/test_callback_connector.py index d6d5018aa1dd0..02e846425a2a0 100644 --- a/tests/tests_pytorch/trainer/connectors/test_callback_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_callback_connector.py @@ -30,7 +30,7 @@ ) from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector -from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0 +from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0, _PYTHON_GREATER_EQUAL_3_10_0 def test_checkpoint_callbacks_are_last(tmpdir): @@ -265,7 +265,10 @@ def _make_entry_point_query_mock(callback_factory): entry_point = Mock() entry_point.name = "mocked" entry_point.load.return_value = callback_factory - if _PYTHON_GREATER_EQUAL_3_8_0: + if _PYTHON_GREATER_EQUAL_3_10_0: + query_mock.return_value = [entry_point] + import_path = "importlib.metadata.entry_points" + elif _PYTHON_GREATER_EQUAL_3_8_0: query_mock().get.return_value = [entry_point] import_path = "importlib.metadata.entry_points" else: diff --git a/tests/tests_pytorch/trainer/connectors/test_data_connector.py b/tests/tests_pytorch/trainer/connectors/test_data_connector.py index 52ef4c4db6d8d..379a3248a1535 100644 --- a/tests/tests_pytorch/trainer/connectors/test_data_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_data_connector.py @@ -445,7 +445,8 @@ def test_dataloader_source_direct_access(): def test_dataloader_source_request_from_module(): """Test requesting a dataloader from a module works.""" module = BoringModel() - module.trainer = Trainer() + trainer = Trainer() + module.trainer = trainer module.foo = Mock(return_value=module.train_dataloader()) source = _DataLoaderSource(module, "foo") @@ -470,34 +471,34 @@ def test_no_datamodule_no_overridden(self, hook_name): model, _, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=None) 
with no_warning_call(match=f"have overridden `{hook_name}` in"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(model, hook_name) + assert instance is model def test_with_datamodule_no_overridden(self, hook_name): model, dm, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=dm) with no_warning_call(match=f"have overridden `{hook_name}` in"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(model, hook_name) + assert instance is model def test_override_model_hook(self, hook_name): model, dm, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=dm) with no_warning_call(match=f"have overridden `{hook_name}` in"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(model, hook_name) + assert instance is model def test_override_datamodule_hook(self, hook_name): model, dm, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=dm) setattr(dm, hook_name, self.overridden_func) with no_warning_call(match=f"have overridden `{hook_name}` in"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(dm, hook_name) + assert instance is dm def test_override_both_model_and_datamodule(self, hook_name): model, dm, trainer = self.reset_instances() @@ -505,39 +506,40 @@ def test_override_both_model_and_datamodule(self, hook_name): setattr(model, hook_name, self.overridden_func) setattr(dm, hook_name, self.overridden_func) with pytest.warns(UserWarning, match=f"have 
overridden `{hook_name}` in both"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(dm, hook_name) + assert instance is dm def test_with_datamodule_override_model(self, hook_name): model, dm, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=dm) setattr(model, hook_name, self.overridden_func) with pytest.warns(UserWarning, match=f"have overridden `{hook_name}` in `LightningModule`"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(model, hook_name) + assert instance is model def test_invalid_hook_passed_in_datahook_selector(): dh_selector = _DataHookSelector(BoringModel(), None) with pytest.raises(ValueError, match="is not a shared hook"): - dh_selector.get_hook("setup") + dh_selector.get_instance("setup") -def test_eval_distributed_sampler_warning(tmpdir): +@pytest.mark.parametrize("devices, warn_context", [(1, no_warning_call), (2, pytest.warns)]) +def test_eval_distributed_sampler_warning(devices, warn_context): """Test that a warning is raised when `DistributedSampler` is used with evaluation.""" model = BoringModel() - trainer = Trainer(strategy="ddp", devices=2, accelerator="cpu", fast_dev_run=True) + trainer = Trainer(strategy="ddp", devices=devices, accelerator="cpu") trainer._data_connector.attach_data(model) trainer.state.fn = TrainerFn.VALIDATING - with pytest.warns(PossibleUserWarning, match="multi-device settings use `DistributedSampler`"): + with warn_context(PossibleUserWarning, match="multi-device settings use `DistributedSampler`"): trainer.reset_val_dataloader(model) trainer.state.fn = TrainerFn.TESTING - with pytest.warns(PossibleUserWarning, match="multi-device settings use `DistributedSampler`"): + with warn_context(PossibleUserWarning, 
match="multi-device settings use `DistributedSampler`"): trainer.reset_test_dataloader(model) diff --git a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py index 760e8eea2a85c..c2be22c61244b 100644 --- a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py +++ b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py @@ -187,11 +187,6 @@ def __init__(self, not_supported): { "log", "log_dict", - # the following are problematic as they do have `self._current_fx_name` defined some times but - # not others depending on where they were called. So we cannot reliably `self.log` in them - "on_before_batch_transfer", - "transfer_batch_to_device", - "on_after_batch_transfer", } ) # remove `nn.Module` hooks @@ -227,6 +222,9 @@ def test_fx_validator_integration(tmpdir): "on_pretrain_routine_end": "You can't", "train_dataloader": "You can't", "val_dataloader": "You can't", + "on_before_batch_transfer": "You can't", + "transfer_batch_to_device": "You can't", + "on_after_batch_transfer": "You can't", "on_validation_end": "You can't", "on_train_end": "You can't", "on_fit_end": "You can't", @@ -238,6 +236,8 @@ def test_fx_validator_integration(tmpdir): "on_validation_model_eval": "You can't", "on_validation_model_train": "You can't", "lr_scheduler_step": "You can't", + "configure_gradient_clipping": "You can't", + "clip_gradients": "You can't", "on_save_checkpoint": "You can't", "on_load_checkpoint": "You can't", "on_exception": "You can't", diff --git a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py index 5855eba4c86af..d16be306b9365 100644 --- a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py @@ -569,11 +569,12 @@ def on_train_epoch_end(self, trainer, pl_module): "accelerator", [ pytest.param("gpu", 
marks=RunIf(min_cuda_gpus=1)), + "cpu", ], ) def test_metric_are_properly_reduced(tmpdir, accelerator): class TestingModel(BoringModel): - def __init__(self, *args, **kwargs) -> None: + def __init__(self) -> None: super().__init__() self.val_acc = Accuracy() @@ -592,7 +593,6 @@ def validation_step(self, batch, batch_idx): return super().validation_step(batch, batch_idx) early_stop = EarlyStopping(monitor="val_acc", mode="max") - checkpoint = ModelCheckpoint(monitor="val_acc", save_last=True, save_top_k=2, mode="max") model = TestingModel() @@ -812,3 +812,28 @@ def training_step(self, batch, batch_idx): call(metrics={"foo_epoch": 0.0, "epoch": 1}, step=3), ] ) + + +@mock.patch("pytorch_lightning.loggers.TensorBoardLogger.log_metrics") +def test_log_on_train_start(mock_log_metrics, tmpdir): + """Tests that logged metrics on_train_start get reset after the first epoch.""" + + class MyModel(BoringModel): + def on_train_start(self): + self.log("foo", 123) + + model = MyModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=0, + max_epochs=2, + log_every_n_steps=1, + enable_model_summary=False, + enable_checkpointing=False, + enable_progress_bar=False, + ) + trainer.fit(model) + + assert mock_log_metrics.mock_calls == [call(metrics={"foo": 123.0, "epoch": 0}, step=0)] + assert trainer.max_epochs > 1 diff --git a/tests/tests_pytorch/utilities/distributed.py b/tests/tests_pytorch/utilities/distributed.py deleted file mode 100644 index 38a50edcc7177..0000000000000 --- a/tests/tests_pytorch/utilities/distributed.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import subprocess -import sys -from pathlib import Path -from subprocess import TimeoutExpired - -import pytorch_lightning - - -def call_training_script(module_file, cli_args, method, tmpdir, timeout=60, as_module=False): - file = Path(module_file.__file__).absolute() - cli_args = cli_args.split(" ") if cli_args else [] - cli_args += ["--tmpdir", str(tmpdir)] - cli_args += ["--trainer_method", method] - file_args = ["-m", module_file.__spec__.name] if as_module else [str(file)] - command = [sys.executable] + file_args + cli_args - - # need to set the PYTHONPATH in case pytorch_lightning was not installed into the environment - env = os.environ.copy() - env["PYTHONPATH"] = env.get("PYTHONPATH", "") + f"{pytorch_lightning.__file__}:" - - # for running in ddp mode, we need to launch it's own process or pytest will get stuck - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - try: - std, err = p.communicate(timeout=timeout) - err = str(err.decode("utf-8")) - if "Exception" in err: - raise Exception(err) - except TimeoutExpired: - p.kill() - std, err = p.communicate() - return std, err diff --git a/tests/tests_pytorch/utilities/test_data.py b/tests/tests_pytorch/utilities/test_data.py index ffb898efaa815..9b7abf0d90a88 100644 --- a/tests/tests_pytorch/utilities/test_data.py +++ b/tests/tests_pytorch/utilities/test_data.py @@ -1,3 +1,4 @@ +import random from dataclasses import dataclass import pytest @@ -12,9 +13,10 @@ from pytorch_lightning.utilities.data import ( _dataloader_init_kwargs_resolve_sampler, 
_get_dataloader_init_args_and_kwargs, - _replace_init_method, + _replace_dunder_methods, _replace_value_in_saved_args, _update_dataloader, + _WrapAttrTag, extract_batch_size, get_len, has_iterable_dataset, @@ -144,10 +146,10 @@ def __init__(self, foo, *args, **kwargs): super().__init__(foo, *args, **kwargs) dataloader = BadStandaloneGoodHookImpl([1, 2, 3]) - with pytest.raises(MisconfigurationException, match="`DataLoader` implementation has an error.*`dataset`"): + with pytest.raises(MisconfigurationException, match="implementation has an error.*`dataset`"): _update_dataloader(dataloader, dataloader.sampler) - with _replace_init_method(DataLoader, "dataset"): + with _replace_dunder_methods(DataLoader, "dataset"): dataloader = BadStandaloneGoodHookImpl([1, 2, 3]) new_dataloader = _update_dataloader(dataloader, dataloader.sampler) assert isinstance(new_dataloader, BadStandaloneGoodHookImpl) @@ -159,7 +161,7 @@ def __init__(self, randomize, *args, **kwargs): super().__init__(*args, shuffle=randomize, **kwargs) dataloader = BadImpl(False, []) - with pytest.raises(MisconfigurationException, match="`DataLoader` implementation has an error.*`shuffle`"): + with pytest.raises(MisconfigurationException, match="implementation has an error.*`shuffle`"): _update_dataloader(dataloader, dataloader.sampler) class GoodImpl(DataLoader): @@ -173,6 +175,35 @@ def __init__(self, randomize, *args, **kwargs): assert isinstance(new_dataloader, GoodImpl) +def test_replace_dunder_methods_multiple_loaders_without_init(): + """In case of a class, that inherits from a class that we are patching, but doesn't define its own `__init__` + method (the one we are wrapping), it can happen, that `hasattr(cls, "__old__init__")` is True because of parent + class, but it is impossible to delete, because that method is owned by parent class. Furthermore, the error + occurred only sometimes because it depends on the order in which we are iterating over a set of classes we are + patching. 
+ + This test simulates the behavior by generating sufficient number of dummy classes, which do not define `__init__` + and are children of `DataLoader`. We are testing that a) context manager `_replace_dunder_method` exits cleanly, and + b) the mechanism checking for presence of `__old__init__` works as expected. + """ + classes = [DataLoader] + for i in range(100): + classes.append(type(f"DataLoader_{i}", (random.choice(classes),), {})) + + before = {cls: cls.__init__ for cls in classes} + + with _replace_dunder_methods(DataLoader, "dataset"): + for cls in classes[1:]: # First one is `DataLoader` + assert "__old__init__" not in cls.__dict__ + assert hasattr(cls, "__old__init__") + + assert "__old__init__" in DataLoader.__dict__ + assert hasattr(DataLoader, "__old__init__") + + for cls in classes: + assert before[cls] == cls.__init__ + + class DataLoaderSubclass1(DataLoader): def __init__(self, attribute1, *args, **kwargs): self.at1 = attribute1 @@ -298,8 +329,8 @@ def __init__(self, dataset, **kwargs): pytest.param(ChangingDataLoader, (range(5),), dict(), ("dataset",), list(range(10)), dict(), id="test9"), ], ) -def test_replace_init_method_dataloader(cls, args, kwargs, arg_names, dataset, checked_values): - with _replace_init_method(DataLoader, "dataset"): +def test_replace_dunder_methods_dataloader(cls, args, kwargs, arg_names, dataset, checked_values): + with _replace_dunder_methods(DataLoader, "dataset"): dataloader = cls(*args, **kwargs) assert dataloader.__pl_saved_args == args @@ -336,12 +367,12 @@ def test_replace_init_method_dataloader(cls, args, kwargs, arg_names, dataset, c assert dataloader_value == value -def test_replace_init_method_extra_kwargs(): +def test_replace_dunder_methods_extra_kwargs(): class LoaderSubclass(DataLoader): def __init__(self, dataset, *args, batch_size=10, **kwargs): super().__init__(dataset, *args, batch_size=batch_size, **kwargs) - with _replace_init_method(DataLoader, "dataset"): + with _replace_dunder_methods(DataLoader, 
"dataset"): dataloader = LoaderSubclass(range(10)) assert dataloader.__pl_saved_args == (range(10),) @@ -351,6 +382,90 @@ def __init__(self, dataset, *args, batch_size=10, **kwargs): assert dataloader.__dataset == range(10) +def test_replace_dunder_methods_attrs(): + """This test checks, that all the calls from setting and deleting attributes within `_replace_dunder_methods` + are correctly preserved even after reinstantiation. + + It also includes a custom `__setattr__` + """ + + class Loader(DataLoader): + def __setattr__(self, attr, val): + if attr == "custom_arg": + val = val + 2 + super().__setattr__(attr, val) + + with _replace_dunder_methods(DataLoader, "dataset"): + dataloader = Loader(range(10)) + dataloader.custom_arg = 5 + dataloader.my_arg = 10 + dataloader.another_arg = 100 + del dataloader.dataset + try: + del dataloader.abc_arg + except AttributeError: + pass + + assert dataloader.__pl_saved_args == (range(10),) + assert dataloader.__pl_saved_kwargs == {} + assert dataloader.__pl_saved_arg_names == ("dataset",) + assert dataloader.__dataset == range(10) + assert dataloader.custom_arg == 7 + assert dataloader.my_arg == 10 + assert dataloader.another_arg == 100 + assert not hasattr(dataloader, "dataset") + assert dataloader.__pl_attrs_record == [ + (("custom_arg", 5), _WrapAttrTag.SET), + (("my_arg", 10), _WrapAttrTag.SET), + (("another_arg", 100), _WrapAttrTag.SET), + (("dataset",), _WrapAttrTag.DEL), + ] + + dataloader = _update_dataloader(dataloader, dataloader.sampler) + assert dataloader.custom_arg == 7 + assert dataloader.my_arg == 10 + assert dataloader.another_arg == 100 + assert not hasattr(dataloader, "dataset") + + +def test_replace_dunder_methods_restore_methods(): + """This test checks whether all dunder methods are restored to their original versions.""" + + class Init(DataLoader): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + class SetAttr(DataLoader): + def __setattr__(self, *args): + return 
super().__setattr__(*args) + + class DelAttr(DataLoader): + def __delattr__(self, *args): + return super().__delattr__(*args) + + class InitAndSetAttr(Init, SetAttr): + pass + + class InitAndDelAttr(Init, DelAttr): + pass + + class SetAttrAndDelAttr(SetAttr, DelAttr): + pass + + class AllDunder(Init, SetAttr, DelAttr): + pass + + before = dict() + for cls in (Init, SetAttr, DelAttr, InitAndSetAttr, InitAndDelAttr, SetAttrAndDelAttr, AllDunder): + before[cls] = {"init": cls.__init__, "setattr": cls.__setattr__, "delattr": cls.__delattr__} + + with _replace_dunder_methods(DataLoader, "dataset"): + pass + + for cls in (Init, SetAttr, DelAttr, InitAndSetAttr, InitAndDelAttr, SetAttrAndDelAttr, AllDunder): + assert before[cls] == {"init": cls.__init__, "setattr": cls.__setattr__, "delattr": cls.__delattr__} + + @pytest.mark.parametrize("predicting", [True, False]) def test_custom_batch_sampler(predicting): """This test asserts, that custom `BatchSampler`, with all the arguments, that are required in order to @@ -367,8 +482,8 @@ def __init__(self, sampler, extra_arg, drop_last=True): super().__init__(sampler, 10, drop_last) sampler = RandomSampler(range(10)) - with _replace_init_method(BatchSampler): - # instantiate within `_replace_init_method` context manager, simulating `*_dataloader` hooks + with _replace_dunder_methods(BatchSampler): + # instantiate within `_replace_dunder_method` context manager, simulating `*_dataloader` hooks batch_sampler = MyBatchSampler(sampler, "random_str") dataloader = DataLoader(range(10), batch_sampler=batch_sampler) @@ -413,8 +528,8 @@ def __init__(self, sampler, extra_arg): super().__init__(sampler, 10, False) sampler = RandomSampler(range(10)) - with _replace_init_method(BatchSampler): - # instantiate within `_replace_init_method` context manager, simulating `*_dataloader` hooks + with _replace_dunder_methods(BatchSampler): + # instantiate within `_replace_dunder_method` context manager, simulating `*_dataloader` hooks batch_sampler = 
MyBatchSampler(sampler, "random_str") dataloader = DataLoader(range(10), batch_sampler=batch_sampler) @@ -440,8 +555,8 @@ def __init__(self, extra_arg): self.extra_arg = extra_arg super().__init__(RandomSampler(range(10)), 10, False) - with _replace_init_method(BatchSampler): - # instantiate within `_replace_init_method` context manager, simulating `*_dataloader` hooks + with _replace_dunder_methods(BatchSampler): + # instantiate within `_replace_dunder_method` context manager, simulating `*_dataloader` hooks batch_sampler = MyBatchSampler("random_str") dataloader = DataLoader(range(10), batch_sampler=batch_sampler) diff --git a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py index 38f72b555d52d..7c17b3d9f7642 100644 --- a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py +++ b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py @@ -113,7 +113,7 @@ def test_submodules_multi_gpu_ddp_spawn(tmpdir): ], ) @RunIf(min_cuda_gpus=1) -def test_gpu_cuda_device(device): +def test_cuda_device(device): model = TopModule() model.cuda(device) @@ -122,3 +122,25 @@ def test_gpu_cuda_device(device): assert device.type == "cuda" assert device.index is not None assert device.index == torch.cuda.current_device() + + +@RunIf(min_cuda_gpus=2) +def test_cuda_current_device(): + """Test that calling .cuda() moves the model to the correct device and respects current cuda device setting.""" + + class CudaModule(DeviceDtypeModuleMixin): + def __init__(self): + super().__init__() + self.layer = nn.Linear(1, 1) + + model = CudaModule() + + torch.cuda.set_device(0) + model.cuda(1) + assert model.device == torch.device("cuda", 1) + assert model.layer.weight.device == torch.device("cuda", 1) + + torch.cuda.set_device(1) + model.cuda() # model is already on device 1, and calling .cuda() without device index should not move model + assert model.device == torch.device("cuda", 1) + assert model.layer.weight.device == 
torch.device("cuda", 1)