diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
index 5f251f86c204b..01e02b6a80869 100644
--- a/.github/checkgroup.yml
+++ b/.github/checkgroup.yml
@@ -6,7 +6,7 @@ subprojects:
   - id: "pytorch_lightning: Tests workflow"
     paths:
       - ".actions/**"
-      - ".github/workflows/ci-pytorch-tests.yml"
+      - ".github/workflows/ci-tests-pytorch.yml"
       - "requirements/fabric/**"
      - "src/lightning_fabric/**"
      - "requirements/pytorch/**"
@@ -178,7 +178,7 @@ subprojects:
       - "src/lightning_fabric/**"
       - "tests/tests_fabric/**"
       - "setup.cfg" # includes pytest config
-      - ".github/workflows/ci-fabric-tests.yml"
+      - ".github/workflows/ci-tests-fabric.yml"
       - "!requirements/*/docs.txt"
       - "!*.md"
       - "!**/*.md"
@@ -223,7 +223,7 @@ subprojects:
   - id: "lightning_app: Tests workflow"
     paths:
       - ".actions/**"
-      - ".github/workflows/ci-app-tests.yml"
+      - ".github/workflows/ci-tests-app.yml"
       - "src/lightning_app/**"
       - "tests/tests_app/**"
       - "requirements/app/**"
@@ -245,7 +245,7 @@ subprojects:
   - id: "lightning_app: Examples"
     paths:
       - ".actions/**"
-      - ".github/workflows/ci-app-examples.yml"
+      - ".github/workflows/ci-examples-app.yml"
       - "src/lightning_app/**"
      - "tests/tests_examples_app/**"
      - "examples/app_*/**"
diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index 3437dd03e6d50..9f3d7a05584b7 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -4,10 +4,10 @@
 
 ## Unit and Integration Testing
 
-| workflow name              | workflow file                               | action                                                                                              | accelerator\* |
-| -------------------------- | ------------------------------------------- | --------------------------------------------------------------------------------------------------- | ------------- |
-| Test PyTorch full          | .github/workflows/ci-pytorch-tests.yml      | Run all tests except for accelerator-specific, standalone and slow tests.                          | CPU           |
-| Test PyTorch slow          | .github/workflows/ci-pytorch-tests-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU           |
+| workflow name     | workflow file                          | action                                                                     | accelerator\* |
+| ----------------- | -------------------------------------- | -------------------------------------------------------------------------- | ------------- |
+| Test PyTorch full | .github/workflows/ci-tests-pytorch.yml | Run all tests except for accelerator-specific, standalone and slow tests.  | CPU           |
+
 | pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml         | Run only IPU-specific tests.                                                                                                                                                 | IPU |
 | pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml         | Run only HPU-specific tests.                                                                                                                                                 | HPU |
 | pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests-pytorch.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU |
diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-dockers-pytorch.yml
similarity index 100%
rename from .github/workflows/ci-pytorch-dockers.yml
rename to .github/workflows/ci-dockers-pytorch.yml
diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-examples-app.yml
similarity index 96%
rename from .github/workflows/ci-app-examples.yml
rename to .github/workflows/ci-examples-app.yml
index 9c7d1ac844a35..f68705a032b21 100644
--- a/.github/workflows/ci-app-examples.yml
+++ b/.github/workflows/ci-examples-app.yml
@@ -9,7 +9,7 @@ on:
     types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped
     paths:
       - ".actions/**"
-      - ".github/workflows/ci-app-examples.yml"
+      - ".github/workflows/ci-examples-app.yml"
       - "src/lightning_app/**"
       - "tests/tests_examples_app/**"
       - "examples/app_*/**"
@@ -89,7 +89,8 @@ jobs:
       - name: Install Lightning package
         env:
           PACKAGE_NAME: ${{ matrix.pkg-name }}
-        run: pip install -e .
+        # do not use -e because it will make both packages available since it adds `src` to `sys.path` automatically
+        run: pip install .
 
       - name: Adjust tests
         if: ${{ matrix.pkg-name == 'lightning' }}
diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-tests-app.yml
similarity index 99%
rename from .github/workflows/ci-app-tests.yml
rename to .github/workflows/ci-tests-app.yml
index 8a7cb314b0ff5..32f529e3475c4 100644
--- a/.github/workflows/ci-app-tests.yml
+++ b/.github/workflows/ci-tests-app.yml
@@ -9,7 +9,7 @@ on:
     types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped
     paths:
       - ".actions/**"
-      - ".github/workflows/ci-app-tests.yml"
+      - ".github/workflows/ci-tests-app.yml"
       - "src/lightning_app/**"
       - "tests/tests_app/**"
       - "requirements/app/**"
diff --git a/.github/workflows/ci-lite-tests.yml b/.github/workflows/ci-tests-fabric.yml
similarity index 67%
rename from .github/workflows/ci-lite-tests.yml
rename to .github/workflows/ci-tests-fabric.yml
index bc25b4982743d..ca47828cd679e 100644
--- a/.github/workflows/ci-lite-tests.yml
+++ b/.github/workflows/ci-tests-fabric.yml
@@ -1,4 +1,4 @@
-name: Test Lite
+name: Test Fabric
 
 # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows
 on:
@@ -9,11 +9,11 @@ on:
     types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped
     paths:
       - ".actions/**"
-      - "requirements/lite/**"
-      - "src/lightning_lite/**"
-      - "tests/tests_lite/**"
+      - "requirements/fabric/**"
+      - "src/lightning_fabric/**"
+      - "tests/tests_fabric/**"
       - "setup.cfg" # includes pytest config
-      - ".github/workflows/ci-lite-tests.yml"
+      - ".github/workflows/ci-tests-fabric.yml"
       - "!requirements/*/docs.txt"
       - "!*.md"
       - "!**/*.md"
@@ -30,7 +30,7 @@ defaults:
     shell: bash
 
 jobs:
-  lite-cpu:
+  fabric-cpu:
     runs-on: ${{ matrix.os }}
     if: github.event.pull_request.draft == false
     strategy:
       fail-fast: false
       matrix:
         include:
           # assign python and pytorch version combinations to operating systems (arbitrarily)
           # note: there's no distribution of torch==1.10 for Python>=3.10
-          - {os: "macOS-11", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.11"}
-          - {os: "macOS-11", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.12"}
-          - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.8", pytorch-version: "1.10"}
-          - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.11"}
-          - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.12"}
-          - {os: "windows-2022", pkg-name: "lite", python-version: "3.9", pytorch-version: "1.11"}
-          - {os: "windows-2022", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.12"}
+          - {os: "macOS-11", pkg-name: "fabric", python-version: "3.8", pytorch-version: "1.11"}
+          - {os: "macOS-11", pkg-name: "fabric", python-version: "3.9", pytorch-version: "1.12"}
+          - {os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.8", pytorch-version: "1.10"}
+          - {os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.9", pytorch-version: "1.11"}
+          - {os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.10", pytorch-version: "1.12"}
+          - {os: "windows-2022", pkg-name: "fabric", python-version: "3.9", pytorch-version: "1.11"}
+          - {os: "windows-2022", pkg-name: "fabric", python-version: "3.10", pytorch-version: "1.12"}
           # only run PyTorch latest with Python latest
-          - {os: "macOS-11", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.13"}
-          - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.13"}
-          - {os: "windows-2022", pkg-name: "lite", python-version: "3.10", pytorch-version: "1.13"}
+          - {os: "macOS-11", pkg-name: "fabric", python-version: "3.10", pytorch-version: "1.13"}
+          - {os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.10", pytorch-version: "1.13"}
+          - {os: "windows-2022", pkg-name: "fabric", python-version: "3.10", pytorch-version: "1.13"}
           # "oldest" versions tests, only on minimum Python
-          - {os: "macOS-11", pkg-name: "lite", python-version: "3.7", pytorch-version: "1.10", requires: "oldest"}
-          - {os: "ubuntu-20.04", pkg-name: "lite", python-version: "3.7", pytorch-version: "1.10", requires: "oldest"}
-          - {os: "windows-2022", pkg-name: "lite", python-version: "3.7", pytorch-version: "1.10", requires: "oldest"}
+          - {os: "macOS-11", pkg-name: "fabric", python-version: "3.7", pytorch-version: "1.10", requires: "oldest"}
+          - {os: "ubuntu-20.04", pkg-name: "fabric", python-version: "3.7", pytorch-version: "1.10", requires: "oldest"}
+          - {os: "windows-2022", pkg-name: "fabric", python-version: "3.7", pytorch-version: "1.10", requires: "oldest"}
           # "lightning" installs the monolithic package
           - {os: "macOS-11", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"}
           - {os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.8", pytorch-version: "1.13"}
@@ -87,8 +87,8 @@ jobs:
       - name: Adjust PyTorch versions in requirements files
         if: ${{ matrix.requires != 'oldest' }}
         run: |
-          python ./requirements/pytorch/adjust-versions.py requirements/lite/base.txt ${{ matrix.pytorch-version }}
-          cat requirements/lite/base.txt
+          python ./requirements/pytorch/adjust-versions.py requirements/fabric/base.txt ${{ matrix.pytorch-version }}
+          cat requirements/fabric/base.txt
 
       - name: Get pip cache dir
         id: pip-cache
@@ -98,7 +98,7 @@ jobs:
         uses: actions/cache@v3
         with:
           path: ${{ steps.pip-cache.outputs.dir }}
-          key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.pkg-name }}-${{ matrix.release }}-${{ matrix.requires }}-${{ hashFiles('requirements/lite/*.txt') }}
+          key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.pkg-name }}-${{ matrix.release }}-${{ matrix.requires }}-${{ hashFiles('requirements/fabric/*.txt') }}
           restore-keys: |
             ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.pkg-name }}-${{ matrix.release }}-${{ matrix.requires }}-
@@ -109,27 +109,27 @@ jobs:
         env:
           PACKAGE_NAME: ${{ matrix.pkg-name }}
         run: |
-          pip install -e . "pytest-timeout" -r requirements/lite/devel.txt --upgrade --find-links ${TORCH_URL}
+          pip install -e . "pytest-timeout" -r requirements/fabric/devel.txt --upgrade --find-links ${TORCH_URL}
           pip list
 
       - name: Adjust tests
         if: ${{ matrix.pkg-name == 'lightning' }}
         run: |
           python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
-            --source_import="lightning_lite" --target_import="lightning.lite"
+            --source_import="lightning_fabric" --target_import="lightning.fabric"
 
       - name: Testing Warnings
         # the stacklevel can only be set on >=3.7
         if: matrix.python-version != '3.7'
-        working-directory: tests/tests_lite
+        working-directory: tests/tests_fabric
         # needs to run outside of `pytest`
         run: python utilities/test_warnings.py
 
       - name: Switch coverage scope
-        run: python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_lite'))" >> $GITHUB_ENV
+        run: python -c "print('COVERAGE_SCOPE=' + str('lightning' if '${{matrix.pkg-name}}' == 'lightning' else 'lightning_fabric'))" >> $GITHUB_ENV
 
-      - name: Testing Lite
-        working-directory: tests/tests_lite
+      - name: Testing Fabric
+        working-directory: tests/tests_fabric
         # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003
         run: coverage run --source ${COVERAGE_SCOPE} -m pytest -v --timeout=30 --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml
@@ -138,11 +138,11 @@ jobs:
         uses: actions/upload-artifact@v3
         with:
           name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}
-          path: tests/tests_lite/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml
+          path: tests/tests_fabric/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml
 
       - name: Statistics
         if: success()
-        working-directory: tests/tests_lite
+        working-directory: tests/tests_fabric
         run: |
           coverage report
           coverage xml
@@ -153,7 +153,7 @@ jobs:
         continue-on-error: true
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
-          file: tests/tests_lite/coverage.xml
+          file: tests/tests_fabric/coverage.xml
          flags: ${COVERAGE_SCOPE},cpu,pytest,python${{ matrix.python-version }}
          name: CPU-coverage
          fail_ci_if_error: false
diff --git a/.github/workflows/ci-pytorch-tests.yml b/.github/workflows/ci-tests-pytorch.yml
similarity index 95%
rename from .github/workflows/ci-pytorch-tests.yml
rename to .github/workflows/ci-tests-pytorch.yml
index a9286ba8b19e2..cfbf7fe8c5776 100644
--- a/.github/workflows/ci-pytorch-tests.yml
+++ b/.github/workflows/ci-tests-pytorch.yml
@@ -3,9 +3,9 @@ name: Test PyTorch
 # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows
 on:
   push:
-    branches: [master, "release/*", "lite/debug"]
+    branches: [master, "release/*"]
   pull_request:
-    branches: [master, "release/*", "lite/debug"]
+    branches: [master, "release/*"]
     types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped
     paths:
       - ".actions/**"
@@ -14,9 +14,9 @@ on:
       - "tests/tests_pytorch/**"
       - "tests/legacy/back-compatible-versions.txt"
       - "setup.cfg" # includes pytest config
-      - ".github/workflows/ci-pytorch-tests.yml"
-      - "requirements/fabric/**"
-      - "src/lightning_fabric/**"
+      - ".github/workflows/ci-tests-pytorch.yml"
+      - "requirements/lite/**"
+      - "src/lightning_lite/**"
       - "!requirements/pytorch/docs.txt"
       - "!*.md"
       - "!**/*.md"
@@ -104,7 +104,7 @@ jobs:
       - name: Adjust PyTorch versions in requirements files
         if: ${{ matrix.requires != 'oldest' }}
         run: |
-          python ./requirements/pytorch/adjust-versions.py requirements/fabric/base.txt ${{ matrix.pytorch-version }}
+          python ./requirements/pytorch/adjust-versions.py requirements/lite/base.txt ${{ matrix.pytorch-version }}
           python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${{ matrix.pytorch-version }}
           python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${{ matrix.pytorch-version }}
           cat requirements/pytorch/base.txt
@@ -171,8 +171,8 @@ jobs:
         if: ${{ matrix.pkg-name == 'lightning' }}
         run: |
           python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
-            --source_import="pytorch_lightning,lightning_fabric" \
-            --target_import="lightning.pytorch,lightning.fabric"
+            --source_import="pytorch_lightning,lightning_lite" \
+            --target_import="lightning.pytorch,lightning.lite"
 
       - name: Testing Warnings
         # the stacklevel can only be set on >=3.7
diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml
index 23a8395ba2f6f..34e3bb16d10a9 100644
--- a/.github/workflows/release-pypi.yml
+++ b/.github/workflows/release-pypi.yml
@@ -109,6 +109,7 @@ jobs:
           branch = f"origin/builds/{os.getenv('TAG')}"
           while True:
               remote_refs = [b.name for b in repo.remote().refs]
+              print([n for n in remote_refs if "builds" in n])
               if branch in remote_refs:
                   break
               time.sleep(60)
diff --git a/.gitignore b/.gitignore
index 982309a8a356a..03c30bc8caa2a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -110,8 +110,8 @@ celerybeat-schedule
 
 # dotenv
 .env
-.env_staging
-.env_local
+.env.staging
+.env.local
 
 # virtualenv
 .venv
diff --git a/README.md b/README.md
index d1244ab1b5972..19c618122a8a8 100644
--- a/README.md
+++ b/README.md
@@ -96,9 +96,9 @@ Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs
 | Linux py3.7 \[TPUs\*\*\*\] | - | - |
 | Linux py3.8 \[IPUs\] | - | - |
 | Linux py3.8 \[HPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - |
-| Linux py3.{7,9} | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-tests.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-tests.yml) |
-| OSX py3.{7,9} | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-tests.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-tests.yml) |
-| Windows py3.{7,9} | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-tests.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-tests.yml) |
+| Linux py3.{7,9} | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml) |
+| OSX py3.{7,9} | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml) |
+| Windows py3.{7,9} | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml) |
 
 - _\*\* tests run on two NVIDIA P100_
 - _\*\*\* tests run on Google GKE TPUv2/3. TPU py3.7 means we support Colab and Kaggle env._
diff --git a/docs/source-app/api_references.rst b/docs/source-app/api_references.rst
index 2272f7bf13c41..931a9864d261f 100644
--- a/docs/source-app/api_references.rst
+++ b/docs/source-app/api_references.rst
@@ -45,7 +45,7 @@ ___________________
     ~multi_node.lite.LiteMultiNode
     ~multi_node.pytorch_spawn.PyTorchSpawnMultiNode
     ~multi_node.trainer.LightningTrainerMultiNode
-    ~auto_scaler.AutoScaler
+    ~serve.auto_scaler.AutoScaler
 
 ----
diff --git a/docs/source-app/levels/basic/hello_components/pl_multinode.py b/docs/source-app/levels/basic/hello_components/pl_multinode.py
index e6764ee8fafae..9df12ec732684 100644
--- a/docs/source-app/levels/basic/hello_components/pl_multinode.py
+++ b/docs/source-app/levels/basic/hello_components/pl_multinode.py
@@ -10,7 +10,7 @@ def run(self):
         trainer = L.Trainer(max_epochs=10, strategy="ddp")
         trainer.fit(model)
 
-# 8 GPU: (2 nodes of 4 x v100)
+# 8 GPUs: (2 nodes of 4 x v100)
 component = LightningTrainerMultiNode(
     LightningTrainerDistributed,
     num_nodes=4,
diff --git a/docs/source-pytorch/index.rst b/docs/source-pytorch/index.rst
index 50b02c483997f..c34033a8f7534 100644
--- a/docs/source-pytorch/index.rst
+++ b/docs/source-pytorch/index.rst
@@ -64,6 +64,8 @@ Conda users
 
 Or read the `advanced install guide `_
 
+We are fully compatible with any stable PyTorch version v1.10 and above.
+
 .. raw:: html
 
    <div
diff --git a/examples/app_boring/app.py b/examples/app_boring/app.py
index aad288a11acb4..78a9b1c819f06 100644
--- a/examples/app_boring/app.py
+++ b/examples/app_boring/app.py
@@ -43,6 +43,10 @@ def __init__(self):
             raise_exception=True,
         )
 
+    @property
+    def ready(self) -> bool:
+        return self.dest_work.is_running
+
     def run(self):
         self.source_work.run()
         if self.source_work.has_succeeded:
diff --git a/examples/app_display_name/.lightningignore b/examples/app_display_name/.lightningignore
new file mode 100644
index 0000000000000..f7275bbbd035b
--- /dev/null
+++ b/examples/app_display_name/.lightningignore
@@ -0,0 +1 @@
+venv/
diff --git a/examples/app_display_name/app.py b/examples/app_display_name/app.py
new file mode 100644
index 0000000000000..f06d8ee562fdf
--- /dev/null
+++ b/examples/app_display_name/app.py
@@ -0,0 +1,25 @@
+import lightning as L
+
+
+class Work(L.LightningWork):
+    def __init__(self, start_with_flow=True):
+        super().__init__(start_with_flow=start_with_flow)
+
+    def run(self):
+        pass
+
+
+class Flow(L.LightningFlow):
+    def __init__(self):
+        super().__init__()
+        self.w = Work()
+        self.w1 = Work(start_with_flow=False)
+        self.w.display_name = "My Custom Name"  # Not supported yet
+        self.w1.display_name = "My Custom Name 1"
+
+    def run(self):
+        self.w.run()
+        self.w1.run()
+
+
+app = L.LightningApp(Flow())
diff --git a/examples/app_multi_node/train_fabric.py b/examples/app_multi_node/train_fabric.py
index ecf76d1fdb059..5a1751e538e09 100644
--- a/examples/app_multi_node/train_fabric.py
+++ b/examples/app_multi_node/train_fabric.py
@@ -31,7 +31,7 @@ def run(self):
             optimizer.step()
 
 
-# Run over 2 nodes of 4 x V100
+# 8 GPUs: (2 nodes of 4 x v100)
 app = L.LightningApp(
     LiteMultiNode(
         LitePyTorchDistributed,
diff --git a/examples/app_multi_node/train_lt.py b/examples/app_multi_node/train_lt.py
index 4abe375c89b9b..8ed62a10fb9de 100644
--- a/examples/app_multi_node/train_lt.py
+++ b/examples/app_multi_node/train_lt.py
@@ -11,10 +11,10 @@ def run(self):
         trainer.fit(model)
 
 
-# 8 GPU: (2 nodes of 4 x v100)
+# 8 GPUs: (2 nodes of 4 x v100)
 component = LightningTrainerMultiNode(
     LightningTrainerDistributed,
-    num_nodes=4,
+    num_nodes=2,
     cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x v100
 )
 app = L.LightningApp(component)
diff --git a/examples/app_multi_node/train_lt_script.py b/examples/app_multi_node/train_lt_script.py
index d2254e19daac0..58f847368346c 100644
--- a/examples/app_multi_node/train_lt_script.py
+++ b/examples/app_multi_node/train_lt_script.py
@@ -2,11 +2,11 @@ import lightning as L
 from lightning.app.components import LightningTrainerScript
 from lightning.app.utilities.packaging.cloud_compute import CloudCompute
 
-# Run over 2 nodes of 4 x V100
+# 8 GPUs: (2 nodes of 4 x v100)
 app = L.LightningApp(
     LightningTrainerScript(
         "pl_boring_script.py",
         num_nodes=2,
-        cloud_compute=CloudCompute("gpu-fast-multi"),
+        cloud_compute=CloudCompute("gpu-fast-multi"),  # 4 x v100
     ),
 )
diff --git a/examples/app_multi_node/train_pytorch.py b/examples/app_multi_node/train_pytorch.py
index cc9e84297c151..e5a9a1fc93e3b 100644
--- a/examples/app_multi_node/train_pytorch.py
+++ b/examples/app_multi_node/train_pytorch.py
@@ -56,6 +56,6 @@ def run(self, main_address: str, main_port: int, num_nodes: int, node_rank: int)
 
 
 # 8 GPUs: (2 nodes x 4 v 100)
-compute = L.CloudCompute("gpu-fast-multi")  # 4xV100
+compute = L.CloudCompute("gpu-fast-multi")  # 4 x v100
 component = MultiNode(PyTorchDistributed, num_nodes=2, cloud_compute=compute)
 app = L.LightningApp(component)
diff --git a/examples/app_multi_node/train_pytorch_spawn.py b/examples/app_multi_node/train_pytorch_spawn.py
index d29ec83562ffb..165a0c77dbfa9 100644
--- a/examples/app_multi_node/train_pytorch_spawn.py
+++ b/examples/app_multi_node/train_pytorch_spawn.py
@@ -42,11 +42,11 @@ def run(
             optimizer.step()
 
 
-# Run over 2 nodes of 4 x V100
+# 8 GPUs: (2 nodes x 4 v 100)
 app = L.LightningApp(
     PyTorchSpawnMultiNode(
         PyTorchDistributed,
         num_nodes=2,
-        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x V100
+        cloud_compute=L.CloudCompute("gpu-fast-multi"),  # 4 x v100
     )
 )
diff --git a/examples/app_server_with_auto_scaler/app.py b/examples/app_server_with_auto_scaler/app.py
index 70799827776a8..2c8fb744c4fcf 100644
--- a/examples/app_server_with_auto_scaler/app.py
+++ b/examples/app_server_with_auto_scaler/app.py
@@ -1,5 +1,5 @@
 # ! pip install torch torchvision
-from typing import Any, List
+from typing import List
 
 import torch
 import torchvision
@@ -8,16 +8,12 @@
 import lightning as L
 
 
-class RequestModel(BaseModel):
-    image: str  # bytecode
-
-
 class BatchRequestModel(BaseModel):
-    inputs: List[RequestModel]
+    inputs: List[L.app.components.Image]
 
 
 class BatchResponse(BaseModel):
-    outputs: List[Any]
+    outputs: List[L.app.components.Number]
 
 
 class PyTorchServer(L.app.components.PythonServer):
@@ -79,10 +75,11 @@ def scale(self, replicas: int, metrics: dict) -> int:
     # autoscaler specific args
     min_replicas=1,
     max_replicas=4,
-    autoscale_interval=10,
+    scale_out_interval=10,
+    scale_in_interval=10,
     endpoint="predict",
-    input_type=RequestModel,
-    output_type=Any,
+    input_type=L.app.components.Image,
+    output_type=L.app.components.Number,
     timeout_batching=1,
     max_batch_size=8,
 )
diff --git a/examples/pl_loops/kfold.py b/examples/pl_loops/kfold.py
index af777df6f211d..38103d95ff3a6 100644
--- a/examples/pl_loops/kfold.py
+++ b/examples/pl_loops/kfold.py
@@ -152,12 +152,12 @@ def test_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> None
 #           self.reset(...)                                                                    #
 #           self.on_run_start(...)                                                             #
 #                                                                                              #
-#            while not self.done:                                                              #
-#                self.on_advance_start(...)                                                    #
-#                self.advance(...)                                                             #
-#                self.on_advance_end(...)                                                      #
+#           while not self.done:                                                               #
+#               self.on_advance_start(...)                                                     #
+#               self.advance(...)                                                              #
+#               self.on_advance_end(...)                                                       #
 #                                                                                              #
-#            return self.on_run_end(...)                                                       #
+#           return self.on_run_end(...)                                                        #
 #                                                                                              #
 #############################################################################################
diff --git a/pyproject.toml b/pyproject.toml
index 1f23e7a63e545..782ba81779de3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -79,8 +79,8 @@ module = [
     "lightning_app.components.serve.types.image",
     "lightning_app.components.serve.types.type",
     "lightning_app.components.serve.python_server",
+    "lightning_app.components.serve.auto_scaler",
     "lightning_app.components.training",
-    "lightning_app.components.auto_scaler",
     "lightning_app.core.api",
     "lightning_app.core.app",
     "lightning_app.core.flow",
diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt
index 0bc4a9ee277c6..3884652fd66c1 100644
--- a/requirements/fabric/base.txt
+++ b/requirements/fabric/base.txt
@@ -2,7 +2,7 @@
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
 numpy>=1.17.2, <1.23.1
-torch>=1.10.0, <=1.13.0
+torch>=1.10.0, <=1.13.1
 fsspec[http]>2021.06.0, <2022.6.0
 packaging>=17.0, <=21.3
 typing-extensions>=4.0.0, <=4.4.0
diff --git a/requirements/fabric/examples.txt b/requirements/fabric/examples.txt
index 43bb03e07cc80..e4d4136b6b0c4 100644
--- a/requirements/fabric/examples.txt
+++ b/requirements/fabric/examples.txt
@@ -1,4 +1,4 @@
 # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
-torchvision>=0.10.0, <=0.13.0
+torchvision>=0.10.0, <=0.14.1
diff --git a/requirements/pytorch/adjust-versions.py b/requirements/pytorch/adjust-versions.py
index c9ed4c0770427..681886b966b75 100644
--- a/requirements/pytorch/adjust-versions.py
+++ b/requirements/pytorch/adjust-versions.py
@@ -5,7 +5,9 @@
 
 # IMPORTANT: this list needs to be sorted in reverse
 VERSIONS = [
-    dict(torch="1.13.0", torchvision="0.14.0"),  # stable
+    dict(torch="1.14.0", torchvision="0.15.0"),  # nightly
+    dict(torch="1.13.1", torchvision="0.14.1"),  # stable
+    dict(torch="1.13.0", torchvision="0.14.0"),
     dict(torch="1.12.1", torchvision="0.13.1"),
     dict(torch="1.12.0", torchvision="0.13.0"),
     dict(torch="1.11.0", torchvision="0.12.0"),
diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt
index 01b2b8483cc29..dede48350a3fe 100644
--- a/requirements/pytorch/base.txt
+++ b/requirements/pytorch/base.txt
@@ -2,7 +2,7 @@
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
 numpy>=1.17.2, <1.23.1
-torch>=1.10.0, <=1.13.0
+torch>=1.10.0, <=1.13.1
 tqdm>=4.57.0, <4.65.0
 PyYAML>=5.4, <=6.0
 fsspec[http]>2021.06.0, <2022.8.0
diff --git a/requirements/pytorch/examples.txt b/requirements/pytorch/examples.txt
index 7e02a2f4bea99..ebd1bf0576b52 100644
--- a/requirements/pytorch/examples.txt
+++ b/requirements/pytorch/examples.txt
@@ -1,6 +1,6 @@
 # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
 # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
-torchvision>=0.11.1, <=0.14.0
+torchvision>=0.11.1, <=0.14.1
 gym[classic_control]>=0.17.0, <0.26.3
 ipython[all] <8.6.1
diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md
index b8fc07e17fc05..4643f64be5c21 100644
--- a/src/lightning_app/CHANGELOG.md
+++ b/src/lightning_app/CHANGELOG.md
@@ -12,10 +12,27 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Added a progress bar while connecting to an app through the CLI ([#16035](https://github.com/Lightning-AI/lightning/pull/16035))
 
+- Added partial support for fastapi `Request` annotation in `configure_api` handlers ([#16047](https://github.com/Lightning-AI/lightning/pull/16047))
+
+- Added a nicer UI with URL and examples for the autoscaler component ([#16063](https://github.com/Lightning-AI/lightning/pull/16063))
+
+- Enabled users to have more control over scaling out/in interval ([#16093](https://github.com/Lightning-AI/lightning/pull/16093))
+
+- Added more datatypes to serving component ([#16018](https://github.com/Lightning-AI/lightning/pull/16018))
+
+- Added `work.delete` method to delete the work ([#16103](https://github.com/Lightning-AI/lightning/pull/16103))
+
+- Added `display_name` property to LightningWork for the cloud ([#16095](https://github.com/Lightning-AI/lightning/pull/16095))
+
 ### Changed
 
--
+
+- The default `start_method` for creating Work processes locally on MacOS is now 'spawn' (previously 'fork') ([#16089](https://github.com/Lightning-AI/lightning/pull/16089))
+
+
+- The utility `lightning.app.utilities.cloud.is_running_in_cloud` now returns `True` during loading of the app locally when running with `--cloud` ([#16045](https://github.com/Lightning-AI/lightning/pull/16045))
+
 
 ### Deprecated
 
@@ -32,8 +49,19 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed `AutoScaler` raising an exception when non-default cloud compute is specified ([#15991](https://github.com/Lightning-AI/lightning/pull/15991))
 
+- Fixed the debugger detection mechanism for lightning App in VSCode ([#16068](https://github.com/Lightning-AI/lightning/pull/16068))
+
+
+- Fixed bug where components that are re-instantiated several times failed to initialize if they were modifying `self.lightningignore` ([#16080](https://github.com/Lightning-AI/lightning/pull/16080))
+
+
+- Fixed a bug where apps that had previously been deleted could not be run again from the CLI ([#16082](https://github.com/Lightning-AI/lightning/pull/16082))
+
 - Fixed MPS error for multinode component (defaults to cpu on mps devices now as distributed operations are not supported by pytorch on mps) ([#15748](https://github.com/Lightning-AI/lightning/pull/15748))
 
+- Fixed a bug where `AutoScaler` would fail with min_replicas=0 ([#16092](https://github.com/Lightning-AI/lightning/pull/16092))
+
+
 ## [1.8.4] - 2022-12-08
 
 ### Added
diff --git a/src/lightning_app/api/http_methods.py b/src/lightning_app/api/http_methods.py
index ca09a9a83eecc..379e87cb68676 100644
--- a/src/lightning_app/api/http_methods.py
+++ b/src/lightning_app/api/http_methods.py
@@ -2,12 +2,14 @@
 import inspect
 import time
 from copy import deepcopy
+from dataclasses import dataclass
 from functools import wraps
 from multiprocessing import Queue
 from typing import Any, Callable, Dict, List, Optional
 from uuid import uuid4
 
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, HTTPException, Request, status
+from lightning_utilities.core.apply_func import apply_to_collection
 
 from lightning_app.api.request_types import _APIRequest, _CommandRequest, _RequestResponse
 from lightning_app.utilities.app_helpers import Logger
@@ -19,6 +21,77 @@ def _signature_proxy_function():
     pass
 
 
+@dataclass
+class _FastApiMockRequest:
+    """This class is meant to mock the FastAPI `Request` class, which isn't pickle-able.
+
+    If a user relies on the FastAPI `Request` annotation, the Lightning framework
+    patches the annotations before pickling and replaces them right after.
+
+    Finally, the FastAPI request is converted back to the _FastApiMockRequest
+    before being delivered to the users.
+
+    Example:
+
+        import lightning as L
+        from fastapi import Request
+        from lightning.app.api import Post
+
+        class Flow(L.LightningFlow):
+
+            def request(self, request: Request) -> OutputRequestModel:
+                ...
+
+            def configure_api(self):
+                return [Post("/api/v1/request", self.request)]
+    """
+
+    _body: Optional[str] = None
+    _json: Optional[str] = None
+    _method: Optional[str] = None
+    _headers: Optional[Dict] = None
+
+    @property
+    def receive(self):
+        raise NotImplementedError
+
+    @property
+    def method(self):
+        return self._method
+
+    @property
+    def headers(self):
+        return self._headers
+
+    def body(self):
+        return self._body
+
+    def json(self):
+        return self._json
+
+    def stream(self):
+        raise NotImplementedError
+
+    def form(self):
+        raise NotImplementedError
+
+    def close(self):
+        raise NotImplementedError
+
+    def is_disconnected(self):
+        raise NotImplementedError
+
+
+async def _mock_fastapi_request(request: Request):
+    # TODO: Add more requests parameters.
+    return _FastApiMockRequest(
+        _body=await request.body(),
+        _json=await request.json(),
+        _headers=request.headers,
+        _method=request.method,
+    )
+
+
 class _HttpMethod:
     def __init__(self, route: str, method: Callable, method_name: Optional[str] = None, timeout: int = 30, **kwargs):
         """This class is used to inject user defined methods within the App Rest API.
@@ -34,6 +107,7 @@ def __init__(self, route: str, method: Callable, method_name: Optional[str] = No
         self.method_annotations = method.__annotations__
         # TODO: Validate the signature contains only pydantic models.
         self.method_signature = inspect.signature(method)
+
         if not self.attached_to_flow:
             self.component_name = method.__name__
             self.method = method
@@ -43,10 +117,16 @@ def __init__(self, route: str, method: Callable, method_name: Optional[str] = No
         self.timeout = timeout
         self.kwargs = kwargs
 
+        # Enable the users to rely on FastAPI annotation typing with Request.
+        # Note: Only a part of the Request functionalities are supported.
+        self._patch_fast_api_request()
+
     def add_route(self, app: FastAPI, request_queue: Queue, responses_store: Dict[str, Any]) -> None:
         # 1: Get the route associated with the http method.
         route = getattr(app, self.__class__.__name__.lower())
 
+        self._unpatch_fast_api_request()
+
         # 2: Create a proxy function with the signature of the wrapped method.
         fn = deepcopy(_signature_proxy_function)
         fn.__annotations__ = self.method_annotations
@@ -69,6 +149,11 @@ async def _handle_request(*args, **kwargs):
             @wraps(_signature_proxy_function)
             async def _handle_request(*args, **kwargs):
                 async def fn(*args, **kwargs):
+                    args, kwargs = apply_to_collection((args, kwargs), Request, _mock_fastapi_request)
+                    for k, v in kwargs.items():
+                        if hasattr(v, "__await__"):
+                            kwargs[k] = await v
+
                     request_id = str(uuid4()).split("-")[0]
                     logger.debug(f"Processing request {request_id} for route: {self.route}")
                     request_queue.put(
@@ -85,7 +170,10 @@ async def fn(*args, **kwargs):
                 while request_id not in responses_store:
                     await asyncio.sleep(0.01)
                     if (time.time() - t0) > self.timeout:
-                        raise Exception("The response was never received.")
+                        raise HTTPException(
+                            status.HTTP_500_INTERNAL_SERVER_ERROR,
+                            detail="The response was never received.",
+                        )
 
                 logger.debug(f"Processed request {request_id} for route: {self.route}")
 
@@ -101,6 +189,26 @@ async def fn(*args, **kwargs):
         # 4: Register the user provided route to the Rest API.
         route(self.route, **self.kwargs)(_handle_request)
 
+    def _patch_fast_api_request(self):
+        """This function replaces signature annotation for Request with its mock."""
+        for k, v in self.method_annotations.items():
+            if v == Request:
+                self.method_annotations[k] = _FastApiMockRequest
+
+        for v in self.method_signature.parameters.values():
+            if v._annotation == Request:
+                v._annotation = _FastApiMockRequest
+
+    def _unpatch_fast_api_request(self):
+        """This function replaces back signature annotation to fastapi Request."""
+        for k, v in self.method_annotations.items():
+            if v == _FastApiMockRequest:
+                self.method_annotations[k] = Request
+
+        for v in self.method_signature.parameters.values():
+            if v._annotation == _FastApiMockRequest:
+                v._annotation = Request
+
 
 class Post(_HttpMethod):
     pass
diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py
index ab6cd9f78e6ea..9bf76c62bbcdd 100644
--- a/src/lightning_app/cli/lightning_cli.py
+++ b/src/lightning_app/cli/lightning_cli.py
@@ -358,8 +358,8 @@ def run_app(
     )
 
 
-if RequirementCache("lightning-fabric"):
-    # lightning-fabric may not be available when installing only standalone lightning-app package
+if RequirementCache("lightning-fabric>=1.9.0.dev0") or RequirementCache("lightning>=1.9.0.dev0"):
+    # lightning.fabric.cli may not be available when installing only standalone lightning-app package
     from lightning_fabric.cli import _run_model
 
     run.add_command(_run_model)
diff --git a/src/lightning_app/components/__init__.py b/src/lightning_app/components/__init__.py
index ca47c36071dae..5fd8af6b055de 100644
--- a/src/lightning_app/components/__init__.py
+++ b/src/lightning_app/components/__init__.py
@@ -1,4 +1,3 @@
-from lightning_app.components.auto_scaler import AutoScaler
 from lightning_app.components.database.client import DatabaseClient
 from lightning_app.components.database.server import Database
 from lightning_app.components.multi_node import (
@@ -9,8 +8,9 @@ from lightning_app.components.multi_node import (
 )
 from lightning_app.components.python.popen import PopenPythonScript
 from lightning_app.components.python.tracer import Code, TracerPythonScript
+from lightning_app.components.serve.auto_scaler import AutoScaler
 from lightning_app.components.serve.gradio import ServeGradio
-from lightning_app.components.serve.python_server import Image, Number, PythonServer
+from lightning_app.components.serve.python_server import Category, Image, Number, PythonServer, Text
 from lightning_app.components.serve.serve import ModelInferenceAPI
 from lightning_app.components.serve.streamlit import ServeStreamlit
 from lightning_app.components.training import LightningTrainerScript, PyTorchLightningScriptRunner
@@ -28,6 +28,8 @@
     "PythonServer",
     "Image",
     "Number",
+    "Category",
+    "Text",
     "MultiNode",
     "LiteMultiNode",
     "LightningTrainerScript",
diff --git a/src/lightning_app/components/multi_node/base.py b/src/lightning_app/components/multi_node/base.py
index 5662442b7375a..ac99abecff028 100644
--- a/src/lightning_app/components/multi_node/base.py
+++ b/src/lightning_app/components/multi_node/base.py
@@ -56,12 +56,12 @@ def run(
         """
         super().__init__()
         if num_nodes > 1 and not is_running_in_cloud():
-            num_nodes = 1
             warnings.warn(
                 f"You set {type(self).__name__}(num_nodes={num_nodes}, ...)` but this app is running locally."
                 " We assume you are debugging and will ignore the `num_nodes` argument."
                 " To run on multiple nodes in the cloud, launch your app with `--cloud`."
             )
+            num_nodes = 1
         self.ws = structures.List(
             *[
                 work_cls(
diff --git a/src/lightning_app/components/serve/__init__.py b/src/lightning_app/components/serve/__init__.py
index cb46a71bf9ea5..ac02e69c4f2ab 100644
--- a/src/lightning_app/components/serve/__init__.py
+++ b/src/lightning_app/components/serve/__init__.py
@@ -1,5 +1,6 @@
+from lightning_app.components.serve.auto_scaler import AutoScaler
 from lightning_app.components.serve.gradio import ServeGradio
-from lightning_app.components.serve.python_server import Image, Number, PythonServer
+from lightning_app.components.serve.python_server import Category, Image, Number, PythonServer, Text
 from lightning_app.components.serve.streamlit import ServeStreamlit
 
-__all__ = ["ServeGradio", "ServeStreamlit", "PythonServer", "Image", "Number"]
+__all__ = ["ServeGradio", "ServeStreamlit", "PythonServer", "Image", "Number", "Category", "Text", "AutoScaler"]
diff --git a/src/lightning_app/components/auto_scaler.py b/src/lightning_app/components/serve/auto_scaler.py
similarity index 78%
rename from src/lightning_app/components/auto_scaler.py
rename to src/lightning_app/components/serve/auto_scaler.py
index 13948ba50af89..2493f63048e60 100644
--- a/src/lightning_app/components/auto_scaler.py
+++ b/src/lightning_app/components/serve/auto_scaler.py
@@ -6,7 +6,7 @@
 import uuid
 from base64 import b64encode
 from itertools import cycle
-from typing import Any, Dict, List, Tuple, Type
+from typing import Any, Dict, List, Optional, Tuple, Type
 
 import requests
 import uvicorn
@@ -15,11 +15,13 @@
 from fastapi.responses import RedirectResponse
 from fastapi.security import HTTPBasic, HTTPBasicCredentials
 from pydantic import BaseModel
+from starlette.staticfiles import StaticFiles
 from starlette.status import HTTP_401_UNAUTHORIZED
 
 from lightning_app.core.flow import LightningFlow
 from lightning_app.core.work import LightningWork
 from lightning_app.utilities.app_helpers import Logger
+from lightning_app.utilities.cloud import is_running_in_cloud
 from lightning_app.utilities.imports import _is_aiohttp_available, requires
 from lightning_app.utilities.packaging.cloud_compute import CloudCompute
@@ -114,20 +116,21 @@ class _LoadBalancer(LightningWork):
             requests to be batched. In any case, requests are processed as soon as `max_batch_size` is reached.
         timeout_keep_alive: The number of seconds until it closes Keep-Alive connections if no new data is received.
         timeout_inference_request: The number of seconds to wait for inference.
-        \**kwargs: Arguments passed to :func:`LightningWork.init` like ``CloudCompute``, ``BuildConfig``, etc.
+        **kwargs: Arguments passed to :func:`LightningWork.init` like ``CloudCompute``, ``BuildConfig``, etc.
     """
 
     @requires(["aiohttp"])
     def __init__(
         self,
-        input_type: BaseModel,
-        output_type: BaseModel,
+        input_type: Type[BaseModel],
+        output_type: Type[BaseModel],
         endpoint: str,
         max_batch_size: int = 8,
         # all timeout args are in seconds
-        timeout_batching: int = 1,
+        timeout_batching: float = 1,
         timeout_keep_alive: int = 60,
         timeout_inference_request: int = 60,
+        work_name: Optional[str] = "API",  # used for displaying the name in the UI
         **kwargs: Any,
     ) -> None:
         super().__init__(cloud_compute=CloudCompute("default"), **kwargs)
@@ -142,6 +145,7 @@ def __init__(
         self._batch = []
         self._responses = {}  # {request_id: response}
         self._last_batch_sent = 0
+        self._work_name = work_name
 
         if not endpoint.startswith("/"):
             endpoint = "/" + endpoint
@@ -220,6 +224,8 @@ def run(self):
         security = HTTPBasic()
         fastapi_app.SEND_TASK = None
 
+        input_type = self._input_type
+
         @fastapi_app.middleware("http")
         async def current_request_counter(request: Request, call_next):
             if not request.scope["path"] == self.endpoint:
@@ -277,9 +283,17 @@ async def update_servers(servers: List[str], authenticated: bool = Depends(authe
                 self._iter = cycle(self.servers)
 
         @fastapi_app.post(self.endpoint, response_model=self._output_type)
-        async def balance_api(inputs: self._input_type):
+        async def balance_api(inputs: input_type):
             return await self.process_request(inputs)
 
+        endpoint_info_page = self._get_endpoint_info_page()
+        if endpoint_info_page:
+            fastapi_app.mount(
+                "/endpoint-info", StaticFiles(directory=endpoint_info_page.serve_dir, html=True), name="static"
+            )
+
+        logger.info(f"Your load balancer has started. The endpoint is 'http://{self.host}:{self.port}{self.endpoint}'")
+
         uvicorn.run(
             fastapi_app,
             host=self.host,
@@ -332,6 +346,60 @@ def send_request_to_update_servers(self, servers: List[str]):
         response = requests.put(f"{self.url}/system/update-servers", json=servers, headers=headers, timeout=10)
         response.raise_for_status()
 
+    @staticmethod
+    def _get_sample_dict_from_datatype(datatype: Any) -> dict:
+        if not hasattr(datatype, "schema"):
+            # not a pydantic model
+            raise TypeError(f"datatype must be a pydantic model for the UI to be generated, but got {datatype}")
+
+        if hasattr(datatype, "_get_sample_data"):
+            return datatype._get_sample_data()
+
+        datatype_props = datatype.schema()["properties"]
+        out: Dict[str, Any] = {}
+        lut = {"string": "data string", "number": 0.0, "integer": 0, "boolean": False}
+        for k, v in datatype_props.items():
+            if v["type"] not in lut:
+                raise TypeError("Unsupported type")
+            out[k] = lut[v["type"]]
+        return out
+
+    def get_code_sample(self, url: str) -> Optional[str]:
+        input_type: Any = self._input_type
+        output_type: Any = self._output_type
+
+        if not (hasattr(input_type, "request_code_sample") and hasattr(output_type, "response_code_sample")):
+            return None
+        return f"{input_type.request_code_sample(url)}\n{output_type.response_code_sample()}"
+
+    def _get_endpoint_info_page(self) -> Optional["APIAccessFrontend"]:  # noqa: F821
+        try:
+            from lightning_api_access import APIAccessFrontend
+        except ModuleNotFoundError:
+            logger.warn("APIAccessFrontend not found. Please install lightning-api-access to enable the UI")
+            return
+
+        if is_running_in_cloud():
+            url = f"{self._future_url}{self.endpoint}"
+        else:
+            url = f"http://localhost:{self.port}{self.endpoint}"
+
+        frontend_objects = {"name": self._work_name, "url": url, "method": "POST", "request": None, "response": None}
+        code_samples = self.get_code_sample(url)
+        if code_samples:
+            frontend_objects["code_samples"] = code_samples
+            # TODO also set request/response for JS UI
+        else:
+            try:
+                request = self._get_sample_dict_from_datatype(self._input_type)
+                response = self._get_sample_dict_from_datatype(self._output_type)
+            except TypeError:
+                return None
+            else:
+                frontend_objects["request"] = request
+                frontend_objects["response"] = response
+        return APIAccessFrontend(apis=[frontend_objects])
+
 
 class AutoScaler(LightningFlow):
     """The ``AutoScaler`` can be used to automatically change the number of replicas of the given server in
@@ -341,7 +409,8 @@ class AutoScaler(LightningFlow):
     Args:
         min_replicas: The number of works to start when app initializes.
         max_replicas: The max number of works to spawn to handle the incoming requests.
-        autoscale_interval: The number of seconds to wait before checking whether to upscale or downscale the works.
+        scale_out_interval: The number of seconds to wait before checking whether to increase the number of servers.
+        scale_in_interval: The number of seconds to wait before checking whether to decrease the number of servers.
         endpoint: Provide the REST API path.
         max_batch_size: (auto-batching) The number of requests to process at once.
         timeout_batching: (auto-batching) The number of seconds to wait before sending the requests to process.
@@ -358,7 +427,8 @@ class AutoScaler(LightningFlow):
             MyPythonServer,
             min_replicas=1,
             max_replicas=8,
-            autoscale_interval=10,
+            scale_out_interval=10,
+            scale_in_interval=10,
         )
     )
 
@@ -387,7 +457,8 @@ def scale(self, replicas: int, metrics: dict) -> int:
             MyPythonServer,
             min_replicas=1,
             max_replicas=8,
-            autoscale_interval=10,
+            scale_out_interval=10,
+            scale_in_interval=10,
             max_batch_size=8,  # for auto batching
             timeout_batching=1,  # for auto batching
         )
@@ -399,12 +470,13 @@ def __init__(
         self,
         work_cls: Type[LightningWork],
         min_replicas: int = 1,
         max_replicas: int = 4,
-        autoscale_interval: int = 10,
+        scale_out_interval: int = 10,
+        scale_in_interval: int = 10,
         max_batch_size: int = 8,
         timeout_batching: float = 1,
         endpoint: str = "api/predict",
-        input_type: BaseModel = Dict,
-        output_type: BaseModel = Dict,
+        input_type: Type[BaseModel] = Dict,
+        output_type: Type[BaseModel] = Dict,
         *work_args: Any,
         **work_kwargs: Any,
     ) -> None:
@@ -418,7 +490,8 @@ def __init__(
         self._input_type = input_type
         self._output_type = output_type
-        self.autoscale_interval = autoscale_interval
+        self.scale_out_interval = scale_out_interval
+        self.scale_in_interval = scale_in_interval
         self.max_batch_size = max_batch_size
 
         if max_replicas < min_replicas:
@@ -438,6 +511,7 @@ def __init__(
             timeout_batching=timeout_batching,
             cache_calls=True,
             parallel=True,
+            work_name=self._work_cls.__name__,
         )
         for _ in range(min_replicas):
             work = self.create_work()
@@ -511,9 +585,13 @@ def scale(self, replicas: int, metrics: dict) -> int:
             The target number of running works. The value will be adjusted after this method runs
             so that it satisfies ``min_replicas<=replicas<=max_replicas``.
""" - pending_requests_per_running_or_pending_work = metrics["pending_requests"] / ( - replicas + metrics["pending_works"] - ) + pending_requests = metrics["pending_requests"] + active_or_pending_works = replicas + metrics["pending_works"] + + if active_or_pending_works == 0: + return 1 if pending_requests > 0 else 0 + + pending_requests_per_running_or_pending_work = pending_requests / active_or_pending_works # scale out if the number of pending requests exceeds max batch size. max_requests_per_work = self.max_batch_size @@ -539,11 +617,6 @@ def num_pending_works(self) -> int: def autoscale(self) -> None: """Adjust the number of works based on the target number returned by ``self.scale``.""" - if time.time() - self._last_autoscale < self.autoscale_interval: - return - - self.load_balancer.update_servers(self.workers) - metrics = { "pending_requests": self.num_pending_requests, "pending_works": self.num_pending_works, @@ -555,24 +628,33 @@ def autoscale(self) -> None: min(self.max_replicas, self.scale(self.num_replicas, metrics)), ) - # upscale - num_workers_to_add = num_target_workers - self.num_replicas - for _ in range(num_workers_to_add): - logger.info(f"Upscaling from {self.num_replicas} to {self.num_replicas + 1}") - work = self.create_work() - new_work_id = self.add_work(work) - logger.info(f"Work created: '{new_work_id}'") - - # downscale - num_workers_to_remove = self.num_replicas - num_target_workers - for _ in range(num_workers_to_remove): - logger.info(f"Downscaling from {self.num_replicas} to {self.num_replicas - 1}") - removed_work_id = self.remove_work(self.num_replicas - 1) - logger.info(f"Work removed: '{removed_work_id}'") + # scale-out + if time.time() - self._last_autoscale > self.scale_out_interval: + num_workers_to_add = num_target_workers - self.num_replicas + for _ in range(num_workers_to_add): + logger.info(f"Scaling out from {self.num_replicas} to {self.num_replicas + 1}") + work = self.create_work() + # TODO: move works into structures + new_work_id = self.add_work(work) + logger.info(f"Work created: '{new_work_id}'") + if num_workers_to_add > 0: + self._last_autoscale = time.time() + + # scale-in + if time.time() - self._last_autoscale > self.scale_in_interval: + num_workers_to_remove = self.num_replicas - num_target_workers + for _ in range(num_workers_to_remove): + logger.info(f"Scaling in from {self.num_replicas} to {self.num_replicas - 1}") + removed_work_id = self.remove_work(self.num_replicas - 1) + logger.info(f"Work removed: '{removed_work_id}'") + if num_workers_to_remove > 0: + self._last_autoscale = time.time() self.load_balancer.update_servers(self.workers) - self._last_autoscale = time.time() def configure_layout(self): - tabs = [{"name": "Swagger", "content": self.load_balancer.url}] + tabs = [ + {"name": "Endpoint Info", "content": f"{self.load_balancer}/endpoint-info"}, + {"name": "Swagger", "content": self.load_balancer.url}, + ] return tabs diff --git a/src/lightning_app/components/serve/python_server.py b/src/lightning_app/components/serve/python_server.py index 40b7e83a3bdca..ee958b30625fd 100644 --- a/src/lightning_app/components/serve/python_server.py +++ b/src/lightning_app/components/serve/python_server.py @@ -2,9 +2,9 @@ import base64 import os import platform -from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, TYPE_CHECKING +import requests import uvicorn from fastapi import FastAPI from lightning_utilities.core.imports import compare_version, module_available @@ -14,6 +14,9 @@ from 
lightning_app.utilities.app_helpers import Logger from lightning_app.utilities.imports import _is_torch_available, requires +if TYPE_CHECKING: + from lightning_app.frontend.frontend import Frontend + logger = Logger(__name__) # Skip doctests if requirements aren't available @@ -48,18 +51,80 @@ class Image(BaseModel): image: Optional[str] @staticmethod - def _get_sample_data() -> Dict[Any, Any]: - imagepath = Path(__file__).parent / "catimage.png" - with open(imagepath, "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return {"image": encoded_string.decode("UTF-8")} + def get_sample_data() -> Dict[Any, Any]: + url = "https://raw.githubusercontent.com/Lightning-AI/LAI-Triton-Server-Component/main/catimage.png" + img = requests.get(url).content + img = base64.b64encode(img).decode("UTF-8") + return {"image": img} + + @staticmethod + def request_code_sample(url: str) -> str: + return ( + """import base64 +from pathlib import Path +import requests + +imgurl = "https://raw.githubusercontent.com/Lightning-AI/LAI-Triton-Server-Component/main/catimage.png" +img = requests.get(imgurl).content +img = base64.b64encode(img).decode("UTF-8") +response = requests.post('""" + + url + + """', json={ + "image": img +})""" + ) + + @staticmethod + def response_code_sample() -> str: + return """img = response.json()["image"] +img = base64.b64decode(img.encode("utf-8")) +Path("response.png").write_bytes(img) +""" + + +class Category(BaseModel): + category: Optional[int] + + @staticmethod + def get_sample_data() -> Dict[Any, Any]: + return {"prediction": 463} + + @staticmethod + def response_code_sample() -> str: + return """print("Predicted category is: ", response.json()["category"]) +""" + + +class Text(BaseModel): + text: Optional[str] + + @staticmethod + def get_sample_data() -> Dict[Any, Any]: + return {"text": "A portrait of a person looking away from the camera"} + + @staticmethod + def request_code_sample(url: str) -> str: + return ( + """import base64 +from pathlib import Path +import requests + +response = requests.post('""" + + url + + """', json={ + "text": "A portrait of a person looking away from the camera" +}) +""" + ) class Number(BaseModel): + # deprecated + # TODO remove this in favour of Category prediction: Optional[int] @staticmethod - def _get_sample_data() -> Dict[Any, Any]: + def get_sample_data() -> Dict[Any, Any]: return {"prediction": 463} @@ -154,8 +219,8 @@ def predict(self, request: Any) -> Any: @staticmethod def _get_sample_dict_from_datatype(datatype: Any) -> dict: - if hasattr(datatype, "_get_sample_data"): - return datatype._get_sample_data() + if hasattr(datatype, "get_sample_data"): + return datatype.get_sample_data() datatype_props = datatype.schema()["properties"] out: Dict[str, Any] = {} @@ -187,7 +252,15 @@ def predict_fn(request: input_type): # type: ignore fastapi_app.post("/predict", response_model=output_type)(predict_fn) - def configure_layout(self) -> None: + def get_code_sample(self, url: str) -> Optional[str]: + input_type: Any = self.configure_input_type() + output_type: Any = self.configure_output_type() + + if not (hasattr(input_type, "request_code_sample") and hasattr(output_type, "response_code_sample")): + return None + return f"{input_type.request_code_sample(url)}\n{output_type.response_code_sample()}" + + def configure_layout(self) -> Optional["Frontend"]: try: from lightning_api_access import APIAccessFrontend except ModuleNotFoundError: @@ -203,17 +276,19 @@ def configure_layout(self) -> None: except TypeError: return None - 
return APIAccessFrontend( - apis=[ - { - "name": class_name, - "url": url, - "method": "POST", - "request": request, - "response": response, - } - ] - ) + frontend_payload = { + "name": class_name, + "url": url, + "method": "POST", + "request": request, + "response": response, + } + + code_sample = self.get_code_sample(url) + if code_sample: + frontend_payload["code_sample"] = code_sample + + return APIAccessFrontend(apis=[frontend_payload]) def run(self, *args: Any, **kwargs: Any) -> Any: """Run method takes care of configuring and setting up a FastAPI server behind the scenes. diff --git a/src/lightning_app/components/serve/streamlit.py b/src/lightning_app/components/serve/streamlit.py index 9b943a1708fa3..720139f93f25e 100644 --- a/src/lightning_app/components/serve/streamlit.py +++ b/src/lightning_app/components/serve/streamlit.py @@ -20,6 +20,8 @@ class ServeStreamlit(LightningWork, abc.ABC): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.ready = False + self._process = None @property @@ -58,6 +60,7 @@ def run(self) -> None: ], env=env, ) + self.ready = True self._process.wait() def on_exit(self) -> None: diff --git a/src/lightning_app/core/api.py b/src/lightning_app/core/api.py index e6f7b6ad0024c..4a439fa87bd82 100644 --- a/src/lightning_app/core/api.py +++ b/src/lightning_app/core/api.py @@ -34,6 +34,7 @@ from lightning_app.core.queues import QueuingSystem from lightning_app.storage import Drive from lightning_app.utilities.app_helpers import InMemoryStateStore, Logger, StateStore +from lightning_app.utilities.app_status import AppStatus from lightning_app.utilities.cloud import is_running_in_cloud from lightning_app.utilities.component import _context from lightning_app.utilities.enum import ComponentContext, OpenAPITags @@ -66,18 +67,24 @@ class SessionMiddleware: lock = Lock() app_spec: Optional[List] = None +app_status: Optional[AppStatus] = None + # In the future, this would be abstracted to support horizontal scaling. responses_store = {} logger = Logger(__name__) - # This can be replaced with a consumer that publishes states in a kv-store # in a serverless architecture class UIRefresher(Thread): - def __init__(self, api_publish_state_queue, api_response_queue, refresh_interval: float = 0.1) -> None: + def __init__( + self, + api_publish_state_queue, + api_response_queue, + refresh_interval: float = 0.1, + ) -> None: super().__init__(daemon=True) self.api_publish_state_queue = api_publish_state_queue self.api_response_queue = api_response_queue @@ -98,7 +105,8 @@ def run(self): def run_once(self): try: - state = self.api_publish_state_queue.get(timeout=0) + global app_status + state, app_status = self.api_publish_state_queue.get(timeout=0) with lock: global_app_state_store.set_app_state(TEST_SESSION_UUID, state) except queue.Empty: @@ -326,6 +334,17 @@ async def upload_file(response: Response, filename: str, uploaded_file: UploadFi return f"Successfully uploaded '{filename}' to the Drive" +@fastapi_service.get("/api/v1/status", response_model=AppStatus) +async def get_status() -> AppStatus: + """Get the current status of the app and works.""" + global app_status + if app_status is None: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="App status hasn't been reported yet." 
+ ) + return app_status + + @fastapi_service.get("/healthz", status_code=200) async def healthz(response: Response): """Health check endpoint used in the cloud FastAPI servers to check the status periodically.""" diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index 9c3aeeb650de0..dfce2097ec5a0 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -35,6 +35,7 @@ _should_dispatch_app, Logger, ) +from lightning_app.utilities.app_status import AppStatus from lightning_app.utilities.commands.base import _process_requests from lightning_app.utilities.component import _convert_paths_after_init, _validate_root_flow from lightning_app.utilities.enum import AppStage, CacheCallsKeys @@ -140,6 +141,7 @@ def __init__( self.exception = None self.collect_changes: bool = True + self.status: Optional[AppStatus] = None # TODO: Enable ready locally for opening the UI. self.ready = False @@ -150,6 +152,7 @@ def __init__( self.checkpointing: bool = False self._update_layout() + self._update_status() self.is_headless: Optional[bool] = None @@ -418,6 +421,7 @@ def run_once(self): self._update_layout() self._update_is_headless() + self._update_status() self.maybe_apply_changes() if self.checkpointing and self._should_snapshot(): @@ -485,19 +489,12 @@ def _run(self) -> bool: self._original_state = deepcopy(self.state) done = False - # TODO: Re-enable the `ready` property once issues are resolved - if not self.root.ready: - warnings.warn( - "One of your Flows returned `.ready` as `False`. " - "This feature is not yet enabled so this will be ignored.", - UserWarning, - ) - self.ready = True + self.ready = self.root.ready self._start_with_flow_works() - if self.ready and self.should_publish_changes_to_api and self.api_publish_state_queue: - self.api_publish_state_queue.put(self.state_vars) + if self.should_publish_changes_to_api and self.api_publish_state_queue is not None: + self.api_publish_state_queue.put((self.state_vars, self.status)) self._reset_run_time_monitor() @@ -506,8 +503,8 @@ def _run(self) -> bool: self._update_run_time_monitor() - if self.ready and self._has_updated and self.should_publish_changes_to_api and self.api_publish_state_queue: - self.api_publish_state_queue.put(self.state_vars) + if self._has_updated and self.should_publish_changes_to_api and self.api_publish_state_queue is not None: + self.api_publish_state_queue.put((self.state_vars, self.status)) self._has_updated = False @@ -532,6 +529,23 @@ def _update_is_headless(self) -> None: # This ensures support for apps which dynamically add a UI at runtime. _handle_is_headless(self) + def _update_status(self) -> None: + old_status = self.status + + work_statuses = {} + for work in breadth_first(self.root, types=(lightning_app.LightningWork,)): + work_statuses[work.name] = work.status + + self.status = AppStatus( + is_ui_ready=self.ready, + work_statuses=work_statuses, + ) + + # If the work statuses changed, the state delta will trigger an update. + # If ready has changed, we trigger an update manually. + if self.status != old_status: + self._has_updated = True + def _apply_restarting(self) -> bool: self._reset_original_state() # apply stage after restoring the original state. 
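Taken together, the changes to `core/api.py` and `core/app.py` above introduce a status-reporting path: the app now publishes `(state, status)` tuples to `api_publish_state_queue`, the `UIRefresher` keeps the latest `AppStatus`, and clients can read it back through the new `GET /api/v1/status` endpoint. Below is a minimal client-side sketch of how that endpoint could be polled. It is not part of this diff, and the base URL and port are assumptions; adjust them to wherever the app's FastAPI server actually listens.

    # Hypothetical polling client for the new /api/v1/status endpoint (illustrative only).
    import time

    import requests

    BASE_URL = "http://127.0.0.1:7501"  # assumed local app server address; adjust as needed

    while True:
        resp = requests.get(f"{BASE_URL}/api/v1/status")
        if resp.status_code == 503:
            # The server has not yet received a (state, status) tuple from the queue.
            print("App status hasn't been reported yet, retrying...")
            time.sleep(1)
            continue
        app_status = resp.json()  # shaped like the AppStatus model: is_ui_ready + work_statuses
        print("UI ready:", app_status["is_ui_ready"])
        for name, work_status in app_status["work_statuses"].items():
            print(f"  {name}: stage={work_status['stage']} (count={work_status['count']})")
        break
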
diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py index 302ba344320d1..5987425713489 100644 --- a/src/lightning_app/core/flow.py +++ b/src/lightning_app/core/flow.py @@ -10,13 +10,7 @@ from lightning_app.frontend import Frontend from lightning_app.storage import Path from lightning_app.storage.drive import _maybe_create_drive, Drive -from lightning_app.utilities.app_helpers import ( - _is_json_serializable, - _lightning_dispatched, - _LightningAppRef, - _set_child_name, - is_overridden, -) +from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef, _set_child_name, is_overridden from lightning_app.utilities.component import _sanitize_state from lightning_app.utilities.exceptions import ExitAppException from lightning_app.utilities.introspection import _is_init_context, _is_run_context @@ -255,10 +249,7 @@ def __getattr__(self, item): @property def ready(self) -> bool: - """Not currently enabled. - - Override to customize when your App should be ready. - """ + """Override to customize when your App should be ready.""" flows = self.flows return all(flow.ready for flow in flows.values()) if flows else True @@ -325,7 +316,7 @@ def lightningignore(self) -> Tuple[str, ...]: @lightningignore.setter def lightningignore(self, lightningignore: Tuple[str, ...]) -> None: - if _lightning_dispatched(): + if self._backend is not None: raise RuntimeError( f"Your app has been already dispatched, so modifying the `{self.name}.lightningignore` does not have an" " effect" diff --git a/src/lightning_app/core/work.py b/src/lightning_app/core/work.py index 43ffc0006d5ea..863d50db47cec 100644 --- a/src/lightning_app/core/work.py +++ b/src/lightning_app/core/work.py @@ -11,19 +11,14 @@ from lightning_app.storage import Path from lightning_app.storage.drive import _maybe_create_drive, Drive from lightning_app.storage.payload import Payload -from lightning_app.utilities.app_helpers import ( - _is_json_serializable, - _lightning_dispatched, - _LightningAppRef, - is_overridden, -) +from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef, is_overridden +from lightning_app.utilities.app_status import WorkStatus from lightning_app.utilities.component import _is_flow_context, _sanitize_state from lightning_app.utilities.enum import ( CacheCallsKeys, make_status, WorkFailureReasons, WorkStageStatus, - WorkStatus, WorkStopReasons, ) from lightning_app.utilities.exceptions import LightningWorkException @@ -56,7 +51,7 @@ class LightningWork: _run_executor_cls: Type[WorkRunExecutor] = WorkRunExecutor # TODO: Move to spawn for all Operating System. - _start_method = "spawn" if sys.platform == "win32" else "fork" + _start_method = "spawn" if sys.platform in ("darwin", "win32") else "fork" def __init__( self, @@ -124,7 +119,16 @@ def __init__( " in the next version. Use `cache_calls` instead." 
            )
         self._cache_calls = run_once if run_once is not None else cache_calls
-        self._state = {"_host", "_port", "_url", "_future_url", "_internal_ip", "_restarting", "_cloud_compute"}
+        self._state = {
+            "_host",
+            "_port",
+            "_url",
+            "_future_url",
+            "_internal_ip",
+            "_restarting",
+            "_cloud_compute",
+            "_display_name",
+        }
         self._parallel = parallel
         self._host: str = host
         self._port: Optional[int] = port
@@ -134,6 +138,7 @@
         # setattr_replacement is used by the multiprocessing runtime to send the latest changes to the main coordinator
         self._setattr_replacement: Optional[Callable[[str, Any], None]] = None
         self._name = ""
+        self._display_name = ""
         # The ``self._calls`` is used to track whether the run
         # method with a given set of input arguments has already been called.
         # Example of its usage:
@@ -212,6 +217,22 @@ def name(self):
         """Returns the name of the LightningWork."""
         return self._name
 
+    @property
+    def display_name(self):
+        """Returns the display name of the LightningWork in the cloud.
+
+        The display name needs to be set before the run method of the work is called.
+        """
+        return self._display_name
+
+    @display_name.setter
+    def display_name(self, display_name: str):
+        """Sets the display name of the LightningWork in the cloud."""
+        if not self.has_started:
+            self._display_name = display_name
+        elif self._display_name != display_name:
+            raise RuntimeError("The display name can be set only before the work has started.")
+
     @property
     def cache_calls(self) -> bool:
         """Returns whether the ``run`` method should cache its input arguments and not run again when provided with
@@ -267,7 +288,7 @@ def lightningignore(self) -> Tuple[str, ...]:
 
     @lightningignore.setter
     def lightningignore(self, lightningignore: Tuple[str, ...]) -> None:
-        if _lightning_dispatched():
+        if self._backend is not None:
             raise RuntimeError(
                 f"Your app has been already dispatched, so modifying the `{self.name}.lightningignore` does not have an"
                 " effect"
@@ -609,12 +630,12 @@ def on_exit(self):
         pass
 
     def stop(self):
-        """Stops LightingWork component and shuts down hardware provisioned via L.CloudCompute."""
+        """Stops the LightningWork component and shuts down the hardware provisioned via L.CloudCompute.
+
+        This can only be called from a ``LightningFlow``.
+        """
         if not self._backend:
-            raise Exception(
-                "Can't stop the work, it looks like it isn't attached to a LightningFlow. "
-                "Make sure to assign the Work to a flow instance."
-            )
+            raise RuntimeError(f"Only the `LightningFlow` can request this work ({self.name!r}) to stop.")
         if self.status.stage == WorkStageStatus.STOPPED:
             return
         latest_hash = self._calls[CacheCallsKeys.LATEST_CALL_HASH]
@@ -623,6 +644,19 @@ def stop(self):
         app = _LightningAppRef().get_current()
         self._backend.stop_work(app, self)
 
+    def delete(self):
+        """Deletes the LightningWork component and shuts down the hardware provisioned via L.CloudCompute.
+
+        Locally, ``work.delete()`` behaves like ``work.stop()``.
+        """
+        if not self._backend:
+            raise Exception(
+                "Can't delete the work, it looks like it isn't attached to a LightningFlow. "
+                "Make sure to assign the Work to a flow instance."
+            )
+        app = _LightningAppRef().get_current()
+        self._backend.delete_work(app, self)
+
     def _check_run_is_implemented(self) -> None:
         if not is_overridden("run", instance=self, parent=LightningWork):
             raise TypeError(
diff --git a/src/lightning_app/runners/backends/mp_process.py b/src/lightning_app/runners/backends/mp_process.py
index dc0681390046e..36f3cb8097604 100644
--- a/src/lightning_app/runners/backends/mp_process.py
+++ b/src/lightning_app/runners/backends/mp_process.py
@@ -88,6 +88,9 @@ def stop_work(self, app, work: "lightning_app.LightningWork") -> None:
         work_manager: MultiProcessWorkManager = app.processes[work.name]
         work_manager.kill()
 
+    def delete_work(self, app, work: "lightning_app.LightningWork") -> None:
+        self.stop_work(app, work)
+
 
 class CloudMultiProcessingBackend(MultiProcessingBackend):
     def __init__(self, *args, **kwargs):
@@ -108,3 +111,6 @@ def stop_work(self, app, work: "lightning_app.LightningWork") -> None:
         disable_port(work._port)
         self.ports = [port for port in self.ports if port != work._port]
         return super().stop_work(app, work)
+
+    def delete_work(self, app, work: "lightning_app.LightningWork") -> None:
+        self.stop_work(app, work)
diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py
index ab5ae29c092a5..265a47919b870 100644
--- a/src/lightning_app/runners/cloud.py
+++ b/src/lightning_app/runners/cloud.py
@@ -1,5 +1,6 @@
 import fnmatch
 import json
+import os
 import random
 import re
 import string
@@ -230,6 +231,14 @@ def dispatch(
         else:
             ignore_functions = None
 
+        # Create a default dotignore if it doesn't exist
+        if not (root / DOT_IGNORE_FILENAME).is_file():
+            with open(root / DOT_IGNORE_FILENAME, "w") as f:
+                f.write("venv/\n")
+                if (root / "bin" / "activate").is_file() or (root / "pyvenv.cfg").is_file():
+                    # the user is developing inside a venv
+                    f.write("bin/\ninclude/\nlib/\npyvenv.cfg\n")
+
         repo = LocalSourceCodeDir(path=root, ignore_functions=ignore_functions)
         self._check_uploaded_folder(root, repo)
         requirements_file = root / "requirements.txt"
@@ -312,52 +321,58 @@ def dispatch(
         self._ensure_cluster_project_binding(project.project_id, cluster_id)
 
         # Resolve the app name, instance, and cluster ID
+        existing_app = None
         existing_instance = None
         app_name = app_config.name
 
-        # List existing instances
+        # List existing apps
         # TODO: Add pagination, otherwise this could break if users have a lot of apps.
-        find_instances_resp = self.backend.client.lightningapp_instance_service_list_lightningapp_instances(
+        all_apps = self.backend.client.lightningapp_v2_service_list_lightningapps_v2(
             project_id=project.project_id
-        )
+        ).lightningapps
 
-        # Seach for instances with the given name (possibly with some random characters appended)
+        # Search for apps with the given name (possibly with some random characters appended)
         pattern = re.escape(f"{app_name}-") + ".{4}"
-        instances = [
+        all_apps = [
             lightningapp
-            for lightningapp in find_instances_resp.lightningapps
+            for lightningapp in all_apps
             if lightningapp.name == app_name or (re.fullmatch(pattern, lightningapp.name) is not None)
         ]
 
-        # If instances exist and cluster is None, mimic cluster selection logic to choose a default
-        if cluster_id is None and len(instances) > 0:
+        # If apps exist and cluster is None, mimic cluster selection logic to choose a default
+        if cluster_id is None and len(all_apps) > 0:
             # Determine the cluster ID
             cluster_id = self._get_default_cluster(project.project_id)
 
         # If an instance exists on the cluster with the same base name - restart it
-        for instance in instances:
-            if instance.spec.cluster_id == cluster_id:
-                existing_instance = instance
+        for app in all_apps:
+            instances = self.backend.client.lightningapp_instance_service_list_lightningapp_instances(
+                project_id=project.project_id,
+                app_id=app.id,
+            ).lightningapps
+            if instances and instances[0].spec.cluster_id == cluster_id:
+                existing_app = app
+                existing_instance = instances[0]
                 break
 
-        # If instances exist but not on the cluster - choose a randomised name
-        if len(instances) > 0 and existing_instance is None:
+        # If apps exist but not on the cluster - choose a randomised name
+        if len(all_apps) > 0 and existing_app is None:
             name_exists = True
             while name_exists:
                 random_name = self._randomise_name(app_name)
-                name_exists = any([instance.name == random_name for instance in instances])
+                name_exists = any([app.name == random_name for app in all_apps])
 
             app_name = random_name
 
         # Create the app if it doesn't exist
-        if existing_instance is None:
+        if existing_app is None:
             app_body = Body7(name=app_name, can_download_source_code=True)
             lit_app = self.backend.client.lightningapp_v2_service_create_lightningapp_v2(
                 project_id=project.project_id, body=app_body
             )
             app_id = lit_app.id
         else:
-            app_id = existing_instance.spec.app_id
+            app_id = existing_app.id
 
         # check if user has sufficient credits to run an app
         # if so set the desired state to running otherwise, create the app in stopped state,
@@ -556,15 +571,10 @@ def _check_uploaded_folder(root: Path, repo: LocalSourceCodeDir) -> None:
             f"Your application folder '{root.absolute()}' is more than {CLOUD_UPLOAD_WARNING} MB. "
             f"The total size is {round(app_folder_size_in_mb, 2)} MB. {len(files)} files were uploaded.\n"
             + largest_paths_msg
-            + "Perhaps you should try running the app in an empty directory."
+            + "Perhaps you should try running the app in an empty directory.\n"
+            + "You can ignore some files or folders by adding them to `.lightningignore`.\n"
+            + " You can also set the `self.lightningignore` attribute in a Flow or Work."
         )
-        if not (root / DOT_IGNORE_FILENAME).is_file():
-            warning_msg += (
-                "\nIn order to ignore some files or folder, create a `.lightningignore` file and add the paths to"
-                " ignore. You can also set the `lightningingore` attribute in a Flow or Work."
-            )
-        else:
-            warning_msg += "\nYou can ignore some files or folders by adding them to `.lightningignore`."
        logger.warn(warning_msg)
 
@@ -580,6 +590,10 @@ def _project_has_sufficient_credits(self, project: V1Membership, app: Optional[L
     @classmethod
     def load_app_from_file(cls, filepath: str) -> "LightningApp":
         """Load a LightningApp from a file, mocking the imports."""
+
+        # Pretend we are running in the cloud when loading the app locally
+        os.environ["LAI_RUNNING_IN_CLOUD"] = "1"
+
         try:
             app = load_app_from_file(filepath, raise_exception=True, mock_imports=True)
         except FileNotFoundError as e:
@@ -590,6 +604,8 @@ def load_app_from_file(cls, filepath: str) -> "LightningApp":
             # Create a generic app.
             logger.info("Could not load the app locally. Starting the app directly on the cloud.")
             app = LightningApp(EmptyFlow())
+        finally:
+            del os.environ["LAI_RUNNING_IN_CLOUD"]
         return app
 
     @staticmethod
diff --git a/src/lightning_app/runners/runtime.py b/src/lightning_app/runners/runtime.py
index a30b78f9178a0..c6d8c3d4394b9 100644
--- a/src/lightning_app/runners/runtime.py
+++ b/src/lightning_app/runners/runtime.py
@@ -121,7 +121,7 @@ def terminate(self) -> None:
                 self._add_stopped_status_to_work(work)
 
             # Publish the updated state and wait for the frontend to update.
-            self.app.api_publish_state_queue.put(self.app.state)
+            self.app.api_publish_state_queue.put((self.app.state, self.app.status))
 
         for thread in self.threads + self.app.threads:
             thread.join(timeout=0)
diff --git a/src/lightning_app/utilities/app_helpers.py b/src/lightning_app/utilities/app_helpers.py
index 3b152786b682a..bc3d092b280dd 100644
--- a/src/lightning_app/utilities/app_helpers.py
+++ b/src/lightning_app/utilities/app_helpers.py
@@ -515,12 +515,29 @@ def _lightning_dispatched() -> bool:
     return bool(int(os.getenv("LIGHTNING_DISPATCHED", 0)))
 
 
+def _using_debugger() -> bool:
+    """This method is used to detect whether the app is run with a debugger attached."""
+    if "LIGHTNING_DETECTED_DEBUGGER" in os.environ:
+        return True
+
+    # Collect the information about the process.
+    parent_process = os.popen(f"ps -ax | grep -i {os.getpid()} | grep -v grep").read()
+
+    # Detect whether the VSCode or PyCharm debugger is being used
+    use_debugger = "debugpy" in parent_process or "pydev" in parent_process
+
+    # Store the result to avoid multiple popen calls.
+ if use_debugger: + os.environ["LIGHTNING_DETECTED_DEBUGGER"] = "1" + return use_debugger + + def _should_dispatch_app() -> bool: return ( - __debug__ - and "_pytest.doctest" not in sys.modules - and not _lightning_dispatched() + not _lightning_dispatched() and "LIGHTNING_APP_STATE_URL" not in os.environ + # Keep last to avoid running it if already dispatched + and _using_debugger() ) diff --git a/src/lightning_app/utilities/app_status.py b/src/lightning_app/utilities/app_status.py new file mode 100644 index 0000000000000..232c3f0b65210 --- /dev/null +++ b/src/lightning_app/utilities/app_status.py @@ -0,0 +1,29 @@ +from datetime import datetime +from typing import Any, Dict, Optional + +from pydantic import BaseModel + + +class WorkStatus(BaseModel): + """The ``WorkStatus`` captures the status of a work according to the app.""" + + stage: str + timestamp: float + reason: Optional[str] = None + message: Optional[str] = None + count: int = 1 + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + assert self.timestamp > 0 and self.timestamp < (int(datetime.now().timestamp()) + 10) + + +class AppStatus(BaseModel): + """The ``AppStatus`` captures the current status of the app and its components.""" + + # ``True`` when the app UI is ready to be viewed + is_ui_ready: bool + + # The statuses of ``LightningWork`` objects currently associated with this app + work_statuses: Dict[str, WorkStatus] diff --git a/src/lightning_app/utilities/cli_helpers.py b/src/lightning_app/utilities/cli_helpers.py index 0ec6eabd3022c..b45840a0d9489 100644 --- a/src/lightning_app/utilities/cli_helpers.py +++ b/src/lightning_app/utilities/cli_helpers.py @@ -281,7 +281,7 @@ def _check_version_and_upgrade(): prompt = f"A newer version of {__package_name__} is available ({new_version}). Would you like to upgrade?" 
if click.confirm(prompt, default=True): - command = f"pip install '{__package_name__}=={new_version}'" + command = f"pip install {__package_name__}=={new_version}" logger.info(f"âš¡ RUN: {command}") diff --git a/src/lightning_app/utilities/cloud.py b/src/lightning_app/utilities/cloud.py index 20ab6d14827c9..6db634649fdbb 100644 --- a/src/lightning_app/utilities/cloud.py +++ b/src/lightning_app/utilities/cloud.py @@ -39,4 +39,4 @@ def _sigterm_flow_handler(*_, app: "lightning_app.LightningApp"): def is_running_in_cloud() -> bool: """Returns True if the Lightning App is running in the cloud.""" - return "LIGHTNING_APP_STATE_URL" in os.environ + return bool(int(os.environ.get("LAI_RUNNING_IN_CLOUD", "0"))) or "LIGHTNING_APP_STATE_URL" in os.environ diff --git a/src/lightning_app/utilities/enum.py b/src/lightning_app/utilities/enum.py index 11cd7fabc4299..4c92ffba3db11 100644 --- a/src/lightning_app/utilities/enum.py +++ b/src/lightning_app/utilities/enum.py @@ -1,5 +1,4 @@ import enum -from dataclasses import dataclass from datetime import datetime, timezone from typing import Optional @@ -47,18 +46,6 @@ class WorkStageStatus: FAILED = "failed" -@dataclass -class WorkStatus: - stage: WorkStageStatus - timestamp: float - reason: Optional[str] = None - message: Optional[str] = None - count: int = 1 - - def __post_init__(self): - assert self.timestamp > 0 and self.timestamp < (int(datetime.now().timestamp()) + 10) - - def make_status(stage: str, message: Optional[str] = None, reason: Optional[str] = None): status = { "stage": stage, diff --git a/src/lightning_app/utilities/frontend.py b/src/lightning_app/utilities/frontend.py index 470036436a63c..afc5f21539862 100644 --- a/src/lightning_app/utilities/frontend.py +++ b/src/lightning_app/utilities/frontend.py @@ -22,11 +22,12 @@ def update_index_file(ui_root: str, info: Optional[AppInfo] = None, root_path: s entry_file = Path(ui_root) / "index.html" original_file = Path(ui_root) / "index.original.html" - if not original_file.exists(): - shutil.copyfile(entry_file, original_file) # keep backup - else: - # revert index.html in case it was modified after creating original.html - shutil.copyfile(original_file, entry_file) + if root_path: + if not original_file.exists(): + shutil.copyfile(entry_file, original_file) # keep backup + else: + # revert index.html in case it was modified after creating original.html + shutil.copyfile(original_file, entry_file) if info: with original_file.open() as f: diff --git a/src/lightning_app/utilities/introspection.py b/src/lightning_app/utilities/introspection.py index 9184f433acb5d..a2c9010c803f7 100644 --- a/src/lightning_app/utilities/introspection.py +++ b/src/lightning_app/utilities/introspection.py @@ -149,7 +149,7 @@ class LightningLoggerVisitor(LightningVisitor): Names of methods that are part of the Logger API. 
""" - class_name = "LightningLoggerBase" + class_name = "Logger" methods: Set[str] = {"log_hyperparams", "log_metrics"} @@ -248,10 +248,6 @@ class LightningAcceleratorVisitor(LightningVisitor): class_name = "Accelerator" -class LightningLoggerBaseVisitor(LightningVisitor): - class_name = "LightningLoggerBase" - - class LightningLoopVisitor(LightningVisitor): class_name = "Loop" @@ -264,8 +260,8 @@ class LightningLiteVisitor(LightningVisitor): class_name = "Fabric" -class LightningBaseProfilerVisitor(LightningVisitor): - class_name = "BaseProfiler" +class LightningProfilerVisitor(LightningVisitor): + class_name = "Profiler" class Scanner: @@ -295,11 +291,11 @@ class Scanner: LightningStrategyVisitor, LightningPrecisionPluginVisitor, LightningAcceleratorVisitor, - LightningLoggerBaseVisitor, + LightningLoggerVisitor, LightningLoopVisitor, TorchMetricVisitor, LightningLiteVisitor, - LightningBaseProfilerVisitor, + LightningProfilerVisitor, ] def __init__(self, path: str, glob_pattern: str = "**/*.py"): diff --git a/src/lightning_fabric/cli.py b/src/lightning_fabric/cli.py index 913ed4d98cf17..a50c580c37ad8 100644 --- a/src/lightning_fabric/cli.py +++ b/src/lightning_fabric/cli.py @@ -20,7 +20,6 @@ from lightning_fabric.accelerators import CPUAccelerator, CUDAAccelerator, MPSAccelerator from lightning_fabric.utilities.device_parser import _parse_gpu_ids -from lightning_fabric.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_13 _log = logging.getLogger(__name__) @@ -148,15 +147,6 @@ def _get_num_processes(accelerator: str, devices: str) -> int: def _torchrun_launch(args: Namespace, script_args: List[str]) -> None: """This will invoke `torchrun` programmatically to launch the given script in new processes.""" - - if _IS_WINDOWS and _TORCH_GREATER_EQUAL_1_13: # pragma: no cover - # TODO: remove once import issue is resolved: https://github.com/pytorch/pytorch/issues/85427 - _log.error( - "On the Windows platform, this launcher is currently only supported on torch < 1.13 due to a bug" - " upstream: https://github.com/pytorch/pytorch/issues/85427" - ) - raise SystemExit(1) - import torch.distributed.run as torchrun if args.strategy == "dp": diff --git a/src/lightning_fabric/connector.py b/src/lightning_fabric/connector.py index ac102e0a94ca7..a64097eaf61d5 100644 --- a/src/lightning_fabric/connector.py +++ b/src/lightning_fabric/connector.py @@ -402,7 +402,7 @@ def _check_strategy_and_fallback(self) -> None: # TODO this logic should apply to both str and object config strategy_flag = "" if isinstance(self._strategy_flag, Strategy) else self._strategy_flag - if strategy_flag in ("ddp_spawn", "ddp_spawn_find_unused_parameters_false") and ( + if strategy_flag == "ddp_spawn" and ( TorchElasticEnvironment.detect() or KubeflowEnvironment.detect() or SLURMEnvironment.detect() diff --git a/src/lightning_fabric/strategies/ddp.py b/src/lightning_fabric/strategies/ddp.py index 6fea032be5842..c27980f0af3d0 100644 --- a/src/lightning_fabric/strategies/ddp.py +++ b/src/lightning_fabric/strategies/ddp.py @@ -43,9 +43,7 @@ _DDP_FORK_ALIASES = ( "ddp_fork", - "ddp_fork_find_unused_parameters_false", "ddp_notebook", - "ddp_notebook_find_unused_parameters_false", ) @@ -177,21 +175,6 @@ def register_strategies(cls, strategy_registry: Dict) -> None: start_method=start_method, ) - entries = ( - ("ddp_find_unused_parameters_false", "popen"), - ("ddp_spawn_find_unused_parameters_false", "spawn"), - ("ddp_fork_find_unused_parameters_false", "fork"), - ("ddp_notebook_find_unused_parameters_false", "fork"), - ) - 
for name, start_method in entries: - strategy_registry.register( - name, - cls, - description=f"DDP strategy with `find_unused_parameters` as False and `start_method={start_method!r}`", - find_unused_parameters=False, - start_method=start_method, - ) - def _setup_distributed(self) -> None: self._set_world_ranks() rank_zero_only.rank = self.global_rank diff --git a/src/lightning_fabric/strategies/fairscale.py b/src/lightning_fabric/strategies/fairscale.py index 27cd4c4611eb1..e78be8ec83083 100644 --- a/src/lightning_fabric/strategies/fairscale.py +++ b/src/lightning_fabric/strategies/fairscale.py @@ -103,24 +103,11 @@ def setup_optimizer(self, optimizer: Optimizer) -> Optimizer: @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: - strategy_registry.register( - "ddp_sharded_find_unused_parameters_false", - cls, - description="DDP Sharded Strategy with `find_unused_parameters` as False", - find_unused_parameters=False, - ) strategy_registry.register( "ddp_sharded", cls, description=cls.__class__.__name__, ) - strategy_registry.register( - "ddp_sharded_spawn_find_unused_parameters_false", - cls, - description="DDP Spawn Sharded Strategy with `find_unused_parameters` as False", - find_unused_parameters=False, - start_method="spawn", - ) strategy_registry.register("ddp_sharded_spawn", cls, description=cls.__class__.__name__, start_method="spawn") diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index a3cb0dced11b9..fe7dcf31c2d79 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -72,7 +72,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated `description`, `env_prefix` and `env_parse` parameters in `LightningCLI.__init__` in favour of giving them through `parser_kwargs` ([#15651](https://github.com/Lightning-AI/lightning/pull/15651)) -- +- Deprecated `pytorch_lightning.profiler` in favor of `pytorch_lightning.profilers` ([#16059](https://github.com/PyTorchLightning/pytorch-lightning/pull/16059)) ### Removed @@ -98,6 +98,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed the deprecated `pytorch_lightning.accelerators.GPUAccelerator` in favor of `pytorch_lightning.accelerators.CUDAAccelerator` ([#16050](https://github.com/Lightning-AI/lightning/pull/16050)) +- Removed the deprecated `pytorch_lightning.profiler.*` classes in favor of `pytorch_lightning.profilers` ([#16059](https://github.com/PyTorchLightning/pytorch-lightning/pull/16059)) + ### Fixed @@ -512,7 +514,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Deprecated `Trainer.reset_train_val_dataloaders()` in favor of `Trainer.reset_{train,val}_dataloader` ([#12184](https://github.com/Lightning-AI/lightning/pull/12184)) - Deprecated LightningCLI's registries in favor of importing the respective package ([#13221](https://github.com/Lightning-AI/lightning/pull/13221)) - Deprecated public utilities in `pytorch_lightning.utilities.cli.LightningCLI` in favor of equivalent copies in `pytorch_lightning.cli.LightningCLI` ([#13767](https://github.com/Lightning-AI/lightning/pull/13767)) -- Deprecated `pytorch_lightning.profiler` in favor of `pytorch_lightning.profilers` ([#12308](https://github.com/Lightning-AI/lightning/pull/12308)) +- Deprecated `pytorch_lightning.profiler.*` in favor of `pytorch_lightning.profilers` ([#12308](https://github.com/Lightning-AI/lightning/pull/12308)) ### Removed diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md index cd5698821b253..67ddf6eeca6a6 100644 --- a/src/pytorch_lightning/README.md +++ b/src/pytorch_lightning/README.md @@ -84,9 +84,9 @@ Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs | Linux py3.7 \[TPUs\*\*\*\] | - | - | | Linux py3.8 \[IPUs\] | - | - | | Linux py3.8 \[HPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | -| Linux py3.{7,9} | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-tests.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-tests.yml) | -| OSX py3.{7,9} | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-tests.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-tests.yml) | -| Windows py3.{7,9} | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-tests.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-tests.yml) | +| Linux py3.{7,9} | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml) | +| OSX py3.{7,9} | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml) | +| Windows py3.{7,9} | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-tests-pytorch.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. TPU py3.7 means we support Colab and Kaggle env._ diff --git a/src/pytorch_lightning/_graveyard/profiler.py b/src/pytorch_lightning/_graveyard/profiler.py index 258a68443e605..6c53e4a7eb86b 100644 --- a/src/pytorch_lightning/_graveyard/profiler.py +++ b/src/pytorch_lightning/_graveyard/profiler.py @@ -11,12 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import sys
 from typing import Any
 
-import pytorch_lightning.profiler.base as base
+
+def _patch_sys_modules() -> None:
+    # TODO: Remove in v2.0.0
+    self = sys.modules[__name__]
+    sys.modules["pytorch_lightning.profiler.advanced"] = self
+    sys.modules["pytorch_lightning.profiler.base"] = self
+    sys.modules["pytorch_lightning.profiler.profiler"] = self
+    sys.modules["pytorch_lightning.profiler.pytorch"] = self
+    sys.modules["pytorch_lightning.profiler.simple"] = self
+    sys.modules["pytorch_lightning.profiler.xla"] = self
 
 
-class _AbstractProfiler:
+class AbstractProfiler:
     # TODO: Remove in v2.0.0
     def __init__(self, *_: Any, **__: Any) -> None:
         raise NotImplementedError(
@@ -25,14 +35,85 @@ def __init__(self, *_: Any, **__: Any) -> None:
         )
 
 
-class _BaseProfiler:
+class BaseProfiler:
     # TODO: Remove in v2.0.0
     def __init__(self, *_: Any, **__: Any) -> None:
         raise RuntimeError(
-            "`pytorch_lightning.profiler.base.AbstractProfiler` was deprecated in v1.6 and is no longer supported"
+            "`pytorch_lightning.profiler.base.BaseProfiler` was deprecated in v1.6 and is no longer supported"
             " as of v1.9. Use `pytorch_lightning.profilers.Profiler` instead."
         )
 
 
-base.AbstractProfiler = _AbstractProfiler
-base.BaseProfiler = _BaseProfiler
+class AdvancedProfiler:
+    # TODO: Remove in v2.0.0
+    def __init__(self, *_: Any, **__: Any) -> None:
+        raise RuntimeError(
+            "`pytorch_lightning.profiler.advanced.AdvancedProfiler` was deprecated in v1.7.0 and is no longer"
+            " supported as of v1.9.0. Use `pytorch_lightning.profilers.AdvancedProfiler` instead."
+        )
+
+
+class PassThroughProfiler:
+    # TODO: Remove in v2.0.0
+    def __init__(self, *_: Any, **__: Any) -> None:
+        raise RuntimeError(
+            "`pytorch_lightning.profiler.base.PassThroughProfiler` was deprecated in v1.7.0 and is no longer"
+            " supported as of v1.9.0. Use `pytorch_lightning.profilers.PassThroughProfiler` instead."
+        )
+
+
+class Profiler:
+    # TODO: Remove in v2.0.0
+    def __init__(self, *_: Any, **__: Any) -> None:
+        raise RuntimeError(
+            "`pytorch_lightning.profiler.profiler.Profiler` was deprecated in v1.7.0 and is no longer"
+            " supported as of v1.9.0. Use `pytorch_lightning.profilers.Profiler` instead."
+        )
+
+
+class PyTorchProfiler:
+    # TODO: Remove in v2.0.0
+    def __init__(self, *_: Any, **__: Any) -> None:
+        raise RuntimeError(
+            "`pytorch_lightning.profiler.pytorch.PyTorchProfiler` was deprecated in v1.7.0 and is no longer"
+            " supported as of v1.9.0. Use `pytorch_lightning.profilers.PyTorchProfiler` instead."
+        )
+
+
+class RegisterRecordFunction:
+    # TODO: Remove in v2.0.0
+    def __init__(self, *_: Any, **__: Any) -> None:
+        raise RuntimeError(
+            "`pytorch_lightning.profiler.pytorch.RegisterRecordFunction` was deprecated in v1.7.0 and is no longer"
+            " supported as of v1.9.0. Use `pytorch_lightning.profilers.pytorch.RegisterRecordFunction` instead."
+        )
+
+
+class ScheduleWrapper:
+    # TODO: Remove in v2.0.0
+    def __init__(self, *_: Any, **__: Any) -> None:
+        raise RuntimeError(
+            "`pytorch_lightning.profiler.pytorch.ScheduleWrapper` was deprecated in v1.7.0 and is no longer"
+            " supported as of v1.9.0. Use `pytorch_lightning.profilers.pytorch.ScheduleWrapper` instead."
+        )
+
+
+class SimpleProfiler:
+    # TODO: Remove in v2.0.0
+    def __init__(self, *_: Any, **__: Any) -> None:
+        raise RuntimeError(
+            "`pytorch_lightning.profiler.simple.SimpleProfiler` was deprecated in v1.7.0 and is no longer"
+            " supported as of v1.9.0. Use `pytorch_lightning.profilers.SimpleProfiler` instead."
+        )
+
+
+class XLAProfiler:
+    # TODO: Remove in v2.0.0
+    def __init__(self, *_: Any, **__: Any) -> None:
+        raise RuntimeError(
+            "`pytorch_lightning.profiler.xla.XLAProfiler` was deprecated in v1.7.0 and is no longer"
+            " supported as of v1.9.0. Use `pytorch_lightning.profilers.XLAProfiler` instead."
+        )
+
+
+_patch_sys_modules()
diff --git a/src/pytorch_lightning/callbacks/model_checkpoint.py b/src/pytorch_lightning/callbacks/model_checkpoint.py
index 4cc0b69e09397..9d1dfacbf506a 100644
--- a/src/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/src/pytorch_lightning/callbacks/model_checkpoint.py
@@ -649,7 +649,7 @@ def _save_last_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[
             previous, self.last_model_path = self.last_model_path, filepath
             self._save_checkpoint(trainer, filepath)
             if previous and previous != filepath:
-                trainer.strategy.remove_checkpoint(previous)
+                self._remove_checkpoint(trainer, previous)
 
     def _save_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidates: Dict[str, Tensor]) -> None:
         assert self.monitor
@@ -668,7 +668,7 @@ def _save_none_monitor_checkpoint(self, trainer: "pl.Trainer", monitor_candidate
             previous, self.best_model_path = self.best_model_path, filepath
             self._save_checkpoint(trainer, filepath)
             if self.save_top_k == 1 and previous and previous != filepath:
-                trainer.strategy.remove_checkpoint(previous)
+                self._remove_checkpoint(trainer, previous)
 
     def _update_best_and_save(
         self, current: Tensor, trainer: "pl.Trainer", monitor_candidates: Dict[str, Tensor]
@@ -710,7 +710,7 @@ def _update_best_and_save(
         self._save_checkpoint(trainer, filepath)
 
         if del_filepath is not None and filepath != del_filepath:
-            trainer.strategy.remove_checkpoint(del_filepath)
+            self._remove_checkpoint(trainer, del_filepath)
 
     def to_yaml(self, filepath: Optional[_PATH] = None) -> None:
         """Saves the `best_k_models` dict containing the checkpoint paths with the corresponding scores to a YAML
@@ -727,3 +727,7 @@ def file_exists(self, filepath: _PATH, trainer: "pl.Trainer") -> bool:
         state to diverge between ranks."""
         exists = self._fs.exists(filepath)
         return trainer.strategy.broadcast(exists)
+
+    def _remove_checkpoint(self, trainer: "pl.Trainer", filepath: str) -> None:
+        """Calls the strategy to remove the checkpoint file."""
+        trainer.strategy.remove_checkpoint(filepath)
diff --git a/src/pytorch_lightning/profiler/__init__.py b/src/pytorch_lightning/profiler/__init__.py
index 0e97d02feb202..376c221290175 100644
--- a/src/pytorch_lightning/profiler/__init__.py
+++ b/src/pytorch_lightning/profiler/__init__.py
@@ -11,12 +11,70 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from pytorch_lightning.profilers.advanced import AdvancedProfiler -from pytorch_lightning.profilers.base import PassThroughProfiler -from pytorch_lightning.profilers.profiler import Profiler -from pytorch_lightning.profilers.pytorch import PyTorchProfiler -from pytorch_lightning.profilers.simple import SimpleProfiler -from pytorch_lightning.profilers.xla import XLAProfiler +from typing import Any + +from pytorch_lightning.profilers.advanced import AdvancedProfiler as NewAdvancedProfiler +from pytorch_lightning.profilers.base import PassThroughProfiler as NewPassThroughProfiler +from pytorch_lightning.profilers.profiler import Profiler as NewProfiler +from pytorch_lightning.profilers.pytorch import PyTorchProfiler as NewPyTorchProfiler +from pytorch_lightning.profilers.simple import SimpleProfiler as NewSimpleProfiler +from pytorch_lightning.profilers.xla import XLAProfiler as NewXLAProfiler +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation + + +class AdvancedProfiler(NewAdvancedProfiler): + def __init__(self, *args: Any, **kwargs: Any) -> None: + rank_zero_deprecation( + "`pytorch_lightning.profiler.AdvancedProfiler` is deprecated in v1.9.0 and will be removed in v1.10.0." + " Use the equivalent `pytorch_lightning.profilers.AdvancedProfiler` class instead." + ) + super().__init__(*args, **kwargs) + + +class PassThroughProfiler(NewPassThroughProfiler): + def __init__(self, *args: Any, **kwargs: Any) -> None: + rank_zero_deprecation( + "`pytorch_lightning.profiler.PassThroughProfiler` is deprecated in v1.9.0 and will be removed in v1.10.0." + " Use the equivalent `pytorch_lightning.profilers.PassThroughProfiler` class instead." + ) + super().__init__(*args, **kwargs) + + +class Profiler(NewProfiler): + def __init__(self, *args: Any, **kwargs: Any) -> None: + rank_zero_deprecation( + "`pytorch_lightning.profiler.Profiler` is deprecated in v1.9.0 and will be removed in v1.10.0." + " Use the equivalent `pytorch_lightning.profilers.Profiler` class instead." + ) + super().__init__(*args, **kwargs) + + +class PyTorchProfiler(NewPyTorchProfiler): + def __init__(self, *args: Any, **kwargs: Any) -> None: + rank_zero_deprecation( + "`pytorch_lightning.profiler.PyTorchProfiler` is deprecated in v1.9.0 and will be removed in v1.10.0." + " Use the equivalent `pytorch_lightning.profilers.PyTorchProfiler` class instead." + ) + super().__init__(*args, **kwargs) + + +class SimpleProfiler(NewSimpleProfiler): + def __init__(self, *args: Any, **kwargs: Any) -> None: + rank_zero_deprecation( + "`pytorch_lightning.profiler.SimpleProfiler` is deprecated in v1.9.0 and will be removed in v1.10.0." + " Use the equivalent `pytorch_lightning.profilers.SimpleProfiler` class instead." + ) + super().__init__(*args, **kwargs) + + +class XLAProfiler(NewXLAProfiler): + def __init__(self, *args: Any, **kwargs: Any) -> None: + rank_zero_deprecation( + "`pytorch_lightning.profiler.XLAProfiler` is deprecated in v1.9.0 and will be removed in v1.10.0." + " Use the equivalent `pytorch_lightning.profilers.XLAProfiler` class instead." + ) + super().__init__(*args, **kwargs) + __all__ = [ "Profiler", diff --git a/src/pytorch_lightning/profiler/advanced.py b/src/pytorch_lightning/profiler/advanced.py deleted file mode 100644 index d0456f7afa303..0000000000000 --- a/src/pytorch_lightning/profiler/advanced.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.profilers.advanced import AdvancedProfiler as NewAdvancedProfiler -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - - -class AdvancedProfiler(NewAdvancedProfiler): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "`pytorch_lightning.profiler.AdvancedProfiler` is deprecated in v1.7 and will be removed in v1.9." - " Use the equivalent `pytorch_lightning.profilers.AdvancedProfiler` class instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/profiler/base.py b/src/pytorch_lightning/profiler/base.py deleted file mode 100644 index 72eeccf38070b..0000000000000 --- a/src/pytorch_lightning/profiler/base.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Profiler to check if there are any bottlenecks in your code.""" -from pytorch_lightning.profilers.base import PassThroughProfiler as NewPassThroughProfiler -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - - -class PassThroughProfiler(NewPassThroughProfiler): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "`pytorch_lightning.profiler.PassThroughProfiler` is deprecated in v1.7 and will be removed in v1.9." - " Use the equivalent `pytorch_lightning.profilers.PassThroughProfiler` class instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/profiler/profiler.py b/src/pytorch_lightning/profiler/profiler.py deleted file mode 100644 index 544a0a83f6054..0000000000000 --- a/src/pytorch_lightning/profiler/profiler.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from pytorch_lightning.profilers.profiler import Profiler as NewProfiler -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - - -class Profiler(NewProfiler): - """ - .. deprecated:: v1.7 - `pytorch_lightning.profiler.Profiler` is deprecated in v1.7 and will be removed in v1.9. - Use the equivalent `pytorch_lightning.profilers.Profiler` class instead. - """ - - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "`pytorch_lightning.profiler.Profiler` is deprecated in v1.7 and will be removed in v1.9." - " Use the equivalent `pytorch_lightning.profilers.Profiler` class instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/profiler/pytorch.py b/src/pytorch_lightning/profiler/pytorch.py deleted file mode 100644 index 488ce3b654673..0000000000000 --- a/src/pytorch_lightning/profiler/pytorch.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.profilers.pytorch import PyTorchProfiler as NewPyTorchProfiler -from pytorch_lightning.profilers.pytorch import RegisterRecordFunction as NewRegisterRecordFuncion -from pytorch_lightning.profilers.pytorch import ScheduleWrapper as NewScheduleWrapper -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - - -class RegisterRecordFunction(NewRegisterRecordFuncion): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "`pytorch_lightning.profiler.pytorch.RegisterRecordFunction` is deprecated in v1.7 and will be removed in" - " in v1.9. Use the equivalent `pytorch_lightning.profilers.pytorch.RegisterRecordFunction` class instead." - ) - super().__init__(*args, **kwargs) - - -class ScheduleWrapper(NewScheduleWrapper): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "`pytorch_lightning.profiler.pytorch.ScheduleWrapper` is deprecated in v1.7 and will be removed in v1.9." - " Use the equivalent `pytorch_lightning.profilers.pytorch.ScheduleWrapper` class instead." - ) - super().__init__(*args, **kwargs) - - -class PyTorchProfiler(NewPyTorchProfiler): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "`pytorch_lightning.profiler.PyTorchProfiler` is deprecated in v1.7 and will be removed in v1.9." - " Use the equivalent `pytorch_lightning.profilers.PyTorchProfiler` class instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/profiler/simple.py b/src/pytorch_lightning/profiler/simple.py deleted file mode 100644 index 9438f516b2c93..0000000000000 --- a/src/pytorch_lightning/profiler/simple.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.profilers.simple import SimpleProfiler as NewSimpleProfiler -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - - -class SimpleProfiler(NewSimpleProfiler): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "`pytorch_lightning.profiler.SimpleProfiler` is deprecated in v1.7 and will be removed in v1.9." - " Use the equivalent `pytorch_lightning.profilers.SimpleProfiler` class instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/profiler/xla.py b/src/pytorch_lightning/profiler/xla.py deleted file mode 100644 index 0cdc0196001ff..0000000000000 --- a/src/pytorch_lightning/profiler/xla.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.profilers.xla import XLAProfiler as NewXLAProfiler -from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation - - -class XLAProfiler(NewXLAProfiler): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "`pytorch_lightning.profiler.XLAProfiler` is deprecated in v1.7 and will be removed in v1.9." - " Use the equivalent `pytorch_lightning.profilers.XLAProfiler` class instead." 
- ) - super().__init__(*args, **kwargs) diff --git a/tests/tests_app/cli/test_cloud_cli.py b/tests/tests_app/cli/test_cloud_cli.py index 598a90368da88..3b9317a3a9613 100644 --- a/tests/tests_app/cli/test_cloud_cli.py +++ b/tests/tests_app/cli/test_cloud_cli.py @@ -11,6 +11,7 @@ from lightning_cloud.openapi import ( V1LightningappV2, V1ListLightningappInstancesResponse, + V1ListLightningappsV2Response, V1ListMembershipsResponse, V1Membership, ) @@ -36,6 +37,9 @@ class FakeResponse: class FakeLightningClient: + def lightningapp_v2_service_list_lightningapps_v2(self, *args, **kwargs): + return V1ListLightningappsV2Response(lightningapps=[]) + def lightningapp_instance_service_list_lightningapp_instances(self, *args, **kwargs): return V1ListLightningappInstancesResponse(lightningapps=[]) @@ -182,7 +186,7 @@ def __init__(self, *args, message, **kwargs): super().__init__() self.message = message - def lightningapp_instance_service_list_lightningapp_instances(self, *args, **kwargs): + def lightningapp_v2_service_list_lightningapps_v2(self, *args, **kwargs): raise ApiException( http_resp=HttpHeaderDict( data=self.message, diff --git a/tests/tests_app/components/multi_node/test_base.py b/tests/tests_app/components/multi_node/test_base.py index 8582047a2ff20..8fd4f28e679a0 100644 --- a/tests/tests_app/components/multi_node/test_base.py +++ b/tests/tests_app/components/multi_node/test_base.py @@ -13,7 +13,7 @@ class Work(LightningWork): def run(self): pass - with pytest.warns(UserWarning, match=escape("You set MultiNode(num_nodes=1, ...)` but ")): + with pytest.warns(UserWarning, match=escape("You set MultiNode(num_nodes=2, ...)` but ")): MultiNode(Work, num_nodes=2, cloud_compute=CloudCompute("gpu")) with no_warning_call(UserWarning, match=escape("You set MultiNode(num_nodes=1, ...)` but ")): diff --git a/tests/tests_app/components/multi_node/test_trainer.py b/tests/tests_app/components/multi_node/test_trainer.py index 6b77604d43d7f..7cd28a0e09992 100644 --- a/tests/tests_app/components/multi_node/test_trainer.py +++ b/tests/tests_app/components/multi_node/test_trainer.py @@ -66,7 +66,7 @@ def test_trainer_run_executor_mps_forced_cpu(accelerator_given, accelerator_expe ({"strategy": "ddp_sharded_spawn"}, {"strategy": "ddp_sharded"}), ], ) -@pytest.mark.skipif(not module_available("pytorch"), reason="Lightning is not available") +@pytest.mark.skipif(not module_available("torch"), reason="PyTorch is not available") def test_trainer_run_executor_arguments_choices( args_given: dict, args_expected: dict, diff --git a/tests/tests_app/components/serve/test_auto_scaler.py b/tests/tests_app/components/serve/test_auto_scaler.py new file mode 100644 index 0000000000000..c3cfa99c9d69b --- /dev/null +++ b/tests/tests_app/components/serve/test_auto_scaler.py @@ -0,0 +1,175 @@ +import time +from unittest import mock +from unittest.mock import patch + +import pytest + +from lightning_app import CloudCompute, LightningWork +from lightning_app.components import AutoScaler, Text + + +class EmptyWork(LightningWork): + def run(self): + pass + + +class AutoScaler1(AutoScaler): + def scale(self, replicas: int, metrics) -> int: + # only upscale + return replicas + 1 + + +class AutoScaler2(AutoScaler): + def scale(self, replicas: int, metrics) -> int: + # only downscale + return replicas - 1 + + +def test_num_replicas_after_init(): + """Test the number of works is the same as min_replicas after initialization.""" + min_replicas = 2 + auto_scaler = AutoScaler(EmptyWork, min_replicas=min_replicas) + assert 
auto_scaler.num_replicas == min_replicas
+
+
+@patch("uvicorn.run")
+@patch("lightning_app.components.serve.auto_scaler._LoadBalancer.url")
+@patch("lightning_app.components.serve.auto_scaler.AutoScaler.num_pending_requests")
+def test_num_replicas_not_above_max_replicas(*_):
+    """Test self.num_replicas doesn't exceed max_replicas."""
+    max_replicas = 6
+    auto_scaler = AutoScaler1(
+        EmptyWork,
+        min_replicas=1,
+        max_replicas=max_replicas,
+        scale_out_interval=0.001,
+        scale_in_interval=0.001,
+    )
+
+    for _ in range(max_replicas + 1):
+        time.sleep(0.002)
+        auto_scaler.run()
+
+    assert auto_scaler.num_replicas == max_replicas
+
+
+@patch("uvicorn.run")
+@patch("lightning_app.components.serve.auto_scaler._LoadBalancer.url")
+@patch("lightning_app.components.serve.auto_scaler.AutoScaler.num_pending_requests")
+def test_num_replicas_not_below_min_replicas(*_):
+    """Test self.num_replicas doesn't go below min_replicas."""
+    min_replicas = 1
+    auto_scaler = AutoScaler2(
+        EmptyWork,
+        min_replicas=min_replicas,
+        max_replicas=4,
+        scale_out_interval=0.001,
+        scale_in_interval=0.001,
+    )
+
+    for _ in range(3):
+        time.sleep(0.002)
+        auto_scaler.run()
+
+    assert auto_scaler.num_replicas == min_replicas
+
+
+@pytest.mark.parametrize(
+    "replicas, metrics, expected_replicas",
+    [
+        pytest.param(1, {"pending_requests": 1, "pending_works": 0}, 2, id="increase if no pending work"),
+        pytest.param(1, {"pending_requests": 1, "pending_works": 1}, 1, id="dont increase if pending works"),
+        pytest.param(8, {"pending_requests": 1, "pending_works": 0}, 7, id="reduce if requests < 25% capacity"),
+        pytest.param(8, {"pending_requests": 2, "pending_works": 0}, 8, id="dont reduce if requests >= 25% capacity"),
+    ],
+)
+def test_scale(replicas, metrics, expected_replicas):
+    """Test `scale()`, the default scaling strategy."""
+    auto_scaler = AutoScaler(
+        EmptyWork,
+        min_replicas=1,
+        max_replicas=8,
+        max_batch_size=1,
+    )
+
+    assert auto_scaler.scale(replicas, metrics) == expected_replicas
+
+
+def test_scale_from_zero_min_replica():
+    auto_scaler = AutoScaler(
+        EmptyWork,
+        min_replicas=0,
+        max_replicas=2,
+        max_batch_size=10,
+    )
+
+    resp = auto_scaler.scale(0, {"pending_requests": 0, "pending_works": 0})
+    assert resp == 0
+
+    resp = auto_scaler.scale(0, {"pending_requests": 1, "pending_works": 0})
+    assert resp == 1
+
+    resp = auto_scaler.scale(0, {"pending_requests": 1, "pending_works": 1})
+    assert resp <= 0
+
+
+def test_create_work_cloud_compute_cloned():
+    """Test CloudCompute is cloned to avoid creating multiple works in a single machine."""
+    cloud_compute = CloudCompute("gpu")
+    auto_scaler = AutoScaler(EmptyWork, cloud_compute=cloud_compute)
+    _ = auto_scaler.create_work()
+    assert auto_scaler._work_kwargs["cloud_compute"] is not cloud_compute
+
+
+fastapi_mock = mock.MagicMock()
+mocked_fastapi_creater = mock.MagicMock(return_value=fastapi_mock)
+
+
+@patch("lightning_app.components.serve.auto_scaler._create_fastapi", mocked_fastapi_creater)
+@patch("lightning_app.components.serve.auto_scaler.uvicorn.run", mock.MagicMock())
+def test_API_ACCESS_ENDPOINT_creation():
+    auto_scaler = AutoScaler(EmptyWork, input_type=Text, output_type=Text)
+    assert auto_scaler.load_balancer._work_name == "EmptyWork"
+
+    auto_scaler.load_balancer.run()
+    fastapi_mock.mount.assert_called_once_with("/endpoint-info", mock.ANY, name="static")
+
+
+def test_autoscaler_scale_up(monkeypatch):
+    monkeypatch.setattr(AutoScaler, "num_pending_works", 0)
+    monkeypatch.setattr(AutoScaler, "num_pending_requests", 100)
monkeypatch.setattr(AutoScaler, "scale", mock.MagicMock(return_value=1)) + monkeypatch.setattr(AutoScaler, "create_work", mock.MagicMock()) + monkeypatch.setattr(AutoScaler, "add_work", mock.MagicMock()) + + auto_scaler = AutoScaler(EmptyWork, min_replicas=0, max_replicas=4, scale_out_interval=0.001) + + # Mocking the attributes + auto_scaler._last_autoscale = time.time() - 100000 + auto_scaler.num_replicas = 0 + + # triggering scale up + auto_scaler.autoscale() + auto_scaler.scale.assert_called_once() + auto_scaler.create_work.assert_called_once() + auto_scaler.add_work.assert_called_once() + + +def test_autoscaler_scale_down(monkeypatch): + monkeypatch.setattr(AutoScaler, "num_pending_works", 0) + monkeypatch.setattr(AutoScaler, "num_pending_requests", 0) + monkeypatch.setattr(AutoScaler, "scale", mock.MagicMock(return_value=0)) + monkeypatch.setattr(AutoScaler, "remove_work", mock.MagicMock()) + monkeypatch.setattr(AutoScaler, "workers", mock.MagicMock()) + + auto_scaler = AutoScaler(EmptyWork, min_replicas=0, max_replicas=4, scale_in_interval=0.001) + + # Mocking the attributes + auto_scaler._last_autoscale = time.time() - 100000 + auto_scaler.num_replicas = 1 + auto_scaler.__dict__["load_balancer"] = mock.MagicMock() + + # triggering scale up + auto_scaler.autoscale() + auto_scaler.scale.assert_called_once() + auto_scaler.remove_work.assert_called_once() diff --git a/tests/tests_app/components/serve/test_python_server.py b/tests/tests_app/components/serve/test_python_server.py index 313638e9ec42a..45275af9f87b7 100644 --- a/tests/tests_app/components/serve/test_python_server.py +++ b/tests/tests_app/components/serve/test_python_server.py @@ -32,14 +32,14 @@ def test_python_server_component(): def test_image_sample_data(): - data = Image()._get_sample_data() + data = Image().get_sample_data() assert isinstance(data, dict) assert "image" in data assert len(data["image"]) > 100 def test_number_sample_data(): - data = Number()._get_sample_data() + data = Number().get_sample_data() assert isinstance(data, dict) assert "prediction" in data assert data["prediction"] == 463 diff --git a/tests/tests_app/components/test_auto_scaler.py b/tests/tests_app/components/test_auto_scaler.py deleted file mode 100644 index 672b05bbc9a15..0000000000000 --- a/tests/tests_app/components/test_auto_scaler.py +++ /dev/null @@ -1,100 +0,0 @@ -import time -from unittest.mock import patch - -import pytest - -from lightning_app import CloudCompute, LightningWork -from lightning_app.components import AutoScaler - - -class EmptyWork(LightningWork): - def run(self): - pass - - -class AutoScaler1(AutoScaler): - def scale(self, replicas: int, metrics) -> int: - # only upscale - return replicas + 1 - - -class AutoScaler2(AutoScaler): - def scale(self, replicas: int, metrics) -> int: - # only downscale - return replicas - 1 - - -def test_num_replicas_after_init(): - """Test the number of works is the same as min_replicas after initialization.""" - min_replicas = 2 - auto_scaler = AutoScaler(EmptyWork, min_replicas=min_replicas) - assert auto_scaler.num_replicas == min_replicas - - -@patch("uvicorn.run") -@patch("lightning_app.components.auto_scaler._LoadBalancer.url") -@patch("lightning_app.components.auto_scaler.AutoScaler.num_pending_requests") -def test_num_replicas_not_above_max_replicas(*_): - """Test self.num_replicas doesn't exceed max_replicas.""" - max_replicas = 6 - auto_scaler = AutoScaler1( - EmptyWork, - min_replicas=1, - max_replicas=max_replicas, - autoscale_interval=0.001, - ) - - for _ in 
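The new `serve/test_auto_scaler.py` above pins down the `AutoScaler` contract: a subclass overrides `scale(replicas, metrics)` and returns the desired replica count, `metrics` carries `pending_requests` and `pending_works`, and the result is clamped to `[min_replicas, max_replicas]` (see `test_num_replicas_not_above_max_replicas` and its below-minimum counterpart). A minimal sketch of a custom policy, assuming only what these tests exercise; the class and argument names are from the diff, the policy itself is illustrative:

```python
from lightning_app.components import AutoScaler


class QueueDepthScaler(AutoScaler):
    """Illustrative policy only: scale on queue depth, one replica at a time."""

    def scale(self, replicas: int, metrics) -> int:
        if metrics["pending_works"]:
            return replicas  # a replica is already being created; don't pile on
        if metrics["pending_requests"] > 0:
            return replicas + 1  # AutoScaler clamps this to max_replicas
        return replicas - 1  # likewise clamped to min_replicas
```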
diff --git a/tests/tests_app/components/serve/test_python_server.py b/tests/tests_app/components/serve/test_python_server.py
index 313638e9ec42a..45275af9f87b7 100644
--- a/tests/tests_app/components/serve/test_python_server.py
+++ b/tests/tests_app/components/serve/test_python_server.py
@@ -32,14 +32,14 @@ def test_python_server_component():
 
 
 def test_image_sample_data():
-    data = Image()._get_sample_data()
+    data = Image().get_sample_data()
     assert isinstance(data, dict)
     assert "image" in data
     assert len(data["image"]) > 100
 
 
 def test_number_sample_data():
-    data = Number()._get_sample_data()
+    data = Number().get_sample_data()
     assert isinstance(data, dict)
     assert "prediction" in data
     assert data["prediction"] == 463
diff --git a/tests/tests_app/components/test_auto_scaler.py b/tests/tests_app/components/test_auto_scaler.py
deleted file mode 100644
index 672b05bbc9a15..0000000000000
--- a/tests/tests_app/components/test_auto_scaler.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import time
-from unittest.mock import patch
-
-import pytest
-
-from lightning_app import CloudCompute, LightningWork
-from lightning_app.components import AutoScaler
-
-
-class EmptyWork(LightningWork):
-    def run(self):
-        pass
-
-
-class AutoScaler1(AutoScaler):
-    def scale(self, replicas: int, metrics) -> int:
-        # only upscale
-        return replicas + 1
-
-
-class AutoScaler2(AutoScaler):
-    def scale(self, replicas: int, metrics) -> int:
-        # only downscale
-        return replicas - 1
-
-
-def test_num_replicas_after_init():
-    """Test the number of works is the same as min_replicas after initialization."""
-    min_replicas = 2
-    auto_scaler = AutoScaler(EmptyWork, min_replicas=min_replicas)
-    assert auto_scaler.num_replicas == min_replicas
-
-
-@patch("uvicorn.run")
-@patch("lightning_app.components.auto_scaler._LoadBalancer.url")
-@patch("lightning_app.components.auto_scaler.AutoScaler.num_pending_requests")
-def test_num_replicas_not_above_max_replicas(*_):
-    """Test self.num_replicas doesn't exceed max_replicas."""
-    max_replicas = 6
-    auto_scaler = AutoScaler1(
-        EmptyWork,
-        min_replicas=1,
-        max_replicas=max_replicas,
-        autoscale_interval=0.001,
-    )
-
-    for _ in range(max_replicas + 1):
-        time.sleep(0.002)
-        auto_scaler.run()
-
-    assert auto_scaler.num_replicas == max_replicas
-
-
-@patch("uvicorn.run")
-@patch("lightning_app.components.auto_scaler._LoadBalancer.url")
-@patch("lightning_app.components.auto_scaler.AutoScaler.num_pending_requests")
-def test_num_replicas_not_belo_min_replicas(*_):
-    """Test self.num_replicas doesn't exceed max_replicas."""
-    min_replicas = 1
-    auto_scaler = AutoScaler2(
-        EmptyWork,
-        min_replicas=min_replicas,
-        max_replicas=4,
-        autoscale_interval=0.001,
-    )
-
-    for _ in range(3):
-        time.sleep(0.002)
-        auto_scaler.run()
-
-    assert auto_scaler.num_replicas == min_replicas
-
-
-@pytest.mark.parametrize(
-    "replicas, metrics, expected_replicas",
-    [
-        pytest.param(1, {"pending_requests": 1, "pending_works": 0}, 2, id="increase if no pending work"),
-        pytest.param(1, {"pending_requests": 1, "pending_works": 1}, 1, id="dont increase if pending works"),
-        pytest.param(8, {"pending_requests": 1, "pending_works": 0}, 7, id="reduce if requests < 25% capacity"),
-        pytest.param(8, {"pending_requests": 2, "pending_works": 0}, 8, id="dont reduce if requests >= 25% capacity"),
-    ],
-)
-def test_scale(replicas, metrics, expected_replicas):
-    """Test `scale()`, the default scaling strategy."""
-    auto_scaler = AutoScaler(
-        EmptyWork,
-        min_replicas=1,
-        max_replicas=8,
-        max_batch_size=1,
-    )
-
-    assert auto_scaler.scale(replicas, metrics) == expected_replicas
-
-
-def test_create_work_cloud_compute_cloned():
-    """Test CloudCompute is cloned to avoid creating multiple works in a single machine."""
-    cloud_compute = CloudCompute("gpu")
-    auto_scaler = AutoScaler(EmptyWork, cloud_compute=cloud_compute)
-    _ = auto_scaler.create_work()
-    assert auto_scaler._work_kwargs["cloud_compute"] is not cloud_compute
diff --git a/tests/tests_app/core/scripts/lightning_overrides.py b/tests/tests_app/core/scripts/lightning_overrides.py
index 641addb97374c..6bb58776356f8 100644
--- a/tests/tests_app/core/scripts/lightning_overrides.py
+++ b/tests/tests_app/core/scripts/lightning_overrides.py
@@ -9,10 +9,10 @@
     from pytorch_lightning import LightningDataModule, LightningModule, Trainer
     from pytorch_lightning.accelerators.accelerator import Accelerator
     from pytorch_lightning.callbacks import Callback
-    from pytorch_lightning.loggers import LightningLoggerBase
-    from pytorch_lightning.loops.base import Loop
+    from pytorch_lightning.loggers import Logger
+    from pytorch_lightning.loops import Loop
     from pytorch_lightning.plugins import PrecisionPlugin
-    from pytorch_lightning.profiler.base import BaseProfiler
+    from pytorch_lightning.profilers import Profiler
 
 
 if __name__ == "__main__":
@@ -38,7 +38,7 @@ class BoringAccelerator(Accelerator):
     class BoringCallback(Callback):
         pass
 
-    class BoringLightningLoggerBase(LightningLoggerBase):
+    class BoringLogger(Logger):
         pass
 
     class BoringLoop(Loop):
@@ -47,5 +47,5 @@ class BoringLoop(Loop):
     class BoringMetric(Metric):
         pass
 
-    class BoringBaseProfiler(BaseProfiler):
+    class BoringProfiler(Profiler):
         pass
diff --git a/tests/tests_app/core/test_lightning_api.py b/tests/tests_app/core/test_lightning_api.py
index 04b89c927941a..adad9fba932e0 100644
--- a/tests/tests_app/core/test_lightning_api.py
+++ b/tests/tests_app/core/test_lightning_api.py
@@ -12,7 +12,7 @@
 import pytest
 import requests
 from deepdiff import DeepDiff, Delta
-from fastapi import HTTPException
+from fastapi import HTTPException, Request
 from httpx import AsyncClient
 from pydantic import BaseModel
 
@@ -31,6 +31,7 @@
 from lightning_app.runners import MultiProcessRuntime
 from lightning_app.storage.drive import Drive
 from lightning_app.testing.helpers import _MockQueue
+from lightning_app.utilities.app_status import AppStatus
 from lightning_app.utilities.component import _set_frontend_context, _set_work_context
 from lightning_app.utilities.enum import AppStage
 from lightning_app.utilities.load_app import extract_metadata_from_app
@@ -195,7 +196,7 @@ def test_update_publish_state_and_maybe_refresh_ui():
     publish_state_queue = _MockQueue("publish_state_queue")
     api_response_queue = _MockQueue("api_response_queue")
 
-    publish_state_queue.put(app.state_with_changes)
+    publish_state_queue.put((app.state_with_changes, None))
 
     thread = UIRefresher(publish_state_queue, api_response_queue)
     thread.run_once()
@@ -226,7 +227,7 @@ def get(self, timeout: int = 0):
     has_started_queue = _MockQueue("has_started_queue")
     api_response_queue = _MockQueue("api_response_queue")
     state = app.state_with_changes
-    publish_state_queue.put(state)
+    publish_state_queue.put((state, AppStatus(is_ui_ready=True, work_statuses={})))
     spec = extract_metadata_from_app(app)
     ui_refresher = start_server(
         publish_state_queue,
@@ -284,6 +285,9 @@ def get(self, timeout: int = 0):
         {"name": "main_4", "content": "https://te"},
     ]
 
+    response = await client.get("/api/v1/status")
+    assert response.json() == {"is_ui_ready": True, "work_statuses": {}}
+
     response = await client.post("/api/v1/state", json={"state": new_state}, headers=headers)
     assert change_state_queue._queue[1].to_dict() == {
         "values_changed": {"root['vars']['counter']": {"new_value": 1}}
@@ -479,10 +483,13 @@ def run(self):
         if self.counter == 501:
             self._exit()
 
-    def request(self, config: InputRequestModel) -> OutputRequestModel:
+    def request(self, config: InputRequestModel, request: Request) -> OutputRequestModel:
         self.counter += 1
         if config.index % 5 == 0:
             raise HTTPException(status_code=400, detail="HERE")
+        assert request.body()
+        assert request.json()
+        assert request.headers
         return OutputRequestModel(name=config.name, counter=self.counter)
 
     def configure_api(self):
diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py
index ea552adad7972..d397bb23e58f6 100644
--- a/tests/tests_app/core/test_lightning_app.py
+++ b/tests/tests_app/core/test_lightning_app.py
@@ -124,6 +124,7 @@ def test_simple_app(tmpdir):
                 "_paths": {},
                 "_port": None,
                 "_restarting": False,
+                "_display_name": "",
             },
             "calls": {"latest_call_hash": None},
             "changes": {},
@@ -140,6 +141,7 @@ def test_simple_app(tmpdir):
                 "_paths": {},
                 "_port": None,
                 "_restarting": False,
+                "_display_name": "",
             },
             "calls": {"latest_call_hash": None},
             "changes": {},
@@ -969,7 +971,7 @@ def run(self):
 def test_state_size_constant_growth():
     app = LightningApp(SizeFlow())
     MultiProcessRuntime(app, start_server=False).dispatch()
-    assert app.root._state_sizes[0] <= 7824
+    assert app.root._state_sizes[0] <= 7888
     assert app.root._state_sizes[20] <= 26500
diff --git a/tests/tests_app/core/test_lightning_flow.py b/tests/tests_app/core/test_lightning_flow.py
index c8e9921f29eec..6aad7a9ee510b 100644
--- a/tests/tests_app/core/test_lightning_flow.py
+++ b/tests/tests_app/core/test_lightning_flow.py
@@ -5,7 +5,7 @@
 from dataclasses import dataclass
 from functools import partial
 from time import time
-from unittest.mock import ANY, MagicMock
+from unittest.mock import ANY
 
 import pytest
 from deepdiff import DeepDiff, Delta
@@ -19,7 +19,7 @@
 from lightning_app.storage.path import _storage_root_dir
 from lightning_app.structures import Dict as LDict
 from lightning_app.structures import List as LList
-from lightning_app.testing.helpers import EmptyFlow, EmptyWork
+from lightning_app.testing.helpers import _MockQueue, EmptyFlow, EmptyWork
 from lightning_app.utilities.app_helpers import (
     _delta_to_app_state_delta,
     _LightningAppRef,
@@ -329,6 +329,7 @@ def run(self):
                 "_paths": {},
                 "_restarting": False,
                 "_internal_ip": "",
+                "_display_name": "",
                 "_cloud_compute": {
                     "type": "__cloud_compute__",
                     "name": "default",
@@ -352,6 +353,7 @@ def run(self):
                 "_paths": {},
                 "_restarting": False,
                 "_internal_ip": "",
+                "_display_name": "",
                 "_cloud_compute": {
                     "type": "__cloud_compute__",
                     "name": "default",
@@ -391,6 +393,7 @@ def run(self):
                 "_paths": {},
                 "_restarting": False,
                 "_internal_ip": "",
+                "_display_name": "",
                 "_cloud_compute": {
                     "type": "__cloud_compute__",
                     "name": "default",
@@ -414,6 +417,7 @@ def run(self):
                 "_paths": {},
                 "_restarting": False,
                 "_internal_ip": "",
+                "_display_name": "",
                 "_cloud_compute": {
                     "type": "__cloud_compute__",
                     "name": "default",
@@ -887,21 +891,37 @@ def run(self):
 
 
 def test_flow_ready():
-    """This test validates the api publish state queue is populated only once ready is True."""
+    """This test validates that the app status queue is populated correctly."""
+
+    mock_queue = _MockQueue("api_publish_state_queue")
 
     def run_patch(method):
-        app.api_publish_state_queue = MagicMock()
-        app.should_publish_changes_to_api = False
+        app.should_publish_changes_to_api = True
+        app.api_publish_state_queue = mock_queue
         method()
 
+    state = {"done": False}
+
+    def lagged_run_once(method):
+        """Ensure that the full loop is run after the app exits."""
+        new_done = method()
+        if state["done"]:
+            return True
+        state["done"] = new_done
+        return False
+
     app = LightningApp(FlowReady())
     app._run = partial(run_patch, method=app._run)
+    app.run_once = partial(lagged_run_once, method=app.run_once)
     MultiProcessRuntime(app, start_server=False).dispatch()
 
-    # Validates the state has been added only when ready was true.
-    state = app.api_publish_state_queue.put._mock_call_args[0][0]
-    call_hash = state["works"]["w"]["calls"]["latest_call_hash"]
-    assert state["works"]["w"]["calls"][call_hash]["statuses"][0]["stage"] == "succeeded"
+    _, first_status = mock_queue.get()
+    assert not first_status.is_ui_ready
+
+    _, last_status = mock_queue.get()
+    while len(mock_queue) > 0:
+        _, last_status = mock_queue.get()
+    assert last_status.is_ui_ready
 
 
 def test_structures_register_work_cloudcompute():
diff --git a/tests/tests_app/core/test_lightning_work.py b/tests/tests_app/core/test_lightning_work.py
index cb97eabfa237c..ea3288e6b761b 100644
--- a/tests/tests_app/core/test_lightning_work.py
+++ b/tests/tests_app/core/test_lightning_work.py
@@ -1,6 +1,6 @@
 from queue import Empty
 from re import escape
-from unittest.mock import Mock
+from unittest.mock import MagicMock, Mock
 
 import pytest
 
@@ -11,7 +11,7 @@
 from lightning_app.storage import Path
 from lightning_app.testing.helpers import _MockQueue, EmptyFlow, EmptyWork
 from lightning_app.testing.testing import LightningTestApp
-from lightning_app.utilities.enum import WorkStageStatus
+from lightning_app.utilities.enum import make_status, WorkStageStatus
 from lightning_app.utilities.exceptions import LightningWorkException
 from lightning_app.utilities.packaging.build_config import BuildConfig
 from lightning_app.utilities.proxies import ProxyWorkRun, WorkRunner
@@ -384,3 +384,36 @@ def run(self):
 def test_lightning_app_work_start(cache_calls, parallel):
     app = LightningApp(FlowStart(cache_calls, parallel))
     MultiProcessRuntime(app, start_server=False).dispatch()
+
+
+def test_lightning_work_delete():
+    work = WorkCounter()
+
+    with pytest.raises(Exception, match="Can't delete the work"):
+        work.delete()
+
+    mock = MagicMock()
+    work._backend = mock
+    work.delete()
+    assert work == mock.delete_work._mock_call_args_list[0].args[1]
+
+
+class WorkDisplay(LightningWork):
+    def __init__(self):
+        super().__init__()
+
+    def run(self):
+        pass
+
+
+def test_lightning_work_display_name():
+    work = WorkDisplay()
+    assert work.state_vars["vars"]["_display_name"] == ""
+    work.display_name = "Hello"
+    assert work.state_vars["vars"]["_display_name"] == "Hello"
+
+    work._calls["latest_call_hash"] = "test"
+    work._calls["test"] = {"statuses": [make_status(WorkStageStatus.PENDING)]}
+    with pytest.raises(RuntimeError, match="The display name can be set only before the work has started."):
+        work.display_name = "HELLO"
+    work.display_name = "Hello"
diff --git a/tests/tests_app/runners/backends/__init__.py b/tests/tests_app/runners/backends/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/tests_app/runners/backends/test_mp_process.py b/tests/tests_app/runners/backends/test_mp_process.py
new file mode 100644
index 0000000000000..868a5b37717da
--- /dev/null
+++ b/tests/tests_app/runners/backends/test_mp_process.py
@@ -0,0 +1,28 @@
+from unittest import mock
+from unittest.mock import MagicMock, Mock
+
+from lightning_app import LightningApp, LightningWork
+from lightning_app.runners.backends import MultiProcessingBackend
+
+
+@mock.patch("lightning_app.core.app.AppStatus")
+@mock.patch("lightning_app.runners.backends.mp_process.multiprocessing")
+def test_backend_create_work_with_set_start_method(multiprocessing_mock, *_):
+    backend = MultiProcessingBackend(entrypoint_file="fake.py")
+    work = Mock(spec=LightningWork)
+    work._start_method = "test_start_method"
+
+    app = LightningApp(work)
+    app.caller_queues = MagicMock()
+    app.delta_queue = MagicMock()
+    app.readiness_queue = MagicMock()
+    app.error_queue = MagicMock()
+    app.request_queues = MagicMock()
+    app.response_queues = MagicMock()
+    app.copy_request_queues = MagicMock()
+    app.copy_response_queues = MagicMock()
+    app.flow_to_work_delta_queues = MagicMock()
+
+    backend.create_work(app=app, work=work)
+    multiprocessing_mock.get_context.assert_called_with("test_start_method")
+    multiprocessing_mock.get_context().Process().start.assert_called_once()
diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py
index cd9e4c2923d6a..cb4bd5ddaa3c0 100644
--- a/tests/tests_app/runners/test_cloud.py
+++ b/tests/tests_app/runners/test_cloud.py
@@ -31,6 +31,7 @@
     V1LightningworkSpec,
     V1ListClustersResponse,
     V1ListLightningappInstancesResponse,
+    V1ListLightningappsV2Response,
     V1ListMembershipsResponse,
     V1ListProjectClusterBindingsResponse,
     V1Membership,
@@ -210,6 +211,13 @@ def test_new_instance_on_different_cluster(self, cloud_backend, project_id, old_
         app.flows = []
         app.frontend = {}
 
+        existing_app = MagicMock()
+        existing_app.name = app_name
+        existing_app.id = "test-id"
+        mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response(
+            lightningapps=[existing_app]
+        )
+
         existing_instance = MagicMock()
         existing_instance.name = app_name
         existing_instance.status.phase = V1LightningappInstanceState.STOPPED
@@ -234,6 +242,67 @@ def test_new_instance_on_different_cluster(self, cloud_backend, project_id, old_
         assert args[1]["body"].name.startswith(app_name)
         assert args[1]["body"].cluster_id == new_cluster
 
+    def test_running_deleted_app(self, cloud_backend, project_id):
+        """Deleted apps show up in list apps but not in list instances.
+
+        This tests that we don't try to recreate a previously deleted app.
+        """
+        app_name = "test-app"
+
+        mock_client = mock.MagicMock()
+        mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse(
+            memberships=[V1Membership(name="Default Project", project_id=project_id)]
+        )
+        mock_client.lightningapp_v2_service_create_lightningapp_release.return_value = V1LightningappRelease(
+            cluster_id=DEFAULT_CLUSTER
+        )
+
+        mock_client.cluster_service_list_clusters.return_value = V1ListClustersResponse(
+            [
+                Externalv1Cluster(id=DEFAULT_CLUSTER),
+            ]
+        )
+
+        mock_client.projects_service_list_project_cluster_bindings.return_value = V1ListProjectClusterBindingsResponse(
+            clusters=[
+                V1ProjectClusterBinding(cluster_id=DEFAULT_CLUSTER),
+            ]
+        )
+
+        # Mock all clusters as global clusters
+        mock_client.cluster_service_get_cluster.side_effect = lambda cluster_id: V1GetClusterResponse(
+            id=cluster_id, spec=V1ClusterSpec(cluster_type=V1ClusterType.GLOBAL)
+        )
+
+        cloud_backend.client = mock_client
+
+        app = mock.MagicMock()
+        app.flows = []
+        app.frontend = {}
+
+        existing_app = MagicMock()
+        existing_app.name = app_name
+        existing_app.id = "test-id"
+        mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response(
+            lightningapps=[existing_app]
+        )
+
+        # Simulate the app as deleted so no instance to return
+        mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
+            V1ListLightningappInstancesResponse(lightningapps=[])
+        )
+
+        cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file="entrypoint.py")
+        cloud_runtime._check_uploaded_folder = mock.MagicMock()
+
+        cloud_runtime.dispatch(name=app_name)
+
+        # Check that a new name was used, which starts with but does not equal the old name
+        mock_client.lightningapp_v2_service_create_lightningapp_release_instance.assert_called_once()
+        args = mock_client.lightningapp_v2_service_create_lightningapp_release_instance.call_args
+        assert args[1]["body"].name != app_name
+        assert args[1]["body"].name.startswith(app_name)
+
     @pytest.mark.parametrize("flow_cloud_compute", [None, CloudCompute(name="t2.medium")])
     @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock())
     def test_run_with_default_flow_compute_config(self, monkeypatch, flow_cloud_compute):
@@ -458,6 +527,9 @@ def test_call_with_work_app(self, lightningapps, start_with_flow, monkeypatch, t
             lightningapps[0].name = "myapp"
             lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED
             lightningapps[0].spec.cluster_id = "test"
+        mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response(
+            lightningapps=lightningapps
+        )
         mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
             V1ListLightningappInstancesResponse(lightningapps=lightningapps)
         )
@@ -632,6 +704,9 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch
             lightningapps[0].name = "myapp"
             lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED
             lightningapps[0].spec.cluster_id = "test"
+        mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response(
+            lightningapps=lightningapps
+        )
         mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
             V1ListLightningappInstancesResponse(lightningapps=lightningapps)
         )
@@ -786,6 +861,9 @@ def test_call_with_work_app_and_app_comment_command_execution_set(self, lightnin
         mock_client.cluster_service_get_cluster.side_effect = lambda cluster_id: V1GetClusterResponse(
             id=cluster_id, spec=V1ClusterSpec(cluster_type=V1ClusterType.GLOBAL)
         )
+        mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response(
+            lightningapps=lightningapps
+        )
         mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
             V1ListLightningappInstancesResponse(lightningapps=lightningapps)
         )
@@ -912,6 +990,9 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo
         mock_client.cluster_service_get_cluster.side_effect = lambda cluster_id: V1GetClusterResponse(
             id=cluster_id, spec=V1ClusterSpec(cluster_type=V1ClusterType.GLOBAL)
         )
+        mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response(
+            lightningapps=lightningapps
+        )
         mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
             V1ListLightningappInstancesResponse(lightningapps=lightningapps)
         )
@@ -1127,6 +1208,9 @@ def test_call_with_work_app_and_attached_mount_and_drive(self, lightningapps, mo
         mock_client.cluster_service_get_cluster.side_effect = lambda cluster_id: V1GetClusterResponse(
             id=cluster_id, spec=V1ClusterSpec(cluster_type=V1ClusterType.GLOBAL)
        )
+        mock_client.lightningapp_v2_service_list_lightningapps_v2.return_value = V1ListLightningappsV2Response(
+            lightningapps=lightningapps
+        )
         mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
             V1ListLightningappInstancesResponse(lightningapps=lightningapps)
         )
@@ -1334,7 +1418,7 @@ def test_check_uploaded_folder(monkeypatch, tmpdir, caplog):
     assert "The total size is 15.0 MB" in caplog.text
     assert "3 files were uploaded" in caplog.text
    assert "files:\n6.0 MB: c.jpg\n5.0 MB: b.txt\n4.0 MB: a.png\nPerhaps" in caplog.text  # tests the order
-    assert "create a `.lightningignore` file" in caplog.text
+    assert "adding them to `.lightningignore`." in caplog.text
     assert "lightningingore` attribute in a Flow or Work" in caplog.text
@@ -1498,8 +1582,6 @@ def run(self):
 
 
 def test_programmatic_lightningignore(monkeypatch, caplog, tmpdir):
-    monkeypatch.setenv("LIGHTNING_DISPATCHED", "0")  # this is not cleaned up
-
     mock_client = mock.MagicMock()
     mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse(
         memberships=[V1Membership(name="test-project", project_id="test-project-id")]
@@ -1566,11 +1648,59 @@ def run(self):
     assert "2 files were uploaded"  # a.txt and .lightningignore
     assert "files:\n5.0 MB: a.txt\nPerhaps" in caplog.text  # only this file appears
 
-    # replicate how the app would dispatch the app, and call `run`
-    monkeypatch.setenv("LIGHTNING_DISPATCHED", "1")
     flow.run()
 
 
+def test_default_lightningignore(monkeypatch, caplog, tmpdir):
+    mock_client = mock.MagicMock()
+    mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse(
+        memberships=[V1Membership(name="test-project", project_id="test-project-id")]
+    )
+    mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
+        V1ListLightningappInstancesResponse(lightningapps=[])
+    )
+    mock_client.lightningapp_v2_service_create_lightningapp_release.return_value = V1LightningappRelease(
+        cluster_id="test"
+    )
+    cloud_backend = mock.MagicMock(client=mock_client)
+    monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend))
+
+    class MyWork(LightningWork):
+        def run(self):
+            pass
+
+    app = LightningApp(MyWork())
+
+    path = Path(tmpdir)
+    cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=path / "entrypoint.py")
+    monkeypatch.setattr(LocalSourceCodeDir, "upload", mock.MagicMock())
+
+    # write some files
+    write_file_of_size(path / "a.txt", 5 * 1000 * 1000)
+    write_file_of_size(path / "venv" / "foo.txt", 4 * 1000 * 1000)
+
+    assert not (path / ".lightningignore").exists()
+
+    with mock.patch(
+        "lightning_app.runners.cloud._parse_lightningignore", wraps=_parse_lightningignore
+    ) as parse_mock, mock.patch(
+        "lightning_app.source_code.local._copytree", wraps=_copytree
+    ) as copy_mock, caplog.at_level(
+        logging.WARN
+    ):
+        cloud_runtime.dispatch()
+
+    parse_mock.assert_called_once_with(())
+    assert copy_mock.mock_calls[0].kwargs["ignore_functions"][0].args[1] == set()
+
+    assert (path / ".lightningignore").exists()
+
+    assert f"Your application folder '{path.absolute()}' is more than 2 MB" in caplog.text
+    assert "The total size is 5.0 MB" in caplog.text
+    assert "2 files were uploaded" in caplog.text  # a.txt and .lightningignore
+    assert "files:\n5.0 MB: a.txt\nPerhaps" in caplog.text  # only this file appears
+
+
 @pytest.mark.parametrize(
     "lightning_app_instance, lightning_cloud_url, expected_url",
     [
diff --git a/tests/tests_app/runners/test_multiprocess.py b/tests/tests_app/runners/test_multiprocess.py
index 2e1a34ab38677..48bbedf555d63 100644
--- a/tests/tests_app/runners/test_multiprocess.py
+++ b/tests/tests_app/runners/test_multiprocess.py
@@ -68,7 +68,7 @@ def run(self):
         assert _get_context().value == "work"
 
 
-class ContxtFlow(LightningFlow):
+class ContextFlow(LightningFlow):
     def __init__(self):
         super().__init__()
         self.work = ContextWork()
@@ -83,7 +83,7 @@ def run(self):
 
 
 def test_multiprocess_runtime_sets_context():
     """Test that the runtime sets the global variable COMPONENT_CONTEXT in Flow and Work."""
-    MultiProcessRuntime(LightningApp(ContxtFlow())).dispatch()
+    MultiProcessRuntime(LightningApp(ContextFlow())).dispatch()
 
 
 @pytest.mark.parametrize(
diff --git a/tests/tests_app/storage/test_orchestrator.py b/tests/tests_app/storage/test_orchestrator.py
index ca671e6b93704..4b391a890f1a9 100644
--- a/tests/tests_app/storage/test_orchestrator.py
+++ b/tests/tests_app/storage/test_orchestrator.py
@@ -39,7 +39,7 @@ def test_orchestrator():
     # orchestrator is now waiting for a response for copier in Work A
     assert "work_b" in orchestrator.waiting_for_response
-    assert not request_queues["work_a"]
+    assert len(request_queues["work_a"]) == 0
     assert request in copy_request_queues["work_a"]
     assert request.destination == "work_b"
 
@@ -54,7 +54,7 @@ def test_orchestrator():
     # orchestrator processes confirmation and confirms to the pending request from Work B
     orchestrator.run_once("work_a")
-    assert not copy_response_queues["work_a"]
+    assert len(copy_response_queues["work_a"]) == 0
     assert response in response_queues["work_b"]
     assert not orchestrator.waiting_for_response
     orchestrator.run_once("work_b")
@@ -71,7 +71,7 @@ def test_orchestrator():
     assert response.exception is None
 
     # all queues should be empty
-    assert all(not queue for queue in request_queues.values())
-    assert all(not queue for queue in response_queues.values())
-    assert all(not queue for queue in copy_request_queues.values())
-    assert all(not queue for queue in copy_response_queues.values())
+    assert all(len(queue) == 0 for queue in request_queues.values())
+    assert all(len(queue) == 0 for queue in response_queues.values())
+    assert all(len(queue) == 0 for queue in copy_request_queues.values())
+    assert all(len(queue) == 0 for queue in copy_response_queues.values())
diff --git a/tests/tests_app/storage/test_path.py b/tests/tests_app/storage/test_path.py
index 3cd501f7344c8..2310b8034c303 100644
--- a/tests/tests_app/storage/test_path.py
+++ b/tests/tests_app/storage/test_path.py
@@ -606,7 +606,7 @@ def test_path_response_not_matching_reqeuest(tmpdir):
         path.get()
 
     # simulate a response that has a different hash than the request had
-    assert not response_queue
+    assert len(response_queue) == 0
     response.path = str(path)
     response.hash = "other_hash"
     response_queue.put(response)
diff --git a/tests/tests_app/structures/test_structures.py b/tests/tests_app/structures/test_structures.py
index 3346da5a858fc..852589a4443eb 100644
--- a/tests/tests_app/structures/test_structures.py
+++ b/tests/tests_app/structures/test_structures.py
@@ -44,6 +44,7 @@ def run(self):
             "_host": "127.0.0.1",
             "_paths": {},
             "_restarting": False,
+            "_display_name": "",
             "_internal_ip": "",
             "_cloud_compute": {
                 "type": "__cloud_compute__",
@@ -76,6 +77,7 @@ def run(self):
             "_host": "127.0.0.1",
             "_paths": {},
             "_restarting": False,
+            "_display_name": "",
             "_internal_ip": "",
             "_cloud_compute": {
                 "type": "__cloud_compute__",
@@ -108,6 +110,7 @@ def run(self):
             "_host": "127.0.0.1",
             "_paths": {},
             "_restarting": False,
+            "_display_name": "",
             "_internal_ip": "",
             "_cloud_compute": {
                 "type": "__cloud_compute__",
@@ -193,6 +196,7 @@ def run(self):
             "_paths": {},
             "_restarting": False,
             "_internal_ip": "",
+            "_display_name": "",
             "_cloud_compute": {
                 "type": "__cloud_compute__",
                 "name": "default",
@@ -225,6 +229,7 @@ def run(self):
             "_paths": {},
             "_restarting": False,
             "_internal_ip": "",
+            "_display_name": "",
             "_cloud_compute": {
                 "type": "__cloud_compute__",
                 "name": "default",
@@ -252,6 +257,7 @@ def run(self):
             "_paths": {},
             "_restarting": False,
             "_internal_ip": "",
+            "_display_name": "",
             "_cloud_compute": {
                 "type": "__cloud_compute__",
                 "name": "default",
diff --git a/tests/tests_app/utilities/test_cloud.py b/tests/tests_app/utilities/test_cloud.py
index 573ec46106b84..6e93ad1e68d57 100644
--- a/tests/tests_app/utilities/test_cloud.py
+++ b/tests/tests_app/utilities/test_cloud.py
@@ -4,13 +4,18 @@
 from lightning_app.utilities.cloud import is_running_in_cloud
 
 
-@mock.patch.dict(os.environ, clear=True)
-def test_is_running_locally():
-    """We can determine if Lightning is running locally."""
-    assert not is_running_in_cloud()
-
-
-@mock.patch.dict(os.environ, {"LIGHTNING_APP_STATE_URL": "127.0.0.1"})
 def test_is_running_cloud():
     """We can determine if Lightning is running in the cloud."""
-    assert is_running_in_cloud()
+    with mock.patch.dict(os.environ, {}, clear=True):
+        assert not is_running_in_cloud()
+
+    with mock.patch.dict(os.environ, {"LAI_RUNNING_IN_CLOUD": "0"}, clear=True):
+        assert not is_running_in_cloud()
+
+    # in the cloud, LIGHTNING_APP_STATE_URL is defined
+    with mock.patch.dict(os.environ, {"LIGHTNING_APP_STATE_URL": "defined"}, clear=True):
+        assert is_running_in_cloud()
+
+    # LAI_RUNNING_IN_CLOUD is used to fake the value of `is_running_in_cloud` when loading the app for --cloud
+    with mock.patch.dict(os.environ, {"LAI_RUNNING_IN_CLOUD": "1"}):
+        assert is_running_in_cloud()
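The rewritten `test_is_running_cloud` documents the detection rules: a defined `LIGHTNING_APP_STATE_URL` means the app runs in the cloud, while `LAI_RUNNING_IN_CLOUD` ("1"/"0") can fake the answer when loading the app for `--cloud`. A sketch of logic consistent with those four cases; the real implementation lives in `lightning_app.utilities.cloud` and may differ in detail:

```python
import os


def is_running_in_cloud() -> bool:
    # sketch: true when faked via LAI_RUNNING_IN_CLOUD, or when the cloud-only
    # LIGHTNING_APP_STATE_URL variable is defined in the environment
    return bool(int(os.getenv("LAI_RUNNING_IN_CLOUD", "0"))) or "LIGHTNING_APP_STATE_URL" in os.environ
```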
diff --git a/tests/tests_app/utilities/test_introspection.py b/tests/tests_app/utilities/test_introspection.py
index c6f08dc49de58..b6dac2ad255e4 100644
--- a/tests/tests_app/utilities/test_introspection.py
+++ b/tests/tests_app/utilities/test_introspection.py
@@ -41,17 +41,21 @@ def test_introspection_lightning():
 def test_introspection_lightning_overrides():
     """This test validates the scanner can find all the subclasses from primitives classes from PyTorch Lightning
     in the provided files."""
+    scanner = Scanner(str(os.path.join(_PROJECT_ROOT, "tests/tests_app/core/scripts/lightning_cli.py")))
+    scan = scanner.scan()
+    assert set(scan) == {"LightningDataModule", "LightningModule"}
+
     scanner = Scanner(str(os.path.join(_PROJECT_ROOT, "tests/tests_app/core/scripts/lightning_overrides.py")))
     scan = scanner.scan()
-    assert sorted(scan.keys()) == [
+    assert set(scan) == {
         "Accelerator",
-        "BaseProfiler",
+        "Profiler",
         "Callback",
         "LightningDataModule",
-        "LightningLoggerBase",
+        "Logger",
         "LightningModule",
         "Loop",
         "Metric",
         "PrecisionPlugin",
         "Trainer",
-    ]
+    }
diff --git a/tests/tests_app/utilities/test_proxies.py b/tests/tests_app/utilities/test_proxies.py
index 4b8a5f25f71e3..a53d8e85a3d37 100644
--- a/tests/tests_app/utilities/test_proxies.py
+++ b/tests/tests_app/utilities/test_proxies.py
@@ -250,6 +250,7 @@ def __call__(self):
         state = deepcopy(self.work.state)
         self.work._calls[call_hash]["statuses"].append(
             {
+                "name": self.work.name,
                 "stage": WorkStageStatus.FAILED,
                 "reason": WorkFailureReasons.TIMEOUT,
                 "timestamp": time.time(),
@@ -547,7 +548,7 @@ def run(self, use_setattr=False, use_containers=False):
     # 1. Simulate no state changes
     ##############################
     work.run(use_setattr=False, use_containers=False)
-    assert not delta_queue
+    assert len(delta_queue) == 0
 
     ############################
     # 2. Simulate a setattr call
@@ -563,16 +564,16 @@ def run(self, use_setattr=False, use_containers=False):
     assert len(observer._delta_memory) == 1
 
     # The observer should not trigger any deltas being sent and only consume the delta memory
-    assert not delta_queue
+    assert len(delta_queue) == 0
     observer.run_once()
-    assert not delta_queue
+    assert len(delta_queue) == 0
     assert not observer._delta_memory
 
     ################################
     # 3. Simulate a container update
     ################################
     work.run(use_setattr=False, use_containers=True)
-    assert not delta_queue
+    assert len(delta_queue) == 0
     assert not observer._delta_memory
     observer.run_once()
     observer.run_once()  # multiple runs should not affect how many deltas are sent unless there are changes
@@ -591,7 +592,7 @@ def run(self, use_setattr=False, use_containers=False):
     delta = delta_queue.get().delta.to_dict()
     assert delta == {"values_changed": {"root['vars']['var']": {"new_value": 3}}}
 
-    assert not delta_queue
+    assert len(delta_queue) == 0
     assert len(observer._delta_memory) == 1
     observer.run_once()
 
@@ -599,7 +600,7 @@ def run(self, use_setattr=False, use_containers=False):
     assert delta["values_changed"] == {"root['vars']['dict']['counter']": {"new_value": 2}}
     assert delta["iterable_item_added"] == {"root['vars']['list'][1]": 1}
 
-    assert not delta_queue
+    assert len(delta_queue) == 0
     assert not observer._delta_memory
diff --git a/tests/tests_examples_app/conftest.py b/tests/tests_examples_app/conftest.py
index fcefa6287c3c6..fa2c2a14dcbba 100644
--- a/tests/tests_examples_app/conftest.py
+++ b/tests/tests_examples_app/conftest.py
@@ -5,8 +5,8 @@
 
 import psutil
 import pytest
+from tests_examples_app.public import _PATH_EXAMPLES
 
-from lightning_app import _PROJECT_ROOT
 from lightning_app.storage.path import _storage_root_dir
 from lightning_app.utilities.component import _set_context
 from lightning_app.utilities.packaging import cloud_compute
@@ -24,11 +24,11 @@ def pytest_sessionstart(*_):
     """Pytest hook that get called after the Session object has been created and before performing collection and
     entering the run test loop."""
     for name, url in GITHUB_APP_URLS.items():
-        if not os.path.exists(os.path.join(_PROJECT_ROOT, "examples", name)):
-            path_examples = os.path.join(_PROJECT_ROOT, "examples")
-            Popen(["git", "clone", url, name], cwd=path_examples).wait(timeout=90)
+        app_path = _PATH_EXAMPLES / name
+        if not os.path.exists(app_path):
+            Popen(["git", "clone", url, name], cwd=_PATH_EXAMPLES).wait(timeout=90)
         else:
-            Popen(["git", "pull", "main"], cwd=os.path.join(_PROJECT_ROOT, "examples", name)).wait(timeout=90)
+            Popen(["git", "pull", "main"], cwd=app_path).wait(timeout=90)
 
 
 def pytest_sessionfinish(session, exitstatus):
diff --git a/tests/tests_examples_app/local/__init__.py b/tests/tests_examples_app/local/__init__.py
index dbd6181f8e89a..1e7d17cc6b536 100644
--- a/tests/tests_examples_app/local/__init__.py
+++ b/tests/tests_examples_app/local/__init__.py
@@ -1,5 +1,3 @@
-import os
+from pathlib import Path
 
-from lightning_app import _PROJECT_ROOT
-
-_PATH_APPS = os.path.join(_PROJECT_ROOT, "tests", "tests_examples_app", "apps")
+_PATH_APPS = Path(__file__).resolve().parents[1] / "apps"
diff --git a/tests/tests_examples_app/public/__init__.py b/tests/tests_examples_app/public/__init__.py
index b70149ce10b11..1f1db5e15eaee 100644
--- a/tests/tests_examples_app/public/__init__.py
+++ b/tests/tests_examples_app/public/__init__.py
@@ -1,5 +1,3 @@
-import os
+from pathlib import Path
 
-from lightning_app import _PROJECT_ROOT
-
-_PATH_EXAMPLES = os.path.join(_PROJECT_ROOT, "examples")
+_PATH_EXAMPLES = Path(__file__).resolve().parents[3] / "examples"
diff --git a/tests/tests_examples_app/public/test_multi_node.py b/tests/tests_examples_app/public/test_multi_node.py
index ef6c2058c4519..6d06b762c7d1b 100644
--- a/tests/tests_examples_app/public/test_multi_node.py
+++ b/tests/tests_examples_app/public/test_multi_node.py
@@ -1,10 +1,11 @@
 import os
-import sys
 from unittest import mock
 
 import pytest
+from lightning_utilities.core.imports import package_available
 from tests_examples_app.public import _PATH_EXAMPLES
 
+from lightning_app.testing.helpers import _RunIf
 from lightning_app.testing.testing import application_testing, LightningTestApp
 
 
@@ -12,33 +13,13 @@ class LightningTestMultiNodeApp(LightningTestApp):
     def on_before_run_once(self):
         res = super().on_before_run_once()
         if self.works and all(w.has_stopped for w in self.works):
-            assert len([w for w in self.works]) == 2
+            assert len(self.works) == 2
             return True
         return res
 
 
-@pytest.mark.skip(reason="flaky")
-@mock.patch("lightning_app.components.multi_node.base.is_running_in_cloud", return_value=True)
-def test_multi_node_example(_, monkeypatch):
-    monkeypatch.chdir(os.path.join(_PATH_EXAMPLES, "app_multi_node"))
-    command_line = [
-        "app.py",
-        "--blocking",
-        "False",
-        "--open-ui",
-        "False",
-    ]
-    result = application_testing(LightningTestMultiNodeApp, command_line)
-    assert result.exit_code == 0
-
-
-class LightningTestMultiNodeWorksApp(LightningTestApp):
-    def on_before_run_once(self):
-        res = super().on_before_run_once()
-        if self.works and all(w.has_stopped for w in self.works):
-            assert len([w for w in self.works]) == 2
-            return True
-        return res
+# for the skip to work, the package needs to be installed without editable mode
+_SKIP_LIGHTNING_UNAVAILABLE = pytest.mark.skipif(not package_available("lightning"), reason="script requires lightning")
 
 
 @pytest.mark.parametrize(
@@ -46,15 +27,21 @@ def on_before_run_once(self):
     [
         "train_pytorch.py",
         "train_any.py",
-        # "app_lite_work.py",
         "train_pytorch_spawn.py",
-        # "app_pl_work.py": TODO Add once https://github.com/Lightning-AI/lightning/issues/15556 is resolved.
+        pytest.param("train_fabric.py", marks=_SKIP_LIGHTNING_UNAVAILABLE),
+        pytest.param("train_lt_script.py", marks=_SKIP_LIGHTNING_UNAVAILABLE),
+        pytest.param("train_lt.py", marks=_SKIP_LIGHTNING_UNAVAILABLE),
     ],
 )
-@pytest.mark.skipif(sys.platform == "win32", reason="flaky")
+@_RunIf(skip_windows=True)  # flaky
 @mock.patch("lightning_app.components.multi_node.base.is_running_in_cloud", return_value=True)
 def test_multi_node_examples(_, app_name, monkeypatch):
+    # note: this test will fail locally:
+    # * if you installed `lightning_app`, then the examples need to be
+    #   rewritten to use `lightning_app` imports (CI does this)
+    # * if you installed `lightning`, then the imports in this file and mocks
+    #   need to be changed to use `lightning`.
     monkeypatch.chdir(os.path.join(_PATH_EXAMPLES, "app_multi_node"))
     command_line = [app_name, "--blocking", "False", "--open-ui", "False", "--setup"]
-    result = application_testing(LightningTestMultiNodeWorksApp, command_line)
+    result = application_testing(LightningTestMultiNodeApp, command_line)
     assert result.exit_code == 0
diff --git a/tests/tests_fabric/strategies/test_ddp.py b/tests/tests_fabric/strategies/test_ddp.py
index 7cfce2b9b63bc..0976d3cbd3727 100644
--- a/tests/tests_fabric/strategies/test_ddp.py
+++ b/tests/tests_fabric/strategies/test_ddp.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from unittest import mock
 from unittest.mock import MagicMock, Mock
 
 import pytest
@@ -62,3 +63,19 @@ def test_ddp_no_backward_sync():
             pass
 
     module.no_sync.assert_called_once()
+
+
+@mock.patch("lightning_fabric.strategies.ddp.DistributedDataParallel")
+def test_ddp_extra_kwargs(ddp_mock):
+    """Test that additional kwargs passed to the DDPStrategy get passed down to the DistributedDataParallel
+    wrapper."""
+    module = torch.nn.Linear(1, 1)
+    strategy = DDPStrategy(parallel_devices=[torch.device("cpu"), torch.device("cpu")])
+    strategy.setup_module(module)
+    ddp_mock.assert_called_with(module=module, device_ids=None)
+
+    ddp_mock.reset_mock()
+
+    strategy = DDPStrategy(parallel_devices=[torch.device("cpu"), torch.device("cpu")], find_unused_parameters=True)
+    strategy.setup_module(module)
+    ddp_mock.assert_called_with(module=module, device_ids=None, find_unused_parameters=True)
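`test_ddp_extra_kwargs`, together with the registry and connector changes below, captures the replacement for the removed `*_find_unused_parameters_false` registry aliases: the flag is now passed directly to the strategy, which forwards any extra kwargs to `DistributedDataParallel`. A usage sketch, assuming the import path the fabric tests in this diff use:

```python
from lightning_fabric.strategies import DDPStrategy

# Extra kwargs are forwarded verbatim to torch's DistributedDataParallel wrapper,
# replacing the removed "ddp_find_unused_parameters_false"-style registry strings.
strategy = DDPStrategy(find_unused_parameters=True)
```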
https://github.com/pytorch/pytorch/issues/85427 - return pytest.mark.skipif( - condition=(_IS_WINDOWS and _TORCH_GREATER_EQUAL_1_13), - reason="Torchelastic import bug in 1.13 affecting Windows", - ) @pytest.fixture @@ -40,7 +29,6 @@ def fake_script(tmp_path): return str(script) -@skip_windows_pt_1_13() @mock.patch.dict(os.environ, os.environ.copy(), clear=True) def test_cli_env_vars_defaults(monkeypatch, fake_script): monkeypatch.setattr(torch.distributed, "run", Mock()) @@ -55,7 +43,6 @@ def test_cli_env_vars_defaults(monkeypatch, fake_script): assert os.environ["LT_PRECISION"] == "32" -@skip_windows_pt_1_13() @pytest.mark.parametrize("accelerator", ["cpu", "gpu", "cuda", pytest.param("mps", marks=RunIf(mps=True))]) @mock.patch.dict(os.environ, os.environ.copy(), clear=True) @mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=2) @@ -67,7 +54,6 @@ def test_cli_env_vars_accelerator(_, accelerator, monkeypatch, fake_script): assert os.environ["LT_ACCELERATOR"] == accelerator -@skip_windows_pt_1_13() @pytest.mark.parametrize("strategy", ["dp", "ddp", "deepspeed"]) @mock.patch.dict(os.environ, os.environ.copy(), clear=True) @mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=2) @@ -79,7 +65,6 @@ def test_cli_env_vars_strategy(_, strategy, monkeypatch, fake_script): assert os.environ["LT_STRATEGY"] == strategy -@skip_windows_pt_1_13() @pytest.mark.parametrize("devices", ["1", "2", "0,", "1,0", "-1"]) @mock.patch.dict(os.environ, os.environ.copy(), clear=True) @mock.patch("lightning_fabric.accelerators.cuda.num_cuda_devices", return_value=2) @@ -92,7 +77,6 @@ def test_cli_env_vars_devices_cuda(_, devices, monkeypatch, fake_script): @RunIf(mps=True) -@skip_windows_pt_1_13() @pytest.mark.parametrize("accelerator", ["mps", "gpu"]) @mock.patch.dict(os.environ, os.environ.copy(), clear=True) def test_cli_env_vars_devices_mps(accelerator, monkeypatch, fake_script): @@ -103,7 +87,6 @@ def test_cli_env_vars_devices_mps(accelerator, monkeypatch, fake_script): assert os.environ["LT_DEVICES"] == "1" -@skip_windows_pt_1_13() @pytest.mark.parametrize("num_nodes", ["1", "2", "3"]) @mock.patch.dict(os.environ, os.environ.copy(), clear=True) def test_cli_env_vars_num_nodes(num_nodes, monkeypatch, fake_script): @@ -114,7 +97,6 @@ def test_cli_env_vars_num_nodes(num_nodes, monkeypatch, fake_script): assert os.environ["LT_NUM_NODES"] == num_nodes -@skip_windows_pt_1_13() @pytest.mark.parametrize("precision", ["64", "32", "16", "bf16"]) @mock.patch.dict(os.environ, os.environ.copy(), clear=True) def test_cli_env_vars_precision(precision, monkeypatch, fake_script): @@ -125,7 +107,6 @@ def test_cli_env_vars_precision(precision, monkeypatch, fake_script): assert os.environ["LT_PRECISION"] == precision -@skip_windows_pt_1_13() @mock.patch.dict(os.environ, os.environ.copy(), clear=True) def test_cli_torchrun_defaults(monkeypatch, fake_script): torchrun_mock = Mock() @@ -145,7 +126,6 @@ def test_cli_torchrun_defaults(monkeypatch, fake_script): ) -@skip_windows_pt_1_13() @pytest.mark.parametrize( "devices,expected", [ diff --git a/tests/tests_fabric/test_connector.py b/tests/tests_fabric/test_connector.py index 336efe5fd304c..95524d7481fc4 100644 --- a/tests/tests_fabric/test_connector.py +++ b/tests/tests_fabric/test_connector.py @@ -81,7 +81,7 @@ def test_strategy_choice_ddp_on_cpu(): def _test_strategy_choice_ddp_and_cpu(ddp_strategy_class): connector = _Connector( - strategy=ddp_strategy_class(find_unused_parameters=True), + strategy=ddp_strategy_class(), 
accelerator="cpu", devices=2, ) @@ -379,9 +379,7 @@ def test_invalid_strategy_choice(): ["strategy", "strategy_class"], [ ("ddp_spawn", DDPStrategy), - ("ddp_spawn_find_unused_parameters_false", DDPStrategy), ("ddp", DDPStrategy), - ("ddp_find_unused_parameters_false", DDPStrategy), ], ) def test_strategy_choice_cpu_str(strategy, strategy_class): @@ -394,9 +392,7 @@ def test_strategy_choice_cpu_str(strategy, strategy_class): ["strategy", "strategy_class"], [ ("ddp_spawn", DDPStrategy), - ("ddp_spawn_find_unused_parameters_false", DDPStrategy), ("ddp", DDPStrategy), - ("ddp_find_unused_parameters_false", DDPStrategy), ("dp", DataParallelStrategy), ("ddp_sharded", DDPShardedStrategy), ("ddp_sharded_spawn", DDPShardedStrategy), @@ -780,9 +776,7 @@ def test_precision_selection_amp_ddp(strategy, devices, is_custom_plugin, plugin assert isinstance(connector.precision, plugin_cls) -@pytest.mark.parametrize( - ["strategy", "strategy_cls"], [("DDP", DDPStrategy), ("DDP_FIND_UNUSED_PARAMETERS_FALSE", DDPStrategy)] -) +@pytest.mark.parametrize(["strategy", "strategy_cls"], [("DDP", DDPStrategy), ("Ddp", DDPStrategy)]) def test_strategy_str_passed_being_case_insensitive(strategy, strategy_cls): connector = _Connector(strategy=strategy) assert isinstance(connector.strategy, strategy_cls) diff --git a/tests/tests_pytorch/benchmarks/test_sharded_parity.py b/tests/tests_pytorch/benchmarks/test_sharded_parity.py deleted file mode 100644 index 782df5ce924d5..0000000000000 --- a/tests/tests_pytorch/benchmarks/test_sharded_parity.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import time -from typing import Type - -import pytest -import torch - -from pytorch_lightning import seed_everything, Trainer -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset -from pytorch_lightning.strategies import DDPSpawnShardedStrategy -from tests_pytorch.helpers.runif import RunIf - - -class SeedTrainLoaderModel(BoringModel): - """Overrides training loader to ensure we enforce the same seed for all DDP processes.""" - - def train_dataloader(self): - seed_everything(42) - return torch.utils.data.DataLoader(RandomDataset(32, 64)) - - -class SeedTrainLoaderManualModel(SeedTrainLoaderModel): - def training_step(self, batch, batch_idx, optimizer_idx): - # manual - # access your optimizers with use_pl_optimizer=False. Default is True - (opt_a, opt_b) = self.optimizers(use_pl_optimizer=True) - loss_1 = self.step(batch) - - self.manual_backward(loss_1) - opt_a.step() - - # fake discriminator - loss_2 = self.step(batch[0]) - - # ensure we forward the correct params to the optimizer - # without retain_graph we can't do multiple backward passes - self.manual_backward(loss_2) - # todo: understand why synchronization breaks there. 
- # self.manual_backward(loss_2, retain_graph=True) - opt_b.step() - - assert self.layer.weight.grad is None or torch.all(self.layer.weight.grad == 0) - - def training_epoch_end(self, outputs) -> None: - # outputs should be an array with an entry per optimizer - assert len(outputs) == 2 - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return optimizer, optimizer_2 - - @property - def automatic_optimization(self) -> bool: - return False - - -class SeedTrainLoaderMultipleOptimizersModel(SeedTrainLoaderModel): - def training_step(self, batch, batch_idx, optimizer_idx): - output = self.layer(batch) - loss = self.loss(batch, output) - return {"loss": loss} - - def training_epoch_end(self, outputs) -> None: - # outputs should be an array with an entry per optimizer - assert len(outputs) == 2 - - def configure_optimizers(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_2 = torch.optim.SGD(self.layer.parameters(), lr=0.1) - return optimizer, optimizer_2 - - -def record_ddp_fit_model_stats(trainer, model, use_cuda): - """Helper to calculate wall clock time for fit + max allocated memory. - - Args: - trainer: The trainer object. - model: The model to fit. - use_cuda: Whether to sync CUDA kernels. - - Returns: - Max Memory if using GPUs, and total wall clock time. - """ - max_memory = None - - time_start = time.perf_counter() - if use_cuda: - torch.cuda.reset_peak_memory_stats() - torch.cuda.synchronize() - - trainer.fit(model) - - if use_cuda: - torch.cuda.synchronize() - max_memory = torch.cuda.max_memory_allocated() / 2**20 - - total_time = time.perf_counter() - time_start - - return max_memory, total_time - - -def plugin_parity_test( - model_cls: Type[SeedTrainLoaderModel], - seed: int = 42, - gpus: int = 0, - precision: int = 32, - max_percent_speed_diff: float = 0.1, -): - """Ensures that the trained model is identical to the standard DDP implementation. Also checks for speed/memory - regressions, we should expect always less memory but performance to fluctuate. - - Args: - model_cls: Model class to use for test. - seed: Seed for generators. Note that this does not handle the seed for data-loading on multi-process. - gpus: Number of GPUS to enable. - precision: Whether to use AMP or normal FP32 training. - max_percent_speed_diff: The maximum speed difference compared to normal DDP training. - This is more a safety net for variability in CI which can vary in speed, not for benchmarking. 
- """ - - # Train normal DDP - seed_everything(seed) - ddp_model = model_cls() - use_cuda = gpus > 0 - - trainer = Trainer( - fast_dev_run=True, - max_epochs=1, - accelerator="gpu", - devices=gpus, - precision=precision, - strategy="ddp_spawn", - benchmark=False, - ) - - max_memory_ddp, ddp_time = record_ddp_fit_model_stats(trainer=trainer, model=ddp_model, use_cuda=use_cuda) - - # Reset and train Custom DDP - seed_everything(seed) - custom_plugin_model = model_cls() - - trainer = Trainer( - fast_dev_run=True, - max_epochs=1, - accelerator="gpu", - devices=gpus, - precision=precision, - strategy="ddp_sharded_spawn", - benchmark=False, - ) - assert isinstance(trainer.strategy, DDPSpawnShardedStrategy) - - max_memory_custom, custom_model_time = record_ddp_fit_model_stats( - trainer=trainer, model=custom_plugin_model, use_cuda=use_cuda - ) - - # Assert model parameters are identical after fit - for ddp_param, custom_param in zip(ddp_model.parameters(), custom_plugin_model.parameters()): - assert torch.equal(ddp_param, custom_param), "Model parameters are different between DDP and Custom plugin" - - # Assert speed parity by ensuring percentage difference between custom/ddp is below threshold - percent_diff = (custom_model_time - ddp_time) / custom_model_time - - assert ( - percent_diff <= max_percent_speed_diff - ), f"Custom DDP was too slow compared to regular DDP, Custom Plugin Time: {custom_model_time}, DDP Time: {ddp_time}" - - if use_cuda: - # Assert CUDA memory parity - assert max_memory_custom <= max_memory_ddp, ( - "Custom plugin used too much memory compared to DDP, " - f"Custom Mem: {max_memory_custom}, DDP Mem: {max_memory_ddp}" - ) - - -@RunIf(fairscale=True) -@pytest.mark.parametrize( - "kwargs", - [ - pytest.param(dict(gpus=1, model_cls=SeedTrainLoaderModel), marks=RunIf(min_cuda_gpus=1)), - pytest.param( - dict(gpus=1, precision=16, model_cls=SeedTrainLoaderModel), marks=RunIf(min_cuda_gpus=1, amp_native=True) - ), - pytest.param(dict(gpus=2, model_cls=SeedTrainLoaderModel), marks=RunIf(min_cuda_gpus=2)), - pytest.param( - dict(gpus=2, precision=16, model_cls=SeedTrainLoaderModel), marks=RunIf(min_cuda_gpus=2, amp_native=True) - ), - pytest.param( - dict(gpus=2, model_cls=SeedTrainLoaderMultipleOptimizersModel), - marks=[ - RunIf(min_cuda_gpus=2), - pytest.mark.skip(reason="TODO: Current issue with multiple optimizers and FairScale."), - ], - ), - pytest.param( - dict(gpus=2, model_cls=SeedTrainLoaderManualModel), - marks=[ - RunIf(min_cuda_gpus=2), - pytest.mark.skip(reason="TODO: Current issue with multiple optimizers and FairScale."), - ], - ), - ], -) -def test_ddp_spawn_sharded_strategy(kwargs): - if kwargs["gpus"] > 1: - # TODO: decrease speed diff since only 2 GPUs sharding 2 optimizers - kwargs["max_percent_speed_diff"] = 0.25 - plugin_parity_test(**kwargs) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py index cfd367eb73d74..b5264a09827c7 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py @@ -20,6 +20,7 @@ from lightning_utilities.test.warning import no_warning_call from torch.utils.data import DataLoader +import pytorch_lightning.profiler as profiler from pytorch_lightning import Trainer from pytorch_lightning.accelerators.cpu import CPUAccelerator from pytorch_lightning.cli import LightningCLI @@ -313,3 +314,21 @@ def test_v1_8_1_deprecated_rank_zero_only(): with pytest.deprecated_call(match="rank_zero_only` has been 
deprecated in v1.8.1"): rank_zero_only(lambda: None) + + +@pytest.mark.parametrize( + "cls", + [ + profiler.AdvancedProfiler, + profiler.PassThroughProfiler, + profiler.PyTorchProfiler, + profiler.SimpleProfiler, + pytest.param(profiler.XLAProfiler, marks=RunIf(tpu=True)), + ], +) +def test_profiler_classes_deprecated_warning(cls): + with pytest.deprecated_call( + match=f"profiler.{cls.__name__}` is deprecated in v1.9.0 and will be removed in v1.10.0." + f" Use .*profilers.{cls.__name__}` class instead." + ): + cls() diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-9.py b/tests/tests_pytorch/deprecated_api/test_remove_1-9.py index 90dd4372bf89e..7f07baf315f33 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-9.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-9.py @@ -23,15 +23,7 @@ from pytorch_lightning.cli import LightningCLI, SaveConfigCallback from pytorch_lightning.core.module import LightningModule from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.profiler.advanced import AdvancedProfiler -from pytorch_lightning.profiler.base import PassThroughProfiler -from pytorch_lightning.profiler.profiler import Profiler -from pytorch_lightning.profiler.pytorch import PyTorchProfiler, RegisterRecordFunction, ScheduleWrapper -from pytorch_lightning.profiler.simple import SimpleProfiler -from pytorch_lightning.profiler.xla import XLAProfiler -from pytorch_lightning.utilities.imports import _KINETO_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_only -from tests_pytorch.helpers.runif import RunIf def test_lightning_logger_base_deprecation_warning(): @@ -173,36 +165,3 @@ def test_lightningCLI_old_module_deprecation(): with pytest.deprecated_call(match=r"instantiate_class.*deprecated in v1.7.*Use the equivalent function"): assert isinstance(old_cli.instantiate_class(tuple(), {"class_path": "pytorch_lightning.Trainer"}), Trainer) - - -def test_profiler_deprecation_warning(): - assert "Profiler` is deprecated in v1.7" in Profiler.__doc__ - - -@pytest.mark.parametrize( - "cls", - [ - AdvancedProfiler, - PassThroughProfiler, - PyTorchProfiler, - SimpleProfiler, - pytest.param(XLAProfiler, marks=RunIf(tpu=True)), - ], -) -def test_profiler_classes_deprecated_warning(cls): - with pytest.deprecated_call( - match=f"profiler.{cls.__name__}` is deprecated in v1.7 and will be removed in v1.9." - f" Use .*profilers.{cls.__name__}` class instead." - ): - cls() - - -@pytest.mark.skipif(not _KINETO_AVAILABLE, reason="Requires PyTorch Profiler Kineto") -def test_pytorch_profiler_schedule_wrapper_deprecation_warning(): - with pytest.deprecated_call(match="ScheduleWrapper` is deprecated in v1.7 and will be removed in v1.9."): - _ = ScheduleWrapper(None) - - -def test_pytorch_profiler_register_record_function_deprecation_warning(): - with pytest.deprecated_call(match="RegisterRecordFunction` is deprecated in v1.7 and will be removed in in v1.9."): - _ = RegisterRecordFunction(None) diff --git a/tests/tests_pytorch/graveyard/test_profiler.py b/tests/tests_pytorch/graveyard/test_profiler.py index 8341ef4aeb76f..9d192b3eee8d1 100644 --- a/tests/tests_pytorch/graveyard/test_profiler.py +++ b/tests/tests_pytorch/graveyard/test_profiler.py @@ -23,6 +23,60 @@ def test_v2_0_0_base_profilers(): AbstractProfiler() with pytest.raises( - RuntimeError, match="AbstractProfiler` was deprecated in v1.6 and is no longer supported as of v1.9." + RuntimeError, match="BaseProfiler` was deprecated in v1.6 and is no longer supported as of v1.9." 
diff --git a/tests/tests_pytorch/graveyard/test_profiler.py b/tests/tests_pytorch/graveyard/test_profiler.py
index 8341ef4aeb76f..9d192b3eee8d1 100644
--- a/tests/tests_pytorch/graveyard/test_profiler.py
+++ b/tests/tests_pytorch/graveyard/test_profiler.py
@@ -23,6 +23,60 @@ def test_v2_0_0_base_profilers():
         AbstractProfiler()
 
     with pytest.raises(
-        RuntimeError, match="AbstractProfiler` was deprecated in v1.6 and is no longer supported as of v1.9."
+        RuntimeError, match="BaseProfiler` was deprecated in v1.6 and is no longer supported as of v1.9."
     ):
         BaseProfiler()
+
+    from pytorch_lightning.profiler.advanced import AdvancedProfiler
+
+    with pytest.raises(
+        RuntimeError, match="AdvancedProfiler` was deprecated in v1.7.0 and is no longer supported as of v1.9"
+    ):
+        AdvancedProfiler()
+
+    from pytorch_lightning.profiler.base import PassThroughProfiler
+
+    with pytest.raises(
+        RuntimeError, match="PassThroughProfiler` was deprecated in v1.7.0 and is no longer supported as of v1.9"
+    ):
+        PassThroughProfiler()
+
+    from pytorch_lightning.profiler.profiler import Profiler
+
+    with pytest.raises(RuntimeError, match="Profiler` was deprecated in v1.7.0 and is no longer supported as of v1.9"):
+        Profiler()
+
+    from pytorch_lightning.profiler.pytorch import PyTorchProfiler
+
+    with pytest.raises(
+        RuntimeError, match="PyTorchProfiler` was deprecated in v1.7.0 and is no longer supported as of v1.9"
+    ):
+        PyTorchProfiler()
+
+    from pytorch_lightning.profiler.pytorch import RegisterRecordFunction
+
+    with pytest.raises(
+        RuntimeError, match="RegisterRecordFunction` was deprecated in v1.7.0 and is no longer supported as of v1.9"
+    ):
+        RegisterRecordFunction()
+
+    from pytorch_lightning.profiler.pytorch import ScheduleWrapper
+
+    with pytest.raises(
+        RuntimeError, match="ScheduleWrapper` was deprecated in v1.7.0 and is no longer supported as of v1.9"
+    ):
+        ScheduleWrapper()
+
+    from pytorch_lightning.profiler.simple import SimpleProfiler
+
+    with pytest.raises(
+        RuntimeError, match="SimpleProfiler` was deprecated in v1.7.0 and is no longer supported as of v1.9"
+    ):
+        SimpleProfiler()
+
+    from pytorch_lightning.profiler.xla import XLAProfiler
+
+    with pytest.raises(
+        RuntimeError, match="XLAProfiler` was deprecated in v1.7.0 and is no longer supported as of v1.9"
+    ):
+        XLAProfiler()
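Note: the graveyard pattern tested above replaces each removed class with a stub that raises `RuntimeError` at construction, so stale imports fail loudly instead of silently misbehaving. `pytest.raises(..., match=...)` applies the pattern with `re.search` against the exception text, which is why matching a fragment of the message suffices. An illustrative stand-alone sketch of the pattern, not Lightning's actual graveyard code:

import pytest


class Profiler:  # hypothetical graveyard stub for a class removed from the package
    def __init__(self, *args, **kwargs):
        raise RuntimeError(
            "`Profiler` was deprecated in v1.7.0 and is no longer supported as of v1.9."
            " Use the classes from the new import path instead."
        )


def test_graveyard_stub():
    # a fragment of the message is enough; regex metacharacters in the
    # fragment would need escaping, but plain words and backticks are literal
    with pytest.raises(RuntimeError, match="no longer supported as of v1.9"):
        Profiler()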