Merge branch 'master' into master

Lightning-AI · Jun 6, 2024 · bf0bc9f · bf0bc9f
2 parents 9dbbc61 + a611de0
commit bf0bc9f
Show file tree

Hide file tree

Showing 80 changed files with 2,279 additions and 332 deletions.
diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
@@ -31,6 +31,7 @@ pr:
       - "src/lightning/pytorch/**"
       - "src/pytorch_lightning/*"
       - "tests/tests_pytorch/**"
+      - "tests/run_standalone_*.sh"
       - "pyproject.toml" # includes pytest config
       - "requirements/fabric/**"
       - "src/lightning/fabric/**"

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -5,49 +5,36 @@
 # the repo. Unless a later match takes precedence,
 # @global-owner1 and @global-owner2 will be requested for
 # review when someone opens a pull request.
-* @lantiga @borda @tchaton @awaelchli
+* @lantiga @borda @tchaton @awaelchli @justusschock
 
 # CI/CD and configs
-/.actions/                  @borda @carmocca @ethanwharris @justusschock
-/.github/                   @borda @carmocca @ethanwharris @justusschock
-/.azure/                    @borda @carmocca @ethanwharris @justusschock
+/.actions/                  @borda @ethanwharris @justusschock
+/.github/                   @borda @ethanwharris @justusschock
+/.azure/                    @borda @ethanwharris @justusschock
 /.azure/app-cloud-e2e.yml   @awaelchli @ethanwharris @lantiga
-/dockers/                   @borda @carmocca @ethanwharris @justusschock
-*.yml                       @borda @carmocca @ethanwharris @justusschock
+/dockers/                   @borda @ethanwharris @justusschock
+*.yml                       @borda @ethanwharris @justusschock
 
 # Docs
-/docs/                                      @edenlightning @lantiga @borda @awaelchli
-/docs/*/conf.py                             @borda @awaelchli @carmocca
-/.github/*.md                               @edenlightning @williamfalcon @lantiga @borda
-/.github/ISSUE_TEMPLATE/                    @edenlightning @borda @tchaton @awaelchli
-/docs/source-fabric/conf.py                 @borda @awaelchli @carmocca
-/docs/source-fabric/index.rst               @awaelchli @lantiga @carmocca
-/docs/source-pytorch/conf.py                @borda @awaelchli @carmocca
+/docs/                                      @lantiga @borda @awaelchli
+/docs/*/conf.py                             @borda @awaelchli
+/.github/*.md                               @williamfalcon @lantiga @borda
+/.github/ISSUE_TEMPLATE/                    @borda @tchaton @awaelchli
+/docs/source-fabric/conf.py                 @borda @awaelchli
+/docs/source-fabric/index.rst               @awaelchli @lantiga
+/docs/source-pytorch/conf.py                @borda @awaelchli
 /docs/source-pytorch/index.rst              @williamfalcon @lantiga
 /docs/source-pytorch/levels                 @williamfalcon @lantiga
-/docs/source-app/                           @williamfalcon @lantiga @tchaton @awaelchli
-/docs/source-app/index.rst                  @williamfalcon @lantiga
-/docs/source-app/expertise_levels           @williamfalcon @lantiga
+/docs/source-app/                           @williamfalcon @lantiga @tchaton
 
 # PyTorch Lightning
-/src/lightning/pytorch                      @williamfalcon @awaelchli @carmocca @justusschock
-/src/pytorch_lightning                      @williamfalcon @awaelchli @carmocca @justusschock
-/tests/tests_pytorch                        @awaelchli @carmocca @justusschock @borda
-
-# Core APIs
-/src/lightning/pytorch/callbacks/callback.py @williamfalcon @awaelchli @carmocca
-/src/lightning/pytorch/core/datamodule.py    @williamFalcon @awaelchli @carmocca
-/src/lightning/pytorch/trainer/trainer.py    @williamfalcon @tchaton @awaelchli @carmocca
-/src/lightning/pytorch/core/hooks.py         @williamfalcon @tchaton @awaelchli @carmocca
-/src/lightning/pytorch/core/module.py        @williamfalcon @tchaton @awaelchli @carmocca
+/src/lightning/pytorch                      @lantiga @borda @tchaton @awaelchli @justusschock
 
 # Lightning Data
-/src/lightning/data/      @tchaton
+/src/lightning/data/                        @tchaton @lantiga
 
 # Lightning Fabric
-/src/lightning/fabric                       @awaelchli @carmocca @justusschock
-/src/lightning_fabric                       @awaelchli @carmocca @justusschock
-/tests/tests_fabric                         @awaelchli @carmocca @justusschock
+/src/lightning/fabric                       @lantiga @borda @tchaton @awaelchli @justusschock
 
 # Lightning App
 /src/lightning/app                          @tchaton @lantiga @awaelchli @ethanwharris
@@ -59,8 +46,8 @@
 /.github/CODEOWNERS                  @williamfalcon
 /SECURITY.md                         @williamfalcon @lantiga
 /README.md                           @williamfalcon @lantiga
-/setup.py                            @williamfalcon @borda @carmocca
-/src/pytorch_lightning/__about__.py  @williamfalcon @borda @carmocca
+/setup.py                            @williamfalcon @borda
+/src/pytorch_lightning/__about__.py  @williamfalcon @borda
 /src/lightning_app/__about__.py      @williamfalcon @lantiga @borda
 /src/lightning_fabric/__about__.py   @williamfalcon @borda @awaelchli
-/src/*/__setup__.py                  @borda @carmocca @justusschock
+/src/*/__setup__.py                  @borda @justusschock
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
@@ -121,11 +121,11 @@ To build the documentation locally, simply execute the following commands from p
 
 All added or edited code shall be the own original work of the particular contributor.
 If you use a third-party implementation, all such blocks/functions/modules shall be properly referenced and, if possible, also approved by the code's author. For example - `This code is inspired from http://...`.
-In case you adding new dependencies, make sure that they are compatible with the actual PyTorch Lightning license (ie. dependencies should be _at least_ as permissive as the PyTorch Lightning license).
+In case you are adding new dependencies, make sure that they are compatible with the actual PyTorch Lightning license (i.e. dependencies should be _at least_ as permissive as the PyTorch Lightning license).
 
 ### Coding Style
 
-1. Use f-strings for output formation (except logging when we stay with lazy `logging.info("Hello %s!", name)`.
+1. Use f-strings for output formation (except logging when we stay with lazy `logging.info("Hello %s!", name)`).
 1. You can use [pre-commit](https://pre-commit.com/) to make sure your code style is correct.
 
 ### Documentation
@@ -234,9 +234,9 @@ Here are tutorials:
 
 Here is the process to create a new test
 
-- 0. Optional: Follow tutorials !
-- 1. Find a file in tests/ which match what you want to test. If none, create one.
-- 2. Use this template to get started !
+- 0. Optional: Follow tutorials!
+- 1. Find a file in tests/ which matches what you want to test. If none, create one.
+- 2. Use this template to get started!
 - 3. Use **BoringModel and derivatives to test out your code**.
 
 ```python

diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
@@ -60,6 +60,7 @@ subprojects:
       - "src/lightning/pytorch/**"
       - "src/pytorch_lightning/*"
       - "tests/tests_pytorch/**"
+      - "tests/run_standalone_*.sh"
       - "pyproject.toml" # includes pytest config
       - "requirements/fabric/**"
       - "src/lightning/fabric/**"
@@ -201,14 +202,14 @@ subprojects:
       - ".azure/gpu-tests-fabric.yml"
       - "examples/fabric/**"
       - "examples/run_fabric_examples.sh"
-      - "tests/run_standalone_*.sh"
       - "requirements/fabric/**"
       - "src/lightning/__init__.py"
       - "src/lightning/__setup__.py"
       - "src/lightning/__version__.py"
       - "src/lightning/fabric/**"
       - "src/lightning_fabric/*"
       - "tests/tests_fabric/**"
+      - "tests/run_standalone_*.sh"
       - "pyproject.toml" # includes pytest config
       - "!requirements/*/docs.txt"
       - "!*.md"

diff --git a/.github/workflows/docs-build.yml b/.github/workflows/docs-build.yml
@@ -108,11 +108,14 @@ jobs:
       - name: Full build for deployment
         if: github.event_name != 'pull_request'
         run: echo "DOCS_FETCH_ASSETS=1" >> $GITHUB_ENV
+      - name: Build without warnings
+        if: github.event_name != 'workflow_dispatch'
+        run: echo "BUILD_SPHINX_OPTS=-W --keep-going" >> $GITHUB_ENV
       - name: Make ${{ matrix.target }}
         working-directory: ./docs/source-${{ matrix.pkg-name }}
         # allow failing link check and doctest if you run with dispatch
-        continue-on-error: ${{ (matrix.target == 'doctest' || matrix.target == 'linkcheck')  && github.event_name == 'workflow_dispatch' }}
-        run: make ${{ matrix.target }} --debug --jobs $(nproc) SPHINXOPTS="-W --keep-going"
+        continue-on-error: ${{ (matrix.target == 'doctest' || matrix.target == 'linkcheck') && github.event_name == 'workflow_dispatch' }}
+        run: make ${{ matrix.target }} --debug --jobs $(nproc) SPHINXOPTS="$BUILD_SPHINX_OPTS"
 
       - name: Keep artifact
         if: github.event_name == 'pull_request'

diff --git a/docs/source-app/conf.py b/docs/source-app/conf.py
@@ -41,11 +41,6 @@
 # The full version, including alpha/beta/rc tags
 release = lightning.__version__
 
-# Options for the linkcode extension
-# ----------------------------------
-github_user = "Lightning-AI"
-github_repo = project
-
 # -- Project documents -------------------------------------------------------
 
 if _FETCH_S3_ASSETS:
@@ -71,7 +66,7 @@
     "sphinx_toolbox.collapse",
     "sphinx.ext.todo",
     "sphinx.ext.coverage",
-    "sphinx.ext.linkcode",
+    # "sphinx.ext.linkcode",
     "sphinx.ext.autosummary",
     "sphinx.ext.napoleon",
     # 'sphinxcontrib.mockautodoc',  # raises error: directive 'automodule' is already registered ...
@@ -324,15 +319,6 @@ def setup(app):
     app.add_js_file("copybutton.js")
     app.add_css_file("main.css")
 
-
-# copy all notebooks to local folder
-path_nbs = os.path.join(_PATH_HERE, "notebooks")
-if not os.path.isdir(path_nbs):
-    os.mkdir(path_nbs)
-for path_ipynb in glob.glob(os.path.join(_PATH_ROOT, "notebooks", "*.ipynb")):
-    path_ipynb2 = os.path.join(path_nbs, os.path.basename(path_ipynb))
-    shutil.copy(path_ipynb, path_ipynb2)
-
 # copy all examples to local folder
 path_examples = os.path.join(_PATH_HERE, "..", "examples")
 if not os.path.isdir(path_examples):
@@ -370,44 +356,6 @@ def _package_list_from_file(file):
 autodoc_mock_imports = MOCK_PACKAGES
 
 
-# Resolve function
-# This function is used to populate the (source-app) links in the API
-def linkcode_resolve(domain, info):
-    def find_source():
-        # try to find the file and line number, based on code from numpy:
-        # https://github.com/numpy/numpy/blob/master/doc/source/conf.py#L286
-        obj = sys.modules[info["module"]]
-        for part in info["fullname"].split("."):
-            obj = getattr(obj, part)
-        fname = inspect.getsourcefile(obj)
-        # https://github.com/rtfd/readthedocs.org/issues/5735
-        if any(s in fname for s in ("readthedocs", "rtfd", "checkouts")):
-            # /home/docs/checkouts/readthedocs.org/user_builds/pytorch_lightning/checkouts/
-            #  devel/pytorch_lightning/utilities/cls_experiment.py#L26-L176
-            path_top = os.path.abspath(os.path.join("..", "..", ".."))
-            fname = os.path.relpath(fname, start=path_top)
-        else:
-            # Local build, imitate master
-            fname = "master/" + os.path.relpath(fname, start=os.path.abspath(".."))
-        source, lineno = inspect.getsourcelines(obj)
-        return fname, lineno, lineno + len(source) - 1
-
-    if domain != "py" or not info["module"]:
-        return None
-    try:
-        filename = "%s#L%d-L%d" % find_source()
-    except Exception:
-        filename = info["module"].replace(".", "/") + ".py"
-    # import subprocess
-    # tag = subprocess.Popen(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE,
-    #                        universal_newlines=True).communicate()[0][:-1]
-    branch = filename.split("/")[0]
-    # do mapping from latest tags to master
-    branch = {"latest": "master", "stable": "master"}.get(branch, branch)
-    filename = "/".join([branch] + filename.split("/")[1:])
-    return f"https://github.com/{github_user}/{github_repo}/blob/{filename}"
-
-
 autosummary_generate = True
 
 autodoc_member_order = "groupwise"
@@ -456,3 +404,9 @@ def find_source():
 
 # ignore all links in any CHANGELOG file
 linkcheck_exclude_documents = [r"^(.*\/)*CHANGELOG.*$"]
+
+
+# ignore the following links (false positive errors during linkcheck)
+linkcheck_ignore = [
+    "https://www.openai.com/index/clip/",
+]
diff --git a/docs/source-app/get_started/what_app_can_do.rst b/docs/source-app/get_started/what_app_can_do.rst
@@ -85,7 +85,7 @@ Find the `ScratchPad App <https://lightning.ai/app/hvUwbEG70B-ScratchPad%2C%20No
 InVideo Search (Public)
 ***********************
 
-This App lets you find anything you're looking for inside a video. The engine is powered by `Open AI CLIP <https://openai.com/blog/clip/>`_.
+This App lets you find anything you're looking for inside a video. The engine is powered by `Open AI CLIP <https://www.openai.com/index/clip/>`_.
 
 Find the `InVideo Search App <https://lightning.ai/app/7pmQNIDxAE-InVideo%20Search>`_ on the App Gallery and the `InVideo Search App codebase <https://github.com/Lightning-AI/LAI-InVideo-search-App>`_ on GitHub.
 

diff --git a/docs/source-app/glossary/restful_api/restful_api.rst b/docs/source-app/glossary/restful_api/restful_api.rst
@@ -25,7 +25,7 @@ These methods are guidelines to organize your RESTful Services and help users un
 * **`PUT`:** Updates/replaces existing resources.
 * **`DELETE`:** Deletes resources.
 
-Learn more about `HTTP Methods for RESTful Services here <https://www.restapitutorial.com/lessons/httpmethods.html#:~:text=The%20primary%20or%20most%2Dcommonly,but%20are%20utilized%20less%20frequently.>`_.
+Learn more about `HTTP Methods for RESTful Services here <https://www.restapitutorial.com/introduction/whatisrest>`_.
 
 The Lightning App framework uses the popular `FastAPI <https://fastapi.tiangolo.com/>`_ and `Pydantic <https://pydantic-docs.helpmanual.io/>`_ frameworks under the hood. This means you can use all their features while building your App.
 

diff --git a/docs/source-app/workflows/build_rest_api/index.rst b/docs/source-app/workflows/build_rest_api/index.rst
@@ -25,7 +25,7 @@ These methods are guidelines to organize your RESTful Services and help users un
 * **`PUT`:** Updates/replaces existing resources.
 * **`DELETE`:** Deletes resources.
 
-Learn more about `HTTP Methods for RESTful Services here <https://www.restapitutorial.com/lessons/httpmethods.html#:~:text=The%20primary%20or%20most%2Dcommonly,but%20are%20utilized%20less%20frequently.>`_.
+Learn more about `HTTP Methods for RESTful Services here <https://www.restapitutorial.com/introduction/whatisrest>`_.
 
 The Lightning App framework uses the popular `FastAPI <https://fastapi.tiangolo.com/>`_ and `Pydantic <https://pydantic-docs.helpmanual.io/>`_ frameworks under the hood. This means you can use all their features while building your App.
 

diff --git a/docs/source-fabric/_static/main.css b/docs/source-fabric/_static/main.css
@@ -1,3 +1,13 @@
 col {
   width: 50% !important;
 }
+
+ul.no-bullets {
+    list-style-type: none; /* Remove default bullets */
+    padding-left: 0;       /* Remove default padding */
+}
+
+ul.no-bullets li {
+    padding-left: 0.5em;
+    text-indent: -2em;
+}
diff --git a/docs/source-fabric/advanced/model_init.rst b/docs/source-fabric/advanced/model_init.rst
@@ -61,15 +61,15 @@ When loading a model from a checkpoint, for example when fine-tuning, set ``empt
 ----
 
 
-********************************************
-Model-parallel training (FSDP and DeepSpeed)
-********************************************
+***************************************************
+Model-parallel training (FSDP, TP, DeepSpeed, etc.)
+***************************************************
 
-When training sharded models with :doc:`FSDP <model_parallel/fsdp>` or DeepSpeed, using :meth:`~lightning.fabric.fabric.Fabric.init_module` is necessary in most cases because otherwise model initialization gets very slow (minutes) or (and that's more likely) you run out of CPU memory due to the size of the model.
+When training distributed models with :doc:`FSDP/TP <model_parallel/index>` or DeepSpeed, using :meth:`~lightning.fabric.fabric.Fabric.init_module` is necessary in most cases because otherwise model initialization gets very slow (minutes) or (and that's more likely) you run out of CPU memory due to the size of the model.
 
 .. code-block:: python
 
-    # Recommended for FSDP and DeepSpeed
+    # Recommended for FSDP, TP and DeepSpeed
     with fabric.init_module(empty_init=True):
         model = GPT3()  # parameters are placed on the meta-device
 
@@ -81,4 +81,4 @@ When training sharded models with :doc:`FSDP <model_parallel/fsdp>` or DeepSpeed
 
 .. note::
     Empty-init is experimental and the behavior may change in the future.
-    For FSDP on PyTorch 2.1+, it is required that all user-defined modules that manage parameters implement a ``reset_parameters()`` method (all PyTorch built-in modules have this too).
+    For distributed models on PyTorch 2.1+, it is required that all user-defined modules that manage parameters implement a ``reset_parameters()`` method (all PyTorch built-in modules have this too).
diff --git a/docs/source-fabric/advanced/model_parallel/fsdp.rst b/docs/source-fabric/advanced/model_parallel/fsdp.rst
@@ -1,14 +1,11 @@
-###########################################
-Training models with billions of parameters
-###########################################
+#####################################################
+Training models with billions of parameters with FSDP
+#####################################################
 
 Use Fully Sharded Data Parallel (FSDP) to train large models with billions of parameters efficiently on multiple GPUs and across multiple machines.
 
-.. note:: This is an experimental feature.
-
-
 Today, large models with billions of parameters are trained with many GPUs across several machines in parallel.
-Even a single H100 GPU with 80 GB of VRAM (the biggest today) is not enough to train just a 30B parameter model (even with batch size 1 and 16-bit precision).
+Even a single H100 GPU with 80 GB of VRAM (one of the biggest today) is not enough to train just a 30B parameter model (even with batch size 1 and 16-bit precision).
 The memory consumption for training is generally made up of
 
 1. the model parameters,
@@ -19,7 +16,7 @@ The memory consumption for training is generally made up of
 |
 
 When the sum of these memory components exceeds the VRAM of a single GPU, regular data-parallel training (DDP) can no longer be employed.
-One of the methods that can alleviate this limitation is called **model-parallel** training, and known as **FSDP** in PyTorch, and in this guide, you will learn how to effectively scale large models with it.
+One of the methods that can alleviate this limitation is called **Fully Sharded Data Parallel (FSDP)**, and in this guide, you will learn how to effectively scale large models with it.
 
 
 ----