diff --git a/.circleci/config.yml b/.circleci/config.yml index 7dd66a97d72a3c..78ed6b02b8e30c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -176,7 +176,6 @@ jobs: - run: python utils/check_config_attributes.py - run: python utils/check_doctest_list.py - run: make deps_table_check_updated - - run: python utils/tests_fetcher.py --sanity_check - run: python utils/update_metadata.py --check-only - run: python utils/check_task_guides.py diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 6627812a666b6f..a7df11e8fb849a 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -3,7 +3,7 @@ name: Build docker images (scheduled) on: push: branches: - - docker-image* + - build_ci_docker_image* repository_dispatch: workflow_call: inputs: @@ -67,35 +67,6 @@ jobs: push: true tags: huggingface/transformers-all-latest-gpu-push-ci - latest-with-torch-nightly-docker: - name: "Nightly PyTorch + Stable TensorFlow" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-latest - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v3 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - PYTORCH=pre - push: true - tags: huggingface/transformers-all-latest-torch-nightly-gpu - latest-torch-deepspeed-docker: name: "Latest PyTorch + DeepSpeed" runs-on: ubuntu-latest @@ -153,34 +124,6 @@ jobs: push: true tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - nightly-torch-deepspeed-docker: - name: "Nightly PyTorch + DeepSpeed" - # Push CI doesn't need this image - if: inputs.image_postfix != 
'-push-ci' - runs-on: ubuntu-latest - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v3 - with: - context: ./docker/transformers-pytorch-deepspeed-nightly-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu - doc-builder: name: "Doc builder" # Push CI doesn't need this image diff --git a/.github/workflows/build-nightly-ci-docker-images.yml b/.github/workflows/build-nightly-ci-docker-images.yml new file mode 100644 index 00000000000000..f13dda7daa82bc --- /dev/null +++ b/.github/workflows/build-nightly-ci-docker-images.yml @@ -0,0 +1,75 @@ +name: Build docker images (Nightly CI) + +on: + workflow_call: + push: + branches: + - build_nightly_ci_docker_image* + +concurrency: + group: docker-images-builds + cancel-in-progress: false + +jobs: + latest-with-torch-nightly-docker: + name: "Nightly PyTorch + Stable TensorFlow" + runs-on: ubuntu-latest + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v3 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + PYTORCH=pre + push: true + tags: 
huggingface/transformers-all-latest-torch-nightly-gpu + + nightly-torch-deepspeed-docker: + name: "Nightly PyTorch + DeepSpeed" + runs-on: ubuntu-latest + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v3 + with: + context: ./docker/transformers-pytorch-deepspeed-nightly-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu \ No newline at end of file diff --git a/.github/workflows/build-past-ci-docker-images.yml b/.github/workflows/build-past-ci-docker-images.yml index 3a0e1612454c58..18d88f2d52fa75 100644 --- a/.github/workflows/build-past-ci-docker-images.yml +++ b/.github/workflows/build-past-ci-docker-images.yml @@ -3,7 +3,7 @@ name: Build docker images (Past CI) on: push: branches: - - past-ci-docker-image* + - build_past_ci_docker_image* concurrency: group: docker-images-builds @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - version: ["1.11", "1.10", "1.9", "1.8", "1.7", "1.6", "1.5", "1.4"] + version: ["1.13", "1.12", "1.11", "1.10", "1.9"] runs-on: ubuntu-latest steps: - @@ -24,6 +24,17 @@ jobs: - name: Check out code uses: actions/checkout@v3 + - + id: get-base-image + name: Get Base Image + env: + framework_version: ${{ matrix.version }} + run: | + echo "base_image=$(python3 -c 'import os; from utils.past_ci_versions import past_versions_testing; base_image = past_versions_testing["pytorch"][os.environ["framework_version"]]["base_image"]; print(base_image)')" >> $GITHUB_OUTPUT + - + name: Print Base Image + run: | + echo ${{ steps.get-base-image.outputs.base_image }} - name: Login to DockerHub uses: docker/login-action@v2 @@ -37,6 +48,7 @@ jobs: context: ./docker/transformers-past-gpu 
build-args: | REF=main + BASE_DOCKER_IMAGE=${{ steps.get-base-image.outputs.base_image }} FRAMEWORK=pytorch VERSION=${{ matrix.version }} push: true @@ -47,7 +59,7 @@ jobs: strategy: fail-fast: false matrix: - version: ["2.8", "2.7", "2.6", "2.5"] + version: ["2.11", "2.10", "2.9", "2.8", "2.7", "2.6", "2.5"] runs-on: ubuntu-latest steps: - @@ -57,37 +69,16 @@ jobs: name: Check out code uses: actions/checkout@v3 - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} + id: get-base-image + name: Get Base Image + env: + framework_version: ${{ matrix.version }} + run: | + echo "base_image=$(python3 -c 'import os; from utils.past_ci_versions import past_versions_testing; base_image = past_versions_testing["tensorflow"][os.environ["framework_version"]]["base_image"]; print(base_image)')" >> $GITHUB_OUTPUT - - name: Build and push - uses: docker/build-push-action@v3 - with: - context: ./docker/transformers-past-gpu - build-args: | - REF=main - FRAMEWORK=tensorflow - VERSION=${{ matrix.version }} - push: true - tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu - - past-tensorflow-docker-2-4: - name: "Past TensorFlow Docker" - strategy: - fail-fast: false - matrix: - version: ["2.4"] - runs-on: ubuntu-latest - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Check out code - uses: actions/checkout@v3 + name: Print Base Image + run: | + echo ${{ steps.get-base-image.outputs.base_image }} - name: Login to DockerHub uses: docker/login-action@v2 @@ -101,8 +92,8 @@ jobs: context: ./docker/transformers-past-gpu build-args: | REF=main - BASE_DOCKER_IMAGE=nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04 + BASE_DOCKER_IMAGE=${{ steps.get-base-image.outputs.base_image }} FRAMEWORK=tensorflow VERSION=${{ matrix.version }} push: true - tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu \ No newline at 
end of file + tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu diff --git a/.github/workflows/self-nightly-past-ci-caller.yml b/.github/workflows/self-nightly-past-ci-caller.yml new file mode 100644 index 00000000000000..e86e6a16662c56 --- /dev/null +++ b/.github/workflows/self-nightly-past-ci-caller.yml @@ -0,0 +1,143 @@ +name: Self-hosted runner (nightly-past-ci-caller) + +on: + schedule: + # 2 am on each Sunday and Thursday + - cron: "0 2 * * 0,4" + push: + branches: + - run_nightly_ci* + - run_past_ci* + +jobs: + build_nightly_ci_images: + name: Build Nightly CI Docker Images + if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci')) + uses: ./.github/workflows/build-nightly-ci-docker-images.yml + secrets: inherit + + run_nightly_ci: + name: Nightly CI + needs: [build_nightly_ci_images] + uses: ./.github/workflows/self-nightly-scheduled.yml + secrets: inherit + + run_past_ci_pytorch_1-13: + name: PyTorch 1.13 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_nightly_ci] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.13" + secrets: inherit + + run_past_ci_pytorch_1-12: + name: PyTorch 1.12 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-13] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.12" + secrets: inherit + + run_past_ci_pytorch_1-11: + name: PyTorch 1.11 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-12] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.11" + secrets: inherit + + 
run_past_ci_pytorch_1-10: + name: PyTorch 1.10 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-11] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.10" + secrets: inherit + + run_past_ci_pytorch_1-9: + name: PyTorch 1.9 + if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) + needs: [run_past_ci_pytorch_1-10] + uses: ./.github/workflows/self-past.yml + with: + framework: pytorch + version: "1.9" + secrets: inherit + + run_past_ci_tensorflow_2-11: + name: TensorFlow 2.11 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_pytorch_1-9] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.11" + secrets: inherit + + run_past_ci_tensorflow_2-10: + name: TensorFlow 2.10 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-11] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.10" + secrets: inherit + + run_past_ci_tensorflow_2-9: + name: TensorFlow 2.9 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-10] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.9" + secrets: inherit + + run_past_ci_tensorflow_2-8: + name: TensorFlow 2.8 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-9] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.8" + secrets: inherit + + run_past_ci_tensorflow_2-7: + name: TensorFlow 2.7 + if: 
(cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-8] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.7" + secrets: inherit + + run_past_ci_tensorflow_2-6: + name: TensorFlow 2.6 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-7] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.6" + secrets: inherit + + run_past_ci_tensorflow_2-5: + name: TensorFlow 2.5 + if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + needs: [run_past_ci_tensorflow_2-6] + uses: ./.github/workflows/self-past.yml + with: + framework: tensorflow + version: "2.5" + secrets: inherit diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml index ca5186e736f416..b3e13cbb1b7d52 100644 --- a/.github/workflows/self-nightly-scheduled.yml +++ b/.github/workflows/self-nightly-scheduled.yml @@ -1,4 +1,4 @@ -name: Self-hosted runner (nightly) +name: Self-hosted runner (nightly-ci) # Note that each job's dependencies go into a corresponding docker file. # @@ -8,9 +8,7 @@ name: Self-hosted runner (nightly) on: repository_dispatch: -# Disable temporarily until the test suite can be run under 12 hours. 
-# schedule: -# - cron: "0 16 * * *" + workflow_call: env: HF_HOME: /mnt/cache @@ -33,7 +31,7 @@ jobs: fetch-depth: 2 - name: Check Runner Status - run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} check_runners: name: Check Runners @@ -41,7 +39,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -56,7 +54,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -96,7 +94,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [single-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -143,7 +141,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders 
}}_test_reports_postfix_nightly path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_multi_gpu: @@ -153,7 +151,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -200,7 +198,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_all_tests_torch_cuda_extensions_gpu: @@ -209,7 +207,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} needs: setup container: image: huggingface/transformers-pytorch-deepspeed-nightly-gpu @@ -258,7 +256,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_nightly path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu send_results: @@ -292,7 +290,7 @@ jobs: CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} - CI_EVENT: nightly-build + CI_EVENT: Nightly CI RUNNER_STATUS: ${{ 
needs.check_runner_status.result }} RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} SETUP_STATUS: ${{ needs.setup.result }} @@ -302,3 +300,11 @@ jobs: pip install slack_sdk pip show slack_sdk python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" + + + # delete-artifact + - uses: geekyeggo/delete-artifact@v2 + with: + name: | + single-* + multi-* \ No newline at end of file diff --git a/.github/workflows/self-past-caller.yml b/.github/workflows/self-past-caller.yml deleted file mode 100644 index 2cc81dac8ca281..00000000000000 --- a/.github/workflows/self-past-caller.yml +++ /dev/null @@ -1,136 +0,0 @@ -name: Self-hosted runner (past-ci-caller) - -on: - push: - branches: - - run-past-ci* - -jobs: - run_past_ci_pytorch_1-11: - name: PyTorch 1.11 - if: always() - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.11" - secrets: inherit - - run_past_ci_pytorch_1-10: - name: PyTorch 1.10 - if: always() - needs: [run_past_ci_pytorch_1-11] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.10" - secrets: inherit - - run_past_ci_pytorch_1-9: - name: PyTorch 1.9 - if: always() - needs: [run_past_ci_pytorch_1-10] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.9" - secrets: inherit - - run_past_ci_pytorch_1-8: - name: PyTorch 1.8 - if: always() - needs: [run_past_ci_pytorch_1-9] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.8" - secrets: inherit - - run_past_ci_pytorch_1-7: - name: PyTorch 1.7 - if: always() - needs: [run_past_ci_pytorch_1-8] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.7" - secrets: inherit - - run_past_ci_pytorch_1-6: - name: PyTorch 1.6 - if: always() - needs: [run_past_ci_pytorch_1-7] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.6" - secrets: inherit - - run_past_ci_pytorch_1-5: - name: PyTorch 1.5 - if: always() 
- needs: [run_past_ci_pytorch_1-6] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.5" - secrets: inherit - - run_past_ci_pytorch_1-4: - name: PyTorch 1.4 - if: always() - needs: [run_past_ci_pytorch_1-5] - uses: ./.github/workflows/self-past.yml - with: - framework: pytorch - version: "1.4" - secrets: inherit - - run_past_ci_tensorflow_2-8: - name: TensorFlow 2.8 - if: always() - needs: [run_past_ci_pytorch_1-4] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.8" - secrets: inherit - - run_past_ci_tensorflow_2-7: - name: TensorFlow 2.7 - if: always() - needs: [run_past_ci_tensorflow_2-8] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.7" - secrets: inherit - - run_past_ci_tensorflow_2-6: - name: TensorFlow 2.6 - if: always() - needs: [run_past_ci_tensorflow_2-7] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.6" - secrets: inherit - - run_past_ci_tensorflow_2-5: - name: TensorFlow 2.5 - if: always() - needs: [run_past_ci_tensorflow_2-6] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.5" - secrets: inherit - - run_past_ci_tensorflow_2-4: - name: TensorFlow 2.4 - if: always() - needs: [run_past_ci_tensorflow_2-5] - uses: ./.github/workflows/self-past.yml - with: - framework: tensorflow - version: "2.4" - secrets: inherit \ No newline at end of file diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml index 12ddcc6658374e..bcb6639a79810e 100644 --- a/.github/workflows/self-past.yml +++ b/.github/workflows/self-past.yml @@ -1,4 +1,4 @@ -name: Self-hosted runner (past) +name: Self-hosted runner (past-ci) # Note that each job's dependencies go into a corresponding docker file. 
# @@ -157,7 +157,7 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} run_tests_multi_gpu: @@ -223,14 +223,80 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} + run_all_tests_torch_cuda_extensions_gpu: + name: Torch CUDA extension tests + if: inputs.framework == 'pytorch' + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} + needs: setup + container: + image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Remove cached torch extensions + run: rm -rf /github/home/.cache/torch_extensions/ + + # To avoid unknown test failures + - name: Pre build DeepSpeed *again* + working-directory: / + run: | + python3 -m pip uninstall -y deepspeed + rm -rf DeepSpeed + git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . 
--global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu + send_results: name: Send results to webhook runs-on: ubuntu-latest if: always() - needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu] + needs: [ + check_runner_status, + check_runners, + setup, + run_tests_single_gpu, + run_tests_multi_gpu, + run_all_tests_torch_cuda_extensions_gpu + ] steps: - name: Preliminary job status shell: bash @@ -272,4 +338,11 @@ jobs: uses: actions/upload-artifact@v3 with: name: test_failure_tables_${{ inputs.framework }}-${{ inputs.version }} - path: test_failure_tables \ No newline at end of file + path: test_failure_tables + + # delete-artifact + - uses: geekyeggo/delete-artifact@v2 + with: + name: | + single-* + multi-* \ No newline at end of file diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index f535efba27ca5d..3ebf38062c4207 100644 --- a/.github/workflows/self-scheduled.yml +++ 
b/.github/workflows/self-scheduled.yml @@ -10,6 +10,9 @@ on: repository_dispatch: schedule: - cron: "0 2 * * *" + push: + branches: + - run_scheduled_ci* env: HF_HOME: /mnt/cache diff --git a/Makefile b/Makefile index 400a35bbfe2e7f..9e1d197cb6473a 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,6 @@ repo-consistency: python utils/check_config_docstrings.py python utils/check_config_attributes.py python utils/check_doctest_list.py - python utils/tests_fetcher.py --sanity_check python utils/update_metadata.py --check-only python utils/check_task_guides.py diff --git a/docker/transformers-past-gpu/Dockerfile b/docker/transformers-past-gpu/Dockerfile index 99fb550c6a35d8..8ecc83c339d973 100644 --- a/docker/transformers-past-gpu/Dockerfile +++ b/docker/transformers-past-gpu/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_DOCKER_IMAGE="nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04" +ARG BASE_DOCKER_IMAGE FROM $BASE_DOCKER_IMAGE LABEL maintainer="Hugging Face" @@ -8,7 +8,7 @@ ARG DEBIAN_FRONTEND=noninteractive SHELL ["sh", "-lc"] RUN apt update -RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs +RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs libaio-dev RUN git lfs install RUN python3 -m pip install --no-cache-dir --upgrade pip @@ -23,6 +23,9 @@ RUN cd transformers && python3 setup.py develop ARG FRAMEWORK ARG VERSION +# Control `setuptools` version to avoid some issues +RUN [ "$VERSION" != "1.9" -a "$VERSION" != "1.10" ] && python3 -m pip install -U setuptools || python3 -m pip install -U "setuptools<=59.5" + # Remove all frameworks # (`accelerate` requires `torch`, and this causes import issues for TF-only testing) RUN python3 -m pip uninstall -y torch torchvision torchaudio accelerate tensorflow jax flax @@ -34,4 +37,20 @@ RUN python3 ./transformers/utils/past_ci_versions.py --framework $FRAMEWORK --ve RUN echo "INSTALL_CMD = $INSTALL_CMD" RUN $INSTALL_CMD +RUN [ "$FRAMEWORK" != 
"pytorch" ] && echo "`deepspeed-testing` installation is skipped" || python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] + +# Uninstall `torch-tensorrt` and `apex` shipped with the base image +RUN python3 -m pip uninstall -y torch-tensorrt apex + +# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) +RUN python3 -m pip uninstall -y deepspeed +# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.) +# Issue: https://github.com/microsoft/DeepSpeed/issues/2010 +# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \ +# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 + RUN python3 -m pip install -U "itsdangerous<2.1.0" + +# When installing in editable mode, `transformers` is not recognized as a package. +# this line must be added in order for python to be aware of transformers. +RUN cd transformers && python3 setup.py develop diff --git a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile index 573e09c22a9c05..fcb599ddc232d6 100644 --- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile @@ -1,10 +1,11 @@ -FROM nvcr.io/nvidia/pytorch:21.03-py3 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_22-08.html#rel_22-08 +FROM nvcr.io/nvidia/pytorch:22.08-py3 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive # Example: `cu102`, `cu113`, etc. 
-ARG CUDA='cu113' +ARG CUDA='cu117' RUN apt -y update RUN apt install -y libaio-dev @@ -20,6 +21,9 @@ RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] +# Uninstall `torch-tensorrt` and `apex` shipped with the base image +RUN python3 -m pip uninstall -y torch-tensorrt apex + # Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) RUN python3 -m pip uninstall -y deepspeed # This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.) @@ -27,23 +31,23 @@ RUN python3 -m pip uninstall -y deepspeed # RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \ # DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 -# For `torchdynamo` tests -# (see https://github.com/huggingface/transformers/pull/17765) -RUN git clone https://github.com/pytorch/functorch -RUN python3 -m pip install --no-cache-dir ./functorch[aot] -RUN cd functorch && python3 setup.py develop - -RUN git clone https://github.com/pytorch/torchdynamo -RUN python3 -m pip install -r ./torchdynamo/requirements.txt -RUN cd torchdynamo && python3 setup.py develop - -# install TensorRT -RUN python3 -m pip install --no-cache-dir -U nvidia-pyindex -RUN python3 -m pip install --no-cache-dir -U nvidia-tensorrt==8.2.4.2 - -# install torch_tensorrt (fx path) -RUN git clone https://github.com/pytorch/TensorRT.git -RUN cd TensorRT/py && python3 setup.py install --fx-only +## For `torchdynamo` tests +## (see https://github.com/huggingface/transformers/pull/17765) +#RUN git clone https://github.com/pytorch/functorch +#RUN python3 -m pip install --no-cache-dir ./functorch[aot] +#RUN cd functorch && python3 setup.py develop 
+# +#RUN git clone https://github.com/pytorch/torchdynamo +#RUN python3 -m pip install -r ./torchdynamo/requirements.txt +#RUN cd torchdynamo && python3 setup.py develop +# +## install TensorRT +#RUN python3 -m pip install --no-cache-dir -U nvidia-pyindex +#RUN python3 -m pip install --no-cache-dir -U nvidia-tensorrt==8.2.4.2 +# +## install torch_tensorrt (fx path) +#RUN git clone https://github.com/pytorch/TensorRT.git +#RUN cd TensorRT/py && python3 setup.py install --fx-only # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. diff --git a/docs/source/en/generation_strategies.mdx b/docs/source/en/generation_strategies.mdx index 831c8772b6c63c..00ee9221fe6803 100644 --- a/docs/source/en/generation_strategies.mdx +++ b/docs/source/en/generation_strategies.mdx @@ -216,11 +216,11 @@ We pride ourselves on being the best in the business and our customer service is ### Multinomial sampling As opposed to greedy search that always chooses a token with the highest probability as the -next token, multinomial sampling randomly selects the next token based on the probability distribution over the entire +next token, multinomial sampling (also called ancestral sampling) randomly selects the next token based on the probability distribution over the entire vocabulary given by the model. Every token with a non-zero probability has a chance of being selected, thus reducing the risk of repetition. -To enable multinomial sampling set `do_sample=True`. +To enable multinomial sampling set `do_sample=True` and `num_beams=1`. ```python >>> from transformers import AutoTokenizer, AutoModelForCausalLM @@ -232,7 +232,7 @@ To enable multinomial sampling set `do_sample=True`. 
>>> prompt = "Today was an amazing day because" >>> inputs = tokenizer(prompt, return_tensors="pt") ->>> outputs = model.generate(**inputs, do_sample=True, max_new_tokens=100) +>>> outputs = model.generate(**inputs, do_sample=True, num_beams=1, max_new_tokens=100) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) ['Today was an amazing day because we are now in the final stages of our trip to New York City which was very tough. \ It is a difficult schedule and a challenging part of the year but still worth it. I have been taking things easier and \ diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt index b8545669e2cf5f..71ac03f908e182 100644 --- a/examples/research_projects/decision_transformer/requirements.txt +++ b/examples/research_projects/decision_transformer/requirements.txt @@ -175,7 +175,7 @@ pytz==2022.1 pytz-deprecation-shim==0.1.0.post0 PyYAML==6.0 ray==1.11.0 -redis==4.5.3 +redis==4.5.4 regex==2022.3.15 requests==2.27.1 requests-oauthlib==1.3.1 diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 73e1bdb214e63f..95c8064ee40445 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -106,12 +106,12 @@ class MinLengthLogitsProcessor(LogitsProcessor): def __init__(self, min_length: int, eos_token_id: Union[int, List[int]]): if not isinstance(min_length, int) or min_length < 0: - raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}") + raise ValueError(f"`min_length` has to be a non-negative integer, but is {min_length}") if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] if not all([isinstance(i, int) for i in eos_token_id]) or any([i < 0 for i in eos_token_id]): - raise ValueError(f"`eos_token_id` has to be a list of positive integers, but is {eos_token_id}") + logger.warning(f"`eos_token_id` 
has to be a list of positive integers, but is {eos_token_id}") self.min_length = min_length self.eos_token_id = eos_token_id @@ -148,7 +148,7 @@ def __init__(self, prompt_length_to_skip: int, min_new_tokens: int, eos_token_id if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] if not all([isinstance(i, int) for i in eos_token_id]) or any([i < 0 for i in eos_token_id]): - raise ValueError(f"`eos_token_id` has to be a list of positive integers, but is {eos_token_id}") + logger.warning(f"`eos_token_id` has to be a list of positive integers, but is {eos_token_id}") self.prompt_length_to_skip = prompt_length_to_skip self.min_new_tokens = min_new_tokens diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index a93eab9581970d..ce31f9ddd6a232 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -1123,33 +1123,32 @@ def __init__(self): class NeptuneCallback(TrainerCallback): - """TrainerCallback that sends the logs to [Neptune](https://neptune.ai). + """TrainerCallback that sends the logs to [Neptune](https://app.neptune.ai). Args: - api_token (`str`, optional): - Neptune API token obtained upon registration. You can leave this argument out if you have saved your token - to the `NEPTUNE_API_TOKEN` environment variable (strongly recommended). See full setup instructions in the - [docs](https://docs.neptune.ai/getting-started/installation). - project (`str`, optional): - Name of an existing Neptune project, in the form: "workspace-name/project-name". You can find and copy the - name from the project Settings -> Properties in Neptune. If None (default), the value of the - `NEPTUNE_PROJECT` environment variable will be used. - name (`str`, optional): Custom name for the run. + api_token (`str`, *optional*): Neptune API token obtained upon registration. + You can leave this argument out if you have saved your token to the `NEPTUNE_API_TOKEN` environment + variable (strongly recommended). 
See full setup instructions in the + [docs](https://docs.neptune.ai/setup/installation). + project (`str`, *optional*): Name of an existing Neptune project, in the form "workspace-name/project-name". + You can find and copy the name in Neptune from the project settings -> Properties. If None (default), the + value of the `NEPTUNE_PROJECT` environment variable is used. + name (`str`, *optional*): Custom name for the run. base_namespace (`str`, optional, defaults to "finetuning"): In the Neptune run, the root namespace - that will contain all of the logged metadata. - log_parameters (`bool`, optional, defaults to True): + that will contain all of the metadata logged by the callback. + log_parameters (`bool`, *optional*, defaults to `True`): If True, logs all Trainer arguments and model parameters provided by the Trainer. - log_checkpoints (`str`, optional, defaults to None): - If "same", uploads checkpoints whenever they are saved by the Trainer. If "last", uploads only the most - recently saved checkpoint. If "best", uploads the best checkpoint (among the ones saved by the Trainer). If - None, does not upload checkpoints. - run (`Run`, optional): - Pass a Neptune run object if you want to continue logging to an existing run. Read more about resuming runs - in the [docs](https://docs.neptune.ai/how-to-guides/neptune-api/resume-run). - **neptune_run_kwargs (optional): + log_checkpoints (`str`, *optional*): If "same", uploads checkpoints whenever they are saved by the Trainer. + If "last", uploads only the most recently saved checkpoint. If "best", uploads the best checkpoint (among + the ones saved by the Trainer). If `None`, does not upload checkpoints. + run (`Run`, *optional*): Pass a Neptune run object if you want to continue logging to an existing run. + Read more about resuming runs in the [docs](https://docs.neptune.ai/logging/to_existing_object). 
+ **neptune_run_kwargs (*optional*): Additional keyword arguments to be passed directly to the - [neptune.init_run()](https://docs.neptune.ai/api-reference/neptune#.init_run) function when a new run is - created. + [`neptune.init_run()`](https://docs.neptune.ai/api/neptune#init_run) function when a new run is created. + + For instructions and examples, see the [Transformers integration + guide](https://docs.neptune.ai/integrations/transformers) in the Neptune documentation. """ integration_version_key = "source_code/integrations/transformers" diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 384876fb6de239..9a6c29c27bdf63 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1736,6 +1736,41 @@ def save_pretrained( for ignore_key in self._keys_to_ignore_on_save: if ignore_key in state_dict.keys(): del state_dict[ignore_key] + if safe_serialization: + # Safetensors does not allow tensor aliasing. + # We're going to remove aliases before saving + ptrs = collections.defaultdict(list) + for name, tensor in state_dict.items(): + ptrs[tensor.data_ptr()].append(name) + + # These are all the pointers of shared tensors. + shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1} + warn_names = set() + for names in shared_ptrs.values(): + # Removing the keys which are declared as known duplicates on + # load. This allows to make sure the name which is kept is consistent. + if self._keys_to_ignore_on_load_missing is not None: + for name in names: + matches_pattern = any(re.search(pat, name) for pat in self._keys_to_ignore_on_load_missing) + if matches_pattern and name in state_dict: + del state_dict[name] + + # When not all duplicates have been cleaned, still remove those keys, but put a clear warning. + # If the link between tensors was done at runtime then `from_pretrained` will not get + # the key back leading to random tensor. 
A proper warning will be shown + # during reload (if applicable), but since the file is not necessarily compatible with + # the config, better show a proper warning. + found = 0 + for name in names: + if name in state_dict: + found += 1 + if found > 1: + del state_dict[name] + warn_names.add(name) + if len(warn_names) > 0: + logger.warning_once( + f"Removed shared tensor {warn_names} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading", + ) # Shard the model if it is too big. weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME @@ -2813,6 +2848,11 @@ def _fix_key(key): missing_keys = list(set(expected_keys) - set(loaded_keys)) unexpected_keys = list(set(loaded_keys) - set(expected_keys)) + # Some tensors maybe have been already filled by another key (tied weights). + existing_ptrs = {model_state_dict[k].data_ptr() for k in loaded_keys if k in model_state_dict} + missing_keys = [ + k for k in missing_keys if k in model_state_dict and model_state_dict[k].data_ptr() not in existing_ptrs + ] # Some models may have keys that are not in the state by design, removing them before needlessly warning # the user. 
if cls._keys_to_ignore_on_load_missing is not None: diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index fd10c5cd11763c..b23b0c39d62147 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -24,7 +24,6 @@ from ...dynamic_module_utils import get_class_from_dynamic_module from ...tokenization_utils import PreTrainedTokenizer from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE -from ...tokenization_utils_fast import PreTrainedTokenizerFast from ...utils import cached_file, extract_commit_hash, is_sentencepiece_available, is_tokenizers_available, logging from ..encoder_decoder import EncoderDecoderConfig from .auto_factory import _LazyAutoMapping @@ -37,6 +36,12 @@ ) +if is_tokenizers_available(): + from ...tokenization_utils_fast import PreTrainedTokenizerFast +else: + PreTrainedTokenizerFast = None + + logger = logging.get_logger(__name__) if TYPE_CHECKING: diff --git a/src/transformers/models/blip/image_processing_blip.py b/src/transformers/models/blip/image_processing_blip.py index 59ea4ac7798a91..50808ec65c5db8 100644 --- a/src/transformers/models/blip/image_processing_blip.py +++ b/src/transformers/models/blip/image_processing_blip.py @@ -113,24 +113,28 @@ def resize( **kwargs, ) -> np.ndarray: """ - Resize an image. - - Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the - longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then - resized to the max size while preserving the aspect ratio. + Resize an image to `(size["height"], size["width"])`. Args: image (`np.ndarray`): Image to resize. size (`Dict[str, int]`): - Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. 
- resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): - Resampling filter to use when resiizing the image. - data_format (`str` or `ChannelDimension`, *optional*): - The channel dimension format of the image. If not provided, it will be the same as the input image. + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + + Returns: + `np.ndarray`: The resized image. """ size = get_size_dict(size, default_to_square=True) - output_size = (size["width"], size["height"]) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") + output_size = (size["height"], size["width"]) return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) def rescale( diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 46f0c9b11ce498..9b00274a0b14ca 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -1238,8 +1238,28 @@ def __init__(self, config: Blip2Config): # Initialize weights and apply final processing self.post_init() - def get_input_embeddings(self) -> nn.Module: - return self.vision_model.embeddings.patch_embedding + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def get_output_embeddings(self) -> nn.Module: + return self.language_model.get_output_embeddings() + + def get_encoder(self): + return self.language_model.get_encoder() + + def get_decoder(self): + return self.language_model.get_decoder() + + def _tie_weights(self): + if not self.config.use_decoder_only_language_model: + self.language_model.encoder.embed_tokens = self.language_model.shared + self.language_model.decoder.embed_tokens = self.language_model.shared @add_start_docstrings_to_model_forward(BLIP_2_TEXT_INPUTS_DOCSTRING) def get_text_features( diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index 6fd2e8fdd18412..eabc6e5e690d34 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -244,7 +244,7 @@ class DetaObjectDetectionOutput(ModelOutput): def _get_clones(module, N): - return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + return nn.ModuleList([module for i in range(N)]) def 
inverse_sigmoid(x, eps=1e-5): diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 36b8ab72ab2ce7..30325b82f787c9 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -52,6 +52,9 @@ class LlamaConfig(PretrainedConfig): Number of attention heads for each attention layer in the Transformer encoder. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. rms_norm_eps (`float`, *optional*, defaults to 1e-12): @@ -85,6 +88,7 @@ def __init__( num_hidden_layers=32, num_attention_heads=32, hidden_act="silu", + max_position_embeddings=2048, initializer_range=0.02, rms_norm_eps=1e-6, use_cache=True, @@ -95,6 +99,7 @@ def __init__( **kwargs, ): self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 6d79536627fc20..c3f5285441bc60 100755 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -160,42 +160,24 @@ def forward(self, x): class LlamaAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__( - self, - hidden_size: int, - num_heads: int, - ): + def __init__(self, config: LlamaConfig): super().__init__() - 
self.hidden_size = hidden_size - self.num_heads = num_heads - self.head_dim = hidden_size // num_heads + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.max_position_embeddings = config.max_position_embeddings - if (self.head_dim * num_heads) != self.hidden_size: + if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {num_heads})." + f" and `num_heads`: {self.num_heads})." ) - self.q_proj = nn.Linear( - hidden_size, - num_heads * self.head_dim, - bias=False, - ) - self.k_proj = nn.Linear( - hidden_size, - num_heads * self.head_dim, - bias=False, - ) - self.v_proj = nn.Linear( - hidden_size, - num_heads * self.head_dim, - bias=False, - ) - self.o_proj = nn.Linear( - num_heads * self.head_dim, - hidden_size, - bias=False, - ) - self.rotary_emb = LlamaRotaryEmbedding(self.head_dim) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() @@ -270,10 +252,7 @@ class LlamaDecoderLayer(nn.Module): def __init__(self, config: LlamaConfig): super().__init__() self.hidden_size = config.hidden_size - self.self_attn = LlamaAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - ) + self.self_attn = LlamaAttention(config=config) self.mlp = LlamaMLP( 
hidden_size=self.hidden_size, intermediate_size=config.intermediate_size, @@ -630,8 +609,6 @@ def custom_forward(*inputs): class LlamaForCausalLM(LlamaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.model = LlamaModel(config) diff --git a/src/transformers/models/nllb_moe/configuration_nllb_moe.py b/src/transformers/models/nllb_moe/configuration_nllb_moe.py index 03a37bb35d6b4f..3ff222b93cf68a 100644 --- a/src/transformers/models/nllb_moe/configuration_nllb_moe.py +++ b/src/transformers/models/nllb_moe/configuration_nllb_moe.py @@ -125,7 +125,7 @@ class NllbMoeConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config ```""" - model_type = "nllb_moe" + model_type = "nllb-moe" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} diff --git a/src/transformers/models/pix2struct/configuration_pix2struct.py b/src/transformers/models/pix2struct/configuration_pix2struct.py index 8642602cf97db5..dead3d8a042413 100644 --- a/src/transformers/models/pix2struct/configuration_pix2struct.py +++ b/src/transformers/models/pix2struct/configuration_pix2struct.py @@ -357,9 +357,10 @@ def __init__( initializer_factor=1.0, initializer_range=0.02, is_vqa=False, + tie_word_embeddings=False, **kwargs, ): - super().__init__(**kwargs) + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) if text_config is None: text_config = {} diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py index 16ce6fa035c44b..e331da14e810e8 100644 --- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py @@ -564,7 +564,7 @@ def decode( >>> # compare word offsets with audio 
`common_voice_en_100038.mp3` online on the dataset viewer: >>> # https://huggingface.co/datasets/common_voice/viewer/en/train >>> word_offsets[:4] - [{'word': 'WHY', 'start_time': 1.42, 'end_time': 1.54}, {'word': 'DOES', 'start_time': 1.64, 'end_time': 1.88}, {'word': 'A', 'start_time': 2.12, 'end_time': 2.14}, {'word': 'MILE', 'start_time': 2.26, 'end_time': 2.46}] + [{'word': 'WHY', 'start_time': 1.42, 'end_time': 1.54}, {'word': 'DOES', 'start_time': 1.66, 'end_time': 1.9}, {'word': 'MILISANDRA', 'start_time': 2.26, 'end_time': 2.9}, {'word': 'LOOK', 'start_time': 3.0, 'end_time': 3.16}] ```""" from pyctcdecode.constants import ( diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index c8c0549a467414..7beab782c70ddf 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -32,7 +32,6 @@ from ..models.auto.modeling_auto import AutoModelForDepthEstimation from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer from ..tokenization_utils import PreTrainedTokenizer -from ..tokenization_utils_fast import PreTrainedTokenizerFast from ..utils import ( HUGGINGFACE_CO_RESOLVE_ENDPOINT, is_kenlm_available, @@ -139,9 +138,13 @@ AutoModelForZeroShotImageClassification, AutoModelForZeroShotObjectDetection, ) + + if TYPE_CHECKING: from ..modeling_tf_utils import TFPreTrainedModel from ..modeling_utils import PreTrainedModel + from ..tokenization_utils_fast import PreTrainedTokenizerFast + logger = logging.get_logger(__name__) @@ -495,7 +498,7 @@ def pipeline( task: str = None, model: Optional = None, config: Optional[Union[str, PretrainedConfig]] = None, - tokenizer: Optional[Union[str, PreTrainedTokenizer, PreTrainedTokenizerFast]] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, 
framework: Optional[str] = None, diff --git a/tests/models/nllb_moe/test_modeling_nllb_moe.py b/tests/models/nllb_moe/test_modeling_nllb_moe.py index 567aab56eaeac5..76cf4c0ea48c77 100644 --- a/tests/models/nllb_moe/test_modeling_nllb_moe.py +++ b/tests/models/nllb_moe/test_modeling_nllb_moe.py @@ -354,14 +354,14 @@ def model_inputs(self): @cached_property def tokenizer(self): - return NllbTokenizer.from_pretrained("ArthurZ/random-nllb-moe-2-experts") + return NllbTokenizer.from_pretrained("hf-internal-testing/random-nllb-moe-2-experts") @cached_property def big_model(self): return NllbMoeForConditionalGeneration.from_pretrained("facebook/nllb-moe-54b") def inference_no_head(self): - model = NllbMoeModel.from_pretrained("ArthurZ/random-nllb-moe-2-experts").eval() + model = NllbMoeModel.from_pretrained("hf-internal-testing/random-nllb-moe-2-experts").eval() with torch.no_grad(): output = model(**self.model_inputs) # fmt: off @@ -382,7 +382,7 @@ def test_inference_logits(self): and `transformers` implementation of NLLB-MoE transformers. We only check the logits of the second sample of the batch, as it is padded. """ - model = NllbMoeForConditionalGeneration.from_pretrained("ArthurZ/random-nllb-moe-2-experts").eval() + model = NllbMoeForConditionalGeneration.from_pretrained("hf-internal-testing/random-nllb-moe-2-experts").eval() with torch.no_grad(): output = model(**self.model_inputs) diff --git a/tests/repo_utils/test_tests_fetcher.py b/tests/repo_utils/test_tests_fetcher.py index cd0109b5359d4e..e02a917700dd2f 100644 --- a/tests/repo_utils/test_tests_fetcher.py +++ b/tests/repo_utils/test_tests_fetcher.py @@ -13,52 +13,661 @@ # limitations under the License. 
import os +import shutil import sys +import tempfile import unittest +from contextlib import contextmanager +from pathlib import Path from git import Repo +from transformers.testing_utils import CaptureStdout -git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -sys.path.append(os.path.join(git_repo_path, "utils")) -transformers_path = os.path.join(git_repo_path, "src", "transformers") -# Tests are run against this specific commit for reproducibility -# https://github.com/huggingface/transformers/tree/07f6690206e39ed7a4d9dbc58824314f7089bb38 -GIT_TEST_SHA = "07f6690206e39ed7a4d9dbc58824314f7089bb38" +REPO_PATH = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +sys.path.append(os.path.join(REPO_PATH, "utils")) -from tests_fetcher import checkout_commit, clean_code, get_module_dependencies # noqa: E402 +import tests_fetcher # noqa: E402 +from tests_fetcher import ( # noqa: E402 + checkout_commit, + clean_code, + create_module_to_test_map, + create_reverse_dependency_map, + create_reverse_dependency_tree, + diff_is_docstring_only, + extract_imports, + get_all_tests, + get_diff, + get_module_dependencies, + get_tree_starting_at, + infer_tests_to_run, + parse_commit_message, + print_tree_deps_of, +) -class CheckDummiesTester(unittest.TestCase): +BERT_MODELING_FILE = "src/transformers/models/bert/modeling_bert.py" +BERT_MODEL_FILE = """from ...modeling_utils import PreTrainedModel +from ...utils import is_torch_available +from .configuration_bert import BertConfig + +class BertModel: + ''' + This is the docstring. + ''' + This is the code +""" + +BERT_MODEL_FILE_NEW_DOCSTRING = """from ...modeling_utils import PreTrainedModel +from ...utils import is_torch_available +from .configuration_bert import BertConfig + +class BertModel: + ''' + This is the docstring. It has been updated. 
+ ''' + This is the code +""" + +BERT_MODEL_FILE_NEW_CODE = """from ...modeling_utils import PreTrainedModel +from ...utils import is_torch_available +from .configuration_bert import BertConfig + +class BertModel: + ''' + This is the docstring. + ''' + This is the code. It has been updated +""" + + +def create_tmp_repo(tmp_dir, models=None): + """ + Creates a repository in a temporary directory mimicking the structure of Transformers. Uses the list of models + provided (which defaults to just `["bert"]`). + """ + tmp_dir = Path(tmp_dir) + if tmp_dir.exists(): + shutil.rmtree(tmp_dir) + tmp_dir.mkdir(exist_ok=True) + repo = Repo.init(tmp_dir) + + if models is None: + models = ["bert"] + class_names = [model[0].upper() + model[1:] for model in models] + + transformers_dir = tmp_dir / "src" / "transformers" + transformers_dir.mkdir(parents=True, exist_ok=True) + with open(transformers_dir / "__init__.py", "w") as f: + init_lines = ["from .utils import cached_file, is_torch_available"] + init_lines.extend( + [f"from .models.{model} import {cls}Config, {cls}Model" for model, cls in zip(models, class_names)] + ) + f.write("\n".join(init_lines) + "\n") + with open(transformers_dir / "configuration_utils.py", "w") as f: + f.write("from .utils import cached_file\n\ncode") + with open(transformers_dir / "modeling_utils.py", "w") as f: + f.write("from .utils import cached_file\n\ncode") + + utils_dir = tmp_dir / "src" / "transformers" / "utils" + utils_dir.mkdir(exist_ok=True) + with open(utils_dir / "__init__.py", "w") as f: + f.write("from .hub import cached_file\nfrom .imports import is_torch_available\n") + with open(utils_dir / "hub.py", "w") as f: + f.write("import huggingface_hub\n\ncode") + with open(utils_dir / "imports.py", "w") as f: + f.write("code") + + model_dir = tmp_dir / "src" / "transformers" / "models" + model_dir.mkdir(parents=True, exist_ok=True) + with open(model_dir / "__init__.py", "w") as f: + f.write("\n".join([f"import {model}" for model in 
models])) + + for model, cls in zip(models, class_names): + model_dir = tmp_dir / "src" / "transformers" / "models" / model + model_dir.mkdir(parents=True, exist_ok=True) + with open(model_dir / "__init__.py", "w") as f: + f.write(f"from .configuration_{model} import {cls}Config\nfrom .modeling_{model} import {cls}Model\n") + with open(model_dir / f"configuration_{model}.py", "w") as f: + f.write("from ...configuration_utils import PretrainedConfig\ncode") + with open(model_dir / f"modeling_{model}.py", "w") as f: + modeling_code = BERT_MODEL_FILE.replace("bert", model).replace("Bert", cls) + f.write(modeling_code) + + test_dir = tmp_dir / "tests" + test_dir.mkdir(exist_ok=True) + with open(test_dir / "test_modeling_common.py", "w") as f: + f.write("from transformers.modeling_utils import PreTrainedModel\ncode") + + for model, cls in zip(models, class_names): + test_model_dir = test_dir / "models" / model + test_model_dir.mkdir(parents=True, exist_ok=True) + (test_model_dir / "__init__.py").touch() + with open(test_model_dir / f"test_modeling_{model}.py", "w") as f: + f.write( + f"from transformers import {cls}Config, {cls}Model\nfrom ...test_modeling_common import ModelTesterMixin\n\ncode" + ) + + repo.index.add(["src", "tests"]) + repo.index.commit("Initial commit") + repo.create_head("main") + repo.head.reference = repo.refs.main + repo.delete_head("master") + return repo + + +@contextmanager +def patch_transformer_repo_path(new_folder): + """ + Temporarily patches the variables defines in `tests_fetcher` to use a different location for the repo. 
+ """ + old_repo_path = tests_fetcher.PATH_TO_REPO + tests_fetcher.PATH_TO_REPO = Path(new_folder).resolve() + tests_fetcher.PATH_TO_TRANFORMERS = tests_fetcher.PATH_TO_REPO / "src/transformers" + tests_fetcher.PATH_TO_TESTS = tests_fetcher.PATH_TO_REPO / "tests" + try: + yield + finally: + tests_fetcher.PATH_TO_REPO = old_repo_path + tests_fetcher.PATH_TO_TRANFORMERS = tests_fetcher.PATH_TO_REPO / "src/transformers" + tests_fetcher.PATH_TO_TESTS = tests_fetcher.PATH_TO_REPO / "tests" + + +def commit_changes(filenames, contents, repo, commit_message="Commit"): + """ + Commit new `contents` to `filenames` inside a given `repo`. + """ + if not isinstance(filenames, list): + filenames = [filenames] + if not isinstance(contents, list): + contents = [contents] + + folder = Path(repo.working_dir) + for filename, content in zip(filenames, contents): + with open(folder / filename, "w") as f: + f.write(content) + repo.index.add(filenames) + commit = repo.index.commit(commit_message) + return commit.hexsha + + +class TestFetcherTester(unittest.TestCase): + def test_checkout_commit(self): + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + repo = create_tmp_repo(tmp_folder) + initial_sha = repo.head.commit.hexsha + new_sha = commit_changes(BERT_MODELING_FILE, BERT_MODEL_FILE_NEW_DOCSTRING, repo) + + assert repo.head.commit.hexsha == new_sha + with checkout_commit(repo, initial_sha): + assert repo.head.commit.hexsha == initial_sha + with open(tmp_folder / BERT_MODELING_FILE) as f: + assert f.read() == BERT_MODEL_FILE + + assert repo.head.commit.hexsha == new_sha + with open(tmp_folder / BERT_MODELING_FILE) as f: + assert f.read() == BERT_MODEL_FILE_NEW_DOCSTRING + def test_clean_code(self): # Clean code removes all strings in triple quotes - self.assertEqual(clean_code('"""\nDocstring\n"""\ncode\n"""Long string"""\ncode\n'), "code\ncode") - self.assertEqual(clean_code("'''\nDocstring\n'''\ncode\n'''Long string'''\ncode\n'''"), "code\ncode") + 
assert clean_code('"""\nDocstring\n"""\ncode\n"""Long string"""\ncode\n') == "code\ncode" + assert clean_code("'''\nDocstring\n'''\ncode\n'''Long string'''\ncode\n'''") == "code\ncode" # Clean code removes all comments - self.assertEqual(clean_code("code\n# Comment\ncode"), "code\ncode") - self.assertEqual(clean_code("code # inline comment\ncode"), "code \ncode") + assert clean_code("code\n# Comment\ncode") == "code\ncode" + assert clean_code("code # inline comment\ncode") == "code \ncode" - def test_checkout_commit(self): - repo = Repo(git_repo_path) - self.assertNotEqual(repo.head.commit.hexsha, GIT_TEST_SHA) - with checkout_commit(repo, GIT_TEST_SHA): - self.assertEqual(repo.head.commit.hexsha, GIT_TEST_SHA) - self.assertNotEqual(repo.head.commit.hexsha, GIT_TEST_SHA) + def test_get_all_tests(self): + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + create_tmp_repo(tmp_folder) + with patch_transformer_repo_path(tmp_folder): + assert get_all_tests() == ["tests/models/bert", "tests/test_modeling_common.py"] + + def test_get_all_tests_on_full_repo(self): + all_tests = get_all_tests() + assert "tests/models/albert" in all_tests + assert "tests/models/bert" in all_tests + assert "tests/repo_utils" in all_tests + assert "tests/test_pipeline_mixin.py" in all_tests + assert "tests/models" not in all_tests + assert "tests/__pycache__" not in all_tests + assert "tests/models/albert/test_modeling_albert.py" not in all_tests + assert "tests/repo_utils/test_tests_fetcher.py" not in all_tests + + def test_diff_is_docstring_only(self): + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + repo = create_tmp_repo(tmp_folder) + + branching_point = repo.refs.main.commit + bert_file = BERT_MODELING_FILE + commit_changes(bert_file, BERT_MODEL_FILE_NEW_DOCSTRING, repo) + assert diff_is_docstring_only(repo, branching_point, bert_file) + + commit_changes(bert_file, BERT_MODEL_FILE_NEW_CODE, repo) + assert not 
diff_is_docstring_only(repo, branching_point, bert_file) + + def test_get_diff(self): + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + repo = create_tmp_repo(tmp_folder) + + initial_commit = repo.refs.main.commit + bert_file = BERT_MODELING_FILE + commit_changes(bert_file, BERT_MODEL_FILE_NEW_DOCSTRING, repo) + assert get_diff(repo, repo.head.commit, repo.head.commit.parents) == [] + + commit_changes(bert_file, BERT_MODEL_FILE_NEW_DOCSTRING + "\n# Adding a comment\n", repo) + assert get_diff(repo, repo.head.commit, repo.head.commit.parents) == [] + + commit_changes(bert_file, BERT_MODEL_FILE_NEW_CODE, repo) + assert get_diff(repo, repo.head.commit, repo.head.commit.parents) == [ + "src/transformers/models/bert/modeling_bert.py" + ] + + commit_changes("src/transformers/utils/hub.py", "import huggingface_hub\n\nnew code", repo) + assert get_diff(repo, repo.head.commit, repo.head.commit.parents) == ["src/transformers/utils/hub.py"] + assert get_diff(repo, repo.head.commit, [initial_commit]) == [ + "src/transformers/models/bert/modeling_bert.py", + "src/transformers/utils/hub.py", + ] + + def test_extract_imports_relative(self): + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + create_tmp_repo(tmp_folder) + + expected_bert_imports = [ + ("src/transformers/modeling_utils.py", ["PreTrainedModel"]), + ("src/transformers/utils/__init__.py", ["is_torch_available"]), + ("src/transformers/models/bert/configuration_bert.py", ["BertConfig"]), + ] + expected_utils_imports = [ + ("src/transformers/utils/hub.py", ["cached_file"]), + ("src/transformers/utils/imports.py", ["is_torch_available"]), + ] + with patch_transformer_repo_path(tmp_folder): + assert extract_imports(BERT_MODELING_FILE) == expected_bert_imports + assert extract_imports("src/transformers/utils/__init__.py") == expected_utils_imports + + with open(tmp_folder / BERT_MODELING_FILE, "w") as f: + f.write( + "from ...utils import cached_file, 
is_torch_available\nfrom .configuration_bert import BertConfig\n" + ) + expected_bert_imports = [ + ("src/transformers/utils/__init__.py", ["cached_file", "is_torch_available"]), + ("src/transformers/models/bert/configuration_bert.py", ["BertConfig"]), + ] + with patch_transformer_repo_path(tmp_folder): + assert extract_imports(BERT_MODELING_FILE) == expected_bert_imports + + # Test with multi-line imports + with open(tmp_folder / BERT_MODELING_FILE, "w") as f: + f.write( + "from ...utils import (\n cached_file,\n is_torch_available\n)\nfrom .configuration_bert import BertConfig\n" + ) + expected_bert_imports = [ + ("src/transformers/models/bert/configuration_bert.py", ["BertConfig"]), + ("src/transformers/utils/__init__.py", ["cached_file", "is_torch_available"]), + ] + with patch_transformer_repo_path(tmp_folder): + assert extract_imports(BERT_MODELING_FILE) == expected_bert_imports + + def test_extract_imports_absolute(self): + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + create_tmp_repo(tmp_folder) + + with open(tmp_folder / BERT_MODELING_FILE, "w") as f: + f.write( + "from transformers.utils import cached_file, is_torch_available\nfrom transformers.models.bert.configuration_bert import BertConfig\n" + ) + expected_bert_imports = [ + ("src/transformers/utils/__init__.py", ["cached_file", "is_torch_available"]), + ("src/transformers/models/bert/configuration_bert.py", ["BertConfig"]), + ] + with patch_transformer_repo_path(tmp_folder): + assert extract_imports(BERT_MODELING_FILE) == expected_bert_imports + + # Test with multi-line imports + with open(tmp_folder / BERT_MODELING_FILE, "w") as f: + f.write( + "from transformers.utils import (\n cached_file,\n is_torch_available\n)\nfrom transformers.models.bert.configuration_bert import BertConfig\n" + ) + expected_bert_imports = [ + ("src/transformers/models/bert/configuration_bert.py", ["BertConfig"]), + ("src/transformers/utils/__init__.py", ["cached_file", 
"is_torch_available"]), + ] + with patch_transformer_repo_path(tmp_folder): + assert extract_imports(BERT_MODELING_FILE) == expected_bert_imports + + # Test with base imports + with open(tmp_folder / BERT_MODELING_FILE, "w") as f: + f.write( + "from transformers.utils import (\n cached_file,\n is_torch_available\n)\nfrom transformers import BertConfig\n" + ) + expected_bert_imports = [ + ("src/transformers/__init__.py", ["BertConfig"]), + ("src/transformers/utils/__init__.py", ["cached_file", "is_torch_available"]), + ] + with patch_transformer_repo_path(tmp_folder): + assert extract_imports(BERT_MODELING_FILE) == expected_bert_imports def test_get_module_dependencies(self): - bert_module = os.path.join(transformers_path, "models", "bert", "modeling_bert.py") - expected_deps = [ - "activations.py", - "modeling_outputs.py", - "modeling_utils.py", - "pytorch_utils.py", - "models/bert/configuration_bert.py", - ] - expected_deps = {os.path.join(transformers_path, f) for f in expected_deps} - repo = Repo(git_repo_path) - with checkout_commit(repo, GIT_TEST_SHA): - deps = get_module_dependencies(bert_module) - deps = {os.path.expanduser(f) for f in deps} - self.assertEqual(deps, expected_deps) + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + create_tmp_repo(tmp_folder) + + expected_bert_dependencies = [ + "src/transformers/modeling_utils.py", + "src/transformers/models/bert/configuration_bert.py", + "src/transformers/utils/imports.py", + ] + with patch_transformer_repo_path(tmp_folder): + assert get_module_dependencies(BERT_MODELING_FILE) == expected_bert_dependencies + + expected_test_bert_dependencies = [ + "tests/test_modeling_common.py", + "src/transformers/models/bert/configuration_bert.py", + "src/transformers/models/bert/modeling_bert.py", + ] + + with patch_transformer_repo_path(tmp_folder): + assert ( + get_module_dependencies("tests/models/bert/test_modeling_bert.py") + == expected_test_bert_dependencies + ) + + # Test with 
a submodule + (tmp_folder / "src/transformers/utils/logging.py").touch() + with open(tmp_folder / BERT_MODELING_FILE, "a") as f: + f.write("from ...utils import logging\n") + + expected_bert_dependencies = [ + "src/transformers/modeling_utils.py", + "src/transformers/models/bert/configuration_bert.py", + "src/transformers/utils/logging.py", + "src/transformers/utils/imports.py", + ] + with patch_transformer_repo_path(tmp_folder): + assert get_module_dependencies(BERT_MODELING_FILE) == expected_bert_dependencies + + # Test with an object non-imported in the init + create_tmp_repo(tmp_folder) + with open(tmp_folder / BERT_MODELING_FILE, "a") as f: + f.write("from ...utils import CONSTANT\n") + + expected_bert_dependencies = [ + "src/transformers/modeling_utils.py", + "src/transformers/models/bert/configuration_bert.py", + "src/transformers/utils/__init__.py", + "src/transformers/utils/imports.py", + ] + with patch_transformer_repo_path(tmp_folder): + assert get_module_dependencies(BERT_MODELING_FILE) == expected_bert_dependencies + + def test_create_reverse_dependency_tree(self): + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + create_tmp_repo(tmp_folder) + with patch_transformer_repo_path(tmp_folder): + tree = create_reverse_dependency_tree() + + init_edges = [ + "src/transformers/utils/hub.py", + "src/transformers/utils/imports.py", + "src/transformers/models/bert/configuration_bert.py", + "src/transformers/models/bert/modeling_bert.py", + ] + assert {f for f, g in tree if g == "src/transformers/__init__.py"} == set(init_edges) + + bert_edges = [ + "src/transformers/modeling_utils.py", + "src/transformers/utils/imports.py", + "src/transformers/models/bert/configuration_bert.py", + ] + assert {f for f, g in tree if g == "src/transformers/models/bert/modeling_bert.py"} == set(bert_edges) + + test_bert_edges = [ + "tests/test_modeling_common.py", + "src/transformers/models/bert/configuration_bert.py", + 
"src/transformers/models/bert/modeling_bert.py", + ] + assert {f for f, g in tree if g == "tests/models/bert/test_modeling_bert.py"} == set(test_bert_edges) + + def test_get_tree_starting_at(self): + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + create_tmp_repo(tmp_folder) + with patch_transformer_repo_path(tmp_folder): + edges = create_reverse_dependency_tree() + + bert_tree = get_tree_starting_at("src/transformers/models/bert/modeling_bert.py", edges) + config_utils_tree = get_tree_starting_at("src/transformers/configuration_utils.py", edges) + + expected_bert_tree = [ + "src/transformers/models/bert/modeling_bert.py", + [("src/transformers/models/bert/modeling_bert.py", "tests/models/bert/test_modeling_bert.py")], + ] + assert bert_tree == expected_bert_tree + + expected_config_tree = [ + "src/transformers/configuration_utils.py", + [("src/transformers/configuration_utils.py", "src/transformers/models/bert/configuration_bert.py")], + [ + ("src/transformers/models/bert/configuration_bert.py", "tests/models/bert/test_modeling_bert.py"), + ( + "src/transformers/models/bert/configuration_bert.py", + "src/transformers/models/bert/modeling_bert.py", + ), + ], + ] + # Order of the edges is random + assert [set(v) for v in config_utils_tree] == [set(v) for v in expected_config_tree] + + def test_print_tree_deps_of(self): + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + create_tmp_repo(tmp_folder) + + # There are two possible outputs since the order of the last two lines is non-deterministic. 
+ expected_std_out = """src/transformers/models/bert/modeling_bert.py + tests/models/bert/test_modeling_bert.py +src/transformers/configuration_utils.py + src/transformers/models/bert/configuration_bert.py + src/transformers/models/bert/modeling_bert.py + tests/models/bert/test_modeling_bert.py""" + + expected_std_out_2 = """src/transformers/models/bert/modeling_bert.py + tests/models/bert/test_modeling_bert.py +src/transformers/configuration_utils.py + src/transformers/models/bert/configuration_bert.py + tests/models/bert/test_modeling_bert.py + src/transformers/models/bert/modeling_bert.py""" + + with patch_transformer_repo_path(tmp_folder), CaptureStdout() as cs: + print_tree_deps_of("src/transformers/models/bert/modeling_bert.py") + print_tree_deps_of("src/transformers/configuration_utils.py") + + assert cs.out.strip() in [expected_std_out, expected_std_out_2] + + def test_create_reverse_dependency_map(self): + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + create_tmp_repo(tmp_folder) + with patch_transformer_repo_path(tmp_folder): + reverse_map = create_reverse_dependency_map() + + # impact of BERT modeling file (note that we stop at the inits and don't go down further) + expected_bert_deps = { + "src/transformers/__init__.py", + "src/transformers/models/bert/__init__.py", + "tests/models/bert/test_modeling_bert.py", + } + assert set(reverse_map["src/transformers/models/bert/modeling_bert.py"]) == expected_bert_deps + + # init gets the direct deps (and their recursive deps) + expected_init_deps = { + "src/transformers/utils/__init__.py", + "src/transformers/utils/hub.py", + "src/transformers/utils/imports.py", + "src/transformers/models/bert/__init__.py", + "src/transformers/models/bert/configuration_bert.py", + "src/transformers/models/bert/modeling_bert.py", + "src/transformers/configuration_utils.py", + "src/transformers/modeling_utils.py", + "tests/test_modeling_common.py", + "tests/models/bert/test_modeling_bert.py", + 
} + assert set(reverse_map["src/transformers/__init__.py"]) == expected_init_deps + + expected_init_deps = { + "src/transformers/__init__.py", + "src/transformers/models/bert/configuration_bert.py", + "src/transformers/models/bert/modeling_bert.py", + "tests/models/bert/test_modeling_bert.py", + } + assert set(reverse_map["src/transformers/models/bert/__init__.py"]) == expected_init_deps + + # Test that with more models init of bert only gets deps to bert. + create_tmp_repo(tmp_folder, models=["bert", "gpt2"]) + with patch_transformer_repo_path(tmp_folder): + reverse_map = create_reverse_dependency_map() + + # init gets the direct deps (and their recursive deps) + expected_init_deps = { + "src/transformers/__init__.py", + "src/transformers/models/bert/configuration_bert.py", + "src/transformers/models/bert/modeling_bert.py", + "tests/models/bert/test_modeling_bert.py", + } + assert set(reverse_map["src/transformers/models/bert/__init__.py"]) == expected_init_deps + + def test_create_module_to_test_map(self): + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + models = models = ["bert", "gpt2"] + [f"bert{i}" for i in range(10)] + create_tmp_repo(tmp_folder, models=models) + with patch_transformer_repo_path(tmp_folder): + test_map = create_module_to_test_map(filter_models=True) + + for model in models: + assert test_map[f"src/transformers/models/{model}/modeling_{model}.py"] == [ + f"tests/models/{model}/test_modeling_{model}.py" + ] + + # Init got filtered + expected_init_tests = { + "tests/test_modeling_common.py", + "tests/models/bert/test_modeling_bert.py", + "tests/models/gpt2/test_modeling_gpt2.py", + } + assert set(test_map["src/transformers/__init__.py"]) == expected_init_tests + + def test_infer_tests_to_run(self): + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + models = ["bert", "gpt2"] + [f"bert{i}" for i in range(10)] + repo = create_tmp_repo(tmp_folder, models=models) + + 
commit_changes("src/transformers/models/bert/modeling_bert.py", BERT_MODEL_FILE_NEW_CODE, repo) + + with patch_transformer_repo_path(tmp_folder): + infer_tests_to_run(tmp_folder / "test-output.txt", diff_with_last_commit=True) + with open(tmp_folder / "test-output.txt", "r") as f: + tests_to_run = f.read() + + assert tests_to_run == "tests/models/bert/test_modeling_bert.py" + + # Fake a new model addition + repo = create_tmp_repo(tmp_folder, models=models) + + branch = repo.create_head("new_model") + branch.checkout() + + with open(tmp_folder / "src/transformers/__init__.py", "a") as f: + f.write("from .models.t5 import T5Config, T5Model\n") + + model_dir = tmp_folder / "src/transformers/models/t5" + model_dir.mkdir(exist_ok=True) + + with open(model_dir / "__init__.py", "w") as f: + f.write("from .configuration_t5 import T5Config\nfrom .modeling_t5 import T5Model\n") + with open(model_dir / "configuration_t5.py", "w") as f: + f.write("from ...configuration_utils import PretrainedConfig\ncode") + with open(model_dir / "modeling_t5.py", "w") as f: + modeling_code = BERT_MODEL_FILE.replace("bert", "t5").replace("Bert", "T5") + f.write(modeling_code) + + test_dir = tmp_folder / "tests/models/t5" + test_dir.mkdir(exist_ok=True) + (test_dir / "__init__.py").touch() + with open(test_dir / "test_modeling_t5.py", "w") as f: + f.write( + "from transformers import T5Config, T5Model\nfrom ...test_modeling_common import ModelTesterMixin\n\ncode" + ) + + repo.index.add(["src", "tests"]) + repo.index.commit("Add T5 model") + + with patch_transformer_repo_path(tmp_folder): + infer_tests_to_run(tmp_folder / "test-output.txt") + with open(tmp_folder / "test-output.txt", "r") as f: + tests_to_run = f.read() + + expected_tests = { + "tests/models/bert/test_modeling_bert.py", + "tests/models/gpt2/test_modeling_gpt2.py", + "tests/models/t5/test_modeling_t5.py", + "tests/test_modeling_common.py", + } + assert set(tests_to_run.split(" ")) == expected_tests + + with 
patch_transformer_repo_path(tmp_folder): + infer_tests_to_run(tmp_folder / "test-output.txt", filter_models=False) + with open(tmp_folder / "test-output.txt", "r") as f: + tests_to_run = f.read() + + expected_tests = [f"tests/models/{name}/test_modeling_{name}.py" for name in models + ["t5"]] + expected_tests = set(expected_tests + ["tests/test_modeling_common.py"]) + assert set(tests_to_run.split(" ")) == expected_tests + + def test_infer_tests_to_run_with_test_modifs(self): + with tempfile.TemporaryDirectory() as tmp_folder: + tmp_folder = Path(tmp_folder) + models = ["bert", "gpt2"] + [f"bert{i}" for i in range(10)] + repo = create_tmp_repo(tmp_folder, models=models) + + commit_changes( + "tests/models/bert/test_modeling_bert.py", + "from transformers import BertConfig, BertModel\nfrom ...test_modeling_common import ModelTesterMixin\n\ncode1", + repo, + ) + + with patch_transformer_repo_path(tmp_folder): + infer_tests_to_run(tmp_folder / "test-output.txt", diff_with_last_commit=True) + with open(tmp_folder / "test-output.txt", "r") as f: + tests_to_run = f.read() + + assert tests_to_run == "tests/models/bert/test_modeling_bert.py" + + def test_parse_commit_message(self): + assert parse_commit_message("Normal commit") == {"skip": False, "no_filter": False, "test_all": False} + + assert parse_commit_message("[skip ci] commit") == {"skip": True, "no_filter": False, "test_all": False} + assert parse_commit_message("[ci skip] commit") == {"skip": True, "no_filter": False, "test_all": False} + assert parse_commit_message("[skip-ci] commit") == {"skip": True, "no_filter": False, "test_all": False} + assert parse_commit_message("[skip_ci] commit") == {"skip": True, "no_filter": False, "test_all": False} + + assert parse_commit_message("[no filter] commit") == {"skip": False, "no_filter": True, "test_all": False} + assert parse_commit_message("[no-filter] commit") == {"skip": False, "no_filter": True, "test_all": False} + assert parse_commit_message("[no_filter] commit") 
== {"skip": False, "no_filter": True, "test_all": False} + assert parse_commit_message("[filter-no] commit") == {"skip": False, "no_filter": True, "test_all": False} + + assert parse_commit_message("[test all] commit") == {"skip": False, "no_filter": False, "test_all": True} + assert parse_commit_message("[all test] commit") == {"skip": False, "no_filter": False, "test_all": True} + assert parse_commit_message("[test-all] commit") == {"skip": False, "no_filter": False, "test_all": True} + assert parse_commit_message("[all_test] commit") == {"skip": False, "no_filter": False, "test_all": True} diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index f71366d2183829..030555aece7365 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -27,6 +27,7 @@ import unittest import unittest.mock as mock import warnings +from collections import defaultdict from pathlib import Path from typing import Dict, List, Tuple @@ -1626,6 +1627,41 @@ def check_same_values(layer_1, layer_2): # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) + @require_safetensors + def test_can_use_safetensors(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model_tied = model_class(config) + with tempfile.TemporaryDirectory() as d: + try: + model_tied.save_pretrained(d, safe_serialization=True) + except Exception as e: + raise Exception(f"Class {model_class.__name__} cannot be saved using safetensors: {e}") + + model_reloaded, infos = model_class.from_pretrained(d, output_loading_info=True) + # Checking the state dicts are correct + reloaded_state = model_reloaded.state_dict() + for k, v in model_tied.state_dict().items(): + self.assertIn(k, reloaded_state, f"Key {k} is missing from reloaded") + torch.testing.assert_close( + v, reloaded_state[k], msg=lambda x: 
f"{model_class.__name__}: Tensor {k}: {x}" + ) + + # Checking the tensor sharing are correct + ptrs = defaultdict(list) + for k, v in model_tied.state_dict().items(): + ptrs[v.data_ptr()].append(k) + + shared_ptrs = {k: v for k, v in ptrs.items() if len(v) > 1} + + for _, shared_names in shared_ptrs.items(): + reloaded_ptrs = {reloaded_state[k].data_ptr() for k in shared_names} + self.assertEqual( + len(reloaded_ptrs), + 1, + f"The shared pointers are incorrect, found different pointers for keys {shared_names}", + ) + def test_tied_model_weights_key_ignore(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: diff --git a/utils/get_ci_error_statistics.py b/utils/get_ci_error_statistics.py index 09dc4d7dd226e7..e9dc52b5bbe0d4 100644 --- a/utils/get_ci_error_statistics.py +++ b/utils/get_ci_error_statistics.py @@ -66,12 +66,12 @@ def get_artifacts_links(worflow_run_id, token=None): def download_artifact(artifact_name, artifact_url, output_dir, token): """Download a GitHub Action artifact from a URL. - The URL is of the from `https://api.github.com/repos/huggingface/transformers/actions/artifacts/{ARTIFACT_ID}/zip`, + The URL is of the form `https://api.github.com/repos/huggingface/transformers/actions/artifacts/{ARTIFACT_ID}/zip`, but it can't be used to download directly. We need to get a redirect URL first. 
See https://docs.github.com/en/rest/actions/artifacts#download-an-artifact """ # Get the redirect URL first - cmd = f'curl -v -H "Accept: application/vnd.github+json" -H "Authorization: token {token}" {artifact_url}' + cmd = f'curl -v -H "Accept: application/vnd.github+json" -H "Authorization: Bearer {token}" {artifact_url}' output = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) o = output.stdout.decode("utf-8") lines = o.splitlines() diff --git a/utils/notification_service.py b/utils/notification_service.py index 0aefd5844d325c..7251b4d400c935 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -590,23 +590,20 @@ def post_reply(self): time.sleep(1) -def retrieve_artifact(name: str, gpu: Optional[str]): +def retrieve_artifact(artifact_path: str, gpu: Optional[str]): if gpu not in [None, "single", "multi"]: raise ValueError(f"Invalid GPU for artifact. Passed GPU: `{gpu}`.") - if gpu is not None: - name = f"{gpu}-gpu_{name}" - _artifact = {} - if os.path.exists(name): - files = os.listdir(name) + if os.path.exists(artifact_path): + files = os.listdir(artifact_path) for file in files: try: - with open(os.path.join(name, file)) as f: + with open(os.path.join(artifact_path, file)) as f: _artifact[file.split(".")[0]] = f.read() except UnicodeDecodeError as e: - raise ValueError(f"Could not open {os.path.join(name, file)}.") from e + raise ValueError(f"Could not open {os.path.join(artifact_path, file)}.") from e return _artifact @@ -629,8 +626,14 @@ def add_path(self, path: str, gpu: str = None): directories = filter(os.path.isdir, os.listdir()) for directory in directories: - if directory.startswith("single-gpu"): - artifact_name = directory[len("single-gpu") + 1 :] + artifact_name = directory + + name_parts = artifact_name.split("_postfix_") + if len(name_parts) > 1: + artifact_name = name_parts[0] + + if artifact_name.startswith("single-gpu"): + artifact_name = artifact_name[len("single-gpu") + 1 :] if 
artifact_name in _available_artifacts: _available_artifacts[artifact_name].single_gpu = True @@ -639,7 +642,7 @@ def add_path(self, path: str, gpu: str = None): _available_artifacts[artifact_name].add_path(directory, gpu="single") - elif directory.startswith("multi-gpu"): + elif artifact_name.startswith("multi-gpu"): artifact_name = directory[len("multi-gpu") + 1 :] if artifact_name in _available_artifacts: @@ -649,7 +652,6 @@ def add_path(self, path: str, gpu: str = None): _available_artifacts[artifact_name].add_path(directory, gpu="multi") else: - artifact_name = directory if artifact_name not in _available_artifacts: _available_artifacts[artifact_name] = Artifact(artifact_name) @@ -805,10 +807,12 @@ def prepare_reports(title, header, reports, to_truncate=True): framework, version = ci_event.replace("Past CI - ", "").split("-") framework = "PyTorch" if framework == "pytorch" else "TensorFlow" job_name_prefix = f"{framework} {version}" + elif ci_event.startswith("Nightly CI"): + job_name_prefix = "Nightly CI" for model in model_results.keys(): for artifact_path in available_artifacts[f"run_all_tests_gpu_{model}_test_reports"].paths: - artifact = retrieve_artifact(artifact_path["name"], artifact_path["gpu"]) + artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"]) if "stats" in artifact: # Link to the GitHub Action job # The job names use `matrix.folder` which contain things like `models/bert` instead of `models_bert` @@ -901,7 +905,7 @@ def prepare_reports(title, header, reports, to_truncate=True): else: additional_results[key]["job_link"][artifact_path["gpu"]] = github_actions_job_links.get(key) - artifact = retrieve_artifact(artifact_path["name"], artifact_path["gpu"]) + artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"]) stacktraces = handle_stacktraces(artifact["failures_line"]) failed, success, time_spent = handle_test_results(artifact["stats"]) diff --git a/utils/past_ci_versions.py b/utils/past_ci_versions.py index 
c50bbb9b14c98e..61495ab2a46fcd 100644 --- a/utils/past_ci_versions.py +++ b/utils/past_ci_versions.py @@ -4,6 +4,18 @@ past_versions_testing = { "pytorch": { + "1.13": { + "torch": "1.13.1", + "torchvision": "0.14.1", + "torchaudio": "0.13.1", + "python": 3.9, + "cuda": "cu116", + "install": ( + "python3 -m pip install --no-cache-dir -U torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1" + " --extra-index-url https://download.pytorch.org/whl/cu116" + ), + "base_image": "nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04", + }, "1.12": { "torch": "1.12.1", "torchvision": "0.13.1", @@ -14,6 +26,7 @@ "python3 -m pip install --no-cache-dir -U torch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1" " --extra-index-url https://download.pytorch.org/whl/cu113" ), + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, "1.11": { "torch": "1.11.0", @@ -25,6 +38,7 @@ "python3 -m pip install --no-cache-dir -U torch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0" " --extra-index-url https://download.pytorch.org/whl/cu113" ), + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, "1.10": { "torch": "1.10.2", @@ -36,6 +50,7 @@ "python3 -m pip install --no-cache-dir -U torch==1.10.2 torchvision==0.11.3 torchaudio==0.10.2" " --extra-index-url https://download.pytorch.org/whl/cu113" ), + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, # torchaudio < 0.10 has no CUDA-enabled binary distributions "1.9": { @@ -48,87 +63,44 @@ "python3 -m pip install --no-cache-dir -U torch==1.9.1 torchvision==0.10.1 torchaudio==0.9.1" " --extra-index-url https://download.pytorch.org/whl/cu111" ), + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, - "1.8": { - "torch": "1.8.1", - "torchvision": "0.9.1", - "torchaudio": "0.8.1", - "python": 3.9, - "cuda": "cu111", - "install": ( - "python3 -m pip install --no-cache-dir -U torch==1.8.1 torchvision==0.9.1 torchaudio==0.8.1" - " --extra-index-url https://download.pytorch.org/whl/cu111" - ), - }, - "1.7": { - 
"torch": "1.7.1", - "torchvision": "0.8.2", - "torchaudio": "0.7.2", - "python": 3.9, - "cuda": "cu110", - "install": ( - "python3 -m pip install --no-cache-dir -U torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2" - " --extra-index-url https://download.pytorch.org/whl/cu110" - ), - }, - "1.6": { - "torch": "1.6.0", - "torchvision": "0.7.0", - "torchaudio": "0.6.0", - "python": 3.8, - "cuda": "cu101", - "install": ( - "python3 -m pip install --no-cache-dir -U torch==1.6.0 torchvision==0.7.0 torchaudio==0.6.0" - " --extra-index-url https://download.pytorch.org/whl/cu101" - ), + }, + "tensorflow": { + "2.11": { + "tensorflow": "2.11.1", + "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.11.1", + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, - "1.5": { - "torch": "1.5.1", - "torchvision": "0.6.1", - "torchaudio": "0.5.1", - "python": 3.8, - "cuda": "cu101", - "install": ( - "python3 -m pip install --no-cache-dir -U torch==1.5.1 torchvision==0.6.1 torchaudio==0.5.1" - " --extra-index-url https://download.pytorch.org/whl/cu101" - ), + "2.10": { + "tensorflow": "2.10.1", + "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.10.1", + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, - "1.4": { - "torch": "1.4.0", - "torchvision": "0.5.0", - "torchaudio": "0.4.0", - "python": 3.8, - "cuda": "cu100", - "install": ( - "python3 -m pip install --no-cache-dir -U torch==1.4.0 torchvision==0.5.0 torchaudio==0.4.0" - " --extra-index-url https://download.pytorch.org/whl/cu100" - ), + "2.9": { + "tensorflow": "2.9.3", + "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.9.3", + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, - }, - "tensorflow": { "2.8": { "tensorflow": "2.8.2", "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.8.2", + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, "2.7": { "tensorflow": "2.7.3", "install": "python3 -m pip install 
--no-cache-dir -U tensorflow==2.7.3", + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, "2.6": { "tensorflow": "2.6.5", "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.6.5", + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, "2.5": { "tensorflow": "2.5.3", "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.5.3", - }, - # need another `nvidia:cuda` docker image, otherwise GPU not working - "2.4": { - "tensorflow": "2.4.4", - "install": "python3 -m pip install --no-cache-dir -U tensorflow==2.4.4", - # This should be specified as a docker build argument. - # We keep the information here for reference only. - "base_docker": "nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04", + "base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04", }, }, } diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index 85a8e2e198f873..2a6ac35ce4f9ba 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -13,6 +13,27 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Welcome to tests_fetcher V2. +This util is designed to fetch tests to run on a PR so that only the tests impacted by the modifications are run, and +when too many models are being impacted, only run the tests of a subset of core models. It works like this. + +Stage 1: Identify the modified files. This takes all the files from the branching point to the current commit (so +all modifications in a PR, not just the last commit) but excludes modifications that are on docstrings or comments +only. + +Stage 2: Extract the tests to run. This is done by looking at the imports in each module and test file: if module A +imports module B, then changing module B impacts module A, so the tests using module A should be run. We thus get the +dependencies of each model and then recursively builds the 'reverse' map of dependencies to get all modules and tests +impacted by a given file. 
We then only keep the tests (and only the code models tests if there are too many modules). + +Caveats: + - This module only filters tests by files (not individual tests) so it's better to have tests for different things + in different files. + - This module assumes inits are just importing things, not really building objects, so it's better to structure + them this way and move objects building in separate submodules. +""" + import argparse import collections import json @@ -24,13 +45,36 @@ from git import Repo -# This script is intended to be run from the root of the repo but you can adapt this constant if you need to. -PATH_TO_TRANFORMERS = "." - -# A temporary way to trigger all pipeline tests contained in model test files after PR #21516 -all_model_test_files = [str(x) for x in Path("tests/models/").glob("**/**/test_modeling_*.py")] - -all_pipeline_test_files = [str(x) for x in Path("tests/pipelines/").glob("**/test_pipelines_*.py")] +PATH_TO_REPO = Path(__file__).parent.parent.resolve() +PATH_TO_TRANFORMERS = PATH_TO_REPO / "src/transformers" +PATH_TO_TESTS = PATH_TO_REPO / "tests" + +# List here the models to always test. +IMPORTANT_MODELS = [ + # Most downloaded models + "bert", + "clip", + "t5", + "xlm-roberta", + "gpt2", + "bart", + "mpnet", + "gpt-j", + "wav2vec2", + "deberta-v2", + "layoutlm", + "opt", + "longformer", + "vit", + # Pipeline-specific model (to be sure each pipeline has one model in this list) + "tapas", + "vilt", + "clap", + "detr", + "owlvit", + "dpt", + "videomae", +] @contextmanager @@ -79,17 +123,21 @@ def get_all_tests(): - folders under `tests/models`: `bert`, `gpt2`, etc. - test files under `tests`: `test_modeling_common.py`, `test_tokenization_common.py`, etc. 
""" - test_root_dir = os.path.join(PATH_TO_TRANFORMERS, "tests") # test folders/files directly under `tests` folder - tests = os.listdir(test_root_dir) - tests = sorted(filter(lambda x: os.path.isdir(x) or x.startswith("tests/test_"), [f"tests/{x}" for x in tests])) + tests = os.listdir(PATH_TO_TESTS) + tests = [f"tests/{f}" for f in tests if "__pycache__" not in f] + tests = sorted([f for f in tests if (PATH_TO_REPO / f).is_dir() or f.startswith("tests/test_")]) # model specific test folders - model_tests_folders = os.listdir(os.path.join(test_root_dir, "models")) - model_test_folders = sorted(filter(os.path.isdir, [f"tests/models/{x}" for x in model_tests_folders])) + model_test_folders = os.listdir(PATH_TO_TESTS / "models") + model_test_folders = [f"tests/models/{f}" for f in model_test_folders if "__pycache__" not in f] + model_test_folders = sorted([f for f in model_test_folders if (PATH_TO_REPO / f).is_dir()]) tests.remove("tests/models") + # Sagemaker tests are not meant to be run on the CI. + if "tests/sagemaker" in tests: + tests.remove("tests/sagemaker") tests = model_test_folders + tests return tests @@ -99,11 +147,12 @@ def diff_is_docstring_only(repo, branching_point, filename): """ Check if the diff is only in docstrings in a filename. 
""" + folder = Path(repo.working_dir) with checkout_commit(repo, branching_point): - with open(filename, "r", encoding="utf-8") as f: + with open(folder / filename, "r", encoding="utf-8") as f: old_content = f.read() - with open(filename, "r", encoding="utf-8") as f: + with open(folder / filename, "r", encoding="utf-8") as f: new_content = f.read() old_content_clean = clean_code(old_content) @@ -112,31 +161,6 @@ def diff_is_docstring_only(repo, branching_point, filename): return old_content_clean == new_content_clean -def get_modified_python_files(diff_with_last_commit=False): - """ - Return a list of python files that have been modified between: - - - the current head and the main branch if `diff_with_last_commit=False` (default) - - the current head and its parent commit otherwise. - """ - repo = Repo(PATH_TO_TRANFORMERS) - - if not diff_with_last_commit: - print(f"main is at {repo.refs.main.commit}") - print(f"Current head is at {repo.head.commit}") - - branching_commits = repo.merge_base(repo.refs.main, repo.head) - for commit in branching_commits: - print(f"Branching commit: {commit}") - return get_diff(repo, repo.head.commit, branching_commits) - else: - print(f"main is at {repo.head.commit}") - parent_commits = repo.head.commit.parents - for commit in parent_commits: - print(f"Parent commit: {commit}") - return get_diff(repo, repo.head.commit, parent_commits) - - def get_diff(repo, base_commit, commits): """ Get's the diff between one or several commits and the head of the repository. @@ -166,96 +190,173 @@ def get_diff(repo, base_commit, commits): return code_diff -def get_module_dependencies(module_fname): +def get_modified_python_files(diff_with_last_commit=False): + """ + Return a list of python files that have been modified between: + + - the current head and the main branch if `diff_with_last_commit=False` (default) + - the current head and its parent commit otherwise. 
+ """ + repo = Repo(PATH_TO_REPO) + + if not diff_with_last_commit: + print(f"main is at {repo.refs.main.commit}") + print(f"Current head is at {repo.head.commit}") + + branching_commits = repo.merge_base(repo.refs.main, repo.head) + for commit in branching_commits: + print(f"Branching commit: {commit}") + return get_diff(repo, repo.head.commit, branching_commits) + else: + print(f"main is at {repo.head.commit}") + parent_commits = repo.head.commit.parents + for commit in parent_commits: + print(f"Parent commit: {commit}") + return get_diff(repo, repo.head.commit, parent_commits) + + +# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line. +# \s*from\s+(\.+\S+)\s+import\s+([^\n]+) -> Line only contains from .xxx import yyy and we catch .xxx and yyy +# (?=\n) -> Look-ahead to a new line. We can't just put \n here or using find_all on this re will only catch every +# other import. +_re_single_line_relative_imports = re.compile(r"(?:^|\n)\s*from\s+(\.+\S+)\s+import\s+([^\n]+)(?=\n)") +# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line. +# \s*from\s+(\.+\S+)\s+import\s+\(([^\)]+)\) -> Line continues with from .xxx import (yyy) and we catch .xxx and yyy +# yyy will take multiple lines otherwise there wouldn't be parenthesis. +_re_multi_line_relative_imports = re.compile(r"(?:^|\n)\s*from\s+(\.+\S+)\s+import\s+\(([^\)]+)\)") +# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line. +# \s*from\s+transformers(\S*)\s+import\s+([^\n]+) -> Line only contains from transformers.xxx import yyy and we catch +# .xxx and yyy +# (?=\n) -> Look-ahead to a new line. We can't just put \n here or using find_all on this re will only catch every +# other import. +_re_single_line_direct_imports = re.compile(r"(?:^|\n)\s*from\s+transformers(\S*)\s+import\s+([^\n]+)(?=\n)") +# (:?^|\n) -> Non-catching group for the beginning of the doc or a new line. 
+# \s*from\s+transformers(\S*)\s+import\s+\(([^\)]+)\) -> Line continues with from transformers.xxx import (yyy) and we +# catch .xxx and yyy. yyy will take multiple lines otherwise there wouldn't be parenthesis. +_re_multi_line_direct_imports = re.compile(r"(?:^|\n)\s*from\s+transformers(\S*)\s+import\s+\(([^\)]+)\)") + + +def extract_imports(module_fname, cache=None): """ - Get the dependencies of a module. + Get the imports a given module makes. This takes a module filename and returns the list of module filenames + imported in the module with the objects imported in that module filename. """ - with open(os.path.join(PATH_TO_TRANFORMERS, module_fname), "r", encoding="utf-8") as f: + if cache is not None and module_fname in cache: + return cache[module_fname] + + with open(PATH_TO_REPO / module_fname, "r", encoding="utf-8") as f: content = f.read() - module_parts = module_fname.split(os.path.sep) + # Filter out all docstrings to not get imports in code examples. + splits = content.split('"""') + content = "".join(splits[::2]) + + module_parts = str(module_fname).split(os.path.sep) imported_modules = [] # Let's start with relative imports - relative_imports = re.findall(r"from\s+(\.+\S+)\s+import\s+([^\n]+)\n", content) - relative_imports = [mod for mod, imp in relative_imports if "# tests_ignore" not in imp] - for imp in relative_imports: + relative_imports = _re_single_line_relative_imports.findall(content) + relative_imports = [ + (mod, imp) for mod, imp in relative_imports if "# tests_ignore" not in imp and imp.strip() != "(" + ] + multiline_relative_imports = _re_multi_line_relative_imports.findall(content) + relative_imports += [(mod, imp) for mod, imp in multiline_relative_imports if "# tests_ignore" not in imp] + + for module, imports in relative_imports: level = 0 - while imp.startswith("."): - imp = imp[1:] + while module.startswith("."): + module = module[1:] level += 1 - if len(imp) > 0: - dep_parts = module_parts[: len(module_parts) - level] + 
imp.split(".") + if len(module) > 0: + dep_parts = module_parts[: len(module_parts) - level] + module.split(".") else: - dep_parts = module_parts[: len(module_parts) - level] + ["__init__.py"] + dep_parts = module_parts[: len(module_parts) - level] imported_module = os.path.sep.join(dep_parts) - # We ignore the main init import as it's only for the __version__ that it's done - # and it would add everything as a dependency. - if not imported_module.endswith("transformers/__init__.py"): - imported_modules.append(imported_module) + imported_modules.append((imported_module, [imp.strip() for imp in imports.split(",")])) # Let's continue with direct imports - # The import from the transformers module are ignored for the same reason we ignored the - # main init before. - direct_imports = re.findall(r"from\s+transformers\.(\S+)\s+import\s+([^\n]+)\n", content) - direct_imports = [mod for mod, imp in direct_imports if "# tests_ignore" not in imp] - for imp in direct_imports: - import_parts = imp.split(".") - dep_parts = ["src", "transformers"] + import_parts - imported_modules.append(os.path.sep.join(dep_parts)) + direct_imports = _re_single_line_direct_imports.findall(content) + direct_imports = [(mod, imp) for mod, imp in direct_imports if "# tests_ignore" not in imp and imp.strip() != "("] + multiline_direct_imports = _re_multi_line_direct_imports.findall(content) + direct_imports += [(mod, imp) for mod, imp in multiline_direct_imports if "# tests_ignore" not in imp] - # Now let's just check that we have proper module files, or append an init for submodules - dependencies = [] - for imported_module in imported_modules: - if os.path.isfile(os.path.join(PATH_TO_TRANFORMERS, f"{imported_module}.py")): - dependencies.append(f"{imported_module}.py") - elif os.path.isdir(os.path.join(PATH_TO_TRANFORMERS, imported_module)) and os.path.isfile( - os.path.sep.join([PATH_TO_TRANFORMERS, imported_module, "__init__.py"]) - ): - dependencies.append(os.path.sep.join([imported_module, 
"__init__.py"])) - return dependencies + for module, imports in direct_imports: + import_parts = module.split(".")[1:] # ignore the first . + dep_parts = ["src", "transformers"] + import_parts + imported_module = os.path.sep.join(dep_parts) + imported_modules.append((imported_module, [imp.strip() for imp in imports.split(",")])) + result = [] + for module_file, imports in imported_modules: + if (PATH_TO_REPO / f"{module_file}.py").is_file(): + module_file = f"{module_file}.py" + elif (PATH_TO_REPO / module_file).is_dir() and (PATH_TO_REPO / module_file / "__init__.py").is_file(): + module_file = os.path.sep.join([module_file, "__init__.py"]) + imports = [imp for imp in imports if len(imp) > 0 and re.match("^[A-Za-z0-9_]*$", imp)] + if len(imports) > 0: + result.append((module_file, imports)) -def get_test_dependencies(test_fname): - """ - Get the dependencies of a test file. - """ - with open(os.path.join(PATH_TO_TRANFORMERS, test_fname), "r", encoding="utf-8") as f: - content = f.read() + if cache is not None: + cache[module_fname] = result - # Tests only have relative imports for other test files - # TODO Sylvain: handle relative imports cleanly - relative_imports = re.findall(r"from\s+(\.\S+)\s+import\s+([^\n]+)\n", content) - relative_imports = [test for test, imp in relative_imports if "# tests_ignore" not in imp] + return result - def _convert_relative_import_to_file(relative_import): - level = 0 - while relative_import.startswith("."): - level += 1 - relative_import = relative_import[1:] - directory = os.path.sep.join(test_fname.split(os.path.sep)[:-level]) - return os.path.join(directory, f"{relative_import.replace('.', os.path.sep)}.py") +def get_module_dependencies(module_fname, cache=None): + """ + Get the dependencies of a module from the module filename as a list of module filenames. 
This will resolve any + __init__ we pass: if we import from a submodule utils, the dependencies will be utils/foo.py and utils/bar.py (if + the objects imported actually come from utils.foo and utils.bar) not utils/__init__.py. + """ + dependencies = [] + imported_modules = extract_imports(module_fname, cache=cache) + # The while loop is to recursively traverse all inits we may encounter. + while len(imported_modules) > 0: + new_modules = [] + for module, imports in imported_modules: + # If we end up in an __init__ we are often not actually importing from this init (except in the case where + # the object is fully defined in the __init__) + if module.endswith("__init__.py"): + # So we get the imports from that init then try to find where our objects come from. + new_imported_modules = extract_imports(module, cache=cache) + for new_module, new_imports in new_imported_modules: + if any([i in new_imports for i in imports]): + if new_module not in dependencies: + new_modules.append((new_module, [i for i in new_imports if i in imports])) + imports = [i for i in imports if i not in new_imports] + if len(imports) > 0: + # If there are any objects left, they may be a submodule + path_to_module = PATH_TO_REPO / module.replace("__init__.py", "") + dependencies.extend( + [ + os.path.join(module.replace("__init__.py", ""), f"{i}.py") + for i in imports + if (path_to_module / f"{i}.py").is_file() + ] + ) + imports = [i for i in imports if not (path_to_module / f"{i}.py").is_file()] + if len(imports) > 0: + # Then if there are still objects left, they are fully defined in the init, so we keep it as a + # dependency. 
+ dependencies.append(module) + else: + dependencies.append(module) - dependencies = [_convert_relative_import_to_file(relative_import) for relative_import in relative_imports] - return [f for f in dependencies if os.path.isfile(os.path.join(PATH_TO_TRANFORMERS, f))] + imported_modules = new_modules + return dependencies def create_reverse_dependency_tree(): """ Create a list of all edges (a, b) which mean that modifying a impacts b with a going over all module and test files. """ - modules = [ - str(f.relative_to(PATH_TO_TRANFORMERS)) - for f in (Path(PATH_TO_TRANFORMERS) / "src/transformers").glob("**/*.py") - ] - module_edges = [(d, m) for m in modules for d in get_module_dependencies(m)] - - tests = [str(f.relative_to(PATH_TO_TRANFORMERS)) for f in (Path(PATH_TO_TRANFORMERS) / "tests").glob("**/*.py")] - test_edges = [(d, t) for t in tests for d in get_test_dependencies(t)] + cache = {} + all_modules = list(PATH_TO_TRANFORMERS.glob("**/*.py")) + list(PATH_TO_TESTS.glob("**/*.py")) + all_modules = [str(mod.relative_to(PATH_TO_REPO)) for mod in all_modules] + edges = [(dep, mod) for mod in all_modules for dep in get_module_dependencies(mod, cache=cache)] - return module_edges + test_edges + return list(set(edges)) def get_tree_starting_at(module, edges): @@ -264,13 +365,17 @@ def get_tree_starting_at(module, edges): starting at module], [list of edges starting at the preceding level], ...] 
""" vertices_seen = [module] - new_edges = [edge for edge in edges if edge[0] == module and edge[1] != module] + new_edges = [edge for edge in edges if edge[0] == module and edge[1] != module and "__init__.py" not in edge[1]] tree = [module] while len(new_edges) > 0: tree.append(new_edges) final_vertices = list({edge[1] for edge in new_edges}) vertices_seen.extend(final_vertices) - new_edges = [edge for edge in edges if edge[0] in final_vertices and edge[1] not in vertices_seen] + new_edges = [ + edge + for edge in edges + if edge[0] in final_vertices and edge[1] not in vertices_seen and "__init__.py" not in edge[1] + ] return tree @@ -308,290 +413,159 @@ def create_reverse_dependency_map(): Create the dependency map from module/test filename to the list of modules/tests that depend on it (even recursively). """ - modules = [ - str(f.relative_to(PATH_TO_TRANFORMERS)) - for f in (Path(PATH_TO_TRANFORMERS) / "src/transformers").glob("**/*.py") - ] - # We grab all the dependencies of each module. - direct_deps = {m: get_module_dependencies(m) for m in modules} - - # We add all the dependencies of each test file - tests = [str(f.relative_to(PATH_TO_TRANFORMERS)) for f in (Path(PATH_TO_TRANFORMERS) / "tests").glob("**/*.py")] - direct_deps.update({t: get_test_dependencies(t) for t in tests}) - - all_files = modules + tests + cache = {} + all_modules = list(PATH_TO_TRANFORMERS.glob("**/*.py")) + list(PATH_TO_TESTS.glob("**/*.py")) + all_modules = [str(mod.relative_to(PATH_TO_REPO)) for mod in all_modules] + direct_deps = {m: get_module_dependencies(m, cache=cache) for m in all_modules} # This recurses the dependencies something_changed = True while something_changed: something_changed = False - for m in all_files: + for m in all_modules: for d in direct_deps[m]: + if d.endswith("__init__.py"): + continue if d not in direct_deps: raise ValueError(f"KeyError:{d}. 
From {m}") - for dep in direct_deps[d]: - if dep not in direct_deps[m]: - direct_deps[m].append(dep) - something_changed = True + new_deps = set(direct_deps[d]) - set(direct_deps[m]) + if len(new_deps) > 0: + direct_deps[m].extend(list(new_deps)) + something_changed = True # Finally we can build the reverse map. reverse_map = collections.defaultdict(list) - for m in all_files: - if m.endswith("__init__.py"): - reverse_map[m].extend(direct_deps[m]) + for m in all_modules: for d in direct_deps[m]: reverse_map[d].append(m) + for m in [f for f in all_modules if f.endswith("__init__.py")]: + direct_deps = get_module_dependencies(m, cache=cache) + deps = sum([reverse_map[d] for d in direct_deps if not d.endswith("__init__.py")], direct_deps) + reverse_map[m] = list(set(deps) - {m}) + return reverse_map -# Any module file that has a test name which can't be inferred automatically from its name should go here. A better -# approach is to (re-)name the test file accordingly, and second best to add the correspondence map here. 
-SPECIAL_MODULE_TO_TEST_MAP = { - "commands/add_new_model_like.py": "utils/test_add_new_model_like.py", - "configuration_utils.py": "test_configuration_common.py", - "convert_graph_to_onnx.py": "onnx/test_onnx.py", - "data/data_collator.py": "trainer/test_data_collator.py", - "deepspeed.py": "deepspeed/", - "feature_extraction_sequence_utils.py": "test_sequence_feature_extraction_common.py", - "feature_extraction_utils.py": "test_feature_extraction_common.py", - "file_utils.py": ["utils/test_file_utils.py", "utils/test_model_output.py"], - "image_processing_utils.py": ["test_image_processing_common.py", "utils/test_image_processing_utils.py"], - "image_transforms.py": "test_image_transforms.py", - "utils/generic.py": ["utils/test_file_utils.py", "utils/test_model_output.py", "utils/test_generic.py"], - "utils/hub.py": "utils/test_hub_utils.py", - "modelcard.py": "utils/test_model_card.py", - "modeling_flax_utils.py": "test_modeling_flax_common.py", - "modeling_tf_utils.py": ["test_modeling_tf_common.py", "utils/test_modeling_tf_core.py"], - "modeling_utils.py": ["test_modeling_common.py", "utils/test_offline.py"], - "models/auto/modeling_auto.py": [ - "models/auto/test_modeling_auto.py", - "models/auto/test_modeling_tf_pytorch.py", - "models/bort/test_modeling_bort.py", - "models/dit/test_modeling_dit.py", - ], - "models/auto/modeling_flax_auto.py": "models/auto/test_modeling_flax_auto.py", - "models/auto/modeling_tf_auto.py": [ - "models/auto/test_modeling_tf_auto.py", - "models/auto/test_modeling_tf_pytorch.py", - "models/bort/test_modeling_tf_bort.py", - ], - "models/gpt2/modeling_gpt2.py": [ - "models/gpt2/test_modeling_gpt2.py", - "models/megatron_gpt2/test_modeling_megatron_gpt2.py", - ], - "models/dpt/modeling_dpt.py": [ - "models/dpt/test_modeling_dpt.py", - "models/dpt/test_modeling_dpt_hybrid.py", - ], - "optimization.py": "optimization/test_optimization.py", - "optimization_tf.py": "optimization/test_optimization_tf.py", - "pipelines/__init__.py": 
all_pipeline_test_files + all_model_test_files, - "pipelines/base.py": all_pipeline_test_files + all_model_test_files, - "pipelines/text2text_generation.py": [ - "pipelines/test_pipelines_text2text_generation.py", - "pipelines/test_pipelines_summarization.py", - "pipelines/test_pipelines_translation.py", - ], - "pipelines/zero_shot_classification.py": "pipelines/test_pipelines_zero_shot.py", - "testing_utils.py": "utils/test_skip_decorators.py", - "tokenization_utils.py": ["test_tokenization_common.py", "tokenization/test_tokenization_utils.py"], - "tokenization_utils_base.py": ["test_tokenization_common.py", "tokenization/test_tokenization_utils.py"], - "tokenization_utils_fast.py": [ - "test_tokenization_common.py", - "tokenization/test_tokenization_utils.py", - "tokenization/test_tokenization_fast.py", - ], - "trainer.py": [ - "trainer/test_trainer.py", - "extended/test_trainer_ext.py", - "trainer/test_trainer_distributed.py", - "trainer/test_trainer_tpu.py", - ], - "train_pt_utils.py": "trainer/test_trainer_utils.py", - "utils/versions.py": "utils/test_versions_utils.py", -} - - -def module_to_test_file(module_fname): - """ - Returns the name of the file(s) where `module_fname` is tested. - """ - splits = module_fname.split(os.path.sep) - - # Special map has priority - short_name = os.path.sep.join(splits[2:]) - if short_name in SPECIAL_MODULE_TO_TEST_MAP: - test_file = SPECIAL_MODULE_TO_TEST_MAP[short_name] - if isinstance(test_file, str): - return f"tests/{test_file}" - return [f"tests/{f}" for f in test_file] - - module_name = splits[-1] - # Fast tokenizers are tested in the same file as the slow ones. 
- if module_name.endswith("_fast.py"): - module_name = module_name.replace("_fast.py", ".py") - - # Special case for pipelines submodules - if len(splits) >= 2 and splits[-2] == "pipelines": - default_test_file = f"tests/pipelines/test_pipelines_{module_name}" - return [default_test_file] + all_model_test_files - # Special case for benchmarks submodules - elif len(splits) >= 2 and splits[-2] == "benchmark": - return ["tests/benchmark/test_benchmark.py", "tests/benchmark/test_benchmark_tf.py"] - # Special case for commands submodules - elif len(splits) >= 2 and splits[-2] == "commands": - return "tests/utils/test_cli.py" - # Special case for onnx submodules - elif len(splits) >= 2 and splits[-2] == "onnx": - return ["tests/onnx/test_features.py", "tests/onnx/test_onnx.py", "tests/onnx/test_onnx_v2.py"] - # Special case for utils (not the one in src/transformers, the ones at the root of the repo). - elif len(splits) > 0 and splits[0] == "utils": - default_test_file = f"tests/repo_utils/test_{module_name}" - elif len(splits) > 4 and splits[2] == "models": - default_test_file = f"tests/models/{splits[3]}/test_{module_name}" - elif len(splits) > 2 and splits[2].startswith("generation"): - default_test_file = f"tests/generation/test_{module_name}" - elif len(splits) > 2 and splits[2].startswith("trainer"): - default_test_file = f"tests/trainer/test_{module_name}" - else: - default_test_file = f"tests/utils/test_{module_name}" - - if os.path.isfile(default_test_file): - return default_test_file - - # Processing -> processor - if "processing" in default_test_file: - test_file = default_test_file.replace("processing", "processor") - if os.path.isfile(test_file): - return test_file - - -# This list contains the list of test files we expect never to be launched from a change in a module/util. Those are -# launched separately. 
-EXPECTED_TEST_FILES_NEVER_TOUCHED = [ - "tests/generation/test_framework_agnostic.py", # Mixins inherited by actual test classes - "tests/mixed_int8/test_mixed_int8.py", # Mixed-int8 bitsandbytes test - "tests/pipelines/test_pipelines_common.py", # Actually checked by the pipeline based file - "tests/sagemaker/test_single_node_gpu.py", # SageMaker test - "tests/sagemaker/test_multi_node_model_parallel.py", # SageMaker test - "tests/sagemaker/test_multi_node_data_parallel.py", # SageMaker test - "tests/test_pipeline_mixin.py", # Contains no test of its own (only the common tester class) - "tests/utils/test_doc_samples.py", # Doc tests -] +def create_module_to_test_map(reverse_map=None, filter_models=False): + """ + Extract the tests from the reverse_dependency_map and potentially filters the model tests. + """ + if reverse_map is None: + reverse_map = create_reverse_dependency_map() + test_map = {module: [f for f in deps if f.startswith("tests")] for module, deps in reverse_map.items()} + if not filter_models: + return test_map -def _print_list(l): - return "\n".join([f"- {f}" for f in l]) + num_model_tests = len(list(PATH_TO_TESTS.glob("models/*"))) + + def has_many_models(tests): + model_tests = {Path(t).parts[2] for t in tests if t.startswith("tests/models/")} + return len(model_tests) > num_model_tests // 2 + + def filter_tests(tests): + return [t for t in tests if not t.startswith("tests/models/") or Path(t).parts[2] in IMPORTANT_MODELS] + + return {module: (filter_tests(tests) if has_many_models(tests) else tests) for module, tests in test_map.items()} -def sanity_check(): +def check_imports_all_exist(): """ - Checks that all test files can be touched by a modification in at least one module/utils. This test ensures that - newly-added test files are properly mapped to some module or utils, so they can be run by the CI. + Isn't used per se by the test fetcher but might be used later as a quality check. Putting this here for now so the + code is not lost. 
""" - # Grab all module and utils - all_files = [ - str(p.relative_to(PATH_TO_TRANFORMERS)) - for p in (Path(PATH_TO_TRANFORMERS) / "src/transformers").glob("**/*.py") - ] - all_files += [ - str(p.relative_to(PATH_TO_TRANFORMERS)) for p in (Path(PATH_TO_TRANFORMERS) / "utils").glob("**/*.py") - ] + cache = {} + all_modules = list(PATH_TO_TRANFORMERS.glob("**/*.py")) + list(PATH_TO_TESTS.glob("**/*.py")) + all_modules = [str(mod.relative_to(PATH_TO_REPO)) for mod in all_modules] + direct_deps = {m: get_module_dependencies(m, cache=cache) for m in all_modules} - # Compute all the test files we get from those. - test_files_found = [] - for f in all_files: - test_f = module_to_test_file(f) - if test_f is not None: - if isinstance(test_f, str): - test_files_found.append(test_f) - else: - test_files_found.extend(test_f) - - # Some of the test files might actually be subfolders so we grab the tests inside. - test_files = [] - for test_f in test_files_found: - if os.path.isdir(os.path.join(PATH_TO_TRANFORMERS, test_f)): - test_files.extend( - [ - str(p.relative_to(PATH_TO_TRANFORMERS)) - for p in (Path(PATH_TO_TRANFORMERS) / test_f).glob("**/test*.py") - ] - ) + for module, deps in direct_deps.items(): + for dep in deps: + if not (PATH_TO_REPO / dep).is_file(): + print(f"{module} has dependency on {dep} which does not exist.") + + +def _print_list(l): + return "\n".join([f"- {f}" for f in l]) + + +def create_json_map(test_files_to_run, json_output_file): + if json_output_file is None: + return + + test_map = {} + for test_file in test_files_to_run: + # `test_file` is a path to a test folder/file, starting with `tests/`. 
For example, + # - `tests/models/bert/test_modeling_bert.py` or `tests/models/bert` + # - `tests/trainer/test_trainer.py` or `tests/trainer` + # - `tests/test_modeling_common.py` + names = test_file.split(os.path.sep) + if names[1] == "models": + # take the part like `models/bert` for modeling tests + key = os.path.sep.join(names[1:3]) + elif len(names) > 2 or not test_file.endswith(".py"): + # test folders under `tests` or python files under them + # take the part like tokenization, `pipeline`, etc. for other test categories + key = os.path.sep.join(names[1:2]) else: - test_files.append(test_f) + # common test files directly under `tests/` + key = "common" - # Compare to existing test files - existing_test_files = [ - str(p.relative_to(PATH_TO_TRANFORMERS)) for p in (Path(PATH_TO_TRANFORMERS) / "tests").glob("**/test*.py") - ] - not_touched_test_files = [f for f in existing_test_files if f not in test_files] + if key not in test_map: + test_map[key] = [] + test_map[key].append(test_file) - should_be_tested = set(not_touched_test_files) - set(EXPECTED_TEST_FILES_NEVER_TOUCHED) - if len(should_be_tested) > 0: - raise ValueError( - "The following test files are not currently associated with any module or utils files, which means they " - f"will never get run by the CI:\n{_print_list(should_be_tested)}\n. Make sure the names of these test " - "files match the name of the module or utils they are testing, or adapt the constant " - "`SPECIAL_MODULE_TO_TEST_MAP` in `utils/tests_fetcher.py` to add them. If your test file is triggered " - "separately and is not supposed to be run by the regular CI, add it to the " - "`EXPECTED_TEST_FILES_NEVER_TOUCHED` constant instead." 
- ) + # sort the keys & values + keys = sorted(test_map.keys()) + test_map = {k: " ".join(sorted(test_map[k])) for k in keys} + with open(json_output_file, "w", encoding="UTF-8") as fp: + json.dump(test_map, fp, ensure_ascii=False) -def infer_tests_to_run(output_file, diff_with_last_commit=False, filters=None, json_output_file=None): +def infer_tests_to_run( + output_file, diff_with_last_commit=False, filters=None, filter_models=True, json_output_file=None +): modified_files = get_modified_python_files(diff_with_last_commit=diff_with_last_commit) print(f"\n### MODIFIED FILES ###\n{_print_list(modified_files)}") # Create the map that will give us all impacted modules. - impacted_modules_map = create_reverse_dependency_map() + reverse_map = create_reverse_dependency_map() impacted_files = modified_files.copy() for f in modified_files: - if f in impacted_modules_map: - impacted_files.extend(impacted_modules_map[f]) + if f in reverse_map: + impacted_files.extend(reverse_map[f]) # Remove duplicates impacted_files = sorted(set(impacted_files)) print(f"\n### IMPACTED FILES ###\n{_print_list(impacted_files)}") # Grab the corresponding test files: - if "setup.py" in impacted_files: + if "setup.py" in modified_files: test_files_to_run = ["tests"] repo_utils_launch = True else: - # Grab the corresponding test files: - test_files_to_run = [] - for f in impacted_files: - # Modified test files are always added - if f.startswith("tests/"): - test_files_to_run.append(f) - # Example files are tested separately - elif f.startswith("examples/pytorch"): - test_files_to_run.append("examples/pytorch/test_pytorch_examples.py") - test_files_to_run.append("examples/pytorch/test_accelerate_examples.py") - elif f.startswith("examples/tensorflow"): - test_files_to_run.append("examples/tensorflow/test_tensorflow_examples.py") - elif f.startswith("examples/flax"): - test_files_to_run.append("examples/flax/test_flax_examples.py") - else: - new_tests = module_to_test_file(f) - if new_tests is not 
None: - if isinstance(new_tests, str): - test_files_to_run.append(new_tests) - else: - test_files_to_run.extend(new_tests) - - # Remove duplicates + # All modified tests need to be run. + test_files_to_run = [ + f for f in modified_files if f.startswith("tests") and f.split(os.path.sep)[-1].startswith("test") + ] + # Then we grab the corresponding test files. + test_map = create_module_to_test_map(reverse_map=reverse_map, filter_models=filter_models) + for f in modified_files: + if f in test_map: + test_files_to_run.extend(test_map[f]) test_files_to_run = sorted(set(test_files_to_run)) + # Remove SageMaker tests + test_files_to_run = [f for f in test_files_to_run if not f.split(os.path.sep)[1] == "sagemaker"] # Make sure we did not end up with a test file that was removed - test_files_to_run = [f for f in test_files_to_run if os.path.isfile(f) or os.path.isdir(f)] + test_files_to_run = [f for f in test_files_to_run if (PATH_TO_REPO / f).exists()] if filters is not None: filtered_files = [] - for filter in filters: - filtered_files.extend([f for f in test_files_to_run if f.startswith(filter)]) + for _filter in filters: + filtered_files.extend([f for f in test_files_to_run if f.startswith(_filter)]) test_files_to_run = filtered_files - repo_utils_launch = any(f.split(os.path.sep)[1] == "repo_utils" for f in test_files_to_run) + + repo_utils_launch = any(f.split(os.path.sep)[1] == "repo_utils" for f in modified_files) if repo_utils_launch: repo_util_file = Path(output_file).parent / "test_repo_utils.txt" @@ -610,34 +584,7 @@ def infer_tests_to_run(output_file, diff_with_last_commit=False, filters=None, j if "tests" in test_files_to_run: test_files_to_run = get_all_tests() - if json_output_file is not None: - test_map = {} - for test_file in test_files_to_run: - # `test_file` is a path to a test folder/file, starting with `tests/`. 
For example, - # - `tests/models/bert/test_modeling_bert.py` or `tests/models/bert` - # - `tests/trainer/test_trainer.py` or `tests/trainer` - # - `tests/test_modeling_common.py` - names = test_file.split(os.path.sep) - if names[1] == "models": - # take the part like `models/bert` for modeling tests - key = "/".join(names[1:3]) - elif len(names) > 2 or not test_file.endswith(".py"): - # test folders under `tests` or python files under them - # take the part like tokenization, `pipeline`, etc. for other test categories - key = "/".join(names[1:2]) - else: - # common test files directly under `tests/` - key = "common" - - if key not in test_map: - test_map[key] = [] - test_map[key].append(test_file) - - # sort the keys & values - keys = sorted(test_map.keys()) - test_map = {k: " ".join(sorted(test_map[k])) for k in keys} - with open(json_output_file, "w", encoding="UTF-8") as fp: - json.dump(test_map, fp, ensure_ascii=False) + create_json_map(test_files_to_run, json_output_file) def filter_tests(output_file, filters): @@ -667,11 +614,29 @@ def filter_tests(output_file, filters): f.write(" ".join(test_files)) +def parse_commit_message(commit_message): + """ + Parses the commit message to detect if a command is there to skip, force all or part of the CI. + + Returns a dictionary of strings to bools with keys skip, test_all_models and test_all. 
+ """ + if commit_message is None: + return {"skip": False, "no_filter": False, "test_all": False} + + command_search = re.search(r"\[([^\]]*)\]", commit_message) + if command_search is not None: + command = command_search.groups()[0] + command = command.lower().replace("-", " ").replace("_", " ") + skip = command in ["ci skip", "skip ci", "circleci skip", "skip circleci"] + no_filter = set(command.split(" ")) == {"no", "filter"} + test_all = set(command.split(" ")) == {"test", "all"} + return {"skip": skip, "no_filter": no_filter, "test_all": test_all} + else: + return {"skip": False, "no_filter": False, "test_all": False} + + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--sanity_check", action="store_true", help="Only test that all tests and modules are accounted for." - ) parser.add_argument( "--output_file", type=str, default="test_list.txt", help="Where to store the list of tests to run" ) @@ -704,33 +669,54 @@ def filter_tests(output_file, filters): help="Will only print the tree of modules depending on the file passed.", default=None, ) + parser.add_argument( + "--commit_message", + type=str, + help="The commit message (which could contain a command to force all tests or skip the CI).", + default=None, + ) args = parser.parse_args() if args.print_dependencies_of is not None: print_tree_deps_of(args.print_dependencies_of) - elif args.sanity_check: - sanity_check() elif args.filter_tests: filter_tests(args.output_file, ["pipelines", "repo_utils"]) else: - repo = Repo(PATH_TO_TRANFORMERS) + repo = Repo(PATH_TO_REPO) + commit_message = repo.head.commit.message + commit_flags = parse_commit_message(commit_message) + if commit_flags["skip"]: + print("Force-skipping the CI") + quit() + if commit_flags["no_filter"]: + print("Running all tests fetched without filtering.") + if commit_flags["test_all"]: + print("Force-launching all tests") diff_with_last_commit = args.diff_with_last_commit if not diff_with_last_commit and not 
repo.head.is_detached and repo.head.ref == repo.refs.main: print("main branch detected, fetching tests against last commit.") diff_with_last_commit = True - try: - infer_tests_to_run( - args.output_file, - diff_with_last_commit=diff_with_last_commit, - filters=args.filters, - json_output_file=args.json_output_file, - ) - filter_tests(args.output_file, ["repo_utils"]) - except Exception as e: - print(f"\nError when trying to grab the relevant tests: {e}\n\nRunning all tests.") + if not commit_flags["test_all"]: + try: + infer_tests_to_run( + args.output_file, + diff_with_last_commit=diff_with_last_commit, + filters=args.filters, + json_output_file=args.json_output_file, + filter_models=not commit_flags["no_filter"], + ) + filter_tests(args.output_file, ["repo_utils"]) + except Exception as e: + print(f"\nError when trying to grab the relevant tests: {e}\n\nRunning all tests.") + commit_flags["test_all"] = True + + if commit_flags["test_all"]: with open(args.output_file, "w", encoding="utf-8") as f: if args.filters is None: f.write("./tests/") else: f.write(" ".join(args.filters)) + + test_files_to_run = get_all_tests() + create_json_map(test_files_to_run, args.json_output_file)