diff --git a/.github/workflows/self-comment-ci.yml b/.github/workflows/self-comment-ci.yml
deleted file mode 100644
index b344ecfd59527d..00000000000000
--- a/.github/workflows/self-comment-ci.yml
+++ /dev/null
@@ -1,253 +0,0 @@
-name: PR comment GitHub CI
-
-on:
- issue_comment:
- types:
- - created
- branches-ignore:
- - main
-concurrency:
- group: ${{ github.workflow }}-${{ github.event.issue.number }}-${{ startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow') }}
- cancel-in-progress: true
-
-jobs:
- get-pr-number:
- runs-on: ubuntu-22.04
- name: Get PR number
- # For security: only allow team members to run
- if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
- outputs:
- PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
- steps:
- - name: Get PR number
- shell: bash
- run: |
- if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
- echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
- else
- echo "PR_NUMBER=" >> $GITHUB_ENV
- fi
-
- - name: Check PR number
- shell: bash
- run: |
- echo "${{ env.PR_NUMBER }}"
-
- - name: Set PR number
- id: set_pr_number
- run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
-
- get-sha:
- runs-on: ubuntu-22.04
- needs: get-pr-number
- if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
- outputs:
- PR_HEAD_SHA: ${{ steps.get_sha.outputs.PR_HEAD_SHA }}
- steps:
- - uses: actions/checkout@v4
- with:
- fetch-depth: "0"
- ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"
-
- - name: Get SHA
- id: get_sha
- env:
- PR_NUMBER: ${{needs.get-pr-number.outputs.PR_NUMBER}}
- run: |
- git fetch origin refs/pull/$PR_NUMBER/head:refs/remotes/pull/$PR_NUMBER/head
- git checkout refs/remotes/pull/$PR_NUMBER/head
- echo "PR_HEAD_SHA: $(git log -1 --format=%H)"
- echo "PR_HEAD_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT"
-
- # use a python script to handle this complex logic
- # case 1: `run-slow` (auto. infer with limited number of models, but in particular, new model)
- # case 2: `run-slow model_1, model_2`
- get-tests:
- runs-on: ubuntu-22.04
- needs: get-pr-number
- if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
- permissions: write-all
- outputs:
- models: ${{ steps.models_to_run.outputs.models }}
- steps:
- - uses: actions/checkout@v4
- with:
- fetch-depth: "0"
- ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"
-
- - name: Get models to test
- env:
- PR_COMMENT: ${{ github.event.comment.body }}
- run: |
- python -m pip install GitPython
- python utils/pr_slow_ci_models.py --message "$PR_COMMENT" | tee output.txt
- echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV
-
- - name: Show models to test
- id: models_to_run
- run: |
- echo "${{ env.models }}"
- echo "models=${{ env.models }}" >> $GITHUB_ENV
- echo "models=${{ env.models }}" >> $GITHUB_OUTPUT
-
- - name: Reply to the comment
- if: ${{ env.models != '[]' }}
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- gh api \
- --method POST \
- -H "Accept: application/vnd.github+json" \
- -H "X-GitHub-Api-Version: 2022-11-28" \
- repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
- -f "body=This comment contains run-slow, running the specified jobs: ${{ env.models }} ..."
-
- create_run:
- name: Create run
- if: ${{ needs.get-tests.outputs.models != '[]' }}
- needs: [get-sha, get-tests]
- permissions: write-all
- runs-on: ubuntu-22.04
- steps:
- - name: Create Run
- id: create_run
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- # Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`.
- # See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status
- GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
- run: |
- gh api \
- --method POST \
- -H "Accept: application/vnd.github+json" \
- -H "X-GitHub-Api-Version: 2022-11-28" \
- repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
- -f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Slow CI job" -f "context=pytest/custom-tests"
-
- run_models_gpu:
- name: Run all tests for the model
- if: ${{ needs.get-tests.outputs.models != '[]' }}
- needs: [get-pr-number, get-tests, create_run]
- strategy:
- fail-fast: false
- matrix:
- folders: ${{ fromJson(needs.get-tests.outputs.models) }}
- machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
- runs-on:
- group: '${{ matrix.machine_type }}'
- container:
- image: huggingface/transformers-all-latest-gpu
- options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
- steps:
- - name: Echo input and matrix info
- shell: bash
- run: |
- echo "${{ matrix.folders }}"
-
- - name: Echo folder ${{ matrix.folders }}
- shell: bash
- # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
- # set the artifact folder names (because the character `/` is not allowed).
- run: |
- echo "${{ matrix.folders }}"
- matrix_folders=${{ matrix.folders }}
- matrix_folders=${matrix_folders/'models/'/'models_'}
- echo "$matrix_folders"
- echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
- - name: Checkout to PR merge commit
- working-directory: /transformers
- run: |
- git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
- git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
- git log -1 --format=%H
-
- - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
- working-directory: /transformers
- run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
- - name: NVIDIA-SMI
- run: |
- nvidia-smi
-
- - name: Set `machine_type` for report and artifact names
- working-directory: /transformers
- shell: bash
- run: |
- echo "${{ matrix.machine_type }}"
- if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
- machine_type=single-gpu
- elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
- machine_type=multi-gpu
- else
- machine_type=${{ matrix.machine_type }}
- fi
- echo "$machine_type"
- echo "machine_type=$machine_type" >> $GITHUB_ENV
-
- - name: Environment
- working-directory: /transformers
- run: |
- python3 utils/print_env.py
-
- - name: Show installed libraries and their versions
- working-directory: /transformers
- run: pip freeze
-
- - name: Run all tests on GPU
- working-directory: /transformers
- run: |
- export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})"
- echo $CUDA_VISIBLE_DEVICES
- python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
-
- - name: Failure short reports
- if: ${{ failure() }}
- continue-on-error: true
- run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
-
- - name: Make sure report directory exists
- shell: bash
- run: |
- mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
- echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
- echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
-
- - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
- if: ${{ always() }}
- uses: actions/upload-artifact@v4
- with:
- name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
- path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
-
- update_run_status:
- name: Update Check Run Status
- needs: [get-sha, create_run, run_models_gpu]
- permissions: write-all
- if: ${{ always() && needs.create_run.result == 'success' }}
- runs-on: ubuntu-22.04
- env:
- GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
- steps:
- - name: Get `run_models_gpu` job status
- run: |
- echo "${{ needs.run_models_gpu.result }}"
- if [ "${{ needs.run_models_gpu.result }}" = "cancelled" ]; then
- echo "STATUS=failure" >> $GITHUB_ENV
- elif [ "${{ needs.run_models_gpu.result }}" = "skipped" ]; then
- echo "STATUS=success" >> $GITHUB_ENV
- else
- echo "STATUS=${{ needs.run_models_gpu.result }}" >> $GITHUB_ENV
- fi
-
- - name: Update PR commit statuses
- run: |
- echo "${{ needs.run_models_gpu.result }}"
- echo "${{ env.STATUS }}"
- gh api \
- --method POST \
- -H "Accept: application/vnd.github+json" \
- -H "X-GitHub-Api-Version: 2022-11-28" \
- repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
- -f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Slow CI job" -f "context=pytest/custom-tests"
diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile
index 3cb2acdc53bb1a..44d1ceb2bfdd5e 100755
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@@ -69,6 +69,10 @@ RUN python3 -m pip install --no-cache-dir optimum-quanto
# Add eetq for quantization testing
RUN python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git
+# Add flute-kernel and fast_hadamard_transform for quantization testing
+RUN python3 -m pip install --no-cache-dir flute-kernel==0.3.0 -i https://flute-ai.github.io/whl/cu118
+RUN python3 -m pip install --no-cache-dir fast_hadamard_transform==1.0.4.post1
+
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop
diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml
index 287f4dffbb384e..c306b0ada80691 100644
--- a/docs/source/ar/_toctree.yml
+++ b/docs/source/ar/_toctree.yml
@@ -30,26 +30,26 @@
- local: conversations
title: الدردشة مع المحولات
title: البرامج التعليمية
-# - sections:
-# - isExpanded: false
-# sections:
+- sections:
+ - isExpanded: false
+ sections:
# - local: tasks/sequence_classification
# title: تصنيف النصوص
# - local: tasks/token_classification
# title: تصنيف الرموز
-# - local: tasks/question_answering
-# title: الإجابة على الأسئلة
+ - local: tasks/question_answering
+ title: الإجابة على الأسئلة
# - local: tasks/language_modeling
# title: نمذجة اللغة السببية
# - local: tasks/masked_language_modeling
# title: نمذجة اللغة المقنعة
# - local: tasks/translation
# title: الترجمة
-# - local: tasks/summarization
-# title: التلخيص
+ - local: tasks/summarization
+ title: التلخيص
# - local: tasks/multiple_choice
# title: الاختيار المتعدد
-# title: معالجة اللغات الطبيعية
+ title: معالجة اللغات الطبيعية
# - isExpanded: false
# sections:
# - local: tasks/audio_classification
@@ -107,7 +107,7 @@
# - local: tasks/prompting
# title: دليل إرشادي لمحفزات النماذج اللغوية الكبيرة
# title: الإرشاد
-# title: أدلة المهام
+ title: أدلة المهام
- sections:
- local: fast_tokenizers
title: استخدم مجزئيات النصوص السريعة من 🤗 Tokenizers
diff --git a/docs/source/ar/tasks/question_answering.md b/docs/source/ar/tasks/question_answering.md
new file mode 100644
index 00000000000000..0c4b66443d814a
--- /dev/null
+++ b/docs/source/ar/tasks/question_answering.md
@@ -0,0 +1,432 @@
+
+
+# الإجابة على الأسئلة (Question answering)
+
+[[open-in-colab]]
+
+
+
+تُقدّم مهام الإجابة على الأسئلة إجابةً بناءً على سؤال. إذا سبق لك أن سألت مساعدًا افتراضيًا مثل Alexa أو Siri أو Google عن حالة الطقس، فأنت قد استخدمت نموذجًا للإجابة على الأسئلة من قبل. هناك نوعان شائعان لمهام الإجابة على الأسئلة:
+
+- الاستخراجية: استخراج الإجابة من السياق المحدد.
+- التجريدية: إنشاء إجابة من السياق تجيب على السؤال بشكل صحيح.
+
+سيوضح لك هذا الدليل كيفية:
+
+1. ضبط [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) على مجموعة بيانات [SQuAD](https://huggingface.co/datasets/squad) للإجابة على الأسئلة الاستخراجية.
+2. استخدام النموذج المضبوط للاستدلال.
+
+
+
+لمشاهدة جميع الهياكل والنسخ المتوافقة مع هذه المهمة، نوصي بالرجوع إلى [صفحة المهمة](https://huggingface.co/tasks/question-answering)
+
+
+
+قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية:
+
+```bash
+pip install transformers datasets evaluate
+```
+
+نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. عند المطالبة، أدخل الرمز المميز الخاص بك لتسجيل الدخول:
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## تحميل مجموعة بيانات SQuAD
+
+ابدأ بتحميل جزء أصغر من مجموعة بيانات SQuAD من مكتبة 🤗 Datasets. سيتيح لك ذلك فرصة للتجربة والتحقق من عمل كل شيء بشكل صحيح قبل قضاء المزيد من الوقت في التدريب على مجموعة البيانات الكاملة.
+
+```py
+>>> from datasets import load_dataset
+
+>>> squad = load_dataset("squad", split="train[:5000]")
+```
+
+قسّم جزء `train` من مجموعة البيانات إلى مجموعتي تدريب واختبار باستخدام طريقة [`~datasets.Dataset.train_test_split`]:
+
+```py
+>>> squad = squad.train_test_split(test_size=0.2)
+```
+
+ثم ألق نظرة على مثال:
+
+```py
+>>> squad["train"][0]
+{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
+ 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
+ 'id': '5733be284776f41900661182',
+ 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
+ 'title': 'University_of_Notre_Dame'
+}
+```
+
+هناك العديد من الحقول المهمة هنا:
+
+- `answers`: موقع بداية الرمز المميز للإجابة ونص الإجابة.
+- `context`: معلومات أساسية يحتاج النموذج إلى استخراج الإجابة منها.
+- `question`: السؤال الذي يجب على النموذج الإجابة عليه.
+
+## المعالجة المسبقة (Preprocess)
+
+
+
+الخطوة التالية هي تحميل المحلل اللغوي DistilBERT لمعالجة حقلي `question` و `context`:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+هناك بعض خطوات المعالجة المسبقة الخاصة بمهام الإجابة على الأسئلة التي يجب أن تكون على دراية بها:
+
+1. قد تحتوي بعض الأمثلة في مجموعة البيانات على `context` طويلًا يتجاوز الحد الأقصى لطول مدخل النموذج. للتعامل مع النصوص الأطول، يتم اقتطاع `context` فقط عن طريق تعيين `truncation="only_second"`.
+2. بعد ذلك، يتم تحديد مواضع بداية ونهاية الإجابة في `context` الأصلي عن طريق تعيين
+ `return_offset_mapping=True`.
+3. باستخدام التعيين، يمكن الآن تحديد رموز بداية ونهاية الإجابة. استخدم طريقة [`~tokenizers.Encoding.sequence_ids`]
+ لتحديد أجزاء الإزاحة التي تتوافق مع `question` و `context`.
+
+فيما يلي كيفية إنشاء دالة لقص وتعيين رموز البداية والنهاية لـ `answer` إلى `context`:
+
+```py
+>>> def preprocess_function(examples):
+... questions = [q.strip() for q in examples["question"]]
+... inputs = tokenizer(
+... questions,
+... examples["context"],
+... max_length=384,
+... truncation="only_second",
+... return_offsets_mapping=True,
+... padding="max_length",
+... )
+
+... offset_mapping = inputs.pop("offset_mapping")
+... answers = examples["answers"]
+... start_positions = []
+... end_positions = []
+
+... for i, offset in enumerate(offset_mapping):
+... answer = answers[i]
+... start_char = answer["answer_start"][0]
+... end_char = answer["answer_start"][0] + len(answer["text"][0])
+... sequence_ids = inputs.sequence_ids(i)
+
+... # Find the start and end of the context
+... idx = 0
+... while sequence_ids[idx] != 1:
+... idx += 1
+... context_start = idx
+... while sequence_ids[idx] == 1:
+... idx += 1
+... context_end = idx - 1
+
+... # If the answer is not fully inside the context, label it (0, 0)
+... if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
+... start_positions.append(0)
+... end_positions.append(0)
+... else:
+... # Otherwise it's the start and end token positions
+... idx = context_start
+... while idx <= context_end and offset[idx][0] <= start_char:
+... idx += 1
+... start_positions.append(idx - 1)
+
+... idx = context_end
+... while idx >= context_start and offset[idx][1] >= end_char:
+... idx -= 1
+... end_positions.append(idx + 1)
+
+... inputs["start_positions"] = start_positions
+... inputs["end_positions"] = end_positions
+... return inputs
+```
+
+لتطبيق المعالجة المسبقة على كامل مجموعة البيانات، استخدم [`~datasets.Dataset.map`] من مكتبة 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات دفعة واحدة. قم بإزالة أي أعمدة لا تحتاجها:
+
+```py
+>>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
+```
+
+الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DefaultDataCollator`]. بخلاف مجمّعات البيانات الأخرى في 🤗 Transformers، لا يطبق [`DefaultDataCollator`] أي معالجة مسبقة إضافية مثل الحشو.
+
+
+
+
+```py
+>>> from transformers import DefaultDataCollator
+
+>>> data_collator = DefaultDataCollator()
+```
+
+
+
+```py
+>>> from transformers import DefaultDataCollator
+
+>>> data_collator = DefaultDataCollator(return_tensors="tf")
+```
+
+
+
+## التدريب (Train)
+
+
+
+
+
+
+إذا لم تكن معتادًا على ضبط نموذج باستخدام [`Trainer`]، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)!
+
+
+
+أنت جاهز لبدء تدريب نموذجك الآن! قم بتحميل DistilBERT باستخدام [`AutoModelForQuestionAnswering`]:
+
+```py
+>>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
+
+>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+في هذه المرحلة، تبقى ثلاث خطوات فقط:
+
+1. حدد المعاملات الفائقة للتدريب في [`TrainingArguments`]. المعامل الوحيد المطلوب هو `output_dir` الذي يحدد مكان حفظ نموذجك. ستدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك).
+2. مرر معاملات التدريب إلى [`Trainer`] جنبًا إلى جنب مع النموذج، ومجموعة البيانات، والمُحلّل النصي، ومُجمّع البيانات.
+3. استدعِ [`~Trainer.train`] لضبط النموذج.
+
+```py
+>>> training_args = TrainingArguments(
+... output_dir="my_awesome_qa_model",
+... eval_strategy="epoch",
+... learning_rate=2e-5,
+... per_device_train_batch_size=16,
+... per_device_eval_batch_size=16,
+... num_train_epochs=3,
+... weight_decay=0.01,
+... push_to_hub=True,
+... )
+
+>>> trainer = Trainer(
+... model=model,
+... args=training_args,
+... train_dataset=tokenized_squad["train"],
+... eval_dataset=tokenized_squad["test"],
+... processing_class=tokenizer,
+... data_collator=data_collator,
+... )
+
+>>> trainer.train()
+```
+
+بمجرد اكتمال التدريب، شارك نموذجك في Hub باستخدام الدالة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك:
+
+```py
+>>> trainer.push_to_hub()
+```
+
+
+
+
+
+إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)!
+
+
+لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن، وجدول معدل التعلم، وبعض المعاملات الفائقة للتدريب:
+
+```py
+>>> from transformers import create_optimizer
+
+>>> batch_size = 16
+>>> num_epochs = 2
+>>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
+>>> optimizer, schedule = create_optimizer(
+... init_lr=2e-5,
+... num_warmup_steps=0,
+... num_train_steps=total_train_steps,
+... )
+```
+
+ثم يمكنك تحميل DistilBERT باستخدام [`TFAutoModelForQuestionAnswering`]:
+
+```py
+>>> from transformers import TFAutoModelForQuestionAnswering
+
+>>> model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
+```
+
+حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
+
+```py
+>>> tf_train_set = model.prepare_tf_dataset(
+... tokenized_squad["train"],
+... shuffle=True,
+... batch_size=16,
+... collate_fn=data_collator,
+... )
+
+>>> tf_validation_set = model.prepare_tf_dataset(
+... tokenized_squad["test"],
+... shuffle=False,
+... batch_size=16,
+... collate_fn=data_collator,
+... )
+```
+
+قم بتكوين النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method):
+
+```py
+>>> import tensorflow as tf
+
+>>> model.compile(optimizer=optimizer)
+```
+
+آخر شيء يجب إعداده قبل بدء التدريب هو توفير طريقة لدفع نموذجك إلى Hub. يمكن القيام بذلك عن طريق تحديد مكان دفع نموذجك ومعالجك المعجمي في [`~transformers.PushToHubCallback`]:
+
+```py
+>>> from transformers.keras_callbacks import PushToHubCallback
+
+>>> callback = PushToHubCallback(
+... output_dir="my_awesome_qa_model",
+... tokenizer=tokenizer,
+... )
+```
+
+أخيرًا، أنت جاهز لبدء تدريب نموذجك! استدعِ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعتي بيانات التدريب والتحقق، وعدد الحقب، والاستدعاء (callback) الخاص بك لضبط النموذج:
+
+```py
+>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[callback])
+```
+بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه!
+
+
+
+
+
+
+للحصول على مثال أكثر تعمقًا حول كيفية ضبط نموذج للإجابة على الأسئلة، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb) المقابل
+أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb).
+
+
+
+## التقييم (Evaluate)
+
+يتطلب تقييم مهمة الإجابة على الأسئلة قدرًا كبيرًا من المعالجة اللاحقة. ولتوفير وقتك، يتخطى هذا الدليل خطوة التقييم. لا يزال [`Trainer`] يحسب خسارة التقييم أثناء التدريب، لذا لن تكون في جهل تام بأداء نموذجك.
+
+إذا كان لديك المزيد من الوقت وتهتم بكيفية تقييم نموذجك للإجابة على الأسئلة، فألق نظرة على فصل [الإجابة على الأسئلة](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) من دورة 🤗 Hugging Face!
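+
+على سبيل التوضيح فقط، إليك مخططًا مبسطًا يوضح كيفية حساب مقياسي Exact Match و F1 الخاصين بـ SQuAD باستخدام مكتبة 🤗 Evaluate بعد الانتهاء من المعالجة اللاحقة للتنبؤات (القيم أدناه مأخوذة من مثال مجموعة البيانات السابق لأغراض التوضيح فقط):
+
+```py
+>>> import evaluate
+
+>>> squad_metric = evaluate.load("squad")
+>>> # Illustrative post-processed prediction and reference (ids/answers taken from the dataset example above)
+>>> predictions = [{"id": "5733be284776f41900661182", "prediction_text": "Saint Bernadette Soubirous"}]
+>>> references = [{"id": "5733be284776f41900661182", "answers": {"text": ["Saint Bernadette Soubirous"], "answer_start": [515]}}]
+>>> squad_metric.compute(predictions=predictions, references=references)
+{'exact_match': 100.0, 'f1': 100.0}
+```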
+
+## الاستدلال (Inference)
+
+رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال!
+
+حدد سؤالًا وسياقًا ليقوم النموذج بالتنبؤ بالإجابة عليه:
+
+```py
+>>> question = "How many programming languages does BLOOM support?"
+>>> context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."
+```
+
+أبسط طريقة لتجربة نموذجك المُدرَّب للاستدلال هي استخدامه في [`pipeline`]. قم بإنشاء كائن لـ `pipeline` للإجابة على الأسئلة باستخدام نموذجك، ومرِّر النص إليه:
+
+```py
+>>> from transformers import pipeline
+
+>>> question_answerer = pipeline("question-answering", model="my_awesome_qa_model")
+>>> question_answerer(question=question, context=context)
+{'score': 0.2058267742395401,
+ 'start': 10,
+ 'end': 95,
+ 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'}
+```
+
+يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت:
+
+
+
+
+ قسّم النص وأرجع تنسورات PyTorch:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
+>>> inputs = tokenizer(question, context, return_tensors="pt")
+```
+
+مرر مدخلاتك إلى النموذج وأرجع `logits`:
+
+```py
+>>> import torch
+>>> from transformers import AutoModelForQuestionAnswering
+
+>>> model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
+>>> with torch.no_grad():
+... outputs = model(**inputs)
+```
+
+احصل على أعلى احتمال من مخرجات النموذج لموضعي البداية والنهاية:
+
+```py
+>>> answer_start_index = outputs.start_logits.argmax()
+>>> answer_end_index = outputs.end_logits.argmax()
+```
+
+استخلص الإجابة من الرموز المتوقعة:
+
+```py
+>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
+>>> tokenizer.decode(predict_answer_tokens)
+'176 billion parameters and can generate text in 46 languages natural languages and 13'
+```
+
+
+قسّم النص وأرجع تنسورات TensorFlow:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
+>>> inputs = tokenizer(question, context, return_tensors="tf")
+```
+
+مرر مدخلاتك إلى النموذج وأرجع `logits`:
+
+```py
+>>> from transformers import TFAutoModelForQuestionAnswering
+
+>>> model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
+>>> outputs = model(**inputs)
+```
+
+احصل على أعلى احتمال من مخرجات النموذج لموضعي البداية والنهاية:
+
+```py
+>>> import tensorflow as tf
+
+>>> answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
+>>> answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
+```
+
+استخلص الإجابة من الرموز المتوقعة:
+
+```py
+>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
+>>> tokenizer.decode(predict_answer_tokens)
+'176 billion parameters and can generate text in 46 languages natural languages and 13'
+```
+
+
diff --git a/docs/source/ar/tasks/summarization.md b/docs/source/ar/tasks/summarization.md
new file mode 100644
index 00000000000000..17dbcb42e8374e
--- /dev/null
+++ b/docs/source/ar/tasks/summarization.md
@@ -0,0 +1,397 @@
+
+
+# التلخيص (Summarization)
+
+[[open-in-colab]]
+
+
+
+يقوم التلخيص بإنشاء نسخة مختصرة من مستند أو مقال، حيث يلتقط جميع المعلومات المهمة. بالإضافة إلى الترجمة، يعتبر التلخيص مثالاً آخر على مهمة يمكن صياغتها كتسلسل إلى تسلسل. يمكن أن يكون التلخيص:
+
+- استخراجي: استخراج أهم المعلومات من مستند.
+- تجريدي: إنشاء نص جديد يلخص أهم المعلومات.
+
+سيوضح لك هذا الدليل كيفية:
+
+1. ضبط [T5](https://huggingface.co/google-t5/t5-small) بدقة على مجموعة فرعية من مشاريع قوانين ولاية كاليفورنيا من مجموعة بيانات [BillSum](https://huggingface.co/datasets/billsum) للتلخيص التجريدي.
+2. استخدام النموذج المضبوط بدقة للتنبؤ.
+
+
+
+لمشاهدة جميع البنى ونقاط التفتيش المتوافقة مع هذه المهمة، نوصي بالتحقق من [صفحة المهمة](https://huggingface.co/tasks/summarization)
+
+
+
+قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية:
+
+```bash
+pip install transformers datasets evaluate rouge_score
+```
+
+نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. عند المطالبة، أدخل الرمز المميز لتسجيل الدخول:
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## تحميل مجموعة بيانات BillSum
+
+ابدأ بتحميل جزء صغير من بيانات مشاريع القوانين الخاصة بولاية كاليفورنيا من مجموعة بيانات BillSum في مكتبة 🤗 Datasets:
+
+```py
+>>> from datasets import load_dataset
+
+>>> billsum = load_dataset("billsum", split="ca_test")
+```
+
+قسّم مجموعة البيانات إلى مجموعتي تدريب واختبار باستخدام الدالة [`~datasets.Dataset.train_test_split`]:
+
+```py
+>>> billsum = billsum.train_test_split(test_size=0.2)
+```
+
+ثم ألقِ نظرة على مثال:
+
+```py
+>>> billsum["train"][0]
+{'summary': 'Existing law authorizes state agencies to enter into contracts for the acquisition of goods or services upon approval by the Department of General Services. Existing law sets forth various requirements and prohibitions for those contracts, including, but not limited to, a prohibition on entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between spouses and domestic partners or same-sex and different-sex couples in the provision of benefits. Existing law provides that a contract entered into in violation of those requirements and prohibitions is void and authorizes the state or any person acting on behalf of the state to bring a civil action seeking a determination that a contract is in violation and therefore void. Under existing law, a willful violation of those requirements and prohibitions is a misdemeanor.\nThis bill would also prohibit a state agency from entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between employees on the basis of gender identity in the provision of benefits, as specified. By expanding the scope of a crime, this bill would impose a state-mandated local program.\nThe California Constitution requires the state to reimburse local agencies and school districts for certain costs mandated by the state. Statutory provisions establish procedures for making that reimbursement.\nThis bill would provide that no reimbursement is required by this act for a specified reason.',
+ 'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10295.35 is added to the Public Contract Code, to read:\n10295.35.\n(a) (1) Notwithstanding any other law, a state agency shall not enter into any contract for the acquisition of goods or services in the amount of one hundred thousand dollars ($100,000) or more with a contractor that, in the provision of benefits, discriminates between employees on the basis of an employee’s or dependent’s actual or perceived gender identity, including, but not limited to, the employee’s or dependent’s identification as transgender.\n(2) For purposes of this section, “contract” includes contracts with a cumulative amount of one hundred thousand dollars ($100,000) or more per contractor in each fiscal year.\n(3) For purposes of this section, an employee health plan is discriminatory if the plan is not consistent with Section 1365.5 of the Health and Safety Code and Section 10140 of the Insurance Code.\n(4) The requirements of this section shall apply only to those portions of a contractor’s operations that occur under any of the following conditions:\n(A) Within the state.\n(B) On real property outside the state if the property is owned by the state or if the state has a right to occupy the property, and if the contractor’s presence at that location is connected to a contract with the state.\n(C) Elsewhere in the United States where work related to a state contract is being performed.\n(b) Contractors shall treat as confidential, to the maximum extent allowed by law or by the requirement of the contractor’s insurance provider, any request by an employee or applicant for employment benefits or any documentation of eligibility for benefits submitted by an employee or applicant for employment.\n(c) After taking all reasonable measures to find a contractor that complies with this section, as determined by the state agency, the requirements of this section may be waived under any of the following circumstances:\n(1) There is only one prospective contractor willing to enter into a specific contract with the state agency.\n(2) The contract is necessary to respond to an emergency, as determined by the state agency, that endangers the public health, welfare, or safety, or the contract is necessary for the provision of essential services, and no entity that complies with the requirements of this section capable of responding to the emergency is immediately available.\n(3) The requirements of this section violate, or are inconsistent with, the terms or conditions of a grant, subvention, or agreement, if the agency has made a good faith attempt to change the terms or conditions of any grant, subvention, or agreement to authorize application of this section.\n(4) The contractor is providing wholesale or bulk water, power, or natural gas, the conveyance or transmission of the same, or ancillary services, as required for ensuring reliable services in accordance with good utility practice, if the purchase of the same cannot practically be accomplished through the standard competitive bidding procedures and the contractor is not providing direct retail services to end users.\n(d) (1) A contractor shall not be deemed to discriminate in the provision of benefits if the contractor, in providing the benefits, pays the actual costs incurred in obtaining the benefit.\n(2) If a contractor is unable to provide a certain benefit, despite taking reasonable measures to do so, the contractor shall not be deemed to discriminate in the 
provision of benefits.\n(e) (1) Every contract subject to this chapter shall contain a statement by which the contractor certifies that the contractor is in compliance with this section.\n(2) The department or other contracting agency shall enforce this section pursuant to its existing enforcement powers.\n(3) (A) If a contractor falsely certifies that it is in compliance with this section, the contract with that contractor shall be subject to Article 9 (commencing with Section 10420), unless, within a time period specified by the department or other contracting agency, the contractor provides to the department or agency proof that it has complied, or is in the process of complying, with this section.\n(B) The application of the remedies or penalties contained in Article 9 (commencing with Section 10420) to a contract subject to this chapter shall not preclude the application of any existing remedies otherwise available to the department or other contracting agency under its existing enforcement powers.\n(f) Nothing in this section is intended to regulate the contracting practices of any local jurisdiction.\n(g) This section shall be construed so as not to conflict with applicable federal laws, rules, or regulations. In the event that a court or agency of competent jurisdiction holds that federal law, rule, or regulation invalidates any clause, sentence, paragraph, or section of this code or the application thereof to any person or circumstances, it is the intent of the state that the court or agency sever that clause, sentence, paragraph, or section so that the remainder of this section shall remain in effect.\nSEC. 2.\nSection 10295.35 of the Public Contract Code shall not be construed to create any new enforcement authority or responsibility in the Department of General Services or any other contracting agency.\nSEC. 3.\nNo reimbursement is required by this act pursuant to Section 6 of Article XIII\u2009B of the California Constitution because the only costs that may be incurred by a local agency or school district will be incurred because this act creates a new crime or infraction, eliminates a crime or infraction, or changes the penalty for a crime or infraction, within the meaning of Section 17556 of the Government Code, or changes the definition of a crime within the meaning of Section 6 of Article XIII\u2009B of the California Constitution.',
+ 'title': 'An act to add Section 10295.35 to the Public Contract Code, relating to public contracts.'}
+```
+
+هناك مُدخلان سترغب في استخدامهما:
+
+- `text`: نص القانون الذي سيكون مُدخلًا للنموذج.
+- `summary`: نسخة مُختصرة من `text` والتي ستكون هدف النموذج.
+
+## المعالجة المسبقة (Preprocess)
+
+الخطوة التالية هي تحميل مجزء النصوص T5 لمعالجة `text` و `summary`:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> checkpoint = "google-t5/t5-small"
+>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+```
+
+وظيفة المعالجة المسبقة التي تريد إنشاءها تحتاج إلى:
+
+1. إضافة بادئة للمُدخل باستخدام توجيه حتى يعرف T5 أن هذه مهمة تلخيص. تتطلب بعض النماذج القادرة على أداء مهام متعددة في معالجة اللغات الطبيعية توجيهات لمهام مُحددة.
+2. استخدام مُعامل الكلمة الرئيسية `text_target` عند ترميز التصنيفات.
+3. قصّ التسلسلات بحيث لا يزيد طولها عن الحد الأقصى الذي تم تعيينه بواسطة مُعامل `max_length`.
+
+```py
+>>> prefix = "summarize: "
+
+>>> def preprocess_function(examples):
+... inputs = [prefix + doc for doc in examples["text"]]
+... model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
+
+... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
+
+... model_inputs["labels"] = labels["input_ids"]
+... return model_inputs
+```
+
+لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم طريقة [`~datasets.Dataset.map`] الخاصة بـ 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد:
+
+```py
+>>> tokenized_billsum = billsum.map(preprocess_function, batched=True)
+```
+
+الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DataCollatorForSeq2Seq`]. من الأكثر كفاءة *حشو* الجمل ديناميكيًا إلى أطول طول في الدفعة أثناء عملية التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الحد الأقصى للطول.
+
+
+
+
+```py
+>>> from transformers import DataCollatorForSeq2Seq
+
+>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
+```
+
+
+
+```py
+>>> from transformers import DataCollatorForSeq2Seq
+
+>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")
+```
+
+
+
+## التقييم (Evaluate)
+
+يُعد تضمين مقياس أثناء التدريب مفيدًا غالبًا لتقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، قم بتحميل مقياس [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) (راجع [الجولة السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) الخاصة بـ 🤗 Evaluate لمعرفة المزيد حول كيفية تحميل وحساب مقياس):
+
+```py
+>>> import evaluate
+
+>>> rouge = evaluate.load("rouge")
+```
+
+ثم قم بإنشاء دالة تُمرر تنبؤاتك وتصنيفاتك إلى [`~evaluate.EvaluationModule.compute`] لحساب مقياس ROUGE:
+
+```py
+>>> import numpy as np
+
+>>> def compute_metrics(eval_pred):
+... predictions, labels = eval_pred
+... decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+... labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+... result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
+
+... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
+... result["gen_len"] = np.mean(prediction_lens)
+
+... return {k: round(v, 4) for k, v in result.items()}
+```
+
+دالة `compute_metrics` الخاصة بك جاهزة الآن، وستعود إليها عند إعداد التدريب الخاص بك.
+
+## التدريب (Train)
+
+
+
+
+
+
+إذا لم تكن معتادًا على ضبط نموذج باستخدام [`Trainer`]، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)!
+
+
+
+أنت جاهز لبدء تدريب نموذجك الآن! قم بتحميل T5 باستخدام [`AutoModelForSeq2SeqLM`]:
+
+```py
+>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
+
+>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+```
+
+في هذه المرحلة، لم يتبق سوى ثلاث خطوات:
+
+1. حدد مُعامِلات التدريب الخاصة بك في [`Seq2SeqTrainingArguments`]. المعامل الوحيد المطلوب هو `output_dir` الذي يُحدد مكان حفظ نموذجك. ستدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (تحتاج إلى تسجيل الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم مقياس ROUGE وحفظ نقطة تفتيش التدريب.
+2. مرر مُعامِلات التدريب إلى [`Seq2SeqTrainer`] جنبًا إلى جنب مع النموذج ومجموعة البيانات والمُحلِّل اللغوي وجامع البيانات ودالة `compute_metrics`.
+3. استدعِ [`~Trainer.train`] لضبط نموذجك.
+
+```py
+>>> training_args = Seq2SeqTrainingArguments(
+... output_dir="my_awesome_billsum_model",
+... eval_strategy="epoch",
+... learning_rate=2e-5,
+... per_device_train_batch_size=16,
+... per_device_eval_batch_size=16,
+... weight_decay=0.01,
+... save_total_limit=3,
+... num_train_epochs=4,
+... predict_with_generate=True,
+... fp16=True, #change to bf16=True for XPU
+... push_to_hub=True,
+... )
+
+>>> trainer = Seq2SeqTrainer(
+... model=model,
+... args=training_args,
+... train_dataset=tokenized_billsum["train"],
+... eval_dataset=tokenized_billsum["test"],
+... processing_class=tokenizer,
+... data_collator=data_collator,
+... compute_metrics=compute_metrics,
+... )
+
+>>> trainer.train()
+```
+
+بمجرد اكتمال التدريب، شارك نموذجك مع Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك:
+
+```py
+>>> trainer.push_to_hub()
+```
+
+
+
+
+إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)!
+
+
+لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن وجدول معدل التعلم وبعض معلمات التدريب:
+
+```py
+>>> from transformers import create_optimizer, AdamWeightDecay
+
+>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
+```
+
+ثم يمكنك تحميل T5 باستخدام [`TFAutoModelForSeq2SeqLM`]:
+
+```py
+>>> from transformers import TFAutoModelForSeq2SeqLM
+
+>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+```
+
+حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
+
+```py
+>>> tf_train_set = model.prepare_tf_dataset(
+... tokenized_billsum["train"],
+... shuffle=True,
+... batch_size=16,
+... collate_fn=data_collator,
+... )
+
+>>> tf_test_set = model.prepare_tf_dataset(
+... tokenized_billsum["test"],
+... shuffle=False,
+... batch_size=16,
+... collate_fn=data_collator,
+... )
+```
+
+قم بتكوين النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers لديها دالة خسارة ذات صلة بالمهمة افتراضيًا، لذلك لست بحاجة إلى تحديد واحدة ما لم تكن ترغب في ذلك:
+
+```py
+>>> import tensorflow as tf
+
+>>> model.compile(optimizer=optimizer) # No loss argument!
+```
+
+آخر شيئين يجب إعدادهما قبل بدء التدريب هما حساب درجة ROUGE من التنبؤات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم كلاهما باستخدام [استدعاءات Keras](../main_classes/keras_callbacks).
+
+مرر دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]:
+
+```py
+>>> from transformers.keras_callbacks import KerasMetricCallback
+
+>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)
+```
+
+حدد مكان دفع نموذجك ومُحلِّلك اللغوي في [`~transformers.PushToHubCallback`]:
+
+```py
+>>> from transformers.keras_callbacks import PushToHubCallback
+
+>>> push_to_hub_callback = PushToHubCallback(
+... output_dir="my_awesome_billsum_model",
+... tokenizer=tokenizer,
+... )
+```
+
+ثم اجمع استدعاءاتك معًا:
+
+```py
+>>> callbacks = [metric_callback, push_to_hub_callback]
+```
+
+أخيرًا، أنت جاهز لبدء تدريب نموذجك! استدعِ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعتي بيانات التدريب والتحقق، وعدد الحقب، واستدعاءاتك لضبط النموذج:
+
+```py
+>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)
+```
+
+بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه!
+
+
+
+
+
+للحصول على مثال أكثر تعمقًا حول كيفية ضبط نموذج للتلخيص، ألقِ نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)
+أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb) المقابل.
+
+
+
+## الاستدلال (Inference)
+
+رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال!
+
+حدد بعض النصوص التي ترغب في تلخيصها. بالنسبة لـ T5، تحتاج إلى إضافة بادئة إلى مُدخلاتك اعتمادًا على المهمة التي تعمل عليها. بالنسبة للتلخيص، يجب عليك إضافة بادئة إلى مُدخلاتك كما هو موضح أدناه:
+
+```py
+>>> text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
+```
+
+أبسط طريقة لتجربة نموذجك المضبوط للاستدلال هي استخدامه في [`pipeline`]. استخدم `pipeline` للتلخيص باستخدام نموذجك، ومرر نصك إليه:
+
+```py
+>>> from transformers import pipeline
+
+>>> summarizer = pipeline("summarization", model="username/my_awesome_billsum_model")
+>>> summarizer(text)
+[{"summary_text": "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}]
+```
+
+يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت:
+
+
+
+قسّم النص وأرجع `input_ids` كتنسورات PyTorch:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_billsum_model")
+>>> inputs = tokenizer(text, return_tensors="pt").input_ids
+```
+
+استخدم طريقة [`~generation.GenerationMixin.generate`] لإنشاء التلخيص. لمزيد من التفاصيل حول استراتيجيات توليد النص المختلفة والمعلمات للتحكم في التوليد، راجع واجهة برمجة تطبيقات [توليد النص](../main_classes/text_generation).
+
+```py
+>>> from transformers import AutoModelForSeq2SeqLM
+
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_billsum_model")
+>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
+```
+
+فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص:
+
+```py
+>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
+'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.'
+```
+
+
+قسّم النص وأرجع `input_ids` كتنسورات TensorFlow:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_billsum_model")
+>>> inputs = tokenizer(text, return_tensors="tf").input_ids
+```
+
+استخدم طريقة [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] لإنشاء التلخيص. لمزيد من التفاصيل حول استراتيجيات توليد النص المختلفة والمعلمات للتحكم في التوليد، راجع واجهة برمجة تطبيقات [توليد النص](../main_classes/text_generation).
+
+```py
+>>> from transformers import TFAutoModelForSeq2SeqLM
+
+>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_billsum_model")
+>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
+```
+
+فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص:
+
+```py
+>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
+'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.'
+```
+
+
\ No newline at end of file
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 18de03e1df8016..a076f704b8ede2 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -173,6 +173,8 @@
title: Quanto
- local: quantization/eetq
title: EETQ
+ - local: quantization/higgs
+ title: HIGGS
- local: quantization/hqq
title: HQQ
- local: quantization/fbgemm_fp8
@@ -653,6 +655,8 @@
title: DiNAT
- local: model_doc/dinov2
title: DINOV2
+ - local: model_doc/dinov2_with_registers
+ title: DINOv2 with Registers
- local: model_doc/dit
title: DiT
- local: model_doc/dpt
diff --git a/docs/source/en/fsdp.md b/docs/source/en/fsdp.md
index 6b90ab5ad6d6d5..2c4f114dec85cb 100644
--- a/docs/source/en/fsdp.md
+++ b/docs/source/en/fsdp.md
@@ -58,7 +58,7 @@ Otherwise, you can choose a size-based wrapping policy where FSDP is applied to
### Checkpointing
-Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT` because saving the full state dict with CPU offloading on rank 0 takes a lot of time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting. You can resume training with the sharded state dicts with the [`~accelerate.Accelerator.load_state`]` method.
+Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT` because saving the full state dict with CPU offloading on rank 0 takes a lot of time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting. You can resume training with the sharded state dicts with the [`~accelerate.Accelerator.load_state`] method.
```py
# directory containing checkpoints
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index 967049d89cbe12..dcecfc872d61d0 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -127,6 +127,7 @@ Flax), PyTorch, and/or TensorFlow.
| [DialoGPT](model_doc/dialogpt) | ✅ | ✅ | ✅ |
| [DiNAT](model_doc/dinat) | ✅ | ❌ | ❌ |
| [DINOv2](model_doc/dinov2) | ✅ | ❌ | ✅ |
+| [DINOv2 with Registers](model_doc/dinov2_with_registers) | ✅ | ❌ | ❌ |
| [DistilBERT](model_doc/distilbert) | ✅ | ✅ | ✅ |
| [DiT](model_doc/dit) | ✅ | ❌ | ✅ |
| [DonutSwin](model_doc/donut) | ✅ | ❌ | ❌ |
diff --git a/docs/source/en/main_classes/quantization.md b/docs/source/en/main_classes/quantization.md
index 9b500b69374c88..037660d0638cbd 100755
--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@@ -57,6 +57,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
[[autodoc]] quantizers.base.HfQuantizer
+## HiggsConfig
+
+[[autodoc]] HiggsConfig
+
## HqqConfig
[[autodoc]] HqqConfig
diff --git a/docs/source/en/model_doc/dinov2_with_registers.md b/docs/source/en/model_doc/dinov2_with_registers.md
new file mode 100644
index 00000000000000..360ebf9b8f8a15
--- /dev/null
+++ b/docs/source/en/model_doc/dinov2_with_registers.md
@@ -0,0 +1,54 @@
+
+
+# DINOv2 with Registers
+
+## Overview
+
+The DINOv2 with Registers model was proposed in [Vision Transformers Need Registers](https://arxiv.org/abs/2309.16588) by Timothée Darcet, Maxime Oquab, Julien Mairal, Piotr Bojanowski.
+
+The [Vision Transformer](vit) (ViT) is a transformer encoder model (BERT-like) originally introduced to do supervised image classification on ImageNet.
+
+Next, people figured out ways to make ViT work really well on self-supervised image feature extraction (i.e. learning meaningful features, also called embeddings) on images without requiring any labels. Some example papers here include [DINOv2](dinov2) and [MAE](vit_mae).
+
+The authors of DINOv2 noticed that ViTs have artifacts in attention maps. It’s due to the model using some image patches as “registers”. The authors propose a fix: just add some new tokens (called "register" tokens), which you only use during pre-training (and throw away afterwards). This results in:
+- no artifacts
+- interpretable attention maps
+- and improved performance.
+
+The abstract from the paper is the following:
+
+*Transformers have recently emerged as a powerful tool for learning visual representations. In this paper, we identify and characterize artifacts in feature maps of both supervised and self-supervised ViT networks. The artifacts correspond to high-norm tokens appearing during inference primarily in low-informative background areas of images, that are repurposed for internal computations. We propose a simple yet effective solution based on providing additional tokens to the input sequence of the Vision Transformer to fill that role. We show that this solution fixes that problem entirely for both supervised and self-supervised models, sets a new state of the art for self-supervised visual models on dense visual prediction tasks, enables object discovery methods with larger models, and most importantly leads to smoother feature maps and attention maps for downstream visual processing.*
+
+
+
+ Visualization of attention maps of various models trained with vs. without registers. Taken from the original paper.
+
+Tips:
+
+- Usage of DINOv2 with Registers is identical to DINOv2 without registers; you'll just get better performance.
+
+This model was contributed by [nielsr](https://huggingface.co/nielsr).
+The original code can be found [here](https://github.com/facebookresearch/dinov2).
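+
+As a minimal usage sketch (the checkpoint id below is illustrative; substitute an actual DINOv2-with-registers checkpoint from the Hub), feature extraction works the same way as for [DINOv2](dinov2):
+
+```python
+import torch
+import requests
+from PIL import Image
+from transformers import AutoImageProcessor, Dinov2WithRegistersModel
+
+# The checkpoint id below is a placeholder -- pick a real DINOv2-with-registers checkpoint
+checkpoint = "facebook/dinov2-with-registers-base"
+processor = AutoImageProcessor.from_pretrained(checkpoint)
+model = Dinov2WithRegistersModel.from_pretrained(checkpoint)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# The hidden states cover the [CLS] token, the register tokens and the patch tokens
+print(outputs.last_hidden_state.shape)
+```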
+
+
+## Dinov2WithRegistersConfig
+
+[[autodoc]] Dinov2WithRegistersConfig
+
+## Dinov2WithRegistersModel
+
+[[autodoc]] Dinov2WithRegistersModel
+ - forward
+
+## Dinov2WithRegistersForImageClassification
+
+[[autodoc]] Dinov2WithRegistersForImageClassification
+ - forward
\ No newline at end of file
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
index 930f41b6fefba7..364141c8e406b2 100644
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -238,6 +238,7 @@ For now, Transformers supports SDPA inference and training for the following arc
* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
* [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel)
* [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2)
+* [Dinov2_with_registers](https://huggingface.co/docs/transformers/en/model_doc/dinov2_with_registers)
* [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel)
* [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader)
* [EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder_decoder#transformers.EncoderDecoderModel)
@@ -331,10 +332,11 @@ In that case, you should see a warning message and we will fall back to the (slo
-By default, SDPA selects the most performant kernel available but you can check whether a backend is available in a given setting (hardware, problem size) with [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager:
+By default, SDPA selects the most performant kernel available but you can check whether a backend is available in a given setting (hardware, problem size) with [`torch.nn.attention.sdpa_kernel`](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) as a context manager:
```diff
import torch
++ from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
@@ -343,7 +345,7 @@ model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=to
input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
-+ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
++ with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
@@ -461,7 +463,7 @@ generated_ids = model.generate(**inputs)
outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
```
-To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU:
+To load a model in 8-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU:
```py
max_memory_mapping = {0: "1GB", 1: "2GB"}
@@ -517,6 +519,7 @@ It is often possible to combine several of the optimization techniques described
```py
import torch
+from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# load model in 4-bit
@@ -535,7 +538,7 @@ input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
# enable FlashAttention
-with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
+with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
diff --git a/docs/source/en/quantization/higgs.md b/docs/source/en/quantization/higgs.md
new file mode 100644
index 00000000000000..d2aa9c9dc497d5
--- /dev/null
+++ b/docs/source/en/quantization/higgs.md
@@ -0,0 +1,66 @@
+
+
+# HIGGS
+
+HIGGS is a 0-shot quantization algorithm that combines Hadamard preprocessing with MSE-Optimal quantization grids to achieve lower quantization error and SOTA performance. You can find more information in the paper [arxiv.org/abs/2411.17525](https://arxiv.org/abs/2411.17525).
+
+Runtime support for HIGGS is implemented through [FLUTE](https://arxiv.org/abs/2407.10960) and its accompanying [library](https://github.com/HanGuo97/flute).
+
+## Quantization Example
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, HiggsConfig
+
+model = AutoModelForCausalLM.from_pretrained(
+ "google/gemma-2-9b-it",
+ quantization_config=HiggsConfig(bits=4),
+ device_map="auto",
+)
+
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
+
+tokenizer.decode(model.generate(
+ **tokenizer("Hi,", return_tensors="pt").to(model.device),
+ temperature=0.5,
+ top_p=0.80,
+)[0])
+```
+
+## Pre-quantized models
+
+Some pre-quantized models can be found in the [official collection](https://huggingface.co/collections/ISTA-DASLab/higgs-675308e432fd56b7f6dab94e) on Hugging Face Hub.
+
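+For example, a checkpoint from that collection loads like any other Transformers model. The snippet below is only a sketch: replace the placeholder with an actual repo id from the collection.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Placeholder: pick a real repo id from the HIGGS collection linked above
+repo_id = "<pre-quantized-higgs-repo-id>"
+
+model = AutoModelForCausalLM.from_pretrained(repo_id, device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained(repo_id)
+```
+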
+## Current Limitations
+
+**Architectures**
+
+Currently, FLUTE, and HIGGS by extension, **only support Llama 3.1 and 3.0 models with 8B, 70B and 405B parameters, as well as Gemma-2 9B and 27B**. We're working on supporting a wider range of models, as well as on allowing arbitrary models by modifying the FLUTE compilation procedure.
+
+**torch.compile**
+
+HIGGS is fully compatible with `torch.compile`. Compiling `model.forward`, as described [here](../perf_torch_compile.md), yields the following speedups on an RTX 4090 for `Llama-3.1-8B-Instruct` (forward passes/sec); a compilation sketch follows the table:
+
+| Batch Size | BF16 (With `torch.compile`) | HIGGS 4bit (No `torch.compile`) | HIGGS 4bit (With `torch.compile`) |
+|------------|-----------------------------|----------------------------------|-----------------------------------|
+| 1 | 59 | 41 | 124 |
+| 4 | 57 | 42 | 123 |
+| 16 | 56 | 41 | 120 |
+
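+One way to set this up is to compile only the forward pass, as in the linked guide; the model id and generation settings below are illustrative:
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, HiggsConfig
+
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    quantization_config=HiggsConfig(bits=4),
+    device_map="auto",
+)
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
+
+# Compile only the forward pass; the first few calls are slower while kernels compile
+model.forward = torch.compile(model.forward, mode="max-autotune", fullgraph=False)
+
+inputs = tokenizer("Hello,", return_tensors="pt").to(model.device)
+print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0], skip_special_tokens=True))
+```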
+
+**Quantized training**
+
+Currently, HIGGS doesn't support quantized training (and backward passes in general). We're working on adding support for it.
\ No newline at end of file
diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md
index f3508aed0674f6..48840fad646fd0 100644
--- a/docs/source/en/quantization/overview.md
+++ b/docs/source/en/quantization/overview.md
@@ -54,6 +54,7 @@ Use the table below to help you decide which quantization method to use.
| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
+| [HIGGS](./higgs) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2 - 4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute |
| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
diff --git a/docs/source/ja/perf_infer_gpu_many.md b/docs/source/ja/perf_infer_gpu_many.md
index 378bb2a248fe11..18a19c849eb2c7 100644
--- a/docs/source/ja/perf_infer_gpu_many.md
+++ b/docs/source/ja/perf_infer_gpu_many.md
@@ -34,7 +34,7 @@ BetterTransformerは、テキスト、画像、音声モデルの単一GPUおよ
Flash Attentionは、fp16またはbf16 dtypeを使用しているモデルにのみ使用できます。BetterTransformerを使用する前に、モデルを適切なdtypeにキャストしてください。
-
+
### Decoder models
@@ -53,11 +53,12 @@ model.to_bettertransformer()
# Use it for training or inference
```
-SDPAは、ハードウェアや問題のサイズなどの特定の設定で[Flash Attention](https://arxiv.org/abs/2205.14135)カーネルを呼び出すこともできます。Flash Attentionを有効にするか、特定の設定(ハードウェア、問題のサイズ)で利用可能かを確認するには、[`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel)をコンテキストマネージャとして使用します。
+SDPAは、ハードウェアや問題のサイズなどの特定の設定で[Flash Attention](https://arxiv.org/abs/2205.14135)カーネルを呼び出すこともできます。Flash Attentionを有効にするか、特定の設定(ハードウェア、問題のサイズ)で利用可能かを確認するには、[`torch.nn.attention.sdpa_kernel`](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html)をコンテキストマネージャとして使用します。
```diff
import torch
++ from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
@@ -68,7 +69,7 @@ model.to_bettertransformer()
input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
-+ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
++ with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
@@ -105,6 +106,7 @@ BetterTransformerのパフォーマンスの詳細については、この[ブ
```py
import torch
+from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
@@ -118,7 +120,7 @@ model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_c
input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
-with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
+with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
diff --git a/docs/source/ja/perf_infer_gpu_one.md b/docs/source/ja/perf_infer_gpu_one.md
index d6a9b309164dbf..6a3dc5fa64a852 100644
--- a/docs/source/ja/perf_infer_gpu_one.md
+++ b/docs/source/ja/perf_infer_gpu_one.md
@@ -55,8 +55,8 @@ model_id = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
- model_id,
- torch_dtype=torch.bfloat16,
+ model_id,
+ torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
)
```
@@ -112,7 +112,7 @@ model_id = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
- model_id,
+ model_id,
load_in_8bit=True,
attn_implementation="flash_attention_2",
)
@@ -130,7 +130,7 @@ model_id = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
- model_id,
+ model_id,
load_in_4bit=True,
attn_implementation="flash_attention_2",
)
@@ -149,7 +149,7 @@ model_id = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
- model_id,
+ model_id,
load_in_4bit=True,
attn_implementation="flash_attention_2",
)
@@ -173,7 +173,7 @@ BetterTransformerは、テキスト、画像、およびオーディオモデル
Flash Attentionは、fp16またはbf16のdtypeを使用するモデルにのみ使用できます。BetterTransformerを使用する前に、モデルを適切なdtypeにキャストしてください。
-
+
### Encoder models
@@ -214,11 +214,12 @@ model.to_bettertransformer()
# Use it for training or inference
```
-SDPAは、ハードウェアや問題のサイズに応じて[Flash Attention](https://arxiv.org/abs/2205.14135)カーネルを使用することもできます。Flash Attentionを有効にするか、特定の設定(ハードウェア、問題サイズ)で使用可能かどうかを確認するには、[`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel)をコンテキストマネージャとして使用します。
+SDPAは、ハードウェアや問題のサイズに応じて[Flash Attention](https://arxiv.org/abs/2205.14135)カーネルを使用することもできます。Flash Attentionを有効にするか、特定の設定(ハードウェア、問題サイズ)で使用可能かどうかを確認するには、[`torch.nn.attention.sdpa_kernel`](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html)をコンテキストマネージャとして使用します。
```diff
import torch
++ from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
@@ -229,7 +230,7 @@ model.to_bettertransformer()
input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
-+ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
++ with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
@@ -421,6 +422,7 @@ In this example, the first GPU will use 1GB of memory and the second 2GB.
```py
import torch
+from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
@@ -434,7 +436,7 @@ model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_c
input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
-with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
+with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
diff --git a/docs/source/zh/fsdp.md b/docs/source/zh/fsdp.md
index a322ec81e52c35..4688b021f7416e 100644
--- a/docs/source/zh/fsdp.md
+++ b/docs/source/zh/fsdp.md
@@ -74,7 +74,7 @@ FSDP 是通过包装网络中的每个层来应用的。通常,包装是以嵌
应该使用 `fsdp_state_dict_type: SHARDED_STATE_DICT` 来保存中间检查点,
因为在排名 0 上保存完整状态字典需要很长时间,通常会导致 `NCCL Timeout` 错误,因为在广播过程中会无限期挂起。
-您可以使用 [`~accelerate.Accelerator.load_state`]` 方法加载分片状态字典以恢复训练。
+您可以使用 [`~accelerate.Accelerator.load_state`] 方法加载分片状态字典以恢复训练。
```py
# 包含检查点的目录
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 5510ac6c8ad512..7df1af049de626 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -404,6 +404,7 @@
"models.dialogpt": [],
"models.dinat": ["DinatConfig"],
"models.dinov2": ["Dinov2Config"],
+ "models.dinov2_with_registers": ["Dinov2WithRegistersConfig"],
"models.distilbert": [
"DistilBertConfig",
"DistilBertTokenizer",
@@ -998,6 +999,7 @@
"EetqConfig",
"FbgemmFp8Config",
"GPTQConfig",
+ "HiggsConfig",
"HqqConfig",
"QuantoConfig",
"TorchAoConfig",
@@ -2159,6 +2161,14 @@
"Dinov2PreTrainedModel",
]
)
+ _import_structure["models.dinov2_with_registers"].extend(
+ [
+ "Dinov2WithRegistersBackbone",
+ "Dinov2WithRegistersForImageClassification",
+ "Dinov2WithRegistersModel",
+ "Dinov2WithRegistersPreTrainedModel",
+ ]
+ )
_import_structure["models.distilbert"].extend(
[
"DistilBertForMaskedLM",
@@ -5361,6 +5371,7 @@
from .models.detr import DetrConfig
from .models.dinat import DinatConfig
from .models.dinov2 import Dinov2Config
+ from .models.dinov2_with_registers import Dinov2WithRegistersConfig
from .models.distilbert import (
DistilBertConfig,
DistilBertTokenizer,
@@ -6023,6 +6034,7 @@
EetqConfig,
FbgemmFp8Config,
GPTQConfig,
+ HiggsConfig,
HqqConfig,
QuantoConfig,
TorchAoConfig,
@@ -7017,6 +7029,12 @@
Dinov2Model,
Dinov2PreTrainedModel,
)
+ from .models.dinov2_with_registers import (
+ Dinov2WithRegistersBackbone,
+ Dinov2WithRegistersForImageClassification,
+ Dinov2WithRegistersModel,
+ Dinov2WithRegistersPreTrainedModel,
+ )
from .models.distilbert import (
DistilBertForMaskedLM,
DistilBertForMultipleChoice,
diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py
index 32c828cd6e5b44..e0149decde3101 100755
--- a/src/transformers/integrations/__init__.py
+++ b/src/transformers/integrations/__init__.py
@@ -63,6 +63,7 @@
"load_dequant_gguf_tensor",
"load_gguf",
],
+ "higgs": ["HiggsLinear", "dequantize_higgs", "quantize_with_higgs", "replace_with_higgs_linear"],
"hqq": ["prepare_for_hqq_linear"],
"integration_utils": [
"INTEGRATION_TO_CALLBACK",
@@ -166,6 +167,7 @@
load_dequant_gguf_tensor,
load_gguf,
)
+ from .higgs import HiggsLinear, dequantize_higgs, quantize_with_higgs, replace_with_higgs_linear
from .hqq import prepare_for_hqq_linear
from .integration_utils import (
INTEGRATION_TO_CALLBACK,
diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py
index 2501261b55e091..b10a3b599174cd 100644
--- a/src/transformers/integrations/bitsandbytes.py
+++ b/src/transformers/integrations/bitsandbytes.py
@@ -363,13 +363,14 @@ def dequantize_bnb_weight(weight: "torch.nn.Parameter", dtype: "torch.dtype", st
if state.SCB is None:
state.SCB = weight.SCB
- im = torch.eye(weight.data.shape[-1]).contiguous().half().to(weight.device)
- im, imt, SCim, SCimt, coo_tensorim = bnb.functional.double_quant(im)
- im, Sim = bnb.functional.transform(im, "col32")
- if state.CxB is None:
- state.CxB, state.SB = bnb.functional.transform(weight.data, to_order=state.formatB)
- out32, Sout32 = bnb.functional.igemmlt(im, state.CxB, Sim, state.SB)
- return bnb.functional.mm_dequant(out32, Sout32, SCim, state.SCB, bias=None).t().to(dtype)
+ if hasattr(bnb.functional, "int8_vectorwise_dequant"):
+ # Use bitsandbytes API if available (requires v0.45.0+)
+ dequantized = bnb.functional.int8_vectorwise_dequant(weight.data, state.SCB)
+ else:
+ # Multiply by (scale/127) to dequantize.
+ dequantized = weight.data * state.SCB.view(-1, 1) * 7.874015718698502e-3
+
+ return dequantized.to(dtype)
def _create_accelerate_new_hook(old_hook):
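The fallback branch in the hunk above simply inverts bitsandbytes' int8 vector-wise quantization: each int8 row is multiplied by its per-row absmax scale divided by 127 (the literal `7.874015718698502e-3` is 1/127). A minimal sketch of that relationship, with made-up tensors standing in for the real `weight.data` and `state.SCB`:

```python
import torch

# Stand-ins for bitsandbytes' int8 weight rows and per-row absmax scales (SCB)
int8_weight = torch.randint(-127, 128, (4, 8), dtype=torch.int8)
SCB = torch.rand(4) * 3.0

# Fallback formula from the diff: weight * SCB.view(-1, 1) * (1 / 127)
dequant_literal = int8_weight * SCB.view(-1, 1) * 7.874015718698502e-3
dequant_explicit = int8_weight.float() * (SCB.view(-1, 1) / 127)

assert torch.allclose(dequant_literal, dequant_explicit)
```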
diff --git a/src/transformers/integrations/flash_attention.py b/src/transformers/integrations/flash_attention.py
index b8407bc29c6a8a..a3ca4bea484d22 100644
--- a/src/transformers/integrations/flash_attention.py
+++ b/src/transformers/integrations/flash_attention.py
@@ -44,6 +44,9 @@ def flash_attention_forward(
else:
target_dtype = next(layer for layer in module.modules() if isinstance(layer, torch.nn.Linear)).weight.dtype
+ # FA2 always relies on the value set in the module, so remove it if present in kwargs to avoid passing it twice
+ kwargs.pop("is_causal", None)
+
attn_output = _flash_attention_forward(
query,
key,
diff --git a/src/transformers/integrations/higgs.py b/src/transformers/integrations/higgs.py
new file mode 100644
index 00000000000000..5a8f6537bb2bd5
--- /dev/null
+++ b/src/transformers/integrations/higgs.py
@@ -0,0 +1,657 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"HIGGS through FLUTE (Flexible Lookup Table Engine for LUT-quantized LLMs) integration file"
+
+from math import sqrt
+
+from ..utils import (
+ is_flute_available,
+ is_hadamard_available,
+ is_torch_available,
+)
+
+
+if is_torch_available():
+ import torch
+ from torch import nn
+
+
+if is_flute_available():
+    import flute.utils
+    from flute.integrations.higgs import prepare_data_transposed
+
+if is_hadamard_available():
+    from fast_hadamard_transform import hadamard_transform
+
+
+def pad_to_block(tensor, dims, had_block_size, value=0):
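+    # Right-pad every dim listed in `dims` up to the next multiple of `had_block_size`,
+    # e.g. a (3, 1000) tensor with dims=[1] and had_block_size=1024 becomes (3, 1024).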
+ pad_dims = [0 for _ in range(2 * len(tensor.shape))]
+ for dim in dims:
+ size = tensor.shape[dim]
+ next_multiple_of_1024 = ((size - 1) // had_block_size + 1) * had_block_size
+ delta = next_multiple_of_1024 - size
+ pad_dims[-2 * dim - 1] = delta
+
+ return nn.functional.pad(tensor, pad_dims, "constant", value)
+
+
+def get_higgs_grid(p: int, n: int):
+ if (p, n) == (2, 256):
+ return torch.tensor(
+ [
+ [-2.501467704772949, 0.17954708635807037],
+ [-0.6761789321899414, 1.2728623151779175],
+ [-1.8025816679000854, 0.7613157629966736],
+ [-0.538287878036499, -2.6028504371643066],
+ [0.8415029644966125, -0.8600977659225464],
+ [0.7023013234138489, 3.3138747215270996],
+ [0.5699077844619751, 2.5782253742218018],
+ [3.292393207550049, -0.6016128063201904],
+ [0.5561617016792297, -1.7723814249038696],
+ [-2.1012380123138428, 0.020958125591278076],
+ [0.46085724234580994, 0.8428705334663391],
+ [1.4548040628433228, -0.6156039237976074],
+ [3.210029363632202, 0.3546904921531677],
+ [0.8893890976905823, -0.5967988967895508],
+ [0.8618854284286499, -3.2061192989349365],
+ [1.1360996961593628, -0.23852407932281494],
+ [1.6646337509155273, -0.9265465140342712],
+ [1.4767773151397705, 1.2476022243499756],
+ [-1.0511897802352905, 1.94503915309906],
+ [-1.56318998336792, -0.3264186680316925],
+ [-0.1829211413860321, 0.2922491431236267],
+ [-0.8950616717338562, -1.3887052536010742],
+ [-0.08206957578659058, -1.329533576965332],
+ [-0.487422913312912, 1.4817842245101929],
+ [-1.6769757270812988, -2.8269758224487305],
+ [-1.5057679414749146, 1.8905963897705078],
+ [1.8335362672805786, 1.0515104532241821],
+ [0.3273945450782776, 1.0491033792495728],
+ [-3.295924186706543, -0.7021600008010864],
+ [-1.8428784608840942, -1.2315762042999268],
+ [-0.8575026392936707, -1.7005949020385742],
+ [-1.120667815208435, 0.6467998027801514],
+ [-0.1588846743106842, -1.804071068763733],
+ [-0.8539647459983826, 0.5645008683204651],
+ [-1.4192019701004028, -0.6175029873847961],
+ [1.0799058675765991, 1.7871345281600952],
+ [1.171311855316162, 0.7511613965034485],
+ [2.162078380584717, 0.8044339418411255],
+ [1.3969420194625854, -1.243762493133545],
+ [-0.23818807303905487, 0.053944624960422516],
+ [2.304199457168579, -1.2667627334594727],
+ [1.4225027561187744, 0.568610668182373],
+ [0.376836895942688, -0.7134661674499512],
+ [2.0404467582702637, 0.4087389409542084],
+ [0.7639489769935608, -1.1367933750152588],
+ [0.3622530400753021, -1.4827953577041626],
+ [0.4100743532180786, 0.36108437180519104],
+ [-1.5867475271224976, -1.618212342262268],
+ [-2.2769672870635986, -1.2132309675216675],
+ [0.9184022545814514, -0.34428009390830994],
+ [-0.3902314603328705, 0.21785245835781097],
+ [3.120687484741211, 1.3077973127365112],
+ [1.587440848350525, -1.6506884098052979],
+ [-1.718808889389038, -0.038405973464250565],
+ [-0.6888407468795776, -0.8402308821678162],
+ [-0.7981445789337158, -1.1117373704910278],
+ [-2.4124443531036377, 1.3419722318649292],
+ [-0.6611530184745789, 0.9939885139465332],
+ [-0.33103418350219727, -0.16702833771705627],
+ [-2.4091389179229736, -2.326857566833496],
+ [1.6610108613967896, -2.159703254699707],
+ [0.014884627424180508, 0.3887578248977661],
+ [0.029668325558304787, 1.8786455392837524],
+ [1.180362582206726, 2.699317216873169],
+ [1.821286678314209, -0.5960053205490112],
+ [-0.44835323095321655, 3.327436685562134],
+ [-0.3714401423931122, -2.1466753482818604],
+ [-1.1103475093841553, -2.4536871910095215],
+ [-0.39110705256462097, 0.6670510172843933],
+ [0.474752813577652, -1.1959707736968994],
+ [-0.013110585510730743, -2.52519154548645],
+ [-2.0836575031280518, -1.703289270401001],
+ [-1.1077687740325928, -0.1252644956111908],
+ [-0.4138077199459076, 1.1837692260742188],
+ [-1.977599024772644, 1.688241720199585],
+ [-1.659559965133667, -2.1387736797332764],
+ [0.03242531046271324, 0.6526556015014648],
+ [0.9127950072288513, 0.6099498867988586],
+ [-0.38478314876556396, 0.433487206697464],
+ [0.27454206347465515, -0.27719801664352417],
+ [0.10388526320457458, 2.2812814712524414],
+ [-0.014394169673323631, -3.177137613296509],
+ [-1.2871228456497192, -0.8961855173110962],
+ [0.5720916986465454, -0.921597957611084],
+ [1.1159656047821045, -0.7609877586364746],
+ [2.4383342266082764, -2.2983546257019043],
+ [-0.294057160615921, -0.9770799875259399],
+ [-0.9342701435089111, 1.107579231262207],
+ [-1.549338698387146, 3.090520143508911],
+ [2.6076579093933105, 2.051239013671875],
+ [-0.9259037375450134, 1.407211184501648],
+ [-0.1747353971004486, 0.540488600730896],
+ [-0.8963701725006104, 0.8271111249923706],
+ [0.6480194926261902, 1.0128909349441528],
+ [0.980783998966217, -0.06156221032142639],
+ [-0.16883476078510284, 1.0601658821105957],
+ [0.5839992761611938, 0.004697148688137531],
+ [-0.34228450059890747, -1.2423977851867676],
+ [2.500824451446533, 0.3665279746055603],
+ [-0.17641609907150269, 1.3529551029205322],
+ [0.05378641560673714, 2.817232847213745],
+ [-1.2391047477722168, 2.354328155517578],
+ [0.630434513092041, -0.668536365032196],
+ [1.7576488256454468, 0.6738647818565369],
+ [0.4435231387615204, 0.6000469326972961],
+ [-0.08794835954904556, -0.11511358618736267],
+ [1.6540337800979614, 0.33995017409324646],
+ [-0.04202975332736969, -0.5375117063522339],
+ [-0.4247745871543884, -0.7897617220878601],
+ [0.06695003807544708, 1.2000739574432373],
+ [-3.2508881092071533, 0.28734830021858215],
+ [-1.613816261291504, 0.4944162368774414],
+ [1.3598989248275757, 0.26117825508117676],
+ [2.308382511138916, 1.3462618589401245],
+ [-1.2137469053268433, -1.9254342317581177],
+ [-0.4889402985572815, 1.8136259317398071],
+ [-0.1870335340499878, -0.3480615019798279],
+ [1.0766386985778809, -1.0627082586288452],
+ [0.4651014506816864, 2.131748914718628],
+ [-0.1306295394897461, -0.7811847925186157],
+ [0.06433182954788208, -1.5397958755493164],
+ [-0.2894323468208313, -0.5789554715156555],
+ [-0.6081662178039551, 0.4845278263092041],
+ [2.697964668273926, -0.18515698611736298],
+ [0.1277363896369934, -0.7221432328224182],
+ [0.8700758218765259, 0.35042452812194824],
+ [0.22088994085788727, 0.495242178440094],
+ [-2.5843818187713623, -0.8000828623771667],
+ [0.6732649803161621, -1.4362232685089111],
+ [-1.5286413431167603, 1.0417330265045166],
+ [-1.1222513914108276, -0.6269875764846802],
+ [-0.9752035140991211, -0.8750635385513306],
+ [-2.6369473934173584, 0.6918523907661438],
+ [0.14478731155395508, -0.041986867785453796],
+ [-1.5629483461380005, 1.4369450807571411],
+ [0.38952457904815674, -2.16428804397583],
+ [-0.16885095834732056, 0.7976621985435486],
+ [-3.12416934967041, 1.256506085395813],
+ [0.6843105554580688, -0.4203019142150879],
+ [1.9345275163650513, 1.934950351715088],
+ [0.012184220366179943, -2.1080918312072754],
+ [-0.6350273489952087, 0.7358828186988831],
+ [-0.837304949760437, -0.6214472651481628],
+ [0.08211923390626907, -0.9472538232803345],
+ [2.9332995414733887, -1.4956780672073364],
+ [1.3806978464126587, -0.2916182279586792],
+ [0.06773144006729126, 0.9285762310028076],
+ [-1.1943119764328003, 1.5963770151138306],
+ [1.6395620107650757, -0.32285431027412415],
+ [-1.390851378440857, -0.08273141086101532],
+ [1.816330909729004, -1.2812227010726929],
+ [0.7921574711799622, -2.1135804653167725],
+ [0.5817914605140686, 1.2644577026367188],
+ [1.929347038269043, -0.2386285960674286],
+ [0.8877345323562622, 1.190008521080017],
+ [1.4732073545455933, 0.8935023546218872],
+ [-2.8518524169921875, -1.5478795766830444],
+ [0.2439267635345459, 0.7576767802238464],
+ [0.5246709585189819, -2.606659412384033],
+ [1.150876760482788, 1.4073830842971802],
+ [-0.2643202245235443, 2.0634236335754395],
+ [1.555483341217041, -0.0023102816194295883],
+ [2.0830578804016113, -1.7225427627563477],
+ [-0.5424830317497253, -1.070199728012085],
+ [0.9168899655342102, 0.8955540060997009],
+ [-0.8120972514152527, 2.696739912033081],
+ [-0.29908373951911926, -1.5310651063919067],
+ [1.2320337295532227, -1.556247353553772],
+ [1.8612544536590576, 0.08704725652933121],
+ [0.22133447229862213, -1.8091708421707153],
+ [-0.4403655230998993, -0.38571012020111084],
+ [-1.88539457321167, 1.192205786705017],
+ [2.239687919616699, 0.004709010478109121],
+ [1.139495611190796, 0.45733731985092163],
+ [-1.507995367050171, 0.19716016948223114],
+ [0.46986445784568787, 1.5422041416168213],
+ [-1.2573751211166382, -0.35984551906585693],
+ [-1.7415345907211304, -0.6020717024803162],
+ [1.0751984119415283, 0.19006384909152985],
+ [2.24186635017395, -0.46343153715133667],
+ [0.3610347509384155, -0.07658443599939346],
+ [-1.3111497163772583, 0.432013601064682],
+ [0.6164408326148987, 0.24538464844226837],
+ [-1.9266542196273804, -0.3256155550479889],
+ [-0.5870336890220642, -0.1879584938287735],
+ [-1.0476511716842651, 0.3677721917629242],
+ [-1.229940414428711, 1.2433830499649048],
+ [0.18550436198711395, 0.22753673791885376],
+ [-0.017921989783644676, 0.12625974416732788],
+ [1.1659504175186157, -0.5020995736122131],
+ [-0.5983408093452454, -1.40438973903656],
+ [0.7519024014472961, -0.16282692551612854],
+ [0.9920787811279297, -1.344896912574768],
+ [-0.8103678226470947, 0.3064485788345337],
+ [0.6956969499588013, 1.8208192586898804],
+ [-2.7830491065979004, -0.2299390584230423],
+ [-0.34681546688079834, 2.4890666007995605],
+ [-1.4452646970748901, -1.2216600179672241],
+ [-2.1872897148132324, 0.8926076292991638],
+ [1.706072211265564, -2.8440372943878174],
+ [1.1119003295898438, -2.4923460483551025],
+ [-2.582794666290283, 2.0973289012908936],
+ [0.04987720400094986, -0.2964983284473419],
+ [-2.063807487487793, -0.7847916483879089],
+ [-0.4068813621997833, 0.9135897755622864],
+ [-0.9814359545707703, -0.3874954879283905],
+ [-1.4227229356765747, 0.7337291240692139],
+ [0.3065044581890106, 1.3125417232513428],
+ [1.2160996198654175, -1.9643305540084839],
+ [-1.2163853645324707, 0.14608727395534515],
+ [-2.3030710220336914, -0.37558120489120483],
+ [0.9232977628707886, 2.1843791007995605],
+ [-0.1989777386188507, 1.651851773262024],
+ [-0.714374840259552, -0.39365994930267334],
+ [-0.7805715799331665, -2.099881887435913],
+ [0.9015759229660034, -1.7053706645965576],
+ [0.1033422127366066, 1.5256654024124146],
+ [-1.8773194551467896, 2.324174165725708],
+ [1.9227174520492554, 2.7441604137420654],
+ [-0.5994020104408264, 0.23984014987945557],
+ [1.3496100902557373, -0.9126054644584656],
+ [-0.8765304088592529, -3.1877026557922363],
+ [-1.2040035724639893, -1.5169521570205688],
+ [1.4261796474456787, 2.150200128555298],
+ [1.463774561882019, 1.6656692028045654],
+ [0.20364105701446533, -0.4988172650337219],
+ [0.5195154547691345, -0.24067887663841248],
+ [-1.1116786003112793, -1.1599653959274292],
+ [-0.8490808606147766, -0.1681060940027237],
+ [0.3189965784549713, -0.9641751646995544],
+ [-0.5664751529693604, -0.5951744318008423],
+ [-1.6347930431365967, -0.9137664437294006],
+ [0.44048091769218445, -0.47259435057640076],
+ [-2.147747039794922, 0.47442489862442017],
+ [1.834734320640564, 1.4462147951126099],
+ [1.1777573823928833, 1.0659226179122925],
+ [-0.9568989872932434, 0.09495053440332413],
+ [-1.838529348373413, 0.2950586676597595],
+ [-0.4800611734390259, 0.014894310384988785],
+ [-0.5235516428947449, -1.7687653303146362],
+ [2.0735011100769043, -0.8825281262397766],
+ [2.637502431869507, 0.8455678224563599],
+ [2.606602907180786, -0.7848446369171143],
+ [-1.1886937618255615, 0.9330510497093201],
+ [0.38082656264305115, 0.13328030705451965],
+ [0.6847941875457764, 0.7384101152420044],
+ [1.2638574838638306, -0.007309418171644211],
+ [0.18292222917079926, -1.22371244430542],
+ [0.8143821954727173, 1.4976691007614136],
+ [0.6571850776672363, 0.48368802666664124],
+ [-0.6991601586341858, 2.150190830230713],
+ [0.8101756572723389, 0.10206498205661774],
+ [-0.08768226951360703, -1.084917664527893],
+ [-0.7208092212677002, 0.03657956421375275],
+ [0.3211449086666107, 1.803687334060669],
+ [-0.7835946083068848, 1.6869111061096191],
+ ]
+ )
+ if (p, n) == (2, 64):
+ return torch.tensor(
+ [
+ [-2.7216711044311523, 0.14431366324424744],
+ [-0.766914427280426, 1.7193410396575928],
+ [-2.2575762271881104, 1.2476624250411987],
+ [1.233758807182312, -2.3560616970062256],
+ [0.8701965808868408, -0.2649352252483368],
+ [1.4506438970565796, 2.1776366233825684],
+ [-0.06305818259716034, 1.9049758911132812],
+ [2.536226511001587, 0.563927412033081],
+ [0.4599496126174927, -1.8745561838150024],
+ [-1.900517225265503, -0.30703988671302795],
+ [0.09386251866817474, 0.8755807280540466],
+ [1.946500539779663, -0.6743080615997314],
+ [2.1338934898376465, 1.4581491947174072],
+ [0.9429940581321716, -0.8038390278816223],
+ [2.0697755813598633, -1.614896535873413],
+ [0.772676408290863, 0.22017823159694672],
+ [1.0689979791641235, -1.525044322013855],
+ [0.6813604831695557, 1.1345642805099487],
+ [0.4706456661224365, 2.606626272201538],
+ [-1.294018030166626, -0.4372096061706543],
+ [-0.09134224057197571, 0.4610418677330017],
+ [-0.7907772064208984, -0.48412787914276123],
+ [0.060459110885858536, -0.9172890186309814],
+ [-0.5855047702789307, 2.56172513961792],
+ [0.11484206467866898, -2.659848213195801],
+ [-1.5893300771713257, 2.188580274581909],
+ [1.6750942468643188, 0.7089915871620178],
+ [-0.445697546005249, 0.7452405095100403],
+ [-1.8539940118789673, -1.8377939462661743],
+ [-1.5791912078857422, -1.017285943031311],
+ [-1.030419945716858, -1.5746369361877441],
+ [-1.9511750936508179, 0.43696075677871704],
+ [-0.3446580767631531, -1.8953213691711426],
+ [-1.4219647645950317, 0.7676230669021606],
+ [-0.9191089272499084, 0.5021472573280334],
+ [0.20464491844177246, 1.3684605360031128],
+ [0.5402919054031372, 0.6699410676956177],
+ [1.8903915882110596, 0.03638288006186485],
+ [0.4723062515258789, -0.6216739416122437],
+ [-0.41345009207725525, -0.22752176225185394],
+ [2.7119064331054688, -0.5111885070800781],
+ [1.065286636352539, 0.6950305700302124],
+ [0.40629103779792786, -0.14339995384216309],
+ [1.2815024852752686, 0.17108257114887238],
+ [0.01785222627222538, -0.43778058886528015],
+ [0.054590027779340744, -1.4225547313690186],
+ [0.3076786696910858, 0.30697619915008545],
+ [-0.9498570561408997, -0.9576997756958008],
+ [-2.4640724658966064, -0.9660449028015137],
+ [1.3714425563812256, -0.39760473370552063],
+ [-0.4857747256755829, 0.2386789172887802],
+ [1.2797833681106567, 1.3097363710403442],
+ [0.5508887767791748, -1.1777795553207397],
+ [-1.384316325187683, 0.1465839296579361],
+ [-0.46556955575942993, -1.2442727088928223],
+ [-0.3915477693080902, -0.7319604158401489],
+ [-1.4005504846572876, 1.3890998363494873],
+ [-0.8647305965423584, 1.0617644786834717],
+ [-0.8901953101158142, -0.01650036871433258],
+ [-0.9893633723258972, -2.4662880897521973],
+ [1.445534110069275, -1.049334168434143],
+ [-0.041650623083114624, 0.012734669260680676],
+ [-0.3302375078201294, 1.26217782497406],
+ [0.6934980154037476, 1.7714335918426514],
+ ]
+ )
+ elif (p, n) == (2, 16):
+ return torch.tensor(
+ [
+ [-0.8996632695198059, -1.6360418796539307],
+ [-0.961183488368988, 1.5999565124511719],
+ [-1.882026195526123, 0.678778350353241],
+ [0.36300793290138245, -1.9667866230010986],
+ [-0.6814072728157043, -0.576818585395813],
+ [0.7270012497901917, 0.6186859607696533],
+ [0.3359416127204895, 1.8371193408966064],
+ [1.859930396080017, 0.036668598651885986],
+ [0.17208248376846313, -0.9401724338531494],
+ [-1.7599700689315796, -0.6244229674339294],
+ [-0.8993809223175049, 0.32267823815345764],
+ [0.839488685131073, -0.3017036020755768],
+ [1.5314953327178955, 1.2942044734954834],
+ [-0.0011779458727687597, 0.00022069070837460458],
+ [1.4274526834487915, -1.207889199256897],
+ [-0.16123905777931213, 0.8787511587142944],
+ ]
+ )
+ elif (p, n) == (1, 16):
+ return torch.tensor(
+ [
+ [-2.7325894832611084],
+ [-2.069017171859741],
+ [-1.6180464029312134],
+ [-1.2562311887741089],
+ [-0.9423404335975647],
+ [-0.6567591428756714],
+ [-0.38804829120635986],
+ [-0.12839503586292267],
+ [0.12839503586292267],
+ [0.38804829120635986],
+ [0.6567591428756714],
+ [0.9423404335975647],
+ [1.2562311887741089],
+ [1.6180464029312134],
+ [2.069017171859741],
+ [2.7325894832611084],
+ ]
+ )
+ elif (p, n) == (1, 8):
+ return torch.tensor(
+ [
+ [-2.1519455909729004],
+ [-1.3439092636108398],
+ [-0.7560052871704102],
+ [-0.2450941801071167],
+ [0.2450941801071167],
+ [0.7560052871704102],
+ [1.3439092636108398],
+ [2.1519455909729004],
+ ]
+ )
+ elif (p, n) == (1, 4):
+ return torch.tensor([[-1.5104175806045532], [-0.4527800381183624], [0.4527800381183624], [1.5104175806045532]])
+ else:
+ raise NotImplementedError(f"Unsupported p={p}, n={n}")
+
+
+def quantize_with_higgs(weight, bits: int = 4, p: int = 2, group_size: int = 256, hadamard_size: int = 1024):
+ assert len(weight.shape) == 2, "Only 2D weights are supported for now"
+
+ grid = get_higgs_grid(p, 2 ** (p * bits)).to(weight.device)
+ grid_norm_2 = torch.linalg.norm(grid, axis=-1) ** 2
+
+ device = weight.device
+ dtype = weight.dtype
+ weight = weight.clone().float()
+ # Pad to Hadamard transform size
+ weight = pad_to_block(weight, [1], hadamard_size)
+
+ # Scale and Hadamard transform
+ mult = weight.shape[1] // hadamard_size
+ weight = weight.reshape(-1, mult, hadamard_size)
+ scales = torch.linalg.norm(weight, axis=-1)
+ weight = hadamard_transform(weight, 1) / scales[:, :, None]
+
+ # Pad to edenn_d and project
+ weight = pad_to_block(weight, [2], p).reshape(weight.shape[0], mult, -1, p)
+
+ # Quantize
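+    # For each p-dimensional sub-vector w, argmax of 2 * <w, grid> - ||grid||^2 picks the nearest
+    # grid point in Euclidean distance (||w - g||^2 = ||w||^2 - 2<w, g> + ||g||^2, with ||w||^2
+    # constant per sub-vector); processing 64 rows at a time bounds peak memory.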
+ codes = torch.empty(weight.shape[:-1], device=device, dtype=torch.uint8)
+ for i in range(0, weight.shape[0], 64):
+ codes[i : i + 64] = torch.argmax(2 * weight[i : i + 64] @ grid.T - grid_norm_2, dim=-1).to(torch.uint8)
+ del weight
+
+ codes = codes.reshape(codes.shape[0], -1)
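+    # `hadamard_transform(..., 1)` above is unnormalized, so fold the missing
+    # 1/sqrt(hadamard_size) factor into the per-group scales here.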
+ scales = scales / sqrt(hadamard_size)
+
+ weight, scales, tables, tables2 = prepare_data_transposed(
+ codes,
+ torch.repeat_interleave(scales.to(dtype), hadamard_size // group_size, dim=1),
+ grid.to(dtype),
+ num_bits=bits,
+ group_size=group_size,
+ vector_size=p,
+ dtype=dtype,
+ device=device,
+ )
+
+ return {
+ "weight": weight,
+ "scales": scales,
+ "tables": tables,
+ "tables2": tables2.view(dtype=torch.float16),
+ }
+
+
+class HiggsLinear(torch.nn.Module):
+ def __init__(
+ self,
+ in_features: int,
+ out_features: int,
+ num_bits: int,
+ bias=True,
+ dtype: torch.dtype = None,
+ device: torch.device = None,
+ group_size: int = 256,
+ hadamard_size: int = 1024,
+ ):
+ super().__init__()
+ self.in_features = in_features
+ self.out_features = out_features
+ self.num_bits = num_bits
+ self.group_size = group_size
+ self.hadamard_size = hadamard_size
+ self.num_sms_packed = nn.Parameter(torch.tensor(-1, dtype=torch.int32, device=device), requires_grad=False)
+
+ assert in_features % group_size == 0
+ assert num_bits in [2, 3, 4]
+
+ self.weight = nn.Parameter(
+ torch.empty((out_features * num_bits // 16, in_features), dtype=torch.int16, device=device),
+ requires_grad=False,
+ )
+ self.scales = nn.Parameter(
+ torch.empty((out_features, in_features // group_size), dtype=dtype, device=device), requires_grad=False
+ )
+ self.tables = nn.Parameter(torch.empty((2**num_bits,), dtype=dtype, device=device), requires_grad=False)
+ self.tables2 = nn.Parameter(
+ torch.empty((2**num_bits, 2**num_bits, 2), dtype=dtype, device=device), requires_grad=False
+ )
+
+ if bias:
+ self.bias = nn.Parameter(torch.empty(out_features, device=device, dtype=dtype), requires_grad=False)
+ else:
+ self.register_parameter("bias", None)
+
+ self.workspace = None # must be set externally to be reused among layers
+
+ def forward(self, x):
+ x = pad_to_block(x, [-1], self.hadamard_size)
+
+ if self.workspace is None:
+            raise RuntimeError("Workspace must be set before calling forward")
+
+ return flute.qgemm_hadamard(
+ x,
+ self.weight,
+ self.scales,
+ self.tables,
+ self.tables2.view(dtype=torch.float32),
+ self.workspace,
+ self.num_bits,
+ self.group_size,
+ self.hadamard_size,
+ )
+
+
+def replace_with_higgs_linear(
+ model,
+ quantization_config=None,
+ current_key_name=None,
+ has_been_replaced=False,
+):
+ """
+ Public method that recursively replaces the Linear layers of the given model with HIGGS quantized layers.
+ `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
+    conversion has been successful or not.
+
+ Args:
+ model (`torch.nn.Module`):
+ The model to convert, can be any `torch.nn.Module` instance.
+ quantization_config (`HiggsConfig`):
+ The quantization config object that contains the quantization parameters.
+ current_key_name (`list`, *optional*):
+ A list that contains the current key name. This is used for recursion and should not be passed by the user.
+ has_been_replaced (`bool`, *optional*):
+ A boolean that indicates if the conversion has been successful or not. This is used for recursion and
+ should not be passed by the user.
+ """
+
+ from accelerate import init_empty_weights
+
+ for name, module in model.named_children():
+ if current_key_name is None:
+ current_key_name = []
+ current_key_name.append(name)
+
+ if isinstance(module, nn.Linear):
+ # Check if the current key is not in the `quantization_config.modules_to_not_convert`
+ current_key_name_str = ".".join(current_key_name)
+ if not any(current_key_name_str.endswith(key) for key in quantization_config.modules_to_not_convert):
+ with init_empty_weights():
+ in_features = module.in_features
+ out_features = module.out_features
+
+ model._modules[name] = HiggsLinear(
+ in_features,
+ out_features,
+ bias=module.bias is not None,
+ num_bits=quantization_config.bits,
+ hadamard_size=quantization_config.hadamard_size,
+ group_size=quantization_config.group_size,
+ )
+ has_been_replaced = True
+
+ # Store the module class in case we need to transpose the weight later
+ model._modules[name].source_cls = type(module)
+ # Force requires grad to False to avoid unexpected errors
+ model._modules[name].requires_grad_(False)
+ if len(list(module.children())) > 0:
+ _, has_been_replaced = replace_with_higgs_linear(
+ module,
+ quantization_config=quantization_config,
+ current_key_name=current_key_name,
+ has_been_replaced=has_been_replaced,
+ )
+ # Remove the last key for recursion
+ current_key_name.pop(-1)
+ return model, has_been_replaced
+
+
+def dequantize_higgs(model, current_key_name=None):
+ """
+ Dequantizes the HiggsLinear layers in the given model by replacing them with standard torch.nn.Linear layers.
+ Args:
+ model (torch.nn.Module): The model containing HiggsLinear layers to be dequantized.
+ current_key_name (list, optional): A list to keep track of the current module names during recursion. Defaults to None.
+ Returns:
+ torch.nn.Module: The model with HiggsLinear layers replaced by torch.nn.Linear layers.
+ """
+
+ with torch.no_grad():
+ for name, module in model.named_children():
+ if current_key_name is None:
+ current_key_name = []
+ current_key_name.append(name)
+
+ if isinstance(module, HiggsLinear):
+ in_features = module.in_features
+ out_features = module.out_features
+
+ model._modules[name] = torch.nn.Linear(
+ in_features,
+ out_features,
+ bias=module.bias is not None,
+ device=module.scales.device,
+ dtype=module.scales.dtype,
+ )
+
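+                # Run the identity matrix through the quantized layer to materialize its
+                # effective weight: the output is eye(in_features) @ W_dequant.T, so the
+                # transpose recovers W_dequant for the replacement nn.Linear.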
+ model._modules[name].weight.data = module(
+ torch.eye(in_features, device=module.scales.device, dtype=module.scales.dtype)
+ ).T.contiguous()
+
+ if len(list(module.children())) > 0:
+ _ = dequantize_higgs(
+ module,
+ current_key_name=current_key_name,
+ )
+ # Remove the last key for recursion
+ current_key_name.pop(-1)
+ return model
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index a6d4a1cc5b54ed..ead3f1a03717dd 100755
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -5050,18 +5050,6 @@ def warn_if_padding_and_no_attention_mask(self, input_ids, attention_mask):
logger.warning_once(warn_string)
- @property
- def _is_quantized_training_enabled(self):
- warnings.warn(
- "`_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead",
- FutureWarning,
- )
-
- if not hasattr(self, "hf_quantizer"):
- return False
-
- return self.hf_quantizer.is_trainable
-
@property
def supports_tp_plan(self):
"""
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 7fcaddde704cf7..ff03d09966a4d6 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -77,6 +77,7 @@
dialogpt,
dinat,
dinov2,
+ dinov2_with_registers,
distilbert,
dit,
donut,
diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py
index 6481d6f3c434c7..b96697bc0779e6 100644
--- a/src/transformers/models/aria/modeling_aria.py
+++ b/src/transformers/models/aria/modeling_aria.py
@@ -1012,7 +1012,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
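The same replacement of `0.0 in attention_mask` with `(attention_mask == 0.0).any()` recurs in several modeling files below. Both forms are equivalent in eager mode; the explicit comparison-plus-reduction avoids the Python `in` protocol on a tensor, which is awkward under tracing and compilation. A tiny sketch of the equivalence:

```python
import torch

attention_mask = torch.tensor([[1.0, 1.0, 0.0, 0.0]])

old_check = 0.0 in attention_mask          # Python membership test via Tensor.__contains__
new_check = (attention_mask == 0.0).any()  # explicit elementwise compare + reduction

assert old_check == bool(new_check)
```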
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 69ce8efa10c76c..6c052aa0eaa0f3 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -94,6 +94,7 @@
("detr", "DetrConfig"),
("dinat", "DinatConfig"),
("dinov2", "Dinov2Config"),
+ ("dinov2_with_registers", "Dinov2WithRegistersConfig"),
("distilbert", "DistilBertConfig"),
("donut-swin", "DonutSwinConfig"),
("dpr", "DPRConfig"),
@@ -404,6 +405,7 @@
("dialogpt", "DialoGPT"),
("dinat", "DiNAT"),
("dinov2", "DINOv2"),
+ ("dinov2_with_registers", "DINOv2 with Registers"),
("distilbert", "DistilBERT"),
("dit", "DiT"),
("donut-swin", "DonutSwin"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index e8a2dece432476..861754f591769b 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -92,6 +92,7 @@
("detr", "DetrModel"),
("dinat", "DinatModel"),
("dinov2", "Dinov2Model"),
+ ("dinov2_with_registers", "Dinov2WithRegistersModel"),
("distilbert", "DistilBertModel"),
("donut-swin", "DonutSwinModel"),
("dpr", "DPRQuestionEncoder"),
@@ -584,6 +585,7 @@
("detr", "DetrModel"),
("dinat", "DinatModel"),
("dinov2", "Dinov2Model"),
+ ("dinov2_with_registers", "Dinov2WithRegistersModel"),
("dpt", "DPTModel"),
("efficientformer", "EfficientFormerModel"),
("efficientnet", "EfficientNetModel"),
@@ -659,6 +661,7 @@
),
("dinat", "DinatForImageClassification"),
("dinov2", "Dinov2ForImageClassification"),
+ ("dinov2_with_registers", "Dinov2WithRegistersForImageClassification"),
(
"efficientformer",
(
@@ -1373,6 +1376,7 @@
("convnextv2", "ConvNextV2Backbone"),
("dinat", "DinatBackbone"),
("dinov2", "Dinov2Backbone"),
+ ("dinov2_with_registers", "Dinov2WithRegistersBackbone"),
("focalnet", "FocalNetBackbone"),
("hiera", "HieraBackbone"),
("maskformer-swin", "MaskFormerSwinBackbone"),
diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py
index 086f8ce03c62fc..9d7325c502d6b7 100644
--- a/src/transformers/models/bloom/modeling_bloom.py
+++ b/src/transformers/models/bloom/modeling_bloom.py
@@ -740,7 +740,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py
index 11bc411a00c005..90a02dd5bb9fee 100644
--- a/src/transformers/models/chameleon/modeling_chameleon.py
+++ b/src/transformers/models/chameleon/modeling_chameleon.py
@@ -1385,7 +1385,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py
index 60849c2efb74d5..3d88fc1929c30b 100644
--- a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py
+++ b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py
@@ -149,7 +149,7 @@ def convert_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_pa
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
- parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
+ parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to OpenAI checkpoint")
parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
args = parser.parse_args()
diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py
index 616c93a46e4f4a..5c8f1b3957ab38 100644
--- a/src/transformers/models/codegen/modeling_codegen.py
+++ b/src/transformers/models/codegen/modeling_codegen.py
@@ -583,7 +583,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py
index 7b8b9547ac1c33..a65d3ee64a234a 100644
--- a/src/transformers/models/cohere/modeling_cohere.py
+++ b/src/transformers/models/cohere/modeling_cohere.py
@@ -910,7 +910,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py
index 0d2c4297e0d473..3f2e7c384d7d63 100644
--- a/src/transformers/models/dbrx/modeling_dbrx.py
+++ b/src/transformers/models/dbrx/modeling_dbrx.py
@@ -1111,7 +1111,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py
index 60fea55d87be5d..683b683008f2da 100755
--- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py
+++ b/src/transformers/models/decision_transformer/modeling_decision_transformer.py
@@ -285,9 +285,9 @@ def forward(
shape_q = (*query_states.shape[:-1], -1, self.head_dim)
shape_kv = (*key_states.shape[:-1], -1, self.head_dim)
- query_states = query_states.reshape(shape_q).transpose(1, 2)
- key_states = key_states.reshape(shape_kv).transpose(1, 2)
- value_states = value_states.reshape(shape_kv).transpose(1, 2)
+ query_states = query_states.view(shape_q).transpose(1, 2)
+ key_states = key_states.view(shape_kv).transpose(1, 2)
+ value_states = value_states.view(shape_kv).transpose(1, 2)
if layer_past is not None:
past_key, past_value = layer_past
diff --git a/src/transformers/models/dinov2_with_registers/__init__.py b/src/transformers/models/dinov2_with_registers/__init__.py
new file mode 100644
index 00000000000000..2d10027b6a3b63
--- /dev/null
+++ b/src/transformers/models/dinov2_with_registers/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_dinov2_with_registers import *
+ from .modeling_dinov2_with_registers import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py
new file mode 100644
index 00000000000000..80c095cb464838
--- /dev/null
+++ b/src/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py
@@ -0,0 +1,166 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_dinov2_with_registers.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...configuration_utils import PretrainedConfig
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig):
+ r"""
+    This is the configuration class to store the configuration of a [`Dinov2WithRegistersModel`]. It is used to instantiate a
+ Dinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the DINOv2 with Registers
+ [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ mlp_ratio (`int`, *optional*, defaults to 4):
+ Ratio of the hidden size of the MLPs relative to the `hidden_size`.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+ The epsilon used by the layer normalization layers.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 16):
+ The size (resolution) of each patch.
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ qkv_bias (`bool`, *optional*, defaults to `True`):
+ Whether to add a bias to the queries, keys and values.
+ layerscale_value (`float`, *optional*, defaults to 1.0):
+ Initial value to use for layer scale.
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
+ Stochastic depth rate per sample (when applied in the main path of residual layers).
+ use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
+ Whether to use the SwiGLU feedforward neural network.
+ num_register_tokens (`int`, *optional*, defaults to 4):
+ Number of register tokens to use.
+ interpolate_antialias (`bool`, *optional*, defaults to `True`):
+ Whether to use antialiasing when interpolating the image patches.
+ interpolate_offset (`float`, *optional*, defaults to 0.0):
+ Offset to use when interpolating the image patches.
+ out_features (`List[str]`, *optional*):
+ If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+ (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+ corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+ same order as defined in the `stage_names` attribute.
+ out_indices (`List[int]`, *optional*):
+ If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+ many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+ If unset and `out_features` is unset, will default to the last stage. Must be in the
+ same order as defined in the `stage_names` attribute.
+ apply_layernorm (`bool`, *optional*, defaults to `True`):
+ Whether to apply layer normalization to the feature maps in case the model is used as backbone.
+ reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+ Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
+ case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
+ seq_len, hidden_size)`.
+
+ Example:
+
+ ```python
+ >>> from transformers import Dinov2WithRegistersConfig, Dinov2WithRegistersModel
+
+ >>> # Initializing a Dinov2WithRegisters base style configuration
+ >>> configuration = Dinov2WithRegistersConfig()
+
+ >>> # Initializing a model (with random weights) from the base style configuration
+ >>> model = Dinov2WithRegistersModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+    model_type = "dinov2_with_registers"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ mlp_ratio=4,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.0,
+ attention_probs_dropout_prob=0.0,
+ initializer_range=0.02,
+ layer_norm_eps=1e-6,
+ image_size=224,
+ patch_size=16,
+ num_channels=3,
+ qkv_bias=True,
+ layerscale_value=1.0,
+ drop_path_rate=0.0,
+ use_swiglu_ffn=False,
+ num_register_tokens=4,
+ interpolate_antialias=True,
+ interpolate_offset=0.0,
+ out_features=None,
+ out_indices=None,
+ apply_layernorm=True,
+ reshape_hidden_states=True,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.mlp_ratio = mlp_ratio
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.qkv_bias = qkv_bias
+ self.layerscale_value = layerscale_value
+ self.drop_path_rate = drop_path_rate
+ self.use_swiglu_ffn = use_swiglu_ffn
+ self.num_register_tokens = num_register_tokens
+ self.interpolate_antialias = interpolate_antialias
+ self.interpolate_offset = interpolate_offset
+ self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
+ self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+ out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+ )
+ self.apply_layernorm = apply_layernorm
+ self.reshape_hidden_states = reshape_hidden_states
+
+
+__all__ = ["Dinov2WithRegistersConfig"]
diff --git a/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py b/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py
new file mode 100644
index 00000000000000..0ff2697f74667e
--- /dev/null
+++ b/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py
@@ -0,0 +1,291 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert DINOv2 with Registers checkpoints from the original repository.
+
+URL: https://github.com/facebookresearch/dinov2/tree/main
+"""
+
+import argparse
+import json
+from pathlib import Path
+
+import requests
+import torch
+import torch.nn as nn
+from huggingface_hub import hf_hub_download
+from PIL import Image
+from torchvision import transforms
+
+from transformers import (
+ BitImageProcessor,
+ Dinov2WithRegistersConfig,
+ Dinov2WithRegistersForImageClassification,
+ Dinov2WithRegistersModel,
+)
+from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling
+from transformers.utils import logging
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+
+def get_dinov2_with_registers_config(model_name, image_classifier=False):
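+    # the released DINOv2-with-registers checkpoints are trained at 518x518 resolution with 14x14 patches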
+ config = Dinov2WithRegistersConfig(image_size=518, patch_size=14)
+
+ # size of the architecture
+ if "vits" in model_name:
+ config.hidden_size = 384
+ config.num_attention_heads = 6
+ elif "vitb" in model_name:
+ pass
+ elif "vitl" in model_name:
+ config.hidden_size = 1024
+ config.num_hidden_layers = 24
+ config.num_attention_heads = 16
+ elif "vitg" in model_name:
+ config.use_swiglu_ffn = True
+ config.hidden_size = 1536
+ config.num_hidden_layers = 40
+ config.num_attention_heads = 24
+ else:
+ raise ValueError("Model not supported")
+
+ if image_classifier:
+ repo_id = "huggingface/label-files"
+ filename = "imagenet-1k-id2label.json"
+ config.num_labels = 1000
+ config.id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
+ config.id2label = {int(k): v for k, v in config.id2label.items()}
+
+ return config
+
+
+def create_rename_keys(config):
+ rename_keys = []
+ # fmt: off
+
+ # patch embedding layer
+ rename_keys.append(("cls_token", "embeddings.cls_token"))
+ rename_keys.append(("mask_token", "embeddings.mask_token"))
+ rename_keys.append(("pos_embed", "embeddings.position_embeddings"))
+ rename_keys.append(("register_tokens", "embeddings.register_tokens"))
+ rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight"))
+ rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias"))
+
+ for i in range(config.num_hidden_layers):
+ # layernorms
+ rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight"))
+ rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias"))
+ rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight"))
+ rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias"))
+ # MLP
+ if config.use_swiglu_ffn:
+ rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight"))
+ rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias"))
+ rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight"))
+ rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias"))
+ else:
+ rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight"))
+ rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias"))
+ rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight"))
+ rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias"))
+ # layerscale
+ rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1"))
+ rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1"))
+ # attention projection layer
+ rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight"))
+ rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias"))
+
+ # final layernorm
+ rename_keys.append(("norm.weight", "layernorm.weight"))
+ rename_keys.append(("norm.bias", "layernorm.bias"))
+
+ # fmt: on
+ return rename_keys
+
+
+def rename_key(dct, old, new):
+ val = dct.pop(old)
+ dct[new] = val
+
+
+# we split up the matrix of each encoder layer into queries, keys and values
+def read_in_q_k_v(state_dict, config):
+ for i in range(config.num_hidden_layers):
+ # read in weights + bias of input projection layer (in timm, this is a single matrix + bias)
+ in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight")
+ in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias")
+ # next, add query, keys and values (in that order) to the state dict
+ state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :]
+ state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size]
+ state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
+ config.hidden_size : config.hidden_size * 2, :
+ ]
+ state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[
+ config.hidden_size : config.hidden_size * 2
+ ]
+ state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :]
+ state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :]
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+ return image
+
+
+@torch.no_grad()
+def convert_dinov2_with_registers_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False):
+ """
+ Copy/paste/tweak model's weights to our Dinov2WithRegisters structure.
+ """
+
+ # define default Dinov2WithRegisters configuration
+ image_classifier = "1layer" in model_name
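+    # model names ending in "_1layer" refer to checkpoints that ship an ImageNet-1k linear classification head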
+ config = get_dinov2_with_registers_config(model_name, image_classifier=image_classifier)
+
+ # load original model from torch hub
+ original_model = torch.hub.load("facebookresearch/dinov2", model_name.replace("_1layer", ""))
+ original_model.eval()
+
+ # load state_dict of original model, remove and rename some keys
+ state_dict = original_model.state_dict()
+ rename_keys = create_rename_keys(config)
+ for src, dest in rename_keys:
+ rename_key(state_dict, src, dest)
+ read_in_q_k_v(state_dict, config)
+
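+    # the original SwiGLU projection names (w12, w3) map to the HF module names (weights_in, weights_out)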
+ for key, val in state_dict.copy().items():
+ val = state_dict.pop(key)
+ if "w12" in key:
+ key = key.replace("w12", "weights_in")
+ if "w3" in key:
+ key = key.replace("w3", "weights_out")
+ state_dict[key] = val
+
+ # load HuggingFace model
+ if image_classifier:
+ model = Dinov2WithRegistersForImageClassification(config).eval()
+ model.dinov2_with_registers.load_state_dict(state_dict)
+ model_name_to_classifier_dict_url = {
+ "dinov2_vits14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_reg4_linear_head.pth",
+ "dinov2_vitb14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_reg4_linear_head.pth",
+ "dinov2_vitl14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_reg4_linear_head.pth",
+ "dinov2_vitg14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_reg4_linear_head.pth",
+ }
+ url = model_name_to_classifier_dict_url[model_name]
+ classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
+ model.classifier.weight = nn.Parameter(classifier_state_dict["weight"])
+ model.classifier.bias = nn.Parameter(classifier_state_dict["bias"])
+ else:
+ model = Dinov2WithRegistersModel(config).eval()
+ model.load_state_dict(state_dict)
+
+ # load image
+ image = prepare_img()
+
+ # preprocess image
+ transformations = transforms.Compose(
+ [
+ transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
+ transforms.CenterCrop(224),
+ transforms.ToTensor(),
+ transforms.Normalize(
+ mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values
+ std=IMAGENET_DEFAULT_STD, # across a large photo dataset.
+ ),
+ ]
+ )
+
+ original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension
+
+ processor = BitImageProcessor(
+ size={"shortest_edge": 256},
+ resample=PILImageResampling.BICUBIC,
+ image_mean=IMAGENET_DEFAULT_MEAN,
+ image_std=IMAGENET_DEFAULT_STD,
+ )
+ pixel_values = processor(image, return_tensors="pt").pixel_values
+
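+    # sanity check: the HF image processor must reproduce the torchvision preprocessing exactly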
+ assert torch.allclose(original_pixel_values, pixel_values)
+
+ with torch.no_grad():
+ outputs = model(pixel_values, output_hidden_states=True)
+ original_outputs = original_model(pixel_values)
+
+ # assert values
+ if image_classifier:
+ print("Predicted class:")
+ class_idx = outputs.logits.argmax(-1).item()
+ print(model.config.id2label[class_idx])
+ else:
+ assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape
+ assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3)
+ print("Looks ok!")
+
+ if pytorch_dump_folder_path is not None:
+ Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
+ print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
+ model.save_pretrained(pytorch_dump_folder_path)
+ print(f"Saving image processor to {pytorch_dump_folder_path}")
+ processor.save_pretrained(pytorch_dump_folder_path)
+
+ if push_to_hub:
+ model_name_to_hf_name = {
+ "dinov2_vits14_reg": "dinov2-with-registers-small",
+ "dinov2_vitb14_reg": "dinov2-with-registers-base",
+ "dinov2_vitl14_reg": "dinov2-with-registers-large",
+ "dinov2_vitg14_reg": "dinov2-with-registers-giant",
+ "dinov2_vits14_reg_1layer": "dinov2-with-registers-small-imagenet1k-1-layer",
+ "dinov2_vitb14_reg_1layer": "dinov2-with-registers-base-imagenet1k-1-layer",
+ "dinov2_vitl14_reg_1layer": "dinov2-with-registers-large-imagenet1k-1-layer",
+ "dinov2_vitg14_reg_1layer": "dinov2-with-registers-giant-imagenet1k-1-layer",
+ }
+
+ name = model_name_to_hf_name[model_name]
+ model.push_to_hub(f"nielsr/{name}")
+ processor.push_to_hub(f"nielsr/{name}")
+
+
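+# Example invocation (the output folder is an arbitrary local path):
+#   python convert_dinov2_with_registers_to_hf.py \
+#       --model_name dinov2_vits14_reg --pytorch_dump_folder_path ./dinov2_vits14_reg_hf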
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ # Required parameters
+ parser.add_argument(
+ "--model_name",
+ default="dinov2_vits14_reg",
+ type=str,
+ choices=[
+ "dinov2_vits14_reg",
+ "dinov2_vitb14_reg",
+ "dinov2_vitl14_reg",
+ "dinov2_vitg14_reg",
+ "dinov2_vits14_reg_1layer",
+ "dinov2_vitb14_reg_1layer",
+ "dinov2_vitl14_reg_1layer",
+ "dinov2_vitg14_reg_1layer",
+ ],
+ help="Name of the model you'd like to convert.",
+ )
+ parser.add_argument(
+ "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
+ )
+ parser.add_argument(
+ "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+ )
+
+ args = parser.parse_args()
+ convert_dinov2_with_registers_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
diff --git a/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
new file mode 100644
index 00000000000000..4ebefa8bded12b
--- /dev/null
+++ b/src/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
@@ -0,0 +1,926 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_dinov2_with_registers.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections.abc
+import math
+from typing import Dict, List, Optional, Set, Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from ...utils.backbone_utils import BackboneMixin
+from .configuration_dinov2_with_registers import Dinov2WithRegistersConfig
+
+
+logger = logging.get_logger(__name__)
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "facebook/dinov2_with_registers-base"
+
+# General docstring
+_CONFIG_FOR_DOC = "Dinov2WithRegistersConfig"
+
+
+class Dinov2WithRegistersPatchEmbeddings(nn.Module):
+ """
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+ Transformer.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ image_size, patch_size = config.image_size, config.patch_size
+ num_channels, hidden_size = config.num_channels, config.hidden_size
+
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.num_patches = num_patches
+
+ self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+ num_channels = pixel_values.shape[1]
+ if num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+ f" Expected {self.num_channels} but got {num_channels}."
+ )
+ embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
+ return embeddings
+
+
+class Dinov2WithRegistersEmbeddings(nn.Module):
+ """
+ Construct the CLS token, mask token, register tokens, position and patch embeddings.
+ """
+
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+ super().__init__()
+
+ self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+ self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
+ self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size))
+ self.patch_embeddings = Dinov2WithRegistersPatchEmbeddings(config)
+ num_patches = self.patch_embeddings.num_patches
+ self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.config = config
+
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+ """
+        This method interpolates the pre-trained position encodings so that the model can be used on
+        higher-resolution images.
+
+ Source:
+ https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
+ """
+
+ num_patches = embeddings.shape[1] - 1
+ num_positions = self.position_embeddings.shape[1] - 1
+ if num_patches == num_positions and height == width:
+ return self.position_embeddings
+ class_pos_embed = self.position_embeddings[:, 0]
+ patch_pos_embed = self.position_embeddings[:, 1:]
+ dim = embeddings.shape[-1]
+ height = height // self.config.patch_size
+ width = width // self.config.patch_size
+ # we add a small number to avoid floating point error in the interpolation
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
+ height, width = height + self.config.interpolate_offset, width + self.config.interpolate_offset
+ patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+ target_dtype = patch_pos_embed.dtype
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed.to(dtype=torch.float32),
+ scale_factor=(float(height / math.sqrt(num_positions)), float(width / math.sqrt(num_positions))),
+ mode="bicubic",
+ align_corners=False,
+ antialias=self.config.interpolate_antialias,
+ )
+ patch_pos_embed = patch_pos_embed.to(dtype=target_dtype)
+ if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]:
+ raise ValueError("Width or height does not match with the interpolated position embeddings")
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
+
+ def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
+ batch_size, _, height, width = pixel_values.shape
+ target_dtype = self.patch_embeddings.projection.weight.dtype
+ embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+
+ if bool_masked_pos is not None:
+ embeddings = torch.where(
+ bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
+ )
+
+ # add the [CLS] token to the embedded patch tokens
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+ embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+ # add positional encoding to each token
+ embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+ # add register tokens
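+        # they are inserted between the [CLS] token and the patch tokens, after the position
+        # embeddings have been added, so the register tokens carry no positional information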
+ embeddings = torch.cat(
+ (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1
+ )
+
+ embeddings = self.dropout(embeddings)
+
+ return embeddings
+
+
+class Dinov2WithRegistersSelfAttention(nn.Module):
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
+ f"heads {config.num_attention_heads}."
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
+ mixed_query_layer = self.query(hidden_states)
+
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ return outputs
+
+
+class Dinov2WithRegistersSdpaSelfAttention(Dinov2WithRegistersSelfAttention):
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+ super().__init__(config)
+ self.attention_probs_dropout_prob = config.attention_probs_dropout_prob
+
+ def forward(
+ self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
+ if output_attentions:
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+ logger.warning_once(
+ "Dinov2WithRegistersModel is using Dinov2WithRegistersSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ return super().forward(
+ hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions
+ )
+
+ mixed_query_layer = self.query(hidden_states)
+
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ context_layer = torch.nn.functional.scaled_dot_product_attention(
+ query_layer,
+ key_layer,
+ value_layer,
+ head_mask,
+ self.attention_probs_dropout_prob if self.training else 0.0,
+ is_causal=False,
+ scale=None,
+ )
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(new_context_layer_shape)
+
+ return context_layer, None
+
+
+class Dinov2WithRegistersSelfOutput(nn.Module):
+ """
+ The residual connection is defined in Dinov2WithRegistersLayer instead of here (as is the case with other models), due to the
+ layernorm applied before each block.
+ """
+
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ return hidden_states
+
+
+class Dinov2WithRegistersAttention(nn.Module):
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+ super().__init__()
+ self.attention = Dinov2WithRegistersSelfAttention(config)
+ self.output = Dinov2WithRegistersSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads: Set[int]) -> None:
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.attention.query = prune_linear_layer(self.attention.query, index)
+ self.attention.key = prune_linear_layer(self.attention.key, index)
+ self.attention.value = prune_linear_layer(self.attention.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+ self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
+ self_outputs = self.attention(hidden_states, head_mask, output_attentions)
+
+ attention_output = self.output(self_outputs[0], hidden_states)
+
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+class Dinov2WithRegistersSdpaAttention(Dinov2WithRegistersAttention):
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+ super().__init__(config)
+ self.attention = Dinov2WithRegistersSdpaSelfAttention(config)
+
+
+class Dinov2WithRegistersLayerScale(nn.Module):
+ def __init__(self, config) -> None:
+ super().__init__()
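+        # learnable per-channel scale applied to the residual branch, initialized to config.layerscale_value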
+ self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size))
+
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+ return hidden_state * self.lambda1
+
+
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+ """
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+ argument.
+ """
+ if drop_prob == 0.0 or not training:
+ return input
+ keep_prob = 1 - drop_prob
+ shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+ random_tensor.floor_() # binarize
+ output = input.div(keep_prob) * random_tensor
+ return output
+
+
+class Dinov2WithRegistersDropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
+ super().__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ return drop_path(hidden_states, self.drop_prob, self.training)
+
+ def extra_repr(self) -> str:
+ return "p={}".format(self.drop_prob)
+
+
+class Dinov2WithRegistersMLP(nn.Module):
+ def __init__(self, config) -> None:
+ super().__init__()
+ in_features = out_features = config.hidden_size
+ hidden_features = int(config.hidden_size * config.mlp_ratio)
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
+ if isinstance(config.hidden_act, str):
+ self.activation = ACT2FN[config.hidden_act]
+ else:
+ self.activation = config.hidden_act
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=True)
+
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+ hidden_state = self.fc1(hidden_state)
+ hidden_state = self.activation(hidden_state)
+ hidden_state = self.fc2(hidden_state)
+ return hidden_state
+
+
+class Dinov2WithRegistersSwiGLUFFN(nn.Module):
+ def __init__(self, config) -> None:
+ super().__init__()
+ in_features = out_features = config.hidden_size
+ hidden_features = int(config.hidden_size * config.mlp_ratio)
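+        # SwiGLU uses two input projections, so the hidden dim is scaled by 2/3 and rounded up to a multiple of 8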
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+
+ self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
+ self.weights_out = nn.Linear(hidden_features, out_features, bias=True)
+
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+ hidden_state = self.weights_in(hidden_state)
+ x1, x2 = hidden_state.chunk(2, dim=-1)
+ hidden = nn.functional.silu(x1) * x2
+ return self.weights_out(hidden)
+
+
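+# mapping from config._attn_implementation to the attention class used in each layer (see Dinov2WithRegistersLayer)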
+DINOV2_WITH_REGISTERS_ATTENTION_CLASSES = {
+ "eager": Dinov2WithRegistersAttention,
+ "sdpa": Dinov2WithRegistersSdpaAttention,
+}
+
+
+class Dinov2WithRegistersLayer(nn.Module):
+ """This corresponds to the Block class in the original implementation."""
+
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+ super().__init__()
+
+ self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.attention = DINOV2_WITH_REGISTERS_ATTENTION_CLASSES[config._attn_implementation](config)
+ self.layer_scale1 = Dinov2WithRegistersLayerScale(config)
+ self.drop_path = (
+ Dinov2WithRegistersDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+ )
+
+ self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ if config.use_swiglu_ffn:
+ self.mlp = Dinov2WithRegistersSwiGLUFFN(config)
+ else:
+ self.mlp = Dinov2WithRegistersMLP(config)
+ self.layer_scale2 = Dinov2WithRegistersLayerScale(config)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
+ self_attention_outputs = self.attention(
+ self.norm1(hidden_states), # in Dinov2WithRegisters, layernorm is applied before self-attention
+ head_mask,
+ output_attentions=output_attentions,
+ )
+ attention_output = self_attention_outputs[0]
+
+ attention_output = self.layer_scale1(attention_output)
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ # first residual connection
+ hidden_states = self.drop_path(attention_output) + hidden_states
+
+ # in Dinov2WithRegisters, layernorm is also applied after self-attention
+ layer_output = self.norm2(hidden_states)
+ layer_output = self.mlp(layer_output)
+ layer_output = self.layer_scale2(layer_output)
+
+ # second residual connection
+ layer_output = self.drop_path(layer_output) + hidden_states
+
+ outputs = (layer_output,) + outputs
+
+ return outputs
+
+
+class Dinov2WithRegistersEncoder(nn.Module):
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList([Dinov2WithRegistersLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ output_hidden_states: bool = False,
+ return_dict: bool = True,
+ ) -> Union[tuple, BaseModelOutput]:
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ layer_module.__call__,
+ hidden_states,
+ layer_head_mask,
+ output_attentions,
+ )
+ else:
+ layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+class Dinov2WithRegistersPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = Dinov2WithRegistersConfig
+ base_model_prefix = "dinov2_with_registers"
+ main_input_name = "pixel_values"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["Dinov2WithRegistersSwiGLUFFN"]
+ _supports_sdpa = True
+
+ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
+ # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+ # `trunc_normal_cpu` not implemented in `half` issues
+ module.weight.data = nn.init.trunc_normal_(
+ module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+ ).to(module.weight.dtype)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, Dinov2WithRegistersEmbeddings):
+ module.position_embeddings.data = nn.init.trunc_normal_(
+ module.position_embeddings.data.to(torch.float32),
+ mean=0.0,
+ std=self.config.initializer_range,
+ ).to(module.position_embeddings.dtype)
+
+ module.cls_token.data = nn.init.trunc_normal_(
+ module.cls_token.data.to(torch.float32),
+ mean=0.0,
+ std=self.config.initializer_range,
+ ).to(module.cls_token.dtype)
+
+
+_EXPECTED_OUTPUT_SHAPE = [1, 257, 768]
+
+
+DINOV2_WITH_REGISTERS_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`Dinov2WithRegistersConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DINOV2_WITH_REGISTERS_BASE_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`BitImageProcessor.preprocess`] for details.
+
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
+ pre-training.
+
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Dinov2WithRegisters Model transformer outputting raw hidden-states without any specific head on top.",
+ DINOV2_WITH_REGISTERS_START_DOCSTRING,
+)
+class Dinov2WithRegistersModel(Dinov2WithRegistersPreTrainedModel):
+ def __init__(self, config: Dinov2WithRegistersConfig):
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = Dinov2WithRegistersEmbeddings(config)
+ self.encoder = Dinov2WithRegistersEncoder(config)
+
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings:
+ return self.embeddings.patch_embeddings
+
+ def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(DINOV2_WITH_REGISTERS_BASE_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPooling,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ bool_masked_pos: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = encoder_outputs[0]
+ sequence_output = self.layernorm(sequence_output)
+ pooled_output = sequence_output[:, 0, :]
+
+ if not return_dict:
+ head_outputs = (sequence_output, pooled_output)
+ return head_outputs + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "facebook/dinov2_with_registers-small-imagenet1k-1-layer"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
+
+DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`BitImageProcessor.preprocess`] for details.
+
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ """
+ Dinov2WithRegisters Model transformer with an image classification head on top (a linear layer on top of the final hidden state
+ of the [CLS] token) e.g. for ImageNet.
+ """,
+ DINOV2_WITH_REGISTERS_START_DOCSTRING,
+)
+class Dinov2WithRegistersForImageClassification(Dinov2WithRegistersPreTrainedModel):
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.dinov2_with_registers = Dinov2WithRegistersModel(config)
+
+ # Classifier head
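+        # the head consumes the [CLS] token concatenated with the mean of the remaining tokens, hence 2 * hidden_size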
+ self.classifier = (
+ nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity()
+ )
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=ImageClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, ImageClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.dinov2_with_registers(
+ pixel_values,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0] # batch_size, sequence_length, hidden_size
+
+ cls_token = sequence_output[:, 0]
+ patch_tokens = sequence_output[:, 1:]
+
+ linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
+
+ logits = self.classifier(linear_input)
+
+ loss = None
+ if labels is not None:
+ # move labels to correct device to enable model parallelism
+ labels = labels.to(logits.device)
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return ImageClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ Dinov2WithRegisters backbone, to be used with frameworks like DETR and MaskFormer.
+ """,
+ DINOV2_WITH_REGISTERS_START_DOCSTRING,
+)
+class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMixin):
+ def __init__(self, config):
+ super().__init__(config)
+ super()._init_backbone(config)
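+        # ViT-style backbone: every stage ("stem" + one per encoder layer) shares the same feature dimension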
+ self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
+ self.embeddings = Dinov2WithRegistersEmbeddings(config)
+ self.encoder = Dinov2WithRegistersEncoder(config)
+
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ self.num_register_tokens = config.num_register_tokens
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings:
+ return self.embeddings.patch_embeddings
+
+ @add_start_docstrings_to_model_forward(DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.Tensor,
+ output_hidden_states: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> BackboneOutput:
+ """
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, AutoBackbone
+ >>> import torch
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
+ >>> model = AutoBackbone.from_pretrained(
+ ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"]
+ ... )
+
+ >>> inputs = processor(image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> feature_maps = outputs.feature_maps
+ >>> list(feature_maps[-1].shape)
+ [1, 768, 16, 16]
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+ embedding_output = self.embeddings(pixel_values)
+
+ outputs = self.encoder(
+ embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict
+ )
+
+ hidden_states = outputs.hidden_states if return_dict else outputs[1]
+
+ feature_maps = ()
+ for stage, hidden_state in zip(self.stage_names, hidden_states):
+ if stage in self.out_features:
+ if self.config.apply_layernorm:
+ hidden_state = self.layernorm(hidden_state)
+ if self.config.reshape_hidden_states:
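+                    # drop the [CLS] token and the register tokens, keeping only the patch tokens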
+ hidden_state = hidden_state[:, self.num_register_tokens + 1 :]
+                    # this was actually a bug in the original implementation that we copied here,
+                    # because normally the order is height, width
+ batch_size, _, height, width = pixel_values.shape
+ patch_size = self.config.patch_size
+ hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
+ hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+ feature_maps += (hidden_state,)
+
+ if not return_dict:
+ if output_hidden_states:
+ output = (feature_maps,) + outputs[1:]
+ else:
+ output = (feature_maps,) + outputs[2:]
+ return output
+
+ return BackboneOutput(
+ feature_maps=feature_maps,
+ hidden_states=outputs.hidden_states if output_hidden_states else None,
+ attentions=outputs.attentions if output_attentions else None,
+ )
+
+
+__all__ = [
+ "Dinov2WithRegistersPreTrainedModel",
+ "Dinov2WithRegistersModel",
+ "Dinov2WithRegistersForImageClassification",
+ "Dinov2WithRegistersBackbone",
+]
diff --git a/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py b/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py
new file mode 100644
index 00000000000000..bbfacd2b5f571d
--- /dev/null
+++ b/src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py
@@ -0,0 +1,381 @@
+# coding=utf-8
+# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Optional
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ....transformers.models.dinov2.modeling_dinov2 import (
+ Dinov2Backbone,
+ Dinov2Encoder,
+ Dinov2ForImageClassification,
+ Dinov2Model,
+ Dinov2PatchEmbeddings,
+ Dinov2PreTrainedModel,
+)
+from ...configuration_utils import PretrainedConfig
+from ...modeling_outputs import BackboneOutput
+from ...utils import logging
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig):
+ r"""
+    This is the configuration class to store the configuration of a [`Dinov2WithRegistersModel`]. It is used to instantiate a
+ Dinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the DINOv2 with Registers
+ [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ mlp_ratio (`int`, *optional*, defaults to 4):
+ Ratio of the hidden size of the MLPs relative to the `hidden_size`.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+ The epsilon used by the layer normalization layers.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 16):
+ The size (resolution) of each patch.
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ qkv_bias (`bool`, *optional*, defaults to `True`):
+ Whether to add a bias to the queries, keys and values.
+ layerscale_value (`float`, *optional*, defaults to 1.0):
+ Initial value to use for layer scale.
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
+ Stochastic depth rate per sample (when applied in the main path of residual layers).
+ use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
+ Whether to use the SwiGLU feedforward neural network.
+ num_register_tokens (`int`, *optional*, defaults to 4):
+ Number of register tokens to use.
+ interpolate_antialias (`bool`, *optional*, defaults to `True`):
+ Whether to use antialiasing when interpolating the image patches.
+ interpolate_offset (`float`, *optional*, defaults to 0.0):
+ Offset to use when interpolating the image patches.
+ out_features (`List[str]`, *optional*):
+ If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+ (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+ corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+ same order as defined in the `stage_names` attribute.
+ out_indices (`List[int]`, *optional*):
+ If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+ many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+ If unset and `out_features` is unset, will default to the last stage. Must be in the
+ same order as defined in the `stage_names` attribute.
+ apply_layernorm (`bool`, *optional*, defaults to `True`):
+ Whether to apply layer normalization to the feature maps in case the model is used as backbone.
+ reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+ Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
+ case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
+ seq_len, hidden_size)`.
+
+ Example:
+
+ ```python
+ >>> from transformers import Dinov2WithRegistersConfig, Dinov2WithRegistersModel
+
+ >>> # Initializing a Dinov2WithRegisters base style configuration
+ >>> configuration = Dinov2WithRegistersConfig()
+
+ >>> # Initializing a model (with random weights) from the base style configuration
+ >>> model = Dinov2WithRegistersModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "dinov2-with-registers-base"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ mlp_ratio=4,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.0,
+ attention_probs_dropout_prob=0.0,
+ initializer_range=0.02,
+ layer_norm_eps=1e-6,
+ image_size=224,
+ patch_size=16,
+ num_channels=3,
+ qkv_bias=True,
+ layerscale_value=1.0,
+ drop_path_rate=0.0,
+ use_swiglu_ffn=False,
+ num_register_tokens=4,
+ interpolate_antialias=True,
+ interpolate_offset=0.0,
+ out_features=None,
+ out_indices=None,
+ apply_layernorm=True,
+ reshape_hidden_states=True,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.mlp_ratio = mlp_ratio
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.qkv_bias = qkv_bias
+ self.layerscale_value = layerscale_value
+ self.drop_path_rate = drop_path_rate
+ self.use_swiglu_ffn = use_swiglu_ffn
+ self.num_register_tokens = num_register_tokens
+ self.interpolate_antialias = interpolate_antialias
+ self.interpolate_offset = interpolate_offset
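+        # "stem" corresponds to the patch embedding output; "stage{i}" to the output of encoder layer i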
+ self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
+ self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+ out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+ )
+ self.apply_layernorm = apply_layernorm
+ self.reshape_hidden_states = reshape_hidden_states
+
+
+class Dinov2WithRegistersPatchEmbeddings(Dinov2PatchEmbeddings):
+ pass
+
+
+class Dinov2WithRegistersEmbeddings(nn.Module):
+ """
+ Construct the CLS token, mask token, register tokens, position and patch embeddings.
+ """
+
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+ super().__init__()
+
+ self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+ self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
+ self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size))
+ self.patch_embeddings = Dinov2WithRegistersPatchEmbeddings(config)
+ num_patches = self.patch_embeddings.num_patches
+ self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.config = config
+
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+ """
+        This method interpolates the pre-trained position encodings so that the model can be used on higher-resolution
+        images.
+
+ Source:
+ https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
+ """
+
+ num_patches = embeddings.shape[1] - 1
+ num_positions = self.position_embeddings.shape[1] - 1
+ if num_patches == num_positions and height == width:
+ return self.position_embeddings
+ class_pos_embed = self.position_embeddings[:, 0]
+ patch_pos_embed = self.position_embeddings[:, 1:]
+ dim = embeddings.shape[-1]
+ height = height // self.config.patch_size
+ width = width // self.config.patch_size
+ # we add a small number to avoid floating point error in the interpolation
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
+ height, width = height + self.config.interpolate_offset, width + self.config.interpolate_offset
+ patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+ target_dtype = patch_pos_embed.dtype
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed.to(dtype=torch.float32),
+ scale_factor=(float(height / math.sqrt(num_positions)), float(width / math.sqrt(num_positions))),
+ mode="bicubic",
+ align_corners=False,
+ antialias=self.config.interpolate_antialias,
+ )
+ patch_pos_embed = patch_pos_embed.to(dtype=target_dtype)
+ if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]:
+            raise ValueError("Width or height does not match the interpolated position embeddings")
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
+
+ def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
+ batch_size, _, height, width = pixel_values.shape
+ target_dtype = self.patch_embeddings.projection.weight.dtype
+ embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+
+ if bool_masked_pos is not None:
+ embeddings = torch.where(
+ bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
+ )
+
+ # add the [CLS] token to the embedded patch tokens
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+ embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+ # add positional encoding to each token
+ embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+ # add register tokens
+ embeddings = torch.cat(
+ (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1
+ )
+
+ embeddings = self.dropout(embeddings)
+
+ return embeddings
+
+
+class Dinov2WithRegistersEncoder(Dinov2Encoder):
+ pass
+
+
+class Dinov2WithRegistersPreTrainedModel(Dinov2PreTrainedModel):
+ pass
+
+
+class Dinov2WithRegistersModel(Dinov2Model):
+ pass
+
+
+class Dinov2WithRegistersForImageClassification(Dinov2ForImageClassification):
+ pass
+
+
+class Dinov2WithRegistersBackbone(Dinov2Backbone):
+ def __init__(self, config):
+ super().__init__(config)
+ super()._init_backbone(config)
+
+ self.num_register_tokens = config.num_register_tokens
+ self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
+ self.embeddings = Dinov2WithRegistersEmbeddings(config)
+ self.encoder = Dinov2WithRegistersEncoder(config)
+
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings:
+ return self.embeddings.patch_embeddings
+
+ def forward(
+ self,
+ pixel_values: torch.Tensor,
+ output_hidden_states: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> BackboneOutput:
+ """
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, AutoBackbone
+ >>> import torch
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
+ >>> model = AutoBackbone.from_pretrained(
+ ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"]
+ ... )
+
+ >>> inputs = processor(image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+ >>> feature_maps = outputs.feature_maps
+ >>> list(feature_maps[-1].shape)
+ [1, 768, 16, 16]
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+ embedding_output = self.embeddings(pixel_values)
+
+ outputs = self.encoder(
+ embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict
+ )
+
+ hidden_states = outputs.hidden_states if return_dict else outputs[1]
+
+ feature_maps = ()
+ for stage, hidden_state in zip(self.stage_names, hidden_states):
+ if stage in self.out_features:
+ if self.config.apply_layernorm:
+ hidden_state = self.layernorm(hidden_state)
+ if self.config.reshape_hidden_states:
+ hidden_state = hidden_state[:, self.num_register_tokens + 1 :]
+ # this was actually a bug in the original implementation that we copied here,
+                    # because normally the order is height, width
+ batch_size, _, height, width = pixel_values.shape
+ patch_size = self.config.patch_size
+ hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
+ hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+ feature_maps += (hidden_state,)
+
+ if not return_dict:
+ if output_hidden_states:
+ output = (feature_maps,) + outputs[1:]
+ else:
+ output = (feature_maps,) + outputs[2:]
+ return output
+
+ return BackboneOutput(
+ feature_maps=feature_maps,
+ hidden_states=outputs.hidden_states if output_hidden_states else None,
+ attentions=outputs.attentions if output_attentions else None,
+ )
+
+
+__all__ = [
+ "Dinov2WithRegistersConfig",
+ "Dinov2WithRegistersPreTrainedModel",
+ "Dinov2WithRegistersModel",
+ "Dinov2WithRegistersForImageClassification",
+ "Dinov2WithRegistersBackbone",
+]
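
The backbone above keeps the `[CLS]` and register tokens in the transformer sequence and only strips them when building spatial feature maps, which is why the forward pass slices off `num_register_tokens + 1` leading tokens before reshaping. A minimal sketch with toy shapes (not the real module, just the token layout and the indexing):

```python
import torch

# Toy shapes: one image, 4 register tokens, a 2x2 patch grid, hidden size 8.
batch_size, num_register_tokens, hidden_size, grid = 1, 4, 8, 2

cls_token = torch.zeros(batch_size, 1, hidden_size)
register_tokens = torch.zeros(batch_size, num_register_tokens, hidden_size)
patch_tokens = torch.randn(batch_size, grid * grid, hidden_size)

# Sequence layout produced by the embeddings module: [CLS, reg_1..reg_4, patch_1..patch_4]
hidden_state = torch.cat([cls_token, register_tokens, patch_tokens], dim=1)
print(hidden_state.shape)  # torch.Size([1, 9, 8])

# The backbone drops CLS + register tokens, then reshapes to (batch, hidden, height, width)
feature_tokens = hidden_state[:, num_register_tokens + 1 :]
feature_map = feature_tokens.reshape(batch_size, grid, grid, hidden_size).permute(0, 3, 1, 2).contiguous()
print(feature_map.shape)  # torch.Size([1, 8, 2, 2])
```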
diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py
index e2ea12b03fe434..71cd6b6158ca0b 100644
--- a/src/transformers/models/gemma/modeling_gemma.py
+++ b/src/transformers/models/gemma/modeling_gemma.py
@@ -633,7 +633,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
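
The `_update_causal_mask` tweak here (and repeated in most of the files below) replaces the Python `in` operator with an explicit elementwise comparison. On a tensor the two expressions return the same boolean; a quick sanity check (the motivation for preferring the explicit form, e.g. better behaviour under tracing/compilation, is my assumption — the diff only shows the rewrite):

```python
import torch

# A 2D padding mask where 0.0 marks a padded position
attention_mask = torch.ones(2, 5)
attention_mask[0, 4] = 0.0

print(0.0 in attention_mask)                 # True, via Tensor.__contains__
print((attention_mask == 0.0).any().item())  # True, explicit elementwise check

# With no padding anywhere, both report False and the flash-attention path returns None
full_mask = torch.ones(2, 5)
print(0.0 in full_mask, (full_mask == 0.0).any().item())  # False False
```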
diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py
index 95ad0d9719951d..706847650b818e 100644
--- a/src/transformers/models/glm/modeling_glm.py
+++ b/src/transformers/models/glm/modeling_glm.py
@@ -644,7 +644,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index ad53c7804ebeea..854c21576b5048 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -295,9 +295,9 @@ def forward(
shape_q = (*query_states.shape[:-1], -1, self.head_dim)
shape_kv = (*key_states.shape[:-1], -1, self.head_dim)
- query_states = query_states.reshape(shape_q).transpose(1, 2)
- key_states = key_states.reshape(shape_kv).transpose(1, 2)
- value_states = value_states.reshape(shape_kv).transpose(1, 2)
+ query_states = query_states.view(shape_q).transpose(1, 2)
+ key_states = key_states.view(shape_kv).transpose(1, 2)
+ value_states = value_states.view(shape_kv).transpose(1, 2)
if layer_past is not None:
past_key, past_value = layer_past
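
For the GPT-2 change just above: on a contiguous tensor `Tensor.view` and `Tensor.reshape` are interchangeable and copy-free, but `view` raises if the requested shape is not stride-compatible, while `reshape` silently falls back to a copy. A small sketch of that difference (the reason for preferring `view` here is my reading; the diff itself only shows the swap):

```python
import torch

hidden = torch.randn(2, 3, 12)            # [batch, seq, hidden]
shape_q = (*hidden.shape[:-1], 4, 3)      # split hidden into 4 heads of head_dim 3

# Contiguous input: view reinterprets the storage without copying
heads = hidden.view(shape_q).transpose(1, 2)   # [batch, heads, seq, head_dim]
print(heads.shape)                             # torch.Size([2, 4, 3, 3])

# After the transpose the layout is no longer stride-compatible with [2, 3, 12]:
# view refuses (no silent copy), reshape quietly materializes one.
try:
    heads.view(2, 3, 12)
except RuntimeError:
    print("view: incompatible strides, a copy would be required")
print(heads.reshape(2, 3, 12).shape)           # torch.Size([2, 3, 12]), via a copy
```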
diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
index ef23b5d208fd79..4e41c80d69f22e 100755
--- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -792,7 +792,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
index 7152d72f5b7fc8..98418cb02d65ba 100755
--- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
@@ -148,6 +148,7 @@ def flash_attention_forward(
norm_factor,
attention_dropout,
training,
+ position_ids=None,
target_dtype=None,
**_kwargs,
):
@@ -173,6 +174,7 @@ def flash_attention_forward(
attention_mask,
query_length,
dropout=attention_dropout,
+ position_ids=position_ids,
softmax_scale=norm_factor,
is_causal=True,
use_top_left_mask=flash_attn_uses_top_left_mask,
@@ -353,6 +355,7 @@ def forward(
key,
value,
attention_mask=attention_mask,
+ position_ids=position_ids,
head_mask=head_mask,
norm_factor=self.norm_factor,
attention_dropout=self.config.attention_dropout,
@@ -931,7 +934,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
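
The GPT-NeoX hunks above thread `position_ids` through to the flash-attention helper. As I read it, this is what enables padding-free (packed) batches, where several sequences are concatenated into one row and the position counter restarts at each boundary, so the helper can recover sequence boundaries without an attention mask. A toy illustration of such packed `position_ids` (the packing rationale is an assumption — the diff itself only adds the plumbing):

```python
import torch

# Two sequences of lengths 3 and 5 packed into a single row without padding
lengths = [3, 5]
position_ids = torch.cat([torch.arange(n) for n in lengths]).unsqueeze(0)
print(position_ids)  # tensor([[0, 1, 2, 0, 1, 2, 3, 4]])

# Sequence starts are wherever the position counter resets to 0
starts = (position_ids[0] == 0).nonzero().flatten()
print(starts.tolist())  # [0, 3]
```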
diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
index 71602f01e7d6f8..fba67ae03a5979 100755
--- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
@@ -667,7 +667,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py
index 4af8f73b5f5eea..00749b7eb07fbc 100644
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -891,7 +891,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py
index 2e045e149d95de..7e758947b6dd8a 100644
--- a/src/transformers/models/granite/modeling_granite.py
+++ b/src/transformers/models/granite/modeling_granite.py
@@ -646,7 +646,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py
index b2ffbcbc695696..e6b9682b5ae803 100644
--- a/src/transformers/models/idefics/modeling_idefics.py
+++ b/src/transformers/models/idefics/modeling_idefics.py
@@ -1362,7 +1362,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py
index 872f5206f20175..7ca5829e2063d8 100644
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -283,45 +283,53 @@ def __call__(
image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
inputs.update(image_inputs)
- if text is not None:
- if n_images_in_images != n_images_in_text:
- raise ValueError(
- f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
- )
-
- image_rows = inputs.pop("rows", [[0] * len(text)])
- image_cols = inputs.pop("cols", [[0] * len(text)])
-
- fake_image_token = self.fake_image_token.content
- image_token = self.image_token.content
- global_img_token = self.global_image_tag
-
- prompt_strings = []
- for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
- # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
- image_prompt_strings = []
- for n_rows, n_cols in zip(sample_rows, sample_cols):
- image_prompt_string = get_image_prompt_string(
- n_rows,
- n_cols,
- image_seq_len,
- image_token=image_token,
- fake_token_around_image=fake_image_token,
- global_img_token=global_img_token,
+ if text is not None:
+ if n_images_in_images != n_images_in_text:
+ raise ValueError(
+ f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
)
- image_prompt_strings.append(image_prompt_string)
- split_sample = sample.split(image_token)
- if len(split_sample) == 0:
- raise ValueError("The image token should be present in the text.")
+ image_rows = inputs.pop("rows", [[0] * len(text)])
+ image_cols = inputs.pop("cols", [[0] * len(text)])
+
+ fake_image_token = self.fake_image_token.content
+ image_token = self.image_token.content
+ global_img_token = self.global_image_tag
+
+ prompt_strings = []
+ for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
+ # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
+ image_prompt_strings = []
+ for n_rows, n_cols in zip(sample_rows, sample_cols):
+ image_prompt_string = get_image_prompt_string(
+ n_rows,
+ n_cols,
+ image_seq_len,
+ image_token=image_token,
+ fake_token_around_image=fake_image_token,
+ global_img_token=global_img_token,
+ )
+ image_prompt_strings.append(image_prompt_string)
- # Place in the image prompt strings where the image tokens are
- sample = split_sample[0]
- for i, image_prompt_string in enumerate(image_prompt_strings):
- sample += image_prompt_string + split_sample[i + 1]
- prompt_strings.append(sample)
+ split_sample = sample.split(image_token)
+ if len(split_sample) == 0:
+ raise ValueError("The image token should be present in the text.")
- text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"])
+ # Place in the image prompt strings where the image tokens are
+ sample = split_sample[0]
+ for i, image_prompt_string in enumerate(image_prompt_strings):
+ sample += image_prompt_string + split_sample[i + 1]
+ prompt_strings.append(sample)
+
+ text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"])
+ inputs.update(text_inputs)
+
+ elif text is not None:
+ if any(n_images_in_text):
+ raise ValueError(
+ f"Found {sum(n_images_in_text)} {self.image_token.content} tokens in the text but no images were passed."
+ )
+ text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
inputs.update(text_inputs)
return inputs
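
The Idefics3 processor block above splits each text sample on the image token and interleaves the expanded per-image prompt strings back in. A stripped-down sketch of just that interleaving step (the real `get_image_prompt_string` also handles row/column tiling and the global image tag; the tokens below are placeholders, not the checkpoint's actual special tokens):

```python
image_token = "<image>"
sample = f"User: {image_token} and {image_token} - what changed between the two photos?"

# Pretend each image expands to three image tokens wrapped in fake tokens
image_prompt_strings = ["<fake_token>" + image_token * 3 + "<fake_token>"] * 2

split_sample = sample.split(image_token)
prompt = split_sample[0]
for i, image_prompt_string in enumerate(image_prompt_strings):
    prompt += image_prompt_string + split_sample[i + 1]

print(prompt)
# User: <fake_token><image><image><image><fake_token> and <fake_token><image><image><image><fake_token> - what changed ...
```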
diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py
index 7b7fd5a90d69ed..a2a86fd4c22f4a 100644
--- a/src/transformers/models/jetmoe/modeling_jetmoe.py
+++ b/src/transformers/models/jetmoe/modeling_jetmoe.py
@@ -1126,7 +1126,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
index 5be33c26414cd7..df46e15bce0009 100644
--- a/src/transformers/models/llama/modeling_llama.py
+++ b/src/transformers/models/llama/modeling_llama.py
@@ -632,7 +632,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py
index 29536d9ad6f284..15958e772c90eb 100644
--- a/src/transformers/models/longt5/modeling_longt5.py
+++ b/src/transformers/models/longt5/modeling_longt5.py
@@ -1600,7 +1600,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/mask2former/configuration_mask2former.py b/src/transformers/models/mask2former/configuration_mask2former.py
index 5126b3f73cdebd..a01c161e69bb1a 100644
--- a/src/transformers/models/mask2former/configuration_mask2former.py
+++ b/src/transformers/models/mask2former/configuration_mask2former.py
@@ -171,7 +171,7 @@ def __init__(
logger.info("`backbone_config` is `None`. Initializing the config with the default `Swin` backbone.")
backbone_config = CONFIG_MAPPING["swin"](
image_size=224,
- in_channels=3,
+ num_channels=3,
patch_size=4,
embed_dim=96,
depths=[2, 2, 18, 2],
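
This hunk and the MaskFormer/OneFormer ones below rename the default Swin backbone argument from `in_channels` to `num_channels`, which is the name `SwinConfig` actually exposes. As far as I can tell the old keyword never raised, because unrecognized kwargs are simply stored as extra attributes on the config while `num_channels` kept its default, which would explain why the typo went unnoticed (my reading; the diff gives no rationale). A quick check:

```python
from transformers import SwinConfig

# `num_channels` is the documented argument for the input channel count
config = SwinConfig(image_size=224, num_channels=3, patch_size=4, embed_dim=96)
print(config.num_channels)  # 3

# The old spelling is swallowed as an extra attribute instead of raising
legacy = SwinConfig(image_size=224, in_channels=3, patch_size=4, embed_dim=96)
print(getattr(legacy, "in_channels", None), legacy.num_channels)  # 3 3
```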
diff --git a/src/transformers/models/maskformer/configuration_maskformer.py b/src/transformers/models/maskformer/configuration_maskformer.py
index d28ef6ca76d295..0adf968eb4a19f 100644
--- a/src/transformers/models/maskformer/configuration_maskformer.py
+++ b/src/transformers/models/maskformer/configuration_maskformer.py
@@ -131,7 +131,7 @@ def __init__(
# fall back to https://huggingface.co/microsoft/swin-base-patch4-window12-384-in22k
backbone_config = SwinConfig(
image_size=384,
- in_channels=3,
+ num_channels=3,
patch_size=4,
embed_dim=128,
depths=[2, 2, 18, 2],
diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py
index 3e0c4d7a5123a7..6523ab6812179c 100644
--- a/src/transformers/models/mllama/modeling_mllama.py
+++ b/src/transformers/models/mllama/modeling_mllama.py
@@ -1076,7 +1076,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/modernbert/modeling_modernbert.py b/src/transformers/models/modernbert/modeling_modernbert.py
index 237fba6f645fa5..167ccd155805fb 100644
--- a/src/transformers/models/modernbert/modeling_modernbert.py
+++ b/src/transformers/models/modernbert/modeling_modernbert.py
@@ -307,7 +307,7 @@ def eager_attention_forward(
dim: int,
output_attentions: Optional[bool] = False,
**_kwargs,
-) -> Tuple[torch.Tensor, torch.Tensor] | Tuple[torch.Tensor]:
+) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
# qkv: [batch_size, seqlen, 3, nheads, headdim]
cos, sin = module.rotary_emb(qkv, position_ids=position_ids)
query, key, value = qkv.transpose(3, 1).unbind(dim=2)
diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py
index dac356146f3015..4424e8b2fead5d 100644
--- a/src/transformers/models/modernbert/modular_modernbert.py
+++ b/src/transformers/models/modernbert/modular_modernbert.py
@@ -532,7 +532,7 @@ def eager_attention_forward(
dim: int,
output_attentions: Optional[bool] = False,
**_kwargs,
-) -> Tuple[torch.Tensor, torch.Tensor] | Tuple[torch.Tensor]:
+) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
# qkv: [batch_size, seqlen, 3, nheads, headdim]
cos, sin = module.rotary_emb(qkv, position_ids=position_ids)
query, key, value = qkv.transpose(3, 1).unbind(dim=2)
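
The ModernBERT annotation fix replaces the `X | Y` union syntax with `typing.Union`. The `|` operator on `typing` generics only exists from Python 3.10 onward (PEP 604), so evaluating the old annotation at import time fails on older interpreters; `Union[...]` is the version-agnostic spelling. A tiny illustration with a hypothetical stub (not the real `eager_attention_forward`):

```python
from typing import Tuple, Union

import torch

# On Python < 3.10, `Tuple[...] | Tuple[...]` in a signature raises TypeError when the
# annotation is evaluated; Union[...] works on every supported version.
def attention_stub(
    scores: torch.Tensor, output_attentions: bool = False
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
    probs = scores.softmax(dim=-1)
    return (probs, probs) if output_attentions else (probs,)

print(len(attention_stub(torch.randn(2, 4, 4), output_attentions=True)))  # 2
```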
diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py
index 659a84c5fe3784..e4017536017f43 100644
--- a/src/transformers/models/mt5/modeling_mt5.py
+++ b/src/transformers/models/mt5/modeling_mt5.py
@@ -1192,7 +1192,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py
index a0a10bdc6f3550..75618f1c7e00c7 100644
--- a/src/transformers/models/nemotron/modeling_nemotron.py
+++ b/src/transformers/models/nemotron/modeling_nemotron.py
@@ -878,7 +878,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py
index 11d3d99f4f72c9..39bfa726deeedf 100644
--- a/src/transformers/models/olmo/modeling_olmo.py
+++ b/src/transformers/models/olmo/modeling_olmo.py
@@ -608,7 +608,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/olmo2/modeling_olmo2.py b/src/transformers/models/olmo2/modeling_olmo2.py
index 49ae798e7f1101..89b5f4abe1c39c 100644
--- a/src/transformers/models/olmo2/modeling_olmo2.py
+++ b/src/transformers/models/olmo2/modeling_olmo2.py
@@ -609,7 +609,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py
index 86f56a1f571b94..d16831013f1360 100644
--- a/src/transformers/models/oneformer/configuration_oneformer.py
+++ b/src/transformers/models/oneformer/configuration_oneformer.py
@@ -201,7 +201,7 @@ def __init__(
logger.info("`backbone_config` is unset. Initializing the config with the default `Swin` backbone.")
backbone_config = CONFIG_MAPPING["swin"](
image_size=224,
- in_channels=3,
+ num_channels=3,
patch_size=4,
embed_dim=96,
depths=[2, 2, 6, 2],
diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py
index d773396010a3cb..7b631a77fcdda3 100644
--- a/src/transformers/models/owlv2/modeling_owlv2.py
+++ b/src/transformers/models/owlv2/modeling_owlv2.py
@@ -33,6 +33,7 @@
is_vision_available,
logging,
replace_return_docstrings,
+ torch_int,
)
from .configuration_owlv2 import Owlv2Config, Owlv2TextConfig, Owlv2VisionConfig
@@ -274,6 +275,7 @@ def to_tuple(self) -> Tuple[Any]:
class Owlv2VisionEmbeddings(nn.Module):
def __init__(self, config: Owlv2VisionConfig):
super().__init__()
+ self.patch_size = config.patch_size
self.config = config
self.embed_dim = config.hidden_size
self.class_embedding = nn.Parameter(torch.randn(config.hidden_size))
@@ -291,15 +293,59 @@ def __init__(self, config: Owlv2VisionConfig):
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
- def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
- batch_size = pixel_values.shape[0]
+ # Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings.interpolate_pos_encoding
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+ """
+        This method interpolates the pre-trained position encodings so that the model can be used on higher-resolution
+        images. It is also adapted to support torch.jit tracing.
+
+ Adapted from:
+ - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+ - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+ """
+
+ num_patches = embeddings.shape[1] - 1
+ position_embedding = self.position_embedding.weight.unsqueeze(0)
+ num_positions = position_embedding.shape[1] - 1
+
+ # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+ if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+ return self.position_embedding(self.position_ids)
+
+ class_pos_embed = position_embedding[:, :1]
+ patch_pos_embed = position_embedding[:, 1:]
+
+ dim = embeddings.shape[-1]
+
+ new_height = height // self.patch_size
+ new_width = width // self.patch_size
+
+ sqrt_num_positions = torch_int(num_positions**0.5)
+ patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed,
+ size=(new_height, new_width),
+ mode="bicubic",
+ align_corners=False,
+ )
+
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+ return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+ def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+ batch_size, _, height, width = pixel_values.shape
patch_embeds = self.patch_embedding(pixel_values) # shape = [batch_size, num_channels, height, width]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
- embeddings = embeddings + self.position_embedding(self.position_ids)
-
+ if interpolate_pos_encoding:
+ embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+ else:
+ embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
@@ -610,6 +656,8 @@ def _init_weights(self, module):
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
+        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@@ -635,6 +683,8 @@ def _init_weights(self, module):
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
+        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
return_base_image_embeds (`bool`, *optional*):
Whether or not to return the base image embeddings.
return_dict (`bool`, *optional*):
@@ -657,6 +707,8 @@ def _init_weights(self, module):
output_hidden_states (`bool`, *optional*):
Whether or not to return the last hidden state. See `text_model_last_hidden_state` and
`vision_model_last_hidden_state` under returned tensors for more detail.
+        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@@ -673,6 +725,8 @@ def _init_weights(self, module):
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
+        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@@ -914,6 +968,7 @@ def forward(
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: Optional[bool] = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
@@ -929,7 +984,7 @@ def forward(
expected_input_dtype = self.embeddings.patch_embedding.weight.dtype
pixel_values = pixel_values.to(expected_input_dtype)
- hidden_states = self.embeddings(pixel_values)
+ hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
hidden_states = self.pre_layernorm(hidden_states)
encoder_outputs = self.encoder(
@@ -976,6 +1031,7 @@ def forward(
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
@@ -1002,6 +1058,7 @@ def forward(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@@ -1084,6 +1141,7 @@ def get_image_features(
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
@@ -1115,6 +1173,7 @@ def get_image_features(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@@ -1133,6 +1192,7 @@ def forward(
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
return_base_image_embeds: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Owlv2Output]:
@@ -1165,6 +1225,7 @@ def forward(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@@ -1295,21 +1356,23 @@ def __init__(self, config: Owlv2Config):
self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
self.sigmoid = nn.Sigmoid()
-
- self.sqrt_num_patches = config.vision_config.image_size // config.vision_config.patch_size
- self.box_bias = self.compute_box_bias(self.sqrt_num_patches)
+ self.config = config
+ self.num_patches_height = self.config.vision_config.image_size // self.config.vision_config.patch_size
+ self.num_patches_width = self.config.vision_config.image_size // self.config.vision_config.patch_size
+ self.box_bias = self.compute_box_bias(self.num_patches_height, self.num_patches_width)
@staticmethod
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.normalize_grid_corner_coordinates
- def normalize_grid_corner_coordinates(num_patches: int) -> torch.Tensor:
+ def normalize_grid_corner_coordinates(num_patches_height: int, num_patches_width: int) -> torch.Tensor:
# Create grid coordinates using torch
- x_coordinates = torch.arange(1, num_patches + 1, dtype=torch.float32)
- y_coordinates = torch.arange(1, num_patches + 1, dtype=torch.float32)
+ x_coordinates = torch.arange(1, num_patches_width + 1, dtype=torch.float32)
+ y_coordinates = torch.arange(1, num_patches_height + 1, dtype=torch.float32)
xx, yy = torch.meshgrid(x_coordinates, y_coordinates, indexing="xy")
- # Stack the coordinates and divide by num_patches
+ # Stack the coordinates and divide by their respective patch counts
box_coordinates = torch.stack((xx, yy), dim=-1)
- box_coordinates /= num_patches
+ box_coordinates[..., 0] /= num_patches_width
+ box_coordinates[..., 1] /= num_patches_height
# Flatten (h, w, 2) -> (h*w, 2)
box_coordinates = box_coordinates.view(-1, 2)
@@ -1332,18 +1395,22 @@ def objectness_predictor(self, image_features: torch.FloatTensor) -> torch.Float
@lru_cache(maxsize=2)
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.compute_box_bias
- def compute_box_bias(self, num_patches: int, feature_map: Optional[torch.FloatTensor] = None) -> torch.Tensor:
+ def compute_box_bias(
+ self, num_patches_height: int, num_patches_width: int, feature_map: Optional[torch.FloatTensor] = None
+ ) -> torch.Tensor:
if feature_map is not None:
raise ValueError("feature_map has been deprecated as an input. Please pass in num_patches instead")
# The box center is biased to its position on the feature grid
- box_coordinates = self.normalize_grid_corner_coordinates(num_patches)
+ box_coordinates = self.normalize_grid_corner_coordinates(num_patches_height, num_patches_width)
box_coordinates = torch.clip(box_coordinates, 0.0, 1.0)
# Unnormalize xy
box_coord_bias = torch.log(box_coordinates + 1e-4) - torch.log1p(-box_coordinates + 1e-4)
# The box size is biased to the patch size
- box_size = torch.full_like(box_coord_bias, 1.0 / num_patches)
+ box_size = torch.full_like(box_coord_bias, 1.0)
+ box_size[..., 0] /= num_patches_width
+ box_size[..., 1] /= num_patches_height
box_size_bias = torch.log(box_size + 1e-4) - torch.log1p(-box_size + 1e-4)
# Compute box bias
@@ -1355,6 +1422,7 @@ def box_predictor(
self,
image_feats: torch.FloatTensor,
feature_map: torch.FloatTensor,
+ interpolate_pos_encoding: bool = False,
) -> torch.FloatTensor:
"""
Args:
@@ -1362,6 +1430,8 @@ def box_predictor(
Features extracted from the image, returned by the `image_text_embedder` method.
feature_map:
A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.
+ interpolate_pos_encoding:
+ Whether to interpolate the pre-trained position encodings.
Returns:
pred_boxes:
List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
@@ -1370,7 +1440,13 @@ def box_predictor(
pred_boxes = self.box_head(image_feats)
# Compute the location of each token on the grid and use it to compute a bias for the bbox prediction
- box_bias = self.box_bias.to(feature_map.device)
+ if interpolate_pos_encoding:
+ _, num_patches_height, num_patches_width, _ = feature_map.shape
+ box_bias = self.compute_box_bias(num_patches_height, num_patches_width)
+ else:
+ box_bias = self.box_bias
+
+ box_bias = box_bias.to(feature_map.device)
pred_boxes += box_bias
pred_boxes = self.sigmoid(pred_boxes)
return pred_boxes
@@ -1403,6 +1479,7 @@ def image_text_embedder(
attention_mask: torch.Tensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
) -> Tuple[torch.FloatTensor]:
# Encode text and image
outputs = self.owlv2(
@@ -1411,9 +1488,18 @@ def image_text_embedder(
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=True,
)
+ if interpolate_pos_encoding:
+ _, _, height, width = pixel_values.shape
+ num_patches_height = height // self.config.vision_config.patch_size
+ num_patches_width = width // self.config.vision_config.patch_size
+ else:
+ num_patches_height = self.num_patches_height
+ num_patches_width = self.num_patches_width
+
# Get image embeddings
last_hidden_state = outputs.vision_model_output[0]
image_embeds = self.owlv2.vision_model.post_layernorm(last_hidden_state)
@@ -1425,11 +1511,11 @@ def image_text_embedder(
image_embeds = image_embeds[:, 1:, :] * class_token_out
image_embeds = self.layer_norm(image_embeds)
- # Resize to [batch_size, num_patches, num_patches, hidden_size]
+ # Resize to [batch_size, num_patches_height, num_patches_width, hidden_size]
new_size = (
image_embeds.shape[0],
- self.sqrt_num_patches,
- self.sqrt_num_patches,
+ num_patches_height,
+ num_patches_width,
image_embeds.shape[-1],
)
image_embeds = image_embeds.reshape(new_size)
@@ -1443,9 +1529,20 @@ def image_embedder(
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
) -> Tuple[torch.FloatTensor]:
# Get Owlv2Model vision embeddings (same as CLIP)
- vision_outputs = self.owlv2.vision_model(pixel_values=pixel_values, return_dict=True)
+ vision_outputs = self.owlv2.vision_model(
+ pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, return_dict=True
+ )
+
+ if interpolate_pos_encoding:
+ _, _, height, width = pixel_values.shape
+ num_patches_height = height // self.config.vision_config.patch_size
+ num_patches_width = width // self.config.vision_config.patch_size
+ else:
+ num_patches_height = self.num_patches_height
+ num_patches_width = self.num_patches_width
# Apply post_layernorm to last_hidden_state, return non-projected output
last_hidden_state = vision_outputs[0]
@@ -1458,11 +1555,11 @@ def image_embedder(
image_embeds = image_embeds[:, 1:, :] * class_token_out
image_embeds = self.layer_norm(image_embeds)
- # Resize to [batch_size, num_patches, num_patches, hidden_size]
+ # Resize to [batch_size, num_patches_height, num_patches_width, hidden_size]
new_size = (
image_embeds.shape[0],
- self.sqrt_num_patches,
- self.sqrt_num_patches,
+ num_patches_height,
+ num_patches_width,
image_embeds.shape[-1],
)
image_embeds = image_embeds.reshape(new_size)
@@ -1471,10 +1568,13 @@ def image_embedder(
# Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.embed_image_query
def embed_image_query(
- self, query_image_features: torch.FloatTensor, query_feature_map: torch.FloatTensor
+ self,
+ query_image_features: torch.FloatTensor,
+ query_feature_map: torch.FloatTensor,
+ interpolate_pos_encoding: bool = False,
) -> torch.FloatTensor:
_, class_embeds = self.class_predictor(query_image_features)
- pred_boxes = self.box_predictor(query_image_features, query_feature_map)
+ pred_boxes = self.box_predictor(query_image_features, query_feature_map, interpolate_pos_encoding)
pred_boxes_as_corners = center_to_corners_format(pred_boxes)
# Loop over query images
@@ -1519,6 +1619,7 @@ def image_guided_detection(
query_pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Owlv2ImageGuidedObjectDetectionOutput:
r"""
@@ -1576,26 +1677,33 @@ def image_guided_detection(
return_dict = return_dict if return_dict is not None else self.config.return_dict
# Compute feature maps for the input and query images
- query_feature_map = self.image_embedder(pixel_values=query_pixel_values)[0]
+ query_feature_map = self.image_embedder(
+ pixel_values=query_pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
+ )[0]
feature_map, vision_outputs = self.image_embedder(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
)
- batch_size, num_patches, num_patches, hidden_dim = feature_map.shape
- image_feats = torch.reshape(feature_map, (batch_size, num_patches * num_patches, hidden_dim))
+ batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape
+ image_feats = torch.reshape(feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim))
- batch_size, num_patches, num_patches, hidden_dim = query_feature_map.shape
- query_image_feats = torch.reshape(query_feature_map, (batch_size, num_patches * num_patches, hidden_dim))
+ batch_size, num_patches_height, num_patches_width, hidden_dim = query_feature_map.shape
+ query_image_feats = torch.reshape(
+ query_feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim)
+ )
# Get top class embedding and best box index for each query image in batch
- query_embeds, best_box_indices, query_pred_boxes = self.embed_image_query(query_image_feats, query_feature_map)
+ query_embeds, best_box_indices, query_pred_boxes = self.embed_image_query(
+ query_image_feats, query_feature_map, interpolate_pos_encoding
+ )
# Predict object classes [batch_size, num_patches, num_queries+1]
(pred_logits, class_embeds) = self.class_predictor(image_feats=image_feats, query_embeds=query_embeds)
# Predict object boxes
- target_pred_boxes = self.box_predictor(image_feats, feature_map)
+ target_pred_boxes = self.box_predictor(image_feats, feature_map, interpolate_pos_encoding)
if not return_dict:
output = (
@@ -1630,6 +1738,7 @@ def forward(
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Owlv2ObjectDetectionOutput:
r"""
@@ -1683,14 +1792,15 @@ def forward(
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
)
# Text and vision model outputs
text_outputs = outputs.text_model_output
vision_outputs = outputs.vision_model_output
- batch_size, num_patches, num_patches, hidden_dim = feature_map.shape
- image_feats = torch.reshape(feature_map, (batch_size, num_patches * num_patches, hidden_dim))
+ batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape
+ image_feats = torch.reshape(feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim))
# Reshape from [batch_size * max_text_queries, hidden_dim] -> [batch_size, max_text_queries, hidden_dim]
max_text_queries = input_ids.shape[0] // batch_size
@@ -1707,7 +1817,7 @@ def forward(
objectness_logits = self.objectness_predictor(image_feats)
# Predict object boxes
- pred_boxes = self.box_predictor(image_feats, feature_map)
+ pred_boxes = self.box_predictor(image_feats, feature_map, interpolate_pos_encoding)
if not return_dict:
output = (
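
The OWLv2 changes above expose an `interpolate_pos_encoding` flag end to end (vision embeddings, box bias, detection heads), so the detector can run on inputs whose patch grid differs from the checkpoint's square training grid. A usage sketch, assuming the public `google/owlv2-base-patch16-ensemble` checkpoint and the flag added in this diff:

```python
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, Owlv2ForObjectDetection

processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=[["a photo of a cat"]], images=image, return_tensors="pt")

with torch.no_grad():
    # With interpolate_pos_encoding=True the position grid is bicubically resized to
    # match whatever patch grid the input produces.
    outputs = model(**inputs, interpolate_pos_encoding=True)

print(outputs.pred_boxes.shape)  # (batch_size, num_patches_height * num_patches_width, 4)
```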
diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py
index 7c3e124a207ff7..570d154a554c03 100644
--- a/src/transformers/models/owlvit/modeling_owlvit.py
+++ b/src/transformers/models/owlvit/modeling_owlvit.py
@@ -33,6 +33,7 @@
is_vision_available,
logging,
replace_return_docstrings,
+ torch_int,
)
from .configuration_owlvit import OwlViTConfig, OwlViTTextConfig, OwlViTVisionConfig
@@ -268,6 +269,7 @@ def to_tuple(self) -> Tuple[Any]:
class OwlViTVisionEmbeddings(nn.Module):
def __init__(self, config: OwlViTVisionConfig):
super().__init__()
+ self.patch_size = config.patch_size
self.config = config
self.embed_dim = config.hidden_size
self.class_embedding = nn.Parameter(torch.randn(config.hidden_size))
@@ -285,15 +287,55 @@ def __init__(self, config: OwlViTVisionConfig):
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
- def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
- batch_size = pixel_values.shape[0]
+ # Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings.interpolate_pos_encoding
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+ """
+        This method interpolates the pre-trained position encodings so that the model can be used on higher-resolution
+        images. It is also adapted to support torch.jit tracing.
+
+ Adapted from:
+ - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+ - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+ """
+
+ num_patches = embeddings.shape[1] - 1
+ position_embedding = self.position_embedding.weight.unsqueeze(0)
+ num_positions = position_embedding.shape[1] - 1
+
+ # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+ if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+ return self.position_embedding(self.position_ids)
+
+ class_pos_embed = position_embedding[:, :1]
+ patch_pos_embed = position_embedding[:, 1:]
+
+ dim = embeddings.shape[-1]
+
+ new_height = height // self.patch_size
+ new_width = width // self.patch_size
+
+ sqrt_num_positions = torch_int(num_positions**0.5)
+ patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed,
+ size=(new_height, new_width),
+ mode="bicubic",
+ align_corners=False,
+ )
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+ return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+ def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+ batch_size, _, height, width = pixel_values.shape
patch_embeds = self.patch_embedding(pixel_values) # shape = [batch_size, num_channels, height, width]
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
-
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
- embeddings = embeddings + self.position_embedding(self.position_ids)
-
+ if interpolate_pos_encoding:
+ embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+ else:
+ embeddings = embeddings + self.position_embedding(self.position_ids)
return embeddings
@@ -601,6 +643,8 @@ def _init_weights(self, module):
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
+        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@@ -626,6 +670,8 @@ def _init_weights(self, module):
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
+        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@@ -646,6 +692,8 @@ def _init_weights(self, module):
output_hidden_states (`bool`, *optional*):
Whether or not to return the last hidden state. See `text_model_last_hidden_state` and
`vision_model_last_hidden_state` under returned tensors for more detail.
+        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@@ -662,6 +710,8 @@ def _init_weights(self, module):
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
+        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+ Whether to interpolate the pre-trained position encodings.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@@ -899,6 +949,7 @@ def forward(
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: Optional[bool] = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
@@ -914,7 +965,7 @@ def forward(
expected_input_dtype = self.embeddings.patch_embedding.weight.dtype
pixel_values = pixel_values.to(expected_input_dtype)
- hidden_states = self.embeddings(pixel_values)
+ hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
hidden_states = self.pre_layernorm(hidden_states)
encoder_outputs = self.encoder(
@@ -960,6 +1011,7 @@ def forward(
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
@@ -986,6 +1038,7 @@ def forward(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@@ -1067,6 +1120,7 @@ def get_image_features(
pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> torch.FloatTensor:
r"""
@@ -1098,6 +1152,7 @@ def get_image_features(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@@ -1116,6 +1171,7 @@ def forward(
return_loss: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
return_base_image_embeds: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, OwlViTOutput]:
@@ -1148,6 +1204,7 @@ def forward(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=return_dict,
)
@@ -1275,20 +1332,22 @@ def __init__(self, config: OwlViTConfig):
self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
self.sigmoid = nn.Sigmoid()
-
- self.sqrt_num_patches = config.vision_config.image_size // config.vision_config.patch_size
- self.box_bias = self.compute_box_bias(self.sqrt_num_patches)
+ self.config = config
+ self.num_patches_height = self.config.vision_config.image_size // self.config.vision_config.patch_size
+ self.num_patches_width = self.config.vision_config.image_size // self.config.vision_config.patch_size
+ self.box_bias = self.compute_box_bias(self.num_patches_height, self.num_patches_width)
@staticmethod
- def normalize_grid_corner_coordinates(num_patches: int) -> torch.Tensor:
+ def normalize_grid_corner_coordinates(num_patches_height: int, num_patches_width: int) -> torch.Tensor:
# Create grid coordinates using torch
- x_coordinates = torch.arange(1, num_patches + 1, dtype=torch.float32)
- y_coordinates = torch.arange(1, num_patches + 1, dtype=torch.float32)
+ x_coordinates = torch.arange(1, num_patches_width + 1, dtype=torch.float32)
+ y_coordinates = torch.arange(1, num_patches_height + 1, dtype=torch.float32)
xx, yy = torch.meshgrid(x_coordinates, y_coordinates, indexing="xy")
- # Stack the coordinates and divide by num_patches
+ # Stack the coordinates and divide by their respective patch counts
box_coordinates = torch.stack((xx, yy), dim=-1)
- box_coordinates /= num_patches
+ box_coordinates[..., 0] /= num_patches_width
+ box_coordinates[..., 1] /= num_patches_height
# Flatten (h, w, 2) -> (h*w, 2)
box_coordinates = box_coordinates.view(-1, 2)
@@ -1296,18 +1355,22 @@ def normalize_grid_corner_coordinates(num_patches: int) -> torch.Tensor:
return box_coordinates
@lru_cache(maxsize=2)
- def compute_box_bias(self, num_patches: int, feature_map: Optional[torch.FloatTensor] = None) -> torch.Tensor:
+ def compute_box_bias(
+ self, num_patches_height: int, num_patches_width: int, feature_map: Optional[torch.FloatTensor] = None
+ ) -> torch.Tensor:
if feature_map is not None:
raise ValueError("feature_map has been deprecated as an input. Please pass in num_patches instead")
# The box center is biased to its position on the feature grid
- box_coordinates = self.normalize_grid_corner_coordinates(num_patches)
+ box_coordinates = self.normalize_grid_corner_coordinates(num_patches_height, num_patches_width)
box_coordinates = torch.clip(box_coordinates, 0.0, 1.0)
# Unnormalize xy
box_coord_bias = torch.log(box_coordinates + 1e-4) - torch.log1p(-box_coordinates + 1e-4)
# The box size is biased to the patch size
- box_size = torch.full_like(box_coord_bias, 1.0 / num_patches)
+ box_size = torch.full_like(box_coord_bias, 1.0)
+ box_size[..., 0] /= num_patches_width
+ box_size[..., 1] /= num_patches_height
box_size_bias = torch.log(box_size + 1e-4) - torch.log1p(-box_size + 1e-4)
# Compute box bias
@@ -1318,6 +1381,7 @@ def box_predictor(
self,
image_feats: torch.FloatTensor,
feature_map: torch.FloatTensor,
+ interpolate_pos_encoding: bool = False,
) -> torch.FloatTensor:
"""
Args:
@@ -1325,6 +1389,8 @@ def box_predictor(
Features extracted from the image, returned by the `image_text_embedder` method.
feature_map:
A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.
+ interpolate_pos_encoding:
+ Whether to interpolate the pre-trained position encodings.
Returns:
pred_boxes:
List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
@@ -1333,7 +1399,13 @@ def box_predictor(
pred_boxes = self.box_head(image_feats)
# Compute the location of each token on the grid and use it to compute a bias for the bbox prediction
- box_bias = self.box_bias.to(feature_map.device)
+ if interpolate_pos_encoding:
+ _, num_patches_height, num_patches_width, _ = feature_map.shape
+ box_bias = self.compute_box_bias(num_patches_height, num_patches_width)
+ else:
+ box_bias = self.box_bias
+
+ box_bias = box_bias.to(feature_map.device)
pred_boxes += box_bias
pred_boxes = self.sigmoid(pred_boxes)
return pred_boxes
@@ -1364,6 +1436,7 @@ def image_text_embedder(
attention_mask: torch.Tensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
) -> Tuple[torch.FloatTensor]:
# Encode text and image
outputs = self.owlvit(
@@ -1372,9 +1445,18 @@ def image_text_embedder(
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=True,
)
+ if interpolate_pos_encoding:
+ _, _, height, width = pixel_values.shape
+ num_patches_height = height // self.config.vision_config.patch_size
+ num_patches_width = width // self.config.vision_config.patch_size
+ else:
+ num_patches_height = self.num_patches_height
+ num_patches_width = self.num_patches_width
+
# Get image embeddings
last_hidden_state = outputs.vision_model_output[0]
image_embeds = self.owlvit.vision_model.post_layernorm(last_hidden_state)
@@ -1386,11 +1468,11 @@ def image_text_embedder(
image_embeds = image_embeds[:, 1:, :] * class_token_out
image_embeds = self.layer_norm(image_embeds)
- # Resize to [batch_size, num_patches, num_patches, hidden_size]
+ # Resize to [batch_size, num_patches_height, num_patches_width, hidden_size]
new_size = (
image_embeds.shape[0],
- self.sqrt_num_patches,
- self.sqrt_num_patches,
+ num_patches_height,
+ num_patches_width,
image_embeds.shape[-1],
)
image_embeds = image_embeds.reshape(new_size)
@@ -1403,9 +1485,20 @@ def image_embedder(
pixel_values: torch.FloatTensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
) -> Tuple[torch.FloatTensor]:
# Get OwlViTModel vision embeddings (same as CLIP)
- vision_outputs = self.owlvit.vision_model(pixel_values=pixel_values, return_dict=True)
+ vision_outputs = self.owlvit.vision_model(
+ pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, return_dict=True
+ )
+
+ if interpolate_pos_encoding:
+ _, _, height, width = pixel_values.shape
+ num_patches_height = height // self.config.vision_config.patch_size
+ num_patches_width = width // self.config.vision_config.patch_size
+ else:
+ num_patches_height = self.num_patches_height
+ num_patches_width = self.num_patches_width
# Apply post_layernorm to last_hidden_state, return non-projected output
last_hidden_state = vision_outputs[0]
@@ -1418,11 +1511,11 @@ def image_embedder(
image_embeds = image_embeds[:, 1:, :] * class_token_out
image_embeds = self.layer_norm(image_embeds)
- # Resize to [batch_size, num_patches, num_patches, hidden_size]
+ # Resize to [batch_size, num_patches_height, num_patches_width, hidden_size]
new_size = (
image_embeds.shape[0],
- self.sqrt_num_patches,
- self.sqrt_num_patches,
+ num_patches_height,
+ num_patches_width,
image_embeds.shape[-1],
)
image_embeds = image_embeds.reshape(new_size)
@@ -1430,10 +1523,13 @@ def image_embedder(
return (image_embeds, vision_outputs)
def embed_image_query(
- self, query_image_features: torch.FloatTensor, query_feature_map: torch.FloatTensor
+ self,
+ query_image_features: torch.FloatTensor,
+ query_feature_map: torch.FloatTensor,
+ interpolate_pos_encoding: bool = False,
) -> torch.FloatTensor:
_, class_embeds = self.class_predictor(query_image_features)
- pred_boxes = self.box_predictor(query_image_features, query_feature_map)
+ pred_boxes = self.box_predictor(query_image_features, query_feature_map, interpolate_pos_encoding)
pred_boxes_as_corners = center_to_corners_format(pred_boxes)
# Loop over query images
@@ -1478,6 +1574,7 @@ def image_guided_detection(
query_pixel_values: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> OwlViTImageGuidedObjectDetectionOutput:
r"""
@@ -1520,26 +1617,33 @@ def image_guided_detection(
return_dict = return_dict if return_dict is not None else self.config.return_dict
# Compute feature maps for the input and query images
- query_feature_map = self.image_embedder(pixel_values=query_pixel_values)[0]
+ query_feature_map = self.image_embedder(
+ pixel_values=query_pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
+ )[0]
feature_map, vision_outputs = self.image_embedder(
pixel_values=pixel_values,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
)
- batch_size, num_patches, num_patches, hidden_dim = feature_map.shape
- image_feats = torch.reshape(feature_map, (batch_size, num_patches * num_patches, hidden_dim))
+ batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape
+ image_feats = torch.reshape(feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim))
- batch_size, num_patches, num_patches, hidden_dim = query_feature_map.shape
- query_image_feats = torch.reshape(query_feature_map, (batch_size, num_patches * num_patches, hidden_dim))
+ batch_size, num_patches_height, num_patches_width, hidden_dim = query_feature_map.shape
+ query_image_feats = torch.reshape(
+ query_feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim)
+ )
# Get top class embedding and best box index for each query image in batch
- query_embeds, best_box_indices, query_pred_boxes = self.embed_image_query(query_image_feats, query_feature_map)
+ query_embeds, best_box_indices, query_pred_boxes = self.embed_image_query(
+ query_image_feats, query_feature_map, interpolate_pos_encoding
+ )
# Predict object classes [batch_size, num_patches, num_queries+1]
(pred_logits, class_embeds) = self.class_predictor(image_feats=image_feats, query_embeds=query_embeds)
# Predict object boxes
- target_pred_boxes = self.box_predictor(image_feats, feature_map)
+ target_pred_boxes = self.box_predictor(image_feats, feature_map, interpolate_pos_encoding)
if not return_dict:
output = (
@@ -1574,6 +1678,7 @@ def forward(
attention_mask: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
return_dict: Optional[bool] = None,
) -> OwlViTObjectDetectionOutput:
r"""
@@ -1625,14 +1730,15 @@ def forward(
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
)
# Text and vision model outputs
text_outputs = outputs.text_model_output
vision_outputs = outputs.vision_model_output
- batch_size, num_patches, num_patches, hidden_dim = feature_map.shape
- image_feats = torch.reshape(feature_map, (batch_size, num_patches * num_patches, hidden_dim))
+ batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape
+ image_feats = torch.reshape(feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim))
# Reshape from [batch_size * max_text_queries, hidden_dim] -> [batch_size, max_text_queries, hidden_dim]
max_text_queries = input_ids.shape[0] // batch_size
@@ -1646,7 +1752,7 @@ def forward(
(pred_logits, class_embeds) = self.class_predictor(image_feats, query_embeds, query_mask)
# Predict object boxes
- pred_boxes = self.box_predictor(image_feats, feature_map)
+ pred_boxes = self.box_predictor(image_feats, feature_map, interpolate_pos_encoding)
if not return_dict:
output = (
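The OwlViT changes above thread `interpolate_pos_encoding` from the public entry points (`forward`, `image_guided_detection`) down to `box_predictor` and `compute_box_bias`, so detection can run on inputs whose patch grid is height x width rather than the fixed square grid. A minimal sketch of how a caller might exercise the new flag; the checkpoint name and the 768x1024 input size are assumptions for illustration, not part of this diff:

    import torch
    from transformers import OwlViTProcessor, OwlViTForObjectDetection

    # Assumed checkpoint; any OWL-ViT checkpoint with patch_size=32 behaves the same way.
    processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
    model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

    # Non-square input: the patch grid becomes 24 x 32 instead of the pretrained 24 x 24.
    pixel_values = torch.randn(1, 3, 768, 1024)
    text_inputs = processor(text=[["a photo of a cat"]], return_tensors="pt")

    with torch.no_grad():
        outputs = model(
            input_ids=text_inputs.input_ids,
            attention_mask=text_inputs.attention_mask,
            pixel_values=pixel_values,
            interpolate_pos_encoding=True,  # flag added in this diff
        )

    # One predicted box per patch of the 24 x 32 grid.
    print(outputs.pred_boxes.shape)  # torch.Size([1, 768, 4])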
diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py
index b4a231561ba791..9d58fe7eb1b3f3 100644
--- a/src/transformers/models/paligemma/modeling_paligemma.py
+++ b/src/transformers/models/paligemma/modeling_paligemma.py
@@ -519,7 +519,7 @@ def forward(
# mask out pad-token-ids in labels for BC
if labels is not None and self.pad_token_id in labels:
logger.warning_once(
- "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. ",
+ "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. "
"You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.",
)
labels = torch.where(input_ids == self.pad_token_id, self.config.ignore_index, labels)
diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py
index 5783308f831541..e6fcfd37bccf6c 100644
--- a/src/transformers/models/paligemma/processing_paligemma.py
+++ b/src/transformers/models/paligemma/processing_paligemma.py
@@ -127,13 +127,13 @@ class PaliGemmaProcessor(ProcessorMixin):
r"""
Constructs a PaliGemma processor which wraps a PaliGemma image processor and a PaliGemma tokenizer into a single processor.
- [`PaliGemmaProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`LlamaTokenizerFast`]. See the
+ [`PaliGemmaProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`GemmaTokenizerFast`]. See the
[`~PaliGemmaProcessor.__call__`] and [`~PaliGemmaProcessor.decode`] for more information.
Args:
image_processor ([`SiglipImageProcessor`], *optional*):
The image processor is a required input.
- tokenizer ([`LlamaTokenizerFast`], *optional*):
+ tokenizer ([`GemmaTokenizerFast`], *optional*):
The tokenizer is a required input.
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
in a chat into a tokenizable string.
@@ -184,7 +184,7 @@ def __call__(
) -> BatchFeature:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
- and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+ and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
 the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
 SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.
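The docstring fix above only corrects which tokenizer class the processor actually wraps. For reference, a hedged usage sketch; the checkpoint name and prompt are illustrative assumptions:

    import requests
    from PIL import Image
    from transformers import PaliGemmaProcessor

    # Assumed checkpoint; PaliGemmaProcessor pairs SiglipImageProcessor with GemmaTokenizerFast.
    processor = PaliGemmaProcessor.from_pretrained("google/paligemma-3b-pt-224")

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    inputs = processor(text="caption en", images=image, return_tensors="pt")
    print(type(processor.tokenizer).__name__)  # GemmaTokenizerFast
    print(sorted(inputs.keys()))               # attention_mask, input_ids, pixel_values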
diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py
index 8d3c20b9ace717..27712741b7c28f 100644
--- a/src/transformers/models/persimmon/modeling_persimmon.py
+++ b/src/transformers/models/persimmon/modeling_persimmon.py
@@ -683,7 +683,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
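The same one-line change recurs in phi, pix2struct, pop2piano, qwen2, stablelm, switch_transformers, t5, udop, umt5 and whisper below: the membership test `0.0 in attention_mask` becomes an explicit elementwise comparison plus reduction. Both spellings give the same answer on an eager tensor; the explicit form simply avoids going through `Tensor.__contains__`, which is presumably friendlier to tracing and compilation. A quick plain-PyTorch sanity check of the equivalence:

    import torch

    padded_mask = torch.tensor([[1.0, 1.0, 0.0, 0.0],
                                [1.0, 1.0, 1.0, 1.0]])
    full_mask = torch.ones(2, 4)

    # Old spelling: Python membership test, dispatched to Tensor.__contains__.
    # New spelling: explicit comparison followed by .any().
    assert (0.0 in padded_mask) == bool((padded_mask == 0.0).any())  # both True: padding present
    assert (0.0 in full_mask) == bool((full_mask == 0.0).any())      # both False: no padding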
diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py
index 477896decd5318..5aa038d3ccfaa8 100644
--- a/src/transformers/models/phi/modeling_phi.py
+++ b/src/transformers/models/phi/modeling_phi.py
@@ -606,7 +606,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py
index 176dadd5b883e1..41115a058d2e0a 100644
--- a/src/transformers/models/pix2struct/modeling_pix2struct.py
+++ b/src/transformers/models/pix2struct/modeling_pix2struct.py
@@ -1587,7 +1587,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/pixtral/image_processing_pixtral_fast.py b/src/transformers/models/pixtral/image_processing_pixtral_fast.py
index 82fbf3b2c094a6..5fa23923fe7473 100644
--- a/src/transformers/models/pixtral/image_processing_pixtral_fast.py
+++ b/src/transformers/models/pixtral/image_processing_pixtral_fast.py
@@ -346,4 +346,7 @@ def preprocess(
batch_images.append(images)
batch_image_sizes.append(image_sizes)
- return BatchMixFeature(data={"pixel_values": batch_images, "image_sizes": batch_image_sizes}, tensor_type=None)
+ return BatchMixFeature(
+ data={"pixel_values": batch_images, "image_sizes": batch_image_sizes},
+ tensor_type=None,
+ )
diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py
index 6a64a27e007b3e..bb5366ef764fec 100644
--- a/src/transformers/models/pop2piano/modeling_pop2piano.py
+++ b/src/transformers/models/pop2piano/modeling_pop2piano.py
@@ -1000,7 +1000,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py
index 36fb1ddf1390ac..5dba7594e7e9a1 100644
--- a/src/transformers/models/qwen2/modeling_qwen2.py
+++ b/src/transformers/models/qwen2/modeling_qwen2.py
@@ -617,7 +617,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/sam/processing_sam.py b/src/transformers/models/sam/processing_sam.py
index 9e67be1e1e55c2..7ea1d573544e4d 100644
--- a/src/transformers/models/sam/processing_sam.py
+++ b/src/transformers/models/sam/processing_sam.py
@@ -17,13 +17,14 @@
"""
from copy import deepcopy
-from typing import Optional, Union
+from typing import List, Optional, Union
import numpy as np
-from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding
-from ...utils import TensorType, is_tf_available, is_torch_available
+from ...image_utils import ImageInput, VideoInput
+from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
+from ...tokenization_utils_base import AudioInput, BatchEncoding, PreTokenizedInput, TextInput
+from ...utils import is_tf_available, is_torch_available
if is_torch_available():
@@ -33,6 +34,23 @@
import tensorflow as tf
+class SamImagesKwargs(ImagesKwargs):
+ segmentation_maps: Optional[ImageInput]
+ input_points: Optional[List[List[float]]]
+ input_labels: Optional[List[List[int]]]
+ input_boxes: Optional[List[List[List[float]]]]
+ point_pad_value: Optional[int]
+
+
+class SamProcessorKwargs(ProcessingKwargs, total=False):
+ images_kwargs: SamImagesKwargs
+ _defaults = {
+ "images_kwargs": {
+ "point_pad_value": -10,
+ }
+ }
+
+
class SamProcessor(ProcessorMixin):
r"""
 Constructs a SAM processor which wraps a SAM image processor and a 2D points & bounding boxes processor into a
@@ -48,32 +66,50 @@ class SamProcessor(ProcessorMixin):
attributes = ["image_processor"]
image_processor_class = "SamImageProcessor"
+ # For backward compatibility. See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details.
+ optional_call_args = [
+ "segmentation_maps",
+ "input_points",
+ "input_labels",
+ "input_boxes",
+ ]
def __init__(self, image_processor):
super().__init__(image_processor)
- self.current_processor = self.image_processor
- self.point_pad_value = -10
self.target_size = self.image_processor.size["longest_edge"]
def __call__(
self,
- images=None,
- segmentation_maps=None,
- input_points=None,
- input_labels=None,
- input_boxes=None,
- return_tensors: Optional[Union[str, TensorType]] = None,
+ images: Optional[ImageInput] = None,
+ # The following is to capture `segmentation_maps`, `input_points`, `input_labels` and `input_boxes`
+ # arguments that may be passed as a positional argument.
+ # See transformers.processing_utils.ProcessorMixin.prepare_and_validate_optional_call_args for more details,
+ # or this conversation for more context:
+ # https://github.com/huggingface/transformers/pull/32544#discussion_r1720208116
+ # This behavior is only needed for backward compatibility and will be removed in future versions.
+ *args, # to be deprecated
+ text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
+ audio: Optional[AudioInput] = None,
+ video: Optional[VideoInput] = None,
**kwargs,
) -> BatchEncoding:
"""
This method uses [`SamImageProcessor.__call__`] method to prepare image(s) for the model. It also prepares 2D
points and bounding boxes for the model if they are provided.
"""
+ output_kwargs = self._merge_kwargs(
+ SamProcessorKwargs,
+ tokenizer_init_kwargs={},
+ **kwargs,
+ **self.prepare_and_validate_optional_call_args(*args),
+ )
+ input_points = output_kwargs["images_kwargs"].pop("input_points", None)
+ input_labels = output_kwargs["images_kwargs"].pop("input_labels", None)
+ input_boxes = output_kwargs["images_kwargs"].pop("input_boxes", None)
+
encoding_image_processor = self.image_processor(
images,
- segmentation_maps=segmentation_maps,
- return_tensors=return_tensors,
- **kwargs,
+ **output_kwargs["images_kwargs"],
)
 # pop arguments that are not used in the forward but used nevertheless
@@ -94,7 +130,8 @@ def __call__(
input_points=input_points,
input_labels=input_labels,
input_boxes=input_boxes,
- return_tensors=return_tensors,
+ return_tensors=output_kwargs["common_kwargs"].get("return_tensors"),
+ point_pad_value=output_kwargs["images_kwargs"].get("point_pad_value"),
)
return encoding_image_processor
@@ -107,6 +144,7 @@ def _normalize_and_convert(
input_labels=None,
input_boxes=None,
return_tensors="pt",
+ point_pad_value=-10,
):
if input_points is not None:
if len(original_sizes) != len(input_points):
@@ -121,7 +159,9 @@ def _normalize_and_convert(
# check that all arrays have the same shape
if not all(point.shape == input_points[0].shape for point in input_points):
if input_labels is not None:
- input_points, input_labels = self._pad_points_and_labels(input_points, input_labels)
+ input_points, input_labels = self._pad_points_and_labels(
+ input_points, input_labels, point_pad_value
+ )
input_points = np.array(input_points)
@@ -174,7 +214,7 @@ def _normalize_and_convert(
return encoding_image_processor
- def _pad_points_and_labels(self, input_points, input_labels):
+ def _pad_points_and_labels(self, input_points, input_labels, point_pad_value):
r"""
The method pads the 2D points and labels to the maximum number of points in the batch.
"""
@@ -183,9 +223,9 @@ def _pad_points_and_labels(self, input_points, input_labels):
for i, point in enumerate(input_points):
if point.shape[0] != expected_nb_points:
point = np.concatenate(
- [point, np.zeros((expected_nb_points - point.shape[0], 2)) + self.point_pad_value], axis=0
+ [point, np.zeros((expected_nb_points - point.shape[0], 2)) + point_pad_value], axis=0
)
- input_labels[i] = np.append(input_labels[i], [self.point_pad_value])
+ input_labels[i] = np.append(input_labels[i], [point_pad_value])
processed_input_points.append(point)
input_points = processed_input_points
return input_points, input_labels
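The SamProcessor rework above routes `segmentation_maps`, `input_points`, `input_labels` and `input_boxes` through `SamProcessorKwargs` (with `point_pad_value` now a kwarg default instead of an instance attribute), while positional calls keep working for backward compatibility. A hedged usage sketch; the checkpoint name and point coordinates are illustrative:

    import requests
    from PIL import Image
    from transformers import SamProcessor

    processor = SamProcessor.from_pretrained("facebook/sam-vit-base")

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

    # Preferred style after this change: keyword arguments, merged via SamProcessorKwargs.
    inputs = processor(
        images=image,
        input_points=[[[450, 600]]],  # one foreground point prompt for one image
        input_labels=[[1]],           # 1 = foreground
        return_tensors="pt",
    )
    print(inputs["input_points"].shape)  # (batch, point_batch, num_points, 2)

    # Passing the same values positionally still works, but goes through
    # prepare_and_validate_optional_call_args and is slated for deprecation.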
diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py
index 88dc437cdcb91d..7214a36e9a3921 100755
--- a/src/transformers/models/stablelm/modeling_stablelm.py
+++ b/src/transformers/models/stablelm/modeling_stablelm.py
@@ -938,7 +938,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py
index b150b04eea57b8..daeae8f9dcc2b3 100644
--- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py
+++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py
@@ -1136,7 +1136,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
index 9012c8db9feb0a..fe6cfbc5c3fdf2 100644
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -1205,7 +1205,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py b/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py
index dfb14dfccec4c6..47e8944583b4ca 100644
--- a/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py
+++ b/src/transformers/models/timm_wrapper/modeling_timm_wrapper.py
@@ -82,6 +82,9 @@ class TimmWrapperPreTrainedModel(PreTrainedModel):
config_class = TimmWrapperConfig
_no_split_modules = []
+ # used in Trainer to avoid passing `loss_kwargs` to model forward
+ accepts_loss_kwargs = False
+
def __init__(self, *args, **kwargs):
requires_backends(self, ["vision", "timm"])
super().__init__(*args, **kwargs)
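The new `accepts_loss_kwargs = False` attribute exists because `TimmWrapperPreTrainedModel.forward` takes `**kwargs` for reasons unrelated to loss computation, which would otherwise make the Trainer's signature-based check forward `loss_kwargs` to it (see the trainer.py hunk later in this diff). A minimal sketch of that dispatch logic, with a toy class standing in for the timm wrapper:

    import inspect

    import torch.nn as nn

    class ToyTimmLikeModel(nn.Module):
        # Mirrors the attribute added above: forward accepts **kwargs,
        # but they are not loss kwargs, so the model opts out explicitly.
        accepts_loss_kwargs = False

        def forward(self, pixel_values, **kwargs):
            return pixel_values

    def model_accepts_loss_kwargs(model) -> bool:
        # Same two-step check the Trainer now performs: explicit flag first,
        # then fall back to inspecting forward for a **kwargs parameter.
        if hasattr(model, "accepts_loss_kwargs"):
            return model.accepts_loss_kwargs
        params = inspect.signature(model.forward).parameters
        return any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values())

    print(model_accepts_loss_kwargs(ToyTimmLikeModel()))  # False, despite **kwargs in forward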
diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py
index 1928ac8a5c20c9..af21f714eff294 100644
--- a/src/transformers/models/udop/modeling_udop.py
+++ b/src/transformers/models/udop/modeling_udop.py
@@ -1538,7 +1538,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/umt5/modeling_umt5.py b/src/transformers/models/umt5/modeling_umt5.py
index 985dc5e4426dff..2b007cb2c77157 100644
--- a/src/transformers/models/umt5/modeling_umt5.py
+++ b/src/transformers/models/umt5/modeling_umt5.py
@@ -849,7 +849,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index fb01823a29c017..21bb2c869b7633 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -1375,7 +1375,7 @@ def _update_causal_mask(
output_attentions: bool,
):
if self.config._attn_implementation == "flash_attention_2":
- if attention_mask is not None and 0.0 in attention_mask:
+ if attention_mask is not None and (attention_mask == 0.0).any():
return attention_mask
return None
diff --git a/src/transformers/quantizers/auto.py b/src/transformers/quantizers/auto.py
index 47b54cd27bcebe..d5b51d038ab8bb 100755
--- a/src/transformers/quantizers/auto.py
+++ b/src/transformers/quantizers/auto.py
@@ -24,6 +24,7 @@
EetqConfig,
FbgemmFp8Config,
GPTQConfig,
+ HiggsConfig,
HqqConfig,
QuantizationConfigMixin,
QuantizationMethod,
@@ -40,6 +41,7 @@
from .quantizer_eetq import EetqHfQuantizer
from .quantizer_fbgemm_fp8 import FbgemmFp8HfQuantizer
from .quantizer_gptq import GptqHfQuantizer
+from .quantizer_higgs import HiggsHfQuantizer
from .quantizer_hqq import HqqHfQuantizer
from .quantizer_quanto import QuantoHfQuantizer
from .quantizer_torchao import TorchAoHfQuantizer
@@ -54,6 +56,7 @@
"aqlm": AqlmHfQuantizer,
"quanto": QuantoHfQuantizer,
"eetq": EetqHfQuantizer,
+ "higgs": HiggsHfQuantizer,
"hqq": HqqHfQuantizer,
"compressed-tensors": CompressedTensorsHfQuantizer,
"fbgemm_fp8": FbgemmFp8HfQuantizer,
@@ -73,6 +76,7 @@
"hqq": HqqConfig,
"compressed-tensors": CompressedTensorsConfig,
"fbgemm_fp8": FbgemmFp8Config,
+ "higgs": HiggsConfig,
"torchao": TorchAoConfig,
"bitnet": BitNetConfig,
"vptq": VptqConfig,
diff --git a/src/transformers/quantizers/quantizer_awq.py b/src/transformers/quantizers/quantizer_awq.py
index 7b81c93edf1fac..d7a756b23a07e7 100644
--- a/src/transformers/quantizers/quantizer_awq.py
+++ b/src/transformers/quantizers/quantizer_awq.py
@@ -52,6 +52,10 @@ def validate_environment(self, device_map, **kwargs):
if not is_accelerate_available():
raise ImportError("Loading an AWQ quantized model requires accelerate (`pip install accelerate`)")
+ if self.quantization_config.version == AWQLinearVersion.GEMM and not torch.cuda.is_available():
+ logger.warning_once("No CUDA found, replace GEMM with IPEX version to support non-cuda AWQ model.")
+ self.quantization_config.version = AWQLinearVersion.IPEX
+
if self.quantization_config.version == AWQLinearVersion.IPEX:
if version.parse(importlib.metadata.version("autoawq")) < version.parse("0.2.6"):
raise RuntimeError(
@@ -87,6 +91,7 @@ def validate_environment(self, device_map, **kwargs):
def update_torch_dtype(self, torch_dtype):
if torch_dtype is None:
torch_dtype = torch.float16
+ logger.info("Loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually.")
elif torch_dtype != torch.float16:
logger.warning("We suggest you to set `torch_dtype=torch.float16` for better efficiency with AWQ.")
return torch_dtype
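With the quantizer_awq.py changes above, loading a GEMM-packed AWQ checkpoint on a machine without CUDA now switches the config to the IPEX kernels instead of failing later, and the implicit float16 default gets logged. A hedged sketch; the checkpoint name is an assumption and `autoawq` must be installed:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Any GEMM-version AWQ checkpoint; this name is illustrative only.
    model_id = "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # torch_dtype is deliberately left unset: the quantizer defaults it to float16
    # and now emits an info log saying so. On a CPU-only machine the quantizer also
    # warns and flips quantization_config.version from GEMM to IPEX.
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

    print(model.dtype)  # torch.float16 unless torch_dtype is passed explicitly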
diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py
index 98d57e22524902..8657bda166254d 100644
--- a/src/transformers/quantizers/quantizer_bnb_4bit.py
+++ b/src/transformers/quantizers/quantizer_bnb_4bit.py
@@ -29,6 +29,7 @@
is_accelerate_available,
is_bitsandbytes_available,
is_torch_available,
+ is_torch_npu_available,
is_torch_xpu_available,
logging,
)
@@ -171,6 +172,9 @@ def create_quantized_param(
old_value = getattr(module, tensor_name)
+ # `torch.Tensor.to()` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
+ if isinstance(target_device, int) and is_torch_npu_available():
+ target_device = f"npu:{target_device}"
if tensor_name == "bias":
if param_value is None:
new_value = old_value.to(target_device)
@@ -259,11 +263,12 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
torch_dtype = torch.float16
return torch_dtype
- # Copied from transformers.quantizers.quantizer_bnb_8bit.Bnb8BitHfQuantizer.update_device_map
def update_device_map(self, device_map):
if device_map is None:
if torch.cuda.is_available():
device_map = {"": torch.cuda.current_device()}
+ elif is_torch_npu_available():
+ device_map = {"": f"npu:{torch.npu.current_device()}"}
elif is_torch_xpu_available():
device_map = {"": f"xpu:{torch.xpu.current_device()}"}
else:
diff --git a/src/transformers/quantizers/quantizer_compressed_tensors.py b/src/transformers/quantizers/quantizer_compressed_tensors.py
index 5064f2c019d74e..7d208087bbbfec 100644
--- a/src/transformers/quantizers/quantizer_compressed_tensors.py
+++ b/src/transformers/quantizers/quantizer_compressed_tensors.py
@@ -37,6 +37,13 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
def __init__(self, quantization_config: CompressedTensorsConfig, **kwargs):
super().__init__(quantization_config, **kwargs)
+
+ if not is_compressed_tensors_available():
+ raise ImportError(
+ "Using `compressed_tensors` quantized models requires the compressed-tensors library: "
+ "`pip install compressed-tensors`"
+ )
+
from compressed_tensors.compressors import ModelCompressor
self.compressor = ModelCompressor.from_compression_config(quantization_config)
diff --git a/src/transformers/quantizers/quantizer_gptq.py b/src/transformers/quantizers/quantizer_gptq.py
index 233a5279d3f90e..d47a2ba79cb60d 100644
--- a/src/transformers/quantizers/quantizer_gptq.py
+++ b/src/transformers/quantizers/quantizer_gptq.py
@@ -44,18 +44,25 @@ class GptqHfQuantizer(HfQuantizer):
def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
super().__init__(quantization_config, **kwargs)
+
+ if not is_optimum_available():
+ raise ImportError("Loading a GPTQ quantized model requires optimum (`pip install optimum`)")
from optimum.gptq import GPTQQuantizer
self.optimum_quantizer = GPTQQuantizer.from_dict(self.quantization_config.to_dict_optimum())
def validate_environment(self, *args, **kwargs):
+ if not is_optimum_available():
+ raise ImportError("Loading a GPTQ quantized model requires optimum (`pip install optimum`)")
+
+ if not is_auto_gptq_available():
+ raise ImportError(
+ "Loading a GPTQ quantized model requires the auto-gptq library (`pip install auto-gptq`)"
+ )
+
gptq_supports_cpu = version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2")
if not gptq_supports_cpu and not torch.cuda.is_available():
raise RuntimeError("GPU is required to quantize or run quantize model.")
- elif not (is_optimum_available() and is_auto_gptq_available()):
- raise ImportError(
- "Loading a GPTQ quantized model requires optimum (`pip install optimum`) and auto-gptq library (`pip install auto-gptq`)"
- )
elif version.parse(importlib.metadata.version("auto_gptq")) < version.parse("0.4.2"):
raise ImportError(
"You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq`"
@@ -64,6 +71,7 @@ def validate_environment(self, *args, **kwargs):
def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
if torch_dtype is None:
torch_dtype = torch.float16
+ logger.info("Loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually.")
elif torch_dtype != torch.float16:
logger.info("We suggest you to set `torch_dtype=torch.float16` for better efficiency with GPTQ.")
return torch_dtype
diff --git a/src/transformers/quantizers/quantizer_higgs.py b/src/transformers/quantizers/quantizer_higgs.py
new file mode 100644
index 00000000000000..f33e2f21e98fd8
--- /dev/null
+++ b/src/transformers/quantizers/quantizer_higgs.py
@@ -0,0 +1,232 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+from .base import HfQuantizer
+from .quantizers_utils import get_module_from_name
+
+
+if TYPE_CHECKING:
+ from ..modeling_utils import PreTrainedModel
+
+from ..utils import is_accelerate_available, is_flute_available, is_hadamard_available, is_torch_available, logging
+from ..utils.quantization_config import QuantizationConfigMixin
+
+
+if is_torch_available():
+ import torch
+
+logger = logging.get_logger(__name__)
+
+
+def get_num_sms_from_device(device):
+ target_device_cc = torch.cuda.get_device_capability(device=device)
+ if target_device_cc == (8, 6):
+ return 84
+ elif target_device_cc == (8, 0):
+ return 108
+ elif target_device_cc == (8, 9):
+ return 128
+ else:
+ raise NotImplementedError(
+ f"Device capability {target_device_cc} not supported for FLUTE (yet?) to verify your device capability check out https://developer.nvidia.com/cuda-gpus"
+ )
+
+
+class HiggsHfQuantizer(HfQuantizer):
+ """
+ Quantizer of the HIGGS method. Enables the loading of prequantized models and in-flight quantization of full-precision models.
+ """
+
+ requires_calibration = False
+ requires_parameters_quantization = True
+ required_packages = ["flute-kernel", "fast_hadamard_transform"]
+
+ def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
+ super().__init__(quantization_config, **kwargs)
+ self.quantization_config = quantization_config
+
+ def validate_environment(self, device_map, **kwargs):
+ if not torch.cuda.is_available():
+ raise NotImplementedError("HIGGS quantization is only supported on GPU. Please use a different quantizer.")
+
+ if not is_accelerate_available():
+ raise ImportError("Using `higgs` quantization requires Accelerate: `pip install accelerate`")
+
+ if not is_flute_available():
+ raise ImportError("Using `higgs` quantization requires FLUTE: `pip install flute-kernel>=0.3.0`")
+
+ if not is_hadamard_available():
+ raise ImportError(
+ "Using `higgs` quantization requires fast_hadamard_transform: `pip install fast_hadamard_transform`"
+ )
+
+ if device_map is None:
+ raise ValueError(
+ "You are attempting to load a HIGGS model without setting device_map."
+ " Please set device_map comprised of 'cuda' devices."
+ )
+ elif isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()):
+ raise ValueError(
+ "You are attempting to load a HIGGS model with a device_map that contains a CPU or disk device."
+ " This is not supported. Please remove the CPU or disk device from the device_map."
+ )
+
+ def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
+ if torch_dtype is None:
+ logger.info("`torch_dtype` is None. Setting `torch_dtype=torch.float16` for FLUTE compatibility.")
+ torch_dtype = torch.float16
+ elif torch_dtype != torch.float16 and torch_dtype != torch.bfloat16:
+ raise ValueError(
+ f"Invalid `torch_dtype` {torch_dtype}. HIGGS quantization only supports `torch_dtype=torch.float16` or `torch_dtype=torch.bfloat16`."
+ )
+
+ return torch_dtype
+
+ def create_quantized_param(
+ self,
+ model: "PreTrainedModel",
+ param_value: "torch.Tensor",
+ param_name: str,
+ target_device: "torch.device",
+ state_dict: Dict[str, Any],
+ unexpected_keys: Optional[List[str]] = None,
+ ):
+ """
+ Quantizes weights into weight and weight_scale
+ """
+ from ..integrations import quantize_with_higgs
+
+ flute_dict = quantize_with_higgs(
+ param_value.to(target_device),
+ self.quantization_config.bits,
+ self.quantization_config.p,
+ self.quantization_config.group_size,
+ self.quantization_config.hadamard_size,
+ )
+
+ del param_value
+
+ module, tensor_name = get_module_from_name(model, param_name)
+ for key, value in flute_dict.items():
+ if key in module._parameters:
+ module._parameters[key] = torch.nn.Parameter(value, requires_grad=False)
+ elif key in module._buffers:
+ module._buffers[key] = torch.nn.Buffer(value)
+ else:
+ raise ValueError(f"Unexpected key {key} in module {module}")
+
+ if unexpected_keys is not None and param_name in unexpected_keys:
+ unexpected_keys.remove(param_name)
+
+ module.num_sms_packed = torch.nn.Parameter(
+ torch.tensor(get_num_sms_from_device(target_device), device=target_device, dtype=torch.int32),
+ requires_grad=False,
+ )
+
+ def _process_model_before_weight_loading(
+ self,
+ model: "PreTrainedModel",
+ **kwargs,
+ ):
+ from ..integrations import replace_with_higgs_linear
+
+ replace_with_higgs_linear(
+ model,
+ quantization_config=self.quantization_config,
+ )
+ model.config.quantization_config = self.quantization_config
+
+ def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
+ import flute.utils
+
+ from ..integrations import HiggsLinear
+
+ flute_workspaces = {}
+ for name, module in model.named_modules():
+ if isinstance(module, HiggsLinear):
+ # Every HiggsLinear needs a "workspace": a buffer for the unpacking operation.
+ # This buffer needs to be on the same device as the weights, but can be reused across modules otherwise.
+ if module.weight.device not in flute_workspaces:
+ flute_workspaces[module.weight.device] = flute.utils.make_workspace_streamk(
+ device=module.weight.device
+ )
+ module.workspace = flute_workspaces[module.weight.device]
+
+ # FLUTE weights are packed in a way that is optimized for a specific number of SMs (GPU streaming multiprocessors).
+ # If the model is loaded on a different device than the one it was saved on, we need to repack the weights.
+ if module.num_sms_packed.item() != get_num_sms_from_device(module.weight.device):
+ new_device = module.weight.device
+ new_num_sms = get_num_sms_from_device(new_device)
+ module.weight.data = flute.utils.pack(
+ flute.utils.unpack(
+ weight=module.weight.data,
+ scales=module.scales.data,
+ workspace=module.workspace,
+ num_bits=module.num_bits,
+ group_size=module.group_size,
+ num_sms_packed=module.num_sms_packed.item(),
+ ).T.contiguous(),
+ module.num_bits,
+ module.group_size,
+ )
+ module.num_sms_packed = torch.nn.Parameter(
+ torch.tensor(new_num_sms, device=new_device, dtype=torch.int32),
+ requires_grad=False,
+ )
+
+ def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
+ from ..integrations import HiggsLinear
+
+ not_missing_keys = []
+ for name, module in model.named_modules():
+ if isinstance(module, HiggsLinear):
+ for missing in missing_keys:
+ if (
+ (name in missing or name in f"{prefix}.{missing}")
+ and not missing.endswith(".weight")
+ and not missing.endswith(".bias")
+ ):
+ not_missing_keys.append(missing)
+ return [k for k in missing_keys if k not in not_missing_keys]
+
+ @property
+ def is_trainable(self, model: Optional["PreTrainedModel"] = None):
+ return False
+
+ def is_serializable(self, safe_serialization=None):
+ return True
+
+ def check_quantized_param(
+ self,
+ model: "PreTrainedModel",
+ param_value: "torch.Tensor",
+ param_name: str,
+ state_dict: Dict[str, Any],
+ **kwargs,
+ ) -> bool:
+ from ..integrations import HiggsLinear
+
+ module, tensor_name = get_module_from_name(model, param_name)
+ if isinstance(module, HiggsLinear) and tensor_name == "weight" and param_value.dtype != torch.int16:
+ # Only quantize weights of HiggsLinear modules that are not already quantized
+ return True
+ else:
+ return False
+
+ def _dequantize(self, model):
+ from ..integrations import dequantize_higgs
+
+ model = dequantize_higgs(model)
+ return model
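The new HiggsHfQuantizer supports both loading prequantized HIGGS checkpoints and quantizing a full-precision model on the fly through the HiggsConfig added further down in quantization_config.py. A hedged end-to-end sketch; the model name is an assumption, and a CUDA GPU plus flute-kernel>=0.3.0 and fast_hadamard_transform are required:

    import torch

    from transformers import AutoModelForCausalLM, AutoTokenizer, HiggsConfig

    model_id = "meta-llama/Llama-3.2-1B"  # assumed base checkpoint for illustration

    quantization_config = HiggsConfig(
        bits=4,          # 2, 3 or 4 bits per weight
        p=2,             # 2-dimensional quantization grid (recommended)
        group_size=256,  # must divide hadamard_size (512 by default)
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        device_map="auto",           # required: validate_environment rejects device_map=None
        torch_dtype=torch.bfloat16,  # only float16 and bfloat16 are accepted
    )

    prompt = "HIGGS packs weights for FLUTE kernels, so"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        print(tokenizer.decode(model.generate(**inputs, max_new_tokens=16)[0]))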
diff --git a/src/transformers/quantizers/quantizer_torchao.py b/src/transformers/quantizers/quantizer_torchao.py
index 10d2b184ef146b..bcc9c57dfa006d 100644
--- a/src/transformers/quantizers/quantizer_torchao.py
+++ b/src/transformers/quantizers/quantizer_torchao.py
@@ -114,6 +114,9 @@ def update_torch_dtype(self, torch_dtype):
torch_dtype = torch.bfloat16
if self.quantization_config.quant_type == "int8_dynamic_activation_int8_weight":
if torch_dtype is None:
+ logger.info(
+ "Setting torch_dtype to torch.float32 for int8_dynamic_activation_int8_weight quantization as no torch_dtype was specified in from_pretrained"
+ )
# we need to set the torch_dtype, otherwise we have dtype mismatch when performing the quantized linear op
torch_dtype = torch.float32
return torch_dtype
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 2f523ed36d983f..00a7ee59664df2 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -79,12 +79,14 @@
is_fbgemm_gpu_available,
is_flash_attn_2_available,
is_flax_available,
+ is_flute_available,
is_fsdp_available,
is_ftfy_available,
is_g2p_en_available,
is_galore_torch_available,
is_gguf_available,
is_grokadamw_available,
+ is_hadamard_available,
is_ipex_available,
is_jieba_available,
is_jinja_available,
@@ -1239,6 +1241,15 @@ def require_fbgemm_gpu(test_case):
return unittest.skipUnless(is_fbgemm_gpu_available(), "test requires fbgemm-gpu")(test_case)
+def require_flute_hadamard(test_case):
+ """
+ Decorator marking a test that requires flute-kernel and fast_hadamard_transform (used by HIGGS quantization)
+ """
+ return unittest.skipUnless(
+ is_flute_available() and is_hadamard_available(), "test requires flute and fast_hadamard_transform"
+ )(test_case)
+
+
def require_phonemizer(test_case):
"""
Decorator marking a test that requires phonemizer
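The `require_flute_hadamard` decorator follows the existing `require_*` pattern in this file. A hedged sketch of how a HIGGS test might use it; the test class and assertions are illustrative, not part of this diff:

    import unittest

    from transformers import HiggsConfig
    from transformers.testing_utils import require_flute_hadamard

    @require_flute_hadamard
    class HiggsConfigTest(unittest.TestCase):
        def test_rejects_indivisible_hadamard_size(self):
            # post_init enforces that hadamard_size is divisible by group_size
            with self.assertRaises(ValueError):
                HiggsConfig(hadamard_size=300, group_size=256)

        def test_defaults(self):
            config = HiggsConfig()
            self.assertEqual(config.bits, 4)
            self.assertEqual(config.quant_method, "higgs")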
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index c878d2b345cc31..655d5b260c1f36 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -622,7 +622,15 @@ def __init__(
else unwrapped_model.get_base_model().forward
)
forward_params = inspect.signature(model_forward).parameters
- self.model_accepts_loss_kwargs = any(k.kind == inspect.Parameter.VAR_KEYWORD for k in forward_params.values())
+
+ # Check if the model has explicit setup for loss kwargs,
+ # if not, check if `**kwargs` are in model.forward
+ if hasattr(model, "accepts_loss_kwargs"):
+ self.model_accepts_loss_kwargs = model.accepts_loss_kwargs
+ else:
+ self.model_accepts_loss_kwargs = any(
+ k.kind == inspect.Parameter.VAR_KEYWORD for k in forward_params.values()
+ )
self.neftune_noise_alpha = args.neftune_noise_alpha
@@ -3698,10 +3706,12 @@ def training_step(
with amp.scale_loss(loss, self.optimizer) as scaled_loss:
scaled_loss.backward()
else:
- self.accelerator.backward(loss, **kwargs)
# Finally we need to normalize the loss for reporting
if num_items_in_batch is None:
- return loss.detach() / self.args.gradient_accumulation_steps
+ loss = loss / self.args.gradient_accumulation_steps
+
+ self.accelerator.backward(loss, **kwargs)
+
return loss.detach()
def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
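The training_step reordering above matters for gradient accumulation: dividing the loss by `gradient_accumulation_steps` before `accelerator.backward` makes the accumulated gradients an average over micro-batches rather than a sum, while the detached value returned for logging is unchanged. A toy illustration of the arithmetic in plain PyTorch, outside the Trainer:

    import torch

    grad_accum_steps = 4
    w = torch.tensor(2.0, requires_grad=True)
    micro_batch_losses = [w * x for x in torch.tensor([1.0, 2.0, 3.0, 4.0])]

    # New order: normalize first, then backward, so gradients accumulate as an average.
    for loss in micro_batch_losses:
        (loss / grad_accum_steps).backward()

    # d/dw of mean(w * x) over the four micro-batches is mean(x) = 2.5
    print(w.grad)  # tensor(2.5000)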
diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py
index 07d0571e44c9b9..5cd89b3701cff6 100644
--- a/src/transformers/trainer_seq2seq.py
+++ b/src/transformers/trainer_seq2seq.py
@@ -64,6 +64,7 @@ def __init__(
Union["PreTrainedTokenizerBase", "BaseImageProcessor", "FeatureExtractionMixin", "ProcessorMixin"]
] = None,
model_init: Optional[Callable[[], "PreTrainedModel"]] = None,
+ compute_loss_func: Optional[Callable] = None,
compute_metrics: Optional[Callable[["EvalPrediction"], Dict]] = None,
callbacks: Optional[List["TrainerCallback"]] = None,
optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
@@ -77,6 +78,7 @@ def __init__(
eval_dataset=eval_dataset,
processing_class=processing_class,
model_init=model_init,
+ compute_loss_func=compute_loss_func,
compute_metrics=compute_metrics,
callbacks=callbacks,
optimizers=optimizers,
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 6950e8e66d3ac1..a1b5b511a95e35 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -2164,7 +2164,7 @@ def _setup_devices(self) -> "torch.device":
if not is_accelerate_available():
raise ImportError(
f"Using the `Trainer` with `PyTorch` requires `accelerate>={ACCELERATE_MIN_VERSION}`: "
- "Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`"
+ f"Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`"
)
# We delay the init of `PartialState` to the end for clarity
accelerator_state_kwargs = {"enabled": True, "use_configured_state": False}
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index 2edfcdcd101c78..74b6d39fda52bb 100755
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -140,12 +140,14 @@
is_flash_attn_greater_or_equal,
is_flash_attn_greater_or_equal_2_10,
is_flax_available,
+ is_flute_available,
is_fsdp_available,
is_ftfy_available,
is_g2p_en_available,
is_galore_torch_available,
is_gguf_available,
is_grokadamw_available,
+ is_hadamard_available,
is_hqq_available,
is_in_notebook,
is_ipex_available,
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index e3463461ea07e5..922d67264bb142 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -3635,6 +3635,34 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
+class Dinov2WithRegistersBackbone(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class Dinov2WithRegistersForImageClassification(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class Dinov2WithRegistersModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
+class Dinov2WithRegistersPreTrainedModel(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+
class DistilBertForMaskedLM(metaclass=DummyObject):
_backends = ["torch"]
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index cfc8b88fd81ed6..f880535dd6fedb 100755
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -128,6 +128,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
_faiss_available = False
_ftfy_available = _is_package_available("ftfy")
_g2p_en_available = _is_package_available("g2p_en")
+_hadamard_available = _is_package_available("fast_hadamard_transform")
_ipex_available, _ipex_version = _is_package_available("intel_extension_for_pytorch", return_version=True)
_jieba_available = _is_package_available("jieba")
_jinja_available = _is_package_available("jinja2")
@@ -332,6 +333,10 @@ def is_torch_deterministic():
return True
+def is_hadamard_available():
+ return _hadamard_available
+
+
def is_hqq_available(min_version: str = HQQ_MIN_VERSION):
return _hqq_available and version.parse(_hqq_version) >= version.parse(min_version)
@@ -615,6 +620,13 @@ def is_flax_available():
return _flax_available
+def is_flute_available():
+ try:
+ # compare parsed versions rather than strings so that e.g. "0.10.0" is treated as >= "0.3.0"
+ return importlib.util.find_spec("flute") is not None and version.parse(importlib.metadata.version("flute-kernel")) >= version.parse("0.3.0")
+ except importlib.metadata.PackageNotFoundError:
+ return False
+
+
def is_ftfy_available():
return _ftfy_available
diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py
index 44e47e4f6e65c2..3160c3481da1d7 100755
--- a/src/transformers/utils/quantization_config.py
+++ b/src/transformers/utils/quantization_config.py
@@ -42,6 +42,7 @@ class QuantizationMethod(str, Enum):
VPTQ = "vptq"
QUANTO = "quanto"
EETQ = "eetq"
+ HIGGS = "higgs"
HQQ = "hqq"
COMPRESSED_TENSORS = "compressed-tensors"
FBGEMM_FP8 = "fbgemm_fp8"
@@ -1340,6 +1341,58 @@ def get_loading_attributes(self):
return loading_attibutes_dict
+@dataclass
+class HiggsConfig(QuantizationConfigMixin):
+ """
+ HiggsConfig is a configuration class for quantization using the HIGGS method.
+
+ Args:
+ bits (int, *optional*, defaults to 4):
+ Number of bits to use for quantization. Can be 2, 3 or 4. Default is 4.
+ p (int, *optional*, defaults to 2):
+ Quantization grid dimension. 1 and 2 are supported. 2 is always better in practice. Default is 2.
+ modules_to_not_convert (`list`, *optional*, default to ["lm_head"]):
+ List of linear layers that should not be quantized.
+ hadamard_size (int, *optional*, defaults to 512):
+ Hadamard size for the HIGGS method. Default is 512. Input dimension of matrices is padded to this value. Decreasing this below 512 will reduce the quality of the quantization.
+ group_size (int, *optional*, defaults to 256):
+ Group size for the HIGGS method. Can be 64, 128 or 256. Decreasing it barely affects the performance. Default is 256. Must be a divisor of hadamard_size.
+ """
+
+ def __init__(
+ self,
+ bits: int = 4,
+ p: int = 2,
+ modules_to_not_convert: Optional[List[str]] = None,
+ hadamard_size: int = 512,
+ group_size: int = 256,
+ **kwargs,
+ ):
+ if modules_to_not_convert is None:
+ modules_to_not_convert = ["lm_head"]
+ self.quant_method = QuantizationMethod.HIGGS
+ self.bits = bits
+ self.p = p
+ self.modules_to_not_convert = modules_to_not_convert
+ self.hadamard_size = hadamard_size
+ self.group_size = group_size
+
+ self.post_init()
+
+ def post_init(self):
+ r"""
+ Safety checker that arguments are correct - also replaces some NoneType arguments with their default values.
+ """
+ if self.bits not in [2, 3, 4]:
+ raise ValueError("bits must be 2, 3, or 4")
+ if self.p not in [1, 2]:
+ raise ValueError("p must be 1 or 2. 2 is always better in practice")
+ if self.group_size not in [64, 128, 256]:
+ raise ValueError("group_size must be 64, 128, or 256")
+ if self.hadamard_size % self.group_size != 0:
+ raise ValueError("hadamard_size must be divisible by group_size")
+
+
@dataclass
class TorchAoConfig(QuantizationConfigMixin):
"""This is a config class for torchao quantization/sparsity techniques.
diff --git a/tests/models/dinov2_with_registers/__init__.py b/tests/models/dinov2_with_registers/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py
new file mode 100644
index 00000000000000..6aa62138e6202c
--- /dev/null
+++ b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py
@@ -0,0 +1,369 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Dinov2WithRegisters model."""
+
+import unittest
+
+from transformers import Dinov2WithRegistersConfig
+from transformers.testing_utils import (
+ require_torch,
+ require_vision,
+ slow,
+ torch_device,
+)
+from transformers.utils import cached_property, is_torch_available, is_vision_available
+
+from ...test_backbone_common import BackboneTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+ import torch
+ from torch import nn
+
+ from transformers import (
+ Dinov2WithRegistersBackbone,
+ Dinov2WithRegistersForImageClassification,
+ Dinov2WithRegistersModel,
+ )
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from transformers import AutoImageProcessor
+
+
+class Dinov2WithRegistersModelTester:
+ def __init__(
+ self,
+ parent,
+ batch_size=13,
+ image_size=30,
+ patch_size=2,
+ num_channels=3,
+ is_training=True,
+ use_labels=True,
+ hidden_size=32,
+ num_hidden_layers=2,
+ num_attention_heads=4,
+ intermediate_size=37,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ type_sequence_label_size=10,
+ initializer_range=0.02,
+ num_register_tokens=2,
+ mask_ratio=0.5,
+ scope=None,
+ ):
+ self.parent = parent
+ self.batch_size = batch_size
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.is_training = is_training
+ self.use_labels = use_labels
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.type_sequence_label_size = type_sequence_label_size
+ self.initializer_range = initializer_range
+ self.num_register_tokens = num_register_tokens
+ self.scope = scope
+
+ # in DINOv2 with Registers, the seq length equals the number of patches + 1 + num_register_tokens (we add 1 for the [CLS] token)
+ num_patches = (image_size // patch_size) ** 2
+ self.seq_length = num_patches + 1 + self.num_register_tokens
+ self.mask_ratio = mask_ratio
+ self.num_masks = int(mask_ratio * self.seq_length)
+ self.mask_length = num_patches
+
+ def prepare_config_and_inputs(self):
+ pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+
+ labels = None
+ if self.use_labels:
+ labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+
+ config = self.get_config()
+
+ return config, pixel_values, labels
+
+ def get_config(self):
+ return Dinov2WithRegistersConfig(
+ image_size=self.image_size,
+ patch_size=self.patch_size,
+ num_channels=self.num_channels,
+ hidden_size=self.hidden_size,
+ num_hidden_layers=self.num_hidden_layers,
+ num_attention_heads=self.num_attention_heads,
+ intermediate_size=self.intermediate_size,
+ hidden_act=self.hidden_act,
+ hidden_dropout_prob=self.hidden_dropout_prob,
+ attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+ is_decoder=False,
+ initializer_range=self.initializer_range,
+ num_register_tokens=self.num_register_tokens,
+ )
+
+ def create_and_check_model(self, config, pixel_values, labels):
+ model = Dinov2WithRegistersModel(config=config)
+ model.to(torch_device)
+ model.eval()
+ result = model(pixel_values)
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+ def create_and_check_backbone(self, config, pixel_values, labels):
+ model = Dinov2WithRegistersBackbone(config=config)
+ model.to(torch_device)
+ model.eval()
+ result = model(pixel_values)
+
+ # verify hidden states
+ self.parent.assertEqual(len(result.feature_maps), len(config.out_features))
+ expected_size = self.image_size // config.patch_size
+ self.parent.assertListEqual(
+ list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], expected_size, expected_size]
+ )
+
+ # verify channels
+ self.parent.assertEqual(len(model.channels), len(config.out_features))
+
+ # verify backbone works with out_features=None
+ config.out_features = None
+ model = Dinov2WithRegistersBackbone(config=config)
+ model.to(torch_device)
+ model.eval()
+ result = model(pixel_values)
+
+ # verify feature maps
+ self.parent.assertEqual(len(result.feature_maps), 1)
+ self.parent.assertListEqual(
+ list(result.feature_maps[0].shape), [self.batch_size, model.channels[0], expected_size, expected_size]
+ )
+
+ # verify channels
+ self.parent.assertEqual(len(model.channels), 1)
+
+ # verify backbone works with apply_layernorm=False and reshape_hidden_states=False
+ config.apply_layernorm = False
+ config.reshape_hidden_states = False
+
+ model = Dinov2WithRegistersBackbone(config=config)
+ model.to(torch_device)
+ model.eval()
+ result = model(pixel_values)
+
+ # verify feature maps
+ self.parent.assertEqual(len(result.feature_maps), 1)
+ self.parent.assertListEqual(
+ list(result.feature_maps[0].shape), [self.batch_size, self.seq_length, self.hidden_size]
+ )
+
+ def create_and_check_for_image_classification(self, config, pixel_values, labels):
+ config.num_labels = self.type_sequence_label_size
+ model = Dinov2WithRegistersForImageClassification(config)
+ model.to(torch_device)
+ model.eval()
+ result = model(pixel_values, labels=labels)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
+
+ # test greyscale images
+ config.num_channels = 1
+ model = Dinov2WithRegistersForImageClassification(config)
+ model.to(torch_device)
+ model.eval()
+
+ pixel_values = floats_tensor([self.batch_size, 1, self.image_size, self.image_size])
+ result = model(pixel_values)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))
+
+ def prepare_config_and_inputs_for_common(self):
+ config_and_inputs = self.prepare_config_and_inputs()
+ (
+ config,
+ pixel_values,
+ labels,
+ ) = config_and_inputs
+ inputs_dict = {"pixel_values": pixel_values}
+ return config, inputs_dict
+
+
+@require_torch
+class Dinov2WithRegistersModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+ """
+ Here we also overwrite some of the tests of test_modeling_common.py, as Dinov2WithRegisters does not use input_ids, inputs_embeds,
+ attention_mask and seq_length.
+ """
+
+ all_model_classes = (
+ (
+ Dinov2WithRegistersModel,
+ Dinov2WithRegistersForImageClassification,
+ Dinov2WithRegistersBackbone,
+ )
+ if is_torch_available()
+ else ()
+ )
+ pipeline_model_mapping = (
+ {
+ "image-feature-extraction": Dinov2WithRegistersModel,
+ "image-classification": Dinov2WithRegistersForImageClassification,
+ }
+ if is_torch_available()
+ else {}
+ )
+ fx_compatible = False
+
+ test_pruning = False
+ test_resize_embeddings = False
+ test_head_masking = False
+
+ def setUp(self):
+ self.model_tester = Dinov2WithRegistersModelTester(self)
+ self.config_tester = ConfigTester(
+ self, config_class=Dinov2WithRegistersConfig, has_text_modality=False, hidden_size=37
+ )
+
+ def test_initialization(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+ configs_no_init = _config_zero_init(config)
+ for model_class in self.all_model_classes:
+ model = model_class(config=configs_no_init)
+ for name, param in model.named_parameters():
+ if param.requires_grad and "register_tokens" not in name:
+ self.assertIn(
+ ((param.data.mean() * 1e9).round() / 1e9).item(),
+ [0.0, 1.0],
+ msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+ )
+
+ def test_config(self):
+ self.config_tester.run_common_tests()
+
+ @unittest.skip(reason="Dinov2WithRegisters does not use inputs_embeds")
+ def test_inputs_embeds(self):
+ pass
+
+ @unittest.skip(
+ reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+ )
+ def test_training_gradient_checkpointing(self):
+ pass
+
+ @unittest.skip(
+        reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+ )
+ def test_training_gradient_checkpointing_use_reentrant(self):
+ pass
+
+ @unittest.skip(
+        reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+ )
+ def test_training_gradient_checkpointing_use_reentrant_false(self):
+ pass
+
+ def test_model_get_set_embeddings(self):
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+ for model_class in self.all_model_classes:
+ model = model_class(config)
+            self.assertIsInstance(model.get_input_embeddings(), nn.Module)
+ x = model.get_output_embeddings()
+ self.assertTrue(x is None or isinstance(x, nn.Linear))
+
+ def test_model(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_model(*config_and_inputs)
+
+ def test_backbone(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_backbone(*config_and_inputs)
+
+ def test_for_image_classification(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_for_image_classification(*config_and_inputs)
+
+ @unittest.skip(reason="Dinov2WithRegisters does not support feedforward chunking yet")
+ def test_feed_forward_chunking(self):
+ pass
+
+ @slow
+ def test_model_from_pretrained(self):
+ model_name = "facebook/dinov2-with-registers-base"
+ model = Dinov2WithRegistersModel.from_pretrained(model_name)
+ self.assertIsNotNone(model)
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+ image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+ return image
+
+
+@require_torch
+@require_vision
+class Dinov2WithRegistersModelIntegrationTest(unittest.TestCase):
+ @cached_property
+ def default_image_processor(self):
+ return (
+ AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
+ if is_vision_available()
+ else None
+ )
+
+ @slow
+ def test_inference_no_head(self):
+ model = Dinov2WithRegistersModel.from_pretrained("facebook/dinov2-with-registers-base").to(torch_device)
+
+ image_processor = self.default_image_processor
+ image = prepare_img()
+ inputs = image_processor(image, return_tensors="pt").to(torch_device)
+
+ # forward pass
+ with torch.no_grad():
+ outputs = model(**inputs)
+
+ # verify the last hidden states
+ # in DINOv2 with Registers, the seq length equals the number of patches + 1 + num_register_tokens (we add 1 for the [CLS] token)
+ num_patches = (image_processor.crop_size["height"] // model.config.patch_size) ** 2
+ expected_seq_length = num_patches + 1 + model.config.num_register_tokens
+ expected_shape = torch.Size((1, expected_seq_length, model.config.hidden_size))
+ self.assertEqual(outputs.last_hidden_state.shape, expected_shape)
+
+ expected_slice = torch.tensor(
+ [[-0.4636, -1.4582, -0.0274], [-1.4738, -0.8858, 0.3002], [0.0714, -0.2407, -1.5940]],
+ device=torch_device,
+ )
+ self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4))
+
+
+@require_torch
+class Dinov2WithRegistersBackboneTest(unittest.TestCase, BackboneTesterMixin):
+ all_model_classes = (Dinov2WithRegistersBackbone,) if is_torch_available() else ()
+ config_class = Dinov2WithRegistersConfig
+
+ has_attentions = False
+
+ def setUp(self):
+ self.model_tester = Dinov2WithRegistersModelTester(self)
diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py
index 52d2f1539a4867..36c5d294844939 100644
--- a/tests/models/idefics3/test_processor_idefics3.py
+++ b/tests/models/idefics3/test_processor_idefics3.py
@@ -505,3 +505,74 @@ def test_unstructured_kwargs(self):
self.assertEqual(inputs["pixel_values"].shape[3], 32)
self.assertEqual(len(inputs["input_ids"][0]), 120)
+
+ @require_torch
+ @require_vision
+ def test_text_only_inference(self):
+ """Test that the processor works correctly with text-only input."""
+ processor = self.get_processor()
+
+ text = "This is a simple text without images."
+ inputs = processor(text=text)
+
+ tokenized_sentence = processor.tokenizer(text, add_special_tokens=False)
+ expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"]]
+
+ self.assertEqual(inputs["input_ids"], expected_input_ids)
+ self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
+ self.assertTrue("pixel_values" not in inputs)
+ self.assertTrue("pixel_attention_mask" not in inputs)
+
+ # Test batch of texts without image tokens
+ texts = ["First text.", "Second piece of text."]
+ batch_inputs = processor(text=texts, padding=True)
+
+ tokenized_1 = processor.tokenizer(texts[0], add_special_tokens=False)
+ tokenized_2 = processor.tokenizer(texts[1], add_special_tokens=False)
+
+ expected_1 = [self.bos_token_id] + tokenized_1["input_ids"]
+ expected_2 = [self.bos_token_id] + tokenized_2["input_ids"]
+
+ # Pad the shorter sequence
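+        # (padding is applied on the left, so padding ids and attention-mask zeros come first)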
+ pad_len = len(expected_2) - len(expected_1)
+ if pad_len > 0:
+ padded_expected_1 = [self.padding_token_id] * pad_len + expected_1
+ expected_attention_1 = [0] * pad_len + [1] * len(expected_1)
+ self.assertEqual(batch_inputs["input_ids"], [padded_expected_1, expected_2])
+ self.assertEqual(batch_inputs["attention_mask"], [expected_attention_1, [1] * len(expected_2)])
+ else:
+ pad_len = -pad_len
+ padded_expected_2 = [self.padding_token_id] * pad_len + expected_2
+ expected_attention_2 = [0] * pad_len + [1] * len(expected_2)
+ self.assertEqual(batch_inputs["input_ids"], [expected_1, padded_expected_2])
+ self.assertEqual(batch_inputs["attention_mask"], [[1] * len(expected_1), expected_attention_2])
+
+ @require_torch
+ @require_vision
+ def test_missing_images_error(self):
+ """Test that appropriate error is raised when images are referenced but not provided."""
+ processor = self.get_processor()
+
+ # Test single text with image token but no image
+        text = "Let me show you this image: <image> What do you think?"
+ with self.assertRaises(ValueError) as context:
+ processor(text=text)
+ self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
+
+ # Test batch with image tokens but no images
+ texts = [
+            "First text with <image> token.",
+            "Second text with <image> token.",
+ ]
+ with self.assertRaises(ValueError) as context:
+ processor(text=texts)
+ self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
+
+        # Test with images=None passed explicitly
+ with self.assertRaises(ValueError) as context:
+ processor(text=text, images=None)
+ self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
+
+ with self.assertRaises(ValueError) as context:
+ processor(text=texts, images=None)
+ self.assertTrue("tokens in the text but no images were passed" in str(context.exception))
diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py
index df763aed48c749..b35f58e99a0402 100644
--- a/tests/models/owlv2/test_modeling_owlv2.py
+++ b/tests/models/owlv2/test_modeling_owlv2.py
@@ -828,6 +828,144 @@ def test_inference(self):
expected_logits = torch.tensor([[-6.2229, -8.2601]], device=torch_device)
self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
+ @slow
+ def test_inference_interpolate_pos_encoding(self):
+ model_name = "google/owlv2-base-patch16"
+ model = Owlv2Model.from_pretrained(model_name).to(torch_device)
+ processor = OwlViTProcessor.from_pretrained(model_name)
+ processor.image_processor.size = {"height": 1024, "width": 1024}
+
+ image = prepare_img()
+ inputs = processor(
+ text=[["a photo of a cat", "a photo of a dog"]],
+ images=image,
+ max_length=16,
+ padding="max_length",
+ return_tensors="pt",
+ ).to(torch_device)
+
+ # forward pass
+ with torch.no_grad():
+ outputs = model(**inputs, interpolate_pos_encoding=True)
+
+ # verify the logits
+ self.assertEqual(
+ outputs.logits_per_image.shape,
+ torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
+ )
+ self.assertEqual(
+ outputs.logits_per_text.shape,
+ torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
+ )
+ expected_logits = torch.tensor([[-6.2520, -8.2970]], device=torch_device)
+ self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
+ expected_shape = torch.Size((1, 4097, 768))
+ self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
+
+ # Owlv2ForObjectDetection part.
+ model = Owlv2ForObjectDetection.from_pretrained(model_name).to(torch_device)
+ processor.image_processor.size = {"height": 1024, "width": 1024}
+
+ with torch.no_grad():
+ outputs = model(**inputs, interpolate_pos_encoding=True)
+
+ num_queries = int((inputs.pixel_values.shape[-1] / model.config.vision_config.patch_size) ** 2)
+ self.assertEqual(outputs.pred_boxes.shape, torch.Size((1, num_queries, 4)))
+ expected_slice_boxes = torch.tensor(
+ [[0.2407, 0.0553, 0.4636], [0.1082, 0.0494, 0.1861], [0.2459, 0.0527, 0.4398]]
+ ).to(torch_device)
+ self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
+
+ model = Owlv2ForObjectDetection.from_pretrained(model_name).to(torch_device)
+ query_image = prepare_img()
+ inputs = processor(
+ images=image,
+ query_images=query_image,
+ max_length=16,
+ padding="max_length",
+ return_tensors="pt",
+ ).to(torch_device)
+
+ with torch.no_grad():
+ outputs = model.image_guided_detection(**inputs, interpolate_pos_encoding=True)
+
+ # No need to check the logits, we just check inference runs fine.
+ num_queries = int((inputs.pixel_values.shape[-1] / model.config.vision_config.patch_size) ** 2)
+ self.assertEqual(outputs.target_pred_boxes.shape, torch.Size((1, num_queries, 4)))
+
+ # Deactivate interpolate_pos_encoding on same model, and use default image size.
+        # Verify that toggling interpolate_pos_encoding dynamically updates self.sqrt_num_patches and self.box_bias in Owlv2ForObjectDetection.
+ processor = OwlViTProcessor.from_pretrained(model_name)
+
+ image = prepare_img()
+ inputs = processor(
+ text=[["a photo of a cat", "a photo of a dog"]],
+ images=image,
+ max_length=16,
+ padding="max_length",
+ return_tensors="pt",
+ ).to(torch_device)
+
+ with torch.no_grad():
+ outputs = model(**inputs, interpolate_pos_encoding=False)
+
+ num_queries = int((inputs.pixel_values.shape[-1] // model.config.vision_config.patch_size) ** 2)
+ self.assertEqual(outputs.pred_boxes.shape, torch.Size((1, num_queries, 4)))
+
+ expected_default_box_bias = torch.tensor(
+ [
+ [-4.0717, -4.0717, -4.0717, -4.0717],
+ [-3.3644, -4.0717, -4.0717, -4.0717],
+ [-2.9425, -4.0717, -4.0717, -4.0717],
+ ]
+ )
+
+ self.assertTrue(torch.allclose(model.box_bias[:3, :4], expected_default_box_bias, atol=1e-4))
+
+ # Interpolate with any resolution size.
+ processor.image_processor.size = {"height": 1264, "width": 1024}
+
+ image = prepare_img()
+ inputs = processor(
+ text=[["a photo of a cat", "a photo of a dog"]],
+ images=image,
+ max_length=16,
+ padding="max_length",
+ return_tensors="pt",
+ ).to(torch_device)
+
+ with torch.no_grad():
+ outputs = model(**inputs, interpolate_pos_encoding=True)
+
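+        # the input is no longer square, so count patches along each spatial dimension separately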
+ num_queries = int(
+ (inputs.pixel_values.shape[-2] // model.config.vision_config.patch_size)
+ * (inputs.pixel_values.shape[-1] // model.config.vision_config.patch_size)
+ )
+ self.assertEqual(outputs.pred_boxes.shape, torch.Size((1, num_queries, 4)))
+ expected_slice_boxes = torch.tensor(
+ [[0.2438, 0.0945, 0.4675], [0.1361, 0.0431, 0.2406], [0.2465, 0.0428, 0.4429]]
+ ).to(torch_device)
+ self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
+
+ query_image = prepare_img()
+ inputs = processor(
+ images=image,
+ query_images=query_image,
+ max_length=16,
+ padding="max_length",
+ return_tensors="pt",
+ ).to(torch_device)
+
+ with torch.no_grad():
+ outputs = model.image_guided_detection(**inputs, interpolate_pos_encoding=True)
+
+ # No need to check the logits, we just check inference runs fine.
+ num_queries = int(
+ (inputs.pixel_values.shape[-2] // model.config.vision_config.patch_size)
+ * (inputs.pixel_values.shape[-1] // model.config.vision_config.patch_size)
+ )
+ self.assertEqual(outputs.target_pred_boxes.shape, torch.Size((1, num_queries, 4)))
+
@slow
def test_inference_object_detection(self):
model_name = "google/owlv2-base-patch16"
diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py
index e0599a50fb98b4..545fee0c4fe3af 100644
--- a/tests/models/owlvit/test_modeling_owlvit.py
+++ b/tests/models/owlvit/test_modeling_owlvit.py
@@ -821,6 +821,144 @@ def test_inference(self):
expected_logits = torch.tensor([[3.4613, 0.9403]], device=torch_device)
self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
+ @slow
+ def test_inference_interpolate_pos_encoding(self):
+ model_name = "google/owlvit-base-patch32"
+ model = OwlViTModel.from_pretrained(model_name).to(torch_device)
+ processor = OwlViTProcessor.from_pretrained(model_name)
+ processor.image_processor.size = {"height": 800, "width": 800}
+
+ image = prepare_img()
+ inputs = processor(
+ text=[["a photo of a cat", "a photo of a dog"]],
+ images=image,
+ max_length=16,
+ padding="max_length",
+ return_tensors="pt",
+ ).to(torch_device)
+
+ # forward pass
+ with torch.no_grad():
+ outputs = model(**inputs, interpolate_pos_encoding=True)
+
+ # verify the logits
+ self.assertEqual(
+ outputs.logits_per_image.shape,
+ torch.Size((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
+ )
+ self.assertEqual(
+ outputs.logits_per_text.shape,
+ torch.Size((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
+ )
+ expected_logits = torch.tensor([[3.6278, 0.8861]], device=torch_device)
+ self.assertTrue(torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3))
+
+ expected_shape = torch.Size((1, 626, 768))
+ self.assertEqual(outputs.vision_model_output.last_hidden_state.shape, expected_shape)
+
+ # OwlViTForObjectDetection part.
+ model = OwlViTForObjectDetection.from_pretrained(model_name).to(torch_device)
+
+ with torch.no_grad():
+ outputs = model(**inputs, interpolate_pos_encoding=True)
+
+ num_queries = int((inputs.pixel_values.shape[-1] // model.config.vision_config.patch_size) ** 2)
+ self.assertEqual(outputs.pred_boxes.shape, torch.Size((1, num_queries, 4)))
+
+ expected_slice_boxes = torch.tensor(
+ [[0.0680, 0.0422, 0.1347], [0.2071, 0.0450, 0.4146], [0.2000, 0.0418, 0.3476]]
+ ).to(torch_device)
+ self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
+
+ model = OwlViTForObjectDetection.from_pretrained(model_name).to(torch_device)
+ query_image = prepare_img()
+ inputs = processor(
+ images=image,
+ query_images=query_image,
+ max_length=16,
+ padding="max_length",
+ return_tensors="pt",
+ ).to(torch_device)
+
+ with torch.no_grad():
+ outputs = model.image_guided_detection(**inputs, interpolate_pos_encoding=True)
+
+ # No need to check the logits, we just check inference runs fine.
+ num_queries = int((inputs.pixel_values.shape[-1] / model.config.vision_config.patch_size) ** 2)
+ self.assertEqual(outputs.target_pred_boxes.shape, torch.Size((1, num_queries, 4)))
+
+ # Deactivate interpolate_pos_encoding on same model, and use default image size.
+        # Verify that toggling interpolate_pos_encoding dynamically updates (self.sqrt_num_patch_h, self.sqrt_num_patch_w) and self.box_bias in OwlViTForObjectDetection.
+ processor = OwlViTProcessor.from_pretrained(model_name)
+
+ image = prepare_img()
+ inputs = processor(
+ text=[["a photo of a cat", "a photo of a dog"]],
+ images=image,
+ max_length=16,
+ padding="max_length",
+ return_tensors="pt",
+ ).to(torch_device)
+
+ with torch.no_grad():
+ outputs = model(**inputs, interpolate_pos_encoding=False)
+
+ num_queries = int((inputs.pixel_values.shape[-1] // model.config.vision_config.patch_size) ** 2)
+ self.assertEqual(outputs.pred_boxes.shape, torch.Size((1, num_queries, 4)))
+
+ expected_default_box_bias = torch.tensor(
+ [
+ [-3.1332, -3.1332, -3.1332, -3.1332],
+ [-2.3968, -3.1332, -3.1332, -3.1332],
+ [-1.9452, -3.1332, -3.1332, -3.1332],
+ ]
+ )
+ self.assertTrue(torch.allclose(model.box_bias[:3, :4], expected_default_box_bias, atol=1e-4))
+
+ # Interpolate with any resolution size.
+ processor.image_processor.size = {"height": 1264, "width": 1024}
+
+ image = prepare_img()
+ inputs = processor(
+ text=[["a photo of a cat", "a photo of a dog"]],
+ images=image,
+ max_length=16,
+ padding="max_length",
+ return_tensors="pt",
+ ).to(torch_device)
+
+ with torch.no_grad():
+ outputs = model(**inputs, interpolate_pos_encoding=True)
+
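+        # non-square input: the number of queries is the product of the patch counts along height and width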
+ num_queries = int(
+ (inputs.pixel_values.shape[-2] // model.config.vision_config.patch_size)
+ * (inputs.pixel_values.shape[-1] // model.config.vision_config.patch_size)
+ )
+ self.assertEqual(outputs.pred_boxes.shape, torch.Size((1, num_queries, 4)))
+ expected_slice_boxes = torch.tensor(
+ [[0.0499, 0.0301, 0.0983], [0.2244, 0.0365, 0.4663], [0.1387, 0.0314, 0.1859]]
+ ).to(torch_device)
+ self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
+
+ query_image = prepare_img()
+ inputs = processor(
+ images=image,
+ query_images=query_image,
+ max_length=16,
+ padding="max_length",
+ return_tensors="pt",
+ ).to(torch_device)
+
+ with torch.no_grad():
+ outputs = model.image_guided_detection(**inputs, interpolate_pos_encoding=True)
+
+ # No need to check the logits, we just check inference runs fine.
+ num_queries = int(
+ (inputs.pixel_values.shape[-2] // model.config.vision_config.patch_size)
+ * (inputs.pixel_values.shape[-1] // model.config.vision_config.patch_size)
+ )
+ self.assertEqual(outputs.target_pred_boxes.shape, torch.Size((1, num_queries, 4)))
+
@slow
def test_inference_object_detection(self):
model_name = "google/owlvit-base-patch32"
diff --git a/tests/models/pixtral/test_image_processing_pixtral.py b/tests/models/pixtral/test_image_processing_pixtral.py
index a45ead50612933..1377b676917f47 100644
--- a/tests/models/pixtral/test_image_processing_pixtral.py
+++ b/tests/models/pixtral/test_image_processing_pixtral.py
@@ -19,8 +19,15 @@
import numpy as np
import requests
-
-from transformers.testing_utils import require_torch, require_vision
+from packaging import version
+
+from transformers.testing_utils import (
+ require_torch,
+ require_torch_gpu,
+ require_vision,
+ slow,
+ torch_device,
+)
from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
@@ -157,6 +164,9 @@ def test_image_processor_properties(self):
self.assertTrue(hasattr(image_processing, "image_std"))
self.assertTrue(hasattr(image_processing, "do_convert_rgb"))
+    # The following tests are overridden as PixtralImageProcessor can return images of different sizes
+ # and thus doesn't support returning batched tensors
+
def test_call_pil(self):
for image_processing_class in self.image_processor_list:
# Initialize image_processing
@@ -273,6 +283,25 @@ def test_slow_fast_equivalence(self):
self.assertTrue(torch.allclose(encoding_slow.pixel_values[0][0], encoding_fast.pixel_values[0][0], atol=1e-2))
+ @slow
+ @require_torch_gpu
+ @require_vision
+ def test_can_compile_fast_image_processor(self):
+ if self.fast_image_processing_class is None:
+ self.skipTest("Skipping compilation test as fast image processor is not defined")
+ if version.parse(torch.__version__) < version.parse("2.3"):
+ self.skipTest(reason="This test requires torch >= 2.3 to run.")
+
+ torch.compiler.reset()
+ input_image = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)
+ image_processor = self.fast_image_processing_class(**self.image_processor_dict)
+ output_eager = image_processor(input_image, device=torch_device, return_tensors="pt")
+
+ image_processor = torch.compile(image_processor, mode="reduce-overhead")
+ output_compiled = image_processor(input_image, device=torch_device, return_tensors="pt")
+
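+        # PixtralImageProcessor returns per-image lists of tensors, so compare the first image's pixel values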
+ self.assertTrue(torch.allclose(output_eager.pixel_values[0][0], output_compiled.pixel_values[0][0], atol=1e-4))
+
@unittest.skip(reason="PixtralImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy
def test_call_numpy_4_channels(self):
pass
diff --git a/tests/models/sam/test_processor_sam.py b/tests/models/sam/test_processor_sam.py
index 22eb88d03d6b04..654f892062625a 100644
--- a/tests/models/sam/test_processor_sam.py
+++ b/tests/models/sam/test_processor_sam.py
@@ -26,7 +26,7 @@
)
from transformers.utils import is_tf_available, is_torch_available, is_vision_available
-from ...test_processing_common import prepare_image_inputs
+from ...test_processing_common import ProcessorTesterMixin, prepare_image_inputs
if is_vision_available():
@@ -43,7 +43,9 @@
@require_vision
@require_torchvision
-class SamProcessorTest(unittest.TestCase):
+class SamProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+ processor_class = SamProcessor
+
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
image_processor = SamImageProcessor()
@@ -56,11 +58,6 @@ def get_image_processor(self, **kwargs):
def tearDown(self):
shutil.rmtree(self.tmpdirname)
- # Processor tester class can't use ProcessorTesterMixin atm because the processor is atypical e.g. only contains an image processor
- def prepare_image_inputs(self):
- """This function prepares a list of PIL images."""
- return prepare_image_inputs()
-
def prepare_mask_inputs(self):
"""This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
or a list of PyTorch tensors if one specifies torchify=True.
@@ -69,6 +66,21 @@ def prepare_mask_inputs(self):
mask_inputs = [Image.fromarray(x) for x in mask_inputs]
return mask_inputs
+ def test_chat_template_save_loading(self):
+ self.skipTest("SamProcessor does not have a tokenizer")
+
+ def test_image_processor_defaults_preserved_by_image_kwargs(self):
+ self.skipTest("SamProcessor does not have a tokenizer")
+
+ def test_kwargs_overrides_default_image_processor_kwargs(self):
+ self.skipTest("SamProcessor does not have a tokenizer")
+
+ def test_kwargs_overrides_default_tokenizer_kwargs(self):
+ self.skipTest("SamProcessor does not have a tokenizer")
+
+ def test_tokenizer_defaults_preserved_by_kwargs(self):
+ self.skipTest("SamProcessor does not have a tokenizer")
+
def test_save_load_pretrained_additional_features(self):
processor = SamProcessor(image_processor=self.get_image_processor())
processor.save_pretrained(self.tmpdirname)
@@ -165,7 +177,7 @@ def get_image_processor(self, **kwargs):
def tearDown(self):
shutil.rmtree(self.tmpdirname)
- # Processor tester class can't use ProcessorTesterMixin as processor is atypical e.g. only contains an image processor and it assumes torch
+ # This is to avoid repeating the skipping of the common tests
def prepare_image_inputs(self):
"""This function prepares a list of PIL images."""
return prepare_image_inputs()
@@ -248,7 +260,7 @@ def get_image_processor(self, **kwargs):
def tearDown(self):
shutil.rmtree(self.tmpdirname)
- # Processor tester class can't use ProcessorTesterMixin atm because the processor is atypical e.g. only contains an image processor
+ # This is to avoid repeating the skipping of the common tests
def prepare_image_inputs(self):
"""This function prepares a list of PIL images."""
return prepare_image_inputs()
diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py
index c4287362b6bc1c..76094d0fe86272 100644
--- a/tests/quantization/bnb/test_4bit.py
+++ b/tests/quantization/bnb/test_4bit.py
@@ -172,7 +172,7 @@ def test_memory_footprint(self):
mem_fp16 = self.model_fp16.get_memory_footprint()
mem_4bit = self.model_4bit.get_memory_footprint()
- self.assertAlmostEqual(mem_fp16 / mem_4bit, self.EXPECTED_RELATIVE_DIFFERENCE)
+ self.assertAlmostEqual(mem_fp16 / mem_4bit, self.EXPECTED_RELATIVE_DIFFERENCE, delta=1e-5)
linear = get_some_linear_layer(self.model_4bit)
self.assertTrue(linear.weight.__class__ == Params4bit)
diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index 26e8cb2fc731ec..e73dd82f34a8ca 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -229,7 +229,7 @@ def test_memory_footprint(self):
mem_fp16 = self.model_fp16.get_memory_footprint()
mem_8bit = self.model_8bit.get_memory_footprint()
- self.assertAlmostEqual(mem_fp16 / mem_8bit, self.EXPECTED_RELATIVE_DIFFERENCE)
+ self.assertAlmostEqual(mem_fp16 / mem_8bit, self.EXPECTED_RELATIVE_DIFFERENCE, delta=1e-5)
self.assertTrue(get_some_linear_layer(self.model_8bit).weight.__class__ == Int8Params)
def test_linear_are_8bit(self):
@@ -938,8 +938,13 @@ class MixedInt8LlamaTest(MixedInt8Test):
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
EXPECTED_RELATIVE_DIFFERENCE = 1.7869331026479096
EXPECTED_OUTPUTS = set()
+
+ # Expected on Intel XPU
EXPECTED_OUTPUTS.add("Hello my name is John Smith and I am a software engineer. I")
+ # Expected on NVIDIA T4
+ EXPECTED_OUTPUTS.add("Hello my name is John and I am a software engineer. I have")
+
def test_int8_from_pretrained(self):
r"""
Test whether loading a 8bit model from the Hub works as expected
diff --git a/tests/quantization/higgs/__init__.py b/tests/quantization/higgs/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/quantization/higgs/test_higgs.py b/tests/quantization/higgs/test_higgs.py
new file mode 100644
index 00000000000000..26ee6bc0564777
--- /dev/null
+++ b/tests/quantization/higgs/test_higgs.py
@@ -0,0 +1,197 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import unittest
+
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HiggsConfig, OPTForCausalLM
+from transformers.testing_utils import (
+ require_accelerate,
+ require_flute_hadamard,
+ require_torch_gpu,
+ require_torch_multi_gpu,
+ slow,
+ torch_device,
+)
+from transformers.utils import is_accelerate_available, is_torch_available
+
+
+if is_torch_available():
+ import torch
+
+if is_accelerate_available():
+ from accelerate import init_empty_weights
+
+
+@require_torch_gpu
+class HiggsConfigTest(unittest.TestCase):
+ def test_to_dict(self):
+ """
+        Simple test that checks that converting a config to a dict yields the same values as the config object
+ """
+ quantization_config = HiggsConfig()
+ config_to_dict = quantization_config.to_dict()
+
+ for key in config_to_dict:
+ self.assertEqual(getattr(quantization_config, key), config_to_dict[key])
+
+ def test_from_dict(self):
+ """
+        Simple test that checks that building a config object from a dict yields the same values as the dict
+ """
+        config_dict = {"modules_to_not_convert": ["embed_tokens", "lm_head"], "quant_method": "higgs"}
+        quantization_config = HiggsConfig.from_dict(config_dict)
+
+        self.assertEqual(config_dict["modules_to_not_convert"], quantization_config.modules_to_not_convert)
+        self.assertEqual(config_dict["quant_method"], quantization_config.quant_method)
+
+
+@slow
+@require_torch_gpu
+@require_flute_hadamard
+@require_accelerate
+# @require_read_token
+class HiggsTest(unittest.TestCase):
+ model_name = "meta-llama/Meta-Llama-3.1-8B"
+
+ input_text = "A quick brown fox jumps over the"
+ max_new_tokens = 2
+
+ EXPECTED_OUTPUT = "A quick brown fox jumps over the lazy dog"
+
+ device_map = "cuda"
+
+    # called only once for all tests in this class
+ @classmethod
+ def setUpClass(cls):
+ """
+ Setup quantized model
+ """
+ quantization_config = HiggsConfig()
+ cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
+ cls.quantized_model = AutoModelForCausalLM.from_pretrained(
+ cls.model_name, device_map=cls.device_map, quantization_config=quantization_config
+ )
+
+ def tearDown(self):
+ gc.collect()
+ torch.cuda.empty_cache()
+ gc.collect()
+
+ def test_quantized_model_conversion(self):
+ """
+ Simple test that checks if the quantized model has been converted properly
+ """
+
+ from transformers.integrations import HiggsLinear, replace_with_higgs_linear
+
+ model_id = "facebook/opt-350m"
+ config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5")
+ quantization_config = HiggsConfig()
+
+ with init_empty_weights():
+ model = OPTForCausalLM(config)
+
+ nb_linears = 0
+ for module in model.modules():
+ if isinstance(module, torch.nn.Linear):
+ nb_linears += 1
+
+ model, _ = replace_with_higgs_linear(model, quantization_config=quantization_config)
+ nb_higgs_linear = 0
+ for module in model.modules():
+ if isinstance(module, HiggsLinear):
+ nb_higgs_linear += 1
+
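+        # by default the lm_head is left unconverted, so there is one fewer HiggsLinear than nn.Linear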
+ self.assertEqual(nb_linears - 1, nb_higgs_linear)
+
+ with init_empty_weights():
+ model = OPTForCausalLM(config)
+ quantization_config = HiggsConfig(modules_to_not_convert=["fc1"])
+ model, _ = replace_with_higgs_linear(model, quantization_config=quantization_config)
+ nb_higgs_linear = 0
+ for module in model.modules():
+ if isinstance(module, HiggsLinear):
+ nb_higgs_linear += 1
+
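+        # opt-350m has 24 decoder layers, so excluding "fc1" leaves 24 linears unconverted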
+ self.assertEqual(nb_linears - 24, nb_higgs_linear)
+
+ def test_quantized_model(self):
+ """
+ Simple test that checks if the quantized model is working properly
+ """
+ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
+
+ output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
+ self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
+
+ def test_save_pretrained(self):
+ """
+ Simple test that checks if the quantized model is working properly after being saved and loaded
+ """
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ self.quantized_model.save_pretrained(tmpdirname)
+
+ model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map)
+
+ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
+
+ output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
+ self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
+
+ @require_torch_multi_gpu
+ def test_quantized_model_multi_gpu(self):
+ """
+ Simple test that checks if the quantized model is working properly with multiple GPUs
+        set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUs
+ """
+ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
+ quantization_config = HiggsConfig()
+ quantized_model = AutoModelForCausalLM.from_pretrained(
+ self.model_name, device_map="auto", quantization_config=quantization_config
+ )
+ self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1})
+
+ output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
+ self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
+
+ @require_torch_multi_gpu
+ def test_save_pretrained_multi_gpu(self):
+ """
+        Simple test that checks if the quantized model is working properly after being saved and loaded across multiple GPUs
+ """
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ self.quantized_model.save_pretrained(tmpdirname)
+
+ model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto")
+ self.assertTrue(set(model.hf_device_map.values()) == {0, 1})
+
+ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
+
+ output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
+ self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
+
+    @unittest.skip("This will almost surely OOM. Enable when switched to a smaller model")
+ def test_dequantize(self):
+ """
+ Test the ability to dequantize a model
+ """
+ self.quantized_model.dequantize()
+
+ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device)
+
+ output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
+ self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
diff --git a/tests/quantization/vptq_integration/test_vptq.py b/tests/quantization/vptq_integration/test_vptq.py
index faa9a5879d1dcc..173afa7d003e43 100644
--- a/tests/quantization/vptq_integration/test_vptq.py
+++ b/tests/quantization/vptq_integration/test_vptq.py
@@ -44,7 +44,7 @@ def test_to_dict(self):
quantization_config = VptqConfig()
vptq_orig_config = quantization_config.to_dict()
- self.assertEqual(quantization_config.quant_config, vptq_orig_config["quant_config"])
+ self.assertEqual(vptq_orig_config["quant_method"], quantization_config.quant_method)
@slow
diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py
index 221552175a93e3..1cb92174df1d8a 100644
--- a/tests/test_image_processing_common.py
+++ b/tests/test_image_processing_common.py
@@ -23,10 +23,18 @@
import numpy as np
import requests
+from packaging import version
from transformers import AutoImageProcessor, BatchFeature
from transformers.image_utils import AnnotationFormat, AnnotionFormat
-from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision
+from transformers.testing_utils import (
+ check_json_file_has_correct_format,
+ require_torch,
+ require_torch_gpu,
+ require_vision,
+ slow,
+ torch_device,
+)
from transformers.utils import is_torch_available, is_vision_available
@@ -463,6 +471,25 @@ def test_image_processor_preprocess_arguments(self):
if not is_tested:
self.skipTest(reason="No validation found for `preprocess` method")
+ @slow
+ @require_torch_gpu
+ @require_vision
+ def test_can_compile_fast_image_processor(self):
+ if self.fast_image_processing_class is None:
+ self.skipTest("Skipping compilation test as fast image processor is not defined")
+ if version.parse(torch.__version__) < version.parse("2.3"):
+ self.skipTest(reason="This test requires torch >= 2.3 to run.")
+
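+        # reset torch.compile state so caches from earlier tests do not affect this run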
+ torch.compiler.reset()
+ input_image = torch.randint(0, 255, (3, 224, 224), dtype=torch.uint8)
+ image_processor = self.fast_image_processing_class(**self.image_processor_dict)
+ output_eager = image_processor(input_image, device=torch_device, return_tensors="pt")
+
+ image_processor = torch.compile(image_processor, mode="reduce-overhead")
+ output_compiled = image_processor(input_image, device=torch_device, return_tensors="pt")
+
+ self.assertTrue(torch.allclose(output_eager.pixel_values, output_compiled.pixel_values, atol=1e-4))
+
class AnnotationFormatTestMixin:
# this mixin adds a test to assert that usages of the
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 3dbe59f192293a..130eebf0b83801 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -1009,6 +1009,7 @@ def find_all_documented_objects() -> List[str]:
"ConvNextV2Backbone",
"DinatBackbone",
"Dinov2Backbone",
+ "Dinov2WithRegistersBackbone",
"FocalNetBackbone",
"HieraBackbone",
"MaskFormerSwinBackbone",