
[Data2Vec] Add data2vec vision #16760

Merged

29 commits
bfc336b
save intermediate
patrickvonplaten Apr 13, 2022
9c5909f
add vision
patrickvonplaten Apr 13, 2022
bf943a2
add vision
patrickvonplaten Apr 13, 2022
4c98167
save
patrickvonplaten Apr 14, 2022
3d9d6cb
finish models
patrickvonplaten Apr 14, 2022
694de11
finish models
patrickvonplaten Apr 14, 2022
d0a68f9
continue
patrickvonplaten Apr 14, 2022
392069c
finish
patrickvonplaten Apr 14, 2022
2eb555c
up
patrickvonplaten Apr 14, 2022
70ae17d
up
patrickvonplaten Apr 14, 2022
e98cfd8
up
patrickvonplaten Apr 14, 2022
e809dfe
tests all pass
patrickvonplaten Apr 14, 2022
335a2ec
clean up
patrickvonplaten Apr 14, 2022
ec8bc47
up
patrickvonplaten Apr 14, 2022
4c5b840
up
patrickvonplaten Apr 14, 2022
e4136f4
fix bugs in beit
patrickvonplaten Apr 14, 2022
2b65921
correct docs
patrickvonplaten Apr 14, 2022
ede8b78
finish
patrickvonplaten Apr 14, 2022
ac6e79d
finish docs
patrickvonplaten Apr 14, 2022
333d844
make style
patrickvonplaten Apr 14, 2022
6865b02
up
patrickvonplaten Apr 14, 2022
4b17c19
more fixes
patrickvonplaten Apr 14, 2022
b44a7ab
fix type hint
patrickvonplaten Apr 14, 2022
077441e
Merge branch 'main' of https://github.com/huggingface/transformers in…
patrickvonplaten Apr 14, 2022
4951bf9
make style
patrickvonplaten Apr 14, 2022
e6bc294
Apply suggestions from code review
patrickvonplaten Apr 18, 2022
6276bde
Update tests/data2vec/test_modeling_data2vec_vision.py
patrickvonplaten Apr 18, 2022
5bc2aac
Merge branch 'main' of https://github.com/huggingface/transformers in…
patrickvonplaten Apr 18, 2022
f67ba98
fix test
patrickvonplaten Apr 18, 2022
1 change: 1 addition & 0 deletions docs/source/en/index.mdx
@@ -190,6 +190,7 @@ Flax), PyTorch, and/or TensorFlow.
| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ |
| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ |
| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ |
| Data2VecVision | ❌ | ❌ | ✅ | ❌ | ❌ |
| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ |
| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ |
30 changes: 26 additions & 4 deletions docs/source/en/model_doc/data2vec.mdx
@@ -33,10 +33,13 @@ Models and code are available at www.github.com/pytorch/fairseq/tree/master/exam

Tips:

- Both Data2VecAudio and Data2VecText have been trained using the same self-supervised learning method.
In the case of Data2VecAudio, preprocessing is identical to [`RobertaModel`], including tokenization.
- Data2VecAudio, Data2VecText, and Data2VecVision have all been trained using the same self-supervised learning method.
- For Data2VecAudio, preprocessing is identical to [`Wav2Vec2Model`], including feature extraction.
- For Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization.
- For Data2VecVision, preprocessing is identical to [`BeitModel`], including feature extraction.

This model was contributed by [edugp](https://huggingface.co/edugp) and [patrickvonplaten](https://huggingface.co/patrickvonplaten).

This model was contributed by [edugp](https://huggingface.co/edugp).
The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/data2vec).


@@ -48,12 +51,16 @@ The original code can be found [here](https://github.com/pytorch/fairseq/tree/ma

[[autodoc]] Data2VecAudioConfig

## Data2VecVisionConfig

[[autodoc]] Data2VecVisionConfig


## Data2VecAudioModel

[[autodoc]] Data2VecAudioModel
- forward


## Data2VecAudioForAudioFrameClassification

[[autodoc]] Data2VecAudioForAudioFrameClassification
@@ -108,3 +115,18 @@ The original code can be found [here](https://github.com/pytorch/fairseq/tree/ma

[[autodoc]] Data2VecTextForQuestionAnswering
- forward

## Data2VecVisionModel

[[autodoc]] Data2VecVisionModel
- forward

## Data2VecVisionForImageClassification

[[autodoc]] Data2VecVisionForImageClassification
- forward

## Data2VecVisionForSemanticSegmentation

[[autodoc]] Data2VecVisionForSemanticSegmentation
- forward
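The three vision classes documented above share BEiT's architecture and API. A minimal sketch of calling the base model, using a tiny randomly initialized config for illustration (the sizes here are arbitrary, not a released checkpoint):

```python
import torch
from transformers import Data2VecVisionConfig, Data2VecVisionModel

# Tiny, randomly initialized config -- for illustration only, not a
# pretrained checkpoint. The field names mirror BeitConfig.
config = Data2VecVisionConfig(
    image_size=32,
    patch_size=16,
    hidden_size=32,
    num_hidden_layers=2,
    num_attention_heads=4,
    intermediate_size=64,
)
model = Data2VecVisionModel(config)
model.eval()

pixel_values = torch.randn(1, 3, 32, 32)  # (batch, channels, height, width)
with torch.no_grad():
    outputs = model(pixel_values)

# (32 / 16) ** 2 = 4 patches, plus one [CLS] token -> sequence length 5
print(outputs.last_hidden_state.shape)  # torch.Size([1, 5, 32])
```

For a real checkpoint, `from_pretrained` with a `data2vec-vision` model id on the Hub replaces the config-based construction.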
1 change: 1 addition & 0 deletions docs/source/en/serialization.mdx
@@ -54,6 +54,7 @@ Ready-made configurations include the following architectures:
- BlenderbotSmall
- CamemBERT
- Data2VecText
- Data2VecVision
- DistilBERT
- ELECTRA
- FlauBERT
26 changes: 24 additions & 2 deletions src/transformers/__init__.py
@@ -170,7 +170,13 @@
"models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"],
"models.cpm": ["CpmTokenizer"],
"models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"],
"models.data2vec": ["DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecAudioConfig", "Data2VecTextConfig"],
"models.data2vec": [
"DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
"DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP",
"Data2VecAudioConfig",
"Data2VecTextConfig",
"Data2VecVisionConfig",
],
"models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"],
"models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"],
"models.decision_transformer": ["DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "DecisionTransformerConfig"],
@@ -868,6 +874,7 @@
[
"DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST",
"DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
"DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST",
"Data2VecAudioForAudioFrameClassification",
"Data2VecAudioForCTC",
"Data2VecAudioForSequenceClassification",
@@ -882,6 +889,10 @@
"Data2VecTextForTokenClassification",
"Data2VecTextModel",
"Data2VecTextPreTrainedModel",
"Data2VecVisionForImageClassification",
"Data2VecVisionForSemanticSegmentation",
"Data2VecVisionModel",
"Data2VecVisionPreTrainedModel",
]
)
_import_structure["models.deberta"].extend(
@@ -2555,7 +2566,13 @@
from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig
from .models.cpm import CpmTokenizer
from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer
from .models.data2vec import DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig, Data2VecTextConfig
from .models.data2vec import (
DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP,
DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP,
Data2VecAudioConfig,
Data2VecTextConfig,
Data2VecVisionConfig,
)
from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer
from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config
from .models.decision_transformer import (
@@ -3151,6 +3168,7 @@
from .models.data2vec import (
DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST,
DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST,
Data2VecAudioForAudioFrameClassification,
Data2VecAudioForCTC,
Data2VecAudioForSequenceClassification,
@@ -3165,6 +3183,10 @@
Data2VecTextForTokenClassification,
Data2VecTextModel,
Data2VecTextPreTrainedModel,
Data2VecVisionForImageClassification,
Data2VecVisionForSemanticSegmentation,
Data2VecVisionModel,
Data2VecVisionPreTrainedModel,
)
from .models.deberta import (
DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
10 changes: 9 additions & 1 deletion src/transformers/models/auto/configuration_auto.py
@@ -59,6 +59,7 @@
("layoutlmv2", "LayoutLMv2Config"),
("plbart", "PLBartConfig"),
("beit", "BeitConfig"),
("data2vec-vision", "Data2VecVisionConfig"),
("rembert", "RemBertConfig"),
("visual_bert", "VisualBertConfig"),
("canine", "CanineConfig"),
@@ -162,6 +163,7 @@
("layoutlmv2", "LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("plbart", "PLBART_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("data2vec-vision", "DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("rembert", "REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("visual_bert", "VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -349,12 +351,18 @@
("layoutxlm", "LayoutXLM"),
("data2vec-audio", "Data2VecAudio"),
("data2vec-text", "Data2VecText"),
("data2vec-vision", "Data2VecVision"),
("dit", "DiT"),
]
)

SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
[("openai-gpt", "openai"), ("data2vec-audio", "data2vec"), ("data2vec-text", "data2vec")]
[
("openai-gpt", "openai"),
("data2vec-audio", "data2vec"),
("data2vec-text", "data2vec"),
("data2vec-vision", "data2vec"),
]
)
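All three data2vec variants live in the single `models/data2vec` module, which is why they need entries in the special-case map above rather than the default dash-to-underscore rule. A simplified, self-contained sketch of the library's `model_type_to_module_name` lookup:

```python
from collections import OrderedDict

# Mirrors SPECIAL_MODEL_TYPE_TO_MODULE_NAME from the diff above
SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
    [
        ("openai-gpt", "openai"),
        ("data2vec-audio", "data2vec"),
        ("data2vec-text", "data2vec"),
        ("data2vec-vision", "data2vec"),
    ]
)


def model_type_to_module_name(key: str) -> str:
    """Simplified sketch: special cases win, otherwise dashes become underscores."""
    if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME:
        return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key]
    return key.replace("-", "_")


print(model_type_to_module_name("data2vec-vision"))  # data2vec
print(model_type_to_module_name("layoutlmv2"))       # layoutlmv2
```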


3 changes: 3 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
@@ -55,6 +55,7 @@
("layoutlmv2", "LayoutLMv2Model"),
("plbart", "PLBartModel"),
("beit", "BeitModel"),
("data2vec-vision", "Data2VecVisionModel"),
("rembert", "RemBertModel"),
("visual_bert", "VisualBertModel"),
("canine", "CanineModel"),
@@ -290,6 +291,7 @@
("vit", "ViTForImageClassification"),
("deit", ("DeiTForImageClassification", "DeiTForImageClassificationWithTeacher")),
("beit", "BeitForImageClassification"),
("data2vec-vision", "Data2VecVisionForImageClassification"),
("segformer", "SegformerForImageClassification"),
("imagegpt", "ImageGPTForImageClassification"),
(
@@ -321,6 +323,7 @@
[
# Model for Semantic Segmentation mapping
("beit", "BeitForSemanticSegmentation"),
("data2vec-vision", "Data2VecVisionForSemanticSegmentation"),
("segformer", "SegformerForSemanticSegmentation"),
("dpt", "DPTForSemanticSegmentation"),
]
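With the mapping entries above, the Auto classes can route a `Data2VecVisionConfig` to the matching head. A sketch using `from_config` (randomly initialized, tiny sizes chosen purely for illustration):

```python
from transformers import AutoModelForImageClassification, Data2VecVisionConfig

# Tiny config for illustration; from_config builds the architecture
# without downloading any pretrained weights.
config = Data2VecVisionConfig(
    image_size=32,
    patch_size=16,
    hidden_size=32,
    num_hidden_layers=2,
    num_attention_heads=4,
    intermediate_size=64,
)
model = AutoModelForImageClassification.from_config(config)
print(type(model).__name__)  # Data2VecVisionForImageClassification
```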
17 changes: 9 additions & 8 deletions src/transformers/models/beit/modeling_beit.py
@@ -702,7 +702,8 @@ def forward(
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
return head_outputs + encoder_outputs[1:]

return BeitModelOutputWithPooling(
last_hidden_state=sequence_output,
@@ -713,7 +714,7 @@


class BeitPooler(nn.Module):
def __init__(self, config: BeitModel) -> None:
def __init__(self, config: BeitConfig) -> None:
super().__init__()
self.layernorm = (
nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.use_mean_pooling else None
@@ -736,7 +737,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
BEIT_START_DOCSTRING,
)
class BeitForMaskedImageModeling(BeitPreTrainedModel):
def __init__(self, config: BeitModel) -> None:
def __init__(self, config: BeitConfig) -> None:
super().__init__(config)

self.num_labels = config.num_labels
@@ -817,7 +818,7 @@ def forward(
masked_lm_loss = loss_fct(prediction_scores[bool_masked_pos], labels)

if not return_dict:
output = (prediction_scores,) + outputs[2:]
output = (prediction_scores,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

return MaskedLMOutput(
@@ -836,7 +837,7 @@ def forward(
BEIT_START_DOCSTRING,
)
class BeitForImageClassification(BeitPreTrainedModel):
def __init__(self, config: BeitModel) -> None:
def __init__(self, config: BeitConfig) -> None:
super().__init__(config)

self.num_labels = config.num_labels
@@ -1237,7 +1238,7 @@ def forward(
return_dict=return_dict,
)

encoder_hidden_states = outputs.hidden_states if return_dict else outputs[2]
encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]

# only keep certain features, and reshape
# note that we do +1 as the encoder_hidden_states also includes the initial embeddings
@@ -1268,9 +1269,9 @@

if not return_dict:
if output_hidden_states:
output = (logits,) + outputs[2:]
output = (logits,) + outputs[1:]
else:
output = (logits,) + outputs[3:]
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output

return SemanticSegmenterOutput(
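The BEiT changes above all trace back to one bug: when the base model is built without a pooler, the old code returned a `(sequence_output, None)` tuple, so the head classes' `outputs[2:]` slices were off by one. The fix drops the placeholder, shifting the slice indices from 2 to 1. A plain-Python sketch of the before/after tuple shapes (string stand-ins instead of tensors):

```python
def beit_base_outputs(sequence_output, pooled_output, encoder_extras):
    # Fixed behavior from the diff: omit the pooled output entirely when
    # the model has no pooler, instead of keeping a None placeholder.
    head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
    return head_outputs + encoder_extras

# With a pooler, hidden states still sit at index 2 ...
with_pooler = beit_base_outputs("seq", "pooled", ("hidden_states",))
# ... without one, they shift to index 1 -- hence outputs[2:] -> outputs[1:]
# in BeitForMaskedImageModeling and the segmentation head in the diff.
without_pooler = beit_base_outputs("seq", None, ("hidden_states",))

print(with_pooler)     # ('seq', 'pooled', 'hidden_states')
print(without_pooler)  # ('seq', 'hidden_states')
```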
26 changes: 26 additions & 0 deletions src/transformers/models/data2vec/__init__.py
@@ -31,6 +31,11 @@
"Data2VecTextConfig",
"Data2VecTextOnnxConfig",
],
"configuration_data2vec_vision": [
"DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP",
"Data2VecVisionConfig",
"Data2VecVisionOnnxConfig",
],
}

if is_torch_available():
@@ -54,6 +59,14 @@
"Data2VecTextModel",
"Data2VecTextPreTrainedModel",
]
_import_structure["modeling_data2vec_vision"] = [
"DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST",
"Data2VecVisionForImageClassification",
"Data2VecVisionForMaskedImageModeling",
"Data2VecVisionForSemanticSegmentation",
"Data2VecVisionModel",
"Data2VecVisionPreTrainedModel",
]

if TYPE_CHECKING:
from .configuration_data2vec_audio import DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig
@@ -62,6 +75,11 @@
Data2VecTextConfig,
Data2VecTextOnnxConfig,
)
from .configuration_data2vec_vision import (
DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP,
Data2VecVisionConfig,
Data2VecVisionOnnxConfig,
)

if is_torch_available():
from .modeling_data2vec_audio import (
@@ -84,6 +102,14 @@
Data2VecTextModel,
Data2VecTextPreTrainedModel,
)
from .modeling_data2vec_vision import (
DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST,
Data2VecVisionForImageClassification,
Data2VecVisionForMaskedImageModeling,
Data2VecVisionForSemanticSegmentation,
Data2VecVisionModel,
Data2VecVisionPreTrainedModel,
)

else:
import sys