From 37e0974afcbccdc85da59d51b44e1437b6b3caea Mon Sep 17 00:00:00 2001
From: Matt
Date: Fri, 3 Mar 2023 00:18:11 +0000
Subject: [PATCH] Fix doctests for TFVisionTextDualEncoder (#21910)

---
 .../modeling_tf_vision_text_dual_encoder.py  | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
index 112da0aebff8ed..7efc3e8ae31cc7 100644
--- a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
@@ -272,10 +272,10 @@ def get_text_features(
         ```python
         >>> from transformers import TFVisionTextDualEncoderModel, AutoTokenizer

-        >>> model = TFVisionTextDualEncoderModel.from_pretrained("clip-italian/clip-italian")
+        >>> model = TFVisionTextDualEncoderModel.from_pretrained("clip-italian/clip-italian", from_pt=True)
         >>> tokenizer = AutoTokenizer.from_pretrained("clip-italian/clip-italian")

-        >>> inputs = tokenizer(["una foto di un gatto", "una foto di un cane"], padding=True, return_tensors="pt")
+        >>> inputs = tokenizer(["una foto di un gatto", "una foto di un cane"], padding=True, return_tensors="np")
         >>> text_features = model.get_text_features(**inputs)
         ```"""
         text_outputs = self.text_model(
@@ -313,7 +313,7 @@ def get_image_features(
         >>> import requests
         >>> from transformers import TFVisionTextDualEncoderModel, AutoImageProcessor

-        >>> model = VisionTextDualEncoderModel.from_pretrained("clip-italian/clip-italian")
+        >>> model = TFVisionTextDualEncoderModel.from_pretrained("clip-italian/clip-italian", from_pt=True)
         >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
@@ -380,7 +380,7 @@ def call(
         ... ]
         >>> images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
         >>> inputs = processor(
-        ...     text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="pt", padding=True
+        ...     text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="np", padding=True
         ... )
         >>> outputs = model(
         ...     input_ids=inputs.input_ids,
@@ -587,6 +587,8 @@ def from_vision_text_pretrained(
         if text_model.name != "text_model":
             raise ValueError("text model must be created with the name `text_model`.")

+        model(model.dummy_inputs)  # Ensure model is fully built
+
         return model

     @property