diff --git a/lavis/models/blip_diffusion_models/modeling_ctx_clip.py b/lavis/models/blip_diffusion_models/modeling_ctx_clip.py
index 737b77d3f..e1d8bf6f8 100644
--- a/lavis/models/blip_diffusion_models/modeling_ctx_clip.py
+++ b/lavis/models/blip_diffusion_models/modeling_ctx_clip.py
@@ -13,9 +13,22 @@ from transformers.models.clip.modeling_clip import (
     CLIPEncoder,
     CLIPPreTrainedModel,
-    _expand_mask,
 )
 
 
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+
 class CtxCLIPTextModel(CLIPPreTrainedModel):
     config_class = CLIPTextConfig
 
diff --git a/requirements.txt b/requirements.txt
index 3f246993f..bf61fb500 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,11 +20,11 @@ scikit-image
 sentencepiece
 spacy
 streamlit
-timm==0.4.12
+timm
 torch>=1.10.0
 torchvision
 tqdm
-transformers==4.33.2
+transformers
 webdataset
 wheel
 torchaudio
@@ -35,5 +35,5 @@ peft
 easydict==1.9
 pyyaml_env_tag==0.1
-open3d==0.13.0
+open3d
 h5py
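
Not part of the patch: a minimal, standalone sketch (illustrative names only) showing what the vendored _expand_mask helper produces, i.e. attended positions map to 0 and padded positions to the dtype's minimum value.

import torch

def _expand_mask(mask, dtype, tgt_len=None):
    # Same logic as the helper added in the patch above.
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len
    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
    inverted_mask = 1.0 - expanded_mask
    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)

# Toy attention mask: batch of 1, sequence length 4, last token is padding.
mask = torch.tensor([[1, 1, 1, 0]])
out = _expand_mask(mask, torch.float32)
print(out.shape)      # torch.Size([1, 1, 4, 4])
print(out[0, 0, 0])   # tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00, -3.4028e+38])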