Skip to content

Commit

Permalink
Merge pull request PaddlePaddle#234 from LokeZhou/visualglm
Browse files Browse the repository at this point in the history
Visualglm
  • Loading branch information
LokeZhou authored Oct 16, 2023
2 parents 6d20f5e + 72da4f5 commit 737ca99
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 14 deletions.
1 change: 0 additions & 1 deletion applications/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ result = task(prompt=prompt)['result']
| [文本引导的图像放大(Text-Guided Image Upscaling)](./image2image/README.md/#文本引导的图像放大text-guided-image-upscaling) | `ldm-super-resolution-4x-openimages`||
| [文本引导的图像编辑(Text-Guided Image Inpainting)](./Inpainting/README.md/#文本引导的图像编辑text-guided-image-inpainting) | `stable-diffusion-2-inpainting` | [fastdeploy](../ppdiffusers/deploy/README.md/#文本引导的图像编辑text-guided-image-inpainting) |
| [文本引导的图像变换(Image-to-Image Text-Guided Generation)](./image2image/README.md/#文本引导的图像变换image-to-image-text-guided-generation) | `stable-diffusion-v1-5` | [fastdeploy](../ppdiffusers/deploy/README.md/#文本引导的图像变换image-to-image-text-guided-generation) |
| [文本图像双引导图像生成(Dual Text and Image Guided Generation)](./image2image/README.md/#文本图像双引导图像生成dual-text-and-image-guided-generation) | `versatile-diffusion` ||
| [文本条件的视频生成(Text-to-Video Generation)](./text2video/README.md/#文本条件的视频生成text-to-video-generation) | `text-to-video-ms-1.7b` ||
| [音频生成图像(Audio-to-Image Generation)](./Audio2Img/README.md/#audio-to-image) | `imagebind stable-diffusion-2-1-unclip` | |
| [音频描述(Audio-to-Caption Generation)](./Audio2Caption/README.md/#音频描述audio-to-caption-generation) | `chatglm-6b whisper` | |
Expand Down
1 change: 0 additions & 1 deletion applications/README_en.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ result = task(prompt=prompt)['result']
| [文本引导的图像放大(Text-Guided Image Upscaling)](./image2image/README.md/#文本引导的图像放大text-guided-image-upscaling) | `ldm-super-resolution-4x-openimages`||
| [文本引导的图像编辑(Text-Guided Image Inpainting)](./Inpainting/README.md/#文本引导的图像编辑text-guided-image-inpainting) | `stable-diffusion-2-inpainting` | [fastdeploy](../ppdiffusers/deploy/README.md/#文本引导的图像编辑text-guided-image-inpainting) |
| [文本引导的图像变换(Image-to-Image Text-Guided Generation)](./image2image/README.md/#文本引导的图像变换image-to-image-text-guided-generation) | `stable-diffusion-v1-5` | [fastdeploy](../ppdiffusers/deploy/README.md/#文本引导的图像变换image-to-image-text-guided-generation) |
| [文本图像双引导图像生成(Dual Text and Image Guided Generation)](./image2image/README.md/#文本图像双引导图像生成dual-text-and-image-guided-generation) | `versatile-diffusion` ||
| [文本条件的视频生成(Text-to-Video Generation)](./text2video/README.md/#文本条件的视频生成text-to-video-generation) | `text-to-video-ms-1.7b` ||

More applications under continuous development......
Expand Down
1 change: 1 addition & 0 deletions paddlemix/examples/visualglm/run_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def predict(args):
query = "写诗描述一下这个场景"
history = []
inputs = processor(image, query)

generate_ids, _ = model.generate(**inputs, **generate_kwargs)
responses = processor.get_responses(generate_ids)
history.append([query, responses[0]])
Expand Down
97 changes: 85 additions & 12 deletions paddlemix/models/visualglm/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1445,13 +1445,86 @@ def __init__(self, config: ChatGLMConfig):
super(ChatGLMForConditionalGenerationWithImage, self).__init__(config)
self.config = config

def generate_inputs_position_ids(self, input_ids):
    """Compute position ids for generation from the mask-token locations.

    For each sequence, the gMASK token is preferred when present, otherwise
    the plain MASK token is used; the index of that token's first occurrence
    is fed to ``get_position_ids`` together with a per-sequence flag saying
    whether the chosen token equals the gMASK id.

    Args:
        input_ids: batch of token-id sequences (2-D tensor).

    Returns:
        Position ids as produced by ``self.get_position_ids``.
    """
    mask_id = self.config.mask_token_id
    gmask_id = self.config.gmask_token_id

    use_gmasks, mask_positions = [], []
    for sequence in input_ids:
        # Prefer gMASK when the sequence contains one.
        chosen = gmask_id if gmask_id in sequence else mask_id
        # NOTE: the flag compares ids (not membership) on purpose — it stays
        # True when mask_token_id == gmask_token_id, matching the original.
        use_gmasks.append(chosen == gmask_id)
        # First occurrence of the chosen mask token in this sequence.
        mask_positions.append(paddle.where(sequence == chosen)[0][0])

    return self.get_position_ids(input_ids, mask_positions=mask_positions, use_gmasks=use_gmasks)

def get_masks(self, input_ids):
    """Build the 4-D attention mask for ChatGLM-style attention.

    The mask is causal (lower-triangular) over the whole sequence, except
    that every position may attend to the full context prefix — the tokens
    before the first ``bos_token_id`` in each sequence.

    Args:
        input_ids: batch of token-id sequences, shape ``[batch, seq_len]``.
            Each sequence is assumed to contain a bos token — TODO confirm;
            an absent bos raises an IndexError (same as the original code).

    Returns:
        ``int64`` tensor of shape ``[batch, 1, seq_len, seq_len]`` with 1 where
        attention is allowed and 0 elsewhere.
    """
    batch_size, seq_len = input_ids.shape

    # Start from a causal (lower-triangular) mask for every sequence.
    mask = paddle.tril(paddle.ones([batch_size, seq_len, seq_len]))

    # Open up the context prefix: all positions see tokens before bos.
    for row, sequence in enumerate(input_ids):
        context_len = paddle.where(sequence == self.config.bos_token_id)[0][0]
        mask[row, :, :context_len] = 1

    # Insert the head dimension and binarize to int64.
    return (mask.unsqueeze(1) > 0.5).astype("int64")

def prepare_inputs_for_generation(
self,
input_ids,
position_ids=None,
attention_mask=None,
past_key_values=None,
cache=None,
inputs_embeds=None,
**kwargs
):

if cache is None and inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}

if cache is not None or past_key_values is not None:
last_token = input_ids[:, -1].unsqueeze(-1)
attention_mask = attention_mask[:, :, -1:]
position_ids = position_ids[..., -1:]

if cache is None:
cache = past_key_values

model_inputs.update(
{
"input_ids": last_token,
"cache": cache[-1],
"position_ids": position_ids,
"use_cache": True,
"attention_mask": attention_mask,
}
)

return model_inputs
else:

model_inputs.update(
{
"cache": cache,
"position_ids": position_ids,
"use_cache": True,
"attention_mask": attention_mask,
}
)

return model_inputs

def forward(
self,
image_features: paddle.Tensor,
input_ids: paddle.Tensor,
input_ids: Optional[paddle.Tensor] = None,
position_ids: Optional[paddle.Tensor] = None,
attention_mask: Optional[paddle.Tensor] = None,
pre_image_length: Optional[int] = None,
cache: Optional[Tuple[paddle.Tensor]] = None,
inputs_embeds: Optional[paddle.Tensor] = None,
labels: Optional[paddle.Tensor] = None,
Expand All @@ -1460,12 +1533,6 @@ def forward(
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

if inputs_embeds is None and cache is None and image_features is not None:
pre_ids, pad_ids, post_ids = paddle.split(input_ids, num_or_sections=[pre_image_length, 32, -1], axis=1)
pre_txt_emb = self.chatglm.transformer.word_embeddings(pre_ids)
post_txt_emb = self.chatglm.transformer.word_embeddings(post_ids)
inputs_embeds = paddle.concat([pre_txt_emb, image_features, post_txt_emb], axis=1)

outputs = super().forward(
input_ids=input_ids,
position_ids=position_ids,
Expand Down Expand Up @@ -1566,11 +1633,17 @@ def generate(
"""

image_features = self.encode_images(pixel_values)
attention_mask = self.language_model.get_masks(input_ids)
position_ids = self.language_model.generate_inputs_position_ids(input_ids)
if image_features is not None:
pre_ids, pad_ids, post_ids = paddle.split(input_ids, num_or_sections=[pre_image_length, 32, -1], axis=1)
pre_txt_emb = self.language_model.chatglm.transformer.word_embeddings(pre_ids)
post_txt_emb = self.language_model.chatglm.transformer.word_embeddings(post_ids)
inputs_embeds = paddle.concat([pre_txt_emb, image_features, post_txt_emb], axis=1)

outputs = self.language_model.generate(
input_ids=input_ids,
image_features=image_features,
pre_image_length=pre_image_length,
inputs_embeds=inputs_embeds,
position_ids=position_ids,
attention_mask=attention_mask,
**generate_kwargs,
)
Expand Down

0 comments on commit 737ca99

Please sign in to comment.