diff --git a/applications/Automatic_label/README.md b/applications/Automatic_label/README.md
index 1eaa40b52cfae..b912ec2f188ac 100644
--- a/applications/Automatic_label/README.md
+++ b/applications/Automatic_label/README.md
@@ -12,7 +12,8 @@ task = Appflow(app="auto_label",
                )
 url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
 image_pil = load_image(url)
-result = task(image=image_pil)
+blip2_prompt = 'describe the image'
+result = task(image=image_pil, blip2_prompt=blip2_prompt)
 ```
 Results
diff --git a/applications/image2text/README.md b/applications/image2text/README.md
new file mode 100644
index 0000000000000..0d75601493795
--- /dev/null
+++ b/applications/image2text/README.md
@@ -0,0 +1,67 @@
+# Image-to-Text Generation
+
+## miniGPT4
+Before using miniGPT4, download the corresponding weights and convert them as described in [miniGPT4](../../paddlemix/examples/minigpt4/README.md). After the conversion, place the model weights and configuration files as follows:
+```bash
+--PPMIX_HOME  # defaults to /root/.paddlemix; override via export PPMIX_HOME
+  --models
+    --miniGPT4
+      --MiniGPT4-7B
+        config.json
+        model_state.pdparams
+        special_tokens_map.json
+        image_preprocessor_config.json
+        preprocessor_config.json
+        tokenizer_config.json
+        model_config.json
+        sentencepiece.bpe.model
+        tokenizer.json
+      --MiniGPT4-13B
+        ...
+        ...
+        ...
+
+```
+Once this is done, Appflow can run prediction in one step:
+```python
+import requests
+from PIL import Image
+
+from paddlemix import Appflow
+
+task = Appflow(app="image2text_generation",
+               models=["miniGPT4/MiniGPT4-7B"])
+url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png"
+image = Image.open(requests.get(url, stream=True).raw)
+minigpt4_text = "describe the image"
+result = task(image=image, minigpt4_text=minigpt4_text)
+```
+
+Results
+
+| Image | text | Generated text|
+|:----:|:----:|:----:|
+|![mugs](https://github.com/LokeZhou/PaddleMIX/assets/13300429/b5a95002-bb30-4683-8e62-ed21879f24e1) | describe the image|The image shows two mugs with cats on them, one is black and white and the other is blue and white. The mugs are sitting on a table with a book in the background. The mugs have a whimsical, cartoon-like appearance. The cats on the mugs are looking at each other with a playful expression. The overall style of the image is cute and fun.###|
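+
+The decoding defaults live in `MiniGPT4Task` (`paddlemix/appflow/image2text_generation.py`), whose `_run_model` also reads an optional `generate_kwargs` entry from the task inputs. Assuming Appflow forwards extra keyword arguments into those inputs, the defaults could be overridden per call. This is a minimal sketch, not a tested recipe:
+
+```python
+# Keys mirror MiniGPT4Task's built-in defaults; [[835], [2277, 29937]] are the
+# token ids of MiniGPT4's "###" conversation separator, used as stop sequences.
+generate_kwargs = {
+    "max_length": 300,
+    "num_beams": 1,
+    "decode_strategy": "greedy_search",
+    "eos_token_id": [[835], [2277, 29937]],
+}
+result = task(image=image, minigpt4_text=minigpt4_text, generate_kwargs=generate_kwargs)
+```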
+
+## blip2
+
+```python
+from paddlemix import Appflow
+from ppdiffusers.utils import load_image
+
+task = Appflow(app="image2text_generation",
+               models=["paddlemix/blip2-caption-opt2.7b"])
+url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png"
+image_pil = load_image(url)
+blip2_prompt = 'describe the image'
+result = task(image=image_pil, blip2_prompt=blip2_prompt)
+```
+
+| Image | text | Generated text|
+|:----:|:----:|:----:|
+|![mugs](https://github.com/LokeZhou/PaddleMIX/assets/13300429/b5a95002-bb30-4683-8e62-ed21879f24e1) | describe the image|of the two coffee mugs with cats on them|
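+
+Note that `paddlemix/blip2-caption-opt2.7b` is registered as this app's default model in `paddlemix/appflow/configuration.py`, so for blip2 the `models` argument can presumably be omitted:
+
+```python
+task = Appflow(app="image2text_generation")  # falls back to the registered default (blip2)
+```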
diff --git a/paddlemix/appflow/configuration.py b/paddlemix/appflow/configuration.py
index 115fcf3f78374..d718342d084d0 100644
--- a/paddlemix/appflow/configuration.py
+++ b/paddlemix/appflow/configuration.py
@@ -16,7 +16,7 @@
     StableDiffusionImg2ImgTask,
     StableDiffusionUpscaleTask,
 )
-from .image2text_generation import Blip2CaptionTask
+from .image2text_generation import Blip2CaptionTask, MiniGPT4Task
 from .openset_det_sam import OpenSetDetTask, OpenSetSegTask
 from .text2image_generation import StableDiffusionTask, VersatileDiffusionDualGuidedTask
 from .text2image_inpaiting import StableDiffusionInpaintTask
@@ -137,4 +137,19 @@
             "model": "damo-vilab/text-to-video-ms-1.7b",
         },
     },
+    "image2text_generation": {
+        "models": {
+            "paddlemix/blip2-caption-opt2.7b": {
+                "task_class": Blip2CaptionTask,
+                "task_flag": "autolabel_blip2-caption-opt2.7b",
+            },
+            "miniGPT4/MiniGPT4-7B": {
+                "task_class": MiniGPT4Task,
+                "task_flag": "image2text_generation-MiniGPT4-7B",
+            },
+        },
+        "default": {
+            "model": "paddlemix/blip2-caption-opt2.7b",
+        },
+    },
 }
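For orientation, each registry entry above maps a model name to a `task_class` and `task_flag`, with a per-app `default`. A minimal sketch of resolving such an entry by hand (the `APPLICATIONS` dict name and the constructor call are assumptions for illustration; Appflow's actual dispatch may differ):

```python
from paddlemix.appflow.configuration import APPLICATIONS  # dict name is an assumption

# Look up the task class and flag exactly as the registry is laid out.
entry = APPLICATIONS["image2text_generation"]["models"]["miniGPT4/MiniGPT4-7B"]
task = entry["task_class"](task=entry["task_flag"], model="miniGPT4/MiniGPT4-7B")  # -> MiniGPT4Task
```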
"top_p": 1.0, + "top_k": 0, + "repetition_penalty": 1.0, + "length_penalty": 0.0, + "temperature": 1.0, + "decode_strategy": "greedy_search", + "eos_token_id": [[835], [2277, 29937]], + } + # Default to static mode + self._static_mode = False + + self._construct_processor(model) + self._construct_model(model) + + def _construct_processor(self, model): + """ + Construct the tokenizer for the predictor. + """ + + self._processor = MiniGPT4Processor.from_pretrained(model) + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + # bulid model + model_instance = MiniGPT4ForConditionalGeneration.from_pretrained(self._task_path) + + self._model = model_instance + self._model.eval() - return tags + def _preprocess(self, inputs): + """ """ + image = inputs.get("image", None) + assert image is not None, "The image is None" + minigpt4_text = inputs.get("minigpt4_text", None) + assert minigpt4_text is not None, "The minigpt4_text is None" + + prompt = "Give the following image: ImageContent. You will be able to see the image once I provide it to you. Please answer my questions.###Human: ###Assistant:" + minigpt4_input = self._processor([image], minigpt4_text, prompt) + + inputs.pop("minigpt4_text", None) + inputs["minigpt4_input"] = minigpt4_input + + return inputs + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_preprocess` function. + """ + generate_kwargs = inputs.get("generate_kwargs", None) + generate_kwargs = self._generate_kwargs if generate_kwargs is None else generate_kwargs + outputs = self._model.generate(**inputs["minigpt4_input"], **generate_kwargs) + + inputs.pop("minigpt4_input", None) + + inputs["result"] = outputs + + return inputs + + def _postprocess(self, inputs): + """ + The model output is tag ids, this function will convert the model output to raw text. + """ + generated_text = self._processor.batch_decode(inputs["result"][0])[0] + logger.info("Generate text: {}".format(generated_text)) + + inputs["result"] = generated_text + + return inputs