diff --git a/applications/Automatic_label/README.md b/applications/Automatic_label/README.md
index 1eaa40b52cfae..b912ec2f188ac 100644
--- a/applications/Automatic_label/README.md
+++ b/applications/Automatic_label/README.md
@@ -12,7 +12,8 @@ task = Appflow(app="auto_label",
)
url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
image_pil = load_image(url)
-result = task(image=image_pil)
+blip2_prompt = "describe the image"
+result = task(image=image_pil, blip2_prompt=blip2_prompt)
```
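+
+The captioning prompt is now passed in explicitly through `blip2_prompt`; the task asserts that it is present, so supply it on every call.
+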
Results
diff --git a/applications/image2text/README.md b/applications/image2text/README.md
new file mode 100644
index 0000000000000..0d75601493795
--- /dev/null
+++ b/applications/image2text/README.md
@@ -0,0 +1,67 @@
+### Image-to-Text Generation
+
+#### miniGPT4
+Before using miniGPT4, you need to download the corresponding weights and convert them; see [miniGPT4](../../paddlemix/examples/minigpt4/README.md) for details. Once the weight conversion is complete, place the model weight and configuration files as follows:
+```bash
+--PPMIX_HOME  # default path: /root/.paddlemix; can be changed via export PPMIX_HOME
+ --models
+ --miniGPT4
+ --MiniGPT4-7B
+ config.json
+ model_state.pdparams
+ special_tokens_map.json
+ image_preprocessor_config.json
+ preprocessor_config.json
+ tokenizer_config.json
+ model_config.json
+ sentencepiece.bpe.model
+ tokenizer.json
+ --MiniGPT4-13B
+ ...
+ ...
+ ...
+
+```
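+
+If the weights are kept somewhere other than the default cache, point the lookup root there before any model paths are resolved. A minimal sketch, assuming a hypothetical cache directory and that `PPMIX_HOME` is read when paddlemix resolves model paths:
+```python
+import os
+
+# Hypothetical cache location; set this before importing paddlemix.
+os.environ["PPMIX_HOME"] = "/data/paddlemix_cache"
+```
+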
+Once this is done, you can use Appflow for one-click prediction:
+```python
+import requests
+from PIL import Image
+
+from paddlemix import Appflow
+
+task = Appflow(app="image2text_generation",
+ models=["miniGPT4/MiniGPT4-7B"])
+url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png"
+image = Image.open(requests.get(url, stream=True).raw)
+minigpt4_text = "describe the image"
+result = task(image=image, minigpt4_text=minigpt4_text)
+```
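+
+The task reads an optional `generate_kwargs` input and falls back to its built-in defaults (greedy search, `max_length=300`) when it is absent, so decoding can be tuned per call. A minimal sketch reusing `task`, `image`, and `minigpt4_text` from above; note that a supplied dict replaces the defaults wholesale, so it should be complete:
+```python
+generate_kwargs = {
+    "max_length": 120,
+    "num_beams": 1,
+    "top_p": 1.0,
+    "top_k": 0,
+    "repetition_penalty": 1.0,
+    "length_penalty": 0.0,
+    "temperature": 1.0,
+    "decode_strategy": "greedy_search",
+    "eos_token_id": [[835], [2277, 29937]],  # MiniGPT4's "###" stop sequences
+}
+result = task(image=image, minigpt4_text=minigpt4_text, generate_kwargs=generate_kwargs)
+print(result["result"])
+```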
+
+Results
+
+
+
+| Image | Text | Generated text |
+|:----:|:----:|:----:|
+|![mugs](https://github.com/LokeZhou/PaddleMIX/assets/13300429/b5a95002-bb30-4683-8e62-ed21879f24e1) | describe the image|The image shows two mugs with cats on them, one is black and white and the other is blue and white. The mugs are sitting on a table with a book in the background. The mugs have a whimsical, cartoon-like appearance. The cats on the mugs are looking at each other with a playful expression. The overall style of the image is cute and fun.###|
+
+
+#### blip2
+
+```python
+from paddlemix import Appflow
+from ppdiffusers.utils import load_image
+
+task = Appflow(app="image2text_generation",
+ models=["paddlemix/blip2-caption-opt2.7b"])
+url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png"
+image_pil = load_image(url)
+blip2_prompt = "describe the image"
+result = task(image=image_pil, blip2_prompt=blip2_prompt)
+```
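+
+Besides the caption itself, the blip2 task also runs nltk POS tagging over the generated text and returns the deduplicated noun lemmas as a comma-separated tag string. A small sketch of reading both fields, continuing from the call above (the example values are illustrative):
+```python
+print(result["result"])  # the generated caption, e.g. "of the two coffee mugs with cats on them"
+print(result["prompt"])  # comma-separated noun tags extracted from the caption, e.g. "mug,cat"
+```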
+
+| Image | Text | Generated text |
+|:----:|:----:|:----:|
+|![mugs](https://github.com/LokeZhou/PaddleMIX/assets/13300429/b5a95002-bb30-4683-8e62-ed21879f24e1) | describe the image|of the two coffee mugs with cats on them|
+
+
diff --git a/paddlemix/appflow/configuration.py b/paddlemix/appflow/configuration.py
index 115fcf3f78374..d718342d084d0 100644
--- a/paddlemix/appflow/configuration.py
+++ b/paddlemix/appflow/configuration.py
@@ -16,7 +16,7 @@
StableDiffusionImg2ImgTask,
StableDiffusionUpscaleTask,
)
-from .image2text_generation import Blip2CaptionTask
+from .image2text_generation import Blip2CaptionTask, MiniGPT4Task
from .openset_det_sam import OpenSetDetTask, OpenSetSegTask
from .text2image_generation import StableDiffusionTask, VersatileDiffusionDualGuidedTask
from .text2image_inpaiting import StableDiffusionInpaintTask
@@ -137,4 +137,19 @@
"model": "damo-vilab/text-to-video-ms-1.7b",
},
},
+ "image2text_generation": {
+ "models": {
+ "paddlemix/blip2-caption-opt2.7b": {
+ "task_class": Blip2CaptionTask,
+ "task_flag": "autolabel_blip2-caption-opt2.7b",
+ },
+ "miniGPT4/MiniGPT4-7B": {
+ "task_class": MiniGPT4Task,
+ "task_flag": "image2text_generation-MiniGPT4-7B",
+ },
+ },
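+ # Fallback used when Appflow(app="image2text_generation") is built without an explicit models list.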
+ "default": {
+ "model": "paddlemix/blip2-caption-opt2.7b",
+ },
+ },
}
diff --git a/paddlemix/appflow/image2text_generation.py b/paddlemix/appflow/image2text_generation.py
index 1202560ee8fc5..d6bc380d5529d 100644
--- a/paddlemix/appflow/image2text_generation.py
+++ b/paddlemix/appflow/image2text_generation.py
@@ -17,7 +17,9 @@
import nltk
from paddlenlp.transformers import AutoTokenizer
+from paddlemix.models import MiniGPT4ForConditionalGeneration
from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration
+from paddlemix.processors import MiniGPT4Processor
from paddlemix.processors.blip_processing import (
Blip2Processor,
BlipImageProcessor,
@@ -64,8 +66,8 @@ def _preprocess(self, inputs):
""" """
image = inputs.get("image", None)
assert image is not None, "The image is None"
-
- prompt = "describe the image"
+ prompt = inputs.get("blip2_prompt", None)
+ assert prompt is not None, "The blip2_prompt is None"
blip2_input = self._processor(
images=image,
@@ -99,8 +101,7 @@ def _postprocess(self, inputs):
generated_text = self._processor.batch_decode(inputs["result"], skip_special_tokens=True)[0].strip()
logger.info("Generate text: {}".format(generated_text))
- inputs.pop("result", None)
-
+ inputs["result"] = generated_text
inputs["prompt"] = self._generate_tags(generated_text)
return inputs
@@ -111,6 +112,86 @@ def _generate_tags(self, caption):
nltk.download(["punkt", "averaged_perceptron_tagger", "wordnet"])
tags_list = [word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(caption)) if pos[0] == "N"]
tags_lemma = [lemma.lemmatize(w) for w in tags_list]
- tags = ", ".join(map(str, tags_lemma))
+ tags = ",".join(map(str, tags_lemma))
+ tags = set(tags.split(","))
+ new_tags = ",".join(tags)
+ return new_tags
+
+
+class MiniGPT4Task(AppTask):
+ def __init__(self, task, model, **kwargs):
+ super().__init__(task=task, model=model, **kwargs)
+
+ self._generate_kwargs = {
+ "max_length": 300,
+ "num_beams": 1,
+ "top_p": 1.0,
+ "top_k": 0,
+ "repetition_penalty": 1.0,
+ "length_penalty": 0.0,
+ "temperature": 1.0,
+ "decode_strategy": "greedy_search",
+ "eos_token_id": [[835], [2277, 29937]],
+ }
+ # Run in dynamic graph mode by default
+ self._static_mode = False
+
+ self._construct_processor(model)
+ self._construct_model(model)
+
+ def _construct_processor(self, model):
+ """
+ Construct the processor for the predictor.
+ """
+
+ self._processor = MiniGPT4Processor.from_pretrained(model)
+
+ def _construct_model(self, model):
+ """
+ Construct the inference model for the predictor.
+ """
+ # build model
+ model_instance = MiniGPT4ForConditionalGeneration.from_pretrained(self._task_path)
+
+ self._model = model_instance
+ self._model.eval()
+
+ def _preprocess(self, inputs):
+ """ """
+ image = inputs.get("image", None)
+ assert image is not None, "The image is None"
+ minigpt4_text = inputs.get("minigpt4_text", None)
+ assert minigpt4_text is not None, "The minigpt4_text is None"
+
+ prompt = "Give the following image: ImageContent. You will be able to see the image once I provide it to you. Please answer my questions.###Human: ###Assistant:"
+ minigpt4_input = self._processor([image], minigpt4_text, prompt)
+
+ inputs.pop("minigpt4_text", None)
+ inputs["minigpt4_input"] = minigpt4_input
+
+ return inputs
+
+ def _run_model(self, inputs):
+ """
+ Run the task model from the outputs of the `_preprocess` function.
+ """
+ generate_kwargs = inputs.get("generate_kwargs", None)
+ generate_kwargs = self._generate_kwargs if generate_kwargs is None else generate_kwargs
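+ # A caller-supplied generate_kwargs replaces the defaults wholesale rather than merging with them.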
+ outputs = self._model.generate(**inputs["minigpt4_input"], **generate_kwargs)
+
+ inputs.pop("minigpt4_input", None)
+
+ inputs["result"] = outputs
+
+ return inputs
+
+ def _postprocess(self, inputs):
+ """
+ The model output is token ids; this function converts them to raw text.
+ """
+ generated_text = self._processor.batch_decode(inputs["result"][0])[0]
+ logger.info("Generate text: {}".format(generated_text))
+
+ inputs["result"] = generated_text
+
+ return inputs