forked from zsdonghao/Image-Captioning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_inference_demo.py
executable file
·123 lines (96 loc) · 4.53 KB
/
run_inference_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#! /usr/bin/python
# -*- coding: utf8 -*-
"""Generate captions for images by a given model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import numpy as np
import tensorflow as tf
import tensorlayer as tl
from buildmodel import *
# Command-line configuration. With no arguments the script looks for a
# checkpoint under "model/train", the vocabulary in
# "data/mscoco/word_counts.txt", and captions two MSCOCO val2014 images.
FLAGS = tf.flags.FLAGS

tf.flags.DEFINE_string(
    "checkpoint_path",
    "model/train",
    "Model checkpoint file or directory containing a "
    "model checkpoint file.")

tf.flags.DEFINE_string(
    "vocab_file",
    "data/mscoco/word_counts.txt",
    "Text file containing the vocabulary.")

# The default is a comma-separated list; main() strips the whitespace around
# each entry before globbing it.
tf.flags.DEFINE_string(
    "input_files",
    "data/mscoco/raw-data/val2014/COCO_val2014_000000224477.jpg, "
    "data/mscoco/raw-data/val2014/COCO_val2014_000000192970.jpg",
    "File pattern or comma-separated list of file patterns "
    "of image files.")

# Make tf.logging.info() messages visible.
tf.logging.set_verbosity(tf.logging.INFO)

# Decoding settings read by main():
#   max_caption_length - main() runs at most (max_caption_length - 1)
#                        decoding steps per caption.
#   top_k              - passed to tl.nlp.sample_top() when picking a word.
#   n_captions         - number of captions generated for each image.
max_caption_length = 30
top_k = 3
print("top k:%d" % top_k)
n_captions = 5
def main(_):
    """Caption every image matched by ``--input_files`` with a trained model.

    Builds the inference graph, restores the weights from
    ``--checkpoint_path``, then samples ``n_captions`` captions per image and
    prints them to stdout.

    Args:
        _: Unused; the argv list handed over by ``tf.app.run``.

    Raises:
        ValueError: If ``--checkpoint_path`` is a directory that holds no
            checkpoint.
    """
    # Text file containing the vocabulary.
    vocab_file = FLAGS.vocab_file
    # File pattern or comma-separated list of file patterns of image files.
    input_files = FLAGS.input_files

    # Resolve the checkpoint before building the graph so a bad path fails
    # fast. The flag may name a checkpoint file or a directory of them.
    checkpoint_path = FLAGS.checkpoint_path
    if tf.gfile.IsDirectory(checkpoint_path):
        checkpoint_dir = checkpoint_path
        checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
        if not checkpoint_path:
            # Report the directory that was searched; the lookup result
            # itself is empty at this point.
            raise ValueError(
                "No checkpoint file found in: %s" % checkpoint_dir)

    mode = 'inference'

    # Build the inference graph.
    g = tf.Graph()
    with g.as_default():
        images, input_seqs, target_seqs, input_mask, input_feed = Build_Inputs(
            mode, input_file_pattern=None)
        net_image_embeddings = Build_Image_Embeddings(
            mode, images, train_inception=False)
        net_seq_embeddings = Build_Seq_Embeddings(input_seqs)
        softmax, net_img_rnn, net_seq_rnn, state_feed = Build_Model(
            mode, net_image_embeddings, net_seq_embeddings, target_seqs,
            input_mask)
        # The saver has to be created while g is the default graph so that
        # it picks up the model variables.
        saver = tf.train.Saver()
    g.finalize()

    # Create the vocabulary.
    vocab = tl.nlp.Vocabulary(vocab_file)

    # Expand every comma-separated pattern into concrete file names.
    filenames = []
    for file_pattern in input_files.split(','):
        file_pattern = file_pattern.strip()
        if file_pattern:  # ignore empty entries such as a trailing comma
            filenames.extend(tf.gfile.Glob(file_pattern))
    tf.logging.info("Running caption generation on %d files matching %s",
                    len(filenames), input_files)
    if not filenames:
        # Nothing to caption: skip the session and checkpoint restore.
        tf.logging.warning("No image files matched: %s", input_files)
        return

    # Generate captions.
    with tf.Session(graph=g) as sess:
        # Load the model from checkpoint.
        tf.logging.info("Loading model from checkpoint: %s", checkpoint_path)
        saver.restore(sess, checkpoint_path)
        tf.logging.info("Successfully loaded checkpoint: %s",
                        os.path.basename(checkpoint_path))

        for filename in filenames:
            # The image is fed to the graph still encoded, so read it as raw
            # bytes; text mode would try to decode the JPEG data on Python 3.
            with tf.gfile.GFile(filename, "rb") as f:
                encoded_image = f.read()

            print("Captions for image %s:" % os.path.basename(filename))
            init_state = sess.run(net_img_rnn.final_state,
                                  feed_dict={"image_feed:0": encoded_image})
            # Concatenate the LSTM cell and hidden states into the layout
            # expected by state_feed ((1, 1024) per the original author's
            # note). It is the same for every caption of this image.
            start_state = np.hstack((init_state.c, init_state.h))

            for _caption in range(n_captions):
                state = start_state
                a_id = vocab.start_id
                words = []
                for _step in range(max_caption_length - 1):
                    softmax_output, state = sess.run(
                        [softmax, net_seq_rnn.final_state],
                        feed_dict={input_feed: [a_id], state_feed: state})
                    state = np.hstack((state.c, state.h))
                    # Sampling via top_k makes repeated captions differ.
                    a_id = tl.nlp.sample_top(softmax_output[0], top_k=top_k)
                    if a_id == vocab.end_id:
                        break
                    words.append(vocab.id_to_word(a_id))
                # Every word is followed by one space, as in the original
                # output format.
                sentence = ''.join(word + ' ' for word in words)
                print('# %s #' % sentence)
# Script entry point: tf.app.run() parses the command-line flags and then
# calls main() with the remaining argv.
if __name__ == '__main__':
    tf.app.run(main=main)