From 0a7220ed201eee2ad463264a72d5ce42c33dbcc1 Mon Sep 17 00:00:00 2001 From: jwyang Date: Fri, 25 Aug 2017 18:18:04 -0400 Subject: [PATCH 01/13] use pretrained caffe model --- demo.py | 288 ++++++++++++++++++++++++++++++++++++++++++++++++ test_net.py | 11 +- trainval_net.py | 45 ++++++-- 3 files changed, 329 insertions(+), 15 deletions(-) create mode 100644 demo.py diff --git a/demo.py b/demo.py new file mode 100644 index 000000000..f05920844 --- /dev/null +++ b/demo.py @@ -0,0 +1,288 @@ +# -------------------------------------------------------- +# Tensorflow Faster R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Jiasen Lu, Jianwei Yang, based on code from Ross Girshick +# -------------------------------------------------------- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import _init_paths +import os +import sys +import numpy as np +import argparse +import pprint +import pdb +import time +import cv2 +import cPickle +import torch +from torch.autograd import Variable +import torch.nn as nn +import torch.optim as optim + +import torchvision.transforms as transforms +import torchvision.datasets as dset + +from roi_data_layer.roidb import combined_roidb +from roi_data_layer.roibatchLoader import roibatchLoader +from model.utils.config import cfg, cfg_from_file, cfg_from_list, get_output_dir +from model.faster_rcnn.faster_rcnn import _fasterRCNN +from model.rpn.bbox_transform import clip_boxes +from model.nms.nms_wrapper import nms +from model.fast_rcnn.nms_wrapper import nms +from model.rpn.bbox_transform import bbox_transform_inv +from model.utils.network import save_net, load_net, vis_detections +import pdb + +def parse_args(): + """ + Parse input arguments + """ + parser = argparse.ArgumentParser(description='Train a Fast R-CNN network') + parser.add_argument('--cfg', dest='cfg_file', + help='optional config file', + default='cfgs/vgg16.yml', type=str) + parser.add_argument('--imdb', dest='imdb_name', + help='dataset to train on', + default='voc_2007_trainval', type=str) + parser.add_argument('--imdbval', dest='imdbval_name', + help='dataset to validate on', + default='voc_2007_test', type=str) + parser.add_argument('--net', dest='net', + help='vgg16, res50, res101, res152', + default='vgg16', type=str) + parser.add_argument('--set', dest='set_cfgs', + help='set config keys', default=None, + nargs=argparse.REMAINDER) + parser.add_argument('--load_dir', dest='load_dir', + help='directory to load models', default="models", + nargs=argparse.REMAINDER) + parser.add_argument('--ngpu', dest='ngpu', + help='number of gpu', + default=1, type=int) + parser.add_argument('--checksession', dest='checksession', + help='checksession to load model', + default=4, type=int) + parser.add_argument('--checkepoch', dest='checkepoch', + help='checkepoch to load network', + default=6, type=int) + parser.add_argument('--checkpoint', dest='checkpoint', + help='checkpoint to load network', + default=10000, type=int) + + args = parser.parse_args() + return args + +lr = cfg.TRAIN.LEARNING_RATE +momentum = cfg.TRAIN.MOMENTUM +weight_decay = cfg.TRAIN.WEIGHT_DECAY + +def _get_image_blob(im): + """Converts an image into a network input. 
+ Arguments: + im (ndarray): a color image in BGR order + Returns: + blob (ndarray): a data blob holding an image pyramid + im_scale_factors (list): list of image scales (relative to im) used + in the image pyramid + """ + im_orig = im.astype(np.float32, copy=True) + im_orig -= cfg.PIXEL_MEANS + + im_shape = im_orig.shape + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + + processed_ims = [] + im_scale_factors = [] + + for target_size in cfg.TEST.SCALES: + im_scale = float(target_size) / float(im_size_min) + # Prevent the biggest axis from being more than MAX_SIZE + if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: + im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) + im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, + interpolation=cv2.INTER_LINEAR) + im_scale_factors.append(im_scale) + processed_ims.append(im) + + # Create a blob to hold the input images + blob = im_list_to_blob(processed_ims) + + return blob, np.array(im_scale_factors) + +def im_detect(net, im): + blobs, im_scales = _get_blobs(im) + assert len(im_scales) == 1, "Only single-image batch implemented" + + im_blob = blobs['data'] + blobs['im_info'] = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32) + + _, scores, bbox_pred, rois = net.test_image(blobs['data'], blobs['im_info']) + + boxes = rois[:, 1:5] / im_scales[0] + scores = np.reshape(scores, [scores.shape[0], -1]) + bbox_pred = np.reshape(bbox_pred, [bbox_pred.shape[0], -1]) + if cfg.TEST.BBOX_REG: + # Apply bounding-box regression deltas + box_deltas = bbox_pred + pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy() + pred_boxes = _clip_boxes(pred_boxes, im.shape) + else: + # Simply repeat the boxes, once for each class + pred_boxes = np.tile(boxes, (1, scores.shape[1])) + + return scores, pred_boxes + +if __name__ == '__main__': + + args = parse_args() + + print('Called with args:') + print(args) + + if args.cfg_file is not None: + cfg_from_file(args.cfg_file) + if args.set_cfgs is not None: + cfg_from_list(args.set_cfgs) + + print('Using config:') + pprint.pprint(cfg) + np.random.seed(cfg.RNG_SEED) + + # train set + # -- Note: Use validation set and disable the flipped to enable faster loading. + cfg.TRAIN.USE_FLIPPED = False + imdb, roidb = combined_roidb(args.imdbval_name) + imdb.competition_mode(on=True) + + print('{:d} roidb entries'.format(len(roidb))) + + input_dir = args.load_dir + "/" + args.net + if not os.path.exists(input_dir): + raise Exception('There is no input directory for loading network') + load_name = os.path.join(input_dir, + 'faster_rcnn_{}_{}_{}.pth'.format(args.checksession, args.checkepoch, args.checkpoint)) + + fasterRCNN = _fasterRCNN(args.net, imdb.classes) + checkpoint = torch.load(load_name) + fasterRCNN.load_state_dict(checkpoint['model']) + print('load model successfully!') + + # pdb.set_trace() + + print("load checkpoint %s" % (load_name)) + + # initilize the tensor holder here. 
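+  # The four placeholder tensors below are created once, moved to the GPU when
+  # --ngpu > 0, and wrapped as volatile Variables, the pre-0.4 PyTorch idiom
+  # for running inference without building an autograd graph.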
+ im_data = torch.FloatTensor(1) + im_info = torch.FloatTensor(1) + num_boxes = torch.LongTensor(1) + gt_boxes = torch.FloatTensor(1) + + # ship to cuda + if args.ngpu > 0: + im_data = im_data.cuda() + im_info = im_info.cuda() + num_boxes = num_boxes.cuda() + gt_boxes = gt_boxes.cuda() + + # make variable + im_data = Variable(im_data, volatile=True) + im_info = Variable(im_info, volatile=True) + num_boxes = Variable(num_boxes, volatile=True) + gt_boxes = Variable(gt_boxes, volatile=True) + + if args.ngpu > 0: + cfg.CUDA = True + + if args.ngpu > 0: + fasterRCNN.cuda() + + fasterRCNN.eval() + + start = time.time() + max_per_image = 100 + thresh = 0.05 + vis = False + + imglist = os.listdir(args.demo_root_folder) + num_images = len(imglist) + + print('Loaded Photo: {} images.'.format(num_images)) + + for i in range(num_images): + + # Load the demo image + im_file = os.path.join(cfg.DATA_DIR, 'images', imglist[i]) + im = cv2.imread(im_file) + + # Detect all object classes and regress object bounds + scores, boxes = im_detect(net, im) + + blobs['im_info'] = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32) + + det_tic = time.time() + rois, cls_prob, bbox_pred, rpn_loss, rcnn_loss = fasterRCNN(im_data, im_info, gt_boxes, num_boxes) + scores = cls_prob.data + boxes = rois[:, :, 1:5] / data[1][0][2] + + if cfg.TEST.BBOX_REG: + # Apply bounding-box regression deltas + box_deltas = bbox_pred.data + if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: + # Optionally normalize targets by a precomputed mean and stdev + box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \ + + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda() + box_deltas = box_deltas.view(1, -1, 84) + pred_boxes = bbox_transform_inv(boxes, box_deltas, 1) + pred_boxes = clip_boxes(pred_boxes, im_info.data, 1) + else: + # Simply repeat the boxes, once for each class + pred_boxes = np.tile(boxes, (1, scores.shape[1])) + + scores = scores.squeeze().cpu().numpy() + pred_boxes = pred_boxes.squeeze().cpu().numpy() + # _t['im_detect'].tic() + det_toc = time.time() + detect_time = det_toc - det_tic + + misc_tic = time.time() + + if vis: + im = cv2.imread(imdb.image_path_at(i)) + im2show = np.copy(im) + + for j in xrange(1, imdb.num_classes): + inds = np.where(scores[:, j] > thresh)[0] + cls_scores = scores[inds, j] + cls_boxes = pred_boxes[inds, j * 4:(j + 1) * 4] + cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \ + .astype(np.float32, copy=False) + keep = nms(cls_dets, cfg.TEST.NMS) + cls_dets = cls_dets[keep, :] + if vis: + im2show = vis_detections(im2show, imdb.classes[j], cls_dets) + all_boxes[j][i] = cls_dets + + # Limit to max_per_image detections *over all classes* + if max_per_image > 0: + image_scores = np.hstack([all_boxes[j][i][:, -1] + for j in xrange(1, imdb.num_classes)]) + if len(image_scores) > max_per_image: + image_thresh = np.sort(image_scores)[-max_per_image] + for j in xrange(1, imdb.num_classes): + keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0] + all_boxes[j][i] = all_boxes[j][i][keep, :] + + misc_toc = time.time() + nms_time = misc_toc - misc_tic + + sys.stdout.write('im_detect: {:d}/{:d} {:.3f}s {:.3f}s \r' \ + .format(i + 1, num_images, detect_time, nms_time)) + sys.stdout.flush() + + if vis: + cv2.imshow('test', im2show) + cv2.waitKey(0) diff --git a/test_net.py b/test_net.py index 3425f1daa..be4947d7f 100644 --- a/test_net.py +++ b/test_net.py @@ -56,7 +56,7 @@ def parse_args(): help='set config keys', default=None, 
nargs=argparse.REMAINDER) parser.add_argument('--load_dir', dest='load_dir', - help='directory to load models', default="/srv/share/models", + help='directory to load models', default="models", nargs=argparse.REMAINDER) parser.add_argument('--ngpu', dest='ngpu', help='number of gpu', @@ -157,10 +157,13 @@ def parse_args(): output_dir = get_output_dir(imdb, save_name) + # dataset = roibatchLoader(roidb, imdb.num_classes, training=False, + # normalize = transforms.Normalize( + # mean=[0.485, 0.456, 0.406], + # std=[0.229, 0.224, 0.225])) + dataset = roibatchLoader(roidb, imdb.num_classes, training=False, - normalize = transforms.Normalize( - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225])) + normalize = False) dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0, diff --git a/trainval_net.py b/trainval_net.py index 58cf17acd..93b764b45 100644 --- a/trainval_net.py +++ b/trainval_net.py @@ -31,7 +31,6 @@ adjust_learning_rate, save_checkpoint from model.faster_rcnn.faster_rcnn import _fasterRCNN - import pdb def parse_args(): @@ -65,7 +64,7 @@ def parse_args(): default=10000, type=int) parser.add_argument('--save_dir', dest='save_dir', - help='directory to save models', default="/srv/share/models", + help='directory to save models', default="models", nargs=argparse.REMAINDER) parser.add_argument('--ngpu', dest='ngpu', help='number of gpu', @@ -78,7 +77,7 @@ def parse_args(): default="sgd", type=str) parser.add_argument('--lr_decay_step', dest='lr_decay_step', help='step to do learning rate decay, unit is epoch', - default=4, type=int) + default=5, type=int) parser.add_argument('--lr_decay_gamma', dest='lr_decay_gamma', help='learning rate decay ratio', default=0.1, type=float) @@ -86,7 +85,7 @@ def parse_args(): # set training session parser.add_argument('--s', dest='session', help='training session', - default=4, type=int) + default=1, type=int) # resume trained model parser.add_argument('--r', dest='resume', @@ -101,6 +100,10 @@ def parse_args(): parser.add_argument('--checkpoint', dest='checkpoint', help='checkpoint to load model', default=0, type=int) +# log and diaplay + parser.add_argument('--use_tfboard', dest='use_tfboard', + help='whether use tensorflow tensorboard', + default=False, type=bool) # if len(sys.argv) == 1: # parser.print_help() @@ -122,6 +125,11 @@ def parse_args(): print('Called with args:') print(args) + if args.use_tfboard: + from model.utils.logger import Logger + # Set the logger + logger = Logger('./logs') + if args.dataset == "pascal_voc": args.imdb_name = "voc_2007_trainval" args.imdbval_name = "voc_2007_test" @@ -159,9 +167,7 @@ def parse_args(): os.makedirs(output_dir) dataset = roibatchLoader(roidb, imdb.num_classes, training=False, - normalize = transforms.Normalize( - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225])) + normalize = False) dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=0) @@ -202,8 +208,7 @@ def parse_args(): if args.optimizer == "adam": lr = lr * 0.1 optimizer = torch.optim.Adam([ - {'params': fasterRCNN.RCNN_base.RCNN_base_model[0].parameters(), 'lr': lr * 0.0}, - {'params': fasterRCNN.RCNN_base.RCNN_base_model[1].parameters(), 'lr': lr * 0.1}, + {'params': fasterRCNN.RCNN_base.RCNN_base_model[1].parameters(), 'lr': lr}, {'params': fasterRCNN.RCNN_base.RCNN_base_model[2].parameters()}, {'params': fasterRCNN.RCNN_base.RCNN_rpn.parameters()}, {'params': fasterRCNN.RCNN_fc6.parameters()}, @@ -214,8 +219,7 @@ def parse_args(): elif args.optimizer 
== "sgd": optimizer = torch.optim.SGD([ - {'params': fasterRCNN.RCNN_base.RCNN_base_model[0].parameters(), 'lr': lr * 0.0}, - {'params': fasterRCNN.RCNN_base.RCNN_base_model[1].parameters(), 'lr': lr * 0.1}, + {'params': fasterRCNN.RCNN_base.RCNN_base_model[1].parameters(), 'lr': lr}, {'params': fasterRCNN.RCNN_base.RCNN_base_model[2].parameters()}, {'params': fasterRCNN.RCNN_base.RCNN_rpn.parameters()}, {'params': fasterRCNN.RCNN_fc6.parameters(), 'lr': lr}, @@ -271,6 +275,13 @@ def parse_args(): % (args.session, epoch, step, loss_temp / args.disp_interval, lr * 0.1, lr)) print("\t\t\tfg/bg=(%d/%d)" % (0, 0)) print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f" % (0, 0, 0, 0)) + if args.use_tfboard: + info = { + 'loss': loss_temp / args.disp_interval + } + for tag, value in info.items(): + logger.scalar_summary(tag, value, step) + else: print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr4ft: %.2e, lr4tr: %.2e" \ % (args.session, epoch, step, loss_temp / args.disp_interval, lr * 0.1, lr)) @@ -280,8 +291,19 @@ def parse_args(): fasterRCNN.RCNN_base.RCNN_rpn.rpn_loss_box.data[0], \ fasterRCNN.RCNN_loss_cls.data[0], \ fasterRCNN.RCNN_loss_bbox.data[0])) + if args.use_tfboard: + info = { + 'loss': loss_temp / args.disp_interval, + 'loss_rpn_cls': fasterRCNN.RCNN_base.RCNN_rpn.rpn_loss_cls.data[0], + 'loss_rpn_box': fasterRCNN.RCNN_base.RCNN_rpn.rpn_loss_box.data[0], + 'loss_rcnn_cls': fasterRCNN.RCNN_loss_cls.data[0], + 'loss_rcnn_box': fasterRCNN.RCNN_loss_bbox.data[0] + } + for tag, value in info.items(): + logger.scalar_summary(tag, value, step) loss_temp = 0 + if (step % args.checkpoint_interval == 0) and step > 0: # pdb.set_trace() save_name = os.path.join(output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step)) @@ -293,6 +315,7 @@ def parse_args(): }, save_name) print('save model: {}'.format(save_name)) + if epoch % args.lr_decay_step == 0: adjust_learning_rate(optimizer, args.lr_decay_gamma) lr *= args.lr_decay_gamma From b1851595603221a3c557474e87471c7a4c76b9db Mon Sep 17 00:00:00 2001 From: jwyang Date: Fri, 25 Aug 2017 18:23:07 -0400 Subject: [PATCH 02/13] add all file --- lib/model/faster_rcnn/faster_rcnn.py | 4 +- lib/model/utils/blob.py | 3 +- lib/model/utils/logger.py | 72 ++++++ lib/model/utils/mobilenet_v1.py | 309 ++++++++++++++++++++++++++ lib/model/utils/network.py | 27 +-- lib/model/utils/resnet_v1.py | 315 +++++++++++++++++++++++++++ lib/model/utils/vgg16.py | 57 +++++ 7 files changed, 765 insertions(+), 22 deletions(-) create mode 100644 lib/model/utils/logger.py create mode 100644 lib/model/utils/mobilenet_v1.py create mode 100644 lib/model/utils/resnet_v1.py create mode 100644 lib/model/utils/vgg16.py diff --git a/lib/model/faster_rcnn/faster_rcnn.py b/lib/model/faster_rcnn/faster_rcnn.py index 5f704d4cf..2c7e2b9da 100644 --- a/lib/model/faster_rcnn/faster_rcnn.py +++ b/lib/model/faster_rcnn/faster_rcnn.py @@ -164,7 +164,9 @@ def forward(self, im_data, im_info, gt_boxes, num_boxes): ce_weights = rois_label.data.new(cls_score.size(1)).fill_(1) ce_weights[0] = float(self.fg_cnt) / self.bg_cnt - self.RCNN_loss_cls = F.cross_entropy(cls_score, label, weight=ce_weights) + # self.RCNN_loss_cls = F.cross_entropy(cls_score, label, weight=ce_weights) + + self.RCNN_loss_cls = F.cross_entropy(cls_score, label) # bounding box regression L1 loss # rois_target = torch.mul(rois_target, rois_inside_ws) diff --git a/lib/model/utils/blob.py b/lib/model/utils/blob.py index 69f7e23d3..8b388586b 100644 --- a/lib/model/utils/blob.py +++ 
b/lib/model/utils/blob.py @@ -28,7 +28,8 @@ def im_list_to_blob(ims): def prep_im_for_blob(im, pixel_means, target_size, max_size): """Mean subtract and scale an image for use in a blob.""" im = im.astype(np.float32, copy=False) - # im -= pixel_means + im -= pixel_means + # im = im[:, :, ::-1] im_shape = im.shape im_size_min = np.min(im_shape[0:2]) im_size_max = np.max(im_shape[0:2]) diff --git a/lib/model/utils/logger.py b/lib/model/utils/logger.py new file mode 100644 index 000000000..d7610b3c1 --- /dev/null +++ b/lib/model/utils/logger.py @@ -0,0 +1,72 @@ +# Code referenced from https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514 +import tensorflow as tf +import numpy as np +import scipy.misc +try: + from StringIO import StringIO # Python 2.7 +except ImportError: + from io import BytesIO # Python 3.x + + +class Logger(object): + + def __init__(self, log_dir): + """Create a summary writer logging to log_dir.""" + self.writer = tf.summary.FileWriter(log_dir) + + def scalar_summary(self, tag, value, step): + """Log a scalar variable.""" + summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) + self.writer.add_summary(summary, step) + + def image_summary(self, tag, images, step): + """Log a list of images.""" + + img_summaries = [] + for i, img in enumerate(images): + # Write the image to a string + try: + s = StringIO() + except: + s = BytesIO() + scipy.misc.toimage(img).save(s, format="png") + + # Create an Image object + img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(), + height=img.shape[0], + width=img.shape[1]) + # Create a Summary value + img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum)) + + # Create and write Summary + summary = tf.Summary(value=img_summaries) + self.writer.add_summary(summary, step) + + def histo_summary(self, tag, values, step, bins=1000): + """Log a histogram of the tensor of values.""" + + # Create a histogram using numpy + counts, bin_edges = np.histogram(values, bins=bins) + + # Fill the fields of the histogram proto + hist = tf.HistogramProto() + hist.min = float(np.min(values)) + hist.max = float(np.max(values)) + hist.num = int(np.prod(values.shape)) + hist.sum = float(np.sum(values)) + hist.sum_squares = float(np.sum(values**2)) + + # Drop the start of the first bin + bin_edges = bin_edges[1:] + + # Add bin edges and counts + for edge in bin_edges: + hist.bucket_limit.append(edge) + for c in counts: + hist.bucket.append(c) + + # Create and write Summary + summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)]) + self.writer.add_summary(summary, step) + self.writer.flush() + diff --git a/lib/model/utils/mobilenet_v1.py b/lib/model/utils/mobilenet_v1.py new file mode 100644 index 000000000..cdd77f1f0 --- /dev/null +++ b/lib/model/utils/mobilenet_v1.py @@ -0,0 +1,309 @@ +# -------------------------------------------------------- +# Tensorflow Faster R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Xinlei Chen +# -------------------------------------------------------- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorflow.contrib.slim as slim +from tensorflow.contrib.slim import losses +from tensorflow.contrib.slim import arg_scope +from tensorflow.contrib.slim.python.slim.nets import resnet_utils +import numpy as np +from collections import namedtuple + +from nets.network import Network +from model.config import cfg + +def 
separable_conv2d_same(inputs, kernel_size, stride, rate=1, scope=None): + """Strided 2-D separable convolution with 'SAME' padding. + Args: + inputs: A 4-D tensor of size [batch, height_in, width_in, channels]. + kernel_size: An int with the kernel_size of the filters. + stride: An integer, the output stride. + rate: An integer, rate for atrous convolution. + scope: Scope. + Returns: + output: A 4-D tensor of size [batch, height_out, width_out, channels] with + the convolution output. + """ + + # By passing filters=None + # separable_conv2d produces only a depth-wise convolution layer + if stride == 1: + return slim.separable_conv2d(inputs, None, kernel_size, + depth_multiplier=1, stride=1, rate=rate, + padding='SAME', scope=scope) + else: + kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1) + pad_total = kernel_size_effective - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + inputs = tf.pad(inputs, + [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) + return slim.separable_conv2d(inputs, None, kernel_size, + depth_multiplier=1, stride=stride, rate=rate, + padding='VALID', scope=scope) + +# The following is adapted from: +# https://github.com/tensorflow/models/blob/master/slim/nets/mobilenet_v1.py + +# Conv and DepthSepConv named tuple define layers of the MobileNet architecture +# Conv defines 3x3 convolution layers +# DepthSepConv defines 3x3 depthwise convolution followed by 1x1 convolution. +# stride is the stride of the convolution +# depth is the number of channels or filters in a layer +Conv = namedtuple('Conv', ['kernel', 'stride', 'depth']) +DepthSepConv = namedtuple('DepthSepConv', ['kernel', 'stride', 'depth']) + +# _CONV_DEFS specifies the MobileNet body +_CONV_DEFS = [ + Conv(kernel=3, stride=2, depth=32), + DepthSepConv(kernel=3, stride=1, depth=64), + DepthSepConv(kernel=3, stride=2, depth=128), + DepthSepConv(kernel=3, stride=1, depth=128), + DepthSepConv(kernel=3, stride=2, depth=256), + DepthSepConv(kernel=3, stride=1, depth=256), + DepthSepConv(kernel=3, stride=2, depth=512), + DepthSepConv(kernel=3, stride=1, depth=512), + DepthSepConv(kernel=3, stride=1, depth=512), + DepthSepConv(kernel=3, stride=1, depth=512), + DepthSepConv(kernel=3, stride=1, depth=512), + DepthSepConv(kernel=3, stride=1, depth=512), + # use stride 1 for the 13th layer + DepthSepConv(kernel=3, stride=1, depth=1024), + DepthSepConv(kernel=3, stride=1, depth=1024) +] + +# Modified mobilenet_v1 +def mobilenet_v1_base(inputs, + conv_defs, + starting_layer=0, + min_depth=8, + depth_multiplier=1.0, + output_stride=None, + reuse=False, + scope=None): + """Mobilenet v1. + Constructs a Mobilenet v1 network from inputs to the given final endpoint. + Args: + inputs: a tensor of shape [batch_size, height, width, channels]. + starting_layer: specifies the current starting layer. For region proposal + network it is 0, for region classification it is 12 by default. + min_depth: Minimum depth value (number of channels) for all convolution ops. + Enforced when depth_multiplier < 1, and not an active constraint when + depth_multiplier >= 1. + depth_multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + conv_defs: A list of ConvDef named tuples specifying the net architecture. + output_stride: An integer that specifies the requested ratio of input to + output spatial resolution. 
If not None, then we invoke atrous convolution + if necessary to prevent the network from reducing the spatial resolution + of the activation maps. + scope: Optional variable_scope. + Returns: + tensor_out: output tensor corresponding to the final_endpoint. + Raises: + ValueError: if depth_multiplier <= 0, or convolution type is not defined. + """ + depth = lambda d: max(int(d * depth_multiplier), min_depth) + end_points = {} + + # Used to find thinned depths for each layer. + if depth_multiplier <= 0: + raise ValueError('depth_multiplier is not greater than zero.') + + with tf.variable_scope(scope, 'MobilenetV1', [inputs], reuse=reuse): + # The current_stride variable keeps track of the output stride of the + # activations, i.e., the running product of convolution strides up to the + # current network layer. This allows us to invoke atrous convolution + # whenever applying the next convolution would result in the activations + # having output stride larger than the target output_stride. + current_stride = 1 + + # The atrous convolution rate parameter. + rate = 1 + + net = inputs + for i, conv_def in enumerate(conv_defs): + end_point_base = 'Conv2d_%d' % (i + starting_layer) + + if output_stride is not None and current_stride == output_stride: + # If we have reached the target output_stride, then we need to employ + # atrous convolution with stride=1 and multiply the atrous rate by the + # current unit's stride for use in subsequent layers. + layer_stride = 1 + layer_rate = rate + rate *= conv_def.stride + else: + layer_stride = conv_def.stride + layer_rate = 1 + current_stride *= conv_def.stride + + if isinstance(conv_def, Conv): + end_point = end_point_base + net = resnet_utils.conv2d_same(net, depth(conv_def.depth), conv_def.kernel, + stride=conv_def.stride, + scope=end_point) + + elif isinstance(conv_def, DepthSepConv): + end_point = end_point_base + '_depthwise' + + net = separable_conv2d_same(net, conv_def.kernel, + stride=layer_stride, + rate=layer_rate, + scope=end_point) + + end_point = end_point_base + '_pointwise' + + net = slim.conv2d(net, depth(conv_def.depth), [1, 1], + stride=1, + scope=end_point) + + else: + raise ValueError('Unknown convolution type %s for layer %d' + % (conv_def.ltype, i)) + + return net + +# Modified arg_scope to incorporate configs +def mobilenet_v1_arg_scope(is_training=True, + weight_decay=cfg.MOBILENET.WEIGHT_DECAY, + stddev=0.09, + regularize_depthwise=cfg.MOBILENET.REGU_DEPTH): + batch_norm_params = { + 'is_training': False, + 'center': True, + 'scale': True, + 'decay': 0.9997, + 'epsilon': 0.001, + 'trainable': False, + } + + # Set weight_decay for weights in Conv and DepthSepConv layers. 
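+  # L2 weight decay is applied to the regular and pointwise convolutions, but
+  # to the depthwise filters only when cfg.MOBILENET.REGU_DEPTH is set, since
+  # depthwise layers carry very few parameters and are usually left
+  # unregularized.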
+ weights_init = tf.truncated_normal_initializer(stddev=stddev) + regularizer = tf.contrib.layers.l2_regularizer(weight_decay) + if regularize_depthwise: + depthwise_regularizer = regularizer + else: + depthwise_regularizer = None + + with slim.arg_scope([slim.conv2d, slim.separable_conv2d], + trainable=is_training, + weights_initializer=weights_init, + activation_fn=tf.nn.relu6, + normalizer_fn=slim.batch_norm, + padding='SAME'): + with slim.arg_scope([slim.batch_norm], **batch_norm_params): + with slim.arg_scope([slim.conv2d], weights_regularizer=regularizer): + with slim.arg_scope([slim.separable_conv2d], + weights_regularizer=depthwise_regularizer) as sc: + return sc + +class mobilenetv1(Network): + def __init__(self, batch_size=1): + Network.__init__(self, batch_size=batch_size) + self._depth_multiplier = cfg.MOBILENET.DEPTH_MULTIPLIER + self._scope = 'MobilenetV1' + + def _image_to_head(self, is_training, reuse=False): + # Base bottleneck + assert (0 <= cfg.MOBILENET.FIXED_LAYERS <= 12) + net_conv = self._image + if cfg.MOBILENET.FIXED_LAYERS > 0: + with slim.arg_scope(mobilenet_v1_arg_scope(is_training=False)): + net_conv = mobilenet_v1_base(net_conv, + _CONV_DEFS[:cfg.MOBILENET.FIXED_LAYERS], + starting_layer=0, + depth_multiplier=self._depth_multiplier, + reuse=reuse, + scope=self._scope) + if cfg.MOBILENET.FIXED_LAYERS < 12: + with slim.arg_scope(mobilenet_v1_arg_scope(is_training=is_training)): + net_conv = mobilenet_v1_base(net_conv, + _CONV_DEFS[cfg.MOBILENET.FIXED_LAYERS:12], + starting_layer=cfg.MOBILENET.FIXED_LAYERS, + depth_multiplier=self._depth_multiplier, + reuse=reuse, + scope=self._scope) + + self._act_summaries.append(net_conv) + self._layers['head'] = net_conv + + return net_conv + + def _head_to_tail(self, pool5, is_training, reuse=False): + with slim.arg_scope(mobilenet_v1_arg_scope(is_training=is_training)): + fc7 = mobilenet_v1_base(pool5, + _CONV_DEFS[12:], + starting_layer=12, + depth_multiplier=self._depth_multiplier, + reuse=reuse, + scope=self._scope) + # average pooling done by reduce_mean + fc7 = tf.reduce_mean(fc7, axis=[1, 2]) + return fc7 + + def _build_network(self, is_training=True): + # select initializers + if cfg.TRAIN.TRUNCATED: + initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01) + initializer_bbox = tf.truncated_normal_initializer(mean=0.0, stddev=0.001) + else: + initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01) + initializer_bbox = tf.random_normal_initializer(mean=0.0, stddev=0.001) + + net_conv = self._image_to_head(is_training) + with tf.variable_scope(self._scope, self._scope): + # build the anchors for the image + self._anchor_component() + # region proposal network + rois = self._region_proposal(net_conv, is_training, initializer) + # region of interest pooling + if cfg.POOLING_MODE == 'crop': + pool5 = self._crop_pool_layer(net_conv, rois, "pool5") + else: + raise NotImplementedError + + fc7 = self._head_to_tail(pool5, is_training) + with tf.variable_scope(self._scope, self._scope): + # region classification + cls_prob, bbox_pred = self._region_classification(fc7, is_training, + initializer, initializer_bbox) + + self._score_summaries.update(self._predictions) + + return rois, cls_prob, bbox_pred + + def get_variables_to_restore(self, variables, var_keep_dic): + variables_to_restore = [] + + for v in variables: + # exclude the first conv layer to swap RGB to BGR + if v.name == (self._scope + '/Conv2d_0/weights:0'): + self._variables_to_fix[v.name] = v + continue + if v.name.split(':')[0] in 
var_keep_dic: + print('Variables restored: %s' % v.name) + variables_to_restore.append(v) + + return variables_to_restore + + def fix_variables(self, sess, pretrained_model): + print('Fix MobileNet V1 layers..') + with tf.variable_scope('Fix_MobileNet_V1') as scope: + with tf.device("/cpu:0"): + # fix RGB to BGR, and match the scale by (255.0 / 2.0) + Conv2d_0_rgb = tf.get_variable("Conv2d_0_rgb", + [3, 3, 3, max(int(32 * self._depth_multiplier), 8)], + trainable=False) + restorer_fc = tf.train.Saver({self._scope + "/Conv2d_0/weights": Conv2d_0_rgb}) + restorer_fc.restore(sess, pretrained_model) + + sess.run(tf.assign(self._variables_to_fix[self._scope + "/Conv2d_0/weights:0"], + tf.reverse(Conv2d_0_rgb / (255.0 / 2.0), [2]))) diff --git a/lib/model/utils/network.py b/lib/model/utils/network.py index 07f470077..1a6c97464 100644 --- a/lib/model/utils/network.py +++ b/lib/model/utils/network.py @@ -3,6 +3,8 @@ from torch.autograd import Variable import numpy as np import torchvision.models as models +from vgg16 import vgg16 +import cv2 import pdb def save_net(fname, net): @@ -56,28 +58,13 @@ def vis_detections(im, class_name, dets, thresh=0.8): 1.0, (0, 0, 255), thickness=1) return im -def slice_vgg16(model): - - slices = [] - # we fix conv1_1, conv1_2, conv2_1, conv2_2 - slices.append(nn.Sequential(*list(model.features.children())[:10])) - # we finetune conv3_1, conv3_2, conv3_3 - slices.append(nn.Sequential(*list(model.features.children())[10:17])) - # we retrain conv4_1, conv4_2, conv4_3, conv5_1, conv5_2, conv5_3 - slices.append(nn.Sequential(*list(model.features.children())[17:-1])) - - # we copy fc6 - slices.append(model.classifier[0]) - - # we copy fc7 - slices.append(model.classifier[3]) - - return slices def load_baseModel(model_name): if model_name == "vgg16": - pretrained_model = models.vgg16(pretrained=True) - return slice_vgg16(pretrained_model) + net = vgg16() + model_path = 'data/pretrained_model/{}_caffe.pth'.format(model_name) + net.load_pretrained_cnn(torch.load(model_path)) + return net.slice() elif model_name == "resnet50": return None @@ -106,4 +93,4 @@ def _smooth_l1_loss(bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_w for i in sorted(dim, reverse=True): loss_box = loss_box.sum(i) loss_box = loss_box.mean() - return loss_box \ No newline at end of file + return loss_box diff --git a/lib/model/utils/resnet_v1.py b/lib/model/utils/resnet_v1.py new file mode 100644 index 000000000..770261898 --- /dev/null +++ b/lib/model/utils/resnet_v1.py @@ -0,0 +1,315 @@ +# -------------------------------------------------------- +# Tensorflow Faster R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Zheqi He and Xinlei Chen +# -------------------------------------------------------- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from nets.network import Network +from model.config import cfg + +import utils.timer + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +import math +import torch.utils.model_zoo as model_zoo + + +__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', + 'resnet152'] + + +model_urls = { + 'resnet18': 'https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth', + 'resnet34': 'https://s3.amazonaws.com/pytorch/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://s3.amazonaws.com/pytorch/models/resnet50-19c8e357.pth', + 'resnet101': 
'https://s3.amazonaws.com/pytorch/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://s3.amazonaws.com/pytorch/models/resnet152-b121ed2d.pth', +} + + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False) # change + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, # change + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + def __init__(self, block, layers, num_classes=1000): + self.inplanes = 64 + super(ResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True) # change + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.avgpool = nn.AvgPool2d(7) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + + return x + + +def resnet18(pretrained=False): + """Constructs a ResNet-18 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(BasicBlock, [2, 2, 2, 2]) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['resnet18'])) + return model + + +def resnet34(pretrained=False): + """Constructs a ResNet-34 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(BasicBlock, [3, 4, 6, 3]) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['resnet34'])) + return model + + +def resnet50(pretrained=False): + """Constructs a ResNet-50 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 6, 3]) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['resnet50'])) + return model + + +def resnet101(pretrained=False): + """Constructs a ResNet-101 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 23, 3]) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['resnet101'])) + return model + + +def resnet152(pretrained=False): + """Constructs a ResNet-152 model. 
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 8, 36, 3]) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['resnet152'])) + return model + +class resnetv1(Network): + def __init__(self, batch_size=1, num_layers=50): + Network.__init__(self, batch_size=batch_size) + self._num_layers = num_layers + + def _crop_pool_layer(self, bottom, rois): + return Network._crop_pool_layer(self, bottom, rois, cfg.RESNET.MAX_POOL) + + def _image_to_head(self): + net_conv = self._layers['head'](self._image) + self._act_summaries['conv']['value'] = net_conv + + return net_conv + + def _head_to_tail(self, pool5): + fc7 = self.resnet.layer4(pool5).mean(3).mean(2) # average pooling after layer4 + return fc7 + + def _init_modules(self): + # choose different blocks for different number of layers + if self._num_layers == 50: + self.resnet = resnet50() + + elif self._num_layers == 101: + self.resnet = resnet101() + + elif self._num_layers == 152: + self.resnet = resnet152() + + else: + # other numbers are not supported + raise NotImplementedError + + # Fix blocks + for p in self.resnet.bn1.parameters(): p.requires_grad=False + for p in self.resnet.conv1.parameters(): p.requires_grad=False + assert (0 <= cfg.RESNET.FIXED_BLOCKS < 4) + if cfg.RESNET.FIXED_BLOCKS >= 3: + for p in self.resnet.layer3.parameters(): p.requires_grad=False + if cfg.RESNET.FIXED_BLOCKS >= 2: + for p in self.resnet.layer2.parameters(): p.requires_grad=False + if cfg.RESNET.FIXED_BLOCKS >= 1: + for p in self.resnet.layer1.parameters(): p.requires_grad=False + + def set_bn_fix(m): + classname = m.__class__.__name__ + if classname.find('BatchNorm') != -1: + for p in m.parameters(): p.requires_grad=False + + self.resnet.apply(set_bn_fix) + + # Build resnet. 
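+    # The head covers conv1 through layer3 (a stride-16, 1024-channel feature
+    # map for ResNet-50/101/152) and feeds the 3x3 RPN conv below; layer4 is
+    # applied per RoI in _head_to_tail, which is why the classification and
+    # bbox heads below take 2048-d inputs.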
+ self._layers['head'] = nn.Sequential(self.resnet.conv1, self.resnet.bn1,self.resnet.relu, + self.resnet.maxpool,self.resnet.layer1,self.resnet.layer2,self.resnet.layer3) + + # rpn + self.rpn_net = nn.Conv2d(1024, 512, [3, 3], padding=1) + + self.rpn_cls_score_net = nn.Conv2d(512, self._num_anchors * 2, [1, 1]) + + self.rpn_bbox_pred_net = nn.Conv2d(512, self._num_anchors * 4, [1, 1]) + + self.cls_score_net = nn.Linear(2048, self._num_classes) + self.bbox_pred_net = nn.Linear(2048, self._num_classes * 4) + + self.init_weights() + + def train(self, mode=True): + # Override train so that the training mode is set as we want + nn.Module.train(self, mode) + if mode: + # Set fixed blocks to be in eval mode + self.resnet.eval() + self.resnet.layer1.train() + if cfg.RESNET.FIXED_BLOCKS >= 1: + self.resnet.layer2.train() + if cfg.RESNET.FIXED_BLOCKS >= 2: + self.resnet.layer3.train() + if cfg.RESNET.FIXED_BLOCKS >= 3: + self.resnet.layer4.train() + + def set_bn_eval(m): + classname = m.__class__.__name__ + if classname.find('BatchNorm') != -1: + m.eval() + + self.resnet.apply(set_bn_eval) + + def load_pretrained_cnn(self, state_dict): + self.resnet.load_state_dict(state_dict) diff --git a/lib/model/utils/vgg16.py b/lib/model/utils/vgg16.py new file mode 100644 index 000000000..0fc6ad295 --- /dev/null +++ b/lib/model/utils/vgg16.py @@ -0,0 +1,57 @@ +# -------------------------------------------------------- +# Tensorflow Faster R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Xinlei Chen +# -------------------------------------------------------- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +import math +import torchvision.models as models + +class vgg16(): + def __init__(self): + self.vgg = models.vgg16() + # Remove fc8 + # self.vgg.classifier = nn.Sequential(*list(self.vgg.classifier._modules.values())[:-1]) + + # Fix the layers before conv3: + for layer in range(10): + for p in self.vgg.features[layer].parameters(): p.requires_grad = False + + # def _image_to_head(self): + # net_conv = self._layers['head'](self._image) + # self._act_summaries['conv']['value'] = net_conv + + # return net_conv + + # def _head_to_tail(self, pool5): + # pool5_flat = pool5.view(pool5.size(0), -1) + # fc7 = self.vgg.classifier(pool5_flat) + + # return fc7 + def slice(self): + + self.slices = [] + # we fix conv1_1, conv1_2, conv2_1, conv2_2 + self.slices.append(nn.Sequential(*list(self.vgg.features.children())[:10])) + # we finetune conv3_1, conv3_2, conv3_3 + self.slices.append(nn.Sequential(*list(self.vgg.features.children())[10:17])) + # we retrain conv4_1, conv4_2, conv4_3, conv5_1, conv5_2, conv5_3 + self.slices.append(nn.Sequential(*list(self.vgg.features.children())[17:-1])) + + # we copy fc6 + self.slices.append(self.vgg.classifier[0]) + + # we copy fc7 + self.slices.append(self.vgg.classifier[3]) + + return self.slices + + def load_pretrained_cnn(self, state_dict): + self.vgg.load_state_dict({k:v for k,v in state_dict.items() if k in self.vgg.state_dict()}) \ No newline at end of file From 71a28b1025dfc9ba33775d9013e4ea4bfb70153b Mon Sep 17 00:00:00 2001 From: jwyang Date: Sat, 26 Aug 2017 23:27:46 -0400 Subject: [PATCH 03/13] initial push --- demo.py | 96 +++--- demo_cascade.py | 279 ++++++++++++++++ lib/model/faster_rcnn/faster_rcnn.py | 2 +- lib/model/faster_rcnn/faster_rcnn_cascade.py | 202 ++++++++++++ 
lib/model/rpn/proposal_target_layer_4.py | 203 ++++++++++++ lib/roi_data_layer/roibatchLoader.py | 58 +++- lib/roi_data_layer/roibatchLoader_aug.py | 162 ++++++++++ test_net_cascade.py | 256 +++++++++++++++ trainval_net.py | 4 +- trainval_net_cascade.py | 324 +++++++++++++++++++ 10 files changed, 1528 insertions(+), 58 deletions(-) create mode 100644 demo_cascade.py create mode 100644 lib/model/faster_rcnn/faster_rcnn_cascade.py create mode 100644 lib/model/rpn/proposal_target_layer_4.py create mode 100644 lib/roi_data_layer/roibatchLoader_aug.py create mode 100644 test_net_cascade.py create mode 100644 trainval_net_cascade.py diff --git a/demo.py b/demo.py index f05920844..7c65f4e26 100644 --- a/demo.py +++ b/demo.py @@ -24,6 +24,7 @@ import torchvision.transforms as transforms import torchvision.datasets as dset +from PIL import Image from roi_data_layer.roidb import combined_roidb from roi_data_layer.roibatchLoader import roibatchLoader @@ -34,6 +35,7 @@ from model.fast_rcnn.nms_wrapper import nms from model.rpn.bbox_transform import bbox_transform_inv from model.utils.network import save_net, load_net, vis_detections +from model.utils.blob import im_list_to_blob import pdb def parse_args(): @@ -59,6 +61,9 @@ def parse_args(): parser.add_argument('--load_dir', dest='load_dir', help='directory to load models', default="models", nargs=argparse.REMAINDER) + parser.add_argument('--image_dir', dest='image_dir', + help='directory to load images', default="data/images", + type=str) parser.add_argument('--ngpu', dest='ngpu', help='number of gpu', default=1, type=int) @@ -113,29 +118,6 @@ def _get_image_blob(im): return blob, np.array(im_scale_factors) -def im_detect(net, im): - blobs, im_scales = _get_blobs(im) - assert len(im_scales) == 1, "Only single-image batch implemented" - - im_blob = blobs['data'] - blobs['im_info'] = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32) - - _, scores, bbox_pred, rois = net.test_image(blobs['data'], blobs['im_info']) - - boxes = rois[:, 1:5] / im_scales[0] - scores = np.reshape(scores, [scores.shape[0], -1]) - bbox_pred = np.reshape(bbox_pred, [bbox_pred.shape[0], -1]) - if cfg.TEST.BBOX_REG: - # Apply bounding-box regression deltas - box_deltas = bbox_pred - pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy() - pred_boxes = _clip_boxes(pred_boxes, im.shape) - else: - # Simply repeat the boxes, once for each class - pred_boxes = np.tile(boxes, (1, scores.shape[1])) - - return scores, pred_boxes - if __name__ == '__main__': args = parse_args() @@ -154,11 +136,6 @@ def im_detect(net, im): # train set # -- Note: Use validation set and disable the flipped to enable faster loading. 
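+  # The demo no longer builds an imdb/roidb for the test split; it now reads
+  # raw images from --image_dir and uses the hard-coded PASCAL VOC class list
+  # defined below.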
- cfg.TRAIN.USE_FLIPPED = False - imdb, roidb = combined_roidb(args.imdbval_name) - imdb.competition_mode(on=True) - - print('{:d} roidb entries'.format(len(roidb))) input_dir = args.load_dir + "/" + args.net if not os.path.exists(input_dir): @@ -166,7 +143,15 @@ def im_detect(net, im): load_name = os.path.join(input_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.checksession, args.checkepoch, args.checkpoint)) - fasterRCNN = _fasterRCNN(args.net, imdb.classes) + + classes = np.asarray(['__background__', + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor']) + + fasterRCNN = _fasterRCNN(args.net, classes) checkpoint = torch.load(load_name) fasterRCNN.load_state_dict(checkpoint['model']) print('load model successfully!') @@ -205,28 +190,47 @@ def im_detect(net, im): start = time.time() max_per_image = 100 thresh = 0.05 - vis = False + vis = True - imglist = os.listdir(args.demo_root_folder) + imglist = os.listdir(args.image_dir) num_images = len(imglist) print('Loaded Photo: {} images.'.format(num_images)) + for i in range(num_images): # Load the demo image - im_file = os.path.join(cfg.DATA_DIR, 'images', imglist[i]) - im = cv2.imread(im_file) + im_file = os.path.join(args.image_dir, imglist[i]) + # im = cv2.imread(im_file) + im = np.array(Image.open(im_file)) + if len(im.shape) == 2: + im = im[:,:,np.newaxis] + im = np.concatenate((im,im,im), axis=2) - # Detect all object classes and regress object bounds - scores, boxes = im_detect(net, im) + blobs, im_scales = _get_image_blob(im) + assert len(im_scales) == 1, "Only single-image batch implemented" + im_blob = blobs + im_info_np = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32) + + im_data_pt = torch.from_numpy(im_blob) + im_data_pt = im_data_pt.permute(0, 3, 1, 2) + im_info_pt = torch.from_numpy(im_info_np) + + im_data.data.resize_(im_data_pt.size()).copy_(im_data_pt) + im_info.data.resize_(im_info_pt.size()).copy_(im_info_pt) + gt_boxes.data.resize_(1, 1, 5).zero_() + num_boxes.data.resize_(1).zero_() + + # pdb.set_trace() - blobs['im_info'] = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32) det_tic = time.time() - rois, cls_prob, bbox_pred, rpn_loss, rcnn_loss = fasterRCNN(im_data, im_info, gt_boxes, num_boxes) + rois, cls_prob, bbox_pred, rpn_loss, rcnn_loss = \ + fasterRCNN(im_data, im_info, gt_boxes, num_boxes) + scores = cls_prob.data - boxes = rois[:, :, 1:5] / data[1][0][2] + boxes = rois[:, :, 1:5] / im_scales[0] if cfg.TEST.BBOX_REG: # Apply bounding-box regression deltas @@ -251,10 +255,9 @@ def im_detect(net, im): misc_tic = time.time() if vis: - im = cv2.imread(imdb.image_path_at(i)) im2show = np.copy(im) - for j in xrange(1, imdb.num_classes): + for j in xrange(1, 21): inds = np.where(scores[:, j] > thresh)[0] cls_scores = scores[inds, j] cls_boxes = pred_boxes[inds, j * 4:(j + 1) * 4] @@ -263,18 +266,7 @@ def im_detect(net, im): keep = nms(cls_dets, cfg.TEST.NMS) cls_dets = cls_dets[keep, :] if vis: - im2show = vis_detections(im2show, imdb.classes[j], cls_dets) - all_boxes[j][i] = cls_dets - - # Limit to max_per_image detections *over all classes* - if max_per_image > 0: - image_scores = np.hstack([all_boxes[j][i][:, -1] - for j in xrange(1, imdb.num_classes)]) - if len(image_scores) > max_per_image: - image_thresh = np.sort(image_scores)[-max_per_image] - for j in xrange(1, imdb.num_classes): - keep = 
np.where(all_boxes[j][i][:, -1] >= image_thresh)[0] - all_boxes[j][i] = all_boxes[j][i][keep, :] + im2show = vis_detections(im2show, classes[j], cls_dets) misc_toc = time.time() nms_time = misc_toc - misc_tic diff --git a/demo_cascade.py b/demo_cascade.py new file mode 100644 index 000000000..3d10540f1 --- /dev/null +++ b/demo_cascade.py @@ -0,0 +1,279 @@ +# -------------------------------------------------------- +# Tensorflow Faster R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Jiasen Lu, Jianwei Yang, based on code from Ross Girshick +# -------------------------------------------------------- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import _init_paths +import os +import sys +import numpy as np +import argparse +import pprint +import pdb +import time +import cv2 +import cPickle +import torch +from torch.autograd import Variable +import torch.nn as nn +import torch.optim as optim + +import torchvision.transforms as transforms +import torchvision.datasets as dset +from PIL import Image +from roi_data_layer.roidb import combined_roidb +from roi_data_layer.roibatchLoader import roibatchLoader +from model.utils.config import cfg, cfg_from_file, cfg_from_list, get_output_dir +from model.faster_rcnn.faster_rcnn_cascade import _fasterRCNN +from model.rpn.bbox_transform import clip_boxes +from model.nms.nms_wrapper import nms +from model.fast_rcnn.nms_wrapper import nms +from model.rpn.bbox_transform import bbox_transform_inv +from model.utils.network import save_net, load_net, vis_detections +from model.utils.blob import im_list_to_blob +import pdb + +def parse_args(): + """ + Parse input arguments + """ + parser = argparse.ArgumentParser(description='Train a Fast R-CNN network') + parser.add_argument('--cfg', dest='cfg_file', + help='optional config file', + default='cfgs/vgg16.yml', type=str) + parser.add_argument('--imdb', dest='imdb_name', + help='dataset to train on', + default='voc_2007_trainval', type=str) + parser.add_argument('--imdbval', dest='imdbval_name', + help='dataset to validate on', + default='voc_2007_test', type=str) + parser.add_argument('--net', dest='net', + help='vgg16, res50, res101, res152', + default='vgg16', type=str) + parser.add_argument('--set', dest='set_cfgs', + help='set config keys', default=None, + nargs=argparse.REMAINDER) + parser.add_argument('--load_dir', dest='load_dir', + help='directory to load models', default="models", + nargs=argparse.REMAINDER) + parser.add_argument('--image_dir', dest='image_dir', + help='directory to load images', default="data/images", + type=str) + parser.add_argument('--ngpu', dest='ngpu', + help='number of gpu', + default=1, type=int) + parser.add_argument('--checksession', dest='checksession', + help='checksession to load model', + default=4, type=int) + parser.add_argument('--checkepoch', dest='checkepoch', + help='checkepoch to load network', + default=6, type=int) + parser.add_argument('--checkpoint', dest='checkpoint', + help='checkpoint to load network', + default=10000, type=int) + + args = parser.parse_args() + return args + +lr = cfg.TRAIN.LEARNING_RATE +momentum = cfg.TRAIN.MOMENTUM +weight_decay = cfg.TRAIN.WEIGHT_DECAY + +def _get_image_blob(im): + """Converts an image into a network input. 
+ Arguments: + im (ndarray): a color image in BGR order + Returns: + blob (ndarray): a data blob holding an image pyramid + im_scale_factors (list): list of image scales (relative to im) used + in the image pyramid + """ + im_orig = im.astype(np.float32, copy=True) + im_orig -= cfg.PIXEL_MEANS + + im_shape = im_orig.shape + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + + processed_ims = [] + im_scale_factors = [] + + for target_size in cfg.TEST.SCALES: + im_scale = float(target_size) / float(im_size_min) + # Prevent the biggest axis from being more than MAX_SIZE + if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: + im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) + im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, + interpolation=cv2.INTER_LINEAR) + im_scale_factors.append(im_scale) + processed_ims.append(im) + + # Create a blob to hold the input images + blob = im_list_to_blob(processed_ims) + + return blob, np.array(im_scale_factors) + +if __name__ == '__main__': + + args = parse_args() + + print('Called with args:') + print(args) + + if args.cfg_file is not None: + cfg_from_file(args.cfg_file) + if args.set_cfgs is not None: + cfg_from_list(args.set_cfgs) + + print('Using config:') + pprint.pprint(cfg) + np.random.seed(cfg.RNG_SEED) + + # train set + # -- Note: Use validation set and disable the flipped to enable faster loading. + + input_dir = args.load_dir + "/" + args.net + if not os.path.exists(input_dir): + raise Exception('There is no input directory for loading network') + load_name = os.path.join(input_dir, + 'faster_rcnn_{}_{}_{}.pth'.format(args.checksession, args.checkepoch, args.checkpoint)) + + + classes = np.asarray(['__background__', + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor']) + + fasterRCNN = _fasterRCNN(args.net, classes) + checkpoint = torch.load(load_name) + fasterRCNN.load_state_dict(checkpoint['model']) + print('load model successfully!') + + # pdb.set_trace() + + print("load checkpoint %s" % (load_name)) + + # initilize the tensor holder here. 
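+  # These holders are reused for every image in the loop below: the blob from
+  # _get_image_blob comes back in NHWC layout, is permuted to NCHW, and is then
+  # copied in with resize_().copy_() so no fresh buffers are allocated.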
+ im_data = torch.FloatTensor(1) + im_info = torch.FloatTensor(1) + num_boxes = torch.LongTensor(1) + gt_boxes = torch.FloatTensor(1) + + # ship to cuda + if args.ngpu > 0: + im_data = im_data.cuda() + im_info = im_info.cuda() + num_boxes = num_boxes.cuda() + gt_boxes = gt_boxes.cuda() + + # make variable + im_data = Variable(im_data, volatile=True) + im_info = Variable(im_info, volatile=True) + num_boxes = Variable(num_boxes, volatile=True) + gt_boxes = Variable(gt_boxes, volatile=True) + + if args.ngpu > 0: + cfg.CUDA = True + + if args.ngpu > 0: + fasterRCNN.cuda() + + fasterRCNN.eval() + + start = time.time() + max_per_image = 100 + thresh = 0.05 + vis = True + + imglist = os.listdir(args.image_dir) + num_images = len(imglist) + + print('Loaded Photo: {} images.'.format(num_images)) + + + for i in range(num_images): + + # Load the demo image + im_file = os.path.join(args.image_dir, imglist[i]) + # im = cv2.imread(im_file) + im = np.array(Image.open(im_file)) + if len(im.shape) == 2: + im = im[:,:,np.newaxis] + im = np.concatenate((im,im,im), axis=2) + + blobs, im_scales = _get_image_blob(im) + assert len(im_scales) == 1, "Only single-image batch implemented" + im_blob = blobs + im_info_np = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32) + + im_data_pt = torch.from_numpy(im_blob) + im_data_pt = im_data_pt.permute(0, 3, 1, 2) + im_info_pt = torch.from_numpy(im_info_np) + + im_data.data.resize_(im_data_pt.size()).copy_(im_data_pt) + im_info.data.resize_(im_info_pt.size()).copy_(im_info_pt) + gt_boxes.data.resize_(1, 1, 5).zero_() + num_boxes.data.resize_(1).zero_() + + # pdb.set_trace() + + + det_tic = time.time() + rois, cls_prob, bbox_pred, rpn_loss, rcnn_loss = \ + fasterRCNN(im_data, im_info, gt_boxes, num_boxes) + + scores = cls_prob.data + boxes = rois[:, :, 1:5] / im_scales[0] + + if cfg.TEST.BBOX_REG: + # Apply bounding-box regression deltas + box_deltas = bbox_pred.data + if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: + # Optionally normalize targets by a precomputed mean and stdev + box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \ + + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda() + box_deltas = box_deltas.view(1, -1, 4) + pred_boxes = bbox_transform_inv(boxes, box_deltas, 1) + pred_boxes = clip_boxes(pred_boxes, im_info.data, 1) + else: + # Simply repeat the boxes, once for each class + pred_boxes = np.tile(boxes, (1, scores.shape[1])) + + scores = scores.squeeze().cpu().numpy() + pred_boxes = pred_boxes.squeeze().cpu().numpy() + # _t['im_detect'].tic() + det_toc = time.time() + detect_time = det_toc - det_tic + + misc_tic = time.time() + + if vis: + im2show = np.copy(im) + + for j in xrange(1, 21): + inds = np.where(scores[:, j] > thresh)[0] + cls_scores = scores[inds, j] + cls_boxes = pred_boxes[inds, :] + cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \ + .astype(np.float32, copy=False) + keep = nms(cls_dets, cfg.TEST.NMS) + cls_dets = cls_dets[keep, :] + if vis: + im2show = vis_detections(im2show, classes[j], cls_dets) + + misc_toc = time.time() + nms_time = misc_toc - misc_tic + + sys.stdout.write('im_detect: {:d}/{:d} {:.3f}s {:.3f}s \r' \ + .format(i + 1, num_images, detect_time, nms_time)) + sys.stdout.flush() + + if vis: + cv2.imshow('test', im2show) + cv2.waitKey(0) diff --git a/lib/model/faster_rcnn/faster_rcnn.py b/lib/model/faster_rcnn/faster_rcnn.py index 2c7e2b9da..a875e0556 100644 --- a/lib/model/faster_rcnn/faster_rcnn.py +++ 
b/lib/model/faster_rcnn/faster_rcnn.py @@ -181,4 +181,4 @@ def forward(self, im_data, im_info, gt_boxes, num_boxes): cls_prob = cls_prob.view(batch_size, rois.size(1), -1) bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1) - return rois, cls_prob, bbox_pred, rpn_loss, rcnn_loss + return rois, cls_prob, bbox_pred, rpn_loss, rcnn_loss \ No newline at end of file diff --git a/lib/model/faster_rcnn/faster_rcnn_cascade.py b/lib/model/faster_rcnn/faster_rcnn_cascade.py new file mode 100644 index 000000000..0d51d7858 --- /dev/null +++ b/lib/model/faster_rcnn/faster_rcnn_cascade.py @@ -0,0 +1,202 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.models as models + +from torch.autograd import Variable +import numpy as np + +from model.utils.config import cfg + +from model.rpn.rpn import _RPN +from model.roi_pooling.modules.roi_pool import _RoIPooling +# from model.roi_pooling_single.modules.roi_pool import _RoIPool +from model.rpn.proposal_target_layer_4 import _ProposalTargetLayer +from model.utils import network +import time +import pdb +from model.utils.network import _smooth_l1_loss + +# from model.utils.vgg16 import VGG16 + +class _RCNN_base(nn.Module): + def __init__(self, baseModels, classes): + super(_RCNN_base, self).__init__() + + if classes is not None: + self.classes = classes + self.n_classes = len(classes) + + self.RCNN_base_model = nn.Sequential() + for i in range(len(baseModels)): + self.RCNN_base_model.add_module('part{}'.format(i), baseModels[i]) + + virtual_input = torch.randn(1, 3, cfg.TRAIN.TRIM_HEIGHT, cfg.TRAIN.TRIM_WIDTH) + out = self.RCNN_base_model(Variable(virtual_input)) + self.feat_height = out.size(2) + self.feat_width = out.size(3) + self.dout_base_model = out.size(1) + # define rpn + self.RCNN_rpn = _RPN(self.feat_height, self.feat_width, self.dout_base_model) + self.RCNN_proposal_target = _ProposalTargetLayer(self.n_classes) + self.RCNN_roi_pool = _RoIPooling(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1.0/16.0) + # self.RCNN_roi_pool = _RoIPool(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1.0/16.0) + + def forward(self, im_data, im_info, gt_boxes, num_boxes): + im_info = im_info.data + gt_boxes = gt_boxes.data + num_boxes = num_boxes.data + + batch_size = im_data.size(0) + # feed image data to base model to obtain base feature map + base_feat = self.RCNN_base_model(im_data) + + # feed base feature map tp RPN to obtain rois + rois, rpn_loss_cls, rpn_loss_bbox = self.RCNN_rpn(base_feat, im_info, gt_boxes, num_boxes) + + # if it is training phrase, then use ground trubut bboxes for refining + if self.training: + + roi_data = self.RCNN_proposal_target(rois, gt_boxes, num_boxes) + rois, rois_label, rois_target, rois_inside_ws, rois_outside_ws = roi_data + + rois_label = Variable(rois_label.view(-1)) + rois_target = Variable(rois_target.view(-1, rois_target.size(2))) + rois_inside_ws = Variable(rois_inside_ws.view(-1, rois_inside_ws.size(2))) + rois_outside_ws = Variable(rois_outside_ws.view(-1, rois_outside_ws.size(2))) + + else: + rois_label = None + rois_target = None + rois_inside_ws = None + rois_outside_ws = None + rpn_loss_cls = 0 + rpn_loss_bbox = 0 + + # do roi pooling based on predicted rois + rois_var = Variable(rois.view(-1,5)) + pooled_feat = self.RCNN_roi_pool(base_feat, rois_var) + pooled_feat_all = pooled_feat.view(pooled_feat.size(0), -1) + + return rois, pooled_feat_all, rois_label, rois_target, rois_inside_ws, rois_outside_ws, rpn_loss_cls, rpn_loss_bbox + +class _fasterRCNN(nn.Module): + """ faster RCNN """ + def 
__init__(self, baseModel, classes, debug=False): + super(_fasterRCNN, self).__init__() + + if classes is not None: + self.classes = classes + self.n_classes = len(classes) + + # define base model, e.g., VGG16, ResNet, etc. + if baseModel == "vgg16": + slices = network.load_baseModel(baseModel) + self.RCNN_base = _RCNN_base(slices[:3], classes) + self.RCNN_fc6 = slices[3] + self.RCNN_fc7 = slices[4] + elif baseModel == "res50": + pretrained_model = models.resnet50(pretrained=True) + RCNN_base_model = nn.Sequential(*list(pretrained_model.children())[:-2]) + elif baseModel == "res101": + pretrained_model = models.resnet50(pretrained=True) + RCNN_base_model = nn.Sequential(*list(pretrained_model.children())[:-2]) + else: + raise RuntimeError('baseModel is not included.') + + self.dout_base_model = self.RCNN_base.dout_base_model + + self.RCNN_cls_score = nn.Sequential( + nn.Linear(4096, self.n_classes) + ) + + self.RCNN_bbox_pred = nn.Sequential( + nn.Linear(4096, 4) + ) + + # loss + self.RCNN_loss_cls = 0 + self.RCNN_loss_bbox = 0 + + # for log + self.debug = debug + + def forward(self, im_data, im_info, gt_boxes, num_boxes): + + + batch_size = im_data.size(0) + rois, pooled_feat_all, rois_label, rois_target, rois_inside_ws, rois_outside_ws, \ + rpn_loss_cls, rpn_loss_bbox = self.RCNN_base(im_data, im_info, gt_boxes, num_boxes) + + rpn_loss = rpn_loss_cls + rpn_loss_bbox + + # feed pooled features to top model + x = self.RCNN_fc6(pooled_feat_all) + x = F.relu(x, inplace = True) + x = F.dropout(x, training=self.training) + + x = self.RCNN_fc7(x) + x = F.relu(x, inplace = True) + x = F.dropout(x, training=self.training) + + # compute bbox offset + bbox_pred = self.RCNN_bbox_pred(x) + + # compute object classification probability + cls_score = self.RCNN_cls_score(x) + cls_prob = F.softmax(cls_score) + + # if not self.training: + # pdb.set_trace() + # from model.rpn.bbox_transform import bbox_transform_inv, clip_boxes + # if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: + # # Optionally normalize targets by a precomputed mean and stdev + # box_deltas = bbox_pred.data.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \ + # + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda() + # box_deltas = box_deltas.view(1, -1, 84) + # pred_boxes = bbox_transform_inv(rois, box_deltas, 1) + # pred_boxes = clip_boxes(pred_boxes, im_info.data, 1) + + # # perform roi pooling again on pred_boxes + # rois_var = Variable(pred_boxes.view(-1,5)) + + # # do roi pooling based on predicted rois + + # pooled_feat = self.RCNN_roi_pool(base_feat, rois_var) + # pooled_feat_all = pooled_feat.view(pooled_feat.size(0), -1) + # # feed pooled features to top model + # x = self.RCNN_fc6(pooled_feat_all) + # x = F.relu(x, inplace = True) + # x = F.dropout(x, training=self.training) + + # x = self.RCNN_fc7(x) + # x = F.relu(x, inplace = True) + # x = F.dropout(x, training=self.training) + + self.RCNN_loss_cls = 0 + self.RCNN_loss_bbox = 0 + + if self.training: + # classification loss + label = rois_label.long() + self.fg_cnt = torch.sum(label.data.ne(0)) + self.bg_cnt = label.data.numel() - self.fg_cnt + + ce_weights = rois_label.data.new(cls_score.size(1)).fill_(1) + ce_weights[0] = float(self.fg_cnt) / self.bg_cnt + + # self.RCNN_loss_cls = F.cross_entropy(cls_score, label, weight=ce_weights) + + self.RCNN_loss_cls = F.cross_entropy(cls_score, label) + + # bounding box regression L1 loss + # rois_target = torch.mul(rois_target, rois_inside_ws) + # bbox_pred = torch.mul(bbox_pred, rois_inside_ws) + 
self.RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws) + + rcnn_loss = self.RCNN_loss_cls + self.RCNN_loss_bbox + + cls_prob = cls_prob.view(batch_size, rois.size(1), -1) + bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1) + + return rois, cls_prob, bbox_pred, rpn_loss, rcnn_loss \ No newline at end of file diff --git a/lib/model/rpn/proposal_target_layer_4.py b/lib/model/rpn/proposal_target_layer_4.py new file mode 100644 index 000000000..606a2fe7e --- /dev/null +++ b/lib/model/rpn/proposal_target_layer_4.py @@ -0,0 +1,203 @@ +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick and Sean Bell +# -------------------------------------------------------- +# -------------------------------------------------------- +# Reorganized and modified by Jianwei Yang and Jiasen Lu +# -------------------------------------------------------- + +import torch +import torch.nn as nn +import numpy as np +import numpy.random as npr +from ..utils.config import cfg +from bbox_transform import bbox_transform, bbox_overlaps, bbox_overlaps_batch2, bbox_transform_batch2 +import pdb + +DEBUG = False + +class _ProposalTargetLayer(nn.Module): + """ + Assign object detection proposals to ground-truth targets. Produces proposal + classification labels and bounding-box regression targets. + """ + + def __init__(self, nclasses): + super(_ProposalTargetLayer, self).__init__() + self._num_classes = nclasses + self.BBOX_NORMALIZE_MEANS = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS) + self.BBOX_NORMALIZE_STDS = torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS) + self.BBOX_INSIDE_WEIGHTS = torch.FloatTensor(cfg.TRAIN.BBOX_INSIDE_WEIGHTS) + + def forward(self, all_rois, gt_boxes, num_boxes): + + self.BBOX_NORMALIZE_MEANS = self.BBOX_NORMALIZE_MEANS.type_as(gt_boxes) + self.BBOX_NORMALIZE_STDS = self.BBOX_NORMALIZE_STDS.type_as(gt_boxes) + self.BBOX_INSIDE_WEIGHTS = self.BBOX_INSIDE_WEIGHTS.type_as(gt_boxes) + + gt_boxes_append = gt_boxes.new(gt_boxes.size()).zero_() + gt_boxes_append[:,:,1:5] = gt_boxes[:,:,:4] + + # Include ground-truth boxes in the set of candidate rois + all_rois = torch.cat([all_rois, gt_boxes_append], 1) + + num_images = 1 + rois_per_image = int(cfg.TRAIN.BATCH_SIZE / num_images) + fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)) + + labels, rois, bbox_targets, bbox_inside_weights = self._sample_rois_pytorch( + all_rois, gt_boxes, fg_rois_per_image, + rois_per_image, self._num_classes) + + bbox_outside_weights = (bbox_inside_weights > 0).float() + + return rois, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights + + def backward(self, top, propagate_down, bottom): + """This layer does not propagate gradients.""" + pass + + def reshape(self, bottom, top): + """Reshaping happens during the call to forward.""" + pass + + def _get_bbox_regression_labels_pytorch(self, bbox_target_data, labels_batch, num_classes): + """Bounding-box regression targets (bbox_target_data) are stored in a + compact form b x N x (class, tx, ty, tw, th) + + This function expands those targets into the 4-of-4*K representation used + by the network (i.e. only one class has non-zero targets). 
+ + Returns: + bbox_target (ndarray): b x N x 4K blob of regression targets + bbox_inside_weights (ndarray): b x N x 4K blob of loss weights + """ + + batch_size = labels_batch.size(0) + rois_per_image = labels_batch.size(1) + clss = labels_batch + bbox_targets = bbox_target_data.new(batch_size, rois_per_image, 4).zero_() + bbox_inside_weights = bbox_target_data.new(bbox_targets.size()).zero_() + + for b in range(batch_size): + # assert clss[b].sum() > 0 + if clss[b].sum() == 0: + continue + inds = torch.nonzero(clss[b] > 0).view(-1) + for i in range(inds.numel()): + ind = inds[i] + bbox_targets[b, ind, :] = bbox_target_data[b, ind, :] + bbox_inside_weights[b, ind, :] = self.BBOX_INSIDE_WEIGHTS + + return bbox_targets, bbox_inside_weights + + + def _compute_targets_pytorch(self, ex_rois, gt_rois): + """Compute bounding-box regression targets for an image.""" + + assert ex_rois.size(1) == gt_rois.size(1) + assert ex_rois.size(2) == 4 + assert gt_rois.size(2) == 4 + + batch_size = ex_rois.size(0) + rois_per_image = ex_rois.size(1) + + targets = bbox_transform_batch2(ex_rois, gt_rois) + + if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: + # Optionally normalize targets by a precomputed mean and stdev + targets = ((targets - self.BBOX_NORMALIZE_MEANS.expand_as(targets)) + / self.BBOX_NORMALIZE_STDS.expand_as(targets)) + + return targets + + + def _sample_rois_pytorch(self, all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes): + """Generate a random sample of RoIs comprising foreground and background + examples. + """ + # overlaps: (rois x gt_boxes) + + overlaps, all_rois_zero, gt_boxes_zero = bbox_overlaps_batch2(all_rois, gt_boxes) + + max_overlaps, gt_assignment = torch.max(overlaps, 2) + + batch_size = overlaps.size(0) + num_proposal = overlaps.size(1) + num_boxes_per_img = overlaps.size(2) + + offset = torch.arange(0, batch_size)*20 + offset = offset.view(-1, 1).type_as(gt_assignment) + gt_assignment + + labels = gt_boxes[:,:,4].contiguous().view(-1).index(offset.view(-1))\ + .view(batch_size, -1) + + fg_mask = max_overlaps >= cfg.TRAIN.FG_THRESH + + labels_batch = labels.new(batch_size, rois_per_image).zero_() + rois_batch = all_rois.new(batch_size, rois_per_image, 5).zero_() + gt_rois_batch = all_rois.new(batch_size, rois_per_image, 5).zero_() + # Guard against the case when an image has fewer than max_fg_rois_per_image + # foreground RoIs + for i in range(batch_size): + + fg_inds = torch.nonzero(max_overlaps[i] >= cfg.TRAIN.FG_THRESH).view(-1) + fg_num_rois = fg_inds.numel() + + # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) + bg_inds = torch.nonzero((max_overlaps[i] < cfg.TRAIN.BG_THRESH_HI) & + (max_overlaps[i] >= cfg.TRAIN.BG_THRESH_LO)).view(-1) + bg_num_rois = bg_inds.numel() + + if fg_num_rois > 0 and bg_num_rois > 0: + # sampling fg + fg_rois_per_this_image = min(fg_rois_per_image, fg_num_rois) + rand_num = torch.randperm(fg_num_rois).type_as(all_rois).long() + fg_inds = fg_inds[rand_num[:fg_rois_per_this_image]] + + # sampling bg + bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image + rand_num = torch.floor(torch.rand(bg_rois_per_this_image).type_as(all_rois) + * bg_num_rois).long() + bg_inds = bg_inds[rand_num] + elif fg_num_rois > 0 and bg_num_rois == 0: + # sampling fg + rand_num = torch.floor(torch.rand(rois_per_image).type_as(all_rois) + * fg_num_rois).long() + fg_inds = fg_inds[rand_num] + fg_rois_per_this_image = rois_per_image + bg_rois_per_this_image = 0 + elif bg_num_rois > 0 and fg_num_rois == 0: + # sampling bg + 
rand_num = torch.floor(torch.rand(rois_per_image).type_as(all_rois) + * bg_num_rois).long() + bg_inds = bg_inds[rand_num] + bg_rois_per_this_image = rois_per_image + fg_rois_per_this_image = 0 + else: + print("bg_num_rois = 0 and fg_num_rois = 0, this should not happen!") + pdb.set_trace() + + # The indices that we're selecting (both fg and bg) + keep_inds = torch.cat([fg_inds, bg_inds], 0) + + # Select sampled values from various arrays: + labels_batch[i].copy_(labels[i][keep_inds]) + + # Clamp labels for the background RoIs to 0 + labels_batch[i][fg_rois_per_this_image:] = 0 + + rois_batch[i].copy_(all_rois[i][keep_inds]) + rois_batch[i,:,0] = i + + gt_rois_batch[i].copy_(gt_boxes[i][gt_assignment[i][keep_inds]]) + + bbox_target_data = self._compute_targets_pytorch( + rois_batch[:,:,1:5], gt_rois_batch[:,:,:4]) + + bbox_targets, bbox_inside_weights = \ + self._get_bbox_regression_labels_pytorch(bbox_target_data, labels_batch, num_classes) + + return labels_batch, rois_batch, bbox_targets, bbox_inside_weights diff --git a/lib/roi_data_layer/roibatchLoader.py b/lib/roi_data_layer/roibatchLoader.py index 45d28864b..163a0d534 100644 --- a/lib/roi_data_layer/roibatchLoader.py +++ b/lib/roi_data_layer/roibatchLoader.py @@ -45,12 +45,38 @@ def __getitem__(self, index): ################################################## # we crop the input image to fixed size randomly # ################################################## - # trim_data = torch.FloatTensor(1, self.trim_height, self.trim_width, 3) + # trim_data = torch.FloatTensor(1, self.trim_height, self.trim_width, 3) if data_height > data_width: # if height > width, then crop on height # randomly generate an y start point # while True: - y_s = np.random.randint(data_height - self.trim_height + 1) + # assign score to y axis + y_score = torch.FloatTensor(data_height).zero_() + for i in range(gt_boxes.size(0)): + rg = torch.arange(int(gt_boxes[i, 1]), int(gt_boxes[i, 3])) + score = -(rg - gt_boxes[i, 1]) * (rg - gt_boxes[i, 3]) / (gt_boxes[i, 3] - gt_boxes[i, 1])**2 + y_score[int(gt_boxes[i, 1]):int(gt_boxes[i, 3])] += score + + # find the inds with maximal score in y_score + if data_height > self.trim_height: + + ys = torch.arange(0, data_height - self.trim_height, 5).long() + y_score_cum = torch.FloatTensor(ys.size()).zero_() + + for i in range(ys.size(0)): + s = ys[i] + y_score_cum[i] = y_score[s:s + self.trim_height].sum() + + _, order = torch.sort(y_score_cum, 0, True) + + ys_ordered = ys[order] + rand_num = torch.randint(min(5, ys_ordered.size(0))) + + ys = ys_ordered[rand_num] + ys = min(ys, data_width - self.trim_width) + else: + y_s = 0 + trim_data = data[:, y_s:(y_s + self.trim_height), :] # shift y coordiante of gt_boxes @@ -67,7 +93,33 @@ def __getitem__(self, index): elif data_height <= data_width: # if height <= width, then crop on width # while True: - x_s = np.random.randint(data_width - self.trim_width + 1) + + # assign score to y axis + x_score = torch.FloatTensor(data_width).zero_() + for i in range(gt_boxes.size(0)): + rg = torch.arange(int(gt_boxes[i, 0]), int(gt_boxes[i, 2])) + score = -(rg - gt_boxes[i, 0]) * (rg - gt_boxes[i, 2]) / (gt_boxes[i, 2] - gt_boxes[i, 0])**2 + x_score[int(gt_boxes[i, 0]):int(gt_boxes[i, 2])] += score + + # find the inds with maximal score in y_score + if data_width > self.trim_width: + xs = torch.arange(0, data_width - self.trim_width, 5).long() + x_score_cum = torch.FloatTensor(xs.size()).zero_() + + for i in range(xs.size(0)): + s = xs[i] + x_score_cum[i] = x_score[s:s + self.trim_width].sum() + 
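# Crop scoring: each ground-truth box spreads a parabolic score over the axis it
# spans, score(r) = -(r - lo)(r - hi) / (hi - lo)^2, which is zero at the box edges
# and peaks at 0.25 at the box centre. Candidate crop starts are sampled every 5
# pixels, and each window's score is the sum of the per-pixel scores it covers, so
# windows containing more box centres rank higher. The sort below orders the
# candidate starts by that cumulative score; the intent of the following lines is
# to pick one of the (up to) five best-scoring starts at random, biasing the crop
# toward keeping the annotated objects inside the trimmed image.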
+ _, order = torch.sort(x_score_cum, 0, True) + + xs_ordered = xs[order] + rand_num = torch.randint(min(5, xs_ordered.size(0))) + + xs = xs_ordered[rand_num] + xs = min(xs, data_width - self.trim_width) + else: + x_s = 0 + trim_data = data[:, :, x_s:(x_s + self.trim_width), :] # shift x coordiante of gt_boxes diff --git a/lib/roi_data_layer/roibatchLoader_aug.py b/lib/roi_data_layer/roibatchLoader_aug.py new file mode 100644 index 000000000..163a0d534 --- /dev/null +++ b/lib/roi_data_layer/roibatchLoader_aug.py @@ -0,0 +1,162 @@ + +"""The data layer used during training to train a Fast R-CNN network. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch.utils.data as data +from PIL import Image +import torch + +from model.utils.config import cfg +from roi_data_layer.minibatch import get_minibatch +from model.rpn.bbox_transform import bbox_transform_inv, clip_boxes + +import numpy as np +import random +import time +import pdb + +class roibatchLoader(data.Dataset): + def __init__(self, roidb, num_classes, training=True, normalize=None): + self._roidb = roidb + self._num_classes = num_classes + # we make the height of image consistent to trim_height, trim_width + self.trim_height = cfg.TRAIN.TRIM_HEIGHT + self.trim_width = cfg.TRAIN.TRIM_WIDTH + self.max_num_box = 20 + self.training = training + self.normalize = normalize + + def __getitem__(self, index): + + minibatch_db = [self._roidb[index]] + blobs = get_minibatch(minibatch_db, self._num_classes) + data = torch.from_numpy(blobs['data']) + im_info = torch.from_numpy(blobs['im_info']) + # we need to random shuffle the bounding box. + np.random.shuffle(blobs['gt_boxes']) + gt_boxes = torch.from_numpy(blobs['gt_boxes']) + data_height, data_width = data.size(1), data.size(2) + + if self.training: + ################################################## + # we crop the input image to fixed size randomly # + ################################################## + # trim_data = torch.FloatTensor(1, self.trim_height, self.trim_width, 3) + if data_height > data_width: + # if height > width, then crop on height + # randomly generate an y start point + # while True: + # assign score to y axis + y_score = torch.FloatTensor(data_height).zero_() + for i in range(gt_boxes.size(0)): + rg = torch.arange(int(gt_boxes[i, 1]), int(gt_boxes[i, 3])) + score = -(rg - gt_boxes[i, 1]) * (rg - gt_boxes[i, 3]) / (gt_boxes[i, 3] - gt_boxes[i, 1])**2 + y_score[int(gt_boxes[i, 1]):int(gt_boxes[i, 3])] += score + + # find the inds with maximal score in y_score + if data_height > self.trim_height: + + ys = torch.arange(0, data_height - self.trim_height, 5).long() + y_score_cum = torch.FloatTensor(ys.size()).zero_() + + for i in range(ys.size(0)): + s = ys[i] + y_score_cum[i] = y_score[s:s + self.trim_height].sum() + + _, order = torch.sort(y_score_cum, 0, True) + + ys_ordered = ys[order] + rand_num = torch.randint(min(5, ys_ordered.size(0))) + + ys = ys_ordered[rand_num] + ys = min(ys, data_width - self.trim_width) + else: + y_s = 0 + + trim_data = data[:, y_s:(y_s + self.trim_height), :] + + # shift y coordiante of gt_boxes + gt_boxes[:, 1] = gt_boxes[:, 1] - y_s + gt_boxes[:, 3] = gt_boxes[:, 3] - y_s + + # update gt bounding box according the trip + gt_boxes[:, 1].clamp_(0, self.trim_height - 1) + gt_boxes[:, 3].clamp_(0, self.trim_height - 1) + + # update im_info + im_info[0, 0] = self.trim_height + + elif data_height <= data_width: + # if height <= width, then crop on width + # while True: + + 
# assign score to y axis + x_score = torch.FloatTensor(data_width).zero_() + for i in range(gt_boxes.size(0)): + rg = torch.arange(int(gt_boxes[i, 0]), int(gt_boxes[i, 2])) + score = -(rg - gt_boxes[i, 0]) * (rg - gt_boxes[i, 2]) / (gt_boxes[i, 2] - gt_boxes[i, 0])**2 + x_score[int(gt_boxes[i, 0]):int(gt_boxes[i, 2])] += score + + # find the inds with maximal score in y_score + if data_width > self.trim_width: + xs = torch.arange(0, data_width - self.trim_width, 5).long() + x_score_cum = torch.FloatTensor(xs.size()).zero_() + + for i in range(xs.size(0)): + s = xs[i] + x_score_cum[i] = x_score[s:s + self.trim_width].sum() + + _, order = torch.sort(x_score_cum, 0, True) + + xs_ordered = xs[order] + rand_num = torch.randint(min(5, xs_ordered.size(0))) + + xs = xs_ordered[rand_num] + xs = min(xs, data_width - self.trim_width) + else: + x_s = 0 + + trim_data = data[:, :, x_s:(x_s + self.trim_width), :] + + # shift x coordiante of gt_boxes + gt_boxes[:, 0] = gt_boxes[:, 0] - x_s + gt_boxes[:, 2] = gt_boxes[:, 2] - x_s + + # update gt bounding box according the trip + gt_boxes[:, 0].clamp_(0, self.trim_width - 1) + gt_boxes[:, 2].clamp_(0, self.trim_width - 1) + + im_info[0, 1] = self.trim_width + + num_boxes = min(gt_boxes.size(0), self.max_num_box) + + gt_boxes_padding = torch.FloatTensor(self.max_num_box, 5).zero_() + # take the top num_boxes + gt_boxes_padding[:num_boxes,:] = gt_boxes[:num_boxes] + + # permute trim_data to adapt to downstream processing + trim_data = trim_data.permute(0, 3, 1, 2).contiguous().view(3, self.trim_height, self.trim_width) + im_info = im_info.view(3) + + if self.normalize: + trim_data = trim_data / 255.0 + trim_data = self.normalize(trim_data) + + return trim_data, im_info, gt_boxes, num_boxes + else: + data = data.permute(0, 3, 1, 2).contiguous().view(3, data_height, data_width) + num_boxes = gt_boxes.size(0) + im_info = im_info.view(3) + + if self.normalize: + data = data / 255.0 + data = self.normalize(data) + + return data, im_info, gt_boxes, num_boxes + + def __len__(self): + return len(self._roidb) diff --git a/test_net_cascade.py b/test_net_cascade.py new file mode 100644 index 000000000..2e99186cd --- /dev/null +++ b/test_net_cascade.py @@ -0,0 +1,256 @@ +# -------------------------------------------------------- +# Tensorflow Faster R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Jiasen Lu, Jianwei Yang, based on code from Ross Girshick +# -------------------------------------------------------- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import _init_paths +import os +import sys +import numpy as np +import argparse +import pprint +import pdb +import time +import cv2 +import cPickle +import torch +from torch.autograd import Variable +import torch.nn as nn +import torch.optim as optim + +import torchvision.transforms as transforms + +from roi_data_layer.roidb import combined_roidb +from roi_data_layer.roibatchLoader import roibatchLoader +from model.utils.config import cfg, cfg_from_file, cfg_from_list, get_output_dir +from model.faster_rcnn.faster_rcnn_cascade import _fasterRCNN +from model.rpn.bbox_transform import clip_boxes +from model.nms.nms_wrapper import nms +from model.fast_rcnn.nms_wrapper import nms +from model.rpn.bbox_transform import bbox_transform_inv +from model.utils.network import save_net, load_net, vis_detections +import pdb + +def parse_args(): + """ + Parse input arguments + """ + parser = argparse.ArgumentParser(description='Train 
a Fast R-CNN network') + parser.add_argument('--cfg', dest='cfg_file', + help='optional config file', + default='cfgs/vgg16.yml', type=str) + parser.add_argument('--imdb', dest='imdb_name', + help='dataset to train on', + default='voc_2007_trainval', type=str) + parser.add_argument('--imdbval', dest='imdbval_name', + help='dataset to validate on', + default='voc_2007_test', type=str) + parser.add_argument('--net', dest='net', + help='vgg16, res50, res101, res152', + default='vgg16', type=str) + parser.add_argument('--set', dest='set_cfgs', + help='set config keys', default=None, + nargs=argparse.REMAINDER) + parser.add_argument('--load_dir', dest='load_dir', + help='directory to load models', default="models", + nargs=argparse.REMAINDER) + parser.add_argument('--ngpu', dest='ngpu', + help='number of gpu', + default=1, type=int) + parser.add_argument('--checksession', dest='checksession', + help='checksession to load model', + default=4, type=int) + parser.add_argument('--checkepoch', dest='checkepoch', + help='checkepoch to load network', + default=6, type=int) + parser.add_argument('--checkpoint', dest='checkpoint', + help='checkpoint to load network', + default=10000, type=int) + + args = parser.parse_args() + return args + +lr = cfg.TRAIN.LEARNING_RATE +momentum = cfg.TRAIN.MOMENTUM +weight_decay = cfg.TRAIN.WEIGHT_DECAY + +if __name__ == '__main__': + + args = parse_args() + + print('Called with args:') + print(args) + + if args.cfg_file is not None: + cfg_from_file(args.cfg_file) + if args.set_cfgs is not None: + cfg_from_list(args.set_cfgs) + + print('Using config:') + pprint.pprint(cfg) + np.random.seed(cfg.RNG_SEED) + + # train set + # -- Note: Use validation set and disable the flipped to enable faster loading. + cfg.TRAIN.USE_FLIPPED = False + imdb, roidb = combined_roidb(args.imdbval_name) + imdb.competition_mode(on=True) + + print('{:d} roidb entries'.format(len(roidb))) + + input_dir = args.load_dir + "/" + args.net + if not os.path.exists(input_dir): + raise Exception('There is no input directory for loading network') + load_name = os.path.join(input_dir, + 'faster_rcnn_{}_{}_{}.pth'.format(args.checksession, args.checkepoch, args.checkpoint)) + + fasterRCNN = _fasterRCNN(args.net, imdb.classes) + checkpoint = torch.load(load_name) + fasterRCNN.load_state_dict(checkpoint['model']) + print('load model successfully!') + + # pdb.set_trace() + + print("load checkpoint %s" % (load_name)) + + # initilize the tensor holder here. 
+ im_data = torch.FloatTensor(1) + im_info = torch.FloatTensor(1) + num_boxes = torch.LongTensor(1) + gt_boxes = torch.FloatTensor(1) + + # ship to cuda + if args.ngpu > 0: + im_data = im_data.cuda() + im_info = im_info.cuda() + num_boxes = num_boxes.cuda() + gt_boxes = gt_boxes.cuda() + + # make variable + im_data = Variable(im_data, volatile=True) + im_info = Variable(im_info, volatile=True) + num_boxes = Variable(num_boxes, volatile=True) + gt_boxes = Variable(gt_boxes, volatile=True) + + if args.ngpu > 0: + cfg.CUDA = True + + if args.ngpu > 0: + fasterRCNN.cuda() + + fasterRCNN.eval() + + start = time.time() + max_per_image = 100 + thresh = 0.05 + vis = False + + save_name = 'faster_rcnn_10' + num_images = len(imdb.image_index) + all_boxes = [[[] for _ in xrange(num_images)] + for _ in xrange(imdb.num_classes)] + + output_dir = get_output_dir(imdb, save_name) + + + # dataset = roibatchLoader(roidb, imdb.num_classes, training=False, + # normalize = transforms.Normalize( + # mean=[0.485, 0.456, 0.406], + # std=[0.229, 0.224, 0.225])) + + dataset = roibatchLoader(roidb, imdb.num_classes, training=False, + normalize = False) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, + shuffle=False, num_workers=0, + pin_memory=True) + + data_iter = iter(dataloader) + + _t = {'im_detect': time.time(), 'misc': time.time()} + det_file = os.path.join(output_dir, 'detections.pkl') + + for i in range(num_images): + + data = data_iter.next() + im_data.data.resize_(data[0].size()).copy_(data[0]) + im_info.data.resize_(data[1].size()).copy_(data[1]) + gt_boxes.data.resize_(data[2].size()).copy_(data[2]) + num_boxes.data.resize_(data[3].size()).copy_(data[3]) + + det_tic = time.time() + rois, cls_prob, bbox_pred, rpn_loss, rcnn_loss = fasterRCNN(im_data, im_info, gt_boxes, num_boxes) + scores = cls_prob.data + boxes = rois[:, :, 1:5] / data[1][0][2] + + if cfg.TEST.BBOX_REG: + # Apply bounding-box regression deltas + box_deltas = bbox_pred.data + if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: + # Optionally normalize targets by a precomputed mean and stdev + box_deltas = box_deltas.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \ + + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda() + box_deltas = box_deltas.view(1, -1, 4) + pred_boxes = bbox_transform_inv(boxes, box_deltas, 1) + pred_boxes = clip_boxes(pred_boxes, im_info.data, 1) + else: + # Simply repeat the boxes, once for each class + pred_boxes = np.tile(boxes, (1, scores.shape[1])) + + scores = scores.squeeze().cpu().numpy() + pred_boxes = pred_boxes.squeeze().cpu().numpy() + # _t['im_detect'].tic() + det_toc = time.time() + detect_time = det_toc - det_tic + + misc_tic = time.time() + + if vis: + im = cv2.imread(imdb.image_path_at(i)) + im2show = np.copy(im) + + for j in xrange(1, imdb.num_classes): + inds = np.where(scores[:, j] > thresh)[0] + cls_scores = scores[inds, j] + cls_boxes = pred_boxes[inds, :] + cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \ + .astype(np.float32, copy=False) + keep = nms(cls_dets, cfg.TEST.NMS) + cls_dets = cls_dets[keep, :] + if vis: + im2show = vis_detections(im2show, imdb.classes[j], cls_dets) + all_boxes[j][i] = cls_dets + + # Limit to max_per_image detections *over all classes* + if max_per_image > 0: + image_scores = np.hstack([all_boxes[j][i][:, -1] + for j in xrange(1, imdb.num_classes)]) + if len(image_scores) > max_per_image: + image_thresh = np.sort(image_scores)[-max_per_image] + for j in xrange(1, imdb.num_classes): + keep = 
np.where(all_boxes[j][i][:, -1] >= image_thresh)[0] + all_boxes[j][i] = all_boxes[j][i][keep, :] + + misc_toc = time.time() + nms_time = misc_toc - misc_tic + + sys.stdout.write('im_detect: {:d}/{:d} {:.3f}s {:.3f}s \r' \ + .format(i + 1, num_images, detect_time, nms_time)) + sys.stdout.flush() + + if vis: + cv2.imshow('test', im2show) + cv2.waitKey(0) + + with open(det_file, 'wb') as f: + cPickle.dump(all_boxes, f, cPickle.HIGHEST_PROTOCOL) + + print('Evaluating detections') + imdb.evaluate_detections(all_boxes, output_dir) + + end = time.time() + print("test time: %0.4fs" % (end - start)) diff --git a/trainval_net.py b/trainval_net.py index 93b764b45..e3077ee1b 100644 --- a/trainval_net.py +++ b/trainval_net.py @@ -129,7 +129,7 @@ def parse_args(): from model.utils.logger import Logger # Set the logger logger = Logger('./logs') - + if args.dataset == "pascal_voc": args.imdb_name = "voc_2007_trainval" args.imdbval_name = "voc_2007_test" @@ -166,7 +166,7 @@ def parse_args(): if not os.path.exists(output_dir): os.makedirs(output_dir) - dataset = roibatchLoader(roidb, imdb.num_classes, training=False, + dataset = roibatchLoader(roidb, imdb.num_classes, training=True, normalize = False) dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, diff --git a/trainval_net_cascade.py b/trainval_net_cascade.py new file mode 100644 index 000000000..10418da59 --- /dev/null +++ b/trainval_net_cascade.py @@ -0,0 +1,324 @@ +# -------------------------------------------------------- +# Pytorch multi-GPU Faster R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Jiasen Lu, Jianwei Yang, based on code from Ross Girshick +# -------------------------------------------------------- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import _init_paths +import os +import sys +import numpy as np +import argparse +import pprint +import pdb +import time + +import torch +from torch.autograd import Variable +import torch.nn as nn +import torch.optim as optim + +import torchvision.transforms as transforms + +from roi_data_layer.roidb import combined_roidb +from roi_data_layer.roibatchLoader import roibatchLoader +from model.utils.config import cfg, cfg_from_file, cfg_from_list, get_output_dir +from model.utils import network +from model.utils.network import weights_normal_init, save_net, load_net, \ + adjust_learning_rate, save_checkpoint + +from model.faster_rcnn.faster_rcnn_cascade import _fasterRCNN +import pdb + +def parse_args(): + """ + Parse input arguments + """ + parser = argparse.ArgumentParser(description='Train a Fast R-CNN network') + parser.add_argument('--dataset', dest='dataset', + help='training dataset', + default='pascal_voc', type=str) + parser.add_argument('--net', dest='net', + help='vgg16, res50, res101, res152', + default='vgg16', type=str) + parser.add_argument('--imdb', dest='imdb_name', + help='dataset to train on', + default='voc_2007_trainval', type=str) + parser.add_argument('--imdbval', dest='imdbval_name', + help='dataset to validate on', + default='voc_2007_test', type=str) + parser.add_argument('--start_epoch', dest='start_epoch', + help='starting epoch', + default=1, type=int) + parser.add_argument('--epochs', dest='max_epochs', + help='number of epochs to train', + default=20, type=int) + parser.add_argument('--disp_interval', dest='disp_interval', + help='number of iterations to display', + default=100, type=int) + parser.add_argument('--checkpoint_interval', 
dest='checkpoint_interval', + help='number of iterations to display', + default=10000, type=int) + + parser.add_argument('--save_dir', dest='save_dir', + help='directory to save models', default="models", + nargs=argparse.REMAINDER) + parser.add_argument('--ngpu', dest='ngpu', + help='number of gpu', + default=1, type=int) + + +# config optimization + parser.add_argument('--o', dest='optimizer', + help='training optimizer', + default="sgd", type=str) + parser.add_argument('--lr_decay_step', dest='lr_decay_step', + help='step to do learning rate decay, unit is epoch', + default=5, type=int) + parser.add_argument('--lr_decay_gamma', dest='lr_decay_gamma', + help='learning rate decay ratio', + default=0.1, type=float) + +# set training session + parser.add_argument('--s', dest='session', + help='training session', + default=1, type=int) + +# resume trained model + parser.add_argument('--r', dest='resume', + help='resume checkpoint or not', + default=False, type=bool) + parser.add_argument('--checksession', dest='checksession', + help='checksession to load model', + default=1, type=int) + parser.add_argument('--checkepoch', dest='checkepoch', + help='checkepoch to load model', + default=1, type=int) + parser.add_argument('--checkpoint', dest='checkpoint', + help='checkpoint to load model', + default=0, type=int) +# log and diaplay + parser.add_argument('--use_tfboard', dest='use_tfboard', + help='whether use tensorflow tensorboard', + default=False, type=bool) + + # if len(sys.argv) == 1: + # parser.print_help() + # sys.exit(1) + + args = parser.parse_args() + return args + + +lr = cfg.TRAIN.LEARNING_RATE +momentum = cfg.TRAIN.MOMENTUM +weight_decay = cfg.TRAIN.WEIGHT_DECAY +use_multiGPU = False + +if __name__ == '__main__': + + args = parse_args() + + print('Called with args:') + print(args) + + if args.use_tfboard: + from model.utils.logger import Logger + # Set the logger + logger = Logger('./logs') + + if args.dataset == "pascal_voc": + args.imdb_name = "voc_2007_trainval" + args.imdbval_name = "voc_2007_test" + args.set_cfgs = ['ANCHOR_SCALES', '[8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]'] + elif args.dataset == "pascal_voc_0712": + args.imdb_name = "voc_2007_trainval+voc_2012_trainval" + args.imdbval_name = "voc_2007_test" + args.set_cfgs = ['ANCHOR_SCALES', '[8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]'] + elif args.dataset == "coco": + args.imdb_name = "coco_2014_train+coco_2014_valminusminival" + args.imdbval_name = "coco_2014_minival" + args.set_cfgs = ['ANCHOR_SCALES', '[4, 8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]'] + + args.cfg_file = "cfgs/{}.yml".format(args.net) + + if args.cfg_file is not None: + cfg_from_file(args.cfg_file) + if args.set_cfgs is not None: + cfg_from_list(args.set_cfgs) + + print('Using config:') + pprint.pprint(cfg) + np.random.seed(cfg.RNG_SEED) + + # train set + # -- Note: Use validation set and disable the flipped to enable faster loading. + cfg.TRAIN.USE_FLIPPED = True + imdb, roidb = combined_roidb(args.imdb_name) + train_size = len(roidb) + + print('{:d} roidb entries'.format(len(roidb))) + + output_dir = args.save_dir + "/" + args.net + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + dataset = roibatchLoader(roidb, imdb.num_classes, training=False, + normalize = False) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, + shuffle=True, num_workers=0) + + # initilize the tensor holder here. 
+ im_data = torch.FloatTensor(1) + im_info = torch.FloatTensor(1) + num_boxes = torch.LongTensor(1) + gt_boxes = torch.FloatTensor(1) + + # ship to cuda + if args.ngpu > 0: + im_data = im_data.cuda() + im_info = im_info.cuda() + num_boxes = num_boxes.cuda() + gt_boxes = gt_boxes.cuda() + + # make variable + im_data = Variable(im_data) + im_info = Variable(im_info) + num_boxes = Variable(num_boxes) + gt_boxes = Variable(gt_boxes) + + if args.ngpu > 0: + cfg.CUDA = True + + # initilize the network here. + fasterRCNN = _fasterRCNN(args.net, imdb.classes) + # weights_normal_init(fasterRCNN) + weights_normal_init(fasterRCNN.RCNN_base.RCNN_rpn.RPN_ConvReLU) + weights_normal_init(fasterRCNN.RCNN_base.RCNN_rpn.RPN_cls_score) + weights_normal_init(fasterRCNN.RCNN_base.RCNN_rpn.RPN_bbox_pred) + weights_normal_init(fasterRCNN.RCNN_cls_score) + weights_normal_init(fasterRCNN.RCNN_bbox_pred, 0.001) + + params = list(fasterRCNN.parameters()) + + if args.optimizer == "adam": + lr = lr * 0.1 + optimizer = torch.optim.Adam([ + {'params': fasterRCNN.RCNN_base.RCNN_base_model[1].parameters(), 'lr': lr}, + {'params': fasterRCNN.RCNN_base.RCNN_base_model[2].parameters()}, + {'params': fasterRCNN.RCNN_base.RCNN_rpn.parameters()}, + {'params': fasterRCNN.RCNN_fc6.parameters()}, + {'params': fasterRCNN.RCNN_fc7.parameters()}, + {'params': fasterRCNN.RCNN_cls_score.parameters()}, + {'params': fasterRCNN.RCNN_bbox_pred.parameters()}, + ], lr = lr) + + elif args.optimizer == "sgd": + optimizer = torch.optim.SGD([ + {'params': fasterRCNN.RCNN_base.RCNN_base_model[1].parameters(), 'lr': lr}, + {'params': fasterRCNN.RCNN_base.RCNN_base_model[2].parameters()}, + {'params': fasterRCNN.RCNN_base.RCNN_rpn.parameters()}, + {'params': fasterRCNN.RCNN_fc6.parameters(), 'lr': lr}, + {'params': fasterRCNN.RCNN_fc7.parameters(), 'lr': lr}, + {'params': fasterRCNN.RCNN_cls_score.parameters()}, + {'params': fasterRCNN.RCNN_bbox_pred.parameters()}, + ], lr = lr, momentum=momentum, weight_decay=weight_decay) + + if args.resume: + load_name = os.path.join(output_dir, + 'faster_rcnn_{}_{}_{}.pth'.format(args.checksession, args.checkepoch, args.checkpoint)) + print("loading checkpoint %s" % (load_name)) + checkpoint = torch.load(load_name) + args.session = checkpoint['session'] + args.start_epoch = checkpoint['epoch'] + fasterRCNN.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + print("loaded checkpoint %s" % (load_name)) + + if use_multiGPU: + fasterRCNN.RCNN_base = nn.DataParallel(fasterRCNN.RCNN_base) + + if args.ngpu > 0: + fasterRCNN.cuda() + + for epoch in range(args.start_epoch, args.max_epochs): + loss_temp = 0 + start = time.time() + + data_iter = iter(dataloader) + + for step in range(train_size): + data = data_iter.next() + im_data.data.resize_(data[0].size()).copy_(data[0]) + im_info.data.resize_(data[1].size()).copy_(data[1]) + gt_boxes.data.resize_(data[2].size()).copy_(data[2]) + num_boxes.data.resize_(data[3].size()).copy_(data[3]) + + fasterRCNN.zero_grad() + _, cls_prob, bbox_pred, rpn_loss, rcnn_loss = fasterRCNN(im_data, im_info, gt_boxes, num_boxes) + loss = (rpn_loss.sum() + rcnn_loss.sum()) / rpn_loss.size(0) + loss_temp += loss.data[0] + + # backward + optimizer.zero_grad() + loss.backward() + network.clip_gradient(fasterRCNN, 10.) 
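# network.clip_gradient is assumed here to rescale all parameter gradients so that
# their global L2 norm does not exceed the given threshold (10.0), keeping any
# single SGD step bounded. A minimal, hypothetical sketch of such a helper (not
# necessarily this repo's exact implementation):
#
#   def clip_gradient(model, clip_norm):
#       total_norm = 0.0
#       for p in model.parameters():
#           if p.requires_grad and p.grad is not None:
#               total_norm += float(p.grad.data.norm()) ** 2
#       total_norm = total_norm ** 0.5
#       if total_norm > clip_norm:
#           scale = clip_norm / total_norm
#           for p in model.parameters():
#               if p.requires_grad and p.grad is not None:
#                   p.grad.data.mul_(scale)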
+ optimizer.step() + + if step % args.disp_interval == 0: + if use_multiGPU: + print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr4ft: %.2e, lr4tr: %.2e" \ + % (args.session, epoch, step, loss_temp / args.disp_interval, lr * 0.1, lr)) + print("\t\t\tfg/bg=(%d/%d)" % (0, 0)) + print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f" % (0, 0, 0, 0)) + if args.use_tfboard: + info = { + 'loss': loss_temp / args.disp_interval + } + for tag, value in info.items(): + logger.scalar_summary(tag, value, step) + + else: + print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr4ft: %.2e, lr4tr: %.2e" \ + % (args.session, epoch, step, loss_temp / args.disp_interval, lr * 0.1, lr)) + print("\t\t\tfg/bg=(%d/%d)" % (fasterRCNN.fg_cnt, fasterRCNN.bg_cnt)) + print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box: %.4f" % + (fasterRCNN.RCNN_base.RCNN_rpn.rpn_loss_cls.data[0], \ + fasterRCNN.RCNN_base.RCNN_rpn.rpn_loss_box.data[0], \ + fasterRCNN.RCNN_loss_cls.data[0], \ + fasterRCNN.RCNN_loss_bbox.data[0])) + if args.use_tfboard: + info = { + 'loss': loss_temp / args.disp_interval, + 'loss_rpn_cls': fasterRCNN.RCNN_base.RCNN_rpn.rpn_loss_cls.data[0], + 'loss_rpn_box': fasterRCNN.RCNN_base.RCNN_rpn.rpn_loss_box.data[0], + 'loss_rcnn_cls': fasterRCNN.RCNN_loss_cls.data[0], + 'loss_rcnn_box': fasterRCNN.RCNN_loss_bbox.data[0] + } + for tag, value in info.items(): + logger.scalar_summary(tag, value, step) + + loss_temp = 0 + + if (step % args.checkpoint_interval == 0) and step > 0: + # pdb.set_trace() + save_name = os.path.join(output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step)) + save_checkpoint({ + 'session': args.session, + 'epoch': epoch + 1, + 'model': fasterRCNN.state_dict(), + "optimizer": optimizer.state_dict(), + }, save_name) + print('save model: {}'.format(save_name)) + + + if epoch % args.lr_decay_step == 0: + adjust_learning_rate(optimizer, args.lr_decay_gamma) + lr *= args.lr_decay_gamma + + end = time.time() + print(end - start) From dce4bf3fd82e36e5e1ef4642ecd441ad023c7db4 Mon Sep 17 00:00:00 2001 From: jiasen Date: Sun, 27 Aug 2017 18:40:31 -0400 Subject: [PATCH 04/13] add ratio batch loader --- lib/roi_data_layer/roibatchLoader.py | 3 ++- lib/roi_data_layer/roidb.py | 23 ++++++++++++++--- trainval_net_cascade.py | 37 +++++++++++++++++++++++++--- 3 files changed, 55 insertions(+), 8 deletions(-) diff --git a/lib/roi_data_layer/roibatchLoader.py b/lib/roi_data_layer/roibatchLoader.py index 163a0d534..af61093fa 100644 --- a/lib/roi_data_layer/roibatchLoader.py +++ b/lib/roi_data_layer/roibatchLoader.py @@ -20,7 +20,7 @@ import pdb class roibatchLoader(data.Dataset): - def __init__(self, roidb, num_classes, training=True, normalize=None): + def __init__(self, roidb, ratio_list, num_classes, training=True, normalize=None): self._roidb = roidb self._num_classes = num_classes # we make the height of image consistent to trim_height, trim_width @@ -29,6 +29,7 @@ def __init__(self, roidb, num_classes, training=True, normalize=None): self.max_num_box = 20 self.training = training self.normalize = normalize + self.ratio_list = ratio_list def __getitem__(self, index): diff --git a/lib/roi_data_layer/roidb.py b/lib/roi_data_layer/roidb.py index 37f6b9c51..e1aa0dae0 100644 --- a/lib/roi_data_layer/roidb.py +++ b/lib/roi_data_layer/roidb.py @@ -5,10 +5,9 @@ import numpy as np from model.utils.config import cfg -from model.rpn.bbox_transform import bbox_transform -from model.utils.cython_bbox import bbox_overlaps from datasets.factory import get_imdb 
import PIL +import pdb def prepare_roidb(imdb): """Enrich the imdb's roidb by adding some derived quantities that @@ -17,6 +16,7 @@ def prepare_roidb(imdb): each ground-truth box. The class with maximum overlap is also recorded. """ + roidb = imdb.roidb if not (imdb.name.startswith('coco')): sizes = [PIL.Image.open(imdb.image_path_at(i)).size @@ -44,6 +44,19 @@ def prepare_roidb(imdb): assert all(max_classes[nonzero_inds] != 0) +def rank_roidb_ratio(roidb): + # rank roidb based on the ratio between width and height. + ratio_list = [] + for i in range(len(roidb)): + width = roidb[i]['width'] + height = roidb[i]['height'] + ratio = width / float(height) + ratio_list.append(ratio) + + ratio_list = np.array(ratio_list) + ratio_index = np.argsort(ratio_list) + return ratio_index + def combined_roidb(imdb_names): """ Combine multiple roidbs @@ -59,6 +72,7 @@ def get_training_roidb(imdb): print('Preparing training data...') prepare_roidb(imdb) + #ratio_index = rank_roidb_ratio(imdb) print('done') return imdb.roidb @@ -73,6 +87,9 @@ def get_roidb(imdb_name): roidbs = [get_roidb(s) for s in imdb_names.split('+')] roidb = roidbs[0] + + ratio_list = rank_roidb_ratio(roidb) + if len(roidbs) > 1: for r in roidbs[1:]: roidb.extend(r) @@ -80,4 +97,4 @@ def get_roidb(imdb_name): imdb = datasets.imdb.imdb(imdb_names, tmp.classes) else: imdb = get_imdb(imdb_names) - return imdb, roidb + return imdb, roidb, ratio_list diff --git a/trainval_net_cascade.py b/trainval_net_cascade.py index 10418da59..8e54e83bf 100644 --- a/trainval_net_cascade.py +++ b/trainval_net_cascade.py @@ -22,6 +22,7 @@ import torch.optim as optim import torchvision.transforms as transforms +from torch.utils.data.sampler import Sampler from roi_data_layer.roidb import combined_roidb from roi_data_layer.roibatchLoader import roibatchLoader @@ -70,6 +71,9 @@ def parse_args(): help='number of gpu', default=1, type=int) + parser.add_argument('--bs', dest='batch_size', + help='batch_size', + default=1, type=int) # config optimization parser.add_argument('--o', dest='optimizer', @@ -113,6 +117,31 @@ def parse_args(): return args +class sampler(Sampler): + def __init__(self, data_source, batch_size): + num_data = len(data_source) + self.num_batch = int(num_data / batch_size) + self.batch_size = batch_size + self.range = torch.arange(0,batch_size).view(1, batch_size).long() + self.leftover_flag = False + if num_data % batch_size: + self.leftover = torch.arange(self.num_batch*batch_size, num_data).long() + self.leftover_flag = True + + def __iter__(self): + rand_num = torch.randperm(self.num_batch).view(-1,1)\ + .expand(self.num_batch, self.batch_size) + self.range + rand_num = rand_num.view(-1) + + if self.leftover_flag: + rand_num = torch.cat((rand_num, self.leftover),0) + + return iter(rand_num) + + def __len__(self): + return len(self.data_source) + + lr = cfg.TRAIN.LEARNING_RATE momentum = cfg.TRAIN.MOMENTUM weight_decay = cfg.TRAIN.WEIGHT_DECAY @@ -157,7 +186,7 @@ def parse_args(): # train set # -- Note: Use validation set and disable the flipped to enable faster loading. 
cfg.TRAIN.USE_FLIPPED = True - imdb, roidb = combined_roidb(args.imdb_name) + imdb, roidb, ratio_list = combined_roidb(args.imdb_name) train_size = len(roidb) print('{:d} roidb entries'.format(len(roidb))) @@ -166,11 +195,11 @@ def parse_args(): if not os.path.exists(output_dir): os.makedirs(output_dir) - dataset = roibatchLoader(roidb, imdb.num_classes, training=False, + dataset = roibatchLoader(roidb, ratio_list, imdb.num_classes, training=False, normalize = False) - dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, - shuffle=True, num_workers=0) + dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, + sampler=sampler(dataset, args.batch_size), num_workers=0) # initilize the tensor holder here. im_data = torch.FloatTensor(1) From b0c7219c3d94c86f3ad4902a3d4a74e66577ba98 Mon Sep 17 00:00:00 2001 From: jwyang Date: Mon, 28 Aug 2017 23:04:12 -0400 Subject: [PATCH 05/13] batching data according to aspect ratios, and training in batch --- aspect_ratio.py | 268 ++++++++++++++++++++++++++++++++++++++++ demo.py | 12 +- test_net.py | 17 ++- test_net_cascade.py | 12 +- trainval_net.py | 77 ++++++++---- trainval_net_cascade.py | 37 +++--- 6 files changed, 361 insertions(+), 62 deletions(-) create mode 100644 aspect_ratio.py diff --git a/aspect_ratio.py b/aspect_ratio.py new file mode 100644 index 000000000..5d81f531b --- /dev/null +++ b/aspect_ratio.py @@ -0,0 +1,268 @@ +# -------------------------------------------------------- +# Pytorch multi-GPU Faster R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Jiasen Lu, Jianwei Yang, based on code from Ross Girshick +# -------------------------------------------------------- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import _init_paths +import os +import sys +import numpy as np +import argparse +import pprint +import pdb +import time + +import torch +from torch.autograd import Variable +import torch.nn as nn +import torch.optim as optim + +import torchvision.transforms as transforms + +from roi_data_layer.roidb import combined_roidb +from roi_data_layer.roibatchLoader import roibatchLoader +from model.utils.config import cfg, cfg_from_file, cfg_from_list, get_output_dir +from model.utils import network +from model.utils.network import weights_normal_init, save_net, load_net, \ + adjust_learning_rate, save_checkpoint + +from model.faster_rcnn.faster_rcnn_cascade import _fasterRCNN +import pdb + +def parse_args(): + """ + Parse input arguments + """ + parser = argparse.ArgumentParser(description='Train a Fast R-CNN network') + parser.add_argument('--dataset', dest='dataset', + help='training dataset', + default='pascal_voc', type=str) + parser.add_argument('--net', dest='net', + help='vgg16, res50, res101, res152', + default='vgg16', type=str) + parser.add_argument('--imdb', dest='imdb_name', + help='dataset to train on', + default='voc_2007_trainval', type=str) + parser.add_argument('--imdbval', dest='imdbval_name', + help='dataset to validate on', + default='voc_2007_test', type=str) + parser.add_argument('--start_epoch', dest='start_epoch', + help='starting epoch', + default=1, type=int) + parser.add_argument('--epochs', dest='max_epochs', + help='number of epochs to train', + default=20, type=int) + parser.add_argument('--disp_interval', dest='disp_interval', + help='number of iterations to display', + default=100, type=int) + parser.add_argument('--checkpoint_interval', dest='checkpoint_interval', + 
help='number of iterations to display', + default=10000, type=int) + + parser.add_argument('--save_dir', dest='save_dir', + help='directory to save models', default="models", + nargs=argparse.REMAINDER) + parser.add_argument('--ngpu', dest='ngpu', + help='number of gpu', + default=1, type=int) + + +# config optimization + parser.add_argument('--o', dest='optimizer', + help='training optimizer', + default="sgd", type=str) + parser.add_argument('--lr_decay_step', dest='lr_decay_step', + help='step to do learning rate decay, unit is epoch', + default=5, type=int) + parser.add_argument('--lr_decay_gamma', dest='lr_decay_gamma', + help='learning rate decay ratio', + default=0.1, type=float) + +# set training session + parser.add_argument('--s', dest='session', + help='training session', + default=1, type=int) + +# resume trained model + parser.add_argument('--r', dest='resume', + help='resume checkpoint or not', + default=False, type=bool) + parser.add_argument('--checksession', dest='checksession', + help='checksession to load model', + default=1, type=int) + parser.add_argument('--checkepoch', dest='checkepoch', + help='checkepoch to load model', + default=1, type=int) + parser.add_argument('--checkpoint', dest='checkpoint', + help='checkpoint to load model', + default=10000, type=int) +# log and diaplay + parser.add_argument('--use_tfboard', dest='use_tfboard', + help='whether use tensorflow tensorboard', + default=False, type=bool) + + # if len(sys.argv) == 1: + # parser.print_help() + # sys.exit(1) + + args = parser.parse_args() + return args + + +lr = cfg.TRAIN.LEARNING_RATE +momentum = cfg.TRAIN.MOMENTUM +weight_decay = cfg.TRAIN.WEIGHT_DECAY +use_multiGPU = False + +if __name__ == '__main__': + + args = parse_args() + + print('Called with args:') + print(args) + + if args.use_tfboard: + from model.utils.logger import Logger + # Set the logger + logger = Logger('./logs') + + if args.dataset == "pascal_voc": + args.imdb_name = "voc_2007_trainval" + args.imdbval_name = "voc_2007_test" + args.set_cfgs = ['ANCHOR_SCALES', '[8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]'] + elif args.dataset == "pascal_voc_0712": + args.imdb_name = "voc_2007_trainval+voc_2012_trainval" + args.imdbval_name = "voc_2007_test" + args.set_cfgs = ['ANCHOR_SCALES', '[8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]'] + elif args.dataset == "coco": + args.imdb_name = "coco_2014_train+coco_2014_valminusminival" + args.imdbval_name = "coco_2014_minival" + args.set_cfgs = ['ANCHOR_SCALES', '[4, 8, 16, 32]', 'ANCHOR_RATIOS', '[0.5,1,2]'] + + args.cfg_file = "cfgs/{}.yml".format(args.net) + + if args.cfg_file is not None: + cfg_from_file(args.cfg_file) + if args.set_cfgs is not None: + cfg_from_list(args.set_cfgs) + + print('Using config:') + pprint.pprint(cfg) + np.random.seed(cfg.RNG_SEED) + + # train set + # -- Note: Use validation set and disable the flipped to enable faster loading. + cfg.TRAIN.USE_FLIPPED = True + imdb, roidb = combined_roidb(args.imdb_name) + train_size = len(roidb) + + print('{:d} roidb entries'.format(len(roidb))) + + output_dir = args.save_dir + "/" + args.net + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + dataset = roibatchLoader(roidb, imdb.num_classes, training=False, + normalize = False) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, + shuffle=True, num_workers=0) + + # initilize the tensor holder here. 
+ im_data = torch.FloatTensor(1) + im_info = torch.FloatTensor(1) + num_boxes = torch.LongTensor(1) + gt_boxes = torch.FloatTensor(1) + + # ship to cuda + if args.ngpu > 0: + im_data = im_data.cuda() + im_info = im_info.cuda() + num_boxes = num_boxes.cuda() + gt_boxes = gt_boxes.cuda() + + # make variable + im_data = Variable(im_data) + im_info = Variable(im_info) + num_boxes = Variable(num_boxes) + gt_boxes = Variable(gt_boxes) + + if args.ngpu > 0: + cfg.CUDA = True + + # initilize the network here. + fasterRCNN = _fasterRCNN(args.net, imdb.classes) + # weights_normal_init(fasterRCNN) + weights_normal_init(fasterRCNN.RCNN_base.RCNN_rpn.RPN_ConvReLU) + weights_normal_init(fasterRCNN.RCNN_base.RCNN_rpn.RPN_cls_score) + weights_normal_init(fasterRCNN.RCNN_base.RCNN_rpn.RPN_bbox_pred) + weights_normal_init(fasterRCNN.RCNN_cls_score) + weights_normal_init(fasterRCNN.RCNN_bbox_pred, 0.001) + + params = list(fasterRCNN.parameters()) + + if args.optimizer == "adam": + lr = lr * 0.1 + optimizer = torch.optim.Adam([ + {'params': fasterRCNN.RCNN_base.RCNN_base_model[1].parameters(), 'lr': lr}, + {'params': fasterRCNN.RCNN_base.RCNN_base_model[2].parameters()}, + {'params': fasterRCNN.RCNN_base.RCNN_rpn.parameters()}, + {'params': fasterRCNN.RCNN_fc6.parameters()}, + {'params': fasterRCNN.RCNN_fc7.parameters()}, + {'params': fasterRCNN.RCNN_cls_score.parameters()}, + {'params': fasterRCNN.RCNN_bbox_pred.parameters()}, + ], lr = lr) + + elif args.optimizer == "sgd": + optimizer = torch.optim.SGD([ + {'params': fasterRCNN.RCNN_base.RCNN_base_model[1].parameters(), 'lr': lr}, + {'params': fasterRCNN.RCNN_base.RCNN_base_model[2].parameters()}, + {'params': fasterRCNN.RCNN_base.RCNN_rpn.parameters()}, + {'params': fasterRCNN.RCNN_fc6.parameters(), 'lr': lr}, + {'params': fasterRCNN.RCNN_fc7.parameters(), 'lr': lr}, + {'params': fasterRCNN.RCNN_cls_score.parameters()}, + {'params': fasterRCNN.RCNN_bbox_pred.parameters()}, + ], lr = lr, momentum=momentum, weight_decay=weight_decay) + + if args.resume: + load_name = os.path.join(output_dir, + 'faster_rcnn_{}_{}_{}.pth'.format(args.checksession, args.checkepoch, args.checkpoint)) + print("loading checkpoint %s" % (load_name)) + checkpoint = torch.load(load_name) + args.session = checkpoint['session'] + args.start_epoch = checkpoint['epoch'] + fasterRCNN.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + print("loaded checkpoint %s" % (load_name)) + + if use_multiGPU: + fasterRCNN.RCNN_base = nn.DataParallel(fasterRCNN.RCNN_base) + + if args.ngpu > 0: + fasterRCNN.cuda() + + loss_temp = 0 + start = time.time() + + data_iter = iter(dataloader) + + aspect_ratio = torch.FloatTensor(train_size).zero_() + for step in range(train_size): + data = data_iter.next() + im_data.data.resize_(data[0].size()).copy_(data[0]) + im_info.data.resize_(data[1].size()).copy_(data[1]) + gt_boxes.data.resize_(data[2].size()).copy_(data[2]) + num_boxes.data.resize_(data[3].size()).copy_(data[3]) + + # aspect_ratio = height / width + aspect_ratio[step] = data[1][0][0] / data[1][0][1] + + + pdb.set_trace() + + end = time.time() + print(end - start) diff --git a/demo.py b/demo.py index 7c65f4e26..df9a6bfec 100644 --- a/demo.py +++ b/demo.py @@ -197,16 +197,18 @@ def _get_image_blob(im): print('Loaded Photo: {} images.'.format(num_images)) + im_file_target = os.path.join(args.image_dir, imglist[0]) + im_target = cv2.imread(im_file_target) for i in range(num_images): # Load the demo image im_file = os.path.join(args.image_dir, imglist[i]) - # im = 
cv2.imread(im_file) - im = np.array(Image.open(im_file)) - if len(im.shape) == 2: - im = im[:,:,np.newaxis] - im = np.concatenate((im,im,im), axis=2) + im = cv2.imread(im_file) + # im = np.array(Image.open(im_file)) + # if len(im.shape) == 2: + # im = im[:,:,np.newaxis] + # im = np.concatenate((im,im,im), axis=2) blobs, im_scales = _get_image_blob(im) assert len(im_scales) == 1, "Only single-image batch implemented" diff --git a/test_net.py b/test_net.py index be4947d7f..444f19be6 100644 --- a/test_net.py +++ b/test_net.py @@ -70,6 +70,9 @@ def parse_args(): parser.add_argument('--checkpoint', dest='checkpoint', help='checkpoint to load network', default=10000, type=int) + parser.add_argument('--bs', dest='batch_size', + help='batch_size', + default=1, type=int) args = parser.parse_args() return args @@ -97,7 +100,7 @@ def parse_args(): # train set # -- Note: Use validation set and disable the flipped to enable faster loading. cfg.TRAIN.USE_FLIPPED = False - imdb, roidb = combined_roidb(args.imdbval_name) + imdb, roidb, ratio_list, ratio_index = combined_roidb(args.imdbval_name) imdb.competition_mode(on=True) print('{:d} roidb entries'.format(len(roidb))) @@ -156,16 +159,10 @@ def parse_args(): output_dir = get_output_dir(imdb, save_name) + dataset = roibatchLoader(roidb, ratio_list, ratio_index, args.batch_size, \ + imdb.num_classes, training=False, normalize = False) - # dataset = roibatchLoader(roidb, imdb.num_classes, training=False, - # normalize = transforms.Normalize( - # mean=[0.485, 0.456, 0.406], - # std=[0.229, 0.224, 0.225])) - - dataset = roibatchLoader(roidb, imdb.num_classes, training=False, - normalize = False) - - dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, + dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=True) diff --git a/test_net_cascade.py b/test_net_cascade.py index 2e99186cd..b6fe00876 100644 --- a/test_net_cascade.py +++ b/test_net_cascade.py @@ -70,7 +70,9 @@ def parse_args(): parser.add_argument('--checkpoint', dest='checkpoint', help='checkpoint to load network', default=10000, type=int) - + parser.add_argument('--bs', dest='batch_size', + help='batch_size', + default=1, type=int) args = parser.parse_args() return args @@ -97,7 +99,7 @@ def parse_args(): # train set # -- Note: Use validation set and disable the flipped to enable faster loading. 
cfg.TRAIN.USE_FLIPPED = False - imdb, roidb = combined_roidb(args.imdbval_name) + imdb, roidb, ratio_list, ratio_index = combined_roidb(args.imdbval_name) imdb.competition_mode(on=True) print('{:d} roidb entries'.format(len(roidb))) @@ -162,10 +164,10 @@ def parse_args(): # mean=[0.485, 0.456, 0.406], # std=[0.229, 0.224, 0.225])) - dataset = roibatchLoader(roidb, imdb.num_classes, training=False, - normalize = False) + dataset = roibatchLoader(roidb, ratio_list, ratio_index, args.batch_size, \ + imdb.num_classes, training=False, normalize = False) - dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, + dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=True) diff --git a/trainval_net.py b/trainval_net.py index e3077ee1b..1ad603284 100644 --- a/trainval_net.py +++ b/trainval_net.py @@ -22,7 +22,7 @@ import torch.optim as optim import torchvision.transforms as transforms - +from torch.utils.data.sampler import Sampler from roi_data_layer.roidb import combined_roidb from roi_data_layer.roibatchLoader import roibatchLoader from model.utils.config import cfg, cfg_from_file, cfg_from_list, get_output_dir @@ -69,7 +69,9 @@ def parse_args(): parser.add_argument('--ngpu', dest='ngpu', help='number of gpu', default=1, type=int) - + parser.add_argument('--bs', dest='batch_size', + help='batch_size', + default=1, type=int) # config optimization parser.add_argument('--o', dest='optimizer', @@ -118,6 +120,30 @@ def parse_args(): weight_decay = cfg.TRAIN.WEIGHT_DECAY use_multiGPU = False +class sampler(Sampler): + def __init__(self, train_size, batch_size): + num_data = train_size + self.num_per_batch = int(num_data / batch_size) + self.batch_size = batch_size + self.range = torch.arange(0,batch_size).view(1, batch_size).long() + self.leftover_flag = False + if num_data % batch_size: + self.leftover = torch.arange(self.num_per_batch*batch_size, num_data).long() + self.leftover_flag = True + def __iter__(self): + rand_num = torch.randperm(self.num_per_batch).view(-1,1) * self.batch_size + self.rand_num = rand_num.expand(self.num_per_batch, self.batch_size) + self.range + + self.rand_num_view = self.rand_num.view(-1) + + if self.leftover_flag: + self.rand_num_view = torch.cat((self.rand_num_view, self.leftover),0) + + return iter(self.rand_num_view) + + def __len__(self): + return num_data + if __name__ == '__main__': args = parse_args() @@ -157,7 +183,7 @@ def parse_args(): # train set # -- Note: Use validation set and disable the flipped to enable faster loading. cfg.TRAIN.USE_FLIPPED = True - imdb, roidb = combined_roidb(args.imdb_name) + imdb, roidb, ratio_list, ratio_index = combined_roidb(args.imdb_name) train_size = len(roidb) print('{:d} roidb entries'.format(len(roidb))) @@ -166,11 +192,13 @@ def parse_args(): if not os.path.exists(output_dir): os.makedirs(output_dir) - dataset = roibatchLoader(roidb, imdb.num_classes, training=True, - normalize = False) + sampler_batch = sampler(train_size, args.batch_size) - dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, - shuffle=True, num_workers=0) + dataset = roibatchLoader(roidb, ratio_list, ratio_index, args.batch_size, \ + imdb.num_classes, training=True, normalize = False) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, + sampler=sampler_batch, num_workers=args.batch_size) # initilize the tensor holder here. 
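# A minimal, self-contained sketch (sizes illustrative) of the index pattern the
# sampler above yields: whole blocks of batch_size consecutive indices are shuffled,
# but the order inside each block is kept, so pairing it with ratio_index (images
# sorted by aspect ratio) keeps every minibatch at a similar aspect ratio.
import torch
batch_size, num_per_batch = 4, 2                     # i.e. train_size = 8
rng = torch.arange(0, batch_size).view(1, batch_size).long()
rand = torch.randperm(num_per_batch).long().view(-1, 1) * batch_size
print((rand.expand(num_per_batch, batch_size) + rng).view(-1))
# e.g. 4, 5, 6, 7, 0, 1, 2, 3  -- blocks shuffled, indices consecutive inside each block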
im_data = torch.FloatTensor(1) @@ -251,13 +279,13 @@ def parse_args(): data_iter = iter(dataloader) - for step in range(train_size): + for step in range(int(train_size / args.batch_size)): data = data_iter.next() im_data.data.resize_(data[0].size()).copy_(data[0]) im_info.data.resize_(data[1].size()).copy_(data[1]) gt_boxes.data.resize_(data[2].size()).copy_(data[2]) num_boxes.data.resize_(data[3].size()).copy_(data[3]) - + fasterRCNN.zero_grad() _, cls_prob, bbox_pred, rpn_loss, rcnn_loss = fasterRCNN(im_data, im_info, gt_boxes, num_boxes) loss = (rpn_loss.sum() + rcnn_loss.sum()) / rpn_loss.size(0) @@ -271,8 +299,8 @@ def parse_args(): if step % args.disp_interval == 0: if use_multiGPU: - print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr4ft: %.2e, lr4tr: %.2e" \ - % (args.session, epoch, step, loss_temp / args.disp_interval, lr * 0.1, lr)) + print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr: %.2e" \ + % (args.session, epoch, step, loss_temp / args.disp_interval, lr)) print("\t\t\tfg/bg=(%d/%d)" % (0, 0)) print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f" % (0, 0, 0, 0)) if args.use_tfboard: @@ -283,8 +311,8 @@ def parse_args(): logger.scalar_summary(tag, value, step) else: - print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr4ft: %.2e, lr4tr: %.2e" \ - % (args.session, epoch, step, loss_temp / args.disp_interval, lr * 0.1, lr)) + print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr: %.2e" \ + % (args.session, epoch, step, loss_temp / args.disp_interval, lr)) print("\t\t\tfg/bg=(%d/%d)" % (fasterRCNN.fg_cnt, fasterRCNN.bg_cnt)) print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box: %.4f" % (fasterRCNN.RCNN_base.RCNN_rpn.rpn_loss_cls.data[0], \ @@ -304,21 +332,20 @@ def parse_args(): loss_temp = 0 - if (step % args.checkpoint_interval == 0) and step > 0: - # pdb.set_trace() - save_name = os.path.join(output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step)) - save_checkpoint({ - 'session': args.session, - 'epoch': epoch + 1, - 'model': fasterRCNN.state_dict(), - "optimizer": optimizer.state_dict(), - }, save_name) - print('save model: {}'.format(save_name)) - - if epoch % args.lr_decay_step == 0: adjust_learning_rate(optimizer, args.lr_decay_gamma) lr *= args.lr_decay_gamma + + save_name = os.path.join(output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step)) + save_checkpoint({ + 'session': args.session, + 'epoch': epoch + 1, + 'model': fasterRCNN.state_dict(), + "optimizer": optimizer.state_dict(), + }, save_name) + print('save model: {}'.format(save_name)) + + end = time.time() print(end - start) diff --git a/trainval_net_cascade.py b/trainval_net_cascade.py index 8e54e83bf..59e28d62f 100644 --- a/trainval_net_cascade.py +++ b/trainval_net_cascade.py @@ -73,7 +73,7 @@ def parse_args(): parser.add_argument('--bs', dest='batch_size', help='batch_size', - default=1, type=int) + default=4, type=int) # config optimization parser.add_argument('--o', dest='optimizer', @@ -118,28 +118,28 @@ def parse_args(): class sampler(Sampler): - def __init__(self, data_source, batch_size): - num_data = len(data_source) - self.num_batch = int(num_data / batch_size) + def __init__(self, train_size, batch_size): + num_data = train_size + self.num_per_batch = int(num_data / batch_size) self.batch_size = batch_size self.range = torch.arange(0,batch_size).view(1, batch_size).long() self.leftover_flag = False if num_data % batch_size: - self.leftover = torch.arange(self.num_batch*batch_size, num_data).long() + self.leftover = 
torch.arange(self.num_per_batch*batch_size, num_data).long() self.leftover_flag = True - def __iter__(self): - rand_num = torch.randperm(self.num_batch).view(-1,1)\ - .expand(self.num_batch, self.batch_size) + self.range - rand_num = rand_num.view(-1) + rand_num = torch.randperm(self.num_per_batch).view(-1,1) * self.batch_size + self.rand_num = rand_num.expand(self.num_per_batch, self.batch_size) + self.range + + self.rand_num_view = self.rand_num.view(-1) if self.leftover_flag: - rand_num = torch.cat((rand_num, self.leftover),0) + self.rand_num_view = torch.cat((self.rand_num_view, self.leftover),0) - return iter(rand_num) + return iter(self.rand_num_view) def __len__(self): - return len(self.data_source) + return num_data lr = cfg.TRAIN.LEARNING_RATE @@ -186,7 +186,7 @@ def __len__(self): # train set # -- Note: Use validation set and disable the flipped to enable faster loading. cfg.TRAIN.USE_FLIPPED = True - imdb, roidb, ratio_list = combined_roidb(args.imdb_name) + imdb, roidb, ratio_list, ratio_index = combined_roidb(args.imdb_name) train_size = len(roidb) print('{:d} roidb entries'.format(len(roidb))) @@ -195,11 +195,13 @@ def __len__(self): if not os.path.exists(output_dir): os.makedirs(output_dir) - dataset = roibatchLoader(roidb, ratio_list, imdb.num_classes, training=False, - normalize = False) + sampler_batch = sampler(train_size, args.batch_size) + + dataset = roibatchLoader(roidb, ratio_list, ratio_index, args.batch_size, \ + imdb.num_classes, training=True, normalize = False) dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, - sampler=sampler(dataset, args.batch_size), num_workers=0) + sampler=sampler_batch, num_workers=2) # initilize the tensor holder here. im_data = torch.FloatTensor(1) @@ -280,8 +282,9 @@ def __len__(self): data_iter = iter(dataloader) - for step in range(train_size): + for step in range(int(train_size / args.batch_size)): data = data_iter.next() + im_data.data.resize_(data[0].size()).copy_(data[0]) im_info.data.resize_(data[1].size()).copy_(data[1]) gt_boxes.data.resize_(data[2].size()).copy_(data[2]) From 20121d4772f3c7c9bf4851101229b573c467684b Mon Sep 17 00:00:00 2001 From: jwyang Date: Mon, 28 Aug 2017 23:09:36 -0400 Subject: [PATCH 06/13] batching data according to aspect ratios, and training in batch --- lib/model/faster_rcnn/faster_rcnn.py | 11 +- lib/model/faster_rcnn/faster_rcnn_cascade.py | 7 - lib/roi_data_layer/roibatchLoader.py | 128 +++++++------------ lib/roi_data_layer/roidb.py | 6 +- 4 files changed, 55 insertions(+), 97 deletions(-) diff --git a/lib/model/faster_rcnn/faster_rcnn.py b/lib/model/faster_rcnn/faster_rcnn.py index a875e0556..f725f36b2 100644 --- a/lib/model/faster_rcnn/faster_rcnn.py +++ b/lib/model/faster_rcnn/faster_rcnn.py @@ -73,10 +73,15 @@ def forward(self, im_data, im_info, gt_boxes, num_boxes): rpn_loss_cls = 0 rpn_loss_bbox = 0 - rois_var = Variable(rois.view(-1,5)) - - # do roi pooling based on predicted rois + # pooled_feats = [] + # for i in range(rois.size(0)): + # rois_var_i = Variable(rois[i]) + # base_feat_i = base_feat[i] + # pooled_feat = self.RCNN_roi_pool(base_feat_i.unsqueeze(0), rois_var_i) + # pooled_feats.append(pooled_feat) + # pooled_feat_all = torch.cat(pooled_feats, 0) + rois_var = Variable(rois.view(-1,5)) pooled_feat = self.RCNN_roi_pool(base_feat, rois_var) pooled_feat_all = pooled_feat.view(pooled_feat.size(0), -1) diff --git a/lib/model/faster_rcnn/faster_rcnn_cascade.py b/lib/model/faster_rcnn/faster_rcnn_cascade.py index 0d51d7858..fe1587307 100644 --- 
a/lib/model/faster_rcnn/faster_rcnn_cascade.py +++ b/lib/model/faster_rcnn/faster_rcnn_cascade.py @@ -182,16 +182,9 @@ def forward(self, im_data, im_info, gt_boxes, num_boxes): self.fg_cnt = torch.sum(label.data.ne(0)) self.bg_cnt = label.data.numel() - self.fg_cnt - ce_weights = rois_label.data.new(cls_score.size(1)).fill_(1) - ce_weights[0] = float(self.fg_cnt) / self.bg_cnt - - # self.RCNN_loss_cls = F.cross_entropy(cls_score, label, weight=ce_weights) - self.RCNN_loss_cls = F.cross_entropy(cls_score, label) # bounding box regression L1 loss - # rois_target = torch.mul(rois_target, rois_inside_ws) - # bbox_pred = torch.mul(bbox_pred, rois_inside_ws) self.RCNN_loss_bbox = _smooth_l1_loss(bbox_pred, rois_target, rois_inside_ws, rois_outside_ws) rcnn_loss = self.RCNN_loss_cls + self.RCNN_loss_bbox diff --git a/lib/roi_data_layer/roibatchLoader.py b/lib/roi_data_layer/roibatchLoader.py index af61093fa..67ed13ca5 100644 --- a/lib/roi_data_layer/roibatchLoader.py +++ b/lib/roi_data_layer/roibatchLoader.py @@ -20,7 +20,7 @@ import pdb class roibatchLoader(data.Dataset): - def __init__(self, roidb, ratio_list, num_classes, training=True, normalize=None): + def __init__(self, roidb, ratio_list, ratio_index, batch_size, num_classes, training=True, normalize=None): self._roidb = roidb self._num_classes = num_classes # we make the height of image consistent to trim_height, trim_width @@ -30,10 +30,21 @@ def __init__(self, roidb, ratio_list, num_classes, training=True, normalize=None self.training = training self.normalize = normalize self.ratio_list = ratio_list + self.ratio_index = ratio_index + self.batch_size = batch_size + self.data_size = len(self.ratio_list) def __getitem__(self, index): + if self.training: + index_ratio = int(self.ratio_index[index]) + else: + index_ratio = index + + # get the anchor index for current sample index + # here we set the anchor index to the last one + # sample in this group - minibatch_db = [self._roidb[index]] + minibatch_db = [self._roidb[index_ratio]] blobs = get_minibatch(minibatch_db, self._num_classes) data = torch.from_numpy(blobs['data']) im_info = torch.from_numpy(blobs['im_info']) @@ -43,95 +54,44 @@ def __getitem__(self, index): data_height, data_width = data.size(1), data.size(2) if self.training: - ################################################## - # we crop the input image to fixed size randomly # - ################################################## - # trim_data = torch.FloatTensor(1, self.trim_height, self.trim_width, 3) - if data_height > data_width: - # if height > width, then crop on height - # randomly generate an y start point - # while True: - # assign score to y axis - y_score = torch.FloatTensor(data_height).zero_() - for i in range(gt_boxes.size(0)): - rg = torch.arange(int(gt_boxes[i, 1]), int(gt_boxes[i, 3])) - score = -(rg - gt_boxes[i, 1]) * (rg - gt_boxes[i, 3]) / (gt_boxes[i, 3] - gt_boxes[i, 1])**2 - y_score[int(gt_boxes[i, 1]):int(gt_boxes[i, 3])] += score - - # find the inds with maximal score in y_score - if data_height > self.trim_height: - - ys = torch.arange(0, data_height - self.trim_height, 5).long() - y_score_cum = torch.FloatTensor(ys.size()).zero_() - - for i in range(ys.size(0)): - s = ys[i] - y_score_cum[i] = y_score[s:s + self.trim_height].sum() - - _, order = torch.sort(y_score_cum, 0, True) - - ys_ordered = ys[order] - rand_num = torch.randint(min(5, ys_ordered.size(0))) - - ys = ys_ordered[rand_num] - ys = min(ys, data_width - self.trim_width) - else: - y_s = 0 - - trim_data = data[:, y_s:(y_s + 
self.trim_height), :] - - # shift y coordiante of gt_boxes - gt_boxes[:, 1] = gt_boxes[:, 1] - y_s - gt_boxes[:, 3] = gt_boxes[:, 3] - y_s - - # update gt bounding box according the trip - gt_boxes[:, 1].clamp_(0, self.trim_height - 1) - gt_boxes[:, 3].clamp_(0, self.trim_height - 1) + ######################################################## + # padding the input image to fixed size for each group # + ######################################################## - # update im_info - im_info[0, 0] = self.trim_height - - elif data_height <= data_width: - # if height <= width, then crop on width - # while True: + # NOTE1: need to cope with the case where a group cover both conditions. + # NOTE2: need to consider the situation for the tail samples. + # NOTE3: need to implement a parallel data loader + if data_height > (data_width): - # assign score to y axis - x_score = torch.FloatTensor(data_width).zero_() - for i in range(gt_boxes.size(0)): - rg = torch.arange(int(gt_boxes[i, 0]), int(gt_boxes[i, 2])) - score = -(rg - gt_boxes[i, 0]) * (rg - gt_boxes[i, 2]) / (gt_boxes[i, 2] - gt_boxes[i, 0])**2 - x_score[int(gt_boxes[i, 0]):int(gt_boxes[i, 2])] += score - # find the inds with maximal score in y_score - if data_width > self.trim_width: - xs = torch.arange(0, data_width - self.trim_width, 5).long() - x_score_cum = torch.FloatTensor(xs.size()).zero_() + anchor_idx = (np.floor((index) / self.batch_size)) * self.batch_size + anchor_idx = min(int(anchor_idx), self.data_size - 1) + ratio = self.ratio_list[anchor_idx] - for i in range(xs.size(0)): - s = xs[i] - x_score_cum[i] = x_score[s:s + self.trim_width].sum() + padding_data = torch.FloatTensor(int(np.ceil(data_width / ratio)), \ + data_width, 3).zero_() - _, order = torch.sort(x_score_cum, 0, True) + padding_data[:data_height, :, :] = data[0] - xs_ordered = xs[order] - rand_num = torch.randint(min(5, xs_ordered.size(0))) + # update im_info + im_info[0, 0] = padding_data.size(0) - xs = xs_ordered[rand_num] - xs = min(xs, data_width - self.trim_width) - else: - x_s = 0 + # print("height %d %d \n" %(index, anchor_idx)) - trim_data = data[:, :, x_s:(x_s + self.trim_width), :] + elif (data_height <= data_width): + + anchor_idx = (np.ceil((index + 1) / self.batch_size)) * self.batch_size - 1 + anchor_idx = min(int(anchor_idx), self.data_size - 1) + ratio = self.ratio_list[anchor_idx] - # shift x coordiante of gt_boxes - gt_boxes[:, 0] = gt_boxes[:, 0] - x_s - gt_boxes[:, 2] = gt_boxes[:, 2] - x_s + padding_data = torch.FloatTensor(data_height, \ + int(np.ceil(data_height * ratio)), 3).zero_() - # update gt bounding box according the trip - gt_boxes[:, 0].clamp_(0, self.trim_width - 1) - gt_boxes[:, 2].clamp_(0, self.trim_width - 1) + padding_data[:, :data_width, :] = data[0] - im_info[0, 1] = self.trim_width + im_info[0, 1] = padding_data.size(1) + + # print("width %d %d \n" %(index, anchor_idx)) num_boxes = min(gt_boxes.size(0), self.max_num_box) @@ -140,14 +100,14 @@ def __getitem__(self, index): gt_boxes_padding[:num_boxes,:] = gt_boxes[:num_boxes] # permute trim_data to adapt to downstream processing - trim_data = trim_data.permute(0, 3, 1, 2).contiguous().view(3, self.trim_height, self.trim_width) + padding_data = padding_data.permute(2, 0, 1).contiguous() im_info = im_info.view(3) if self.normalize: - trim_data = trim_data / 255.0 - trim_data = self.normalize(trim_data) + padding_data = padding_data / 255.0 + padding_data = self.normalize(padding_data) - return trim_data, im_info, gt_boxes, num_boxes + return padding_data, im_info, 
gt_boxes_padding, num_boxes else: data = data.permute(0, 3, 1, 2).contiguous().view(3, data_height, data_width) num_boxes = gt_boxes.size(0) diff --git a/lib/roi_data_layer/roidb.py b/lib/roi_data_layer/roidb.py index e1aa0dae0..330927c6f 100644 --- a/lib/roi_data_layer/roidb.py +++ b/lib/roi_data_layer/roidb.py @@ -55,7 +55,7 @@ def rank_roidb_ratio(roidb): ratio_list = np.array(ratio_list) ratio_index = np.argsort(ratio_list) - return ratio_index + return ratio_list[ratio_index], ratio_index def combined_roidb(imdb_names): """ @@ -88,7 +88,7 @@ def get_roidb(imdb_name): roidbs = [get_roidb(s) for s in imdb_names.split('+')] roidb = roidbs[0] - ratio_list = rank_roidb_ratio(roidb) + ratio_list, ratio_index = rank_roidb_ratio(roidb) if len(roidbs) > 1: for r in roidbs[1:]: @@ -97,4 +97,4 @@ def get_roidb(imdb_name): imdb = datasets.imdb.imdb(imdb_names, tmp.classes) else: imdb = get_imdb(imdb_names) - return imdb, roidb, ratio_list + return imdb, roidb, ratio_list, ratio_index From 71979d55f1f17911cb593fd238d88826cfc190b7 Mon Sep 17 00:00:00 2001 From: jwyang Date: Mon, 28 Aug 2017 23:16:58 -0400 Subject: [PATCH 07/13] minor change to proposal_layer --- lib/model/rpn/proposal_layer.py | 4 +--- trainval_net.py | 8 +++++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/model/rpn/proposal_layer.py b/lib/model/rpn/proposal_layer.py index 4a1df9ba5..65110fba2 100644 --- a/lib/model/rpn/proposal_layer.py +++ b/lib/model/rpn/proposal_layer.py @@ -144,7 +144,7 @@ def forward(self, input): proposals_keep = proposals _, order = torch.sort(scores_keep, 1, True) - # output = scores.new(batch_size, post_nms_topN, 5).zero_() + output = scores.new(batch_size, post_nms_topN, 5).zero_() for i in range(batch_size): # # 3. remove predicted boxes with either height or width < threshold # # (NOTE: convert min_size to input image scale stored in im_info[2]) @@ -175,8 +175,6 @@ def forward(self, input): # padding 0 at the end. 
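# A minimal sketch (shapes illustrative) of the fixed-size ROI output this
# proposal_layer change produces: each image's proposals are written into a
# preallocated (batch_size, post_nms_topN, 5) tensor, column 0 carries the batch
# index, and unused rows stay zero-padded so every image in the batch shares one shape.
import torch
batch_size, post_nms_topN = 2, 5
output = torch.zeros(batch_size, post_nms_topN, 5)
for i, proposals_single in enumerate([torch.rand(3, 4), torch.rand(5, 4)]):
    num_proposal = proposals_single.size(0)
    output[i, :, 0] = i
    output[i, :num_proposal, 1:] = proposals_single
# output[0, 3:, 1:] stays zero for the image that produced only 3 proposals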
num_proposal = proposals_single.size(0) - - output = scores.new(batch_size, num_proposal, 5).zero_() output[i,:,0] = i output[i,:num_proposal,1:] = proposals_single diff --git a/trainval_net.py b/trainval_net.py index 1ad603284..ebb26fe55 100644 --- a/trainval_net.py +++ b/trainval_net.py @@ -285,7 +285,7 @@ def __len__(self): im_info.data.resize_(data[1].size()).copy_(data[1]) gt_boxes.data.resize_(data[2].size()).copy_(data[2]) num_boxes.data.resize_(data[3].size()).copy_(data[3]) - + fasterRCNN.zero_grad() _, cls_prob, bbox_pred, rpn_loss, rcnn_loss = fasterRCNN(im_data, im_info, gt_boxes, num_boxes) loss = (rpn_loss.sum() + rcnn_loss.sum()) / rpn_loss.size(0) @@ -298,9 +298,11 @@ def __len__(self): optimizer.step() if step % args.disp_interval == 0: + if step > 0: + loss_temp = loss_temp / args.disp_interval if use_multiGPU: print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr: %.2e" \ - % (args.session, epoch, step, loss_temp / args.disp_interval, lr)) + % (args.session, epoch, step, loss_temp, lr)) print("\t\t\tfg/bg=(%d/%d)" % (0, 0)) print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f" % (0, 0, 0, 0)) if args.use_tfboard: @@ -312,7 +314,7 @@ def __len__(self): else: print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr: %.2e" \ - % (args.session, epoch, step, loss_temp / args.disp_interval, lr)) + % (args.session, epoch, step, loss_temp, lr)) print("\t\t\tfg/bg=(%d/%d)" % (fasterRCNN.fg_cnt, fasterRCNN.bg_cnt)) print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box: %.4f" % (fasterRCNN.RCNN_base.RCNN_rpn.rpn_loss_cls.data[0], \ From 0e7da2dc99d6894c023fd29e99554b88cd67d7ff Mon Sep 17 00:00:00 2001 From: jwyang Date: Tue, 29 Aug 2017 16:44:54 -0400 Subject: [PATCH 08/13] fix a bug in anchor_target_layer, make it work to train in batch --- lib/model/faster_rcnn/faster_rcnn.py | 2 +- lib/model/rpn/anchor_target_layer.py | 14 +++++----- lib/model/rpn/bbox_transform.py | 38 ++++++++++++++-------------- lib/model/rpn/rpn.py | 12 ++++----- trainval_net.py | 37 ++++++++++++++------------- 5 files changed, 54 insertions(+), 49 deletions(-) diff --git a/lib/model/faster_rcnn/faster_rcnn.py b/lib/model/faster_rcnn/faster_rcnn.py index f725f36b2..f2c4d0fe3 100644 --- a/lib/model/faster_rcnn/faster_rcnn.py +++ b/lib/model/faster_rcnn/faster_rcnn.py @@ -186,4 +186,4 @@ def forward(self, im_data, im_info, gt_boxes, num_boxes): cls_prob = cls_prob.view(batch_size, rois.size(1), -1) bbox_pred = bbox_pred.view(batch_size, rois.size(1), -1) - return rois, cls_prob, bbox_pred, rpn_loss, rcnn_loss \ No newline at end of file + return rois, cls_prob, bbox_pred, rpn_loss, rcnn_loss diff --git a/lib/model/rpn/anchor_target_layer.py b/lib/model/rpn/anchor_target_layer.py index 98f52daa6..3759e63c5 100644 --- a/lib/model/rpn/anchor_target_layer.py +++ b/lib/model/rpn/anchor_target_layer.py @@ -80,7 +80,7 @@ def forward(self, input): shifts = torch.from_numpy(np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose()) shifts = shifts.contiguous().type_as(rpn_cls_score).float() - + A = self._num_anchors K = shifts.size(0) @@ -115,7 +115,7 @@ def forward(self, input): gt_max_overlaps[gt_max_overlaps==0] = 1e-5 keep = torch.sum(overlaps.eq(gt_max_overlaps.view(batch_size,1,-1).expand_as(overlaps)), 2) - + if torch.sum(keep) > 0: labels[keep>0] = 1 @@ -126,14 +126,14 @@ def forward(self, input): labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) - + sum_fg = 
torch.sum((labels == 1).int(), 1) sum_bg = torch.sum((labels == 0).int(), 1) for i in range(batch_size): # subsample positive labels if we have too many if sum_fg[i] > num_fg: - fg_inds = torch.nonzero(labels[i] == 1).view(-1) + fg_inds = torch.nonzero(labels[i] == 1).view(-1) rand_num = torch.randperm(fg_inds.size(0)).type_as(gt_boxes).long() disable_inds = fg_inds[rand_num[:fg_inds.size(0)-num_fg]] labels[i][disable_inds] = -1 @@ -147,6 +147,8 @@ def forward(self, input): disable_inds = bg_inds[rand_num[:bg_inds.size(0)-num_bg]] labels[i][disable_inds] = -1 + offset = torch.arange(0, batch_size)*20 + argmax_overlaps = argmax_overlaps + offset.view(batch_size, 1).type_as(argmax_overlaps) bbox_targets = _compute_targets_batch(anchors, gt_boxes.view(-1,5)[argmax_overlaps.view(-1), :].view(batch_size, -1, 5)) # use a single value instead of 4 values for easy index. @@ -158,7 +160,7 @@ def forward(self, input): negative_weights = 1.0 / num_examples else: assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) & - (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1)) + (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1)) bbox_outside_weights[labels == 1] = positive_weights bbox_outside_weights[labels == 0] = negative_weights @@ -225,4 +227,4 @@ def _compute_targets(ex_rois, gt_rois): def _compute_targets_batch(ex_rois, gt_rois): """Compute bounding-box regression targets for an image.""" - return bbox_transform_batch(ex_rois, gt_rois[:, :, :4]) \ No newline at end of file + return bbox_transform_batch(ex_rois, gt_rois[:, :, :4]) diff --git a/lib/model/rpn/bbox_transform.py b/lib/model/rpn/bbox_transform.py index 58d9c08ce..e0316dd3d 100644 --- a/lib/model/rpn/bbox_transform.py +++ b/lib/model/rpn/bbox_transform.py @@ -30,7 +30,7 @@ def bbox_transform(ex_rois, gt_rois): targets = torch.stack( (targets_dx, targets_dy, targets_dw, targets_dh),1) - + return targets def bbox_transform_batch(ex_rois, gt_rois): @@ -52,7 +52,7 @@ def bbox_transform_batch(ex_rois, gt_rois): targets = torch.stack( (targets_dx, targets_dy, targets_dw, targets_dh),2) - + return targets def bbox_transform_batch2(ex_rois, gt_rois): @@ -74,7 +74,7 @@ def bbox_transform_batch2(ex_rois, gt_rois): targets = torch.stack( (targets_dx, targets_dy, targets_dw, targets_dh),2) - + return targets @@ -134,7 +134,7 @@ def clip_boxes(boxes, im_shape, batch_size): boxes[i,:,0::4].clamp_(0, im_shape[i, 1]-1) boxes[i,:,1::4].clamp_(0, im_shape[i, 0]-1) boxes[i,:,2::4].clamp_(0, im_shape[i, 1]-1) - boxes[i,:,3::4].clamp_(0, im_shape[i, 0]-1) + boxes[i,:,3::4].clamp_(0, im_shape[i, 0]-1) return boxes @@ -149,20 +149,20 @@ def bbox_overlaps(anchors, gt_boxes): N = anchors.size(0) K = gt_boxes.size(0) - gt_boxes_area = ((gt_boxes[:,2] - gt_boxes[:,0] + 1) * + gt_boxes_area = ((gt_boxes[:,2] - gt_boxes[:,0] + 1) * (gt_boxes[:,3] - gt_boxes[:,1] + 1)).view(1, K) - anchors_area = ((anchors[:,2] - anchors[:,0] + 1) * + anchors_area = ((anchors[:,2] - anchors[:,0] + 1) * (anchors[:,3] - anchors[:,1] + 1)).view(N, 1) - boxes = anchors.view(N, 1, 4).expand(N, K, 4) + boxes = anchors.view(N, 1, 4).expand(N, K, 4) query_boxes = gt_boxes.view(1, K, 4).expand(N, K, 4) - iw = (torch.min(boxes[:,:,2], query_boxes[:,:,2]) - + iw = (torch.min(boxes[:,:,2], query_boxes[:,:,2]) - torch.max(boxes[:,:,0], query_boxes[:,:,0]) + 1) iw[iw < 0] = 0 - ih = (torch.min(boxes[:,:,3], query_boxes[:,:,3]) - + ih = (torch.min(boxes[:,:,3], query_boxes[:,:,3]) - torch.max(boxes[:,:,1], query_boxes[:,:,1]) + 1) ih[ih < 0] = 0 @@ -189,7 +189,7 @@ def bbox_overlaps_batch(anchors, gt_boxes): gt_boxes_x = (gt_boxes[:,:,2] - 
gt_boxes[:,:,0] + 1) gt_boxes_y = (gt_boxes[:,:,3] - gt_boxes[:,:,1] + 1) - gt_boxes_area = (gt_boxes_x * gt_boxes_y).view(batch_size, 1, K) + gt_boxes_area = (gt_boxes_x * gt_boxes_y).view(batch_size, 1, K) anchors_boxes_x = (anchors[:,:,2] - anchors[:,:,0] + 1) anchors_boxes_y = (anchors[:,:,3] - anchors[:,:,1] + 1) @@ -198,20 +198,20 @@ def bbox_overlaps_batch(anchors, gt_boxes): gt_area_zero = (gt_boxes_x == 1) & (gt_boxes_y == 1) anchors_area_zero = (anchors_boxes_x == 1) & (anchors_boxes_y == 1) - boxes = anchors.view(batch_size, N, 1, 4).expand(batch_size, N, K, 4) + boxes = anchors.view(batch_size, N, 1, 4).expand(batch_size, N, K, 4) query_boxes = gt_boxes.view(batch_size, 1, K, 4).expand(batch_size, N, K, 4) - iw = (torch.min(boxes[:,:,:,2], query_boxes[:,:,:,2]) - + iw = (torch.min(boxes[:,:,:,2], query_boxes[:,:,:,2]) - torch.max(boxes[:,:,:,0], query_boxes[:,:,:,0]) + 1) iw[iw < 0] = 0 - ih = (torch.min(boxes[:,:,:,3], query_boxes[:,:,:,3]) - + ih = (torch.min(boxes[:,:,:,3], query_boxes[:,:,:,3]) - torch.max(boxes[:,:,:,1], query_boxes[:,:,:,1]) + 1) ih[ih < 0] = 0 ua = anchors_area + gt_boxes_area - (iw * ih) overlaps = iw * ih / ua - # mask the overlap here. + # mask the overlap here. overlaps.masked_fill_(gt_area_zero.view(batch_size, 1, K).expand(batch_size, N, K), 0) overlaps.masked_fill_(anchors_area_zero.view(batch_size, N, 1).expand(batch_size, N, K), 0) @@ -235,7 +235,7 @@ def bbox_overlaps_batch2(anchors, gt_boxes): gt_boxes_x = (gt_boxes[:,:,2] - gt_boxes[:,:,0] + 1) gt_boxes_y = (gt_boxes[:,:,3] - gt_boxes[:,:,1] + 1) - gt_boxes_area = (gt_boxes_x * gt_boxes_y).view(batch_size, 1, K) + gt_boxes_area = (gt_boxes_x * gt_boxes_y).view(batch_size, 1, K) anchors_boxes_x = (anchors[:,:,2] - anchors[:,:,0] + 1) anchors_boxes_y = (anchors[:,:,3] - anchors[:,:,1] + 1) @@ -244,21 +244,21 @@ def bbox_overlaps_batch2(anchors, gt_boxes): gt_area_zero = (gt_boxes_x == 1) & (gt_boxes_y == 1) anchors_area_zero = (anchors_boxes_x == 1) & (anchors_boxes_y == 1) - boxes = anchors.view(batch_size, N, 1, 4).expand(batch_size, N, K, 4) + boxes = anchors.view(batch_size, N, 1, 4).expand(batch_size, N, K, 4) query_boxes = gt_boxes.view(batch_size, 1, K, 4).expand(batch_size, N, K, 4) - iw = (torch.min(boxes[:,:,:,2], query_boxes[:,:,:,2]) - + iw = (torch.min(boxes[:,:,:,2], query_boxes[:,:,:,2]) - torch.max(boxes[:,:,:,0], query_boxes[:,:,:,0]) + 1) iw[iw < 0] = 0 - ih = (torch.min(boxes[:,:,:,3], query_boxes[:,:,:,3]) - + ih = (torch.min(boxes[:,:,:,3], query_boxes[:,:,:,3]) - torch.max(boxes[:,:,:,1], query_boxes[:,:,:,1]) + 1) ih[ih < 0] = 0 ua = anchors_area + gt_boxes_area - (iw * ih) overlaps = iw * ih / ua - # mask the overlap here. + # mask the overlap here. 
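# A minimal sketch (boxes illustrative) of why the (width == 1) & (height == 1) test
# above flags padded entries: gt_boxes rows that were zero-padded up to max_num_box
# compute to width = height = 0 - 0 + 1 = 1, so their overlap entries get masked out
# and never act as valid ground-truth matches.
import torch
gt = torch.FloatTensor([[10., 10., 50., 80., 1.],   # a real box
                        [ 0.,  0.,  0.,  0., 0.]])  # an all-zero padding row
w = gt[:, 2] - gt[:, 0] + 1
h = gt[:, 3] - gt[:, 1] + 1
print((w == 1) & (h == 1))                           # second entry flagged as padding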
overlaps.masked_fill_(gt_area_zero.view(batch_size, 1, K).expand(batch_size, N, K), -1) overlaps.masked_fill_(anchors_area_zero.view(batch_size, N, 1).expand(batch_size, N, K), -1) diff --git a/lib/model/rpn/rpn.py b/lib/model/rpn/rpn.py index abc30bb27..f27600f65 100644 --- a/lib/model/rpn/rpn.py +++ b/lib/model/rpn/rpn.py @@ -26,7 +26,7 @@ def __init__(self, feat_height, feat_width, din=512): nn.Conv2d(self.din, 512, 3, 1, 1, bias=True), nn.ReLU(True) ) - + # define bg/fg classifcation score layer self.nc_score_out = len(self.anchor_scales) * 3 * 2 # 2(bg/fg) * 9 (anchors) self.RPN_cls_score = nn.Conv2d(512, self.nc_score_out, 1, 1, 0) @@ -84,13 +84,13 @@ def forward(self, base_feat, im_info, gt_boxes, num_boxes): # proposal layer cfg_key = 'TRAIN' if self.training else 'TEST' - + rois = self.RPN_proposal((rpn_cls_prob.data, rpn_bbox_pred.data, im_info, self.shifts, cfg_key)) self.rpn_loss_cls = 0 self.rpn_loss_box = 0 - + # generating training labels and build the rpn loss if self.training: assert gt_boxes is not None @@ -114,17 +114,17 @@ def forward(self, base_feat, im_info, gt_boxes, num_boxes): self.rpn_loss_cls += F.cross_entropy(rpn_cls_score_single, rpn_label_v) - self.rpn_loss_cls = self.rpn_loss_cls / batch_size + self.rpn_loss_cls /= batch_size rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[1:] - + # compute bbox regression loss rpn_bbox_inside_weights = Variable(rpn_bbox_inside_weights) rpn_bbox_outside_weights = Variable(rpn_bbox_outside_weights) rpn_bbox_targets = Variable(rpn_bbox_targets) #self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets_v, size_average=False) / (fg_cnt + 1e-4) - self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights, + self.rpn_loss_box = _smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights, sigma=3, dim=[1,2,3]) return rois, self.rpn_loss_cls, self.rpn_loss_box diff --git a/trainval_net.py b/trainval_net.py index ebb26fe55..d6535f300 100644 --- a/trainval_net.py +++ b/trainval_net.py @@ -122,27 +122,27 @@ def parse_args(): class sampler(Sampler): def __init__(self, train_size, batch_size): - num_data = train_size - self.num_per_batch = int(num_data / batch_size) - self.batch_size = batch_size - self.range = torch.arange(0,batch_size).view(1, batch_size).long() - self.leftover_flag = False - if num_data % batch_size: - self.leftover = torch.arange(self.num_per_batch*batch_size, num_data).long() - self.leftover_flag = True + num_data = train_size + self.num_per_batch = int(num_data / batch_size) + self.batch_size = batch_size + self.range = torch.arange(0,batch_size).view(1, batch_size).long() + self.leftover_flag = False + if num_data % batch_size: + self.leftover = torch.arange(self.num_per_batch*batch_size, num_data).long() + self.leftover_flag = True def __iter__(self): - rand_num = torch.randperm(self.num_per_batch).view(-1,1) * self.batch_size - self.rand_num = rand_num.expand(self.num_per_batch, self.batch_size) + self.range + rand_num = torch.randperm(self.num_per_batch).long().view(-1,1) * self.batch_size + self.rand_num = rand_num.expand(self.num_per_batch, self.batch_size) + self.range - self.rand_num_view = self.rand_num.view(-1) + self.rand_num_view = self.rand_num.view(-1) - if self.leftover_flag: - self.rand_num_view = torch.cat((self.rand_num_view, self.leftover),0) + if self.leftover_flag: + self.rand_num_view = torch.cat((self.rand_num_view, self.leftover),0) - return iter(self.rand_num_view) + return 
iter(self.rand_num_view) def __len__(self): - return num_data + return num_data if __name__ == '__main__': @@ -265,6 +265,8 @@ def __len__(self): args.start_epoch = checkpoint['epoch'] fasterRCNN.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) + lr = optimizer.param_groups[0]['lr'] + # lr = checkpoint['lr'] print("loaded checkpoint %s" % (load_name)) if use_multiGPU: @@ -288,6 +290,7 @@ def __len__(self): fasterRCNN.zero_grad() _, cls_prob, bbox_pred, rpn_loss, rcnn_loss = fasterRCNN(im_data, im_info, gt_boxes, num_boxes) + loss = (rpn_loss.sum() + rcnn_loss.sum()) / rpn_loss.size(0) loss_temp += loss.data[0] @@ -299,7 +302,7 @@ def __len__(self): if step % args.disp_interval == 0: if step > 0: - loss_temp = loss_temp / args.disp_interval + loss_temp = loss_temp / args.disp_interval if use_multiGPU: print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr: %.2e" \ % (args.session, epoch, step, loss_temp, lr)) @@ -338,13 +341,13 @@ def __len__(self): adjust_learning_rate(optimizer, args.lr_decay_gamma) lr *= args.lr_decay_gamma - save_name = os.path.join(output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step)) save_checkpoint({ 'session': args.session, 'epoch': epoch + 1, 'model': fasterRCNN.state_dict(), "optimizer": optimizer.state_dict(), + "lr": lr, }, save_name) print('save model: {}'.format(save_name)) From 46b2d1e36c51cb12395ca66599e3b9cd2a405ae2 Mon Sep 17 00:00:00 2001 From: Jiasen Lu Date: Tue, 29 Aug 2017 17:10:31 -0400 Subject: [PATCH 09/13] Delete roibatchLoader_aug.py --- lib/roi_data_layer/roibatchLoader_aug.py | 162 ----------------------- 1 file changed, 162 deletions(-) delete mode 100644 lib/roi_data_layer/roibatchLoader_aug.py diff --git a/lib/roi_data_layer/roibatchLoader_aug.py b/lib/roi_data_layer/roibatchLoader_aug.py deleted file mode 100644 index 163a0d534..000000000 --- a/lib/roi_data_layer/roibatchLoader_aug.py +++ /dev/null @@ -1,162 +0,0 @@ - -"""The data layer used during training to train a Fast R-CNN network. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import torch.utils.data as data -from PIL import Image -import torch - -from model.utils.config import cfg -from roi_data_layer.minibatch import get_minibatch -from model.rpn.bbox_transform import bbox_transform_inv, clip_boxes - -import numpy as np -import random -import time -import pdb - -class roibatchLoader(data.Dataset): - def __init__(self, roidb, num_classes, training=True, normalize=None): - self._roidb = roidb - self._num_classes = num_classes - # we make the height of image consistent to trim_height, trim_width - self.trim_height = cfg.TRAIN.TRIM_HEIGHT - self.trim_width = cfg.TRAIN.TRIM_WIDTH - self.max_num_box = 20 - self.training = training - self.normalize = normalize - - def __getitem__(self, index): - - minibatch_db = [self._roidb[index]] - blobs = get_minibatch(minibatch_db, self._num_classes) - data = torch.from_numpy(blobs['data']) - im_info = torch.from_numpy(blobs['im_info']) - # we need to random shuffle the bounding box. 
- np.random.shuffle(blobs['gt_boxes']) - gt_boxes = torch.from_numpy(blobs['gt_boxes']) - data_height, data_width = data.size(1), data.size(2) - - if self.training: - ################################################## - # we crop the input image to fixed size randomly # - ################################################## - # trim_data = torch.FloatTensor(1, self.trim_height, self.trim_width, 3) - if data_height > data_width: - # if height > width, then crop on height - # randomly generate an y start point - # while True: - # assign score to y axis - y_score = torch.FloatTensor(data_height).zero_() - for i in range(gt_boxes.size(0)): - rg = torch.arange(int(gt_boxes[i, 1]), int(gt_boxes[i, 3])) - score = -(rg - gt_boxes[i, 1]) * (rg - gt_boxes[i, 3]) / (gt_boxes[i, 3] - gt_boxes[i, 1])**2 - y_score[int(gt_boxes[i, 1]):int(gt_boxes[i, 3])] += score - - # find the inds with maximal score in y_score - if data_height > self.trim_height: - - ys = torch.arange(0, data_height - self.trim_height, 5).long() - y_score_cum = torch.FloatTensor(ys.size()).zero_() - - for i in range(ys.size(0)): - s = ys[i] - y_score_cum[i] = y_score[s:s + self.trim_height].sum() - - _, order = torch.sort(y_score_cum, 0, True) - - ys_ordered = ys[order] - rand_num = torch.randint(min(5, ys_ordered.size(0))) - - ys = ys_ordered[rand_num] - ys = min(ys, data_width - self.trim_width) - else: - y_s = 0 - - trim_data = data[:, y_s:(y_s + self.trim_height), :] - - # shift y coordiante of gt_boxes - gt_boxes[:, 1] = gt_boxes[:, 1] - y_s - gt_boxes[:, 3] = gt_boxes[:, 3] - y_s - - # update gt bounding box according the trip - gt_boxes[:, 1].clamp_(0, self.trim_height - 1) - gt_boxes[:, 3].clamp_(0, self.trim_height - 1) - - # update im_info - im_info[0, 0] = self.trim_height - - elif data_height <= data_width: - # if height <= width, then crop on width - # while True: - - # assign score to y axis - x_score = torch.FloatTensor(data_width).zero_() - for i in range(gt_boxes.size(0)): - rg = torch.arange(int(gt_boxes[i, 0]), int(gt_boxes[i, 2])) - score = -(rg - gt_boxes[i, 0]) * (rg - gt_boxes[i, 2]) / (gt_boxes[i, 2] - gt_boxes[i, 0])**2 - x_score[int(gt_boxes[i, 0]):int(gt_boxes[i, 2])] += score - - # find the inds with maximal score in y_score - if data_width > self.trim_width: - xs = torch.arange(0, data_width - self.trim_width, 5).long() - x_score_cum = torch.FloatTensor(xs.size()).zero_() - - for i in range(xs.size(0)): - s = xs[i] - x_score_cum[i] = x_score[s:s + self.trim_width].sum() - - _, order = torch.sort(x_score_cum, 0, True) - - xs_ordered = xs[order] - rand_num = torch.randint(min(5, xs_ordered.size(0))) - - xs = xs_ordered[rand_num] - xs = min(xs, data_width - self.trim_width) - else: - x_s = 0 - - trim_data = data[:, :, x_s:(x_s + self.trim_width), :] - - # shift x coordiante of gt_boxes - gt_boxes[:, 0] = gt_boxes[:, 0] - x_s - gt_boxes[:, 2] = gt_boxes[:, 2] - x_s - - # update gt bounding box according the trip - gt_boxes[:, 0].clamp_(0, self.trim_width - 1) - gt_boxes[:, 2].clamp_(0, self.trim_width - 1) - - im_info[0, 1] = self.trim_width - - num_boxes = min(gt_boxes.size(0), self.max_num_box) - - gt_boxes_padding = torch.FloatTensor(self.max_num_box, 5).zero_() - # take the top num_boxes - gt_boxes_padding[:num_boxes,:] = gt_boxes[:num_boxes] - - # permute trim_data to adapt to downstream processing - trim_data = trim_data.permute(0, 3, 1, 2).contiguous().view(3, self.trim_height, self.trim_width) - im_info = im_info.view(3) - - if self.normalize: - trim_data = trim_data / 255.0 - trim_data = 
self.normalize(trim_data) - - return trim_data, im_info, gt_boxes, num_boxes - else: - data = data.permute(0, 3, 1, 2).contiguous().view(3, data_height, data_width) - num_boxes = gt_boxes.size(0) - im_info = im_info.view(3) - - if self.normalize: - data = data / 255.0 - data = self.normalize(data) - - return data, im_info, gt_boxes, num_boxes - - def __len__(self): - return len(self._roidb) From f90836d6f88eff8a51a8c532c8fc91efc7f2aceb Mon Sep 17 00:00:00 2001 From: jiasen Date: Tue, 29 Aug 2017 21:54:06 -0400 Subject: [PATCH 10/13] change faster_rcnn arch to support multiple base network --- lib/model/faster_rcnn/faster_rcnn_cascade.py | 98 +++++-------------- .../{utils => faster_rcnn}/mobilenet_v1.py | 0 lib/model/{utils => faster_rcnn}/resnet_v1.py | 47 ++++----- lib/model/faster_rcnn/vgg16.py | 50 ++++++++++ lib/model/rpn/rpn.py | 7 +- lib/model/utils/network.py | 10 -- lib/model/utils/vgg16.py | 57 ----------- trainval_net_cascade.py | 55 ++++------- 8 files changed, 117 insertions(+), 207 deletions(-) rename lib/model/{utils => faster_rcnn}/mobilenet_v1.py (100%) rename lib/model/{utils => faster_rcnn}/resnet_v1.py (89%) create mode 100644 lib/model/faster_rcnn/vgg16.py delete mode 100644 lib/model/utils/vgg16.py diff --git a/lib/model/faster_rcnn/faster_rcnn_cascade.py b/lib/model/faster_rcnn/faster_rcnn_cascade.py index fe1587307..78d4bf14a 100644 --- a/lib/model/faster_rcnn/faster_rcnn_cascade.py +++ b/lib/model/faster_rcnn/faster_rcnn_cascade.py @@ -27,9 +27,7 @@ def __init__(self, baseModels, classes): self.classes = classes self.n_classes = len(classes) - self.RCNN_base_model = nn.Sequential() - for i in range(len(baseModels)): - self.RCNN_base_model.add_module('part{}'.format(i), baseModels[i]) + self.RCNN_base_model = baseModels virtual_input = torch.randn(1, 3, cfg.TRAIN.TRIM_HEIGHT, cfg.TRAIN.TRIM_WIDTH) out = self.RCNN_base_model(Variable(virtual_input)) @@ -40,7 +38,6 @@ def __init__(self, baseModels, classes): self.RCNN_rpn = _RPN(self.feat_height, self.feat_width, self.dout_base_model) self.RCNN_proposal_target = _ProposalTargetLayer(self.n_classes) self.RCNN_roi_pool = _RoIPooling(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1.0/16.0) - # self.RCNN_roi_pool = _RoIPool(cfg.POOLING_SIZE, cfg.POOLING_SIZE, 1.0/16.0) def forward(self, im_data, im_info, gt_boxes, num_boxes): im_info = im_info.data @@ -82,62 +79,47 @@ def forward(self, im_data, im_info, gt_boxes, num_boxes): class _fasterRCNN(nn.Module): """ faster RCNN """ - def __init__(self, baseModel, classes, debug=False): + def __init__(self, classes): super(_fasterRCNN, self).__init__() - - if classes is not None: - self.classes = classes - self.n_classes = len(classes) - - # define base model, e.g., VGG16, ResNet, etc. 
- if baseModel == "vgg16": - slices = network.load_baseModel(baseModel) - self.RCNN_base = _RCNN_base(slices[:3], classes) - self.RCNN_fc6 = slices[3] - self.RCNN_fc7 = slices[4] - elif baseModel == "res50": - pretrained_model = models.resnet50(pretrained=True) - RCNN_base_model = nn.Sequential(*list(pretrained_model.children())[:-2]) - elif baseModel == "res101": - pretrained_model = models.resnet50(pretrained=True) - RCNN_base_model = nn.Sequential(*list(pretrained_model.children())[:-2]) - else: - raise RuntimeError('baseModel is not included.') - - self.dout_base_model = self.RCNN_base.dout_base_model - - self.RCNN_cls_score = nn.Sequential( - nn.Linear(4096, self.n_classes) - ) - - self.RCNN_bbox_pred = nn.Sequential( - nn.Linear(4096, 4) - ) - + self.classes = classes + self.n_classes = len(classes) # loss self.RCNN_loss_cls = 0 self.RCNN_loss_bbox = 0 - # for log - self.debug = debug + def _init_weights(self): + def normal_init(m, mean, stddev, truncated=False): + """ + weight initalizer: truncated normal and random normal. + """ + # x is a parameter + if truncated: + m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation + else: + m.weight.data.normal_(mean, stddev) + m.bias.data.zero_() + + normal_init(self.RCNN_base.RCNN_rpn.RPN_Conv, 0, 0.01, cfg.TRAIN.TRUNCATED) + normal_init(self.RCNN_base.RCNN_rpn.RPN_cls_score, 0, 0.01, cfg.TRAIN.TRUNCATED) + normal_init(self.RCNN_base.RCNN_rpn.RPN_bbox_pred, 0, 0.01, cfg.TRAIN.TRUNCATED) + normal_init(self.RCNN_cls_score, 0, 0.01, cfg.TRAIN.TRUNCATED) + normal_init(self.RCNN_bbox_pred, 0, 0.001, cfg.TRAIN.TRUNCATED) + + def create_architecture(self): + self._init_modules() + self._init_weights() def forward(self, im_data, im_info, gt_boxes, num_boxes): - batch_size = im_data.size(0) rois, pooled_feat_all, rois_label, rois_target, rois_inside_ws, rois_outside_ws, \ rpn_loss_cls, rpn_loss_bbox = self.RCNN_base(im_data, im_info, gt_boxes, num_boxes) + # get the rpn loss. 
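# A minimal sketch (stddev and mean illustrative) of the truncated initializer used
# in _init_weights above: fmod_(2) folds standard-normal samples back into (-2, 2)
# before scaling, a cheap stand-in for a normal truncated at two standard deviations
# (tails are folded rather than re-sampled, hence "not a perfect approximation").
import torch
w = torch.randn(100000)
w_trunc = w.fmod(2).mul(0.01).add(0.0)   # stddev = 0.01, mean = 0.0
print(w_trunc.abs().max())               # always strictly below 2 * 0.01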
rpn_loss = rpn_loss_cls + rpn_loss_bbox # feed pooled features to top model - x = self.RCNN_fc6(pooled_feat_all) - x = F.relu(x, inplace = True) - x = F.dropout(x, training=self.training) - - x = self.RCNN_fc7(x) - x = F.relu(x, inplace = True) - x = F.dropout(x, training=self.training) + x = self.RCNN_top(pooled_feat_all) # compute bbox offset bbox_pred = self.RCNN_bbox_pred(x) @@ -146,32 +128,6 @@ def forward(self, im_data, im_info, gt_boxes, num_boxes): cls_score = self.RCNN_cls_score(x) cls_prob = F.softmax(cls_score) - # if not self.training: - # pdb.set_trace() - # from model.rpn.bbox_transform import bbox_transform_inv, clip_boxes - # if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: - # # Optionally normalize targets by a precomputed mean and stdev - # box_deltas = bbox_pred.data.view(-1, 4) * torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_STDS).cuda() \ - # + torch.FloatTensor(cfg.TRAIN.BBOX_NORMALIZE_MEANS).cuda() - # box_deltas = box_deltas.view(1, -1, 84) - # pred_boxes = bbox_transform_inv(rois, box_deltas, 1) - # pred_boxes = clip_boxes(pred_boxes, im_info.data, 1) - - # # perform roi pooling again on pred_boxes - # rois_var = Variable(pred_boxes.view(-1,5)) - - # # do roi pooling based on predicted rois - - # pooled_feat = self.RCNN_roi_pool(base_feat, rois_var) - # pooled_feat_all = pooled_feat.view(pooled_feat.size(0), -1) - # # feed pooled features to top model - # x = self.RCNN_fc6(pooled_feat_all) - # x = F.relu(x, inplace = True) - # x = F.dropout(x, training=self.training) - - # x = self.RCNN_fc7(x) - # x = F.relu(x, inplace = True) - # x = F.dropout(x, training=self.training) self.RCNN_loss_cls = 0 self.RCNN_loss_bbox = 0 diff --git a/lib/model/utils/mobilenet_v1.py b/lib/model/faster_rcnn/mobilenet_v1.py similarity index 100% rename from lib/model/utils/mobilenet_v1.py rename to lib/model/faster_rcnn/mobilenet_v1.py diff --git a/lib/model/utils/resnet_v1.py b/lib/model/faster_rcnn/resnet_v1.py similarity index 89% rename from lib/model/utils/resnet_v1.py rename to lib/model/faster_rcnn/resnet_v1.py index 770261898..9bc069654 100644 --- a/lib/model/utils/resnet_v1.py +++ b/lib/model/faster_rcnn/resnet_v1.py @@ -7,8 +7,8 @@ from __future__ import division from __future__ import print_function -from nets.network import Network -from model.config import cfg +from model.faster_rcnn.faster_rcnn_cascade import _fasterRCNN, _RCNN_base +from model.utils.config import cfg import utils.timer @@ -224,37 +224,27 @@ def resnet152(pretrained=False): return model class resnetv1(Network): - def __init__(self, batch_size=1, num_layers=50): - Network.__init__(self, batch_size=batch_size) - self._num_layers = num_layers + def __init__(self, classes, num_layers=50): + _fasterRCNN.__init__(self, classes) + self.model_path = 'data/pretrained_model/resnet101_caffe.pth' - def _crop_pool_layer(self, bottom, rois): - return Network._crop_pool_layer(self, bottom, rois, cfg.RESNET.MAX_POOL) + # def _crop_pool_layer(self, bottom, rois): + # return Network._crop_pool_layer(self, bottom, rois, cfg.RESNET.MAX_POOL) - def _image_to_head(self): - net_conv = self._layers['head'](self._image) - self._act_summaries['conv']['value'] = net_conv + # def _image_to_head(self): + # net_conv = self._layers['head'](self._image) + # self._act_summaries['conv']['value'] = net_conv - return net_conv + # return net_conv - def _head_to_tail(self, pool5): - fc7 = self.resnet.layer4(pool5).mean(3).mean(2) # average pooling after layer4 - return fc7 + # def _head_to_tail(self, pool5): + # fc7 = 
self.resnet.layer4(pool5).mean(3).mean(2) # average pooling after layer4 + # return fc7 def _init_modules(self): - # choose different blocks for different number of layers - if self._num_layers == 50: - self.resnet = resnet50() - elif self._num_layers == 101: - self.resnet = resnet101() - - elif self._num_layers == 152: - self.resnet = resnet152() - - else: - # other numbers are not supported - raise NotImplementedError + self.resnet = resnet101() + self.load_pretrained_cnn() # Fix blocks for p in self.resnet.bn1.parameters(): p.requires_grad=False @@ -311,5 +301,6 @@ def set_bn_eval(m): self.resnet.apply(set_bn_eval) - def load_pretrained_cnn(self, state_dict): - self.resnet.load_state_dict(state_dict) + def load_pretrained_cnn(self): + state_dict = torch.load(self.model_path) + self.resnet.load_state_dict({k:v for k,v in state_dict.items() if k in self.vgg.state_dict()}) diff --git a/lib/model/faster_rcnn/vgg16.py b/lib/model/faster_rcnn/vgg16.py new file mode 100644 index 000000000..bfa30c693 --- /dev/null +++ b/lib/model/faster_rcnn/vgg16.py @@ -0,0 +1,50 @@ +# -------------------------------------------------------- +# Tensorflow Faster R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Xinlei Chen +# -------------------------------------------------------- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +import math +import torchvision.models as models +from model.faster_rcnn.faster_rcnn_cascade import _fasterRCNN, _RCNN_base +import pdb + +class vgg16(_fasterRCNN): + def __init__(self, classes): + _fasterRCNN.__init__(self, classes) + self.model_path = 'data/pretrained_model/vgg16_caffe.pth' + + def _init_modules(self): + + self.vgg = models.vgg16() + self.load_pretrained_cnn() + + self.vgg.classifier = nn.Sequential(*list(self.vgg.classifier._modules.values())[:-1]) + + # not using the last maxpool layer + self.vgg.features = nn.Sequential(*list(self.vgg.features._modules.values())[:-1]) + + # Fix the layers before conv3: + for layer in range(10): + for p in self.vgg.features[layer].parameters(): p.requires_grad = False + + self.RCNN_base = _RCNN_base(self.vgg.features, self.classes) + + self.RCNN_top = self.vgg.classifier + + # not using the last maxpool layer + self.RCNN_cls_score = nn.Linear(4096, self.n_classes) + self.RCNN_bbox_pred = nn.Linear(4096, 4) + + + def load_pretrained_cnn(self): + state_dict = torch.load(self.model_path) + + self.vgg.load_state_dict({k:v for k,v in state_dict.items() if k in self.vgg.state_dict()}) diff --git a/lib/model/rpn/rpn.py b/lib/model/rpn/rpn.py index f27600f65..3544d4584 100644 --- a/lib/model/rpn/rpn.py +++ b/lib/model/rpn/rpn.py @@ -22,10 +22,7 @@ def __init__(self, feat_height, feat_width, din=512): self.feat_stride = cfg.FEAT_STRIDE[0] # define the convrelu layers processing input feature map - self.RPN_ConvReLU = nn.Sequential( - nn.Conv2d(self.din, 512, 3, 1, 1, bias=True), - nn.ReLU(True) - ) + self.RPN_Conv = nn.Conv2d(self.din, 512, 3, 1, 1, bias=True) # define bg/fg classifcation score layer self.nc_score_out = len(self.anchor_scales) * 3 * 2 # 2(bg/fg) * 9 (anchors) @@ -71,7 +68,7 @@ def forward(self, base_feat, im_info, gt_boxes, num_boxes): self.shifts = self.shifts.type_as(im_info) # return feature map after convrelu layer - rpn_conv1 = self.RPN_ConvReLU(base_feat) + rpn_conv1 = F.relu(self.RPN_Conv(base_feat), inplace=True) # 
get rpn classification score rpn_cls_score = self.RPN_cls_score(rpn_conv1) diff --git a/lib/model/utils/network.py b/lib/model/utils/network.py index 1a6c97464..40e536040 100644 --- a/lib/model/utils/network.py +++ b/lib/model/utils/network.py @@ -3,7 +3,6 @@ from torch.autograd import Variable import numpy as np import torchvision.models as models -from vgg16 import vgg16 import cv2 import pdb @@ -59,15 +58,6 @@ def vis_detections(im, class_name, dets, thresh=0.8): return im -def load_baseModel(model_name): - if model_name == "vgg16": - net = vgg16() - model_path = 'data/pretrained_model/{}_caffe.pth'.format(model_name) - net.load_pretrained_cnn(torch.load(model_path)) - return net.slice() - elif model_name == "resnet50": - return None - def adjust_learning_rate(optimizer, decay=0.1): """Sets the learning rate to the initial LR decayed by 0.5 every 20 epochs""" for param_group in optimizer.param_groups: diff --git a/lib/model/utils/vgg16.py b/lib/model/utils/vgg16.py deleted file mode 100644 index 0fc6ad295..000000000 --- a/lib/model/utils/vgg16.py +++ /dev/null @@ -1,57 +0,0 @@ -# -------------------------------------------------------- -# Tensorflow Faster R-CNN -# Licensed under The MIT License [see LICENSE for details] -# Written by Xinlei Chen -# -------------------------------------------------------- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable -import math -import torchvision.models as models - -class vgg16(): - def __init__(self): - self.vgg = models.vgg16() - # Remove fc8 - # self.vgg.classifier = nn.Sequential(*list(self.vgg.classifier._modules.values())[:-1]) - - # Fix the layers before conv3: - for layer in range(10): - for p in self.vgg.features[layer].parameters(): p.requires_grad = False - - # def _image_to_head(self): - # net_conv = self._layers['head'](self._image) - # self._act_summaries['conv']['value'] = net_conv - - # return net_conv - - # def _head_to_tail(self, pool5): - # pool5_flat = pool5.view(pool5.size(0), -1) - # fc7 = self.vgg.classifier(pool5_flat) - - # return fc7 - def slice(self): - - self.slices = [] - # we fix conv1_1, conv1_2, conv2_1, conv2_2 - self.slices.append(nn.Sequential(*list(self.vgg.features.children())[:10])) - # we finetune conv3_1, conv3_2, conv3_3 - self.slices.append(nn.Sequential(*list(self.vgg.features.children())[10:17])) - # we retrain conv4_1, conv4_2, conv4_3, conv5_1, conv5_2, conv5_3 - self.slices.append(nn.Sequential(*list(self.vgg.features.children())[17:-1])) - - # we copy fc6 - self.slices.append(self.vgg.classifier[0]) - - # we copy fc7 - self.slices.append(self.vgg.classifier[3]) - - return self.slices - - def load_pretrained_cnn(self, state_dict): - self.vgg.load_state_dict({k:v for k,v in state_dict.items() if k in self.vgg.state_dict()}) \ No newline at end of file diff --git a/trainval_net_cascade.py b/trainval_net_cascade.py index 59e28d62f..3f8d764b0 100644 --- a/trainval_net_cascade.py +++ b/trainval_net_cascade.py @@ -31,7 +31,7 @@ from model.utils.network import weights_normal_init, save_net, load_net, \ adjust_learning_rate, save_checkpoint -from model.faster_rcnn.faster_rcnn_cascade import _fasterRCNN +from model.faster_rcnn.vgg16 import vgg16 import pdb def parse_args(): @@ -141,10 +141,6 @@ def __iter__(self): def __len__(self): return num_data - -lr = cfg.TRAIN.LEARNING_RATE -momentum = cfg.TRAIN.MOMENTUM -weight_decay = 
cfg.TRAIN.WEIGHT_DECAY use_multiGPU = False if __name__ == '__main__': @@ -226,38 +222,25 @@ def __len__(self): cfg.CUDA = True # initilize the network here. - fasterRCNN = _fasterRCNN(args.net, imdb.classes) - # weights_normal_init(fasterRCNN) - weights_normal_init(fasterRCNN.RCNN_base.RCNN_rpn.RPN_ConvReLU) - weights_normal_init(fasterRCNN.RCNN_base.RCNN_rpn.RPN_cls_score) - weights_normal_init(fasterRCNN.RCNN_base.RCNN_rpn.RPN_bbox_pred) - weights_normal_init(fasterRCNN.RCNN_cls_score) - weights_normal_init(fasterRCNN.RCNN_bbox_pred, 0.001) - - params = list(fasterRCNN.parameters()) + fasterRCNN = vgg16(imdb.classes) + fasterRCNN.create_architecture() + + lr = cfg.TRAIN.LEARNING_RATE + params = [] + for key, value in dict(fasterRCNN.named_parameters()).items(): + if value.requires_grad: + if 'bias' in key: + params += [{'params':[value],'lr':lr*(cfg.TRAIN.DOUBLE_BIAS + 1), \ + 'weight_decay': cfg.TRAIN.BIAS_DECAY and cfg.TRAIN.WEIGHT_DECAY or 0}] + else: + params += [{'params':[value],'lr':lr, 'weight_decay': cfg.TRAIN.WEIGHT_DECAY}] if args.optimizer == "adam": lr = lr * 0.1 - optimizer = torch.optim.Adam([ - {'params': fasterRCNN.RCNN_base.RCNN_base_model[1].parameters(), 'lr': lr}, - {'params': fasterRCNN.RCNN_base.RCNN_base_model[2].parameters()}, - {'params': fasterRCNN.RCNN_base.RCNN_rpn.parameters()}, - {'params': fasterRCNN.RCNN_fc6.parameters()}, - {'params': fasterRCNN.RCNN_fc7.parameters()}, - {'params': fasterRCNN.RCNN_cls_score.parameters()}, - {'params': fasterRCNN.RCNN_bbox_pred.parameters()}, - ], lr = lr) + optimizer = torch.optim.Adam(params) elif args.optimizer == "sgd": - optimizer = torch.optim.SGD([ - {'params': fasterRCNN.RCNN_base.RCNN_base_model[1].parameters(), 'lr': lr}, - {'params': fasterRCNN.RCNN_base.RCNN_base_model[2].parameters()}, - {'params': fasterRCNN.RCNN_base.RCNN_rpn.parameters()}, - {'params': fasterRCNN.RCNN_fc6.parameters(), 'lr': lr}, - {'params': fasterRCNN.RCNN_fc7.parameters(), 'lr': lr}, - {'params': fasterRCNN.RCNN_cls_score.parameters()}, - {'params': fasterRCNN.RCNN_bbox_pred.parameters()}, - ], lr = lr, momentum=momentum, weight_decay=weight_decay) + optimizer = torch.optim.SGD(params, momentum=cfg.TRAIN.MOMENTUM) if args.resume: load_name = os.path.join(output_dir, @@ -303,8 +286,8 @@ def __len__(self): if step % args.disp_interval == 0: if use_multiGPU: - print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr4ft: %.2e, lr4tr: %.2e" \ - % (args.session, epoch, step, loss_temp / args.disp_interval, lr * 0.1, lr)) + print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr: %.2e" \ + % (args.session, epoch, step, loss_temp / args.disp_interval, lr)) print("\t\t\tfg/bg=(%d/%d)" % (0, 0)) print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f" % (0, 0, 0, 0)) if args.use_tfboard: @@ -315,8 +298,8 @@ def __len__(self): logger.scalar_summary(tag, value, step) else: - print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr4ft: %.2e, lr4tr: %.2e" \ - % (args.session, epoch, step, loss_temp / args.disp_interval, lr * 0.1, lr)) + print("[session %d][epoch %2d][iter %4d] loss: %.4f, lr: %.2e" \ + % (args.session, epoch, step, loss_temp / args.disp_interval, lr)) print("\t\t\tfg/bg=(%d/%d)" % (fasterRCNN.fg_cnt, fasterRCNN.bg_cnt)) print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box: %.4f" % (fasterRCNN.RCNN_base.RCNN_rpn.rpn_loss_cls.data[0], \ From a32f872cebc96d617a5893849e869a736e4df237 Mon Sep 17 00:00:00 2001 From: jwyang Date: Tue, 29 Aug 2017 23:07:21 -0400 Subject: [PATCH 11/13] add vg and imagenet, 
change trainval --- lib/datasets/factory.py | 7 + lib/datasets/imagenet.py | 207 ++++++++++++++++++++++ lib/datasets/vg.py | 374 +++++++++++++++++++++++++++++++++++++++ lib/datasets/vg_eval.py | 125 +++++++++++++ trainval_net.py | 19 +- trainval_net_cascade.py | 23 +-- 6 files changed, 735 insertions(+), 20 deletions(-) create mode 100644 lib/datasets/imagenet.py create mode 100755 lib/datasets/vg.py create mode 100755 lib/datasets/vg_eval.py diff --git a/lib/datasets/factory.py b/lib/datasets/factory.py index 7c03c84a5..88530fbdf 100644 --- a/lib/datasets/factory.py +++ b/lib/datasets/factory.py @@ -13,6 +13,8 @@ __sets = {} from datasets.pascal_voc import pascal_voc from datasets.coco import coco +from datasets.imagenet import imagenet +from datasets.vg import vg import numpy as np @@ -34,6 +36,11 @@ name = 'coco_{}_{}'.format(year, split) __sets[name] = (lambda split=split, year=year: coco(split, year)) +# Set up vg_ +for version in ['1600-400-20']: + for split in ['minitrain', 'train', 'minival', 'val', 'test']: + name = 'vg_{}_{}'.format(version,split) + __sets[name] = (lambda split=split, version=version: vg(version, split)) def get_imdb(name): """Get an imdb (image database) by name.""" diff --git a/lib/datasets/imagenet.py b/lib/datasets/imagenet.py new file mode 100644 index 000000000..951b20768 --- /dev/null +++ b/lib/datasets/imagenet.py @@ -0,0 +1,207 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import datasets +import datasets.imagenet +import os, sys +from datasets.imdb import imdb +import xml.dom.minidom as minidom +import numpy as np +import scipy.sparse +import scipy.io as sio +import utils.cython_bbox +import cPickle +import subprocess + +class imagenet(imdb): + def __init__(self, image_set, devkit_path, data_path): + imdb.__init__(self, image_set) + self._image_set = image_set + self._devkit_path = devkit_path + self._data_path = data_path + synsets_image = sio.loadmat(os.path.join(self._devkit_path, 'data', 'meta_det.mat')) + synsets_video = sio.loadmat(os.path.join(self._devkit_path, 'data', 'meta_vid.mat')) + self._classes_image = ('__background__',) + self._wnid_image = (0,) + + self._classes = ('__background__',) + self._wnid = (0,) + + for i in xrange(200): + self._classes_image = self._classes_image + (synsets_image['synsets'][0][i][2][0],) + self._wnid_image = self._wnid_image + (synsets_image['synsets'][0][i][1][0],) + + for i in xrange(30): + self._classes = self._classes + (synsets_video['synsets'][0][i][2][0],) + self._wnid = self._wnid + (synsets_video['synsets'][0][i][1][0],) + + self._wnid_to_ind_image = dict(zip(self._wnid_image, xrange(201))) + self._class_to_ind_image = dict(zip(self._classes_image, xrange(201))) + + self._wnid_to_ind = dict(zip(self._wnid, xrange(31))) + self._class_to_ind = dict(zip(self._classes, xrange(31))) + + #check for valid intersection between video and image classes + self._valid_image_flag = [0]*201 + + for i in range(1,201): + if self._wnid_image[i] in self._wnid_to_ind: + self._valid_image_flag[i] = 1 + + self._image_ext = ['.JPEG'] + + self._image_index = self._load_image_set_index() + # Default to roidb handler + self._roidb_handler = self.gt_roidb + + # Specific config options + self.config = {'cleanup' : True, + 'use_salt' : True, + 'top_k' : 2000} + + assert os.path.exists(self._devkit_path), 'Devkit path 
does not exist: {}'.format(self._devkit_path) + assert os.path.exists(self._data_path), 'Path does not exist: {}'.format(self._data_path) + + def image_path_at(self, i): + """ + Return the absolute path to image i in the image sequence. + """ + return self.image_path_from_index(self._image_index[i]) + + def image_path_from_index(self, index): + """ + Construct an image path from the image's "index" identifier. + """ + image_path = os.path.join(self._data_path, 'Data', self._image_set, index + self._image_ext[0]) + assert os.path.exists(image_path), 'path does not exist: {}'.format(image_path) + return image_path + + def _load_image_set_index(self): + """ + Load the indexes listed in this dataset's image set file. + """ + # Example path to image set file: + # self._data_path + /ImageSets/val.txt + + if self._image_set == 'train': + image_set_file = os.path.join(self._data_path, 'ImageSets', 'trainr.txt') + image_index = [] + if os.path.exists(image_set_file): + f = open(image_set_file, 'r') + data = f.read().split() + for lines in data: + if lines != '': + image_index.append(lines) + f.close() + return image_index + + for i in range(1,31): + print(i) + image_set_file = os.path.join(self._data_path, 'ImageSets', 'train_' + str(i) + '.txt') + with open(image_set_file) as f: + tmp_index = [x.strip() for x in f.readlines()] + vtmp_index = [] + for line in tmp_index: + image_list = os.popen('ls ' + self._data_path + '/Data/train/' + line + '/*.JPEG').read().split() + tmp_list = [] + for imgs in image_list: + tmp_list.append(imgs[:-5]) + vtmp_index = vtmp_index + tmp_list + + num_lines = len(vtmp_index) + ids = np.random.permutation(num_lines) + count = 0 + while count < 2000: + image_index.append(vtmp_index[ids[count % num_lines]]) + count = count + 1 + + for i in range(1,201): + if self._valid_image_flag[i] == 1: + image_set_file = os.path.join(self._data_path, 'ImageSets', 'train_pos_' + str(i) + '.txt') + with open(image_set_file) as f: + tmp_index = [x.strip() for x in f.readlines()] + num_lines = len(tmp_index) + ids = np.random.permutation(num_lines) + count = 0 + while count < 2000: + image_index.append(tmp_index[ids[count % num_lines]]) + count = count + 1 + image_set_file = os.path.join(self._data_path, 'ImageSets', 'trainr.txt') + f = open(image_set_file, 'w') + for lines in image_index: + f.write(lines + '\n') + f.close() + else: + image_set_file = os.path.join(self._data_path, 'ImageSets', 'val.txt') + with open(image_set_file) as f: + image_index = [x.strip() for x in f.readlines()] + return image_index + + def gt_roidb(self): + """ + Return the database of ground-truth regions of interest. + This function loads/saves from/to a cache file to speed up future calls. + """ + cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') + if os.path.exists(cache_file): + with open(cache_file, 'rb') as fid: + roidb = cPickle.load(fid) + print '{} gt roidb loaded from {}'.format(self.name, cache_file) + return roidb + + gt_roidb = [self._load_imagenet_annotation(index) + for index in self.image_index] + with open(cache_file, 'wb') as fid: + cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL) + print 'wrote gt roidb to {}'.format(cache_file) + + return gt_roidb + + + def _load_imagenet_annotation(self, index): + """ + Load image and bounding boxes info from txt files of imagenet. 
+ """ + filename = os.path.join(self._data_path, 'Annotations', self._image_set, index + '.xml') + + # print 'Loading: {}'.format(filename) + def get_data_from_tag(node, tag): + return node.getElementsByTagName(tag)[0].childNodes[0].data + + with open(filename) as f: + data = minidom.parseString(f.read()) + + objs = data.getElementsByTagName('object') + num_objs = len(objs) + + boxes = np.zeros((num_objs, 4), dtype=np.uint16) + gt_classes = np.zeros((num_objs), dtype=np.int32) + overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) + + # Load object bounding boxes into a data frame. + for ix, obj in enumerate(objs): + x1 = float(get_data_from_tag(obj, 'xmin')) + y1 = float(get_data_from_tag(obj, 'ymin')) + x2 = float(get_data_from_tag(obj, 'xmax')) + y2 = float(get_data_from_tag(obj, 'ymax')) + cls = self._wnid_to_ind[ + str(get_data_from_tag(obj, "name")).lower().strip()] + boxes[ix, :] = [x1, y1, x2, y2] + gt_classes[ix] = cls + overlaps[ix, cls] = 1.0 + + overlaps = scipy.sparse.csr_matrix(overlaps) + + return {'boxes' : boxes, + 'gt_classes': gt_classes, + 'gt_overlaps' : overlaps, + 'flipped' : False} + +if __name__ == '__main__': + d = datasets.imagenet('val', '') + res = d.roidb + from IPython import embed; embed() diff --git a/lib/datasets/vg.py b/lib/datasets/vg.py new file mode 100755 index 000000000..81ec6e28a --- /dev/null +++ b/lib/datasets/vg.py @@ -0,0 +1,374 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import os +from datasets.imdb import imdb +import datasets.ds_utils as ds_utils +import xml.etree.ElementTree as ET +import numpy as np +import scipy.sparse +import utils.cython_bbox +import cPickle +import gzip +import PIL +import json +from vg_eval import vg_eval +from fast_rcnn.config import cfg +import pdb + +class vg(imdb): + def __init__(self, version, image_set, ): + imdb.__init__(self, 'vg_' + version + '_' + image_set) + self._version = version + self._image_set = image_set + self._data_path = os.path.join(cfg.DATA_DIR, 'genome') + self._img_path = os.path.join(cfg.DATA_DIR, 'vg') + # VG specific config options + self.config = {'cleanup' : False} + + # Load classes + print("loading classes\n") + self._classes = ['__background__'] + self._class_to_ind = {} + self._class_to_ind[self._classes[0]] = 0 + with open(os.path.join(self._data_path, self._version, 'objects_vocab.txt')) as f: + count = 1 + for object in f.readlines(): + names = [n.lower().strip() for n in object.split(',')] + self._classes.append(names[0]) + for n in names: + self._class_to_ind[n] = count + count += 1 + print("loaded classes\n") + + # Load attributes + print("loading attributes\n") + self._attributes = ['__no_attribute__'] + self._attribute_to_ind = {} + self._attribute_to_ind[self._attributes[0]] = 0 + with open(os.path.join(self._data_path, self._version, 'attributes_vocab.txt')) as f: + count = 1 + for att in f.readlines(): + names = [n.lower().strip() for n in att.split(',')] + self._attributes.append(names[0]) + for n in names: + self._attribute_to_ind[n] = count + count += 1 + print("loaded attributes\n") + + # Load relations + print("loading relations\n") + self._relations = ['__no_relation__'] + self._relation_to_ind = {} + self._relation_to_ind[self._relations[0]] = 0 + with open(os.path.join(self._data_path, self._version, 'relations_vocab.txt')) as f: + count 
= 1 + for rel in f.readlines(): + names = [n.lower().strip() for n in rel.split(',')] + self._relations.append(names[0]) + for n in names: + self._relation_to_ind[n] = count + count += 1 + print("loaded relations\n") + + pdb.set_trace() + print("loading images\n") + self._image_ext = '.jpg' + self._image_index, self._id_to_dir = self._load_image_set_index() + print("loaded images\n") + pdb.set_trace() + def image_path_at(self, i): + """ + Return the absolute path to image i in the image sequence. + """ + return self.image_path_from_index(self._image_index[i]) + + def image_path_from_index(self, index): + """ + Construct an image path from the image's "index" identifier. + """ + folder = self._id_to_dir[index] + image_path = os.path.join(self._img_path, folder, + str(index) + self._image_ext) + assert os.path.exists(image_path), \ + 'Path does not exist: {}'.format(image_path) + return image_path + + def _image_split_path(self): + if self._image_set == "minitrain": + return os.path.join(self._data_path, 'train.txt') + if self._image_set == "minival": + return os.path.join(self._data_path, 'val.txt') + else: + return os.path.join(self._data_path, self._image_set+'.txt') + + def _load_image_set_index(self): + """ + Load the indexes listed in this dataset's image set file. + """ + training_split_file = self._image_split_path() + assert os.path.exists(training_split_file), \ + 'Path does not exist: {}'.format(training_split_file) + with open(training_split_file) as f: + metadata = f.readlines() + if self._image_set == "minitrain": + metadata = metadata[:1000] + elif self._image_set == "minival": + metadata = metadata[:100] + + image_index = [] + id_to_dir = {} + for line in metadata: + im_file,ann_file = line.split() + image_id = int(ann_file.split('/')[-1].split('.')[0]) + filename = self._annotation_path(image_id) + if os.path.exists(filename): + # Some images have no bboxes after object filtering, so there + # is no xml annotation for these. + tree = ET.parse(filename) + for obj in tree.findall('object'): + obj_name = obj.find('name').text.lower().strip() + if obj_name in self._class_to_ind: + # We have to actually load and check these to make sure they have + # at least one object actually in vocab + image_index.append(image_id) + id_to_dir[image_id] = im_file.split('/')[0] + break + return image_index, id_to_dir + + def gt_roidb(self): + """ + Return the database of ground-truth regions of interest. + + This function loads/saves from/to a cache file to speed up future calls. + """ + cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') + if os.path.exists(cache_file): + fid = gzip.open(cache_file,'rb') + roidb = cPickle.load(fid) + fid.close() + print '{} gt roidb loaded from {}'.format(self.name, cache_file) + return roidb + + pdb.set_trace() + + gt_roidb = [self._load_vg_annotation(index) + for index in self.image_index] + + fid = gzip.open(cache_file,'wb') + cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL) + fid.close() + print 'wrote gt roidb to {}'.format(cache_file) + return gt_roidb + + def _get_size(self, index): + return PIL.Image.open(self.image_path_from_index(index)).size + + def _annotation_path(self, index): + return os.path.join(self._data_path, 'xml', str(index) + '.xml') + + def _load_vg_annotation(self, index): + """ + Load image and bounding boxes info from XML file in the PASCAL VOC + format. 
+ """ + width, height = self._get_size(index) + filename = self._annotation_path(index) + tree = ET.parse(filename) + objs = tree.findall('object') + num_objs = len(objs) + + boxes = np.zeros((num_objs, 4), dtype=np.uint16) + gt_classes = np.zeros((num_objs), dtype=np.int32) + # Max of 16 attributes are observed in the data + gt_attributes = np.zeros((num_objs, 16), dtype=np.int32) + overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) + # "Seg" area for pascal is just the box area + seg_areas = np.zeros((num_objs), dtype=np.float32) + + # Load object bounding boxes into a data frame. + obj_dict = {} + ix = 0 + for obj in objs: + obj_name = obj.find('name').text.lower().strip() + if obj_name in self._class_to_ind: + bbox = obj.find('bndbox') + x1 = max(0,float(bbox.find('xmin').text)) + y1 = max(0,float(bbox.find('ymin').text)) + x2 = min(width-1,float(bbox.find('xmax').text)) + y2 = min(height-1,float(bbox.find('ymax').text)) + # If bboxes are not positive, just give whole image coords (there are a few examples) + if x2 < x1 or y2 < y1: + print 'Failed bbox in %s, object %s' % (filename, obj_name) + x1 = 0 + y1 = 0 + x2 = width-1 + y2 = width-1 + cls = self._class_to_ind[obj_name] + obj_dict[obj.find('object_id').text] = ix + atts = obj.findall('attribute') + n = 0 + for att in atts: + att = att.text.lower().strip() + if att in self._attribute_to_ind: + gt_attributes[ix, n] = self._attribute_to_ind[att] + n += 1 + if n >= 16: + break + boxes[ix, :] = [x1, y1, x2, y2] + gt_classes[ix] = cls + overlaps[ix, cls] = 1.0 + seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1) + ix += 1 + + overlaps = scipy.sparse.csr_matrix(overlaps) + gt_attributes = scipy.sparse.csr_matrix(gt_attributes) + + rels = tree.findall('relation') + num_rels = len(rels) + gt_relations = set() # Avoid duplicates + for rel in rels: + pred = rel.find('predicate').text + if pred: # One is empty + pred = pred.lower().strip() + if pred in self._relation_to_ind: + try: + triple = [] + triple.append(obj_dict[rel.find('subject_id').text]) + triple.append(self._relation_to_ind[pred]) + triple.append(obj_dict[rel.find('object_id').text]) + gt_relations.add(tuple(triple)) + except: + pass # Object not in dictionary + gt_relations = np.array(list(gt_relations), dtype=np.int32) + + return {'boxes' : boxes, + 'gt_classes': gt_classes, + 'gt_attributes' : gt_attributes, + 'gt_relations' : gt_relations, + 'gt_overlaps' : overlaps, + 'width' : width, + 'height': height, + 'flipped' : False, + 'seg_areas' : seg_areas} + + def evaluate_detections(self, all_boxes, output_dir): + self._write_voc_results_file(self.classes, all_boxes, output_dir) + self._do_python_eval(output_dir) + if self.config['cleanup']: + for cls in self._classes: + if cls == '__background__': + continue + filename = self._get_vg_results_file_template(output_dir).format(cls) + os.remove(filename) + + def evaluate_attributes(self, all_boxes, output_dir): + self._write_voc_results_file(self.attributes, all_boxes, output_dir) + self._do_python_eval(output_dir, eval_attributes = True) + if self.config['cleanup']: + for cls in self._attributes: + if cls == '__no_attribute__': + continue + filename = self._get_vg_results_file_template(output_dir).format(cls) + os.remove(filename) + + def _get_vg_results_file_template(self, output_dir): + filename = 'detections_' + self._image_set + '_{:s}.txt' + path = os.path.join(output_dir, filename) + return path + + def _write_voc_results_file(self, classes, all_boxes, output_dir): + for cls_ind, cls in enumerate(classes): + if 
cls == '__background__': + continue + print 'Writing "{}" vg results file'.format(cls) + filename = self._get_vg_results_file_template(output_dir).format(cls) + with open(filename, 'wt') as f: + for im_ind, index in enumerate(self.image_index): + dets = all_boxes[cls_ind][im_ind] + if dets == []: + continue + # the VOCdevkit expects 1-based indices + for k in xrange(dets.shape[0]): + f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. + format(str(index), dets[k, -1], + dets[k, 0] + 1, dets[k, 1] + 1, + dets[k, 2] + 1, dets[k, 3] + 1)) + + + def _do_python_eval(self, output_dir, pickle=True, eval_attributes = False): + # We re-use parts of the pascal voc python code for visual genome + aps = [] + nposs = [] + thresh = [] + # The PASCAL VOC metric changed in 2010 + use_07_metric = False + print 'VOC07 metric? ' + ('Yes' if use_07_metric else 'No') + if not os.path.isdir(output_dir): + os.mkdir(output_dir) + # Load ground truth + gt_roidb = self.gt_roidb() + if eval_attributes: + classes = self._attributes + else: + classes = self._classes + for i, cls in enumerate(classes): + if cls == '__background__' or cls == '__no_attribute__': + continue + filename = self._get_vg_results_file_template(output_dir).format(cls) + rec, prec, ap, scores, npos = vg_eval( + filename, gt_roidb, self.image_index, i, ovthresh=0.5, + use_07_metric=use_07_metric, eval_attributes=eval_attributes) + + # Determine per class detection thresholds that maximise f score + if npos > 1: + f = np.nan_to_num((prec*rec)/(prec+rec)) + thresh += [scores[np.argmax(f)]] + else: + thresh += [0] + aps += [ap] + nposs += [float(npos)] + print('AP for {} = {:.4f} (npos={:,})'.format(cls, ap, npos)) + if pickle: + with open(os.path.join(output_dir, cls + '_pr.pkl'), 'w') as f: + cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap, + 'scores': scores, 'npos':npos}, f) + + # Set thresh to mean for classes with poor results + thresh = np.array(thresh) + avg_thresh = np.mean(thresh[thresh!=0]) + thresh[thresh==0] = avg_thresh + if eval_attributes: + filename = 'attribute_thresholds_' + self._image_set + '.txt' + else: + filename = 'object_thresholds_' + self._image_set + '.txt' + path = os.path.join(output_dir, filename) + with open(path, 'wt') as f: + for i, cls in enumerate(classes[1:]): + f.write('{:s} {:.3f}\n'.format(cls, thresh[i])) + + weights = np.array(nposs) + weights /= weights.sum() + print('Mean AP = {:.4f}'.format(np.mean(aps))) + print('Weighted Mean AP = {:.4f}'.format(np.average(aps, weights=weights))) + print('Mean Detection Threshold = {:.3f}'.format(avg_thresh)) + print('~~~~~~~~') + print('Results:') + for ap,npos in zip(aps,nposs): + print('{:.3f}\t{:.3f}'.format(ap,npos)) + print('{:.3f}'.format(np.mean(aps))) + print('~~~~~~~~') + print('') + print('--------------------------------------------------------------') + print('Results computed with the **unofficial** PASCAL VOC Python eval code.') + print('--------------------------------------------------------------') + + +if __name__ == '__main__': + d = datasets.vg('val') + res = d.roidb + from IPython import embed; embed() diff --git a/lib/datasets/vg_eval.py b/lib/datasets/vg_eval.py new file mode 100755 index 000000000..740f0b073 --- /dev/null +++ b/lib/datasets/vg_eval.py @@ -0,0 +1,125 @@ +# -------------------------------------------------------- +# Fast/er R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Bharath Hariharan +# -------------------------------------------------------- + +import xml.etree.ElementTree as ET +import os 
+import cPickle +import numpy as np +from voc_eval import voc_ap + + + +def vg_eval( detpath, + gt_roidb, + image_index, + classindex, + ovthresh=0.5, + use_07_metric=False, + eval_attributes=False): + """rec, prec, ap, sorted_scores, npos = voc_eval( + detpath, + gt_roidb, + image_index, + classindex, + [ovthresh], + [use_07_metric]) + + Top level function that does the Visual Genome evaluation. + + detpath: Path to detections + gt_roidb: List of ground truth structs. + image_index: List of image ids. + classindex: Category index + [ovthresh]: Overlap threshold (default = 0.5) + [use_07_metric]: Whether to use VOC07's 11 point AP computation + (default False) + """ + # extract gt objects for this class + class_recs = {} + npos = 0 + for item,imagename in zip(gt_roidb,image_index): + if eval_attributes: + bbox = item['boxes'][np.where(np.any(item['gt_attributes'].toarray() == classindex, axis=1))[0], :] + else: + bbox = item['boxes'][np.where(item['gt_classes'] == classindex)[0], :] + difficult = np.zeros((bbox.shape[0],)).astype(np.bool) + det = [False] * bbox.shape[0] + npos = npos + sum(~difficult) + class_recs[str(imagename)] = {'bbox': bbox, + 'difficult': difficult, + 'det': det} + if npos == 0: + # No ground truth examples + return 0,0,0,0,npos + + # read dets + with open(detpath, 'r') as f: + lines = f.readlines() + if len(lines) == 0: + # No detection examples + return 0,0,0,0,npos + + splitlines = [x.strip().split(' ') for x in lines] + image_ids = [x[0] for x in splitlines] + confidence = np.array([float(x[1]) for x in splitlines]) + BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + sorted_scores = -np.sort(-confidence) + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + R = class_recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R['bbox'].astype(float) + + if BBGT.size > 0: + # compute overlaps + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1., 0.) + ih = np.maximum(iymax - iymin + 1., 0.) + inters = iw * ih + + # union + uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + + (BBGT[:, 2] - BBGT[:, 0] + 1.) * + (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) + + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > ovthresh: + if not R['difficult'][jmax]: + if not R['det'][jmax]: + tp[d] = 1. + R['det'][jmax] = 1 + else: + fp[d] = 1. + else: + fp[d] = 1. 
+ + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + + return rec, prec, ap, sorted_scores, npos diff --git a/trainval_net.py b/trainval_net.py index d6535f300..9d0821722 100644 --- a/trainval_net.py +++ b/trainval_net.py @@ -338,18 +338,19 @@ def __len__(self): loss_temp = 0 if epoch % args.lr_decay_step == 0: + adjust_learning_rate(optimizer, args.lr_decay_gamma) lr *= args.lr_decay_gamma - save_name = os.path.join(output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step)) - save_checkpoint({ - 'session': args.session, - 'epoch': epoch + 1, - 'model': fasterRCNN.state_dict(), - "optimizer": optimizer.state_dict(), - "lr": lr, - }, save_name) - print('save model: {}'.format(save_name)) + # pdb.set_trace() + save_name = os.path.join(output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step)) + save_checkpoint({ + 'session': args.session, + 'epoch': epoch + 1, + 'model': fasterRCNN.state_dict(), + "optimizer": optimizer.state_dict(), + }, save_name) + print('save model: {}'.format(save_name)) end = time.time() diff --git a/trainval_net_cascade.py b/trainval_net_cascade.py index 3f8d764b0..1ae22d1f3 100644 --- a/trainval_net_cascade.py +++ b/trainval_net_cascade.py @@ -251,6 +251,7 @@ def __len__(self): args.start_epoch = checkpoint['epoch'] fasterRCNN.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) + lr = optimizer.param_groups[0]['lr'] print("loaded checkpoint %s" % (load_name)) if use_multiGPU: @@ -319,21 +320,21 @@ def __len__(self): loss_temp = 0 - if (step % args.checkpoint_interval == 0) and step > 0: - # pdb.set_trace() - save_name = os.path.join(output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step)) - save_checkpoint({ - 'session': args.session, - 'epoch': epoch + 1, - 'model': fasterRCNN.state_dict(), - "optimizer": optimizer.state_dict(), - }, save_name) - print('save model: {}'.format(save_name)) - if epoch % args.lr_decay_step == 0: + adjust_learning_rate(optimizer, args.lr_decay_gamma) lr *= args.lr_decay_gamma + # pdb.set_trace() + save_name = os.path.join(output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step)) + save_checkpoint({ + 'session': args.session, + 'epoch': epoch + 1, + 'model': fasterRCNN.state_dict(), + "optimizer": optimizer.state_dict(), + }, save_name) + print('save model: {}'.format(save_name)) + end = time.time() print(end - start) From 6def46044be4f04b45704562396b39108ec12bf0 Mon Sep 17 00:00:00 2001 From: jiasen Date: Wed, 30 Aug 2017 13:59:00 -0400 Subject: [PATCH 12/13] fix rand bug that cause GPU error. It seems a torch lib bug. 
use numpy instead --- lib/model/rpn/proposal_target_layer_4.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/lib/model/rpn/proposal_target_layer_4.py b/lib/model/rpn/proposal_target_layer_4.py index 606a2fe7e..7e2f922ad 100644 --- a/lib/model/rpn/proposal_target_layer_4.py +++ b/lib/model/rpn/proposal_target_layer_4.py @@ -154,25 +154,33 @@ def _sample_rois_pytorch(self, all_rois, gt_boxes, fg_rois_per_image, rois_per_i if fg_num_rois > 0 and bg_num_rois > 0: # sampling fg fg_rois_per_this_image = min(fg_rois_per_image, fg_num_rois) - rand_num = torch.randperm(fg_num_rois).type_as(all_rois).long() + rand_num = torch.randperm(fg_num_rois).long().cuda() fg_inds = fg_inds[rand_num[:fg_rois_per_this_image]] # sampling bg bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image - rand_num = torch.floor(torch.rand(bg_rois_per_this_image).type_as(all_rois) - * bg_num_rois).long() + + # Seems torch.rand has a bug, it will generate very large number and make an error. + # We use numpy rand instead. + #rand_num = (torch.rand(bg_rois_per_this_image) * bg_num_rois).long().cuda() + rand_num = np.floor(np.random.rand(bg_rois_per_this_image) * bg_num_rois) + rand_num = torch.from_numpy(rand_num).long().cuda() bg_inds = bg_inds[rand_num] + elif fg_num_rois > 0 and bg_num_rois == 0: # sampling fg - rand_num = torch.floor(torch.rand(rois_per_image).type_as(all_rois) - * fg_num_rois).long() + #rand_num = torch.floor(torch.rand(rois_per_image) * fg_num_rois).long().cuda() + rand_num = np.floor(np.random.rand(rois_per_image) * fg_num_rois) + rand_num = torch.from_numpy(rand_num).long().cuda() fg_inds = fg_inds[rand_num] fg_rois_per_this_image = rois_per_image bg_rois_per_this_image = 0 elif bg_num_rois > 0 and fg_num_rois == 0: # sampling bg - rand_num = torch.floor(torch.rand(rois_per_image).type_as(all_rois) - * bg_num_rois).long() + #rand_num = torch.floor(torch.rand(rois_per_image) * bg_num_rois).long().cuda() + rand_num = np.floor(np.random.rand(rois_per_image) * bg_num_rois) + rand_num = torch.from_numpy(rand_num).long().cuda() + bg_inds = bg_inds[rand_num] bg_rois_per_this_image = rois_per_image fg_rois_per_this_image = 0 From 58b7d01ffcc64a2be3741e4e8655e2061c103ec8 Mon Sep 17 00:00:00 2001 From: jwyang Date: Wed, 30 Aug 2017 14:06:53 -0400 Subject: [PATCH 13/13] minor change to vg, make it compatible to roidb --- lib/datasets/imagenet.py | 1 - lib/datasets/imdb.py | 1 + lib/datasets/vg.py | 36 ++++++++++++++++++++---------------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/lib/datasets/imagenet.py b/lib/datasets/imagenet.py index 951b20768..67987883c 100644 --- a/lib/datasets/imagenet.py +++ b/lib/datasets/imagenet.py @@ -13,7 +13,6 @@ import numpy as np import scipy.sparse import scipy.io as sio -import utils.cython_bbox import cPickle import subprocess diff --git a/lib/datasets/imdb.py b/lib/datasets/imdb.py index aa994266b..92c23202c 100644 --- a/lib/datasets/imdb.py +++ b/lib/datasets/imdb.py @@ -15,6 +15,7 @@ import numpy as np import scipy.sparse from model.utils.config import cfg +import pdb ROOT_DIR = osp.join(osp.dirname(__file__), '..', '..') diff --git a/lib/datasets/vg.py b/lib/datasets/vg.py index 81ec6e28a..a35d01025 100755 --- a/lib/datasets/vg.py +++ b/lib/datasets/vg.py @@ -11,13 +11,13 @@ import xml.etree.ElementTree as ET import numpy as np import scipy.sparse -import utils.cython_bbox import cPickle import gzip import PIL import json from vg_eval import vg_eval -from fast_rcnn.config import cfg +from 
model.utils.config import cfg +import pickle import pdb class vg(imdb): @@ -31,7 +31,6 @@ def __init__(self, version, image_set, ): self.config = {'cleanup' : False} # Load classes - print("loading classes\n") self._classes = ['__background__'] self._class_to_ind = {} self._class_to_ind[self._classes[0]] = 0 @@ -43,10 +42,8 @@ def __init__(self, version, image_set, ): for n in names: self._class_to_ind[n] = count count += 1 - print("loaded classes\n") # Load attributes - print("loading attributes\n") self._attributes = ['__no_attribute__'] self._attribute_to_ind = {} self._attribute_to_ind[self._attributes[0]] = 0 @@ -58,10 +55,8 @@ def __init__(self, version, image_set, ): for n in names: self._attribute_to_ind[n] = count count += 1 - print("loaded attributes\n") # Load relations - print("loading relations\n") self._relations = ['__no_relation__'] self._relation_to_ind = {} self._relation_to_ind[self._relations[0]] = 0 @@ -73,14 +68,25 @@ def __init__(self, version, image_set, ): for n in names: self._relation_to_ind[n] = count count += 1 - print("loaded relations\n") - pdb.set_trace() - print("loading images\n") self._image_ext = '.jpg' - self._image_index, self._id_to_dir = self._load_image_set_index() - print("loaded images\n") - pdb.set_trace() + load_index_from_file = False + if os.path.exists(os.path.join(self._data_path, "vg_image_index.p")): + with open(os.path.join(self._data_path, "vg_image_index.p"), 'rb') as fp: + self._image_index = pickle.load(fp) + load_index_from_file = True + + load_id_from_file = False + if os.path.exists(os.path.join(self._data_path, "vg_id_to_dir.p")): + with open(os.path.join(self._data_path, "vg_id_to_dir.p"), 'rb') as fp: + self._id_to_dir = pickle.load(fp) + load_id_from_file = True + + if not load_index_from_file or not load_id_from_file: + self._image_index, self._id_to_dir = self._load_image_set_index() + + self._roidb_handler = self.gt_roidb + def image_path_at(self, i): """ Return the absolute path to image i in the image sequence. @@ -119,7 +125,7 @@ def _load_image_set_index(self): metadata = metadata[:1000] elif self._image_set == "minival": metadata = metadata[:100] - + image_index = [] id_to_dir = {} for line in metadata: @@ -154,8 +160,6 @@ def gt_roidb(self): print '{} gt roidb loaded from {}'.format(self.name, cache_file) return roidb - pdb.set_trace() - gt_roidb = [self._load_vg_annotation(index) for index in self.image_index]
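
Note on the index caching introduced in PATCH 13: vg.__init__ now prefers pre-serialized index files (vg_image_index.p and vg_id_to_dir.p) over re-scanning every XML annotation, and gt_roidb already memoizes its result through a gzip-compressed pickle under cache_path. Below is a minimal, self-contained sketch of that load-or-build idiom; it is not code from the repository, and the cache path and build function are illustrative assumptions only.

import os
import pickle

def load_or_build(cache_file, build_fn):
    """Return the cached object if cache_file exists; otherwise build it and cache it."""
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as fp:
            return pickle.load(fp)
    obj = build_fn()
    with open(cache_file, 'wb') as fp:
        pickle.dump(obj, fp)
    return obj

# Hypothetical usage mirroring the constructor: build_index would wrap the
# expensive _load_image_set_index() scan of the per-image XML files.
# image_index = load_or_build('data/genome/vg_image_index.p', build_index)

The patch itself only reads these pickles when they already exist and falls back to _load_image_set_index() otherwise; writing the cache files is assumed to happen offline.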