您的位置:首页 > 其它

faster rcnn源码解读2

2017-04-13 17:01 357 查看
参考链接:
http://lib.csdn.net/article/deeplearning/57865?knId=1726 http://blog.csdn.net/iamzhangzhuping/article/category/6230157 http://blog.csdn.net/u010668907/article/category/6237110 具体训练流程

首先看tools/train_faster_rcnn_alt_opt.py

(1)部分参数说明

net_name: {ZF, VGG_CNN_M_1024, VGG16}

pretrained_model: data/imagenet_models/${net_name}.v2.caffemodel

cfg_file: experiments/cfgs/faster_rcnn_alt_opt.yml

imdb_name: "voc_2007_trainval" or "voc_2007_test"

(2)cfg.TRAIN.HAS_RPN = True表示用xml提供的propoal。cfg是配置文件,它的默认值放在上面的cfg_file里,其他还可以自己写配置文件之后与默认配置文件融合。

(3)net_name是用get_solvers()找到网络。还要用到cfg的参数MODELS_DIR,

    例子是join(MODELS_DIR,net_name,'faster_rcnn_alt_opt',
'stage1_rpn_solver60k80k.pt')

(4)imdb_name在factory中被拆成‘2007’(year)和‘trainval’/‘test’(split)到类pascal_voc中产生相应的imdb

(5) 整个step的大致流程:

(ImageNet model)->stage1_rpn_train->rpn_test|(proposal_path)

(ImageNetmodel)->stage1_fast_rcnn_train-> stage2_rpn_train-> rpn_test-> stage2_fast_rcnn_train

 (6) 数据imdb和roidb。 roidb原本是imdb的一个属性,但imdb其实是为了计算roidb存在的,他所有的其他属性和方法都是为了计算roidb

#coding:utf-8
#!/usr/bin/env python

# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

"""Train a Faster R-CNN network using alternating optimization.
This tool implements the alternating optimization algorithm described in our
NIPS 2015 paper ("Faster R-CNN: Towards Real-time Object Detection with Region
Proposal Networks." Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun.)
"""

import _init_paths
from fast_rcnn.train import get_training_roidb, train_net
from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list, get_output_dir
from datasets.factory import get_imdb
from rpn.generate import imdb_proposals
import argparse
import pprint
import numpy as np
import sys, os
import multiprocessing as mp
import cPickle
import shutil

def parse_args():
"""
Parse input arguments
"""
parser = argparse.ArgumentParser(description='Train a Faster R-CNN network')
#训练时设置使用哪个GPU
parser.add_argument('--gpu', dest='gpu_id',
help='GPU device id to use [0]',
default=0, type=int)
#设置训练时使用哪种网络模型
parser.add_argument('--net_name', dest='net_name',
help='network name (e.g., "ZF")',
default=None, type=str)
#指定预训练的模型来初始化网络
parser.add_argument('--weights', dest='pretrained_model',
help='initialize with pretrained model weights',
default=None, type=str)
#加载配置文件
parser.add_argument('--cfg', dest='cfg_file',
help='optional config file',
default=None, type=str)
#训练使用的数据集
parser.add_argument('--imdb', dest='imdb_name',
help='dataset to train on',
default='voc_2007_trainval', type=str)
parser.add_argument('--set', dest='set_cfgs',
help='set config keys', default=None,
nargs=argparse.REMAINDER)

if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)

args = parser.parse_args()
return args

def get_roidb(imdb_name, rpn_file=None):
#得到图像集(image database)的名字,如pascalvoc——2007——trainval
imdb = get_imdb(imdb_name)
print 'Loaded dataset `{:s}` for training'.format(imdb.name)
#设置网络得到proposal的方法,有selective search和RPN、gt,selective search已弃用
imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD)
print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)
#判断之前是否已经有RPN网络提取得到的region proposal文件
if rpn_file is not None:
imdb.config['rpn_file'] = rpn_file
roidb = get_training_roidb(imdb)
return roidb, imdb

def get_solvers(net_name):
# Faster R-CNN Alternating Optimization
n = 'faster_rcnn_alt_opt'
# Solver for each training stage
solvers = [[net_name, n, 'stage1_rpn_solver60k80k.pt'],
[net_name, n, 'stage1_fast_rcnn_solver30k40k.pt'],
[net_name, n, 'stage2_rpn_solver60k80k.pt'],
[net_name, n, 'stage2_fast_rcnn_solver30k40k.pt']]
solvers = [os.path.join(cfg.MODELS_DIR, *s) for s in solvers]
# Iterations for each training stage
#每一轮训练的最大迭代次数,建议测试时都设置为100
max_iters = [80000, 40000, 80000, 40000]
# max_iters = [100, 100, 100, 100]
# Test prototxt for the RPN
rpn_test_prototxt = os.path.join(
cfg.MODELS_DIR, net_name, n, 'rpn_test.pt')
return solvers, max_iters, rpn_test_prototxt

# ------------------------------------------------------------------------------
# Pycaffe doesn't reliably free GPU memory when instantiated nets are discarded
# (e.g. "del net" in Python code). To work around this issue, each training
# stage is executed in a separate process using multiprocessing.Process.
# ------------------------------------------------------------------------------

def _init_caffe(cfg):
"""Initialize pycaffe in a training process.
"""

import caffe
# fix the random seeds (numpy and caffe) for reproducibility
np.random.seed(cfg.RNG_SEED)
caffe.set_random_seed(cfg.RNG_SEED)
# set up caffe
caffe.set_mode_gpu()
caffe.set_device(cfg.GPU_ID)

#训练RPN
def train_rpn(queue=None, imdb_name=None, init_model=None, solver=None,
max_iters=None, cfg=None):
"""Train a Region Proposal Network in a separate training process.
"""

# Not using any proposals, just ground-truth boxes
cfg.TRAIN.HAS_RPN = True
cfg.TRAIN.BBOX_REG = False  # applies only to Fast R-CNN bbox regression
#训练RPN时使用ground-truth
cfg.TRAIN.PROPOSAL_METHOD = 'gt'
#每次训练RPN只用一张图片
cfg.TRAIN.IMS_PER_BATCH = 1
print 'Init model: {}'.format(init_model)
print('Using config:')
pprint.pprint(cfg)

import caffe
_init_caffe(cfg)

roidb, imdb = get_roidb(imdb_name)
print 'roidb len: {}'.format(len(roidb))
output_dir = get_output_dir(imdb)
print 'Output will be saved to `{:s}`'.format(output_dir)
#开始训练RPN网络
model_paths = train_net(solver, roidb, output_dir,
pretrained_model=init_model,
max_iters=max_iters)
#只保留最后得到的网络模型
# Cleanup all but the final model
for i in model_paths[:-1]:
os.remove(i)
rpn_model_path = model_paths[-1]
# Send final model path through the multiprocessing queue
queue.put({'model_path': rpn_model_path})

#用训练完的RPN产生region proposal并存到磁盘上
def rpn_generate(queue=None, imdb_name=None, rpn_model_path=None, cfg=None,
rpn_test_prototxt=None):
"""Use a trained RPN to generate proposals.
"""

cfg.TEST.RPN_PRE_NMS_TOP_N = -1     # no pre NMS filtering
cfg.TEST.RPN_POST_NMS_TOP_N = 2000  # limit top boxes after NMS
print 'RPN model: {}'.format(rpn_model_path)
print('Using config:')
pprint.pprint(cfg)

import caffe
_init_caffe(cfg)

# NOTE: the matlab implementation computes proposals on flipped images, too.
# We compute them on the image once and then flip the already computed
# proposals. This might cause a minor loss in mAP (less proposal jittering).
imdb = get_imdb(imdb_name)
print 'Loaded dataset `{:s}` for proposal generation'.format(imdb.name)

# Load RPN and configure output directory
rpn_net = caffe.Net(rpn_test_prototxt, rpn_model_path, caffe.TEST)
output_dir = get_output_dir(imdb)
print 'Output will be saved to `{:s}`'.format(output_dir)
# Generate proposals on the imdb
rpn_proposals = imdb_proposals(rpn_net, imdb)
# Write proposals to disk and send the proposal file path through the
# multiprocessing queue
rpn_net_name = os.path.splitext(os.path.basename(rpn_model_path))[0]
rpn_proposals_path = os.path.join(
output_dir, rpn_net_name + '_proposals.pkl')
with open(rpn_proposals_path, 'wb') as f:
cPickle.dump(rpn_proposals, f, cPickle.HIGHEST_PROTOCOL)
print 'Wrote RPN proposals to {}'.format(rpn_proposals_path)
queue.put({'proposal_path': rpn_proposals_path})

#训练fast-rcnn
def train_fast_rcnn(queue=None, imdb_name=None, init_model=None, solver=None,
max_iters=None, cfg=None, rpn_file=None):
"""Train a Fast R-CNN using proposals generated by an RPN.
"""
#conv5后面现在接的是fast-rcnn
cfg.TRAIN.HAS_RPN = False           # not generating prosals on-the-fly
#roidb由刚刚训练完的RPN产生
cfg.TRAIN.PROPOSAL_METHOD = 'rpn'   # use pre-computed RPN proposals instead
#每次训练fast-rcnn使用两张图片
cfg.TRAIN.IMS_PER_BATCH = 2
print 'Init model: {}'.format(init_model)
print 'RPN proposals: {}'.format(rpn_file)
print('Using config:')
pprint.pprint(cfg)

import caffe
_init_caffe(cfg)

roidb, imdb = get_roidb(imdb_name, rpn_file=rpn_file)
output_dir = get_output_dir(imdb)
print 'Output will be saved to `{:s}`'.format(output_dir)
# Train Fast R-CNN
model_paths = train_net(solver, roidb, output_dir,
pretrained_model=init_model,
max_iters=max_iters)
# Cleanup all but the final model
for i in model_paths[:-1]:
os.remove(i)
fast_rcnn_model_path = model_paths[-1]
# Send Fast R-CNN model path over the multiprocessing queue
queue.put({'model_path': fast_rcnn_model_path})

if __name__ == '__main__':
args = parse_args()

print('Called with args:')
print(args)

if args.cfg_file is not None:
cfg_from_file(args.cfg_file)
if args.set_cfgs is not None:
cfg_from_list(args.set_cfgs)
cfg.GPU_ID = args.gpu_id

# --------------------------------------------------------------------------
# Pycaffe doesn't reliably free GPU memory when instantiated nets are
# discarded (e.g. "del net" in Python code). To work around this issue, each
# training stage is executed in a separate process using
# multiprocessing.Process.
# --------------------------------------------------------------------------

# queue for communicated results between processes
mp_queue = mp.Queue()
# solves, iters, etc. for each training stage
solvers, max_iters, rpn_test_prototxt = get_solvers(args.net_name)

print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
print 'Stage 1 RPN, init from ImageNet model'
print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

cfg.TRAIN.SNAPSHOT_INFIX = 'stage1'
mp_kwargs = dict(
queue=mp_queue,
imdb_name=args.imdb_name,
init_model=args.pretrained_model,
solver=solvers[0],
max_iters=max_iters[0],
cfg=cfg)
p = mp.Process(target=train_rpn, kwargs=mp_kwargs)
p.start()
rpn_stage1_out = mp_queue.get()
p.join()

print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
print 'Stage 1 RPN, generate proposals'
print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

mp_kwargs = dict(
queue=mp_queue,
imdb_name=args.imdb_name,
rpn_model_path=str(rpn_stage1_out['model_path']),
cfg=cfg,
rpn_test_prototxt=rpn_test_prototxt)
p = mp.Process(target=rpn_generate, kwargs=mp_kwargs)
p.start()
rpn_stage1_out['proposal_path'] = mp_queue.get()['proposal_path']
p.join()

print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
print 'Stage 1 Fast R-CNN using RPN proposals, init from ImageNet model'
print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

cfg.TRAIN.SNAPSHOT_INFIX = 'stage1'
mp_kwargs = dict(
queue=mp_queue,
imdb_name=args.imdb_name,
init_model=args.pretrained_model,
solver=solvers[1],
max_iters=max_iters[1],
cfg=cfg,
rpn_file=rpn_stage1_out['proposal_path'])
p = mp.Process(target=train_fast_rcnn, kwargs=mp_kwargs)
p.start()
fast_rcnn_stage1_out = mp_queue.get()
p.join()

print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
print 'Stage 2 RPN, init from stage 1 Fast R-CNN model'
print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

cfg.TRAIN.SNAPSHOT_INFIX = 'stage2'
mp_kwargs = dict(
queue=mp_queue,
imdb_name=args.imdb_name,
init_model=str(fast_rcnn_stage1_out['model_path']),
solver=solvers[2],
max_iters=max_iters[2],
cfg=cfg)
p = mp.Process(target=train_rpn, kwargs=mp_kwargs)
p.start()
rpn_stage2_out = mp_queue.get()
p.join()

print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
print 'Stage 2 RPN, generate proposals'
print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

mp_kwargs = dict(
queue=mp_queue,
imdb_name=args.imdb_name,
rpn_model_path=str(rpn_stage2_out['model_path']),
cfg=cfg,
rpn_test_prototxt=rpn_test_prototxt)
p = mp.Process(target=rpn_generate, kwargs=mp_kwargs)
p.start()
rpn_stage2_out['proposal_path'] = mp_queue.get()['proposal_path']
p.join()

print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
print 'Stage 2 Fast R-CNN, init from stage 2 RPN R-CNN model'
print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'

cfg.TRAIN.SNAPSHOT_INFIX = 'stage2'
mp_kwargs = dict(
queue=mp_queue,
imdb_name=args.imdb_name,
init_model=str(rpn_stage2_out['model_path']),
solver=solvers[3],
max_iters=max_iters[3],
cfg=cfg,
rpn_file=rpn_stage2_out['proposal_path'])
p = mp.Process(target=train_fast_rcnn, kwargs=mp_kwargs)
p.start()
fast_rcnn_stage2_out = mp_queue.get()
p.join()

# Create final model (just a copy of the last stage)
final_path = os.path.join(
os.path.dirname(fast_rcnn_stage2_out['model_path']),
args.net_name + '_faster_rcnn_final.caffemodel')
print 'cp {} -> {}'.format(
fast_rcnn_stage2_out['model_path'], final_path)
shutil.copy(fast_rcnn_stage2_out['model_path'], final_path)
print 'Final model: {}'.format(final_path)


lib/rpn/generate.py利用rpn网络前向计算产生proposal

#coding:utf-8
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

from fast_rcnn.config import cfg
from utils.blob import im_list_to_blob
from utils.timer import Timer
import numpy as np
import cv2

def _vis_proposals(im, dets, thresh=0.5):
"""Draw detected bounding boxes."""
inds = np.where(dets[:, -1] >= thresh)[0]
if len(inds) == 0:
return

class_name = 'obj'
im = im[:, :, (2, 1, 0)]
fig, ax = plt.subplots(figsize=(12, 12))
ax.imshow(im, aspect='equal')
for i in inds:
bbox = dets[i, :4]
score = dets[i, -1]

ax.add_patch(
plt.Rectangle((bbox[0], bbox[1]),
bbox[2] - bbox[0],
bbox[3] - bbox[1], fill=False,
edgecolor='red', linewidth=3.5)
)
ax.text(bbox[0], bbox[1] - 2,
'{:s} {:.3f}'.format(class_name, score),
bbox=dict(facecolor='blue', alpha=0.5),
fontsize=14, color='white')

ax.set_title(('{} detections with '
'p({} | box) >= {:.1f}').format(class_name, class_name,
thresh),
fontsize=14)
plt.axis('off')
plt.tight_layout()
plt.draw()

def _get_image_blob(im):
"""Converts an image into a network input.

Arguments:
im (ndarray): a color image in BGR order

Returns:
blob (ndarray): a data blob holding an image pyramid
im_scale_factors (list): list of image scales (relative to im) used
in the image pyramid
"""
im_orig = im.astype(np.float32, copy=True)
im_orig -= cfg.PIXEL_MEANS

im_shape = im_orig.shape
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])

processed_ims = []

assert len(cfg.TEST.SCALES) == 1
target_size = cfg.TEST.SCALES[0]

im_scale = float(target_size) / float(im_size_min)
# Prevent the biggest axis from being more than MAX_SIZE
if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
interpolation=cv2.INTER_LINEAR)
im_info = np.hstack((im.shape[:2], im_scale))[np.newaxis, :]
processed_ims.append(im)

# Create a blob to hold the input images
blob = im_list_to_blob(processed_ims)

return blob, im_info
#在一张图片上RPN前向计算产生region proposal
def im_proposals(net, im):
"""Generate RPN proposals on a single image."""
blobs = {}
blobs['data'], blobs['im_info'] = _get_image_blob(im)
net.blobs['data'].reshape(*(blobs['data'].shape))
net.blobs['im_info'].reshape(*(blobs['im_info'].shape))
blobs_out = net.forward(
data=blobs['data'].astype(np.float32, copy=False),
im_info=blobs['im_info'].astype(np.float32, copy=False))

scale = blobs['im_info'][0, 2]
#boxes是列表,是所有roi box的坐标
boxes = blobs_out['rois'][:, 1:].copy() / scale
scores = blobs_out['scores'].copy()
return boxes, scores

#对imdb中所有的图像计算Region Proposal
def imdb_proposals(net, imdb):
"""Generate RPN proposals on all images in an imdb."""

_t = Timer()
imdb_boxes = [[] for _ in xrange(imdb.num_images)]
for i in xrange(imdb.num_images):
im = cv2.imread(imdb.image_path_at(i))
_t.tic()
imdb_boxes[i], scores = im_proposals(net, im)
_t.toc()
print 'im_proposals: {:d}/{:d} {:.3f}s' \
.format(i + 1, imdb.num_images, _t.average_time)
if 0:
dets = np.hstack((imdb_boxes[i], scores))
# from IPython import embed; embed()
_vis_proposals(im, dets[:3, :], thresh=0.9)
plt.show()

return imdb_boxes


lib/fast_rcnn/train.py

#coding:utf-8
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

"""Train a Fast R-CNN network."""

import caffe
from fast_rcnn.config import cfg
import roi_data_layer.roidb as rdl_roidb
from utils.timer import Timer
import numpy as np
import os

from caffe.proto import caffe_pb2
import google.protobuf as pb2

class SolverWrapper(object):
"""A simple wrapper around Caffe's solver.
This wrapper gives us control over he snapshotting process, which we
use to unnormalize the learned bounding-box regression weights.
"""

def __init__(self, solver_prototxt, roidb, output_dir,
pretrained_model=None):
"""Initialize the SolverWrapper."""
self.output_dir = output_dir

if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and
cfg.TRAIN.BBOX_NORMALIZE_TARGETS):
# RPN can only use precomputed normalization because there are no
# fixed statistics to compute a priori
assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED

if cfg.TRAIN.BBOX_REG:
print 'Computing bounding-box regression targets...'
#bbox_stds是什么
self.bbox_means, self.bbox_stds = \
rdl_roidb.add_bbox_regression_targets(roidb)
print 'done'

self.solver = caffe.SGDSolver(solver_prototxt)
#加载在ImageNet上训练得到的预训练模型
if pretrained_model is not None:
print ('Loading pretrained model '
'weights from {:s}').format(pretrained_model)
self.solver.net.copy_from(pretrained_model)
#解析得到训练时的参数,学习率等
self.solver_param = caffe_pb2.SolverParameter()
with open(solver_prototxt, 'rt') as f:
pb2.text_format.Merge(f.read(), self.solver_param)
#设置输入
self.solver.net.layers[0].set_roidb(roidb)
#迭代达到10000次、20000次。。。时存结果
def snapshot(self):
"""Take a snapshot of the network after unnormalizing the learned
bounding-box regression weights. This enables easy use at test-time.
"""
net = self.solver.net

scale_bbox_params = (cfg.TRAIN.BBOX_REG and
cfg.TRAIN.BBOX_NORMALIZE_TARGETS and
net.params.has_key('bbox_pred'))

if scale_bbox_params:
# save original values
orig_0 = net.params['bbox_pred'][0].data.copy()
orig_1 = net.params['bbox_pred'][1].data.copy()

# scale and shift with bbox reg unnormalization; then save snapshot
net.params['bbox_pred'][0].data[...] = \
(net.params['bbox_pred'][0].data *
self.bbox_stds[:, np.newaxis])
net.params['bbox_pred'][1].data[...] = \
(net.params['bbox_pred'][1].data *
self.bbox_stds + self.bbox_means)

infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX
if cfg.TRAIN.SNAPSHOT_INFIX != '' else '')
filename = (self.solver_param.snapshot_prefix + infix +
'_iter_{:d}'.format(self.solver.iter) + '.caffemodel')
filename = os.path.join(self.output_dir, filename)

net.save(str(filename))
print 'Wrote snapshot to: {:s}'.format(filename)

if scale_bbox_params:
# restore net to original state
net.params['bbox_pred'][0].data[...] = orig_0
net.params['bbox_pred'][1].data[...] = orig_1
return filename
#迭代一次
def train_model(self, max_iters):
"""Network training loop."""
last_snapshot_iter = -1
timer = Timer()
model_paths = []
while self.solver.iter < max_iters:
# Make one SGD update
timer.tic()
self.solver.step(1)
timer.toc()
if self.solver.iter % (10 * self.solver_param.display) == 0:
print 'speed: {:.3f}s / iter'.format(timer.average_time)

if self.solver.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0:
last_snapshot_iter = self.solver.iter
model_paths.append(self.snapshot())

if last_snapshot_iter != self.solver.iter:
model_paths.append(self.snapshot())
return model_paths

def get_training_roidb(imdb):
"""Returns a roidb (Region of Interest database) for use in training."""
#如果设置使用水平翻转的图像
if cfg.TRAIN.USE_FLIPPED:
print 'Appending horizontally-flipped training examples...'
#把原来image database里所有的图像水平翻转加入到imdb里
imdb.append_flipped_images()
print 'done'

print 'Preparing training data...'
rdl_roidb.prepare_roidb(imdb)
print 'done'

return imdb.roidb

#过滤产生符合条件的roidb
def filter_roidb(roidb):
"""Remove roidb entries that have no usable RoIs."""

def is_valid(entry):
#满足roidb中至少有一个前景或背景的roidb才有效
# Valid images have:
#   (1) At least one foreground RoI OR
#   (2) At least one background RoI
overlaps = entry['max_overlaps']
# find boxes with sufficient overlap
fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
(overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
# image is only valid if such boxes exist
valid = len(fg_inds) > 0 or len(bg_inds) > 0
return valid

num = len(roidb)
filtered_roidb = [entry for entry in roidb if is_valid(entry)]
num_after = len(filtered_roidb)
print 'Filtered {} roidb entries: {} -> {}'.format(num - num_after,
num, num_after)
return filtered_roidb

def train_net(solver_prototxt, roidb, output_dir,
pretrained_model=None, max_iters=40000):
"""Train a Fast R-CNN network."""

roidb = filter_roidb(roidb)
sw = SolverWrapper(solver_prototxt, roidb, output_dir,
pretrained_model=pretrained_model)

print 'Solving...'
model_paths = sw.train_model(max_iters)
print 'done solving'
return model_paths


lib/roi_data_layer/roidb.py

roidb是一个重要的数据结构,roidb是一个列表,里面的每个元素是字典,对应一张图片的所有roi信息

{'image':imageindex,'width':w,'height':h,'gt_overlaps':二维array,每张图片上所有roi与各个类别的gt的overlap,'max_classes':max_cls,每个roi属于那一类别的大,'max_overlaps':每个roi与gt最大重叠率的大小}
#coding:utf-8
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

"""Transform a roidb into a trainable roidb by adding a bunch of metadata."""

import numpy as np
from fast_rcnn.config import cfg
from fast_rcnn.bbox_transform import bbox_transform
from utils.cython_bbox import bbox_overlaps
import PIL

#准备roidb
def prepare_roidb(imdb):
"""Enrich the imdb's roidb by adding some derived quantities that
are useful for training. This function precomputes the maximum
overlap, taken over ground-truth boxes, between each ROI and
each ground-truth box. The class with maximum overlap is also
recorded.
"""
#得到每幅图像的宽和高
sizes = [PIL.Image.open(imdb.image_path_at(i)).size
for i in xrange(imdb.num_images)]
roidb = imdb.roidb
#roidb是一个列表,里面的每个元素是一个字典,对应一张图片的所有roi信息
for i in xrange(len(imdb.image_index)):
#字典{'image':imageindex,'width':w,'height':h,'gt_overlaps':二维array,每张图片上所有roi与各个类别的gt的overlap
#'max_classes':max_cls,每个roi属于那一类别的最大
roidb[i]['image'] = imdb.image_path_at(i)
roidb[i]['width'] = sizes[i][0]
roidb[i]['height'] = sizes[i][1]
# need gt_overlaps as a dense array for argmax
gt_overlaps = roidb[i]['gt_overlaps'].toarray()
# max overlap with gt over classes (columns)
#传递进来的roidb尚未经过下面的取最大值的操作
#下面得到每个roi与ground-truth的bbox的最大IoU值
max_overlaps = gt_overlaps.max(axis=1)
# gt class that had the max overlap
#与哪个类别有最大IoU
max_classes = gt_overlaps.argmax(axis=1)
roidb[i]['max_classes'] = max_classes
roidb[i]['max_overlaps'] = max_overlaps
# sanity checks
# max overlap of 0 => class should be zero (background)
#确保max overlap=0的box都属于背景
zero_inds = np.where(max_overlaps == 0)[0]
assert all(max_classes[zero_inds] == 0)
# max overlap > 0 => class should not be zero (must be a fg class)
nonzero_inds = np.where(max_overlaps > 0)[0]
assert all(max_classes[nonzero_inds] != 0)

def add_bbox_regression_targets(roidb):
"""Add information needed to train bounding-box regressors."""
assert len(roidb) > 0
assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'

num_images = len(roidb)
# Infer number of classes from the number of columns in gt_overlaps
num_classes = roidb[0]['gt_overlaps'].shape[1]
for im_i in xrange(num_images):
rois = roidb[im_i]['boxes']
max_overlaps = roidb[im_i]['max_overlaps']
max_classes = roidb[im_i]['max_classes']
roidb[im_i]['bbox_targets'] = \
_compute_targets(rois, max_overlaps, max_classes)

if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
# Use fixed / precomputed "means" and "stds" instead of empirical values
means = np.tile(
np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1))
stds = np.tile(
np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1))
else:
# Compute values needed for means and stds
# var(x) = E(x^2) - E(x)^2
class_counts = np.zeros((num_classes, 1)) + cfg.EPS
sums = np.zeros((num_classes, 4))
squared_sums = np.zeros((num_classes, 4))
for im_i in xrange(num_images):
targets = roidb[im_i]['bbox_targets']
for cls in xrange(1, num_classes):
cls_inds = np.where(targets[:, 0] == cls)[0]
if cls_inds.size > 0:
class_counts[cls] += cls_inds.size
sums[cls, :] += targets[cls_inds, 1:].sum(axis=0)
squared_sums[cls, :] += \
(targets[cls_inds, 1:] ** 2).sum(axis=0)

means = sums / class_counts
stds = np.sqrt(squared_sums / class_counts - means ** 2)

print 'bbox target means:'
print means
print means[1:, :].mean(axis=0) # ignore bg class
print 'bbox target stdevs:'
print stds
print stds[1:, :].mean(axis=0) # ignore bg class

# Normalize targets
if cfg.TRAIN.BBOX_NORMALIZE_TARGETS:
print "Normalizing targets"
for im_i in xrange(num_images):
targets = roidb[im_i]['bbox_targets']
for cls in xrange(1, num_classes):
cls_inds = np.where(targets[:, 0] == cls)[0]
roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :]
roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :]
else:
print "NOT normalizing targets"

# These values will be needed for making predictions
# (the predicts will need to be unnormalized and uncentered)
return means.ravel(), stds.ravel()

#计算bbox回归时用到的回归目标值,包括region proposal相对grouynd-truth的bbox的
#坐标偏移量和长宽比例,这四个目标值都经过了归一化处理
def _compute_targets(rois, overlaps, labels):
"""Compute bounding-box regression targets for an image."""
# Indices of ground-truth ROIs
gt_inds = np.where(overlaps == 1)[0]
if len(gt_inds) == 0:
# Bail if the image has no ground-truth ROIs
#如果roidb全部是背景,返回0矩阵
return np.zeros((rois.shape[0], 5), dtype=np.float32)
# Indices of examples for which we try to make predictions
ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]

# Get IoU overlap between each ex ROI and gt ROI
ex_gt_overlaps = bbox_overlaps(
np.ascontiguousarray(rois[ex_inds, :], dtype=np.float),
np.ascontiguousarray(rois[gt_inds, :], dtype=np.float))

# Find which gt ROI each ex ROI has max overlap with:
# this will be the ex ROI's gt target
gt_assignment = ex_gt_overlaps.argmax(axis=1)
gt_rois = rois[gt_inds[gt_assignment], :]
ex_rois = rois[ex_inds, :]

targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
#矩阵第一列是类别
targets[ex_inds, 0] = labels[ex_inds]
#后面四列是边框回归目标值
targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
return targets

lib/datasets/imdb.py

#coding:utf-8
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

import os
import os.path as osp
import PIL
from utils.cython_bbox import bbox_overlaps
import numpy as np
import scipy.sparse
from fast_rcnn.config import cfg

class imdb(object):
"""Image database."""

def __init__(self, name):
#imdb的一些属性
self._name = name
self._num_classes = 0
self._classes = []
self._image_index = []
self._obj_proposer = 'selective_search'
self._roidb = None
self._roidb_handler = self.default_roidb
# Use this dict for storing dataset specific config options
self.config = {}

@property
def name(self):
return self._name

@property
def num_classes(self):
return len(self._classes)

@property
def classes(self):
return self._classes

@property
def image_index(self):
return self._image_index

@property
def roidb_handler(self):
return self._roidb_handler

@roidb_handler.setter
def roidb_handler(self, val):
self._roidb_handler = val

def set_proposal_method(self, method):
method = eval('self.' + method + '_roidb')
self.roidb_handler = method

@property
def roidb(self):
# A roidb is a list of dictionaries, each with the following keys:
#   boxes
#   gt_overlaps
#   gt_classes
#   flipped
if self._roidb is not None:
return self._roidb
self._roidb = self.roidb_handler()
return self._roidb

@property
def cache_path(self):
cache_path = osp.abspath(osp.join(cfg.DATA_DIR, 'cache'))
if not os.path.exists(cache_path):
os.makedirs(cache_path)
return cache_path

@property
def num_images(self):
return len(self.image_index)

def image_path_at(self, i):
raise NotImplementedError

def default_roidb(self):
raise NotImplementedError

def evaluate_detections(self, all_boxes, output_dir=None):
"""
all_boxes is a list of length number-of-classes.
Each list element is a list of length number-of-images.
Each of those list elements is either an empty list []
or a numpy array of detection.

all_boxes[class][image] = [] or np.array of shape #dets x 5
"""
raise NotImplementedError

def _get_widths(self):
return [PIL.Image.open(self.image_path_at(i)).size[0]
for i in xrange(self.num_images)]
#对所有原始图像进行水平翻转
def append_flipped_images(self):
num_images = self.num_images
#得到所有图像的宽度存入list
widths = self._get_widths()
for i in xrange(num_images):
#复制每张图中所有的box坐标,这个boxes是一个列表,类似[(x1min,y1min,x1max,y1max),]
boxes = self.roidb[i]['boxes'].copy()
oldx1 = boxes[:, 0].copy()
oldx2 = boxes[:, 2].copy()
#水平翻转只用对横坐标做变换,容易得到x'=width-x
boxes[:, 0] = widths[i] - oldx2 - 1
boxes[:, 2] = widths[i] - oldx1 - 1
assert (boxes[:, 2] >= boxes[:, 0]).all()
#entry是一个字典,存了boxes坐标,和ground-truth的重叠率,类别,是否由水平翻转得到,
#重叠率和类别不会变,直接复制i
entry = {'boxes' : boxes,
'gt_overlaps' : self.roidb[i]['gt_overlaps'],
'gt_classes' : self.roidb[i]['gt_classes'],
'flipped' : True}
#把水平翻转得到的数据加入roidb
self.roidb.append(entry)
#数量变为原来的2倍
self._image_index = self._image_index * 2

def evaluate_recall(self, candidate_boxes=None, thresholds=None,
area='all', limit=None):
"""Evaluate detection proposal recall metrics.

Returns:
results: dictionary of results with keys
'ar': average recall
'recalls': vector recalls at each IoU overlap threshold
'thresholds': vector of IoU overlap thresholds
'gt_overlaps': vector of all ground-truth overlaps
"""
# Record max overlap value for each gt box
# Return vector of overlap values
areas = { 'all': 0, 'small': 1, 'medium': 2, 'large': 3,
'96-128': 4, '128-256': 5, '256-512': 6, '512-inf': 7}
area_ranges = [ [0**2, 1e5**2],    # all
[0**2, 32**2],     # small
[32**2, 96**2],    # medium
[96**2, 1e5**2],   # large
[96**2, 128**2],   # 96-128
[128**2, 256**2],  # 128-256
[256**2, 512**2],  # 256-512
[512**2, 1e5**2],  # 512-inf
]
assert areas.has_key(area), 'unknown area range: {}'.format(area)
area_range = area_ranges[areas[area]]
gt_overlaps = np.zeros(0)
num_pos = 0
for i in xrange(self.num_images):
# Checking for max_overlaps == 1 avoids including crowd annotations
# (...pretty hacking :/)
max_gt_overlaps = self.roidb[i]['gt_overlaps'].toarray().max(axis=1)
gt_inds = np.where((self.roidb[i]['gt_classes'] > 0) &
(max_gt_overlaps == 1))[0]
gt_boxes = self.roidb[i]['boxes'][gt_inds, :]
gt_areas = self.roidb[i]['seg_areas'][gt_inds]
valid_gt_inds = np.where((gt_areas >= area_range[0]) &
(gt_areas <= area_range[1]))[0]
gt_boxes = gt_boxes[valid_gt_inds, :]
num_pos += len(valid_gt_inds)

if candidate_boxes is None:
# If candidate_boxes is not supplied, the default is to use the
# non-ground-truth boxes from this roidb
non_gt_inds = np.where(self.roidb[i]['gt_classes'] == 0)[0]
boxes = self.roidb[i]['boxes'][non_gt_inds, :]
else:
boxes = candidate_boxes[i]
if boxes.shape[0] == 0:
continue
if limit is not None and boxes.shape[0] > limit:
boxes = boxes[:limit, :]

overlaps = bbox_overlaps(boxes.astype(np.float),
gt_boxes.astype(np.float))

_gt_overlaps = np.zeros((gt_boxes.shape[0]))
for j in xrange(gt_boxes.shape[0]):
# find which proposal box maximally covers each gt box
argmax_overlaps = overlaps.argmax(axis=0)
# and get the iou amount of coverage for each gt box
max_overlaps = overlaps.max(axis=0)
# find which gt box is 'best' covered (i.e. 'best' = most iou)
gt_ind = max_overlaps.argmax()
gt_ovr = max_overlaps.max()
assert(gt_ovr >= 0)
# find the proposal box that covers the best covered gt box
box_ind = argmax_overlaps[gt_ind]
# record the iou coverage of this gt box
_gt_overlaps[j] = overlaps[box_ind, gt_ind]
assert(_gt_overlaps[j] == gt_ovr)
# mark the proposal box and the gt box as used
overlaps[box_ind, :] = -1
overlaps[:, gt_ind] = -1
# append recorded iou coverage level
gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps))

gt_overlaps = np.sort(gt_overlaps)
if thresholds is None:
step = 0.05
thresholds = np.arange(0.5, 0.95 + 1e-5, step)
recalls = np.zeros_like(thresholds)
# compute recall for each iou threshold
for i, t in enumerate(thresholds):
recalls[i] = (gt_overlaps >= t).sum() / float(num_pos)
# ar = 2 * np.trapz(recalls, thresholds)
ar = recalls.mean()
return {'ar': ar, 'recalls': recalls, 'thresholds': thresholds,
'gt_overlaps': gt_overlaps}

def create_roidb_from_box_list(self, box_list, gt_roidb):
#断言box_list的数目和图像数目一样,这里box_list[i]里存的是相应第i张图像里所有的bbox的坐标
assert len(box_list) == self.num_images, \
'Number of boxes must match number of ground-truth images'
#重要,roidb是一个列表,列表中的每一个元素是字典,存储了每张图象的boxes,gt_classes,'gt_overlaps',是否flipped信息
roidb = []
for i in xrange(self.num_images):
boxes = box_list[i]
num_boxes = boxes.shape[0]
#计算每个box和每一类目标的重叠率
overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32)

if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0:
#取得ground-truth里bbox的坐标
gt_boxes = gt_roidb[i]['boxes']
#取得每个bbox对应的类别
gt_classes = gt_roidb[i]['gt_classes']
#计算roidb的bbox与ground-truth的bbox的重叠率
gt_overlaps = bbox_overlaps(boxes.astype(np.float),
gt_boxes.astype(np.float))
#与那一类的重叠率最大
argmaxes = gt_overlaps.argmax(axis=1)
maxes = gt_overlaps.max(axis=1)
I = np.where(maxes > 0)[0]
overlaps[I, gt_classes[argmaxes[I]]] = maxes[I]

overlaps = scipy.sparse.csr_matrix(overlaps)
roidb.append({
'boxes' : boxes,
'gt_classes' : np.zeros((num_boxes,), dtype=np.int32),
'gt_overlaps' : overlaps,
'flipped' : False,
'seg_areas' : np.zeros((num_boxes,), dtype=np.float32),
})
return roidb

@staticmethod
def merge_roidbs(a, b):
assert len(a) == len(b)
for i in xrange(len(a)):
a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes']))
a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'],
b[i]['gt_classes']))
a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'],
b[i]['gt_overlaps']])
a[i]['seg_areas'] = np.hstack((a[i]['seg_areas'],
b[i]['seg_areas']))
return a

def competition_mode(self, on):
"""Turn competition mode on or off."""
pass


lib/datasets/pascal_voc.py

#CODING:UTF-8
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

import os
from datasets.imdb import imdb
import datasets.ds_utils as ds_utils
import xml.etree.ElementTree as ET
import numpy as np
import scipy.sparse
import scipy.io as sio
import utils.cython_bbox
import cPickle
import subprocess
import uuid
from voc_eval import voc_eval
from fast_rcnn.config import cfg

class pascal_voc(imdb):
def __init__(self, image_set, year, devkit_path=None):
imdb.__init__(self, 'voc_' + year + '_' + image_set)
self._year = year
self._image_set = image_set
self._devkit_path = self._get_default_path() if devkit_path is None \
else devkit_path
self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
self._classes = ('__background__', # always index 0
'aeroplane', 'bicycle', 'bird', 'boat',
'bottle', 'bus', 'car', 'cat', 'chair',
'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant',
'sheep', 'sofa', 'train', 'tvmonitor')
self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes)))
self._image_ext = '.jpg'
self._image_index = self._load_image_set_index()
# Default to roidb handler
self._roidb_handler = self.selective_search_roidb
self._salt = str(uuid.uuid4())
self._comp_id = 'comp4'

# PASCAL specific config options
self.config = {'cleanup'     : True,
'use_salt'    : True,
'use_diff'    : False,
'matlab_eval' : False,
'rpn_file'    : None,
'min_size'    : 2}

assert os.path.exists(self._devkit_path), \
'VOCdevkit path does not exist: {}'.format(self._devkit_path)
assert os.path.exists(self._data_path), \
'Path does not exist: {}'.format(self._data_path)

def image_path_at(self, i):
"""
Return the absolute path to image i in the image sequence.
"""
return self.image_path_from_index(self._image_index[i])

def image_path_from_index(self, index):
"""
Construct an image path from the image's "index" identifier.
"""
image_path = os.path.join(self._data_path, 'JPEGImages',
index + self._image_ext)
assert os.path.exists(image_path), \
'Path does not exist: {}'.format(image_path)
return image_path

def _load_image_set_index(self):
"""
Load the indexes listed in this dataset's image set file.
"""
# Example path to image set file:
# self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt
image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main',
self._image_set + '.txt')
assert os.path.exists(image_set_file), \
'Path does not exist: {}'.format(image_set_file)
with open(image_set_file) as f:
image_index = [x.strip() for x in f.readlines()]
return image_index

def _get_default_path(self):
"""
Return the default path where PASCAL VOC is expected to be installed.
"""
return os.path.join(cfg.DATA_DIR, 'VOCdevkit' + self._year)

def gt_roidb(self):
"""
Return the database of ground-truth regions of interest.

This function loads/saves from/to a cache file to speed up future calls.
"""
cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
if os.path.exists(cache_file):
with open(cache_file, 'rb') as fid:
roidb = cPickle.load(fid)
print '{} gt roidb loaded from {}'.format(self.name, cache_file)
return roidb

gt_roidb = [self._load_pascal_annotation(index)
for index in self.image_index]
with open(cache_file, 'wb') as fid:
cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL)
print 'wrote gt roidb to {}'.format(cache_file)

return gt_roidb

def selective_search_roidb(self):
"""
Return the database of selective search regions of interest.
Ground-truth ROIs are also included.

This function loads/saves from/to a cache file to speed up future calls.
"""
cache_file = os.path.join(self.cache_path,
self.name + '_selective_search_roidb.pkl')

if os.path.exists(cache_file):
with open(cache_file, 'rb') as fid:
roidb = cPickle.load(fid)
print '{} ss roidb loaded from {}'.format(self.name, cache_file)
return roidb

if int(self._year) == 2007 or self._image_set != 'test':
gt_roidb = self.gt_roidb()
ss_roidb = self._load_selective_search_roidb(gt_roidb)
roidb = imdb.merge_roidbs(gt_roidb, ss_roidb)
else:
roidb = self._load_selective_search_roidb(None)
with open(cache_file, 'wb') as fid:
cPickle.dump(roidb, fid, cPickle.HIGHEST_PROTOCOL)
print 'wrote ss roidb to {}'.format(cache_file)

return roidb

def rpn_roidb(self):
if int(self._year) == 2007 or self._image_set != 'test':
gt_roidb = self.gt_roidb()
rpn_roidb = self._load_rpn_roidb(gt_roidb)
roidb = imdb.merge_roidbs(gt_roidb, rpn_roidb)
else:
roidb = self._load_rpn_roidb(None)

return roidb

def _load_rpn_roidb(self, gt_roidb):
filename = self.config['rpn_file']
print 'loading {}'.format(filename)
assert os.path.exists(filename), \
'rpn data not found at: {}'.format(filename)
#得到rpn产生的box
with open(filename, 'rb') as f:
box_list = cPickle.load(f)
#由box_list产生roidb
return self.create_roidb_from_box_list(box_list, gt_roidb)

def _load_selective_search_roidb(self, gt_roidb):
filename = os.path.abspath(os.path.join(cfg.DATA_DIR,
'selective_search_data',
self.name + '.mat'))
assert os.path.exists(filename), \
'Selective search data not found at: {}'.format(filename)
raw_data = sio.loadmat(filename)['boxes'].ravel()

box_list = []
for i in xrange(raw_data.shape[0]):
boxes = raw_data[i][:, (1, 0, 3, 2)] - 1
keep = ds_utils.unique_boxes(boxes)
boxes = boxes[keep, :]
keep = ds_utils.filter_small_boxes(boxes, self.config['min_size'])
boxes = boxes[keep, :]
box_list.append(boxes)

return self.create_roidb_from_box_list(box_list, gt_roidb)

def _load_pascal_annotation(self, index):
"""
Load image and bounding boxes info from XML file in the PASCAL VOC
format.
"""
#xml文件名
filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
#解析xml
tree = ET.parse(filename)
#找到所有object属性项
objs = tree.findall('object')
if not self.config['use_diff']:
# Exclude the samples labeled as difficult
non_diff_objs = [
obj for obj in objs if int(obj.find('difficult').text) == 0]
# if len(non_diff_objs) != len(objs):
#     print 'Removed {} difficult objects'.format(
#         len(objs) - len(non_diff_objs))
objs = non_diff_objs
num_objs = len(objs)
#boxes存储这张图片上所有bbox的坐标
boxes = np.zeros((num_objs, 4), dtype=np.uint16)
#gt_classes存储每个bbox的类别
gt_classes = np.zeros((num_objs), dtype=np.int32)
overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
# "Seg" area for pascal is just the box area
seg_areas = np.zeros((num_objs), dtype=np.float32)

# Load object bounding boxes into a data frame.
for ix, obj in enumerate(objs):
bbox = obj.find('bndbox')
# Make pixel indexes 0-based
x1 = float(bbox.find('xmin').text) - 1
y1 = float(bbox.find('ymin').text) - 1
x2 = float(bbox.find('xmax').text) - 1
y2 = float(bbox.find('ymax').text) - 1
#从类别名字映射到ID
cls = self._class_to_ind[obj.find('name').text.lower().strip()]
boxes[ix, :] = [x1, y1, x2, y2]
gt_classes[ix] = cls
#因为是groud-truth,所以重叠率设置为1
overlaps[ix, cls] = 1.0
seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)

overlaps = scipy.sparse.csr_matrix(overlaps)
#返回一个字典
return {'boxes' : boxes,
'gt_classes': gt_classes,
'gt_overlaps' : overlaps,
'flipped' : False,
'seg_areas' : seg_areas}

def _get_comp_id(self):
comp_id = (self._comp_id + '_' + self._salt if self.config['use_salt']
else self._comp_id)
return comp_id

def _get_voc_results_file_template(self):
# VOCdevkit/results/VOC2007/Main/<comp_id>_det_test_aeroplane.txt
filename = self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt'
path = os.path.join(
self._devkit_path,
'results',
'VOC' + self._year,
'Main',
filename)
return path

def _write_voc_results_file(self, all_boxes):
for cls_ind, cls in enumerate(self.classes):
if cls == '__background__':
continue
print 'Writing {} VOC results file'.format(cls)
filename = self._get_voc_results_file_template().format(cls)
with open(filename, 'wt') as f:
for im_ind, index in enumerate(self.image_index):
dets = all_boxes[cls_ind][im_ind]
if dets == []:
continue
# the VOCdevkit expects 1-based indices
for k in xrange(dets.shape[0]):
f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.
format(index, dets[k, -1],
dets[k, 0] + 1, dets[k, 1] + 1,
dets[k, 2] + 1, dets[k, 3] + 1))

def _do_python_eval(self, output_dir = 'output'):
annopath = os.path.join(
self._devkit_path,
'VOC' + self._year,
'Annotations',
'{:s}.xml')
imagesetfile = os.path.join(
self._devkit_path,
'VOC' + self._year,
'ImageSets',
'Main',
self._image_set + '.txt')
cachedir = os.path.join(self._devkit_path, 'annotations_cache')
aps = []
# The PASCAL VOC metric changed in 2010
use_07_metric = True if int(self._year) < 2010 else False
print 'VOC07 metric? ' + ('Yes' if use_07_metric else 'No')
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
for i, cls in enumerate(self._classes):
if cls == '__background__':
continue
filename = self._get_voc_results_file_template().format(cls)
rec, prec, ap = voc_eval(
filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5,
use_07_metric=use_07_metric)
aps += [ap]
print('AP for {} = {:.4f}'.format(cls, ap))
with open(os.path.join(output_dir, cls + '_pr.pkl'), 'w') as f:
cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
print('Mean AP = {:.4f}'.format(np.mean(aps)))
print('~~~~~~~~')
print('Results:')
for ap in aps:
print('{:.3f}'.format(ap))
print('{:.3f}'.format(np.mean(aps)))
print('~~~~~~~~')
print('')
print('--------------------------------------------------------------')
print('Results computed with the **unofficial** Python eval code.')
print('Results should be very close to the official MATLAB eval code.')
print('Recompute with `./tools/reval.py --matlab ...` for your paper.')
print('-- Thanks, The Management')
print('--------------------------------------------------------------')

def _do_matlab_eval(self, output_dir='output'):
print '-----------------------------------------------------'
print 'Computing results with the official MATLAB eval code.'
print '-----------------------------------------------------'
path = os.path.join(cfg.ROOT_DIR, 'lib', 'datasets',
'VOCdevkit-matlab-wrapper')
cmd = 'cd {} && '.format(path)
cmd += '{:s} -nodisplay -nodesktop '.format(cfg.MATLAB)
cmd += '-r "dbstop if error; '
cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"' \
.format(self._devkit_path, self._get_comp_id(),
self._image_set, output_dir)
print('Running:\n{}'.format(cmd))
status = subprocess.call(cmd, shell=True)

def evaluate_detections(self, all_boxes, output_dir):
self._write_voc_results_file(all_boxes)
self._do_python_eval(output_dir)
if self.config['matlab_eval']:
self._do_matlab_eval(output_dir)
if self.config['cleanup']:
for cls in self._classes:
if cls == '__background__':
continue
filename = self._get_voc_results_file_template().format(cls)
os.remove(filename)

def competition_mode(self, on):
if on:
self.config['use_salt'] = False
self.config['cleanup'] = False
else:
self.config['use_salt'] = True
self.config['cleanup'] = True

if __name__ == '__main__':
from datasets.pascal_voc import pascal_voc
d = pascal_voc('trainval', '2007')
res = d.roidb
from IPython import embed; embed()
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: