faster rcnn 源码学习-------数据读入及RoIDataLayer相关模块解读 + Train的流程
2018-03-28 10:11
1581 查看
参考:https://www.cnblogs.com/Dzhen/p/6845852.html下面我和大家一起从训练最开始学习作者如何将原始数据读入并通过RoIDataLayer转化成网络训练所需的数据的总体过程。训练从./tools/train_net.py开始,进入主函数,我们只关注跟数据有关的模块。train_net.pyif __name__ == '__main__':
args = parse_args()
print('Called with args:')
print(args)
if args.cfg_file is not None:
cfg_from_file(args.cfg_file)
if args.set_cfgs is not None:
cfg_from_list(args.set_cfgs)
cfg.GPU_ID = args.gpu_id
print('Using config:')
pprint.pprint(cfg)
if not args.randomize:
# fix the random seeds (numpy and caffe) for reproducibility
np.random.seed(cfg.RNG_SEED)
caffe.set_random_seed(cfg.RNG_SEED)
# set up caffe
caffe.set_mode_gpu()
caffe.set_device(args.gpu_ivi
imdb, roidb = combined_roidb(args.imdb_name)
print '{:d} roidb entries'.format(len(roidb))
output_dir = get_output_dir(imdb)
print 'Output will be saved to `{:s}`'.format(output_dir)
train_net(args.solver, roidb, output_dir,
pretrained_model=args.pretrained_model,
max_iters=args.max_iters)
首先是imdb, roidb = combined_roidb(args.imdb_name)语句,传入的参数imdb_name默认是“voc_2007_trainval”,这只是一个数据集的名字而已,进入该文件下的combined_roidb函数,该函数首先有个内部函数get_roidb,下面有调用,进入这函数第一行imdb = get_imdb(imdb_name),这个get_imdb为factory中的函数,细看可以知道get_imdb这个函数通过给数据集的名字返回了该数据集对应的类的对象。
def combined_roidb(imdb_names):
def get_roidb(imdb_name):
imdb = get_imdb(imdb_name)
print 'Loaded dataset `{:s}` for training'.format(imdb.name)
imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD)
print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)
roidb = get_training_roidb(imdb)
return roidb
roidb格式:<type 'list'>: [{'boxes': array([[0, 0, 0, 0]], dtype=uint16), 'gt_classes': array([15], dtype=int32), 'gt_overlaps': <1x21 sparse matrix of type '<type 'numpy.float32'>'
with 1 stored elements in Compressed Sparse Row format>, 'seg_areas': array([1.], dtype=float32), 'flipped': False}]
一共21类,当前图片gt对应的是第15类
roidbs = [get_roidb(s) for s in imdb_names.split('+')]
roidb = roidbs[0] 针对第一个数据集 实际上还是包含很多roidb
if len(roidbs) > 1:
for r in roidbs[1:]:
roidb.extend(r)
imdb = datasets.imdb.imdb(imdb_names)
else:
imdb = get_imdb(imdb_names) 再取imdb
return imdb, roidb
factory.py中:def get_imdb(name):
"""Get an imdb (image database) by name."""
if not __sets.has_key(name):
raise KeyError('Unknown dataset: {}'.format(name))
return __sets[name]()
这里的sets是个字典,在factory.py中创建(何时调用的创建函数?),字典key对应pascal_voc或coco等的一个实例:__sets = {}
from datasets.pascal_voc import pascal_voc
from datasets.coco import coco
import numpy as np
# Set up voc_<year>_<split> using selective search "fast" mode
for year in ['2007', '2012']:
for split in ['train', 'val', 'trainval', 'test']:
name = 'voc_{}_{}'.format(year, split) 占位 例如voc_2007_test
__sets[name] = (lambda split=split, year=year: pascal_voc(split, year))
# Set up coco_2014_<split>
for year in ['2014']:
for split in ['train', 'val', 'minival', 'valminusminival']:
name = 'coco_{}_{}'.format(year, split)
__sets[name] = (lambda split=split, year=year: coco(split, year))
# Set up coco_2015_<split>
for year in ['2015']:
for split in ['test', 'test-dev']:
name = 'coco_{}_{}'.format(year, split)
__sets[name] = (lambda split=split, year=year: coco(split, year))
pascal_voc类在pascal_voc.py中定义:class pascal_voc(imdb):
def __init__(self, image_set, year, devkit_path=None):
imdb.__init__(self, 'voc_' + year + '_' + image_set)
self._year = year
self._image_set = image_set
self._devkit_path = self._get_default_path() if devkit_path is None \
else devkit_path
self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
# self._classes = ('__background__', # always index 0
# 'aeroplane', 'bicycle', 'bird', 'boat',
# 'bottle', 'bus', 'car', 'cat', 'chair',
# 'cow', 'diningtable', 'dog', 'horse',
# 'motorbike', 'person', 'pottedplant',
# 'sheep', 'sofa', 'train', 'tvmonitor')
self._classes = ('__background__', # always index 0
'hammerhead', 'boxing', 'barber_scissors', 'spray',
'hammer', 'slingshot', 'blade', 'plumb', 'knife_toy',
'knife_tile', 'plastic_scissors', 'kitchen_knife', 'gun',
'scissors', 'fullfold_knife', 'knife_edge',
'saw', 'blunt', 'knife', 'open_scissors')
self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes)))
self._image_ext = '.jpg'
self._image_index = self._load_image_set_index() 这里读取test.txt/train.txt得知需要用到哪些图片
# Default to roidb handler
self._roidb_handler = self.selective_search_roidb
self._salt = str(uuid.uuid4())
self._comp_id = 'comp4'
# PASCAL specific config options
self.config = {'cleanup' : True,
'use_salt' : True,
'use_diff' : False,
'matlab_eval' : False,
'rpn_file' : None,
'min_size' : 2}
assert os.path.exists(self._devkit_path), \
'VOCdevkit path does not exist: {}'.format(self._devkit_path)
assert os.path.exists(self._data_path), \
'Path does not exist: {}'.format(self._data_path)
def image_path_at(self, i):
"""
Return the absolute path to image i in the image sequence.
"""
return self.image_path_from_index(self._image_index[i])
def image_path_from_index(self, index):
"""
Construct an image path from the image's "index" identifier.
"""
image_path = os.path.join(self._data_path, 'JPEGImages',
index + self._image_ext)
assert os.path.exists(image_path), \
'Path does not exist: {}'.format(image_path)
return image_path
def _load_image_set_index(self):
"""
Load the indexes listed in this dataset's image set file.
"""
# Example path to image set file:
# self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt
image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main',
self._image_set + '.txt')
assert os.path.exists(image_set_file), \
'Path does not exist: {}'.format(image_set_file)
with open(image_set_file) as f:
image_index = [x.strip() for x in f.readlines()]
return image_index
def _get_default_path(self):
"""
Return the default path where PASCAL VOC is expected to be installed.
"""
return os.path.join(cfg.DATA_DIR, 'VOCdevkit' + self._year)
def gt_roidb(self):
"""
Return the database of ground-truth regions of interest.
This function loads/saves from/to a cache file to speed up future calls.
"""
cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
if os.path.exists(cache_file):
with open(cache_file, 'rb') as fid:
roidb = cPickle.load(fid)
print '{} gt roidb loaded from {}'.format(self.name, cache_file)
return roidb
gt_roidb = [self._load_pascal_annotation(index)
for index in self.image_index]
with open(cache_file, 'wb') as fid:
cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL)
print 'wrote gt roidb to {}'.format(cache_file)
return gt_roidb
def selective_search_roidb(self):
"""
Return the database of selective search regions of interest.
Ground-truth ROIs are also included.
This function loads/saves from/to a cache file to speed up future calls.
"""
cache_file = os.path.join(self.cache_path,
self.name + '_selective_search_roidb.pkl')
if os.path.exists(cache_file):
with open(cache_file, 'rb') as fid:
roidb = cPickle.load(fid)
print '{} ss roidb loaded from {}'.format(self.name, cache_file)
return roidb
if int(self._year) == 2007 or self._image_set != 'test':
gt_roidb = self.gt_roidb()
ss_roidb = self._load_selective_search_roidb(gt_roidb)
roidb = imdb.merge_roidbs(gt_roidb, ss_roidb)
else:
roidb = self._load_selective_search_roidb(None)
with open(cache_file, 'wb') as fid:
cPickle.dump(roidb, fid, cPickle.HIGHEST_PROTOCOL)
print 'wrote ss roidb to {}'.format(cache_file)
return roidb
def rpn_roidb(self):
if int(self._year) == 2007 or self._image_set != 'test':
gt_roidb = self.gt_roidb()
rpn_roidb = self._load_rpn_roidb(gt_roidb)
roidb = imdb.merge_roidbs(gt_roidb, rpn_roidb)
else:
roidb = self._load_rpn_roidb(None)
return roidb
def _load_rpn_roidb(self, gt_roidb):
filename = self.config['rpn_file']
print 'loading {}'.format(filename)
assert os.path.exists(filename), \
'rpn data not found at: {}'.format(filename)
with open(filename, 'rb') as f:
box_list = cPickle.load(f)
return self.create_roidb_from_box_list(box_list, gt_roidb)
def _load_selective_search_roidb(self, gt_roidb):
filename = os.path.abspath(os.path.join(cfg.DATA_DIR,
'selective_search_data',
self.name + '.mat'))
assert os.path.exists(filename), \
'Selective search data not found at: {}'.format(filename)
raw_data = sio.loadmat(filename)['boxes'].ravel()
box_list = []
for i in xrange(raw_data.shape[0]):
boxes = raw_data[i][:, (1, 0, 3, 2)] - 1
keep = ds_utils.unique_boxes(boxes)
boxes = boxes[keep, :]
keep = ds_utils.filter_small_boxes(boxes, self.config['min_size'])
boxes = boxes[keep, :]
box_list.append(boxes)
return self.create_roidb_from_box_list(box_list, gt_roidb)
def _load_pascal_annotation(self, index):
"""
Load image and bounding boxes info from XML file in the PASCAL VOC
format.
"""
filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
tree = ET.parse(filename)
objs = tree.findall('object')
if not self.config['use_diff']:
# Exclude the samples labeled as difficult
non_diff_objs = [
obj for obj in objs if int(obj.find('difficult').text) == 0]
# if len(non_diff_objs) != len(objs):
# print 'Removed {} difficult objects'.format(
# len(objs) - len(non_diff_objs))
objs = non_diff_objs
num_objs = len(objs)
boxes = np.zeros((num_objs, 4), dtype=np.uint16)
gt_classes = np.zeros((num_objs), dtype=np.int32)
overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
# "Seg" area for pascal is just the box area
seg_areas = np.zeros((num_objs), dtype=np.float32)
# Load object bounding boxes into a data frame.
for ix, obj in enumerate(objs):
bbox = obj.find('bndbox')
# Make pixel indexes 0-based
x1 = float(bbox.find('xmin').text) - 1
y1 = float(bbox.find('ymin').text) - 1
x2 = float(bbox.find('xmax').text) - 1
y2 = float(bbox.find('ymax').text) - 1
cls = self._class_to_ind[obj.find('name').text.lower().strip()]
boxes[ix, :] = [x1, y1, x2, y2]
gt_classes[ix] = cls
overlaps[ix, cls] = 1.0
seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)
overlaps = scipy.sparse.csr_matrix(overlaps)
return {'boxes' : boxes,
'gt_classes': gt_classes,
'gt_overlaps' : overlaps,
'flipped' : False,
'seg_areas' : seg_areas}
def _get_comp_id(self):
comp_id = (self._comp_id + '_' + self._salt if self.config['use_salt']
else self._comp_id)
return comp_id
def _get_voc_results_file_template(self):
# VOCdevkit/results/VOC2007/Main/<comp_id>_det_test_aeroplane.txt
filename = self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt'
path = os.path.join(
self._devkit_path,
'results',
'VOC' + self._year,
'Main',
filename)
return path
def _write_voc_results_file(self, all_boxes):
for cls_ind, cls in enumerate(self.classes):
if cls == '__background__':
continue
print 'Writing {} VOC results file'.format(cls)
filename = self._get_voc_results_file_template().format(cls)
with open(filename, 'wt') as f:
for im_ind, index in enumerate(self.image_index):
dets = all_boxes[cls_ind][im_ind]
if dets == []:
continue
# the VOCdevkit expects 1-based indices
for k in xrange(dets.shape[0]):
f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.
format(index, dets[k, -1],
dets[k, 0] + 1, dets[k, 1] + 1,
dets[k, 2] + 1, dets[k, 3] + 1))
def _do_python_eval(self, output_dir = 'output'):
annopath = os.path.join(
self._devkit_path,
'VOC' + self._year,
'Annotations',
'{:s}.xml')
imagesetfile = os.path.join(
self._devkit_path,
'VOC' + self._year,
'ImageSets',
'Main',
self._image_set + '.txt')
cachedir = os.path.join(self._devkit_path, 'annotations_cache')
aps = []
# The PASCAL VOC metric changed in 2010
use_07_metric = True if int(self._year) < 2010 else False
print 'VOC07 metric? ' + ('Yes' if use_07_metric else 'No')
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
for i, cls in enumerate(self._classes):
if cls == '__background__':
continue
filename = self._get_voc_results_file_template().format(cls)
rec, prec, ap = voc_eval(
filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5,
use_07_metric=use_07_metric)
aps += [ap]
print('AP for {} = {:.4f}'.format(cls, ap))
with open(os.path.join(output_dir, cls + '_pr.pkl'), 'w') as f:
cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
print('Mean AP = {:.4f}'.format(np.mean(aps)))
print('~~~~~~~~')
print('Results:')
for ap in aps:
print('{:.3f}'.format(ap))
print('{:.3f}'.format(np.mean(aps)))
print('~~~~~~~~')
print('')
print('--------------------------------------------------------------')
print('Results computed with the **unofficial** Python eval code.')
print('Results should be very close to the official MATLAB eval code.')
print('Recompute with `./tools/reval.py --matlab ...` for your paper.')
print('-- Thanks, The Management')
print('--------------------------------------------------------------')
def _do_matlab_eval(self, output_dir='output'):
print '-----------------------------------------------------'
print 'Computing results with the official MATLAB eval code.'
print '-----------------------------------------------------'
path = os.path.join(cfg.ROOT_DIR, 'lib', 'datasets',
'VOCdevkit-matlab-wrapper')
cmd = 'cd {} && '.format(path)
cmd += '{:s} -nodisplay -nodesktop '.format(cfg.MATLAB)
cmd += '-r "dbstop if error; '
cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"' \
.format(self._devkit_path, self._get_comp_id(),
self._image_set, output_dir)
print('Running:\n{}'.format(cmd))
status = subprocess.call(cmd, shell=True)
def evaluate_detections(self, all_boxes, output_dir):
self._write_voc_results_file(all_boxes)
self._do_python_eval(output_dir)
if self.config['matlab_eval']:
self._do_matlab_eval(output_dir)
if self.config['cleanup']:
for cls in self._classes:
if cls == '__background__':
continue
filename = self._get_voc_results_file_template().format(cls)
os.remove(filename)
def competition_mode(self, on):
if on:
self.config['use_salt'] = False
self.config['cleanup'] = False
else:
self.config['use_salt'] = True
self.config['cleanup'] = True
到现在我们可以通过一些变量名知道imdb即影像数据集,roidb即感兴趣区域的数据集,factory即为工厂,它的函数get_imdb根据名字返回数据集对应的类的对象。假如imdb_name为“voc_2007_trainval”,我们可以知道get_imdb返回的是pascal_voc(‘trainval’, ‘2007’),而进入pascal_voc文件,我们可以看到他是一个继承于imdb类的子类(class pascal_voc(imdb):),在init函数中,我们可以知道该类记录了文件路径目标类别还有影像扩展名等与数据集相关的内容,而父类imdb中则是名字等一般性的内容。了解了这些内容以后,我们回到combined_roidb函数的get_roidb函数继续往下读,通过get_imdb记录了数据集的路径之类的信息后。get_roidb函数又调用了语句imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD),在config.py中我们看到cfg.TRAIN.PROPOSAL_METHOD值是selective_search,但在/experiments/cfgs/faster_rcnn_end2end.yml中它的值是gt,而imdb的set_proposal_method函数里为method = eval('self.' + method + '_roidb') self.roidb_handler = method其中eval函数把字符串当成表达式求值,从而我们知道下一行调用了gt_roidb函数,哪里调用了gt_roidb?? 答:imdb.py中的set_proposal_method函数 之后会执行pascal_voc中的gt_roidb不清楚何时执行进入pascal_voc下的gt_roidb函数,其中关于cache相当于缓存加速未来调用的,可以不管。我们看到该函数核心语句gt_roidb = [self._load_pascal_annotation(index) for index in self.image_index]在数据集类pascal_voc中我们知道image_index记录的是所有影像名的列表,可以知道对每张影像名调用_load_pascal_annotation,进入该函数一眼可以看到在路径后面加了“Annotations”即可以知道该函数读取的是影像对应存储目标的xml文件。最后返回了“boxes”,“gt_classes”等构成的字典。
_load_pascal_annotation函数上面有代码,最后返回
{'boxes' : boxes,
'gt_classes': gt_classes,
'gt_overlaps' : overlaps,
'flipped' : False,
'seg_areas' : seg_areas}随后get_roidb函数调用了get_training_roidb(imdb),进入fast_rcnn下的train.py文件,进入get_training_roidb函数def get_training_roidb(imdb):
"""Returns a roidb (Region of Interest database) for use in training."""
if cfg.TRAIN.USE_FLIPPED:
print 'Appending horizontally-flipped training examples...'
imdb.append_flipped_images()
print 'done'
print 'Preparing training data...'
rdl_roidb.prepare_roidb(imdb)
print 'done'
return imdb.roidb
调用的append_flipped_images函数通过名字可以看出是增加翻转的,可以不管,然后该函数又调用了prepare_roidb,从上面我们已经知道image_index记录的是所有影像名的列表,在roidb.py中prepare_roidb通过循环for i in xrange(len(imdb.image_index))对每张影像在上面gt_roidb函数读取到的内容“gt_overlaps”进行了求最大处理,并记录影像路径还有影像高宽等信息在roidb中并返回,返回的roidb是一个长度与影像数一致的列表。由此知该函数即如函数名所示做准备prepare。def prepare_roidb(imdb):
"""Enrich the imdb's roidb by adding some derived quantities that
are useful for training. This function precomputes the maximum
overlap, taken over ground-truth boxes, between each ROI and
each ground-truth box. The class with maximum overlap is also
recorded.
"""
sizes = [PIL.Image.open(imdb.image_path_at(i)).size
for i in xrange(imdb.num_images)]
roidb = imdb.roidb roidb从哪里得到的??gt_roidb(self)吗?应该是set_proposal_method确定方法为gt,再调用pascal_voc中的gt_roidb赋值(# A roidb is a list of dictionaries, each with the following keys:#boxes#gt_overlaps #gt_classes #flipped)
for i in xrange(len(imdb.image_index)):
roidb[i]['image'] = imdb.image_path_at(i)
roidb[i]['width'] = sizes[i][0]
roidb[i]['height'] = sizes[i][1]
# need gt_overlaps as a dense array for argmax
gt_overlaps = roidb[i]['gt_overlaps'].toarray()
# max overlap with gt over classes (columns)
max_overlaps = gt_overlaps.max(axis=1)
# gt class that had the max overlap
max_classes = gt_overlaps.argmax(axis=1) 哪个类overlap最大
roidb[i]['max_classes'] = max_classes
roidb[i]['max_overlaps'] = max_overlaps
# sanity checks
# max overlap of 0 => class should be zero (background)
zero_inds = np.where(max_overlaps == 0)[0] overlap为0的index
assert all(max_classes[zero_inds] == 0)
# max overlap > 0 => class should not be zero (must be a fg class)
nonzero_inds = np.where(max_overlaps > 0)[0] overlap不为0的index
assert all(max_classes[nonzero_inds] != 0)
现在回到combined_roidb函数,语句roidbs = [get_roidb(s) for s in imdb_names.split('+')]我们可以知道对于每个数据集返回了带有该数据集信息的imdb和包含每张影像感兴趣区域信息的roidb(包括每张影像点目标和影像宽高等信息),事实上roidb属于imdb。继续往下看,train_net.py调用了train_net(args.solver, roidb, output_dir,pretrained_model=args.pretrained_model,max_iters=args.max_iters)函数,此函数为fast_rcnn/train.py中的train_net函数该函数首先调用了filter_roidb,如函数名即对roi进行过滤def filter_roidb(roidb):
"""Remove roidb entries that have no usable RoIs."""
def is_valid(entry):
# Valid images have: 保留的标记必须是前景或者背景
# (1) At least one foreground RoI OR
# (2) At least one background RoI
overlaps = entry['max_overlaps']
# find boxes with sufficient overlap
fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
(overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
# image is only valid if such boxes exist
valid = len(fg_inds) > 0 or len(bg_inds) > 0
return valid
num = len(roidb)
filtered_roidb = [entry for entry in roidb if is_valid(entry)]
num_after = len(filtered_roidb)
print 'Filtered {} roidb entries: {} -> {}'.format(num - num_after,
num, num_after)
return filtered_roidb
然后调用了SolverWrapper,即是solver的封装类的对象,在init函数中,开始都是导入网络之类的准备性工作class SolverWrapper(object):
"""A simple wrapper around Caffe's solver.
This wrapper gives us control over he snapshotting process, which we
use to unnormalize the learned bounding-box regression weights.
"""
def __init__(self, solver_prototxt, roidb, output_dir,
pretrained_model=None):
"""Initialize the SolverWrapper."""
self.output_dir = output_dir
if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and
cfg.TRAIN.BBOX_NORMALIZE_TARGETS):
# RPN can only use precomputed normalization because there are no
# fixed statistics to compute a priori
assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED
if cfg.TRAIN.BBOX_REG:
print 'Computing bounding-box regression targets...'
self.bbox_means, self.bbox_stds = \
rdl_roidb.add_bbox_regression_targets(roidb) 赋值:预测框与gt的偏移 还是不知道预测框怎么进来的 看下test流程
print 'done'
self.solver = caffe.SGDSolver(solver_prototxt) 具体实现在哪里?
if pretrained_model is not None:
print ('Loading pretrained model '
'weights from {:s}').format(pretrained_model)
self.solver.net.copy_from(pretrained_model)
self.solver_param = caffe_pb2.SolverParameter()
with open(solver_prototxt, 'rt') as f:
pb2.text_format.Merge(f.read(), self.solver_param)
self.solver.net.layers[0].set_roidb(roidb)
其中
def add_bbox_regression_targets(roidb):
"""Add information needed to train bounding-box regressors."""
assert len(roidb) > 0
assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'
num_images = len(roidb)
# Infer number of classes from the number of columns in gt_overlaps
num_classes = roidb[0]['gt_overlaps'].shape[1]
for im_i in xrange(num_images):
rois = roidb[im_i]['boxes']
max_overlaps = roidb[im_i]['max_overlaps']
max_classes = roidb[im_i]['max_classes']
roidb[im_i]['bbox_targets'] = \
_compute_targets(rois, max_overlaps, max_classes) 计算一张图片的target,见下面
if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
# Use fixed / precomputed "means" and "stds" instead of empirical values
means = np.tile(
np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1))
stds = np.tile(
np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1))
else:
# Compute values needed for means and stds
# var(x) = E(x^2) - E(x)^2
class_counts = np.zeros((num_classes, 1)) + cfg.EPS
sums = np.zeros((num_classes, 4))
squared_sums = np.zeros((num_classes, 4))
for im_i in xrange(num_images):
targets = roidb[im_i]['bbox_targets']
for cls in xrange(1, num_classes):
cls_inds = np.where(targets[:, 0] == cls)[0]
if cls_inds.size > 0:
class_counts[cls] += cls_inds.size
sums[cls, :] += targets[cls_inds, 1:].sum(axis=0)
squared_sums[cls, :] += \
(targets[cls_inds, 1:] ** 2).sum(axis=0)
means = sums / class_counts
stds = np.sqrt(squared_sums / class_counts - means ** 2)
print 'bbox target means:'
print means
print means[1:, :].mean(axis=0) # ignore bg class
print 'bbox target stdevs:'
print stds
print stds[1:, :].mean(axis=0) # ignore bg class
# Normalize targets
if cfg.TRAIN.BBOX_NORMALIZE_TARGETS:
print "Normalizing targets"
for im_i in xrange(num_images):
targets = roidb[im_i]['bbox_targets']
for cls in xrange(1, num_classes):
cls_inds = np.where(targets[:, 0] == cls)[0]
roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :]
roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :]
else:
print "NOT normalizing targets"
# These values will be needed for making predictions
# (the predicts will need to be unnormalized and uncentered)
return means.ravel(), stds.ravel()
计算一张图中gt和其他框的关系def _compute_targets(rois, overlaps, labels):
"""Compute bounding-box regression targets for an image."""
# Indices of ground-truth ROIs
gt_inds = np.where(overlaps == 1)[0] 可能后面rois会包含预测框以及gt的混合,所以这里把gt和预测分离开,求出偏移量并输出
if len(gt_inds) == 0:
# Bail if the image has no ground-truth ROIs
return np.zeros((rois.shape[0], 5), dtype=np.float32)
# Indices of examples for which we try to make predictions
ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0] 只考虑最大iou比较大的框
# Get IoU overlap between each ex ROI and gt ROI
ex_gt_overlaps = bbox_overlaps(
np.ascontiguousarray(rois[ex_inds, :], dtype=np.float),
np.ascontiguousarray(rois[gt_inds, :], dtype=np.float))
# Find which gt ROI each ex ROI has max overlap with:
# this will be the ex ROI's gt target
gt_assignment = ex_gt_overlaps.argmax(axis=1)
gt_rois = rois[gt_inds[gt_assignment], :]
ex_rois = rois[ex_inds, :]
targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
targets[ex_inds, 0] = labels[ex_inds] 调用时 _compute_targets(rois, max_overlaps, max_classes),所以label是与当前预测iou最大的class 不过么时候会传进来候选框呢?
targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
return targets
计算候选框(预测框?)和gt的偏移
def bbox_transform(ex_rois, gt_rois):
ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
targets_dw = np.log(gt_widths / ex_widths)
targets_dh = np.log(gt_heights / ex_heights)
targets = np.vstack(
(targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
return targets
def snapshot(self):
"""Take a snapshot of the network after unnormalizing the learned
bounding-box regression weights. This enables easy use at test-time.
"""
net = self.solver.net
infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX
if cfg.TRAIN.SNAPSHOT_INFIX != '' else '')
filename = (self.solver_param.snapshot_prefix + infix +
'_iter_{:d}'.format(self.solver.iter) + '.caffemodel')
filename = os.path.join(self.output_dir, filename)
net.save(str(filename))
print 'Wrote snapshot to: {:s}'.format(filename)
return filename
训练开始
def train_model(self, max_iters):
"""Network training loop."""
last_snapshot_iter = -1
timer = Timer()
model_paths = []
while self.solver.iter < max_iters:
# Make one SGD update
timer.tic()
self.solver.step(1) 看cpp源码
timer.toc()
if self.solver.iter % (10 * self.solver_param.display) == 0:
print 'speed: {:.3f}s / iter'.format(timer.average_time)
if self.solver.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0:
last_snapshot_iter = self.solver.iter
model_paths.append(self.snapshot())
if last_snapshot_iter != self.solver.iter:
model_paths.append(self.snapshot())
return model_paths
在最后一句话中,调用了self.solver.net.layers[0].set_roidb(roidb),这句话很关键,即调用了网络第一层也即是prototxt文件的第一层即RoIDataLayer层的set_roidb函数,把之前读取的roidb对象传进了网络。
def set_roidb(self, roidb):
"""Set the roidb to be used by this layer during training."""
self._roidb = roidb
self._shuffle_roidb_inds()
if cfg.TRAIN.USE_PREFETCH:
self._blob_queue = Queue(10)
self._prefetch_process = BlobFetcher(self._blob_queue,
self._roidb,
self._num_classes)
self._prefetch_process.start()
# Terminate the child process when the parent exists
def cleanup():
print 'Terminating BlobFetcher'
self._prefetch_process.terminate()
self._prefetch_process.join()
import atexit
atexit.register(cleanup)
==================================class BlobFetcher(Process):
"""Experimental class for prefetching blobs in a separate process."""
def __init__(self, queue, roidb, num_classes):
super(BlobFetcher, self).__init__() 父类process 在process.py定义
self._queue = queue
self._roidb = roidb
self._num_classes = num_classes
self._perm = None
self._cur = 0
self._shuffle_roidb_inds()
# fix the random seed for reproducibility
np.random.seed(cfg.RNG_SEED)
打乱顺序并将self._cur置0
def _shuffle_roidb_inds(self):
"""Randomly permute the training roidb."""
# TODO(rbg): remove duplicated code
self._perm = np.random.permutation(np.arange(len(self._roidb))) ._perm用来存shuffle后的index
self._cur = 0
def _get_next_minibatch_inds(self):
"""Return the roidb indices for the next minibatch."""
# TODO(rbg): remove duplicated code
if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb):
self._shuffle_roidb_inds()
db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] TRAIN.IMS_PER_BATCH很奇怪 config.py中是2,这里却是1
self._cur += cfg.TRAIN.IMS_PER_BATCH
return db_inds
def run(self):
print 'BlobFetcher started'
while True:
db_inds = self._get_next_minibatch_inds() 获取下个batch的对应index 利用self._cur
minibatch_db = [self._roidb[i] for i in db_inds] 由于上面的TRAIN.IMS_PER_BATCH是1,导致db_inds尺寸也是1,minibatch_db只针对一张图
blobs = get_minibatch(minibatch_db, self._num_classes)
self._queue.put(blobs)
============================================def train_net(solver_prototxt, roidb, output_dir,
pretrained_model=None, max_iters=40000):
"""Train a Fast R-CNN network."""
roidb = filter_roidb(roidb)
sw = SolverWrapper(solver_prototxt, roidb, output_dir,
pretrained_model=pretrained_model)
print 'Solving...'
model_paths = sw.SolverWrapper(max_iters)
print 'done solving'
return model_paths
到现在我们再来看roidb对象里到底记录了哪些内容呢?包括数据集所有影像的影像路径,所有影像对应的目标,以及宽高等信息。有了这些信息,就可以在RoIDataLayer层的前向传播函数调用get_next_minibatch,
def setup(self, bottom, top):
"""Setup the RoIDataLayer."""
# parse the layer parameter string, which must be valid YAML
layer_params = yaml.load(self.param_str_)
self._num_classes = layer_params['num_classes']
self._name_to_top_map = {}
# data blob: holds a batch of N images, each with 3 channels
idx = 0
top[idx].reshape(cfg.TRAIN.IMS_PER_BATCH, 3,
max(cfg.TRAIN.SCALES), cfg.TRAIN.MAX_SIZE)
self._name_to_top_map['data'] = idx
idx += 1
if cfg.TRAIN.HAS_RPN: 如果用RPN,则data给cov1,im_info和gt_boxes给rpn-data层 (rois, labels, bbox_targets,bbox_inside_weights, bbox_outside_weights由roi-data层(proposal_target)产生并分别传递给roi_pool,loss_cls,loss_bbox, loss_bbox, loss_bbox)
top[idx].reshape(1, 3)
self._name_to_top_map['im_info'] = idx
idx += 1
top[idx].reshape(1, 4)
self._name_to_top_map['gt_boxes'] = idx
idx += 1
else: # not using RPN 没找到对应情况,但肯定不是test,因为test里找不到这些参数
# rois blob: holds R regions of interest, each is a 5-tuple
# (n, x1, y1, x2, y2) specifying an image batch index n and a
# rectangle (x1, y1, x2, y2)
top[idx].reshape(1, 5)
self._name_to_top_map['rois'] = idx
idx += 1
# labels blob: R categorical labels in [0, ..., K] for K foreground
# classes plus background
top[idx].reshape(1)
self._name_to_top_map['labels'] = idx
idx += 1
if cfg.TRAIN.BBOX_REG:
# bbox_targets blob: R bounding-box regression targets with 4
# targets per class
top[idx].reshape(1, self._num_classes * 4)
self._name_to_top_map['bbox_targets'] = idx
idx += 1
# bbox_inside_weights blob: At most 4 targets per roi are active;
# thisbinary vector sepcifies the subset of active targets
top[idx].reshape(1, self._num_classes * 4)
self._name_to_top_map['bbox_inside_weights'] = idx
idx += 1
top[idx].reshape(1, self._num_classes * 4)
self._name_to_top_map['bbox_outside_weights'] = idx
idx += 1
print 'RoiDataLayer: name_to_top:', self._name_to_top_map
assert len(top) == len(self._name_to_top_map)
def forward(self, bottom, top):
"""Get blobs and copy them into this layer's top blob vector."""
blobs = self._get_next_minibatch()
for blob_name, blob in blobs.iteritems():
top_ind = self._name_to_top_map[blob_name]
# Reshape net's input blobs
top[top_ind].reshape(*(blob.shape))
# Copy data into net's input blobs
top[top_ind].data[...] = blob.astype(np.float32, copy=False)
先取index,再取对应数据def _get_next_minibatch(self):
"""Return the blobs to be used for the next minibatch.
If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a
separate process and made available through self._blob_queue.
"""
if cfg.TRAIN.USE_PREFETCH:
return self._blob_queue.get()
else:
db_inds = self._get_next_minibatch_inds()
minibatch_db = [self._roidb[i] for i in db_inds]
return get_minibatch(minibatch_db, self._num_classes)
def get_minibatch(roidb, num_classes): 将下个minibatch的roidb封装为blob
"""Given a roidb, construct a minibatch sampled from it."""
num_images = len(roidb) 这里的输入roidb针对一张图的所有框,但其实是长度为1的list<dict>,所以num_images = 1
# Sample random scales to use for each image in this batch
random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES),
size=num_images)
assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \
'num_images ({}) must divide BATCH_SIZE ({})'. \
format(num_images, cfg.TRAIN.BATCH_SIZE)
rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images 默认256/1
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) 默认0.25 * 256
# Get the input image blob, formatted for caffe
im_blob, im_scales = _get_image_blob(roidb, random_scale_inds)
blobs = {'data': im_blob}
if cfg.TRAIN.HAS_RPN:
assert len(im_scales) == 1, "Single batch only"
assert len(roidb) == 1, "Single batch only"
# gt boxes: (x1, y1, x2, y2, cls) 因为长度为1,所以roidb[0]就是全部
gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] 当前图中对应非背景的框的index
gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0]
gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] 生成blob中的gt_boxes,包括变换后的box坐标以及类型
blobs['gt_boxes'] = gt_boxes
blobs['im_info'] = np.array(
[[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], 生成blob中的im_info, 包括长,宽,放缩比
dtype=np.float32)
else: # not using RPN
# Now, build the region of interest and label blobs
rois_blob = np.zeros((0, 5), dtype=np.float32)
labels_blob = np.zeros((0), dtype=np.float32)
bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32)
bbox_inside_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32)
# all_overlaps = []
for im_i in xrange(num_images):
labels, overlaps, im_rois, bbox_targets, bbox_inside_weights \
= _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image,
num_classes)
# Add to RoIs blob
rois = _project_im_rois(im_rois, im_scales[im_i])
batch_ind = im_i * np.ones((rois.shape[0], 1))
rois_blob_this_image = np.hstack((batch_ind, rois))
rois_blob = np.vstack((rois_blob, rois_blob_this_image))
# Add to labels, bbox targets, and bbox loss blobs
labels_blob = np.hstack((labels_blob, labels))
bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets))
bbox_inside_blob = np.vstack((bbox_inside_blob, bbox_inside_weights))
# all_overlaps = np.hstack((all_overlaps, overlaps))
# For debug visualizations
# _vis_minibatch(im_blob, rois_blob, labels_blob, all_overlaps)
blobs['rois'] = rois_blob
blobs['labels'] = labels_blob
if cfg.TRAIN.BBOX_REG:
blobs['bbox_targets'] = bbox_targets_blob
blobs['bbox_inside_weights'] = bbox_inside_blob
blobs['bbox_outside_weights'] = \
np.array(bbox_inside_blob > 0).astype(np.float32)
return blobs
利用roidb的信息(image, flipped)等找到原图,并对原图进行归一化,放缩等变换def _get_image_blob(roidb, scale_inds):
"""Builds an input blob from the images in the roidb at the specified
scales.
"""
num_images = len(roidb) 一般为1
processed_ims = []
im_scales = []
for i in xrange(num_images):
im = cv2.imread(roidb[i]['image']) roidb[i]中所有框对应一张图
if roidb[i]['flipped']:
im = im[:, ::-1, :] 后面可以加上样本增强
target_size = cfg.TRAIN.SCALES[scale_inds[i]]
im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size,
cfg.TRAIN.MAX_SIZE,cfg.TRAIN.IMAGE_STRIDE)
im_scales.append(im_scale)
processed_ims.append(im)
# Create a blob to hold the input images
blob = im_list_to_blob(processed_ims) list to blob
return blob, im_scales
归一化,变形后输出处理后的图以及变形的尺寸def prep_im_for_blob(im, pixel_means, target_size, max_size,stride): stride = IMAGE_STRIDE默认64
"""Mean subtract and scale an image for use in a blob."""
im = im.astype(np.float32, copy=False)
im -= pixel_means
im_shape = im.shape
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
im_scale = float(target_size) / float(im_size_min)
# Prevent the biggest axis from being more than MAX_SIZE 如果长边*scale后超出最大尺寸,则按scale=最大尺寸/长边
if np.round(im_scale * im_size_max) > max_size:
im_scale = float(max_size) / float(im_size_max)
im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 长宽等比例放缩
interpolation=cv2.INTER_LINEAR)
if stride == 0:
return im, im_scale
else:
# pad to product of stride 这个stride含义??
im_height = int(np.ceil(im.shape[0] / float(stride)) * stride) 向上取整,所以除后再乘回去可能会变大
im_width = int(np.ceil(im.shape[1] / float(stride)) * stride)
im_channel = im.shape[2]
padded_im = np.zeros((im_height, im_width, im_channel)) 填充
padded_im[:im.shape[0], :im.shape[1], :] = im
return padded_im, im_scale
该函数首先调用了_get_next_minibatch_inds此批次运行的影像下标,然后获取他们对应的roidb对象后组成列表,传进get_minibatch函数。在RoIDataLayer层中输出为data,im_info和gt_boxes,在get_minibatch函数中,他把所有的输出信息弄成blob以便进行前向传播。data通过roidb的“image”记录的路径信息然后调用opencv的读影像函数进行读取并进行尺度缩放后输出,对应代码为im_blob, im_scales = _get_image_blob(roidb, random_scale_inds) blobs = {'data': im_blob},而其余的gt_boxes信息之前已经保存,可以直接将之前的信息保存到对应的blob结构中,最后输出。blobs['gt_boxes'] = gt_boxes blobs['im_info'] = np.array( [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], dtype=np.float32),最后返回blob,在前向传播中输出top layer。
args = parse_args()
print('Called with args:')
print(args)
if args.cfg_file is not None:
cfg_from_file(args.cfg_file)
if args.set_cfgs is not None:
cfg_from_list(args.set_cfgs)
cfg.GPU_ID = args.gpu_id
print('Using config:')
pprint.pprint(cfg)
if not args.randomize:
# fix the random seeds (numpy and caffe) for reproducibility
np.random.seed(cfg.RNG_SEED)
caffe.set_random_seed(cfg.RNG_SEED)
# set up caffe
caffe.set_mode_gpu()
caffe.set_device(args.gpu_ivi
imdb, roidb = combined_roidb(args.imdb_name)
print '{:d} roidb entries'.format(len(roidb))
output_dir = get_output_dir(imdb)
print 'Output will be saved to `{:s}`'.format(output_dir)
train_net(args.solver, roidb, output_dir,
pretrained_model=args.pretrained_model,
max_iters=args.max_iters)
首先是imdb, roidb = combined_roidb(args.imdb_name)语句,传入的参数imdb_name默认是“voc_2007_trainval”,这只是一个数据集的名字而已,进入该文件下的combined_roidb函数,该函数首先有个内部函数get_roidb,下面有调用,进入这函数第一行imdb = get_imdb(imdb_name),这个get_imdb为factory中的函数,细看可以知道get_imdb这个函数通过给数据集的名字返回了该数据集对应的类的对象。
def combined_roidb(imdb_names):
def get_roidb(imdb_name):
imdb = get_imdb(imdb_name)
print 'Loaded dataset `{:s}` for training'.format(imdb.name)
imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD)
print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)
roidb = get_training_roidb(imdb)
return roidb
roidb格式:<type 'list'>: [{'boxes': array([[0, 0, 0, 0]], dtype=uint16), 'gt_classes': array([15], dtype=int32), 'gt_overlaps': <1x21 sparse matrix of type '<type 'numpy.float32'>'
with 1 stored elements in Compressed Sparse Row format>, 'seg_areas': array([1.], dtype=float32), 'flipped': False}]
一共21类,当前图片gt对应的是第15类
roidbs = [get_roidb(s) for s in imdb_names.split('+')]
roidb = roidbs[0] 针对第一个数据集 实际上还是包含很多roidb
if len(roidbs) > 1:
for r in roidbs[1:]:
roidb.extend(r)
imdb = datasets.imdb.imdb(imdb_names)
else:
imdb = get_imdb(imdb_names) 再取imdb
return imdb, roidb
factory.py中:def get_imdb(name):
"""Get an imdb (image database) by name."""
if not __sets.has_key(name):
raise KeyError('Unknown dataset: {}'.format(name))
return __sets[name]()
这里的sets是个字典,在factory.py中创建(何时调用的创建函数?),字典key对应pascal_voc或coco等的一个实例:__sets = {}
from datasets.pascal_voc import pascal_voc
from datasets.coco import coco
import numpy as np
# Set up voc_<year>_<split> using selective search "fast" mode
for year in ['2007', '2012']:
for split in ['train', 'val', 'trainval', 'test']:
name = 'voc_{}_{}'.format(year, split) 占位 例如voc_2007_test
__sets[name] = (lambda split=split, year=year: pascal_voc(split, year))
# Set up coco_2014_<split>
for year in ['2014']:
for split in ['train', 'val', 'minival', 'valminusminival']:
name = 'coco_{}_{}'.format(year, split)
__sets[name] = (lambda split=split, year=year: coco(split, year))
# Set up coco_2015_<split>
for year in ['2015']:
for split in ['test', 'test-dev']:
name = 'coco_{}_{}'.format(year, split)
__sets[name] = (lambda split=split, year=year: coco(split, year))
pascal_voc类在pascal_voc.py中定义:class pascal_voc(imdb):
def __init__(self, image_set, year, devkit_path=None):
imdb.__init__(self, 'voc_' + year + '_' + image_set)
self._year = year
self._image_set = image_set
self._devkit_path = self._get_default_path() if devkit_path is None \
else devkit_path
self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
# self._classes = ('__background__', # always index 0
# 'aeroplane', 'bicycle', 'bird', 'boat',
# 'bottle', 'bus', 'car', 'cat', 'chair',
# 'cow', 'diningtable', 'dog', 'horse',
# 'motorbike', 'person', 'pottedplant',
# 'sheep', 'sofa', 'train', 'tvmonitor')
self._classes = ('__background__', # always index 0
'hammerhead', 'boxing', 'barber_scissors', 'spray',
'hammer', 'slingshot', 'blade', 'plumb', 'knife_toy',
'knife_tile', 'plastic_scissors', 'kitchen_knife', 'gun',
'scissors', 'fullfold_knife', 'knife_edge',
'saw', 'blunt', 'knife', 'open_scissors')
self._class_to_ind = dict(zip(self.classes, xrange(self.num_classes)))
self._image_ext = '.jpg'
self._image_index = self._load_image_set_index() 这里读取test.txt/train.txt得知需要用到哪些图片
# Default to roidb handler
self._roidb_handler = self.selective_search_roidb
self._salt = str(uuid.uuid4())
self._comp_id = 'comp4'
# PASCAL specific config options
self.config = {'cleanup' : True,
'use_salt' : True,
'use_diff' : False,
'matlab_eval' : False,
'rpn_file' : None,
'min_size' : 2}
assert os.path.exists(self._devkit_path), \
'VOCdevkit path does not exist: {}'.format(self._devkit_path)
assert os.path.exists(self._data_path), \
'Path does not exist: {}'.format(self._data_path)
def image_path_at(self, i):
"""
Return the absolute path to image i in the image sequence.
"""
return self.image_path_from_index(self._image_index[i])
def image_path_from_index(self, index):
"""
Construct an image path from the image's "index" identifier.
"""
image_path = os.path.join(self._data_path, 'JPEGImages',
index + self._image_ext)
assert os.path.exists(image_path), \
'Path does not exist: {}'.format(image_path)
return image_path
def _load_image_set_index(self):
"""
Load the indexes listed in this dataset's image set file.
"""
# Example path to image set file:
# self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt
image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main',
self._image_set + '.txt')
assert os.path.exists(image_set_file), \
'Path does not exist: {}'.format(image_set_file)
with open(image_set_file) as f:
image_index = [x.strip() for x in f.readlines()]
return image_index
def _get_default_path(self):
"""
Return the default path where PASCAL VOC is expected to be installed.
"""
return os.path.join(cfg.DATA_DIR, 'VOCdevkit' + self._year)
def gt_roidb(self):
"""
Return the database of ground-truth regions of interest.
This function loads/saves from/to a cache file to speed up future calls.
"""
cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
if os.path.exists(cache_file):
with open(cache_file, 'rb') as fid:
roidb = cPickle.load(fid)
print '{} gt roidb loaded from {}'.format(self.name, cache_file)
return roidb
gt_roidb = [self._load_pascal_annotation(index)
for index in self.image_index]
with open(cache_file, 'wb') as fid:
cPickle.dump(gt_roidb, fid, cPickle.HIGHEST_PROTOCOL)
print 'wrote gt roidb to {}'.format(cache_file)
return gt_roidb
def selective_search_roidb(self):
"""
Return the database of selective search regions of interest.
Ground-truth ROIs are also included.
This function loads/saves from/to a cache file to speed up future calls.
"""
cache_file = os.path.join(self.cache_path,
self.name + '_selective_search_roidb.pkl')
if os.path.exists(cache_file):
with open(cache_file, 'rb') as fid:
roidb = cPickle.load(fid)
print '{} ss roidb loaded from {}'.format(self.name, cache_file)
return roidb
if int(self._year) == 2007 or self._image_set != 'test':
gt_roidb = self.gt_roidb()
ss_roidb = self._load_selective_search_roidb(gt_roidb)
roidb = imdb.merge_roidbs(gt_roidb, ss_roidb)
else:
roidb = self._load_selective_search_roidb(None)
with open(cache_file, 'wb') as fid:
cPickle.dump(roidb, fid, cPickle.HIGHEST_PROTOCOL)
print 'wrote ss roidb to {}'.format(cache_file)
return roidb
def rpn_roidb(self):
if int(self._year) == 2007 or self._image_set != 'test':
gt_roidb = self.gt_roidb()
rpn_roidb = self._load_rpn_roidb(gt_roidb)
roidb = imdb.merge_roidbs(gt_roidb, rpn_roidb)
else:
roidb = self._load_rpn_roidb(None)
return roidb
def _load_rpn_roidb(self, gt_roidb):
filename = self.config['rpn_file']
print 'loading {}'.format(filename)
assert os.path.exists(filename), \
'rpn data not found at: {}'.format(filename)
with open(filename, 'rb') as f:
box_list = cPickle.load(f)
return self.create_roidb_from_box_list(box_list, gt_roidb)
def _load_selective_search_roidb(self, gt_roidb):
filename = os.path.abspath(os.path.join(cfg.DATA_DIR,
'selective_search_data',
self.name + '.mat'))
assert os.path.exists(filename), \
'Selective search data not found at: {}'.format(filename)
raw_data = sio.loadmat(filename)['boxes'].ravel()
box_list = []
for i in xrange(raw_data.shape[0]):
boxes = raw_data[i][:, (1, 0, 3, 2)] - 1
keep = ds_utils.unique_boxes(boxes)
boxes = boxes[keep, :]
keep = ds_utils.filter_small_boxes(boxes, self.config['min_size'])
boxes = boxes[keep, :]
box_list.append(boxes)
return self.create_roidb_from_box_list(box_list, gt_roidb)
def _load_pascal_annotation(self, index):
"""
Load image and bounding boxes info from XML file in the PASCAL VOC
format.
"""
filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
tree = ET.parse(filename)
objs = tree.findall('object')
if not self.config['use_diff']:
# Exclude the samples labeled as difficult
non_diff_objs = [
obj for obj in objs if int(obj.find('difficult').text) == 0]
# if len(non_diff_objs) != len(objs):
# print 'Removed {} difficult objects'.format(
# len(objs) - len(non_diff_objs))
objs = non_diff_objs
num_objs = len(objs)
boxes = np.zeros((num_objs, 4), dtype=np.uint16)
gt_classes = np.zeros((num_objs), dtype=np.int32)
overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
# "Seg" area for pascal is just the box area
seg_areas = np.zeros((num_objs), dtype=np.float32)
# Load object bounding boxes into a data frame.
for ix, obj in enumerate(objs):
bbox = obj.find('bndbox')
# Make pixel indexes 0-based
x1 = float(bbox.find('xmin').text) - 1
y1 = float(bbox.find('ymin').text) - 1
x2 = float(bbox.find('xmax').text) - 1
y2 = float(bbox.find('ymax').text) - 1
cls = self._class_to_ind[obj.find('name').text.lower().strip()]
boxes[ix, :] = [x1, y1, x2, y2]
gt_classes[ix] = cls
overlaps[ix, cls] = 1.0
seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)
overlaps = scipy.sparse.csr_matrix(overlaps)
return {'boxes' : boxes,
'gt_classes': gt_classes,
'gt_overlaps' : overlaps,
'flipped' : False,
'seg_areas' : seg_areas}
def _get_comp_id(self):
comp_id = (self._comp_id + '_' + self._salt if self.config['use_salt']
else self._comp_id)
return comp_id
def _get_voc_results_file_template(self):
# VOCdevkit/results/VOC2007/Main/<comp_id>_det_test_aeroplane.txt
filename = self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt'
path = os.path.join(
self._devkit_path,
'results',
'VOC' + self._year,
'Main',
filename)
return path
def _write_voc_results_file(self, all_boxes):
for cls_ind, cls in enumerate(self.classes):
if cls == '__background__':
continue
print 'Writing {} VOC results file'.format(cls)
filename = self._get_voc_results_file_template().format(cls)
with open(filename, 'wt') as f:
for im_ind, index in enumerate(self.image_index):
dets = all_boxes[cls_ind][im_ind]
if dets == []:
continue
# the VOCdevkit expects 1-based indices
for k in xrange(dets.shape[0]):
f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.
format(index, dets[k, -1],
dets[k, 0] + 1, dets[k, 1] + 1,
dets[k, 2] + 1, dets[k, 3] + 1))
def _do_python_eval(self, output_dir = 'output'):
annopath = os.path.join(
self._devkit_path,
'VOC' + self._year,
'Annotations',
'{:s}.xml')
imagesetfile = os.path.join(
self._devkit_path,
'VOC' + self._year,
'ImageSets',
'Main',
self._image_set + '.txt')
cachedir = os.path.join(self._devkit_path, 'annotations_cache')
aps = []
# The PASCAL VOC metric changed in 2010
use_07_metric = True if int(self._year) < 2010 else False
print 'VOC07 metric? ' + ('Yes' if use_07_metric else 'No')
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
for i, cls in enumerate(self._classes):
if cls == '__background__':
continue
filename = self._get_voc_results_file_template().format(cls)
rec, prec, ap = voc_eval(
filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5,
use_07_metric=use_07_metric)
aps += [ap]
print('AP for {} = {:.4f}'.format(cls, ap))
with open(os.path.join(output_dir, cls + '_pr.pkl'), 'w') as f:
cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
print('Mean AP = {:.4f}'.format(np.mean(aps)))
print('~~~~~~~~')
print('Results:')
for ap in aps:
print('{:.3f}'.format(ap))
print('{:.3f}'.format(np.mean(aps)))
print('~~~~~~~~')
print('')
print('--------------------------------------------------------------')
print('Results computed with the **unofficial** Python eval code.')
print('Results should be very close to the official MATLAB eval code.')
print('Recompute with `./tools/reval.py --matlab ...` for your paper.')
print('-- Thanks, The Management')
print('--------------------------------------------------------------')
def _do_matlab_eval(self, output_dir='output'):
print '-----------------------------------------------------'
print 'Computing results with the official MATLAB eval code.'
print '-----------------------------------------------------'
path = os.path.join(cfg.ROOT_DIR, 'lib', 'datasets',
'VOCdevkit-matlab-wrapper')
cmd = 'cd {} && '.format(path)
cmd += '{:s} -nodisplay -nodesktop '.format(cfg.MATLAB)
cmd += '-r "dbstop if error; '
cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"' \
.format(self._devkit_path, self._get_comp_id(),
self._image_set, output_dir)
print('Running:\n{}'.format(cmd))
status = subprocess.call(cmd, shell=True)
def evaluate_detections(self, all_boxes, output_dir):
self._write_voc_results_file(all_boxes)
self._do_python_eval(output_dir)
if self.config['matlab_eval']:
self._do_matlab_eval(output_dir)
if self.config['cleanup']:
for cls in self._classes:
if cls == '__background__':
continue
filename = self._get_voc_results_file_template().format(cls)
os.remove(filename)
def competition_mode(self, on):
if on:
self.config['use_salt'] = False
self.config['cleanup'] = False
else:
self.config['use_salt'] = True
self.config['cleanup'] = True
到现在我们可以通过一些变量名知道imdb即影像数据集,roidb即感兴趣区域的数据集,factory即为工厂,它的函数get_imdb根据名字返回数据集对应的类的对象。假如imdb_name为“voc_2007_trainval”,我们可以知道get_imdb返回的是pascal_voc(‘trainval’, ‘2007’),而进入pascal_voc文件,我们可以看到他是一个继承于imdb类的子类(class pascal_voc(imdb):),在init函数中,我们可以知道该类记录了文件路径目标类别还有影像扩展名等与数据集相关的内容,而父类imdb中则是名字等一般性的内容。了解了这些内容以后,我们回到combined_roidb函数的get_roidb函数继续往下读,通过get_imdb记录了数据集的路径之类的信息后。get_roidb函数又调用了语句imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD),在config.py中我们看到cfg.TRAIN.PROPOSAL_METHOD值是selective_search,但在/experiments/cfgs/faster_rcnn_end2end.yml中它的值是gt,而imdb的set_proposal_method函数里为method = eval('self.' + method + '_roidb') self.roidb_handler = method其中eval函数把字符串当成表达式求值,从而我们知道下一行调用了gt_roidb函数,哪里调用了gt_roidb?? 答:imdb.py中的set_proposal_method函数 之后会执行pascal_voc中的gt_roidb不清楚何时执行进入pascal_voc下的gt_roidb函数,其中关于cache相当于缓存加速未来调用的,可以不管。我们看到该函数核心语句gt_roidb = [self._load_pascal_annotation(index) for index in self.image_index]在数据集类pascal_voc中我们知道image_index记录的是所有影像名的列表,可以知道对每张影像名调用_load_pascal_annotation,进入该函数一眼可以看到在路径后面加了“Annotations”即可以知道该函数读取的是影像对应存储目标的xml文件。最后返回了“boxes”,“gt_classes”等构成的字典。
_load_pascal_annotation函数上面有代码,最后返回
{'boxes' : boxes,
'gt_classes': gt_classes,
'gt_overlaps' : overlaps,
'flipped' : False,
'seg_areas' : seg_areas}随后get_roidb函数调用了get_training_roidb(imdb),进入fast_rcnn下的train.py文件,进入get_training_roidb函数def get_training_roidb(imdb):
"""Returns a roidb (Region of Interest database) for use in training."""
if cfg.TRAIN.USE_FLIPPED:
print 'Appending horizontally-flipped training examples...'
imdb.append_flipped_images()
print 'done'
print 'Preparing training data...'
rdl_roidb.prepare_roidb(imdb)
print 'done'
return imdb.roidb
调用的append_flipped_images函数通过名字可以看出是增加翻转的,可以不管,然后该函数又调用了prepare_roidb,从上面我们已经知道image_index记录的是所有影像名的列表,在roidb.py中prepare_roidb通过循环for i in xrange(len(imdb.image_index))对每张影像在上面gt_roidb函数读取到的内容“gt_overlaps”进行了求最大处理,并记录影像路径还有影像高宽等信息在roidb中并返回,返回的roidb是一个长度与影像数一致的列表。由此知该函数即如函数名所示做准备prepare。def prepare_roidb(imdb):
"""Enrich the imdb's roidb by adding some derived quantities that
are useful for training. This function precomputes the maximum
overlap, taken over ground-truth boxes, between each ROI and
each ground-truth box. The class with maximum overlap is also
recorded.
"""
sizes = [PIL.Image.open(imdb.image_path_at(i)).size
for i in xrange(imdb.num_images)]
roidb = imdb.roidb roidb从哪里得到的??gt_roidb(self)吗?应该是set_proposal_method确定方法为gt,再调用pascal_voc中的gt_roidb赋值(# A roidb is a list of dictionaries, each with the following keys:#boxes#gt_overlaps #gt_classes #flipped)
for i in xrange(len(imdb.image_index)):
roidb[i]['image'] = imdb.image_path_at(i)
roidb[i]['width'] = sizes[i][0]
roidb[i]['height'] = sizes[i][1]
# need gt_overlaps as a dense array for argmax
gt_overlaps = roidb[i]['gt_overlaps'].toarray()
# max overlap with gt over classes (columns)
max_overlaps = gt_overlaps.max(axis=1)
# gt class that had the max overlap
max_classes = gt_overlaps.argmax(axis=1) 哪个类overlap最大
roidb[i]['max_classes'] = max_classes
roidb[i]['max_overlaps'] = max_overlaps
# sanity checks
# max overlap of 0 => class should be zero (background)
zero_inds = np.where(max_overlaps == 0)[0] overlap为0的index
assert all(max_classes[zero_inds] == 0)
# max overlap > 0 => class should not be zero (must be a fg class)
nonzero_inds = np.where(max_overlaps > 0)[0] overlap不为0的index
assert all(max_classes[nonzero_inds] != 0)
现在回到combined_roidb函数,语句roidbs = [get_roidb(s) for s in imdb_names.split('+')]我们可以知道对于每个数据集返回了带有该数据集信息的imdb和包含每张影像感兴趣区域信息的roidb(包括每张影像点目标和影像宽高等信息),事实上roidb属于imdb。继续往下看,train_net.py调用了train_net(args.solver, roidb, output_dir,pretrained_model=args.pretrained_model,max_iters=args.max_iters)函数,此函数为fast_rcnn/train.py中的train_net函数该函数首先调用了filter_roidb,如函数名即对roi进行过滤def filter_roidb(roidb):
"""Remove roidb entries that have no usable RoIs."""
def is_valid(entry):
# Valid images have: 保留的标记必须是前景或者背景
# (1) At least one foreground RoI OR
# (2) At least one background RoI
overlaps = entry['max_overlaps']
# find boxes with sufficient overlap
fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
(overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
# image is only valid if such boxes exist
valid = len(fg_inds) > 0 or len(bg_inds) > 0
return valid
num = len(roidb)
filtered_roidb = [entry for entry in roidb if is_valid(entry)]
num_after = len(filtered_roidb)
print 'Filtered {} roidb entries: {} -> {}'.format(num - num_after,
num, num_after)
return filtered_roidb
然后调用了SolverWrapper,即是solver的封装类的对象,在init函数中,开始都是导入网络之类的准备性工作class SolverWrapper(object):
"""A simple wrapper around Caffe's solver.
This wrapper gives us control over he snapshotting process, which we
use to unnormalize the learned bounding-box regression weights.
"""
def __init__(self, solver_prototxt, roidb, output_dir,
pretrained_model=None):
"""Initialize the SolverWrapper."""
self.output_dir = output_dir
if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and
cfg.TRAIN.BBOX_NORMALIZE_TARGETS):
# RPN can only use precomputed normalization because there are no
# fixed statistics to compute a priori
assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED
if cfg.TRAIN.BBOX_REG:
print 'Computing bounding-box regression targets...'
self.bbox_means, self.bbox_stds = \
rdl_roidb.add_bbox_regression_targets(roidb) 赋值:预测框与gt的偏移 还是不知道预测框怎么进来的 看下test流程
print 'done'
self.solver = caffe.SGDSolver(solver_prototxt) 具体实现在哪里?
if pretrained_model is not None:
print ('Loading pretrained model '
'weights from {:s}').format(pretrained_model)
self.solver.net.copy_from(pretrained_model)
self.solver_param = caffe_pb2.SolverParameter()
with open(solver_prototxt, 'rt') as f:
pb2.text_format.Merge(f.read(), self.solver_param)
self.solver.net.layers[0].set_roidb(roidb)
其中
def add_bbox_regression_targets(roidb):
"""Add information needed to train bounding-box regressors."""
assert len(roidb) > 0
assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'
num_images = len(roidb)
# Infer number of classes from the number of columns in gt_overlaps
num_classes = roidb[0]['gt_overlaps'].shape[1]
for im_i in xrange(num_images):
rois = roidb[im_i]['boxes']
max_overlaps = roidb[im_i]['max_overlaps']
max_classes = roidb[im_i]['max_classes']
roidb[im_i]['bbox_targets'] = \
_compute_targets(rois, max_overlaps, max_classes) 计算一张图片的target,见下面
if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
# Use fixed / precomputed "means" and "stds" instead of empirical values
means = np.tile(
np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1))
stds = np.tile(
np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1))
else:
# Compute values needed for means and stds
# var(x) = E(x^2) - E(x)^2
class_counts = np.zeros((num_classes, 1)) + cfg.EPS
sums = np.zeros((num_classes, 4))
squared_sums = np.zeros((num_classes, 4))
for im_i in xrange(num_images):
targets = roidb[im_i]['bbox_targets']
for cls in xrange(1, num_classes):
cls_inds = np.where(targets[:, 0] == cls)[0]
if cls_inds.size > 0:
class_counts[cls] += cls_inds.size
sums[cls, :] += targets[cls_inds, 1:].sum(axis=0)
squared_sums[cls, :] += \
(targets[cls_inds, 1:] ** 2).sum(axis=0)
means = sums / class_counts
stds = np.sqrt(squared_sums / class_counts - means ** 2)
print 'bbox target means:'
print means
print means[1:, :].mean(axis=0) # ignore bg class
print 'bbox target stdevs:'
print stds
print stds[1:, :].mean(axis=0) # ignore bg class
# Normalize targets
if cfg.TRAIN.BBOX_NORMALIZE_TARGETS:
print "Normalizing targets"
for im_i in xrange(num_images):
targets = roidb[im_i]['bbox_targets']
for cls in xrange(1, num_classes):
cls_inds = np.where(targets[:, 0] == cls)[0]
roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :]
roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :]
else:
print "NOT normalizing targets"
# These values will be needed for making predictions
# (the predicts will need to be unnormalized and uncentered)
return means.ravel(), stds.ravel()
计算一张图中gt和其他框的关系def _compute_targets(rois, overlaps, labels):
"""Compute bounding-box regression targets for an image."""
# Indices of ground-truth ROIs
gt_inds = np.where(overlaps == 1)[0] 可能后面rois会包含预测框以及gt的混合,所以这里把gt和预测分离开,求出偏移量并输出
if len(gt_inds) == 0:
# Bail if the image has no ground-truth ROIs
return np.zeros((rois.shape[0], 5), dtype=np.float32)
# Indices of examples for which we try to make predictions
ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0] 只考虑最大iou比较大的框
# Get IoU overlap between each ex ROI and gt ROI
ex_gt_overlaps = bbox_overlaps(
np.ascontiguousarray(rois[ex_inds, :], dtype=np.float),
np.ascontiguousarray(rois[gt_inds, :], dtype=np.float))
# Find which gt ROI each ex ROI has max overlap with:
# this will be the ex ROI's gt target
gt_assignment = ex_gt_overlaps.argmax(axis=1)
gt_rois = rois[gt_inds[gt_assignment], :]
ex_rois = rois[ex_inds, :]
targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
targets[ex_inds, 0] = labels[ex_inds] 调用时 _compute_targets(rois, max_overlaps, max_classes),所以label是与当前预测iou最大的class 不过么时候会传进来候选框呢?
targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
return targets
计算候选框(预测框?)和gt的偏移
def bbox_transform(ex_rois, gt_rois):
ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
targets_dw = np.log(gt_widths / ex_widths)
targets_dh = np.log(gt_heights / ex_heights)
targets = np.vstack(
(targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
return targets
def snapshot(self):
"""Take a snapshot of the network after unnormalizing the learned
bounding-box regression weights. This enables easy use at test-time.
"""
net = self.solver.net
infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX
if cfg.TRAIN.SNAPSHOT_INFIX != '' else '')
filename = (self.solver_param.snapshot_prefix + infix +
'_iter_{:d}'.format(self.solver.iter) + '.caffemodel')
filename = os.path.join(self.output_dir, filename)
net.save(str(filename))
print 'Wrote snapshot to: {:s}'.format(filename)
return filename
训练开始
def train_model(self, max_iters):
"""Network training loop."""
last_snapshot_iter = -1
timer = Timer()
model_paths = []
while self.solver.iter < max_iters:
# Make one SGD update
timer.tic()
self.solver.step(1) 看cpp源码
timer.toc()
if self.solver.iter % (10 * self.solver_param.display) == 0:
print 'speed: {:.3f}s / iter'.format(timer.average_time)
if self.solver.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0:
last_snapshot_iter = self.solver.iter
model_paths.append(self.snapshot())
if last_snapshot_iter != self.solver.iter:
model_paths.append(self.snapshot())
return model_paths
在最后一句话中,调用了self.solver.net.layers[0].set_roidb(roidb),这句话很关键,即调用了网络第一层也即是prototxt文件的第一层即RoIDataLayer层的set_roidb函数,把之前读取的roidb对象传进了网络。
def set_roidb(self, roidb):
"""Set the roidb to be used by this layer during training."""
self._roidb = roidb
self._shuffle_roidb_inds()
if cfg.TRAIN.USE_PREFETCH:
self._blob_queue = Queue(10)
self._prefetch_process = BlobFetcher(self._blob_queue,
self._roidb,
self._num_classes)
self._prefetch_process.start()
# Terminate the child process when the parent exists
def cleanup():
print 'Terminating BlobFetcher'
self._prefetch_process.terminate()
self._prefetch_process.join()
import atexit
atexit.register(cleanup)
==================================class BlobFetcher(Process):
"""Experimental class for prefetching blobs in a separate process."""
def __init__(self, queue, roidb, num_classes):
super(BlobFetcher, self).__init__() 父类process 在process.py定义
self._queue = queue
self._roidb = roidb
self._num_classes = num_classes
self._perm = None
self._cur = 0
self._shuffle_roidb_inds()
# fix the random seed for reproducibility
np.random.seed(cfg.RNG_SEED)
打乱顺序并将self._cur置0
def _shuffle_roidb_inds(self):
"""Randomly permute the training roidb."""
# TODO(rbg): remove duplicated code
self._perm = np.random.permutation(np.arange(len(self._roidb))) ._perm用来存shuffle后的index
self._cur = 0
def _get_next_minibatch_inds(self):
"""Return the roidb indices for the next minibatch."""
# TODO(rbg): remove duplicated code
if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb):
self._shuffle_roidb_inds()
db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] TRAIN.IMS_PER_BATCH很奇怪 config.py中是2,这里却是1
self._cur += cfg.TRAIN.IMS_PER_BATCH
return db_inds
def run(self):
print 'BlobFetcher started'
while True:
db_inds = self._get_next_minibatch_inds() 获取下个batch的对应index 利用self._cur
minibatch_db = [self._roidb[i] for i in db_inds] 由于上面的TRAIN.IMS_PER_BATCH是1,导致db_inds尺寸也是1,minibatch_db只针对一张图
blobs = get_minibatch(minibatch_db, self._num_classes)
self._queue.put(blobs)
============================================def train_net(solver_prototxt, roidb, output_dir,
pretrained_model=None, max_iters=40000):
"""Train a Fast R-CNN network."""
roidb = filter_roidb(roidb)
sw = SolverWrapper(solver_prototxt, roidb, output_dir,
pretrained_model=pretrained_model)
print 'Solving...'
model_paths = sw.SolverWrapper(max_iters)
print 'done solving'
return model_paths
到现在我们再来看roidb对象里到底记录了哪些内容呢?包括数据集所有影像的影像路径,所有影像对应的目标,以及宽高等信息。有了这些信息,就可以在RoIDataLayer层的前向传播函数调用get_next_minibatch,
def setup(self, bottom, top):
"""Setup the RoIDataLayer."""
# parse the layer parameter string, which must be valid YAML
layer_params = yaml.load(self.param_str_)
self._num_classes = layer_params['num_classes']
self._name_to_top_map = {}
# data blob: holds a batch of N images, each with 3 channels
idx = 0
top[idx].reshape(cfg.TRAIN.IMS_PER_BATCH, 3,
max(cfg.TRAIN.SCALES), cfg.TRAIN.MAX_SIZE)
self._name_to_top_map['data'] = idx
idx += 1
if cfg.TRAIN.HAS_RPN: 如果用RPN,则data给cov1,im_info和gt_boxes给rpn-data层 (rois, labels, bbox_targets,bbox_inside_weights, bbox_outside_weights由roi-data层(proposal_target)产生并分别传递给roi_pool,loss_cls,loss_bbox, loss_bbox, loss_bbox)
top[idx].reshape(1, 3)
self._name_to_top_map['im_info'] = idx
idx += 1
top[idx].reshape(1, 4)
self._name_to_top_map['gt_boxes'] = idx
idx += 1
else: # not using RPN 没找到对应情况,但肯定不是test,因为test里找不到这些参数
# rois blob: holds R regions of interest, each is a 5-tuple
# (n, x1, y1, x2, y2) specifying an image batch index n and a
# rectangle (x1, y1, x2, y2)
top[idx].reshape(1, 5)
self._name_to_top_map['rois'] = idx
idx += 1
# labels blob: R categorical labels in [0, ..., K] for K foreground
# classes plus background
top[idx].reshape(1)
self._name_to_top_map['labels'] = idx
idx += 1
if cfg.TRAIN.BBOX_REG:
# bbox_targets blob: R bounding-box regression targets with 4
# targets per class
top[idx].reshape(1, self._num_classes * 4)
self._name_to_top_map['bbox_targets'] = idx
idx += 1
# bbox_inside_weights blob: At most 4 targets per roi are active;
# thisbinary vector sepcifies the subset of active targets
top[idx].reshape(1, self._num_classes * 4)
self._name_to_top_map['bbox_inside_weights'] = idx
idx += 1
top[idx].reshape(1, self._num_classes * 4)
self._name_to_top_map['bbox_outside_weights'] = idx
idx += 1
print 'RoiDataLayer: name_to_top:', self._name_to_top_map
assert len(top) == len(self._name_to_top_map)
def forward(self, bottom, top):
"""Get blobs and copy them into this layer's top blob vector."""
blobs = self._get_next_minibatch()
for blob_name, blob in blobs.iteritems():
top_ind = self._name_to_top_map[blob_name]
# Reshape net's input blobs
top[top_ind].reshape(*(blob.shape))
# Copy data into net's input blobs
top[top_ind].data[...] = blob.astype(np.float32, copy=False)
先取index,再取对应数据def _get_next_minibatch(self):
"""Return the blobs to be used for the next minibatch.
If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a
separate process and made available through self._blob_queue.
"""
if cfg.TRAIN.USE_PREFETCH:
return self._blob_queue.get()
else:
db_inds = self._get_next_minibatch_inds()
minibatch_db = [self._roidb[i] for i in db_inds]
return get_minibatch(minibatch_db, self._num_classes)
def get_minibatch(roidb, num_classes): 将下个minibatch的roidb封装为blob
"""Given a roidb, construct a minibatch sampled from it."""
num_images = len(roidb) 这里的输入roidb针对一张图的所有框,但其实是长度为1的list<dict>,所以num_images = 1
# Sample random scales to use for each image in this batch
random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES),
size=num_images)
assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \
'num_images ({}) must divide BATCH_SIZE ({})'. \
format(num_images, cfg.TRAIN.BATCH_SIZE)
rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images 默认256/1
fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) 默认0.25 * 256
# Get the input image blob, formatted for caffe
im_blob, im_scales = _get_image_blob(roidb, random_scale_inds)
blobs = {'data': im_blob}
if cfg.TRAIN.HAS_RPN:
assert len(im_scales) == 1, "Single batch only"
assert len(roidb) == 1, "Single batch only"
# gt boxes: (x1, y1, x2, y2, cls) 因为长度为1,所以roidb[0]就是全部
gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] 当前图中对应非背景的框的index
gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0]
gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] 生成blob中的gt_boxes,包括变换后的box坐标以及类型
blobs['gt_boxes'] = gt_boxes
blobs['im_info'] = np.array(
[[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], 生成blob中的im_info, 包括长,宽,放缩比
dtype=np.float32)
else: # not using RPN
# Now, build the region of interest and label blobs
rois_blob = np.zeros((0, 5), dtype=np.float32)
labels_blob = np.zeros((0), dtype=np.float32)
bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32)
bbox_inside_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32)
# all_overlaps = []
for im_i in xrange(num_images):
labels, overlaps, im_rois, bbox_targets, bbox_inside_weights \
= _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image,
num_classes)
# Add to RoIs blob
rois = _project_im_rois(im_rois, im_scales[im_i])
batch_ind = im_i * np.ones((rois.shape[0], 1))
rois_blob_this_image = np.hstack((batch_ind, rois))
rois_blob = np.vstack((rois_blob, rois_blob_this_image))
# Add to labels, bbox targets, and bbox loss blobs
labels_blob = np.hstack((labels_blob, labels))
bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets))
bbox_inside_blob = np.vstack((bbox_inside_blob, bbox_inside_weights))
# all_overlaps = np.hstack((all_overlaps, overlaps))
# For debug visualizations
# _vis_minibatch(im_blob, rois_blob, labels_blob, all_overlaps)
blobs['rois'] = rois_blob
blobs['labels'] = labels_blob
if cfg.TRAIN.BBOX_REG:
blobs['bbox_targets'] = bbox_targets_blob
blobs['bbox_inside_weights'] = bbox_inside_blob
blobs['bbox_outside_weights'] = \
np.array(bbox_inside_blob > 0).astype(np.float32)
return blobs
利用roidb的信息(image, flipped)等找到原图,并对原图进行归一化,放缩等变换def _get_image_blob(roidb, scale_inds):
"""Builds an input blob from the images in the roidb at the specified
scales.
"""
num_images = len(roidb) 一般为1
processed_ims = []
im_scales = []
for i in xrange(num_images):
im = cv2.imread(roidb[i]['image']) roidb[i]中所有框对应一张图
if roidb[i]['flipped']:
im = im[:, ::-1, :] 后面可以加上样本增强
target_size = cfg.TRAIN.SCALES[scale_inds[i]]
im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size,
cfg.TRAIN.MAX_SIZE,cfg.TRAIN.IMAGE_STRIDE)
im_scales.append(im_scale)
processed_ims.append(im)
# Create a blob to hold the input images
blob = im_list_to_blob(processed_ims) list to blob
return blob, im_scales
归一化,变形后输出处理后的图以及变形的尺寸def prep_im_for_blob(im, pixel_means, target_size, max_size,stride): stride = IMAGE_STRIDE默认64
"""Mean subtract and scale an image for use in a blob."""
im = im.astype(np.float32, copy=False)
im -= pixel_means
im_shape = im.shape
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
im_scale = float(target_size) / float(im_size_min)
# Prevent the biggest axis from being more than MAX_SIZE 如果长边*scale后超出最大尺寸,则按scale=最大尺寸/长边
if np.round(im_scale * im_size_max) > max_size:
im_scale = float(max_size) / float(im_size_max)
im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 长宽等比例放缩
interpolation=cv2.INTER_LINEAR)
if stride == 0:
return im, im_scale
else:
# pad to product of stride 这个stride含义??
im_height = int(np.ceil(im.shape[0] / float(stride)) * stride) 向上取整,所以除后再乘回去可能会变大
im_width = int(np.ceil(im.shape[1] / float(stride)) * stride)
im_channel = im.shape[2]
padded_im = np.zeros((im_height, im_width, im_channel)) 填充
padded_im[:im.shape[0], :im.shape[1], :] = im
return padded_im, im_scale
该函数首先调用了_get_next_minibatch_inds此批次运行的影像下标,然后获取他们对应的roidb对象后组成列表,传进get_minibatch函数。在RoIDataLayer层中输出为data,im_info和gt_boxes,在get_minibatch函数中,他把所有的输出信息弄成blob以便进行前向传播。data通过roidb的“image”记录的路径信息然后调用opencv的读影像函数进行读取并进行尺度缩放后输出,对应代码为im_blob, im_scales = _get_image_blob(roidb, random_scale_inds) blobs = {'data': im_blob},而其余的gt_boxes信息之前已经保存,可以直接将之前的信息保存到对应的blob结构中,最后输出。blobs['gt_boxes'] = gt_boxes blobs['im_info'] = np.array( [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], dtype=np.float32),最后返回blob,在前向传播中输出top layer。
相关文章推荐
- faster rcnn源码解读(四)之数据类型imdb.py和pascal_voc.py(主要是imdb和roidb数据类型的解说)
- faster rcnn源码解读(四)之数据类型imdb.py和pascal_voc.py(主要是imdb和roidb数据类型的解说)
- faster rcnn源码解读(四)之数据类型imdb.py和pascal_voc.py(主要是imdb和roidb数据类型的解说)
- faster rcnn源码解读(五)之layer(网络里的input-data)
- faster rcnn 源码解读
- ceph源码网络模块读取数据流程
- faster rcnn源码解读(六)之minibatch
- faster rcnn 源码解读
- 【A-faster rcnn源码相关】及训练日志
- Faster RCNN 源码解读(1) -- 文件结构分析
- r-cnn学习(四):train_faster_rcnn_alt_opt.py源码学习
- Faster RCNN 源码解读(2) -- NMS(非极大抑制)
- Linux 学习数据专题【管理、编程、源码分析】——Linux相关图书选购指南
- [深度学习] RCNNs系列(1) Ubuntu下Faster RCNN配置及训练和测试自己的数据方法
- faster rcnn 源码解读1
- 针对Faster RCNN具体细节以及源码的解读之RoIPooling层
- faster rcnn源码解读(三)train_faster_rcnn_alt_opt.py
- faster rcnn源码解读(三)train_faster_rcnn_alt_opt.py
- faster rcnn源码解读2
- faster rcnn源码解读(五)之layer(网络里的input-data)