Here we take the R-FCN network with ResNet-101 as the backbone as an example. In ResNet-101, conv4 and all earlier layers are kept unchanged (stride = 16), the final average-pooling layer and fully connected layer are removed, and the conv5 stage onward is modified as follows (the stride and dilation changes are shown in the code snippet below):
- the first convolution block of conv5 has its stride changed from 2 to 1;
- all convolution layers in the conv5 stage (the middle layer of each ResNet building block) use dilated (atrous) convolution with dilation set to 2, to compensate for the receptive-field reduction caused by changing the stride to 1;
- a 1x1 convolution with 1024 filters is appended after the last layer to transform the original 2048-dimensional feature map and reduce the computational cost;
# The first convolution block of conv5: stride changed from 2 to 1
res5a_branch1 = mx.symbol.Convolution(name='res5a_branch1', data=conv_feat, num_filter=2048, pad=(0, 0), kernel=(1, 1), stride=(1, 1), no_bias=True)
res5a_branch2a = mx.symbol.Convolution(name='res5a_branch2a', data=conv_feat, num_filter=512, pad=(0, 0), kernel=(1, 1), stride=(1, 1), no_bias=True)
# All 3x3 convolutions in the conv5 stage (the middle layer of each ResNet building block) use dilated convolution with dilation=2 to compensate for the receptive-field loss caused by the stride change
res5a_branch2b = mx.symbol.Convolution(name='res5a_branch2b', data=res5a_branch2a_relu, num_filter=512, pad=(2, 2), kernel=(3, 3), stride=(1, 1), dilate=(2, 2), no_bias=True, cudnn_off=True)
res5b_branch2b = mx.symbol.Convolution(name='res5b_branch2b', data=res5b_branch2a_relu, num_filter=512, pad=(2, 2), kernel=(3, 3), stride=(1, 1), dilate=(2, 2), no_bias=True, cudnn_off=True)
res5c_branch2b = mx.symbol.Convolution(name='res5c_branch2b', data=res5c_branch2a_relu, num_filter=512, pad=(2, 2), kernel=(3, 3), stride=(1, 1), dilate=(2, 2), no_bias=True, cudnn_off=True)
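To see why `pad=(2, 2)` together with `dilate=(2, 2)` keeps the feature-map resolution while enlarging the effective kernel, here is a quick back-of-the-envelope check (plain Python; `conv_out_size` is a hypothetical helper, not part of the repository):

```python
def conv_out_size(size, kernel=3, stride=1, pad=2, dilate=2):
    """Standard convolution output-size formula; effective kernel = dilate*(kernel-1)+1."""
    eff_k = dilate * (kernel - 1) + 1          # 3x3 with dilation 2 -> effective 5x5
    return (size + 2 * pad - eff_k) // stride + 1

# A 3x3 conv with stride=1, pad=2, dilate=2 leaves the spatial size unchanged,
# while its 5x5 effective kernel restores the receptive field that would otherwise
# be lost by removing the stride-2 downsampling in conv5.
print(conv_out_size(38))   # -> 38 (e.g. a ~600-pixel input at stride 16)
```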
The overall base network architecture is shown in the figure below.
The position-sensitive RoI pooling (PSROIPooling) layer is the core of the R-FCN architecture; the following walks through its implementation details in the source code.
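As a recap of what the code below computes: for an RoI of size $w \times h$ divided into $k \times k$ bins, the R-FCN paper (Eq. (1)) defines the pooled response of category $c$ in bin $(i, j)$ as an average over the corresponding score map:

$$r_c(i, j \mid \Theta) = \frac{1}{n} \sum_{(x, y) \in \mathrm{bin}(i, j)} z_{i, j, c}(x + x_0,\; y + y_0 \mid \Theta)$$

where $z_{i,j,c}$ is one of the $k^2(C+1)$ position-sensitive score maps, $(x_0, y_0)$ is the top-left corner of the RoI, $n$ is the number of pixels in the bin, and $\Theta$ denotes the network parameters.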
// In Caffe, data flows from bottom to top, matching the corresponding tags in the prototxt
template <typename Dtype>
void PSROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  // bottom_data holds the position-sensitive score maps and bottom_rois the RoIs;
  // each RoI is pooled over the score maps in a position-sensitive way
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* bottom_rois = bottom[1]->cpu_data();
  // Output blob of the layer
  Dtype* top_data = top[0]->mutable_cpu_data();
  // Channel mapping of each pooled output element
  int* mapping_channel_ptr = mapping_channel_.mutable_cpu_data();
  int count = top[0]->count();
  // Initialize the output (to 0) and the channel mapping (to -1)
  caffe_set(count, Dtype(0), top_data);
  caffe_set(count, -1, mapping_channel_ptr);
  // Core PSROIPooling
  PSROIPoolingForward(bottom[1]->num(), bottom_data, spatial_scale_,
      channels_, height_, width_, pooled_height_,
      pooled_width_, bottom_rois, output_dim_, group_size_,
      top_data, mapping_channel_ptr);
}
template <typename Dtype>
static void PSROIPoolingForward(const int num, const Dtype* bottom_data,
    const Dtype spatial_scale, const int channels, const int height,
    const int width, const int pooled_height, const int pooled_width,
    const Dtype* bottom_rois, const int output_dim, const int group_size,
    Dtype* top_data, int* mapping_channel) {
  // num is the number of RoIs; apply position-sensitive RoI pooling to each RoI
  for (int n = 0; n < num; ++n) {
    // Each RoI occupies 5 values (batch_index, x1, y1, x2, y2),
    // so roi_add is the offset of the n-th RoI in bottom_rois
    int roi_add = n * 5;
    // [start, end) interval for spatial sampling
    // roi_batch_ind is the index of the image in the batch that this RoI belongs to
    int roi_batch_ind = bottom_rois[roi_add];
    // The RoI coordinates are given on the input image and must be scaled down to the
    // feature-map resolution; with stride = 16, spatial_scale = 1/16.0.
    // (x1, y1) is the top-left corner and (x2, y2) the bottom-right corner;
    // +1 is added to the end coordinates to slightly enlarge the RoI region
    Dtype roi_start_w = static_cast<Dtype>(round(bottom_rois[roi_add + 1])) * spatial_scale;
    Dtype roi_start_h = static_cast<Dtype>(round(bottom_rois[roi_add + 2])) * spatial_scale;
    Dtype roi_end_w = static_cast<Dtype>(round(bottom_rois[roi_add + 3]) + 1.) * spatial_scale;
    Dtype roi_end_h = static_cast<Dtype>(round(bottom_rois[roi_add + 4]) + 1.) * spatial_scale;
    // Force too small ROIs to be 1x1
    Dtype roi_width = max<Dtype>(roi_end_w - roi_start_w, 0.1);
    Dtype roi_height = max<Dtype>(roi_end_h - roi_start_h, 0.1);
    // Compute w and h at bottom:
    // the RoI is divided into a pooled_height x pooled_width grid,
    // so each bin covers bin_size_h x bin_size_w on the feature map
    Dtype bin_size_h = roi_height / static_cast<Dtype>(pooled_height);
    Dtype bin_size_w = roi_width / static_cast<Dtype>(pooled_width);
    // ctop indexes the output channel dimension (output_dim, e.g. C+1 categories)
    for (int ctop = 0; ctop < output_dim; ++ctop) {
      // Pool each bin of the pooled_height x pooled_width grid
      for (int ph = 0; ph < pooled_height; ++ph) {
        for (int pw = 0; pw < pooled_width; ++pw) {
          // The output is in order (n, ctop, ph, pw),
          // i.e. (#roi, output_dim, pooled_height, pooled_width)
          int index = n*output_dim*pooled_height*pooled_width + ctop*pooled_height*pooled_width + ph*pooled_width + pw;
          // Start and end of the current bin on the feature map
          int hstart = floor(static_cast<Dtype>(ph) * bin_size_h + roi_start_h);
          int wstart = floor(static_cast<Dtype>(pw) * bin_size_w + roi_start_w);
          int hend = ceil(static_cast<Dtype>(ph + 1) * bin_size_h + roi_start_h);
          int wend = ceil(static_cast<Dtype>(pw + 1) * bin_size_w + roi_start_w);
          // Add roi offsets and clip to input boundaries, i.e. to [0, height] / [0, width]
          hstart = min(max(hstart, 0), height);
          hend = min(max(hend, 0), height);
          wstart = min(max(wstart, 0), width);
          wend = min(max(wend, 0), width);
          // Check whether the bin is empty
          bool is_empty = (hend <= hstart) || (wend <= wstart);
          // Group (position-sensitive) indices of this bin
          int gw = pw;
          int gh = ph;
          // Channel of the score map that this bin reads from
          int c = (ctop*group_size + gh)*group_size + gw;
          // Sum accumulated over the bin
          Dtype out_sum = 0;
          // Iterate over the bin, first over h and then over w
          for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
              // Map the current position to the index in bottom_data
              int bottom_index = h*width + w;
              out_sum += bottom_data[(roi_batch_ind * channels + c) * height * width + bottom_index];
            }
          }
          Dtype bin_area = (hend - hstart)*(wend - wstart);
          if (is_empty) {
            top_data[index] = 0;
          } else {
            // The pooled output is the average over the bin
            top_data[index] = out_sum / bin_area;
          }
          // Record the score-map channel c that this output element was pooled from
          mapping_channel[index] = c;
        }
      }
    }
  }
}
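In the MXNet-based Deformable-ConvNets code used below for training, the same operation is available as an operator, so the R-FCN classification head can be written in a few lines. The sketch below mirrors the repository's naming (`conv_new_1`, `rfcn_cls`, `psroipooled_cls_rois`), but treat it as an illustration: `conv_feat`, `rois`, and `num_classes` are placeholders, and the exact wiring in the repository may differ slightly.

```python
import mxnet as mx

num_classes = 21                                  # e.g. 20 VOC classes + background (assumption)
conv_feat = mx.symbol.Variable('conv_feat')       # conv5 feature map, stride 16
rois = mx.symbol.Variable('rois')                 # (batch_index, x1, y1, x2, y2) per RoI

# The extra 1x1, 1024-channel convolution mentioned above
conv_new_1 = mx.symbol.Convolution(name='conv_new_1', data=conv_feat, num_filter=1024,
                                   kernel=(1, 1), pad=(0, 0))
relu_new_1 = mx.symbol.Activation(name='relu_new_1', data=conv_new_1, act_type='relu')

# k*k*(C+1) position-sensitive score maps, here k = 7
rfcn_cls = mx.symbol.Convolution(name='rfcn_cls', data=relu_new_1,
                                 num_filter=7 * 7 * num_classes, kernel=(1, 1), pad=(0, 0))

# Position-sensitive RoI pooling: bin (i, j) of each RoI is averaged from its own
# group of (C+1) channels, which is exactly what the Caffe loop above computes
psroipooled_cls_rois = mx.contrib.sym.PSROIPooling(name='psroipooled_cls_rois',
                                                   data=rfcn_cls, rois=rois,
                                                   group_size=7, pooled_size=7,
                                                   output_dim=num_classes,
                                                   spatial_scale=0.0625)

# Vote by averaging the 7x7 bins, giving one (C+1)-dimensional score per RoI
cls_score = mx.symbol.Pooling(name='ave_cls_scores_rois', data=psroipooled_cls_rois,
                              pool_type='avg', global_pool=True, kernel=(7, 7))
```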
The R-FCN network is trained in the Deformable-ConvNets environment, with 2007_trainval as the training set and 2007_test as the test set; the training/testing configuration yaml file is ./experiments/fpn/cfgs/resnet_v1_101_coco_trainval_fpn_end2end_ohem.yaml.
AP for aeroplane = 0.7611
AP for bicycle = 0.8006
AP for bird = 0.7612
AP for boat = 0.6477
AP for bottle = 0.6188
AP for bus = 0.8375
AP for car = 0.8034
AP for cat = 0.8715
AP for chair = 0.5583
AP for cow = 0.8152
AP for diningtable = 0.6597
AP for dog = 0.8692
AP for horse = 0.8322
AP for motorbike = 0.7850
AP for person = 0.7875
AP for pottedplant = 0.4577
AP for sheep = 0.7291
AP for sofa = 0.7339
AP for train = 0.8109
AP for tvmonitor = 0.7240
Mean AP@0.5 = 0.7432
AP for aeroplane = 0.4946
AP for bicycle = 0.6066
AP for bird = 0.5411
AP for boat = 0.3584
AP for bottle = 0.3845
AP for bus = 0.7430
AP for car = 0.6925
AP for cat = 0.6698
AP for chair = 0.3244
AP for cow = 0.6376
AP for diningtable = 0.4007
AP for dog = 0.6314
AP for horse = 0.6247
AP for motorbike = 0.5744
AP for person = 0.5496
AP for pottedplant = 0.2311
AP for sheep = 0.5756
AP for sofa = 0.5211
AP for train = 0.6610
AP for tvmonitor = 0.6209
Mean AP@0.7 = 0.5421
Training mainly involves six metrics: rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric and rcnn_eval_metric, rcnn_cls_metric, rcnn_bbox_metric; the details are shown below.
import mxnet as mx
import numpy as np

# get_rpn_names() comes from the same metric module in the repository and returns
# the names of the RPN prediction and label outputs.

class RPNAccMetric(mx.metric.EvalMetric):
    """RPN classification accuracy metric."""
    def __init__(self):
        super(RPNAccMetric, self).__init__('RPNAcc')
        self.pred, self.label = get_rpn_names()

    def update(self, labels, preds):
        pred = preds[self.pred.index('rpn_cls_prob')]
        label = labels[self.label.index('rpn_label')]

        # pred (b, c, p) or (b, c, h, w)
        pred_label = mx.ndarray.argmax_channel(pred).asnumpy().astype('int32')
        pred_label = pred_label.reshape((pred_label.shape[0], -1))
        # label (b, p)
        label = label.asnumpy().astype('int32')

        # filter with keep_inds: anchors labelled -1 are ignored
        keep_inds = np.where(label != -1)
        pred_label = pred_label[keep_inds]
        label = label[keep_inds]

        self.sum_metric += np.sum(pred_label.flat == label.flat)
        self.num_inst += len(pred_label.flat)
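For completeness, here is a short sketch of how the six metrics listed above are typically combined during training. `RPNAccMetric` is the class shown above; the other five class names are assumptions that mirror the repository's metric module, and some of them may take a config argument in the actual code.

```python
import mxnet as mx

# Combine the six metrics into one composite metric that the training loop
# updates every batch. The variable names follow the six metrics listed above.
rpn_eval_metric = RPNAccMetric()
rpn_cls_metric = RPNLogLossMetric()        # assumed class names for the other five
rpn_bbox_metric = RPNL1LossMetric()
rcnn_eval_metric = RCNNAccMetric()
rcnn_cls_metric = RCNNLogLossMetric()
rcnn_bbox_metric = RCNNL1LossMetric()

eval_metrics = mx.metric.CompositeEvalMetric()
for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric,
                     rcnn_eval_metric, rcnn_cls_metric, rcnn_bbox_metric]:
    eval_metrics.add(child_metric)
```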
- R-FCN paper translation (Chinese version)
- Understanding Region-based Fully Convolutional Networks (R-FCN) for object detection: a very intuitive blog post, recommended as a detailed reference.
- pytorch_RFCN: a PyTorch implementation of the R-FCN network.
- psroi_pooling_layer.cu: the authors' Caffe implementation of PSROIPooling; psroi_pooling_layer.cpp is the corresponding third-party CPU implementation.
- RFCN train_val.prototxt and RFCN test.prototxt