decouple convolutiondepthwise and convolution, reduce binary size by 10%, fix Tencent#254
nihui authored and ghimiredhikura committed Jul 26, 2020
1 parent 2113b97 commit 2cb6ecf
Showing 10 changed files with 329 additions and 220 deletions.
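Before this change, ConvolutionDepthWise_arm dispatched every group through a 7x4 table of hand-written NEON kernels shared with the plain convolution layer (conv1x1s1_neon, conv3x3s2_neon, conv7x7s1_neon and so on), which forced convolutiondepthwise_arm.cpp to include all of the convolution_NxN.h headers. After it, each group is handed to a Convolution layer created at runtime through the layer factory, and only the depthwise-specific convdw3x3 kernels remain in this translation unit; dropping those shared headers is what buys the roughly 10% binary-size reduction named in the title. A minimal sketch of the per-group dispatch pattern, using only calls that appear in this diff (ncnn::create_layer, ParamDict::set, Layer::load_param, Layer::load_model, Layer::forward); the helper name, parameter list and header names are illustrative assumptions, not code from the commit:

// Sketch only: header names are assumed, and error handling is kept as
// minimal as in the committed code.
#include "layer.h"
#include "layer_type.h"
#include "mat.h"
#include "paramdict.h"

// Hypothetical helper mirroring the per-group body added by this commit:
// run an ordinary Convolution over one group's slice of the input.
static int convolve_one_group(const ncnn::Mat& bottom_g, ncnn::Mat& top_g,
                              const ncnn::Mat& weight_g, const ncnn::Mat& bias_g,
                              int num_output_g, int kernel_w, int kernel_h,
                              int dilation_w, int dilation_h,
                              int stride_w, int stride_h,
                              int bias_term, int weight_data_size)
{
    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);

    // Describe the group as a stand-alone convolution; padding is 0 because
    // the caller has already bordered the whole input blob.
    ncnn::ParamDict pd;
    pd.set(0, num_output_g);     // num_output
    pd.set(1, kernel_w);
    pd.set(11, kernel_h);
    pd.set(2, dilation_w);
    pd.set(12, dilation_h);
    pd.set(3, stride_w);
    pd.set(13, stride_h);
    pd.set(4, 0);                // pad_w
    pd.set(14, 0);               // pad_h
    pd.set(5, bias_term);
    pd.set(6, weight_data_size);
    op->load_param(pd);

    // Hand over the per-group weight and bias slices.
    ncnn::Mat weights[2];
    weights[0] = weight_g;
    weights[1] = bias_g;
    op->load_model(weights);

    int ret = op->forward(bottom_g, top_g);
    delete op;
    return ret;
}

In the committed code the depthwise branch (channels == group == num_output) uses this pattern with num_output_g fixed to 1 and a weight_data_size of maxk, while the generic grouped branch passes the per-group output count num_output_g and a weight_data_size of maxk * channels_g * num_output_g.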
162 changes: 78 additions & 84 deletions src/layer/arm/convolutiondepthwise_arm.cpp
@@ -18,14 +18,9 @@
#include <omp.h>
#endif

namespace ncnn {
#include "layer_type.h"

#include "convolution_1x1.h"
#include "convolution_2x2.h"
#include "convolution_3x3.h"
#include "convolution_4x4.h"
#include "convolution_5x5.h"
#include "convolution_7x7.h"
namespace ncnn {

#include "convolutiondepthwise_3x3.h"

@@ -36,77 +31,18 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
// convolv with NxN kernel
// value = value + bias

if (kernel_w != kernel_h || stride_w != stride_h)
{
return ConvolutionDepthWise::forward(bottom_blob, top_blob);
}

const int kernel_size = kernel_w;
const int stride = stride_w;

if (kernel_size > 7 || stride > 4 || dilation_w != 1 || dilation_h != 1)
{
return ConvolutionDepthWise::forward(bottom_blob, top_blob);
}

typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&);
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;

// kernel_size x stride
conv_func conv_func_table[7][4] =
{
{
conv1x1s1_neon,
conv1x1s2_neon,
0,
0
}, // kernel_size = 1
{
conv2x2s1_neon,
0,
0,
0
}, // kernel_size = 2
{
conv3x3s1_neon,
conv3x3s2_neon,
0,
0
}, // kernel_size = 3
{
0,
0,
0,
conv4x4s4_neon
}, // kernel_size = 4
{
conv5x5s1_neon,
conv5x5s2_neon,
0,
0
}, // kernel_size = 5
{
0,
0,
0,
0
}, // kernel_size = 6
{
conv7x7s1_neon,
conv7x7s2_neon,
0,
0
} // kernel_size = 7
};

conv_func conv = conv_func_table[kernel_size-1][stride-1];
if (!conv)
if (channels % group != 0 || num_output % group != 0)
{
return ConvolutionDepthWise::forward(bottom_blob, top_blob);
// reject invalid group
return -100;
}

int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

Mat bottom_blob_bordered = bottom_blob;
if (pad_w > 0 || pad_h > 0)
@@ -120,8 +56,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
}
else if (pad_w == -233 && pad_h == -233)
{
int wpad = kernel_size + (w - 1) / stride * stride - w;
int hpad = kernel_size + (h - 1) / stride * stride - h;
int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
if (wpad > 0 || hpad > 0)
{
copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, 0.f);
@@ -133,26 +69,26 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
h = bottom_blob_bordered.h;
}

int outw = (w - kernel_size) / stride + 1;
int outh = (h - kernel_size) / stride + 1;
int outw = (w - kernel_extent_w) / stride_w + 1;
int outh = (h - kernel_extent_h) / stride_h + 1;

top_blob.create(outw, outh, num_output);
if (top_blob.empty())
return -100;

const int maxk = kernel_size * kernel_size;
const int maxk = kernel_w * kernel_h;

// depth-wise
if (channels == group && group == num_output)
{
if (kernel_size == 3)
if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
{
if (stride == 1)
if (stride_w == 1 && stride_h == 1)
{
convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data);
return 0;
}
else if (stride == 2)
else if (stride_w == 2 && stride_h == 2)
{
convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data);
return 0;
@@ -174,7 +110,36 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
if (bias_term)
bias_data_g = Mat(1, (void*)((const float*)bias_data + g));

conv(bottom_blob_bordered_g, top_blob_g, weight_data_g, bias_data_g);
// call Convolution
ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);

// set param
ncnn::ParamDict pd;
pd.set(0, 1);// num_output
pd.set(1, kernel_w);
pd.set(11, kernel_h);
pd.set(2, dilation_w);
pd.set(12, dilation_h);
pd.set(3, stride_w);
pd.set(13, stride_h);
pd.set(4, 0);// pad_w
pd.set(14, 0);// pad_h
pd.set(5, bias_term);
pd.set(6, maxk);// weight_data_size

op->load_param(pd);

// set weights
ncnn::Mat weights[2];
weights[0] = weight_data_g;
weights[1] = bias_data_g;

op->load_model(weights);

// forward
op->forward(bottom_blob_bordered_g, top_blob_g);

delete op;
}

#ifdef _OPENMP
@@ -195,7 +160,36 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob) con
if (bias_term)
bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));

conv(bottom_blob_bordered_g, top_blob_g, weight_data_g, bias_data_g);
// call Convolution
ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);

// set param
ncnn::ParamDict pd;
pd.set(0, num_output_g);// num_output
pd.set(1, kernel_w);
pd.set(11, kernel_h);
pd.set(2, dilation_w);
pd.set(12, dilation_h);
pd.set(3, stride_w);
pd.set(13, stride_h);
pd.set(4, 0);// pad_w
pd.set(14, 0);// pad_h
pd.set(5, bias_term);
pd.set(6, maxk * channels_g * num_output_g);// weight_data_size

op->load_param(pd);

// set weights
ncnn::Mat weights[2];
weights[0] = weight_data_g;
weights[1] = bias_data_g;

op->load_model(weights);

// forward
op->forward(bottom_blob_bordered_g, top_blob_g);

delete op;
}

return 0;
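The rewritten geometry code in convolutiondepthwise_arm.cpp also drops the square kernel_size/stride shortcut in favour of separate width/height strides and a dilated kernel extent. A small self-contained check of the new formulas, with illustrative numbers that are not taken from the commit:

#include <cstdio>

int main()
{
    // Illustrative values only.
    int w = 224, kernel_w = 3, dilation_w = 2, stride_w = 2;

    // Dilated kernel extent, as in the new code.
    int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;              // 5

    // pad_w == -233 requests SAME-style padding: add just enough columns
    // so the last stride step still covers a full kernel extent.
    int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;     // 5 + 222 - 224 = 3
    int padded_w = w + wpad;                                            // 227

    // Output width after bordering, matching the new outw formula.
    int outw = (padded_w - kernel_extent_w) / stride_w + 1;             // (227 - 5) / 2 + 1 = 112

    printf("kernel_extent_w=%d wpad=%d outw=%d\n", kernel_extent_w, wpad, outw);
    return 0;
}

With these numbers outw comes out as 112 = ceil(w / stride_w), the usual SAME-style result that the pad_w == -233 convention requests.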
2 changes: 1 addition & 1 deletion src/layer/arm/deconvolution_arm.cpp
@@ -66,7 +66,7 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob) const
int outw = (w - 1) * stride + kernel_size;
int outh = (h - 1) * stride + kernel_size;

Mat top_blob_bordered;
Mat top_blob_bordered = top_blob;
top_blob_bordered.create(outw, outh, num_output);
if (top_blob_bordered.empty())
return -100;
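The deconvolution hunk above only swaps the initializer of top_blob_bordered; the output-size lines around it are untouched context. For reference, outw = (w - 1) * stride + kernel_size is the standard no-padding transposed-convolution output size; with illustrative numbers not taken from the diff, w = 7, stride = 2, kernel_size = 3 gives outw = (7 - 1) * 2 + 3 = 15.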