This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[MXNET-97] implement DepthwiseConv2dBackwardFilterKernel from tensorflow codebase #10098

Merged 1 commit on Mar 16, 2018
85 changes: 7 additions & 78 deletions src/operator/nn/depthwise_convolution-inl.h
@@ -79,80 +79,6 @@ class DepthwiseConvolutionOp {
}; // class DepthwiseConvolutionOp

namespace depthwise_conv {
namespace cuda {
template<typename DType, int kFilterWidth, int kFilterHeight>
__global__ void __launch_bounds__(1024, 2)
DepthwiseConv2dBackwardFilterKernel(const DepthwiseArgs args,
const DType* out_grad,
const DType* input,
DType* filter_grad) {
const int in_height = args.in_height;
const int in_width = args.in_width;
const int channel = args.in_channel;
const int filter_height = kFilterHeight > 0 ? kFilterHeight : args.filter_height;
const int filter_width = kFilterWidth > 0 ? kFilterWidth : args.filter_width;
const int stride_height = args.stride_height;
const int stride_width = args.stride_width;
const int pad_height = args.pad_height;
const int pad_width = args.pad_width;
const int out_height = args.out_height;
const int out_width = args.out_width;

const int filter_pixels = filter_width * filter_height;
const int out_pixels = out_height * out_width;
const int in_pixels = in_height * in_width;
const int batch_channel_num = channel * args.batch;

for (int b = blockIdx.x; b < batch_channel_num; b += gridDim.x) {
const int local_batch = b / channel;
const int local_channel = b % channel;
const int filter_offset_temp = local_channel * filter_pixels;
const int out_grad_offset_temp = (local_batch * channel * out_pixels) +
(local_channel * out_pixels);

// Make sure all threads enter the loop so they get to the enclosed __syncthreads()
for (int out_id = threadIdx.x;
out_id < ROUND_TO_MULTIPLE(out_pixels,
blockDim.x); out_id += blockDim.x) {
const int out_w = out_id % out_width;
const int out_h = (out_id / out_width) % out_height;
const int out_grad_offset = out_grad_offset_temp + (out_h * out_width) + (out_w);
// Set out_g to 0 if the thread would normally have not entered the loop.
const DType out_g = out_id < out_pixels ? ldg(out_grad + out_grad_offset) : DType(0);

const int in_h_start = out_h * stride_height - pad_height;
const int in_w_start = out_w * stride_width - pad_width;
CUDA_UNROLL for (int f_h = 0; f_h < filter_height; ++f_h) {
const int in_h = in_h_start + f_h;
const int input_offset_temp = (local_batch * channel * in_pixels) +
(local_channel * in_pixels) + (in_h * in_width);
const int filter_offset_h = filter_width * f_h;

CUDA_UNROLL for (int f_w = 0; f_w < filter_width; ++f_w) {
const int in_w = in_w_start + f_w;
DType partial_grad = DType(0.0f);
if (in_h >= 0 && in_h < in_height && in_w >= 0 && in_w < in_width) {
const int input_offset = input_offset_temp + in_w;
// Set partial_grad to 0 if the thread would normally not have entered the loop.
partial_grad = out_id < out_pixels ? ldg(input + input_offset) * out_g : DType(0);
}
// reduce all valid partial grad in a block
typedef cub::BlockReduce<DType, mshadow::cuda::kBaseThreadNum> BlockReduceT;
__shared__ typename BlockReduceT::TempStorage temp_storage_reduce;
DType aggregate = BlockReduceT(temp_storage_reduce).Sum(partial_grad, blockDim.x);
if (threadIdx.x == 0) {
DType* addr = filter_grad + f_w + filter_offset_h + filter_offset_temp;
atomicAdd(addr, aggregate);
}
// The presence of __syncthreads() here means all threads must enter enclosing for-loops.
__syncthreads();
} // for filter_width
} // for filter_height
} // for out_pixels
__syncthreads();
} // for batch_channel_num
}
} // namespace cuda
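
Note on the deleted kernel above: ROUND_TO_MULTIPLE is defined elsewhere in the MXNet sources and is not part of this diff. A minimal sketch of such a rounding helper, written here only as an assumption to make the __syncthreads() reasoning concrete, not the verbatim MXNet macro:

// Assumed helper: round x up to the nearest multiple of m, so every thread in
// the block executes the same number of loop iterations and therefore reaches
// the block-wide BlockReduce/__syncthreads() calls inside the loop.
#define ROUND_TO_MULTIPLE(x, m) ((((x) + (m) - 1) / (m)) * (m))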

template<typename DType>
void DepthwiseConv2dForwardGpu(mshadow::Stream<gpu> *stream,
@@ -244,6 +170,7 @@ void DepthwiseConv2dBackwardFilterGpu(mshadow::Stream<gpu> *stream,
using namespace mshadow;
using namespace mshadow::expr;
using namespace tf::depthwise_conv;
using namespace tf::depthwise_conv::cuda;
Tensor<gpu, 4, DType> out_g = out_grad[conv::kOut].get<gpu, 4, DType>(stream);
Tensor<gpu, 4, DType> in_d = in_data[conv::kData].get<gpu, 4, DType>(stream);
Tensor<gpu, 4, DType> weight_grad = in_grad[conv::kWeight].get<gpu, 4, DType>(stream);
@@ -258,17 +185,19 @@ void DepthwiseConv2dBackwardFilterGpu(mshadow::Stream<gpu> *stream,
auto s = mshadow::Stream<gpu>::GetStream(stream);
int block_num = std::min(args.out_channel * args.batch, mshadow::cuda::kMaxGridNum);
if (args.filter_width == 3 && args.filter_height == 3) {
cuda::DepthwiseConv2dBackwardFilterKernel<DType, 3, 3>
DepthwiseConv2dBackwardFilterKernel<DType, 3, 3>
<<<block_num, mshadow::cuda::kBaseThreadNum, 0, s>>>(args,
out_g.dptr_,
in_d.dptr_,
weight_grad.dptr_);
weight_grad.dptr_,
num_out_grad);
} else {
cuda::DepthwiseConv2dBackwardFilterKernel<DType, -1, -1>
DepthwiseConv2dBackwardFilterKernel<DType, -1, -1>
<<<block_num, mshadow::cuda::kBaseThreadNum, 0, s>>>(args,
out_g.dptr_,
in_d.dptr_,
weight_grad.dptr_);
weight_grad.dptr_,
num_out_grad);
}
MSHADOW_CUDA_POST_KERNEL_CHECK(DepthwiseConv2dBackwardFilterKernel);
}
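
The new num_out_grad argument is defined above the hunk shown here, so its exact expression is not visible in this diff. Judging from the index decomposition inside the added kernel (one thread per out_grad element), it is presumably the flattened size of the output-gradient tensor. A hedged sketch, assuming NCHW layout and the DepthwiseArgs fields used elsewhere in this file:

// Assumption for illustration only: one work item per element of out_grad,
// matching how the kernel peels out_w, out_h, out_c and out_b off thread_id.
const int num_out_grad = args.batch * args.out_channel * args.out_height * args.out_width;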
90 changes: 90 additions & 0 deletions src/operator/nn/depthwise_convolution_tf.cuh
@@ -384,6 +384,96 @@ DepthwiseConv2dBackwardDataKernel(const DepthwiseArgs args,
}
}

// A CUDA kernel to compute the depthwise convolution backprop w.r.t. filter.
template <typename DType, int kFilterWidth, int kFilterHeight>
__global__ void __launch_bounds__(640, 2)
DepthwiseConv2dBackwardFilterKernel(const DepthwiseArgs args,
const DType* out_backprop,
const DType* input,
DType* filter_backprop,
int num_out_backprop) {
const int in_channel = args.in_channel;
const int in_height = args.in_height;
const int in_width = args.in_width;
const int filter_height = kFilterHeight > 0 ? kFilterHeight : args.filter_height;
const int filter_width = kFilterWidth > 0 ? kFilterWidth : args.filter_width;
const int stride_height = args.stride_height;
const int stride_width = args.stride_width;
const int pad_height = args.pad_height;
const int pad_width = args.pad_width;
const int out_channel = args.out_channel;
const int out_height = args.out_height;
const int out_width = args.out_width;

CUDA_KERNEL_LOOP(thread_id, num_out_backprop) {
// Compute the indexes of this thread in the output.
const int out_w = thread_id % out_width;
const int out_h = (thread_id / out_width) % out_height;
const int out_c = (thread_id / out_width / out_height) % out_channel;
const int out_b = thread_id / out_width / out_height / out_channel;
const int in_c = out_c;

// Decide if all input is valid. If yes, we can skip the boundary checks
// for each input.
const int in_row_start = out_h * stride_height - pad_height;
const int in_col_start = out_w * stride_width - pad_width;
const int in_row_end = in_row_start + filter_height;
const int in_col_end = in_col_start + filter_width;

const int out_backprop_offset =
(out_b * out_channel * out_height * out_width) +
(out_c * out_height * out_width) + (out_h * out_width) +
(out_w);

const DType out_bp = ldg(out_backprop + out_backprop_offset);
if (in_row_start >= 0 && in_col_start >= 0 &&
in_row_end < in_height && in_col_end < in_width) {
CUDA_UNROLL for (int f_h = 0; f_h < filter_height; ++f_h) {
const int in_row = in_row_start + f_h;
// Avoid repeated computation.
const int input_offset_temp =
(out_b * in_channel * in_height * in_width) +
(in_c * in_height * in_width) + (in_row * in_width);

CUDA_UNROLL for (int f_w = 0; f_w < filter_width; ++f_w) {
const int in_col = in_col_start + f_w;
const int input_offset = input_offset_temp + in_col;
DType partial_sum = ldg(input + input_offset) * out_bp;
DType* addr = filter_backprop + (in_c + in_channel * (f_w + filter_width * f_h));
atomicAdd(addr, partial_sum);
}
}
} else {
CUDA_UNROLL for (int f_h = 0; f_h < filter_height; ++f_h) {
const int in_row = in_row_start + f_h;
// Avoid repeated computation.
const int input_offset_temp =
(out_b * in_channel * in_height * in_width) +
(in_c * in_height * in_width) + (in_row * in_width);
CUDA_UNROLL for (int f_w = 0; f_w < filter_width; ++f_w) {
const int in_col = in_col_start + f_w;
const int addr_temp = filter_width * f_h;

if (in_row >= 0 && in_row < in_height && in_col >= 0 && in_col < in_width) {
const int input_offset = input_offset_temp + in_col;
DType partial_sum = ldg(input + input_offset) * out_bp;
DType* addr = filter_backprop + (in_c + in_channel * (f_w + addr_temp));
// Potentially many threads can add to the same address so we have
// to use atomic add here.
// TODO(jmchen): If atomic add turns out to be slow, we can:
// 1. allocate multiple buffers for the gradients (one for each
// example in a batch, for example). This can reduce the
// contention on the destination; 2. Have each thread compute one
// gradient for an element in the filters. This should work well
// when the input depth is big and filter size is not too small.
atomicAdd(addr, partial_sum);
}
}
}
}
}
}
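
The kernel above distributes work with CUDA_KERNEL_LOOP instead of the manual batch-channel and out-pixel loops of the deleted kernel, which is what removes the need for block-wide __syncthreads(). A sketch of the usual grid-stride definition of this macro, assumed here rather than copied from the MXNet headers:

// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads until all n items are processed.
#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)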

// CUDA kernel to compute the depthwise convolution backward w.r.t. filter in
// NCHW format, tailored for small images up to 32x32. Only use this kernel if
// CanLaunchDepthwiseConv2dGPUSmall(args) returns true.