
[Hackathon No.36] Optimize the GPU performance of the lerp_grad op #45946

Merged (16 commits) on Oct 10, 2022
208 changes: 206 additions & 2 deletions paddle/phi/kernels/gpu/lerp_grad_kernel.cu
@@ -1,4 +1,4 @@
- // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+ // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Contributor:
This file is not newly added by this PR; do not change the copyright year.

Contributor Author:
Done.

//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -15,8 +15,212 @@
#include "paddle/phi/kernels/lerp_grad_kernel.h"

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/lerp_grad_kernel_impl.h"

#include "paddle/phi/kernels/broadcast_tensors_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/gpu/reduce.h"

namespace phi {

template <typename T>
__global__ void LerpGradKernelImpl(const T* weight,
Contributor:
It looks like this computation could be implemented by calling ElementwiseKernel; for the multi-output usage, see:

    phi::funcs::ElementwiseKernel<T, decltype(functor), 2>(
        dev_ctx, ins, &outs, functor);

Contributor Author:
When I optimized another operator earlier, ElementwiseKernel tested slower than a direct implementation, so I will benchmark both here and keep whichever is faster.

Contributor:
It is also OK for this PR to keep the current implementation; please fix the CI failure first. [CI failure screenshot]

Contributor Author:
OK, it has been fixed; let's keep an eye on the CI.

(A hedged sketch of the multi-output ElementwiseKernel approach appears after the two kernels below.)

                                   const T* dout,
                                   T* dx,
                                   T* dy,
                                   const int out_size,
Contributor:
Is the requirement for calling this kernel that x, y, and weight all have the same size? If so, wouldn't passing a single numel parameter be enough?

Contributor Author:
The sizes of x and y may differ here, e.g. x: [2, 3, 4], y: [2, 2, 3, 4]. When they differ only in the leading axes, there is no need to broadcast them to the same shape, but the kernel must distinguish how many elements to write for x and for y.

                                   const int x_size,
                                   const int y_size) {
  CUDA_KERNEL_LOOP_TYPE(idx, out_size, int64_t) {
    T temp_dx = weight[idx] * dout[idx];
    if (idx < x_size) {
      dx[idx] = dout[idx] - temp_dx;
    }
    if (idx < y_size) {
      dy[idx] = temp_dx;
    }
  }
}
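For reference, the kernel body follows directly from the lerp forward definition; a brief derivation (not part of the diff):

    out = x + w * (y - x)
    dL/dx = (1 - w) * dL/dout  =>  dx = dout - w * dout
    dL/dy = w * dL/dout        =>  dy = w * dout

This is why a single temp_dx = weight[idx] * dout[idx] serves both outputs; the idx < x_size and idx < y_size guards cover the case where x or y owns fewer elements than out (broadcast over leading axes only).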

template <typename T>
__global__ void LerpGradScalarKernelImpl(const T* weight,
Contributor:
Same as above.

                                         const T* dout,
                                         T* dx,
                                         T* dy,
                                         const int out_size,
                                         const int x_size,
                                         const int y_size) {
  T weight_scalar = weight[0];
  CUDA_KERNEL_LOOP_TYPE(idx, out_size, int64_t) {
    T temp_dx = weight_scalar * dout[idx];
    if (idx < x_size) {
      dx[idx] = dout[idx] - temp_dx;
    }
    if (idx < y_size) {
      dy[idx] = temp_dx;
    }
  }
}
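For reference, the multi-output ElementwiseKernel route suggested in the review might look roughly like the sketch below, for the same-shape case. The convention of returning phi::Array<T, 2> mirrors other multi-output elementwise functors in phi at the time; treat it as an assumption, not code from this PR.

// Sketch (assumption, not part of this PR): a two-output elementwise functor.
template <typename T>
struct LerpGradFunctor {
  inline HOSTDEVICE phi::Array<T, 2> operator()(const T w, const T dout) const {
    phi::Array<T, 2> outs;
    outs[0] = dout - w * dout;  // dx = (1 - w) * dout
    outs[1] = w * dout;         // dy = w * dout
    return outs;
  }
};

// Assumed usage, following the call shape quoted by the reviewer:
//   std::vector<const DenseTensor*> ins = {&weight, &out_grad};
//   std::vector<DenseTensor*> outs = {x_grad, y_grad};
//   auto functor = LerpGradFunctor<T>();
//   phi::funcs::ElementwiseKernel<T, decltype(functor), 2>(
//       dev_ctx, ins, &outs, functor);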

bool XYNeedReduce(const DenseTensor& x,
                  const DenseTensor& y,
                  const DenseTensor& out) {
  auto x_dims = x.dims();
  auto y_dims = y.dims();
  auto out_dims = out.dims();
  int x_rank = x_dims.size();
  int y_rank = y_dims.size();
  int out_rank = out_dims.size();
  int smaller_rank = std::min(x_rank, y_rank);
  if (std::max(x_rank, y_rank) < out_rank) {
    return true;
  }
  for (int i = 1; i <= smaller_rank; ++i) {
    int x_idx = x_rank - i;
    int y_idx = y_rank - i;
    int out_idx = out_rank - i;
    if (x_dims[x_idx] != y_dims[y_idx]) {
      return true;
    }
    if (x_dims[x_idx] == 1 && y_dims[y_idx] == 1 && out_dims[out_idx] != 1) {
      return true;
    }
  }
  return false;
}
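A few worked shape combinations for XYNeedReduce, derived from the logic above (illustration only, not part of the diff):

// x: [2, 3, 4]  y: [2, 3, 4]     out: [2, 3, 4]     -> false (same shapes)
// x: [2, 3, 4]  y: [2, 2, 3, 4]  out: [2, 2, 3, 4]  -> false (broadcast over
//    leading axes only; handled by the idx < x_size guard in the kernels)
// x: [2, 1, 4]  y: [2, 3, 4]     out: [2, 3, 4]     -> true (a trailing axis
//    differs between x and y, so gradients must be reduced back)
// x: [3, 4]     y: [3, 4]        out: [2, 3, 4]     -> true (both ranks fall
//    below out's rank, so both gradients need reduction)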

template <typename T, typename Context>
void SwitchKernel(const Context& ctx,
                  const DenseTensor& weight,
                  const DenseTensor& out_grad,
                  const int x_grad_size,
                  const int y_grad_size,
                  T* x_grad_data,
                  T* y_grad_data) {
  if (weight.dims().size() == 1) {
Contributor:
Isn't the condition in this if wrong? Shouldn't it be weight.numel() == 1? I checked, and the unit tests have no case where w is 1-D with a shape other than [1].

Contributor Author:
Yes, the bug has been fixed. In the unit tests, TestLerpBroadWToXY was also modified so that w is 1-D with a shape other than [1], testing both broadcasting and the case mentioned here.

    // condition when weight is a scalar
    const T* weight_data = weight.data<T>();
    // const T weight_scalar = weight_data[0];
Contributor:
Unused comments can be deleted.

Contributor Author:
Done.

    const T* out_grad_data = out_grad.data<T>();
    const int out_size = out_grad.numel();
    const int weight_size = weight.numel();
Contributor:
I suggest using the int64_t type here.

Contributor Author:
Done.

    auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, out_size);
    LerpGradScalarKernelImpl<T><<<gpu_config.GetGridSize(),
                                  gpu_config.GetBlockSize(),
                                  0,
                                  ctx.stream()>>>(weight_data,
                                                  out_grad_data,
                                                  x_grad_data,
                                                  y_grad_data,
                                                  out_size,
                                                  x_grad_size,
                                                  y_grad_size);
  } else {
    // broadcast weight with out_grad's dimensions
    const std::vector<const DenseTensor*> in_tensors = {&weight, &out_grad};
    DenseTensor b_weight = phi::EmptyLike<T>(ctx, out_grad);
    DenseTensor b_out = phi::EmptyLike<T>(ctx, out_grad);
Contributor:
As I understand it, out should never need to be broadcast?

Contributor Author:
Right, out does not need broadcasting. This step exists only so that BroadcastTensorsKernel can broadcast weight: since out_grad itself is const, it cannot be passed as an argument to that kernel, hence the extra b_out copy.

    std::vector<DenseTensor*> out_tensors = {&b_weight, &b_out};

    phi::BroadcastTensorsKernel<T, Context>(ctx, in_tensors, out_tensors);
Contributor (@Xreki, Oct 8, 2022):
BroadcastKernel could also be called here; performance should be somewhat better.

Contributor Author:
OK, I will benchmark it.

Contributor:
It is also OK for this PR to keep the current implementation; please fix the CI issues first.

Contributor Author:
Fixed; the CI tests have essentially all passed. One job, Kunlun-kp-build, reports a failure, and when I try to restart the pipeline it says I lack permission; could you please restart it?

Contributor:
That CI job is not required; it failed due to network issues. I have checked, and it can be ignored.

(A hedged sketch of the suggested BroadcastKernel route appears after this function.)


    const T* weight_data = b_weight.data<T>();
    const T* out_grad_data = b_out.data<T>();
    const int out_size = out_grad.numel();
    const int weight_size = weight.numel();

    auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, out_size);
    LerpGradKernelImpl<T><<<gpu_config.GetGridSize(),
                            gpu_config.GetBlockSize(),
                            0,
                            ctx.stream()>>>(weight_data,
                                            out_grad_data,
                                            x_grad_data,
                                            y_grad_data,
                                            out_size,
                                            x_grad_size,
                                            y_grad_size);
  }
}
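For comparison, the BroadcastKernel route that @Xreki suggests might look roughly like the sketch below, broadcasting only weight and skipping the extra b_out copy. The template parameters and the kUnary/IdentityFunctor pattern mirror how expand-like phi kernels invoked BroadcastKernel around this time; treat the exact signature as an assumption, not something verified against this PR.

// Sketch (assumed API, not part of this PR):
//   DenseTensor b_weight = phi::EmptyLike<T>(ctx, out_grad);
//   std::vector<const DenseTensor*> ins = {&weight};
//   std::vector<DenseTensor*> outs = {&b_weight};
//   phi::funcs::BroadcastKernel<phi::ElementwiseType::kUnary, T, T>(
//       ctx, ins, &outs, -1, kps::IdentityFunctor<T>());
// out_grad could then be read directly instead of being copied into b_out.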

template <typename T, typename Context>
void LerpGradKernel(const Context& ctx,
                    const DenseTensor& x,
                    const DenseTensor& y,
                    const DenseTensor& weight,
                    const DenseTensor& out,
                    const DenseTensor& out_grad,
                    DenseTensor* x_grad,
                    DenseTensor* y_grad) {
  const int rank = out.dims().size();
  PADDLE_ENFORCE_GE(
      rank,
      1,
      phi::errors::InvalidArgument(
          "The number of dimensions for LerpGradOp must be "
          "greater than or equal to 1, but the value received is %d.",
          rank));
  PADDLE_ENFORCE_LE(
      rank,
      6,
      phi::errors::InvalidArgument(
          "The number of dimensions for LerpGradOp must be "
          "less than or equal to 6, but the value received is %d.",
          rank));

  // Check whether x_grad and y_grad need to be reduced: if x differs from y
  // or weight in a middle axis, the gradients are first computed at out's
  // shape and must then be reduced back to their own shapes.
  bool reduce_flag = XYNeedReduce(x, y, out);
Contributor:
From the develop implementation:

    if (dx) {
      dx_dims = phi::funcs::ExtendDims2Rank(dx->dims(), D);
      phi::funcs::GetBroadcastDims<D>(dx_dims, dout_dims, &dx_bcast_dims);
    }
    if (dy) {
      dy_dims = phi::funcs::ExtendDims2Rank(dy->dims(), D);
      phi::funcs::GetBroadcastDims<D>(dy_dims, dout_dims, &dy_bcast_dims);
    }
    phi::funcs::GetBroadcastDims<D>(w_dims, dout_dims, &w_bcast_dims);
    auto eigen_w = phi::EigenTensor<T, D>::From(w, w_dims);
    auto eigen_dout = phi::EigenTensor<T, D>::From(dout);

Note that the develop implementation checks whether dx and dy are null. For any Paddle operator with multiple gradient outputs, every gradient variable is allowed to be nullptr, so the corresponding logic still needs to be added here.

Contributor Author:
OK, I will add it and push an update.

(A minimal sketch of the requested nullptr guards appears after this function.)

  if (!reduce_flag) {
    T* x_grad_data = ctx.template Alloc<T>(x_grad);
    T* y_grad_data = ctx.template Alloc<T>(y_grad);
    int x_grad_size = x.numel();
    int y_grad_size = y.numel();

    SwitchKernel<T, Context>(ctx,
                             weight,
                             out_grad,
                             x_grad_size,
                             y_grad_size,
                             x_grad_data,
                             y_grad_data);
  } else {
    DenseTensor b_xgrad = phi::EmptyLike<T, Context>(ctx, out_grad);
    DenseTensor b_ygrad = phi::EmptyLike<T, Context>(ctx, out_grad);
    T* x_grad_data = ctx.template Alloc<T>(&b_xgrad);
    T* y_grad_data = ctx.template Alloc<T>(&b_ygrad);
    int x_grad_size = out.numel();
    int y_grad_size = out.numel();

    SwitchKernel<T, Context>(ctx,
                             weight,
                             out_grad,
                             x_grad_size,
                             y_grad_size,
                             x_grad_data,
                             y_grad_data);

    std::vector<int> reduce_axis_x =
        funcs::GetReduceDim(x_grad->dims(), b_xgrad.dims(), -1);
    std::vector<int> reduce_axis_y =
        funcs::GetReduceDim(y_grad->dims(), b_ygrad.dims(), -1);

    phi::funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        ctx, b_xgrad, x_grad, kps::IdentityFunctor<T>(), reduce_axis_x);
    phi::funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
        ctx, b_ygrad, y_grad, kps::IdentityFunctor<T>(), reduce_axis_y);
  }
}
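Per the review above, either gradient output may be nullptr when only one input's gradient is requested. A minimal sketch of the requested guards (an illustration of the review feedback, not the committed code):

// Sketch: skip allocation, kernel writes, and reduction for absent outputs.
// if (x_grad) {
//   ctx.template Alloc<T>(x_grad);
// }
// if (y_grad) {
//   ctx.template Alloc<T>(y_grad);
// }
// The launch then receives nullptr-aware data pointers, and in the reduce
// branch each GetReduceDim + ReduceKernel pair is wrapped in the same
// if (x_grad) / if (y_grad) guards so nothing is reduced into a null output.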

} // namespace phi

PD_REGISTER_KERNEL(
    lerp_grad, GPU, ALL_LAYOUT, phi::LerpGradKernel, float, double) {}
3 changes: 2 additions & 1 deletion paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h
@@ -106,10 +106,11 @@ void BroadcastTensorsKernel(const Context& ctx,
    SWITCH_OUT_RANK_CASE(3)
    SWITCH_OUT_RANK_CASE(4)
    SWITCH_OUT_RANK_CASE(5)
+   SWITCH_OUT_RANK_CASE(6)
    default: {
      PADDLE_THROW(paddle::platform::errors::InvalidArgument(
          "Target tensor rank out of range"
-         "Maximum supported rank for broadcast is: 5"));
+         "Maximum supported rank for broadcast is: 6"));
    }
  }
}
30 changes: 30 additions & 0 deletions python/paddle/fluid/tests/unittests/test_lerp_op.py
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest
@@ -78,6 +80,34 @@ def init_shape(self):
        self.shape = [2, 1, 2, 5, 1, 5]


class TestLerpBroadXY(TestLerp):

    def setUp(self):
        self.op_type = "lerp"
        self.python_api = paddle.lerp
        self.init_dtype()
        self.init_shape()
        x = np.arange(1., 201.).astype(self.dtype).reshape([2, 1, 2, 50])
        y = np.full(200, 10.).astype(self.dtype).reshape([2, 2, 1, 50])
        w = np.asarray([0.5]).astype(self.dtype)
        self.inputs = {'X': x, 'Y': y, 'Weight': w}
        self.outputs = {'Out': x + w * (y - x)}


class TestLerpBroadWToXY(TestLerp):

    def setUp(self):
        self.op_type = "lerp"
        self.python_api = paddle.lerp
        self.init_dtype()
        self.init_shape()
        x = np.full(600, 2.5).astype(self.dtype).reshape([3, 2, 2, 50])
        y = np.full(600, 1.).astype(self.dtype).reshape([3, 2, 2, 50])
        w = np.full(200, 0.5).astype(self.dtype).reshape([2, 2, 50])
        self.inputs = {'X': x, 'Y': y, 'Weight': w}
        self.outputs = {'Out': x + w * (y - x)}


class TestLerpAPI(unittest.TestCase):

    def init_dtype(self):