
Commit c73144b

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dev/unify_gpu_context3

zhiqiu committed Jul 29, 2022
2 parents b20da7a + 8849056
Showing 77 changed files with 6,313 additions and 1,475 deletions.
2 changes: 2 additions & 0 deletions paddle/fluid/framework/distributed_strategy.proto
@@ -27,6 +27,7 @@ message RecomputeConfig {
repeated string checkpoints = 1;
optional bool enable_offload = 2 [ default = false ];
repeated int32 checkpoint_shape = 3;
+optional bool enable_tuning = 4 [ default = false ]; // incubate for auto parallel
}

message ShardingConfig {
@@ -46,6 +47,7 @@ message ShardingConfig {
// Optimizer sharding. Temporary plans and may be deprecated
optional bool _dp_as_optimizer_sharding = 13 [ default = false ];
optional int32 stage = 14 [ default = 1 ];
+optional bool enable_tuning = 15 [ default = false ]; // incubate for auto parallel
}

message HybridConfig {
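
Note: both additions follow the same pattern, an optional bool defaulting to false, so previously serialized strategy configs keep parsing unchanged. For an optional scalar field, protoc generates getter/setter accessors; a minimal C++ usage sketch (the generated class and namespace names are assumed, not shown in this diff):

    // Sketch: opting a parsed RecomputeConfig into auto-parallel tuning.
    // Assumes the classes protoc generates from distributed_strategy.proto.
    paddle::distributed::RecomputeConfig recompute;
    recompute.set_enable_tuning(true);   // new field, defaults to false
    if (recompute.enable_tuning()) {
      // hand the config to the auto-parallel tuner here
    }
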
12 changes: 7 additions & 5 deletions paddle/fluid/inference/tensorrt/op_teller.cc
@@ -2091,12 +2091,14 @@ bool OpTeller::Tell(const framework::ir::Node* node,
VLOG(3) << "unsupport data type conversion";
return false;
}
-if (!((in_dtype == 5 || in_dtype == 4 || in_dtype == 2 ||
-       in_dtype == 0) &&
+if (in_dtype == 0) {
+  VLOG(3) << "do not support input data type as bool now";
+  return false;
+}
+if (!((in_dtype == 5 || in_dtype == 4 || in_dtype == 2) &&
(out_dtype == 5 || out_dtype == 4 || out_dtype == 2))) {
-VLOG(3)
-    << "only valid conversions are: "
-       "(kFLOAT | kHALF | kINT32 | kBOOL) -> (kFLOAT | kHALF | kINT32)";
+VLOG(3) << "only valid conversions are: "
+           "(kFLOAT | kHALF | kINT32) -> (kFLOAT | kHALF | kINT32)";
return false;
}
}
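
For context, the integer codes compared above are Paddle's framework::proto::VarType values: 0 = BOOL, 2 = INT32, 4 = FP16, 5 = FP32. The change splits the old combined condition so a bool input is rejected with its own message before the general whitelist check. A self-contained sketch of the tightened guard (the helper name is hypothetical):

    // Mirrors the teller logic: bool input is rejected outright; otherwise
    // both sides of the cast must be FP32, FP16, or INT32.
    bool CastDtypesTrtSupported(int in_dtype, int out_dtype) {
      if (in_dtype == 0) return false;  // 0 == VarType::BOOL
      auto supported = [](int dtype) {
        return dtype == 5 || dtype == 4 || dtype == 2;  // FP32 | FP16 | INT32
      };
      return supported(in_dtype) && supported(out_dtype);
    }
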
18 changes: 8 additions & 10 deletions paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -13,8 +13,10 @@ limitations under the License. */

#include <glog/logging.h>

#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/nms_util.h"
#include "paddle/phi/infermeta/ternary.h"

namespace paddle {
namespace operators {
@@ -609,12 +611,6 @@ class MultiClassNMS3Op : public MultiClassNMS2Op {
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: MultiClassNMS2Op(type, inputs, outputs, attrs) {}

-void InferShape(framework::InferShapeContext* ctx) const override {
-  MultiClassNMS2Op::InferShape(ctx);
-
-  ctx->SetOutputDim("NmsRoisNum", {-1});
-}
};

class MultiClassNMS3OpMaker : public MultiClassNMS2OpMaker {
@@ -633,6 +629,10 @@ class MultiClassNMS3OpMaker : public MultiClassNMS2OpMaker {
} // namespace operators
} // namespace paddle

+DECLARE_INFER_SHAPE_FUNCTOR(multiclass_nms3,
+                            MultiClassNMSShapeFunctor,
+                            PD_INFER_META(phi::MultiClassNMSInferMeta));

namespace ops = paddle::operators;
REGISTER_OPERATOR(
multiclass_nms,
@@ -658,7 +658,5 @@ REGISTER_OPERATOR(
ops::MultiClassNMS3Op,
ops::MultiClassNMS3OpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(multiclass_nms3,
-                       ops::MultiClassNMSKernel<float>,
-                       ops::MultiClassNMSKernel<double>);
+paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+MultiClassNMSShapeFunctor);
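
The hand-written InferShape override (which only pinned NmsRoisNum to {-1}) and the fluid REGISTER_OP_CPU_KERNEL registration are both retired here: shape inference moves to phi::MultiClassNMSInferMeta via DECLARE_INFER_SHAPE_FUNCTOR, and the functor is appended to REGISTER_OPERATOR. The general migration pattern, sketched with placeholder names (my_op / MyOp* are hypothetical):

    // Bind an operator's shape inference to a phi InferMeta function.
    DECLARE_INFER_SHAPE_FUNCTOR(my_op,
                                MyOpShapeFunctor,
                                PD_INFER_META(phi::MyOpInferMeta));
    REGISTER_OPERATOR(my_op,
                      ops::MyOp,
                      ops::MyOpMaker,
                      paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
                      paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
                      MyOpShapeFunctor);  // replaces a C++ InferShape override
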
121 changes: 71 additions & 50 deletions paddle/fluid/operators/fused/resnet_unit_op_xpu.cc
@@ -23,6 +23,8 @@ using Tensor = framework::Tensor;

template <typename T>
class ResNetUnitXPUKernel : public framework::OpKernel<T> {
+using XPUType = typename XPUTypeTrait<T>::Type;
+
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto place = ctx.GetPlace();
@@ -63,9 +65,12 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {
std::string act_type = ctx.Attr<std::string>("act_type");
auto &dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();

-std::vector<const T *> x_list = {input_x->data<T>()};
-std::vector<const T *> w_list = {filter_x->data<T>()};
-std::vector<T *> conv_y_list = {conv_out_x->mutable_data<T>(place)};
+std::vector<const XPUType *> x_list = {
+    reinterpret_cast<const XPUType *>(input_x->data<T>())};
+std::vector<const XPUType *> w_list = {
+    reinterpret_cast<const XPUType *>(filter_x->data<T>())};
+std::vector<XPUType *> conv_y_list = {
+    reinterpret_cast<XPUType *>(conv_out_x->mutable_data<T>(place))};

std::vector<std::vector<int>> x_shape_list = {
phi::vectorize<int>(input_x->dims())};
@@ -107,9 +112,10 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {
Tensor *running_mean_z = ctx.Output<Tensor>("RunningMeanZ");
Tensor *running_var_z = ctx.Output<Tensor>("RunningVarZ");

-x_list.push_back(input_z->data<T>());
-w_list.push_back(filter_z->data<T>());
-conv_y_list.push_back(conv_out_z->mutable_data<T>(place));
+x_list.push_back(reinterpret_cast<const XPUType *>(input_z->data<T>()));
+w_list.push_back(reinterpret_cast<const XPUType *>(filter_z->data<T>()));
+conv_y_list.push_back(
+    reinterpret_cast<XPUType *>(conv_out_z->mutable_data<T>(place)));

x_shape_list.push_back(phi::vectorize<int>(input_z->dims()));

@@ -133,17 +139,17 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {
if (fuse_add) {
const Tensor *input_z = ctx.Input<Tensor>("Z");
auto input_z_shape = phi::vectorize<int>(input_z->dims());
-x_list.push_back(input_z->data<T>());
+x_list.push_back(reinterpret_cast<const XPUType *>(input_z->data<T>()));
x_shape_list.push_back(input_z_shape);
x_maxlist.push_back(nullptr);
}
}
-int r = xpu::resnet_unit_fusion<T, T, T, int16_t>(
+int r = xpu::resnet_unit_fusion<XPUType, XPUType, XPUType, int16_t>(
dev_ctx.x_context(),
x_list,
w_list,
conv_y_list,
-output->mutable_data<T>(place),
+reinterpret_cast<XPUType *>(output->mutable_data<T>(place)),
x_shape_list,
filter_x_shape[0],
ksize_list,
@@ -172,6 +178,8 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {

template <typename T>
class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
+using XPUType = typename XPUTypeTrait<T>::Type;
+
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto place = ctx.GetPlace();
@@ -208,11 +216,16 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {

auto &dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();

-std::vector<const T *> x_list = {x->data<T>()};
-std::vector<const T *> w_list = {filter_x->data<T>()};
-std::vector<const T *> conv_y_list = {conv_out_x->data<T>()};
-std::vector<T *> dx_list = {x_grad->mutable_data<T>(place)};
-std::vector<T *> dw_list = {filter_x_grad->mutable_data<T>(place)};
+std::vector<const XPUType *> x_list = {
+    reinterpret_cast<const XPUType *>(x->data<T>())};
+std::vector<const XPUType *> w_list = {
+    reinterpret_cast<const XPUType *>(filter_x->data<T>())};
+std::vector<const XPUType *> conv_y_list = {
+    reinterpret_cast<const XPUType *>(conv_out_x->data<T>())};
+std::vector<XPUType *> dx_list = {
+    reinterpret_cast<XPUType *>(x_grad->mutable_data<T>(place))};
+std::vector<XPUType *> dw_list = {
+    reinterpret_cast<XPUType *>(filter_x_grad->mutable_data<T>(place))};

std::vector<std::vector<int>> x_shape_list = {
phi::vectorize<int>(x->dims())};
@@ -262,11 +275,14 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
Tensor *scale_z_grad =
ctx.Output<Tensor>(framework::GradVarName("ScaleZ"));
Tensor *bias_z_grad = ctx.Output<Tensor>(framework::GradVarName("BiasZ"));
-x_list.push_back(z->data<T>());
-w_list.push_back(filter_z->data<T>());
-conv_y_list.push_back(conv_out_z->data<T>());
-dx_list.push_back(z_grad->mutable_data<T>(place));
-dw_list.push_back(filter_z_grad->mutable_data<T>(place));
+x_list.push_back(reinterpret_cast<const XPUType *>(z->data<T>()));
+w_list.push_back(reinterpret_cast<const XPUType *>(filter_z->data<T>()));
+conv_y_list.push_back(
+    reinterpret_cast<const XPUType *>(conv_out_z->data<T>()));
+dx_list.push_back(
+    reinterpret_cast<XPUType *>(z_grad->mutable_data<T>(place)));
+dw_list.push_back(
+    reinterpret_cast<XPUType *>(filter_z_grad->mutable_data<T>(place)));
x_shape_list.push_back(phi::vectorize<int>(z->dims()));

auto filter_z_shape = phi::vectorize<int>(filter_z->dims());
@@ -288,38 +304,39 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
} else {
if (fuse_add) {
auto z_grad = ctx.Output<Tensor>(framework::GradVarName("Z"));
-dx_list.push_back(z_grad->mutable_data<T>(place));
+dx_list.push_back(
+    reinterpret_cast<XPUType *>(z_grad->mutable_data<T>(place)));
}
}

-int r =
-    xpu::resnet_unit_grad_fusion<T, T, T, int16_t>(dev_ctx.x_context(),
-                                                   x_list,
-                                                   w_list,
-                                                   y_grad->data<T>(),
-                                                   output->data<T>(),
-                                                   conv_y_list,
-                                                   dx_list,
-                                                   dw_list,
-                                                   x_shape_list,
-                                                   filter_x_shape[0],
-                                                   ksize_list,
-                                                   stride_list,
-                                                   paddings,
-                                                   dilations,
-                                                   group,
-                                                   x_maxlist,
-                                                   w_maxlist,
-                                                   scale_list,
-                                                   batch_mean_list,
-                                                   batch_invstd_list,
-                                                   dscale_list,
-                                                   dbias_list,
-                                                   xpu::Activation_t::RELU,
-                                                   eps,
-                                                   is_nchw,
-                                                   has_shortcut,
-                                                   fuse_add);
+int r = xpu::resnet_unit_grad_fusion<XPUType, XPUType, XPUType, int16_t>(
+    dev_ctx.x_context(),
+    x_list,
+    w_list,
+    reinterpret_cast<const XPUType *>(y_grad->data<T>()),
+    reinterpret_cast<const XPUType *>(output->data<T>()),
+    conv_y_list,
+    dx_list,
+    dw_list,
+    x_shape_list,
+    filter_x_shape[0],
+    ksize_list,
+    stride_list,
+    paddings,
+    dilations,
+    group,
+    x_maxlist,
+    w_maxlist,
+    scale_list,
+    batch_mean_list,
+    batch_invstd_list,
+    dscale_list,
+    dbias_list,
+    xpu::Activation_t::RELU,
+    eps,
+    is_nchw,
+    has_shortcut,
+    fuse_add);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "resnet_unit_grad_fusion");
}
};
@@ -329,5 +346,9 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {

namespace ops = paddle::operators;
namespace plat = paddle::platform;
-REGISTER_OP_XPU_KERNEL(resnet_unit, ops::ResNetUnitXPUKernel<float>);
-REGISTER_OP_XPU_KERNEL(resnet_unit_grad, ops::ResNetUnitGradXPUKernel<float>);
+REGISTER_OP_XPU_KERNEL(resnet_unit,
+                       ops::ResNetUnitXPUKernel<plat::float16>,
+                       ops::ResNetUnitXPUKernel<float>);
+REGISTER_OP_XPU_KERNEL(resnet_unit_grad,
+                       ops::ResNetUnitGradXPUKernel<plat::float16>,
+                       ops::ResNetUnitGradXPUKernel<float>);
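
The recurring edit in this file is one pattern: the kernels gain a device-side element type via XPUTypeTrait, which maps platform::float16 to the XPU runtime's half type (and leaves float as float), and every tensor pointer is reinterpret_cast to that type before the xpu::resnet_unit_fusion / resnet_unit_grad_fusion calls. A condensed sketch of the pattern (the helper struct is hypothetical; it assumes Paddle's XPUTypeTrait):

    // Collect device-typed pointers for XPU fusion calls.
    template <typename T>
    struct DevicePtr {
      using XPUType = typename XPUTypeTrait<T>::Type;
      static const XPUType *In(const framework::Tensor &t) {
        return reinterpret_cast<const XPUType *>(t.data<T>());
      }
      static XPUType *Out(framework::Tensor *t, const platform::Place &place) {
        return reinterpret_cast<XPUType *>(t->mutable_data<T>(place));
      }
    };

With the kernels templated this way, registering a plat::float16 instantiation alongside float is what enables the new fp16 path at the bottom of the file.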
(The remaining changed files are not shown here.)