From 29e7c6047f6ea1f2620572cfffc36cf840556f54 Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Wed, 20 Jul 2022 02:39:45 +0000 Subject: [PATCH 01/12] move average_accumulates op to phi kernel --- .../phi/kernels/average_accumulates_kernel.h | 57 ++++++++ .../kernels/cpu/average_accumulates_kernel.cc | 138 ++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 paddle/phi/kernels/average_accumulates_kernel.h create mode 100644 paddle/phi/kernels/cpu/average_accumulates_kernel.cc diff --git a/paddle/phi/kernels/average_accumulates_kernel.h b/paddle/phi/kernels/average_accumulates_kernel.h new file mode 100644 index 00000000000000..7ecbfa8405d6e3 --- /dev/null +++ b/paddle/phi/kernels/average_accumulates_kernel.h @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void GetAccumulators(const Context& dev_ctx, + const DenseTensor& in_num_accumulates, + const DenseTensor& in_old_num_accumulates, + const DenseTensor& in_num_updates, + int64_t* num_updates, + int64_t* num_accumulates, + int64_t* old_num_accumulates); + +template +void SetAccumulators(const Context& ctx, + int64_t num_updates, + int64_t num_accumulates, + int64_t old_num_accumulates, + DenseTensor* out_num_accumulates, + DenseTensor* out_old_num_accumulates, + DenseTensor* out_num_updates); + + +template +void AverageAccumulatesKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& in_sum_1, + const DenseTensor& in_sum_2, + const DenseTensor& in_sum_3, + const DenseTensor& in_num_accumulates, + const DenseTensor& in_old_num_accumulates, + const DenseTensor& in_num_updates, + float average_window, + int64_t max_average_window, + int64_t min_average_window, + DenseTensor* out_sum_1, + DenseTensor* out_sum_2, + DenseTensor* out_sum_3, + DenseTensor* out_num_accumulates, + DenseTensor* out_old_num_accumulates, + DenseTensor* out_num_updates + ); +} \ No newline at end of file diff --git a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc new file mode 100644 index 00000000000000..a9786391bae7f1 --- /dev/null +++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc @@ -0,0 +1,138 @@ +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/average_accumulates_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi{ + +template <> +void GetAccumulators(const CPUContext& dev_ctx, + const DenseTensor& in_num_accumulates, + const DenseTensor& in_old_num_accumulates, + const DenseTensor& in_num_updates, + int64_t* num_updates, + int64_t* num_accumulates, + int64_t* old_num_accumulates) { + *old_num_accumulates = in_old_num_accumulates.data()[0]; + *num_accumulates = in_num_accumulates.data()[0]; + *num_updates = 
in_num_updates.data()[0]; +} + +template <> +void SetAccumulators(const CPUContext& dev_ctx, + int64_t num_updates, + int64_t num_accumulates, + int64_t old_num_accumulates, + DenseTensor* out_num_accumulates, + DenseTensor* out_old_num_accumulates, + DenseTensor* out_num_updates) { + out_old_num_accumulates->data()[0] = old_num_accumulates; + out_num_accumulates->data()[0] = num_accumulates; + out_num_updates->data()[0] = num_updates; +} + +template +void AverageAccumulatesKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& in_sum_1, + const DenseTensor& in_sum_2, + const DenseTensor& in_sum_3, + const DenseTensor& in_num_accumulates, + const DenseTensor& in_old_num_accumulates, + const DenseTensor& in_num_updates, + float average_window, + int64_t max_average_window, + int64_t min_average_window, + DenseTensor* out_sum_1, + DenseTensor* out_sum_2, + DenseTensor* out_sum_3, + DenseTensor* out_num_accumulates, + DenseTensor* out_old_num_accumulates, + DenseTensor* out_num_updates){ + // It is used to avoid loss of precision + static const int64_t kMaxNumAccumulates = 16384; + // Get accumulators from input + int64_t num_updates = 0; + int64_t num_accumulates = 0; + int64_t old_num_accumulates = 0; + GetAccumulators( + dev_ctx, in_num_accumulates, in_old_num_accumulates, in_num_updates, &num_updates, &num_accumulates, &old_num_accumulates); + + // Get attrs + // float average_window = ctx.Attr("average_window"); + // int64_t max_average_window = ctx.Attr("max_average_window"); + // int64_t min_average_window = ctx.Attr("min_average_window"); + PADDLE_ENFORCE_LE( + min_average_window, + max_average_window, + errors::InvalidArgument( + "The min_average_window > " + "max_average_window is not right, min_average_window is %ld, " + "max_average_window is %ld.", + min_average_window, + max_average_window)); + + // Get inputs + //auto* param = ctx.Input("param"); + //auto* in_sum_1 = ctx.Input("in_sum_1"); + //auto* in_sum_2 = ctx.Input("in_sum_2"); + //auto* in_sum_3 = ctx.Input("in_sum_3"); + auto param_tensor = EigenVector::Flatten(param); + auto in_sum_1_tensor = EigenVector::Flatten(in_sum_1); + auto in_sum_2_tensor = EigenVector::Flatten(in_sum_2); + auto in_sum_3_tensor = EigenVector::Flatten(in_sum_3); + + // Get outputs + //auto* out_sum_1 = ctx.Output("out_sum_1"); + //auto* out_sum_2 = ctx.Output("out_sum_2"); + //auto* out_sum_3 = ctx.Output("out_sum_3"); + auto out_sum_1_tensor = EigenVector::Flatten(*out_sum_1); + auto out_sum_2_tensor = EigenVector::Flatten(*out_sum_2); + auto out_sum_3_tensor = EigenVector::Flatten(*out_sum_3); + + // Compute + //auto& place = *ctx.template device_context().eigen_device(); + + auto& place = *dev_ctx.eigen_device(); + + funcs::SetConstant constant_functor; + ++num_updates; + ++num_accumulates; + out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor; + out_sum_2_tensor.device(place) = in_sum_2_tensor; + out_sum_3_tensor.device(place) = in_sum_3_tensor; + if (num_updates % kMaxNumAccumulates == 0) { + // Move the sum to a different buffer to avoid loss of precision due to + // too many sums. + out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; + constant_functor( + dev_ctx, out_sum_1, 0.0); + } + if (num_accumulates >= min_average_window && + num_accumulates >= std::min(max_average_window, + num_updates * average_window)) { + // Now the average window is too long, discard the old sum. 
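+    // Hedged numeric illustration (not in the original source): with
+    // average_window = 0.5 and min_average_window = 20, this reset fires
+    // once num_accumulates reaches min(max_average_window, num_updates * 0.5),
+    // so the retained window never exceeds roughly half the update history.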
+ out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; + constant_functor( + dev_ctx, out_sum_1, 0.0); + constant_functor( + dev_ctx, out_sum_2, 0.0); + old_num_accumulates = num_accumulates; + num_accumulates = 0; + } + + // Set accumulators to output + SetAccumulators( + dev_ctx, num_updates, num_accumulates, old_num_accumulates, out_num_accumulates, out_old_num_accumulates, out_num_updates); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + average_accumulates, + CPU, + ALL_LAYOUT, + phi::AverageAccumulatesKernel, + float, + double){} \ No newline at end of file From b60fc04431faaeac693ac70074fda5619c28b8ac Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Wed, 20 Jul 2022 07:14:31 +0000 Subject: [PATCH 02/12] move infer meta --- .../fluid/operators/average_accumulates_op.cc | 7 ++ paddle/phi/infermeta/multiary.cc | 25 +++++ paddle/phi/infermeta/multiary.h | 17 +++ .../kernels/cpu/average_accumulates_kernel.cc | 102 +----------------- .../kernels/gpu/average_accumulates_kernel.cu | 80 ++++++++++++++ .../impl/average_accumulates_kernel_impl.h | 102 ++++++++++++++++++ 6 files changed, 234 insertions(+), 99 deletions(-) create mode 100644 paddle/phi/kernels/gpu/average_accumulates_kernel.cu create mode 100644 paddle/phi/kernels/impl/average_accumulates_kernel_impl.h diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index 856a703fd2b068..b9ff61cf40e7db 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/average_accumulates_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -208,7 +210,12 @@ And for a mini-batch in training, accumulators were computed as below steps: } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(average_accumulates, + AverageAccumulatesInferShapeFunctor, + PD_INFER_META(phi::AverageAccumulatesMeta)); + namespace ops = paddle::operators; + REGISTER_OPERATOR( average_accumulates, ops::AverageAccumulatesOp, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 3369b0c392ec33..e056695ea992b7 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -434,6 +434,31 @@ void AucInferMeta(const MetaTensor& input, } } +void AverageAccumulatesMeta(const MetaTensor& param, + const MetaTensor& in_sum_1, + const MetaTensor& in_sum_2, + const MetaTensor& in_sum_3, + const MetaTensor& in_num_accumulates, + const MetaTensor& in_old_num_accumulates, + const MetaTensor& in_num_updates, + float average_window, + int64_t max_average_window, + int64_t min_average_window, + MetaTensor* out_sum_1, + MetaTensor* out_sum_2, + MetaTensor* out_sum_3, + MetaTensor* out_num_accumulates, + MetaTensor* out_old_num_accumulates, + MetaTensor* out_num_updates){ + auto in_dim = param.dims(); + out_sum_1->set_dims(in_dim); + out_sum_2->set_dims(in_dim); + out_sum_3->set_dims(in_dim); + out_num_accumulates->set_dims({1}); + out_old_num_accumulates->set_dims({1}); + out_num_updates->set_dims({1}); +} + void BatchNormInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 0ec71e86893c3c..24f3d7898dfd55 100644 --- 
a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -134,6 +134,23 @@ void AucInferMeta(const MetaTensor& input, MetaTensor* stat_neg_out, MetaConfig config = MetaConfig()); +void AverageAccumulatesMeta(const MetaTensor& param, + const MetaTensor& in_sum_1, + const MetaTensor& in_sum_2, + const MetaTensor& in_sum_3, + const MetaTensor& in_num_accumulates, + const MetaTensor& in_old_num_accumulates, + const MetaTensor& in_num_updates, + float average_window, + int64_t max_average_window, + int64_t min_average_window, + MetaTensor* out_sum_1, + MetaTensor* out_sum_2, + MetaTensor* out_sum_3, + MetaTensor* out_num_accumulates, + MetaTensor* out_old_num_accumulates, + MetaTensor* out_num_updates); + void BatchNormInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc index a9786391bae7f1..54bb124d0732a2 100644 --- a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc +++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc @@ -1,13 +1,12 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/average_accumulates_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h" namespace phi{ template <> -void GetAccumulators(const CPUContext& dev_ctx, +void GetAccumulators(const phi::CPUContext& dev_ctx, const DenseTensor& in_num_accumulates, const DenseTensor& in_old_num_accumulates, const DenseTensor& in_num_updates, @@ -20,7 +19,7 @@ void GetAccumulators(const CPUContext& dev_ctx, } template <> -void SetAccumulators(const CPUContext& dev_ctx, +void SetAccumulators(const phi::CPUContext& dev_ctx, int64_t num_updates, int64_t num_accumulates, int64_t old_num_accumulates, @@ -32,101 +31,6 @@ void SetAccumulators(const CPUContext& dev_ctx, out_num_updates->data()[0] = num_updates; } -template -void AverageAccumulatesKernel(const Context& dev_ctx, - const DenseTensor& param, - const DenseTensor& in_sum_1, - const DenseTensor& in_sum_2, - const DenseTensor& in_sum_3, - const DenseTensor& in_num_accumulates, - const DenseTensor& in_old_num_accumulates, - const DenseTensor& in_num_updates, - float average_window, - int64_t max_average_window, - int64_t min_average_window, - DenseTensor* out_sum_1, - DenseTensor* out_sum_2, - DenseTensor* out_sum_3, - DenseTensor* out_num_accumulates, - DenseTensor* out_old_num_accumulates, - DenseTensor* out_num_updates){ - // It is used to avoid loss of precision - static const int64_t kMaxNumAccumulates = 16384; - // Get accumulators from input - int64_t num_updates = 0; - int64_t num_accumulates = 0; - int64_t old_num_accumulates = 0; - GetAccumulators( - dev_ctx, in_num_accumulates, in_old_num_accumulates, in_num_updates, &num_updates, &num_accumulates, &old_num_accumulates); - - // Get attrs - // float average_window = ctx.Attr("average_window"); - // int64_t max_average_window = ctx.Attr("max_average_window"); - // int64_t min_average_window = ctx.Attr("min_average_window"); - PADDLE_ENFORCE_LE( - min_average_window, - max_average_window, - errors::InvalidArgument( - "The min_average_window > " - "max_average_window is not right, min_average_window is %ld, " - "max_average_window is %ld.", - min_average_window, - max_average_window)); - - // Get inputs - //auto* param = ctx.Input("param"); - //auto* 
in_sum_1 = ctx.Input("in_sum_1"); - //auto* in_sum_2 = ctx.Input("in_sum_2"); - //auto* in_sum_3 = ctx.Input("in_sum_3"); - auto param_tensor = EigenVector::Flatten(param); - auto in_sum_1_tensor = EigenVector::Flatten(in_sum_1); - auto in_sum_2_tensor = EigenVector::Flatten(in_sum_2); - auto in_sum_3_tensor = EigenVector::Flatten(in_sum_3); - - // Get outputs - //auto* out_sum_1 = ctx.Output("out_sum_1"); - //auto* out_sum_2 = ctx.Output("out_sum_2"); - //auto* out_sum_3 = ctx.Output("out_sum_3"); - auto out_sum_1_tensor = EigenVector::Flatten(*out_sum_1); - auto out_sum_2_tensor = EigenVector::Flatten(*out_sum_2); - auto out_sum_3_tensor = EigenVector::Flatten(*out_sum_3); - - // Compute - //auto& place = *ctx.template device_context().eigen_device(); - - auto& place = *dev_ctx.eigen_device(); - - funcs::SetConstant constant_functor; - ++num_updates; - ++num_accumulates; - out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor; - out_sum_2_tensor.device(place) = in_sum_2_tensor; - out_sum_3_tensor.device(place) = in_sum_3_tensor; - if (num_updates % kMaxNumAccumulates == 0) { - // Move the sum to a different buffer to avoid loss of precision due to - // too many sums. - out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; - constant_functor( - dev_ctx, out_sum_1, 0.0); - } - if (num_accumulates >= min_average_window && - num_accumulates >= std::min(max_average_window, - num_updates * average_window)) { - // Now the average window is too long, discard the old sum. - out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; - constant_functor( - dev_ctx, out_sum_1, 0.0); - constant_functor( - dev_ctx, out_sum_2, 0.0); - old_num_accumulates = num_accumulates; - num_accumulates = 0; - } - - // Set accumulators to output - SetAccumulators( - dev_ctx, num_updates, num_accumulates, old_num_accumulates, out_num_accumulates, out_old_num_accumulates, out_num_updates); -} - } // namespace phi PD_REGISTER_KERNEL( diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu new file mode 100644 index 00000000000000..7142c8318c189d --- /dev/null +++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu @@ -0,0 +1,80 @@ +#include "paddle/phi/backends/gpu/gpu_context.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/average_accumulates_kernel.h" +#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h" + +namespace phi { + +template<> +void GetAccumulators( + const phi::GPUContext& dev_ctx, + const DenseTensor& in_num_accumulates, + const DenseTensor& in_old_num_accumulates, + const DenseTensor& in_num_updates, + int64_t* num_updates, + int64_t* num_accumulates, + int64_t* old_num_accumulates) { + auto stream = dev_ctx.stream(); + auto cuda_place = in_old_num_accumulates.place(); + paddle::memory::Copy(paddle::platform::CPUPlace(), + old_num_accumulates, + cuda_place, + in_old_num_accumulates.data(), + sizeof(int64_t), + stream); + paddle::memory::Copy(paddle::platform::CPUPlace(), + num_accumulates, + cuda_place, + in_num_accumulates.data(), + sizeof(int64_t), + stream); + paddle::memory::Copy(paddle::platform::CPUPlace(), + num_updates, + cuda_place, + in_num_updates.data(), + sizeof(int64_t), + stream); +} + +template <> +void SetAccumulators( + const phi::GPUContext& dev_ctx, + int64_t num_updates, + int64_t num_accumulates, + int64_t old_num_accumulates, + DenseTensor* out_num_accumulates, + DenseTensor* out_old_num_accumulates, + DenseTensor* out_num_updates) { 
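+  // Note: the counters are 1-element tensors resident in device memory, so
+  // host-side values must travel through explicit copies on the context
+  // stream below; dereferencing the device pointers directly from host code
+  // would be invalid.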
+ auto stream = dev_ctx.stream(); + + auto cuda_place = out_old_num_accumulates->place(); + + paddle::memory::Copy(cuda_place, + out_old_num_accumulates->data(), + paddle::platform::CPUPlace(), + &old_num_accumulates, + sizeof(int64_t), + stream); + paddle::memory::Copy(cuda_place, + out_num_accumulates->data(), + paddle::platform::CPUPlace(), + &num_accumulates, + sizeof(int64_t), + stream); + paddle::memory::Copy(cuda_place, + out_num_updates->data(), + paddle::platform::CPUPlace(), + &num_updates, + sizeof(int64_t), + stream); +} + +} // namespace phi + +PD_REGISTER_KERNEL(average_accumulates, + GPU, + ALL_LAYOUT, + phi::AverageAccumulatesKernel, + float, + double){} \ No newline at end of file diff --git a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h new file mode 100644 index 00000000000000..20f33c7033b1af --- /dev/null +++ b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h @@ -0,0 +1,102 @@ +#pragma once +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/average_accumulates_kernel.h" + +namespace phi{ + +template +void AverageAccumulatesKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& in_sum_1, + const DenseTensor& in_sum_2, + const DenseTensor& in_sum_3, + const DenseTensor& in_num_accumulates, + const DenseTensor& in_old_num_accumulates, + const DenseTensor& in_num_updates, + float average_window, + int64_t max_average_window, + int64_t min_average_window, + DenseTensor* out_sum_1, + DenseTensor* out_sum_2, + DenseTensor* out_sum_3, + DenseTensor* out_num_accumulates, + DenseTensor* out_old_num_accumulates, + DenseTensor* out_num_updates){ + // It is used to avoid loss of precision + static const int64_t kMaxNumAccumulates = 16384; + // Get accumulators from input + int64_t num_updates = 0; + int64_t num_accumulates = 0; + int64_t old_num_accumulates = 0; + GetAccumulators( + dev_ctx, in_num_accumulates, in_old_num_accumulates, in_num_updates, &num_updates, &num_accumulates, &old_num_accumulates); + + // Get attrs + // float average_window = ctx.Attr("average_window"); + // int64_t max_average_window = ctx.Attr("max_average_window"); + // int64_t min_average_window = ctx.Attr("min_average_window"); + PADDLE_ENFORCE_LE( + min_average_window, + max_average_window, + errors::InvalidArgument( + "The min_average_window > " + "max_average_window is not right, min_average_window is %ld, " + "max_average_window is %ld.", + min_average_window, + max_average_window)); + + // Get inputs + //auto* param = ctx.Input("param"); + //auto* in_sum_1 = ctx.Input("in_sum_1"); + //auto* in_sum_2 = ctx.Input("in_sum_2"); + //auto* in_sum_3 = ctx.Input("in_sum_3"); + auto param_tensor = EigenVector::Flatten(param); + auto in_sum_1_tensor = EigenVector::Flatten(in_sum_1); + auto in_sum_2_tensor = EigenVector::Flatten(in_sum_2); + auto in_sum_3_tensor = EigenVector::Flatten(in_sum_3); + + // Get outputs + //auto* out_sum_1 = ctx.Output("out_sum_1"); + //auto* out_sum_2 = ctx.Output("out_sum_2"); + //auto* out_sum_3 = ctx.Output("out_sum_3"); + auto out_sum_1_tensor = EigenVector::Flatten(*out_sum_1); + auto out_sum_2_tensor = EigenVector::Flatten(*out_sum_2); + auto out_sum_3_tensor = EigenVector::Flatten(*out_sum_3); + + // Compute + //auto& place = *ctx.template device_context().eigen_device(); + + auto& place = *dev_ctx.eigen_device(); + + funcs::SetConstant constant_functor; + ++num_updates; + 
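+  // Counter semantics: num_updates tracks every optimizer step since
+  // training began, while num_accumulates (incremented next) tracks only
+  // the steps inside the current averaging window.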
++num_accumulates; + out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor; + out_sum_2_tensor.device(place) = in_sum_2_tensor; + out_sum_3_tensor.device(place) = in_sum_3_tensor; + if (num_updates % kMaxNumAccumulates == 0) { + // Move the sum to a different buffer to avoid loss of precision due to + // too many sums. + out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; + constant_functor( + dev_ctx, out_sum_1, 0.0); + } + if (num_accumulates >= min_average_window && + num_accumulates >= std::min(max_average_window, + num_updates * average_window)) { + // Now the average window is too long, discard the old sum. + out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; + constant_functor( + dev_ctx, out_sum_1, 0.0); + constant_functor( + dev_ctx, out_sum_2, 0.0); + old_num_accumulates = num_accumulates; + num_accumulates = 0; + } + + // Set accumulators to output + SetAccumulators( + dev_ctx, num_updates, num_accumulates, old_num_accumulates, out_num_accumulates, out_old_num_accumulates, out_num_updates); +} +}// namespace phi \ No newline at end of file From 5d8c057e2fa0fbbafec49fe46ee66ad055c927d6 Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Thu, 21 Jul 2022 01:41:16 +0000 Subject: [PATCH 03/12] yaml for average accumulates --- .../fluid/operators/average_accumulates_op.cc | 2 +- paddle/phi/api/yaml/legacy_api.yaml | 10 ++++++ paddle/phi/infermeta/multiary.cc | 2 +- paddle/phi/infermeta/multiary.h | 2 +- .../phi/kernels/average_accumulates_kernel.h | 2 +- .../kernels/cpu/average_accumulates_kernel.cc | 20 +++++++++-- .../kernels/gpu/average_accumulates_kernel.cu | 18 +++++----- .../impl/average_accumulates_kernel_impl.h | 33 +++++++++++++------ .../phi/ops/compat/average_accumulates_sig.cc | 26 +++++++++++++++ .../paddle/incubate/optimizer/modelaverage.py | 15 ++++++++- 10 files changed, 104 insertions(+), 26 deletions(-) create mode 100644 paddle/phi/ops/compat/average_accumulates_sig.cc diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index b9ff61cf40e7db..f3b85e7a770ca5 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -212,7 +212,7 @@ And for a mini-batch in training, accumulators were computed as below steps: DECLARE_INFER_SHAPE_FUNCTOR(average_accumulates, AverageAccumulatesInferShapeFunctor, - PD_INFER_META(phi::AverageAccumulatesMeta)); + PD_INFER_META(phi::AverageAccumulatesInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index b9e7361abea7da..3f662ee04a0aa5 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -247,6 +247,16 @@ kernel : func : auc +#average_accumulates +- api : average_accumulates + args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window, int max_average_window, int min_average_window) + output : Tensor(out_sum_1), Tensor(out_sum_2), Tensor(out_sum_3), Tensor(out_num_accumulates), Tensor(out_old_num_accumulates), Tensor(out_num_updates) + infer_meta: + func : AverageAccumulatesInferMeta + kernel : + func : average_accumulates + data_type : param + # batch_norm - api : batch_norm args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool 
trainable_statistics, bool fuse_with_relu) diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index e056695ea992b7..d8959569af105d 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -434,7 +434,7 @@ void AucInferMeta(const MetaTensor& input, } } -void AverageAccumulatesMeta(const MetaTensor& param, +void AverageAccumulatesInferMeta(const MetaTensor& param, const MetaTensor& in_sum_1, const MetaTensor& in_sum_2, const MetaTensor& in_sum_3, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 24f3d7898dfd55..cf28a65a8d764f 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -134,7 +134,7 @@ void AucInferMeta(const MetaTensor& input, MetaTensor* stat_neg_out, MetaConfig config = MetaConfig()); -void AverageAccumulatesMeta(const MetaTensor& param, +void AverageAccumulatesInferMeta(const MetaTensor& param, const MetaTensor& in_sum_1, const MetaTensor& in_sum_2, const MetaTensor& in_sum_3, diff --git a/paddle/phi/kernels/average_accumulates_kernel.h b/paddle/phi/kernels/average_accumulates_kernel.h index 7ecbfa8405d6e3..5809aace4f860b 100644 --- a/paddle/phi/kernels/average_accumulates_kernel.h +++ b/paddle/phi/kernels/average_accumulates_kernel.h @@ -26,7 +26,7 @@ void GetAccumulators(const Context& dev_ctx, int64_t* old_num_accumulates); template -void SetAccumulators(const Context& ctx, +void SetAccumulators(const Context& dev_ctx, int64_t num_updates, int64_t num_accumulates, int64_t old_num_accumulates, diff --git a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc index 54bb124d0732a2..1837a6a3194fd6 100644 --- a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc +++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc @@ -1,8 +1,24 @@ -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + #include "paddle/phi/kernels/average_accumulates_kernel.h" #include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + + namespace phi{ template <> diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu index 7142c8318c189d..0fe7012ea245c0 100644 --- a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu +++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu @@ -1,9 +1,9 @@ -#include "paddle/phi/backends/gpu/gpu_context.h" - -#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/average_accumulates_kernel.h" #include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + namespace phi { template<> @@ -17,19 +17,19 @@ void GetAccumulators( int64_t* old_num_accumulates) { auto stream = dev_ctx.stream(); auto cuda_place = in_old_num_accumulates.place(); - paddle::memory::Copy(paddle::platform::CPUPlace(), + paddle::memory::Copy(phi::CPUPlace(), old_num_accumulates, cuda_place, in_old_num_accumulates.data(), sizeof(int64_t), stream); - paddle::memory::Copy(paddle::platform::CPUPlace(), + paddle::memory::Copy(phi::CPUPlace(), num_accumulates, cuda_place, in_num_accumulates.data(), sizeof(int64_t), stream); - paddle::memory::Copy(paddle::platform::CPUPlace(), + paddle::memory::Copy(phi::CPUPlace(), num_updates, cuda_place, in_num_updates.data(), @@ -52,19 +52,19 @@ void SetAccumulators( paddle::memory::Copy(cuda_place, out_old_num_accumulates->data(), - paddle::platform::CPUPlace(), + phi::CPUPlace(), &old_num_accumulates, sizeof(int64_t), stream); paddle::memory::Copy(cuda_place, out_num_accumulates->data(), - paddle::platform::CPUPlace(), + phi::CPUPlace(), &num_accumulates, sizeof(int64_t), stream); paddle::memory::Copy(cuda_place, out_num_updates->data(), - paddle::platform::CPUPlace(), + phi::CPUPlace(), &num_updates, sizeof(int64_t), stream); diff --git a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h index 20f33c7033b1af..63bdab594cb65e 100644 --- a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h +++ b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h @@ -1,7 +1,24 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + #pragma once +#include "paddle/phi/kernels/average_accumulates_kernel.h" + +#include + #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/average_accumulates_kernel.h" namespace phi{ @@ -66,7 +83,6 @@ void AverageAccumulatesKernel(const Context& dev_ctx, // Compute //auto& place = *ctx.template device_context().eigen_device(); - auto& place = *dev_ctx.eigen_device(); funcs::SetConstant constant_functor; @@ -79,24 +95,21 @@ void AverageAccumulatesKernel(const Context& dev_ctx, // Move the sum to a different buffer to avoid loss of precision due to // too many sums. out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; - constant_functor( - dev_ctx, out_sum_1, 0.0); + constant_functor(dev_ctx, out_sum_1, static_cast(0)); } if (num_accumulates >= min_average_window && num_accumulates >= std::min(max_average_window, num_updates * average_window)) { // Now the average window is too long, discard the old sum. out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; - constant_functor( - dev_ctx, out_sum_1, 0.0); - constant_functor( - dev_ctx, out_sum_2, 0.0); + constant_functor(dev_ctx, out_sum_1, static_cast(0)); + constant_functor(dev_ctx, out_sum_2, static_cast(0)); old_num_accumulates = num_accumulates; num_accumulates = 0; } // Set accumulators to output - SetAccumulators( - dev_ctx, num_updates, num_accumulates, old_num_accumulates, out_num_accumulates, out_old_num_accumulates, out_num_updates); + SetAccumulators(dev_ctx, num_updates, num_accumulates, old_num_accumulates, out_num_accumulates, out_old_num_accumulates, out_num_updates); } + }// namespace phi \ No newline at end of file diff --git a/paddle/phi/ops/compat/average_accumulates_sig.cc b/paddle/phi/ops/compat/average_accumulates_sig.cc new file mode 100644 index 00000000000000..69a6263c150386 --- /dev/null +++ b/paddle/phi/ops/compat/average_accumulates_sig.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + KernelSignature AverageAccumulatesOpArgumentMapping(const ArgumentMappingContext& ctx){ + return KernelSignature("average_accumulates", + {"param","in_sum_1","in_sum_2","in_sum_3","in_num_accumulates","in_old_num_accumulates","in_num_updates"}, + {"average_window","max_average_window","min_average_window"}, + {"out_sum_1","out_sum_2","out_sum_3","out_num_accumulates","out_old_num_accumulates","out_num_updates"}); + + } +}// namespace phi +PD_REGISTER_ARG_MAPPING_FN(average_accumulates,phi::AverageAccumulatesOpArgumentMapping) \ No newline at end of file diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index 361827ba48de25..2e53607311bf85 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -21,6 +21,7 @@ from paddle.fluid.dygraph import base as imperative_base from paddle.fluid.wrapped_decorator import signature_safe_contextmanager from paddle import _C_ops +from paddle.fluid.framework import in_dygraph_mode __all__ = [] @@ -231,7 +232,19 @@ def _append_optimize_op(self, block, param_and_grad): old_num_accumulates = self._get_accumulator('old_num_accumulates', param_and_grad[0]) num_updates = self._get_accumulator('num_updates', param_and_grad[0]) - if framework._non_static_mode(): + + + if in_dygraph_mode(): + _, _, _, _, _, _ =_C_ops.final_state_average_accumulates( + param_and_grad[0], sum_1, sum_2, sum_3, + num_accumulates, old_num_accumulates, num_updates, + self.average_window, + self.max_average_window, + self.min_average_window, + sum_1, sum_2, sum_3, + num_accumulates, old_num_accumulates, num_updates) + return None + elif framework._non_static_mode(): _, _, _, _, _, _ = _C_ops.average_accumulates( param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, old_num_accumulates, num_updates, sum_1, sum_2, sum_3, From 84f813433ae2315357adee65dddb177237779fa0 Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Fri, 22 Jul 2022 09:51:38 +0000 Subject: [PATCH 04/12] mutable problem fix, add inplace in yaml --- .../fluid/operators/average_accumulates_op.cu | 15 +++--- paddle/phi/api/yaml/legacy_api.yaml | 7 +-- .../phi/kernels/average_accumulates_kernel.h | 7 +-- .../kernels/cpu/average_accumulates_kernel.cc | 1 - .../kernels/gpu/average_accumulates_kernel.cu | 41 +++++++++++---- .../impl/average_accumulates_kernel_impl.h | 51 ++++++++++++++----- .../paddle/incubate/optimizer/modelaverage.py | 9 ++-- 7 files changed, 88 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index d793d528a5b18c..48c47858c1c760 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -28,19 +28,20 @@ void GetAccumulators( auto* in_num_updates = ctx.Input("in_num_updates"); auto stream = ctx.cuda_device_context().stream(); auto cuda_place = in_old_num_accumulates->place(); - memory::Copy(platform::CPUPlace(), + paddle::memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place, in_old_num_accumulates->data(), sizeof(int64_t), stream); - memory::Copy(platform::CPUPlace(), + + paddle::memory::Copy(platform::CPUPlace(), num_accumulates_, cuda_place, in_num_accumulates->data(), sizeof(int64_t), stream); - memory::Copy(platform::CPUPlace(), + paddle::memory::Copy(platform::CPUPlace(), num_updates_, cuda_place, in_num_updates->data(), @@ -60,19 +61,19 @@ void 
SetAccumulators( auto* out_num_updates = ctx.Output("out_num_updates"); auto cuda_place = out_old_num_accumulates->place(); - memory::Copy(cuda_place, + paddle::memory::Copy(cuda_place, out_old_num_accumulates->data(), platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t), stream); - memory::Copy(cuda_place, + paddle::memory::Copy(cuda_place, out_num_accumulates->data(), platform::CPUPlace(), &num_accumulates_, sizeof(int64_t), stream); - memory::Copy(cuda_place, + paddle::memory::Copy(cuda_place, out_num_updates->data(), platform::CPUPlace(), &num_updates_, @@ -85,6 +86,6 @@ void SetAccumulators( namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - average_accumulates, + average_accumulates_, ops::AverageAccumulatesKernel, ops::AverageAccumulatesKernel); diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index 3f662ee04a0aa5..941e65ced5bc2e 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -248,14 +248,15 @@ func : auc #average_accumulates -- api : average_accumulates - args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window, int max_average_window, int min_average_window) +- api : average_accumulates_ + args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window, int64_t max_average_window, int64_t min_average_window) output : Tensor(out_sum_1), Tensor(out_sum_2), Tensor(out_sum_3), Tensor(out_num_accumulates), Tensor(out_old_num_accumulates), Tensor(out_num_updates) infer_meta: func : AverageAccumulatesInferMeta kernel : - func : average_accumulates + func : average_accumulates {dense, dense, dense, dense, dense ,dense, dense -> dense, dense, dense, dense, dense, dense} data_type : param + inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) # batch_norm - api : batch_norm diff --git a/paddle/phi/kernels/average_accumulates_kernel.h b/paddle/phi/kernels/average_accumulates_kernel.h index 5809aace4f860b..a6f5176c83131b 100644 --- a/paddle/phi/kernels/average_accumulates_kernel.h +++ b/paddle/phi/kernels/average_accumulates_kernel.h @@ -13,7 +13,9 @@ // limitations under the License. #pragma once + #include "paddle/phi/core/dense_tensor.h" + namespace phi { template @@ -52,6 +54,5 @@ void AverageAccumulatesKernel(const Context& dev_ctx, DenseTensor* out_sum_3, DenseTensor* out_num_accumulates, DenseTensor* out_old_num_accumulates, - DenseTensor* out_num_updates - ); -} \ No newline at end of file + DenseTensor* out_num_updates); +} // namespace phi \ No newline at end of file diff --git a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc index 1837a6a3194fd6..9135a810844305 100644 --- a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc +++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" - namespace phi{ template <> diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu index 0fe7012ea245c0..9a87ddfd83ee35 100644 --- a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu +++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu @@ -1,7 +1,21 @@ -#include "paddle/phi/kernels/average_accumulates_kernel.h" -#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h" +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" + +#include "paddle/phi/kernels/average_accumulates_kernel.h" +#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -46,24 +60,31 @@ void SetAccumulators( DenseTensor* out_num_accumulates, DenseTensor* out_old_num_accumulates, DenseTensor* out_num_updates) { + + int64_t* out_num_accumulates_ptr=dev_ctx.template Alloc(out_num_accumulates); + int64_t* out_old_num_accumulates_ptr=dev_ctx.template Alloc(out_old_num_accumulates); + int64_t* out_num_updates_ptr=dev_ctx.template Alloc(out_num_updates); + + auto stream = dev_ctx.stream(); auto cuda_place = out_old_num_accumulates->place(); - - paddle::memory::Copy(cuda_place, - out_old_num_accumulates->data(), + paddle::memory::Copy(dev_ctx.GetPlace(), + out_num_accumulates_ptr, phi::CPUPlace(), - &old_num_accumulates, + &num_accumulates, sizeof(int64_t), stream); - paddle::memory::Copy(cuda_place, - out_num_accumulates->data(), + + paddle::memory::Copy(dev_ctx.GetPlace(), + out_old_num_accumulates_ptr, phi::CPUPlace(), - &num_accumulates, + &old_num_accumulates, sizeof(int64_t), stream); + paddle::memory::Copy(cuda_place, - out_num_updates->data(), + out_num_updates_ptr, phi::CPUPlace(), &num_updates, sizeof(int64_t), diff --git a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h index 63bdab594cb65e..96438f08711454 100644 --- a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h +++ b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h @@ -43,12 +43,21 @@ void AverageAccumulatesKernel(const Context& dev_ctx, // It is used to avoid loss of precision static const int64_t kMaxNumAccumulates = 16384; // Get accumulators from input - int64_t num_updates = 0; - int64_t num_accumulates = 0; - int64_t old_num_accumulates = 0; - GetAccumulators( - dev_ctx, in_num_accumulates, in_old_num_accumulates, in_num_updates, &num_updates, &num_accumulates, &old_num_accumulates); + // int64_t num_updates = 0; + // int64_t num_accumulates = 0; + // int64_t old_num_accumulates = 0; + + auto num_updates_cpu=paddle::memory::Alloc(phi::CPUPlace(),sizeof(int64_t)); + int64_t* num_updates_cpu_ptr=reinterpret_cast(num_updates_cpu->ptr()); + + auto num_accumulates_cpu=paddle::memory::Alloc(phi::CPUPlace(),sizeof(int64_t)); + 
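+  // Staging the counters through host memory keeps a single code path for
+  // both backends: a GPU context copies them device-to-host in
+  // GetAccumulators, mutates them on the CPU, and SetAccumulators writes
+  // them back at the end.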
int64_t* num_accumulates_cpu_ptr=reinterpret_cast(num_accumulates_cpu->ptr()); + auto old_num_accumulates_cpu=paddle::memory::Alloc(phi::CPUPlace(),sizeof(int64_t)); + int64_t* old_num_accumulates_cpu_ptr=reinterpret_cast(old_num_accumulates_cpu->ptr()); + + GetAccumulators( + dev_ctx, in_num_accumulates, in_old_num_accumulates, in_num_updates, num_updates_cpu_ptr, num_accumulates_cpu_ptr, old_num_accumulates_cpu_ptr); // Get attrs // float average_window = ctx.Attr("average_window"); // int64_t max_average_window = ctx.Attr("max_average_window"); @@ -77,6 +86,11 @@ void AverageAccumulatesKernel(const Context& dev_ctx, //auto* out_sum_1 = ctx.Output("out_sum_1"); //auto* out_sum_2 = ctx.Output("out_sum_2"); //auto* out_sum_3 = ctx.Output("out_sum_3"); + dev_ctx.template Alloc(out_sum_1); + dev_ctx.template Alloc(out_sum_2); + dev_ctx.template Alloc(out_sum_3); + + auto out_sum_1_tensor = EigenVector::Flatten(*out_sum_1); auto out_sum_2_tensor = EigenVector::Flatten(*out_sum_2); auto out_sum_3_tensor = EigenVector::Flatten(*out_sum_3); @@ -86,30 +100,39 @@ void AverageAccumulatesKernel(const Context& dev_ctx, auto& place = *dev_ctx.eigen_device(); funcs::SetConstant constant_functor; - ++num_updates; - ++num_accumulates; + ++(*num_updates_cpu_ptr); + ++(*num_accumulates_cpu_ptr); out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor; out_sum_2_tensor.device(place) = in_sum_2_tensor; out_sum_3_tensor.device(place) = in_sum_3_tensor; - if (num_updates % kMaxNumAccumulates == 0) { + if ((*num_updates_cpu_ptr) % kMaxNumAccumulates == 0) { // Move the sum to a different buffer to avoid loss of precision due to // too many sums. out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; constant_functor(dev_ctx, out_sum_1, static_cast(0)); } - if (num_accumulates >= min_average_window && - num_accumulates >= std::min(max_average_window, - num_updates * average_window)) { + if ((*num_accumulates_cpu_ptr) >= min_average_window && + (*num_accumulates_cpu_ptr) >= std::min(max_average_window, + (*num_updates_cpu_ptr) * average_window)) { // Now the average window is too long, discard the old sum. 
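     // sum_3 then holds a snapshot of (sum_1 + sum_2) taken at the reset
     // point; a hedged reading of this series is that the ModelAverage
     // consumer rebuilds the averaged parameter as
     // (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates),
     // which this kernel itself does not enforce.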
out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; constant_functor(dev_ctx, out_sum_1, static_cast(0)); constant_functor(dev_ctx, out_sum_2, static_cast(0)); - old_num_accumulates = num_accumulates; - num_accumulates = 0; + (*old_num_accumulates_cpu_ptr) = (*num_accumulates_cpu_ptr); + (*num_accumulates_cpu_ptr) = 0; } // Set accumulators to output - SetAccumulators(dev_ctx, num_updates, num_accumulates, old_num_accumulates, out_num_accumulates, out_old_num_accumulates, out_num_updates); + VLOG(3)<<"@@@@@num_accumulates : "<<*num_accumulates_cpu_ptr; + + SetAccumulators(dev_ctx, + *num_updates_cpu_ptr, + *num_accumulates_cpu_ptr, + *old_num_accumulates_cpu_ptr, + out_num_accumulates, + out_old_num_accumulates, + out_num_updates); + } }// namespace phi \ No newline at end of file diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index 2e53607311bf85..7b1e1d8ca144cc 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -235,19 +235,18 @@ def _append_optimize_op(self, block, param_and_grad): if in_dygraph_mode(): - _, _, _, _, _, _ =_C_ops.final_state_average_accumulates( + _, _, _, _, _, _ = _C_ops.final_state_average_accumulates( param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, old_num_accumulates, num_updates, self.average_window, self.max_average_window, - self.min_average_window, - sum_1, sum_2, sum_3, - num_accumulates, old_num_accumulates, num_updates) + self.min_average_window) + return None elif framework._non_static_mode(): _, _, _, _, _, _ = _C_ops.average_accumulates( param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, - old_num_accumulates, num_updates, sum_1, sum_2, sum_3, + old_num_accumulates, num_updates, sum_1, sum_2, sum_3, num_accumulates, old_num_accumulates, num_updates, 'average_window', self.average_window, 'min_average_window', self.min_average_window, 'max_average_window', From 40fb5f600c3c8e3c56311ae8904213c4d1cdc9f8 Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Fri, 22 Jul 2022 09:56:47 +0000 Subject: [PATCH 05/12] polish --- paddle/fluid/operators/average_accumulates_op.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index 48c47858c1c760..1ce5f84e75ad35 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -28,20 +28,20 @@ void GetAccumulators( auto* in_num_updates = ctx.Input("in_num_updates"); auto stream = ctx.cuda_device_context().stream(); auto cuda_place = in_old_num_accumulates->place(); - paddle::memory::Copy(platform::CPUPlace(), + memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place, in_old_num_accumulates->data(), sizeof(int64_t), stream); - paddle::memory::Copy(platform::CPUPlace(), + memory::Copy(platform::CPUPlace(), num_accumulates_, cuda_place, in_num_accumulates->data(), sizeof(int64_t), stream); - paddle::memory::Copy(platform::CPUPlace(), + memory::Copy(platform::CPUPlace(), num_updates_, cuda_place, in_num_updates->data(), @@ -61,19 +61,19 @@ void SetAccumulators( auto* out_num_updates = ctx.Output("out_num_updates"); auto cuda_place = out_old_num_accumulates->place(); - paddle::memory::Copy(cuda_place, + memory::Copy(cuda_place, out_old_num_accumulates->data(), platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t), stream); - paddle::memory::Copy(cuda_place, + 
memory::Copy(cuda_place, out_num_accumulates->data(), platform::CPUPlace(), &num_accumulates_, sizeof(int64_t), stream); - paddle::memory::Copy(cuda_place, + memory::Copy(cuda_place, out_num_updates->data(), platform::CPUPlace(), &num_updates_, @@ -86,6 +86,6 @@ void SetAccumulators( namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - average_accumulates_, + average_accumulates, ops::AverageAccumulatesKernel, ops::AverageAccumulatesKernel); From c6b78d2356e5c1cb83f36ca9d9a932ef63c1cb79 Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Sun, 24 Jul 2022 17:05:15 +0000 Subject: [PATCH 06/12] bug fix --- .../final_state_generator/python_c_gen.py | 2 + paddle/phi/infermeta/multiary.cc | 46 +++++++++++-------- .../phi/ops/compat/average_accumulates_sig.cc | 31 +++++++++---- .../paddle/incubate/optimizer/modelaverage.py | 15 +++--- 4 files changed, 56 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 9d5706f65bdf0c..b7229278a8ddf5 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -57,6 +57,8 @@ def SkipAPIGeneration(forward_api_name): 'adam', 'adamw_', 'adamw', + 'average_accumulates', + 'average_accumulates_', 'decayed_adagrad_', 'decayed_adagrad', 'dgc_momentum_', diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index d8959569af105d..f4d6a00bb67cbf 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -435,28 +435,34 @@ void AucInferMeta(const MetaTensor& input, } void AverageAccumulatesInferMeta(const MetaTensor& param, - const MetaTensor& in_sum_1, - const MetaTensor& in_sum_2, - const MetaTensor& in_sum_3, - const MetaTensor& in_num_accumulates, - const MetaTensor& in_old_num_accumulates, - const MetaTensor& in_num_updates, - float average_window, - int64_t max_average_window, - int64_t min_average_window, - MetaTensor* out_sum_1, - MetaTensor* out_sum_2, - MetaTensor* out_sum_3, - MetaTensor* out_num_accumulates, - MetaTensor* out_old_num_accumulates, - MetaTensor* out_num_updates){ - auto in_dim = param.dims(); - out_sum_1->set_dims(in_dim); - out_sum_2->set_dims(in_dim); - out_sum_3->set_dims(in_dim); + const MetaTensor& in_sum_1, + const MetaTensor& in_sum_2, + const MetaTensor& in_sum_3, + const MetaTensor& in_num_accumulates, + const MetaTensor& in_old_num_accumulates, + const MetaTensor& in_num_updates, + float average_window, + int64_t max_average_window, + int64_t min_average_window, + MetaTensor* out_sum_1, + MetaTensor* out_sum_2, + MetaTensor* out_sum_3, + MetaTensor* out_num_accumulates, + MetaTensor* out_old_num_accumulates, + MetaTensor* out_num_updates) { + // auto in_dim = param.dims; + out_sum_1->set_dims(in_sum_1.dims()); + out_sum_1->set_dtype(in_sum_1.dtype()); + out_sum_2->set_dims(in_sum_2.dims()); + out_sum_2->set_dtype(in_sum_2.dtype()); + out_sum_3->set_dims(in_sum_3.dims()); + out_sum_3->set_dtype(in_sum_3.dtype()); out_num_accumulates->set_dims({1}); + out_num_accumulates->set_dtype(in_num_accumulates.dtype()); out_old_num_accumulates->set_dims({1}); - out_num_updates->set_dims({1}); + out_old_num_accumulates->set_dtype(in_old_num_accumulates.dtype()); + out_num_updates->set_dims({1}); + out_num_updates->set_dtype(in_num_updates.dtype()); } void BatchNormInferMeta(const MetaTensor& x, diff --git 
a/paddle/phi/ops/compat/average_accumulates_sig.cc b/paddle/phi/ops/compat/average_accumulates_sig.cc index 69a6263c150386..c14e8ab3575531 100644 --- a/paddle/phi/ops/compat/average_accumulates_sig.cc +++ b/paddle/phi/ops/compat/average_accumulates_sig.cc @@ -15,12 +15,25 @@ limitations under the License. */ #include "paddle/phi/core/compat/op_utils.h" namespace phi { - KernelSignature AverageAccumulatesOpArgumentMapping(const ArgumentMappingContext& ctx){ - return KernelSignature("average_accumulates", - {"param","in_sum_1","in_sum_2","in_sum_3","in_num_accumulates","in_old_num_accumulates","in_num_updates"}, - {"average_window","max_average_window","min_average_window"}, - {"out_sum_1","out_sum_2","out_sum_3","out_num_accumulates","out_old_num_accumulates","out_num_updates"}); - - } -}// namespace phi -PD_REGISTER_ARG_MAPPING_FN(average_accumulates,phi::AverageAccumulatesOpArgumentMapping) \ No newline at end of file +KernelSignature AverageAccumulatesOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "average_accumulates", + {"param", + "in_sum_1", + "in_sum_2", + "in_sum_3", + "in_num_accumulates", + "in_old_num_accumulates", + "in_num_updates"}, + {"average_window", "max_average_window", "min_average_window"}, + {"out_sum_1", + "out_sum_2", + "out_sum_3", + "out_num_accumulates", + "out_old_num_accumulates", + "out_num_updates"}); +} +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(average_accumulates, + phi::AverageAccumulatesOpArgumentMapping); diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index 7b1e1d8ca144cc..c62456eae388be 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -232,21 +232,18 @@ def _append_optimize_op(self, block, param_and_grad): old_num_accumulates = self._get_accumulator('old_num_accumulates', param_and_grad[0]) num_updates = self._get_accumulator('num_updates', param_and_grad[0]) - - + if in_dygraph_mode(): - _, _, _, _, _, _ = _C_ops.final_state_average_accumulates( - param_and_grad[0], sum_1, sum_2, sum_3, - num_accumulates, old_num_accumulates, num_updates, - self.average_window, - self.max_average_window, - self.min_average_window) + _, _, _, _, _, _ = _C_ops.final_state_average_accumulates_( + param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, + old_num_accumulates, num_updates, self.average_window, + self.max_average_window, self.min_average_window) return None elif framework._non_static_mode(): _, _, _, _, _, _ = _C_ops.average_accumulates( param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, - old_num_accumulates, num_updates, sum_1, sum_2, sum_3, + old_num_accumulates, num_updates, sum_1, sum_2, sum_3, num_accumulates, old_num_accumulates, num_updates, 'average_window', self.average_window, 'min_average_window', self.min_average_window, 'max_average_window', From 7325cb64807affc69c96b9203bf74e732ebe3b2e Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Sun, 24 Jul 2022 17:15:50 +0000 Subject: [PATCH 07/12] code style improve --- .../impl/average_accumulates_kernel_impl.h | 204 +++++++++--------- 1 file changed, 106 insertions(+), 98 deletions(-) diff --git a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h index 96438f08711454..8731316317d477 100644 --- a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h +++ b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h @@ -17,12 +17,12 @@ limitations 
From 7325cb64807affc69c96b9203bf74e732ebe3b2e Mon Sep 17 00:00:00 2001
From: wwbitejotunn
Date: Sun, 24 Jul 2022 17:15:50 +0000
Subject: [PATCH 07/12] code style improve

---
 .../impl/average_accumulates_kernel_impl.h    | 204 +++++++++---------
 1 file changed, 106 insertions(+), 98 deletions(-)

diff --git a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h
index 96438f08711454..8731316317d477 100644
--- a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h
+++ b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h
@@ -17,12 +17,12 @@ limitations under the License. */
 
 #include <algorithm>
 
-#include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
-namespace phi{
+namespace phi {
 
-template <typename T,typename Context>
+template <typename T, typename Context>
 void AverageAccumulatesKernel(const Context& dev_ctx,
                               const DenseTensor& param,
                               const DenseTensor& in_sum_1,
@@ -39,100 +39,108 @@ void AverageAccumulatesKernel(const Context& dev_ctx,
                               DenseTensor* out_sum_3,
                               DenseTensor* out_num_accumulates,
                               DenseTensor* out_old_num_accumulates,
-                              DenseTensor* out_num_updates){
-  // It is used to avoid loss of precision
-  static const int64_t kMaxNumAccumulates = 16384;
-  // Get accumulators from input
-  // int64_t num_updates = 0;
-  // int64_t num_accumulates = 0;
-  // int64_t old_num_accumulates = 0;
-
-  auto num_updates_cpu=paddle::memory::Alloc(phi::CPUPlace(),sizeof(int64_t));
-  int64_t* num_updates_cpu_ptr=reinterpret_cast<int64_t*>(num_updates_cpu->ptr());
-
-  auto num_accumulates_cpu=paddle::memory::Alloc(phi::CPUPlace(),sizeof(int64_t));
-  int64_t* num_accumulates_cpu_ptr=reinterpret_cast<int64_t*>(num_accumulates_cpu->ptr());
-
-  auto old_num_accumulates_cpu=paddle::memory::Alloc(phi::CPUPlace(),sizeof(int64_t));
-  int64_t* old_num_accumulates_cpu_ptr=reinterpret_cast<int64_t*>(old_num_accumulates_cpu->ptr());
-
-  GetAccumulators<Context>(
-      dev_ctx, in_num_accumulates, in_old_num_accumulates, in_num_updates, num_updates_cpu_ptr, num_accumulates_cpu_ptr, old_num_accumulates_cpu_ptr);
-  // Get attrs
-  // float average_window = ctx.Attr<float>("average_window");
-  // int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
-  // int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
-  PADDLE_ENFORCE_LE(
-      min_average_window,
-      max_average_window,
-      errors::InvalidArgument(
-          "The min_average_window > "
-          "max_average_window is not right, min_average_window is %ld, "
-          "max_average_window is %ld.",
-          min_average_window,
-          max_average_window));
-
-  // Get inputs
-  //auto* param = ctx.Input<Tensor>("param");
-  //auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
-  //auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
-  //auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
-  auto param_tensor = EigenVector<T>::Flatten(param);
-  auto in_sum_1_tensor = EigenVector<T>::Flatten(in_sum_1);
-  auto in_sum_2_tensor = EigenVector<T>::Flatten(in_sum_2);
-  auto in_sum_3_tensor = EigenVector<T>::Flatten(in_sum_3);
-
-  // Get outputs
-  //auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
-  //auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
-  //auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
-  dev_ctx.template Alloc<T>(out_sum_1);
-  dev_ctx.template Alloc<T>(out_sum_2);
-  dev_ctx.template Alloc<T>(out_sum_3);
-
-
-  auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
-  auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
-  auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
-
-  // Compute
-  //auto& place = *ctx.template device_context<Context>().eigen_device();
-  auto& place = *dev_ctx.eigen_device();
-
-  funcs::SetConstant<Context, T> constant_functor;
-  ++(*num_updates_cpu_ptr);
-  ++(*num_accumulates_cpu_ptr);
-  out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
-  out_sum_2_tensor.device(place) = in_sum_2_tensor;
-  out_sum_3_tensor.device(place) = in_sum_3_tensor;
-  if ((*num_updates_cpu_ptr) % kMaxNumAccumulates == 0) {
-    // Move the sum to a different buffer to avoid loss of precision due to
-    // too many sums.
-    out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
-    constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
-  }
-  if ((*num_accumulates_cpu_ptr) >= min_average_window &&
-      (*num_accumulates_cpu_ptr) >= std::min<int64_t>(max_average_window,
-                                        (*num_updates_cpu_ptr) * average_window)) {
-    // Now the average window is too long, discard the old sum.
-    out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
-    constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
-    constant_functor(dev_ctx, out_sum_2, static_cast<T>(0));
-    (*old_num_accumulates_cpu_ptr) = (*num_accumulates_cpu_ptr);
-    (*num_accumulates_cpu_ptr) = 0;
-  }
-
-  // Set accumulators to output
-  VLOG(3)<<"@@@@@num_accumulates : "<<*num_accumulates_cpu_ptr;
-
-  SetAccumulators<Context>(dev_ctx,
-                 *num_updates_cpu_ptr,
-                 *num_accumulates_cpu_ptr,
-                 *old_num_accumulates_cpu_ptr,
-                 out_num_accumulates,
-                 out_old_num_accumulates,
-                 out_num_updates);
-
+                              DenseTensor* out_num_updates) {
+  // It is used to avoid loss of precision
+  static const int64_t kMaxNumAccumulates = 16384;
+  // Get accumulators from input
+  // int64_t num_updates = 0;
+  // int64_t num_accumulates = 0;
+  // int64_t old_num_accumulates = 0;
+
+  auto num_updates_cpu =
+      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+  int64_t* num_updates_cpu_ptr =
+      reinterpret_cast<int64_t*>(num_updates_cpu->ptr());
+
+  auto num_accumulates_cpu =
+      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+  int64_t* num_accumulates_cpu_ptr =
+      reinterpret_cast<int64_t*>(num_accumulates_cpu->ptr());
+
+  auto old_num_accumulates_cpu =
+      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+  int64_t* old_num_accumulates_cpu_ptr =
+      reinterpret_cast<int64_t*>(old_num_accumulates_cpu->ptr());
+
+  GetAccumulators<Context>(dev_ctx,
+                           in_num_accumulates,
+                           in_old_num_accumulates,
+                           in_num_updates,
+                           num_updates_cpu_ptr,
+                           num_accumulates_cpu_ptr,
+                           old_num_accumulates_cpu_ptr);
+  // Get attrs
+  // float average_window = ctx.Attr<float>("average_window");
+  // int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
+  // int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
+  PADDLE_ENFORCE_LE(
+      min_average_window,
+      max_average_window,
+      errors::InvalidArgument(
+          "The min_average_window > "
+          "max_average_window is not right, min_average_window is %ld, "
+          "max_average_window is %ld.",
+          min_average_window,
+          max_average_window));
+
+  // Get inputs
+  // auto* param = ctx.Input<Tensor>("param");
+  // auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
+  // auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
+  // auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
+  auto param_tensor = EigenVector<T>::Flatten(param);
+  auto in_sum_1_tensor = EigenVector<T>::Flatten(in_sum_1);
+  auto in_sum_2_tensor = EigenVector<T>::Flatten(in_sum_2);
+  auto in_sum_3_tensor = EigenVector<T>::Flatten(in_sum_3);
+
+  // Get outputs
+  // auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
+  // auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
+  // auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
+  dev_ctx.template Alloc<T>(out_sum_1);
+  dev_ctx.template Alloc<T>(out_sum_2);
+  dev_ctx.template Alloc<T>(out_sum_3);
+
+  auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
+  auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
+  auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
+
+  // Compute
+  // auto& place = *ctx.template device_context<Context>().eigen_device();
+  auto& place = *dev_ctx.eigen_device();
+
+  funcs::SetConstant<Context, T> constant_functor;
+  ++(*num_updates_cpu_ptr);
+  ++(*num_accumulates_cpu_ptr);
+  out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
+  out_sum_2_tensor.device(place) = in_sum_2_tensor;
+  out_sum_3_tensor.device(place) = in_sum_3_tensor;
+  if ((*num_updates_cpu_ptr) % kMaxNumAccumulates == 0) {
+    // Move the sum to a different buffer to avoid loss of precision due to
+    // too many sums.
+    out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
+    constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
+  }
+  if ((*num_accumulates_cpu_ptr) >= min_average_window &&
+      (*num_accumulates_cpu_ptr) >=
+          std::min<int64_t>(max_average_window,
+                            (*num_updates_cpu_ptr) * average_window)) {
+    // Now the average window is too long, discard the old sum.
+    out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
+    constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
+    constant_functor(dev_ctx, out_sum_2, static_cast<T>(0));
+    (*old_num_accumulates_cpu_ptr) = (*num_accumulates_cpu_ptr);
+    (*num_accumulates_cpu_ptr) = 0;
+  }
+
+  // Set accumulators to output
+  SetAccumulators<Context>(dev_ctx,
+                           *num_updates_cpu_ptr,
+                           *num_accumulates_cpu_ptr,
+                           *old_num_accumulates_cpu_ptr,
+                           out_num_accumulates,
+                           out_old_num_accumulates,
+                           out_num_updates);
 }
-}// namespace phi
\ No newline at end of file
+}  // namespace phi
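The kernel body above maintains three sum buffers plus three scalar counters: in_sum_1 collects recent parameter snapshots, in_sum_2 receives a fold of in_sum_1 every kMaxNumAccumulates updates (so no single buffer accumulates too many terms and loses floating-point precision), and in_sum_3 archives everything once the averaging window grows past min(max_average_window, num_updates * average_window). The following self-contained scalar simulation mirrors that update rule so the windowing is easier to follow; the window settings are made-up values and a double stands in for the flattened tensors — a sketch, not the phi kernel:

// --- standalone sketch, not part of the patch ---
// Scalar simulation of the three-buffer accumulation with assumed attrs.
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const int64_t kMaxNumAccumulates = 16384;
  const float average_window = 0.5f;     // assumed attribute values
  const int64_t min_average_window = 10000;
  const int64_t max_average_window = 20000;

  double sum_1 = 0, sum_2 = 0, sum_3 = 0;
  int64_t num_updates = 0, num_accumulates = 0, old_num_accumulates = 0;

  for (int step = 0; step < 50000; ++step) {
    double param = 1.0;  // stand-in for the flattened parameter tensor
    ++num_updates;
    ++num_accumulates;
    sum_1 += param;
    if (num_updates % kMaxNumAccumulates == 0) {
      sum_2 += sum_1;  // fold so sum_1 never holds too many terms
      sum_1 = 0;
    }
    if (num_accumulates >= min_average_window &&
        num_accumulates >=
            std::min<int64_t>(max_average_window,
                              static_cast<int64_t>(num_updates *
                                                   average_window))) {
      sum_3 = sum_1 + sum_2;  // window too long: archive and restart
      sum_1 = sum_2 = 0;
      old_num_accumulates = num_accumulates;
      num_accumulates = 0;
    }
  }
  // The averaged parameter is total sum over total count, which is how the
  // ModelAverage optimizer combines the three buffers at apply() time.
  double avg = (sum_1 + sum_2 + sum_3) /
               static_cast<double>(num_accumulates + old_num_accumulates);
  std::cout << "average = " << avg << "\n";  // ~1.0 for a constant parameter
}

Note the mixed arithmetic in the window test: num_updates * average_window multiplies an int64_t by a float, and std::min<int64_t> then truncates the product back to an integer, exactly as in the kernel above.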
From c5a7f9554eb7ec38a525af490a6d26b4e23737cd Mon Sep 17 00:00:00 2001
From: wwbitejotunn
Date: Sun, 24 Jul 2022 17:55:05 +0000
Subject: [PATCH 08/12] code style improve

---
 .../kernels/gpu/average_accumulates_kernel.cu | 131 +++++++++---------
 1 file changed, 65 insertions(+), 66 deletions(-)

diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu
index 9a87ddfd83ee35..98a6699d9754f1 100644
--- a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu
+++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu
@@ -12,90 +12,89 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/phi/backends/gpu/gpu_context.h"
-
 #include "paddle/phi/kernels/average_accumulates_kernel.h"
 #include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
 namespace phi {
 
-template<>
-void GetAccumulators<phi::GPUContext>(
-    const phi::GPUContext& dev_ctx,
-    const DenseTensor& in_num_accumulates,
-    const DenseTensor& in_old_num_accumulates,
-    const DenseTensor& in_num_updates,
-    int64_t* num_updates,
-    int64_t* num_accumulates,
-    int64_t* old_num_accumulates) {
-  auto stream = dev_ctx.stream();
-  auto cuda_place = in_old_num_accumulates.place();
-  paddle::memory::Copy(phi::CPUPlace(),
-                       old_num_accumulates,
-                       cuda_place,
-                       in_old_num_accumulates.data<int64_t>(),
-                       sizeof(int64_t),
-                       stream);
-  paddle::memory::Copy(phi::CPUPlace(),
-                       num_accumulates,
-                       cuda_place,
-                       in_num_accumulates.data<int64_t>(),
-                       sizeof(int64_t),
-                       stream);
-  paddle::memory::Copy(phi::CPUPlace(),
-                       num_updates,
-                       cuda_place,
-                       in_num_updates.data<int64_t>(),
-                       sizeof(int64_t),
-                       stream);
+template <>
+void GetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
+                                      const DenseTensor& in_num_accumulates,
+                                      const DenseTensor& in_old_num_accumulates,
+                                      const DenseTensor& in_num_updates,
+                                      int64_t* num_updates,
+                                      int64_t* num_accumulates,
+                                      int64_t* old_num_accumulates) {
+  auto stream = dev_ctx.stream();
+  auto cuda_place = in_old_num_accumulates.place();
+  paddle::memory::Copy(phi::CPUPlace(),
+                       old_num_accumulates,
+                       cuda_place,
+                       in_old_num_accumulates.data<int64_t>(),
+                       sizeof(int64_t),
+                       stream);
+  paddle::memory::Copy(phi::CPUPlace(),
+                       num_accumulates,
+                       cuda_place,
+                       in_num_accumulates.data<int64_t>(),
+                       sizeof(int64_t),
+                       stream);
+  paddle::memory::Copy(phi::CPUPlace(),
+                       num_updates,
+                       cuda_place,
+                       in_num_updates.data<int64_t>(),
+                       sizeof(int64_t),
+                       stream);
 }
 
 template <>
-void SetAccumulators<phi::GPUContext>(
-    const phi::GPUContext& dev_ctx,
-    int64_t num_updates,
-    int64_t num_accumulates,
-    int64_t old_num_accumulates,
-    DenseTensor* out_num_accumulates,
-    DenseTensor* out_old_num_accumulates,
-    DenseTensor* out_num_updates) {
-
-  int64_t* out_num_accumulates_ptr=dev_ctx.template Alloc<int64_t>(out_num_accumulates);
-  int64_t* out_old_num_accumulates_ptr=dev_ctx.template Alloc<int64_t>(out_old_num_accumulates);
-  int64_t* out_num_updates_ptr=dev_ctx.template Alloc<int64_t>(out_num_updates);
-
-  auto stream = dev_ctx.stream();
-
-  auto cuda_place = out_old_num_accumulates->place();
-  paddle::memory::Copy(dev_ctx.GetPlace(),
-                       out_num_accumulates_ptr,
-                       phi::CPUPlace(),
-                       &num_accumulates,
-                       sizeof(int64_t),
-                       stream);
-
-  paddle::memory::Copy(dev_ctx.GetPlace(),
-                       out_old_num_accumulates_ptr,
-                       phi::CPUPlace(),
-                       &old_num_accumulates,
-                       sizeof(int64_t),
-                       stream);
-
-  paddle::memory::Copy(cuda_place,
-                       out_num_updates_ptr,
-                       phi::CPUPlace(),
-                       &num_updates,
-                       sizeof(int64_t),
-                       stream);
+void SetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
+                                      int64_t num_updates,
+                                      int64_t num_accumulates,
+                                      int64_t old_num_accumulates,
+                                      DenseTensor* out_num_accumulates,
+                                      DenseTensor* out_old_num_accumulates,
+                                      DenseTensor* out_num_updates) {
+  int64_t* out_num_accumulates_ptr =
+      dev_ctx.template Alloc<int64_t>(out_num_accumulates);
+  int64_t* out_old_num_accumulates_ptr =
+      dev_ctx.template Alloc<int64_t>(out_old_num_accumulates);
+  int64_t* out_num_updates_ptr =
+      dev_ctx.template Alloc<int64_t>(out_num_updates);
+  auto stream = dev_ctx.stream();
 
+  auto cuda_place = out_old_num_accumulates->place();
+  paddle::memory::Copy(dev_ctx.GetPlace(),
+                       out_num_accumulates_ptr,
+                       phi::CPUPlace(),
+                       &num_accumulates,
+                       sizeof(int64_t),
+                       stream);
 
+  paddle::memory::Copy(dev_ctx.GetPlace(),
+                       out_old_num_accumulates_ptr,
+                       phi::CPUPlace(),
+                       &old_num_accumulates,
+                       sizeof(int64_t),
+                       stream);
 
+  paddle::memory::Copy(cuda_place,
+                       out_num_updates_ptr,
+                       phi::CPUPlace(),
+                       &num_updates,
+                       sizeof(int64_t),
+                       stream);
 }
 
-} // namespace phi
+}  // namespace phi
 
 PD_REGISTER_KERNEL(average_accumulates,
                    GPU,
                    ALL_LAYOUT,
                    phi::AverageAccumulatesKernel,
                    float,
-                   double){}
\ No newline at end of file
+                   double) {}
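Because the counters are one-element tensors living in device memory while the window comparisons run on the host, the GPU specializations above must round-trip the scalars across the host/device boundary: GetAccumulators copies device-to-host before the control flow, SetAccumulators copies host-to-device afterwards. A plain-C++ mock of that round trip, with memcpy standing in for the stream-ordered paddle::memory::Copy calls (the types here are hypothetical stand-ins; on a real device each copy would be an async D2H/H2D transfer on the context's stream):

// --- standalone sketch, not part of the patch ---
#include <cstdint>
#include <cstring>
#include <iostream>

struct MockDeviceBuffer { int64_t value; };  // pretend this is GPU memory

void get_accumulators(const MockDeviceBuffer& d_num_updates,
                      int64_t* h_num_updates) {
  // D2H copy; the real kernel uses paddle::memory::Copy(..., stream)
  std::memcpy(h_num_updates, &d_num_updates.value, sizeof(int64_t));
}

void set_accumulators(int64_t h_num_updates, MockDeviceBuffer* d_num_updates) {
  // H2D copy of the host-side result back into the output tensor
  std::memcpy(&d_num_updates->value, &h_num_updates, sizeof(int64_t));
}

int main() {
  MockDeviceBuffer d_updates{41};
  int64_t h_updates = 0;
  get_accumulators(d_updates, &h_updates);  // read counter to host
  ++h_updates;                              // host-side control flow
  set_accumulators(h_updates, &d_updates);  // write result back
  std::cout << d_updates.value << "\n";     // prints 42
}

The CPU specialization needs none of this: it reads and writes the counter tensors' data pointers directly, which is why the two backends share the templated kernel body but provide their own Get/SetAccumulators.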
From 9004b45a0c78c0e1becb8c290c812cec642bdf6c Mon Sep 17 00:00:00 2001
From: wwbitejotunn
Date: Mon, 25 Jul 2022 05:24:47 +0000
Subject: [PATCH 09/12] polish

---
 paddle/phi/infermeta/multiary.h               | 30 +++++++++----------
 .../phi/kernels/average_accumulates_kernel.h  |  5 ++--
 .../kernels/cpu/average_accumulates_kernel.cc | 17 +++++------
 3 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index cf28a65a8d764f..ef5b0a15ee5069 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -135,21 +135,21 @@ void AucInferMeta(const MetaTensor& input,
                   MetaConfig config = MetaConfig());
 
 void AverageAccumulatesInferMeta(const MetaTensor& param,
-                         const MetaTensor& in_sum_1,
-                         const MetaTensor& in_sum_2,
-                         const MetaTensor& in_sum_3,
-                         const MetaTensor& in_num_accumulates,
-                         const MetaTensor& in_old_num_accumulates,
-                         const MetaTensor& in_num_updates,
-                         float average_window,
-                         int64_t max_average_window,
-                         int64_t min_average_window,
-                         MetaTensor* out_sum_1,
-                         MetaTensor* out_sum_2,
-                         MetaTensor* out_sum_3,
-                         MetaTensor* out_num_accumulates,
-                         MetaTensor* out_old_num_accumulates,
-                         MetaTensor* out_num_updates);
+                                 const MetaTensor& in_sum_1,
+                                 const MetaTensor& in_sum_2,
+                                 const MetaTensor& in_sum_3,
+                                 const MetaTensor& in_num_accumulates,
+                                 const MetaTensor& in_old_num_accumulates,
+                                 const MetaTensor& in_num_updates,
+                                 float average_window,
+                                 int64_t max_average_window,
+                                 int64_t min_average_window,
+                                 MetaTensor* out_sum_1,
+                                 MetaTensor* out_sum_2,
+                                 MetaTensor* out_sum_3,
+                                 MetaTensor* out_num_accumulates,
+                                 MetaTensor* out_old_num_accumulates,
+                                 MetaTensor* out_num_updates);
 
 void BatchNormInferMeta(const MetaTensor& x,
                         const MetaTensor& scale,
diff --git a/paddle/phi/kernels/average_accumulates_kernel.h b/paddle/phi/kernels/average_accumulates_kernel.h
index a6f5176c83131b..63f2b362cfde3a 100644
--- a/paddle/phi/kernels/average_accumulates_kernel.h
+++ b/paddle/phi/kernels/average_accumulates_kernel.h
@@ -31,12 +31,11 @@ template <typename Context>
 void SetAccumulators(const Context& dev_ctx,
                      int64_t num_updates,
                      int64_t num_accumulates,
-                     int64_t old_num_accumulates, 
+                     int64_t old_num_accumulates,
                      DenseTensor* out_num_accumulates,
                      DenseTensor* out_old_num_accumulates,
                      DenseTensor* out_num_updates);
-
-template <typename T,typename Context>
+template <typename T, typename Context>
 void AverageAccumulatesKernel(const Context& dev_ctx,
@@ -55,4 +54,4 @@ void AverageAccumulatesKernel(const Context& dev_ctx,
                               DenseTensor* out_num_accumulates,
                               DenseTensor* out_old_num_accumulates,
                               DenseTensor* out_num_updates);
-}  // namespace phi
\ No newline at end of file
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc
index 9135a810844305..14eb38d5b99b6e 100644
--- a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc
+++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-namespace phi{
+namespace phi {
 
 template <>
 void GetAccumulators<phi::CPUContext>(const phi::CPUContext& dev_ctx,
@@ -46,12 +46,11 @@ void SetAccumulators<phi::CPUContext>(const phi::CPUContext& dev_ctx,
   out_num_updates->data<int64_t>()[0] = num_updates;
 }
 
-} // namespace phi
+}  // namespace phi
 
-PD_REGISTER_KERNEL(
-    average_accumulates,
-    CPU,
-    ALL_LAYOUT,
-    phi::AverageAccumulatesKernel,
-    float,
-    double){}
\ No newline at end of file
+PD_REGISTER_KERNEL(average_accumulates,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::AverageAccumulatesKernel,
+                   float,
+                   double) {}
From d0053544410e1065c9930d8bd48ab569b7467a1e Mon Sep 17 00:00:00 2001
From: wwbitejotunn
Date: Mon, 25 Jul 2022 07:28:37 +0000
Subject: [PATCH 10/12] add paddle enforce

---
 paddle/phi/infermeta/multiary.cc | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index f4d6a00bb67cbf..03f2e6f47187e4 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -451,6 +451,37 @@ void AverageAccumulatesInferMeta(const MetaTensor& param,
                                  MetaTensor* out_old_num_accumulates,
                                  MetaTensor* out_num_updates) {
   // auto in_dim = param.dims;
+  PADDLE_ENFORCE_NE(
+      out_sum_1,
+      nullptr,
+      errors::NotFound(
+          "Output(out_sum_1) of AverageAccumulates should not be null."));
+  PADDLE_ENFORCE_NE(
+      out_sum_2,
+      nullptr,
+      errors::NotFound(
+          "Output(out_sum_2) of AverageAccumulates should not be null."));
+  PADDLE_ENFORCE_NE(
+      out_sum_3,
+      nullptr,
+      errors::NotFound(
+          "Output(out_sum_3) of AverageAccumulates should not be null."));
+  PADDLE_ENFORCE_NE(out_num_accumulates,
+                    nullptr,
+                    errors::NotFound("Output(out_num_accumulates) of "
+                                     "AverageAccumulates should not be null."));
+
+  PADDLE_ENFORCE_NE(out_old_num_accumulates,
+                    nullptr,
+                    errors::NotFound("Output(out_old_num_accumulates) of "
+                                     "AverageAccumulates should not be null."));
+
+  PADDLE_ENFORCE_NE(
+      out_num_updates,
+      nullptr,
+      errors::NotFound(
+          "Output(out_num_updates) of AverageAccumulates should not be null."));
+
   out_sum_1->set_dims(in_sum_1.dims());
   out_sum_1->set_dtype(in_sum_1.dtype());
   out_sum_2->set_dims(in_sum_2.dims());
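The PADDLE_ENFORCE_NE guards added above turn a null output pointer into a diagnosable error before InferMeta dereferences it in set_dims/set_dtype, matching the checks the old fluid InferShape performed with OP_INOUT_CHECK. A minimal standalone analogue of that pattern, using toy types and an exception where the real macro produces a richer, source-located error (sketch only):

// --- standalone sketch, not part of the patch ---
#include <stdexcept>
#include <string>

struct ToyMetaTensor { int dims = 0; };

void check_not_null(const void* p, const std::string& name) {
  if (p == nullptr) {
    throw std::invalid_argument("Output(" + name +
                                ") of AverageAccumulates should not be null.");
  }
}

void toy_infer_meta(const ToyMetaTensor& in_sum_1, ToyMetaTensor* out_sum_1) {
  check_not_null(out_sum_1, "out_sum_1");
  out_sum_1->dims = in_sum_1.dims;  // only reached when the check passes
}

int main() {
  ToyMetaTensor in{3}, out;
  toy_infer_meta(in, &out);  // fine
  // toy_infer_meta(in, nullptr) would report the missing output by name
  // instead of segfaulting.
}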
From e280426ed45cd0fbcc5aa11fb25ce2745f516b8f Mon Sep 17 00:00:00 2001
From: wwbitejotunn
Date: Tue, 26 Jul 2022 08:01:03 +0000
Subject: [PATCH 11/12] clean infershape and old compute function

---
 .../fluid/operators/average_accumulates_op.cc |  97 +-------------
 .../fluid/operators/average_accumulates_op.cu |  91 --------------
 .../fluid/operators/average_accumulates_op.h  | 119 ------------------
 paddle/fluid/operators/unity_build_rule.cmake |   1 -
 .../paddle/incubate/optimizer/modelaverage.py |   1 -
 5 files changed, 6 insertions(+), 303 deletions(-)
 delete mode 100644 paddle/fluid/operators/average_accumulates_op.cu
 delete mode 100644 paddle/fluid/operators/average_accumulates_op.h

diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc
index f3b85e7a770ca5..9f8f295c249353 100644
--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -12,101 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/average_accumulates_op.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/phi/infermeta/multiary.h"
 
 namespace paddle {
 namespace operators {
 
-template <>
-void GetAccumulators<paddle::platform::CPUDeviceContext>(
-    const framework::ExecutionContext& ctx,
-    int64_t* num_updates,
-    int64_t* num_accumulates,
-    int64_t* old_num_accumulates) {
-  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
-  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
-  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
-
-  *old_num_accumulates = in_old_num_accumulates->data<int64_t>()[0];
-  *num_accumulates = in_num_accumulates->data<int64_t>()[0];
-  *num_updates = in_num_updates->data<int64_t>()[0];
-}
-
-template <>
-void SetAccumulators<paddle::platform::CPUDeviceContext>(
-    const framework::ExecutionContext& ctx,
-    int64_t num_updates,
-    int64_t num_accumulates,
-    int64_t old_num_accumulates) {
-  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
-  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
-  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
-
-  out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates;
-  out_num_accumulates->data<int64_t>()[0] = num_accumulates;
-  out_num_updates->data<int64_t>()[0] = num_updates;
-}
-
 class AverageAccumulatesOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(
-        ctx->HasInput("param"), "Input", "param", "AverageAccumulates");
-    OP_INOUT_CHECK(
-        ctx->HasInput("in_sum_1"), "Input", "in_sum_1", "AverageAccumulates");
-    OP_INOUT_CHECK(
-        ctx->HasInput("in_sum_2"), "Input", "in_sum_2", "AverageAccumulates");
-    OP_INOUT_CHECK(
-        ctx->HasInput("in_sum_3"), "Input", "in_sum_3", "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasInput("in_num_accumulates"),
-                   "Input",
-                   "in_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasInput("in_old_num_accumulates"),
-                   "Input",
-                   "in_old_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasInput("in_num_updates"),
-                   "Input",
-                   "in_num_updates",
-                   "AverageAccumulates");
-
-    OP_INOUT_CHECK(ctx->HasOutput("out_sum_1"),
-                   "Output",
-                   "out_sum_1",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_sum_2"),
-                   "Output",
-                   "out_sum_2",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_sum_3"),
-                   "Output",
-                   "out_sum_3",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_num_accumulates"),
-                   "Output",
-                   "out_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_old_num_accumulates"),
-                   "Output",
-                   "out_old_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_num_updates"),
-                   "Output",
-                   "out_num_updates",
-                   "AverageAccumulates");
-    auto in_dim = ctx->GetInputDim("param");
-
-    ctx->SetOutputDim("out_sum_1", in_dim);
-    ctx->SetOutputDim("out_sum_2", in_dim);
-    ctx->SetOutputDim("out_sum_3", in_dim);
-    ctx->SetOutputDim("out_num_accumulates", {1});
-    ctx->SetOutputDim("out_old_num_accumulates", {1});
-    ctx->SetOutputDim("out_num_updates", {1});
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -210,18 +128,15 @@ And for a mini-batch in training, accumulators were computed as below steps:
 }  // namespace operators
 }  // namespace paddle
 
+namespace ops = paddle::operators;
 DECLARE_INFER_SHAPE_FUNCTOR(average_accumulates,
                             AverageAccumulatesInferShapeFunctor,
                             PD_INFER_META(phi::AverageAccumulatesInferMeta));
-namespace ops = paddle::operators;
-
 REGISTER_OPERATOR(
     average_accumulates,
     ops::AverageAccumulatesOp,
    ops::AverageAccumulatesOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(
-    average_accumulates,
-    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    AverageAccumulatesInferShapeFunctor);
diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu
deleted file mode 100644
index 1ce5f84e75ad35..00000000000000
--- a/paddle/fluid/operators/average_accumulates_op.cu
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/average_accumulates_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-template <>
-void GetAccumulators<platform::CUDADeviceContext>(
-    const framework::ExecutionContext& ctx,
-    int64_t* num_updates_,
-    int64_t* num_accumulates_,
-    int64_t* old_num_accumulates_) {
-  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
-  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
-  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
-  auto stream = ctx.cuda_device_context().stream();
-  auto cuda_place = in_old_num_accumulates->place();
-  memory::Copy(platform::CPUPlace(),
-               old_num_accumulates_,
-               cuda_place,
-               in_old_num_accumulates->data<int64_t>(),
-               sizeof(int64_t),
-               stream);
-
-  memory::Copy(platform::CPUPlace(),
-               num_accumulates_,
-               cuda_place,
-               in_num_accumulates->data<int64_t>(),
-               sizeof(int64_t),
-               stream);
-  memory::Copy(platform::CPUPlace(),
-               num_updates_,
-               cuda_place,
-               in_num_updates->data<int64_t>(),
-               sizeof(int64_t),
-               stream);
-}
-
-template <>
-void SetAccumulators<platform::CUDADeviceContext>(
-    const framework::ExecutionContext& ctx,
-    int64_t num_updates_,
-    int64_t num_accumulates_,
-    int64_t old_num_accumulates_) {
-  auto stream = ctx.cuda_device_context().stream();
-  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
-  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
-  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
-  auto cuda_place = out_old_num_accumulates->place();
-
-  memory::Copy(cuda_place,
-               out_old_num_accumulates->data<int64_t>(),
-               platform::CPUPlace(),
-               &old_num_accumulates_,
-               sizeof(int64_t),
-               stream);
-  memory::Copy(cuda_place,
-               out_num_accumulates->data<int64_t>(),
-               platform::CPUPlace(),
-               &num_accumulates_,
-               sizeof(int64_t),
-               stream);
-  memory::Copy(cuda_place,
-               out_num_updates->data<int64_t>(),
-               platform::CPUPlace(),
-               &num_updates_,
-               sizeof(int64_t),
-               stream);
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    average_accumulates,
-    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
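The deletions above, together with the header removed next, retire the fluid-style kernel, in which a kernel class pulls every tensor out of an ExecutionContext by string name inside Compute(); the phi kernels registered earlier in this series instead receive typed inputs and outputs as explicit function parameters. A toy contrast with made-up types (sketch only, not the framework's actual classes):

// --- standalone sketch, not part of the patch ---
#include <iostream>
#include <map>
#include <string>

// fluid style: stringly-typed lookup at run time inside Compute()
struct ToyExecutionContext {
  std::map<std::string, double> inputs;
  double Input(const std::string& name) const { return inputs.at(name); }
};
struct ToyFluidKernel {
  void Compute(const ToyExecutionContext& ctx) const {
    std::cout << "fluid: " << ctx.Input("param") << "\n";
  }
};

// phi style: every input is a typed parameter; binding happens once,
// through the argument-mapping signature, not on every Compute() call
void ToyPhiKernel(double param) { std::cout << "phi: " << param << "\n"; }

int main() {
  ToyFluidKernel{}.Compute(ToyExecutionContext{{{"param", 3.14}}});
  ToyPhiKernel(3.14);
}

Besides removing the per-call name lookups, the functional form lets the same templated body serve every backend, with only Get/SetAccumulators specialized per device.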
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
deleted file mode 100644
index afa43f8c240c58..00000000000000
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext>
-void GetAccumulators(const framework::ExecutionContext& ctx,
-                     int64_t* num_updates,
-                     int64_t* num_accumulates,
-                     int64_t* old_num_accumulates);
-
-template <typename DeviceContext>
-void SetAccumulators(const framework::ExecutionContext& ctx,
-                     int64_t num_updates,
-                     int64_t num_accumulates,
-                     int64_t old_num_accumulates);
-
-template <typename DeviceContext, typename T>
-class AverageAccumulatesKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // It is used to avoid loss of precision
-    static const int64_t kMaxNumAccumulates = 16384;
-    // Get accumulators from input
-    int64_t num_updates = 0;
-    int64_t num_accumulates = 0;
-    int64_t old_num_accumulates = 0;
-    GetAccumulators<DeviceContext>(
-        ctx, &num_updates, &num_accumulates, &old_num_accumulates);
-
-    // Get attrs
-    float average_window = ctx.Attr<float>("average_window");
-    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
-    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
-    PADDLE_ENFORCE_LE(
-        min_average_window,
-        max_average_window,
-        platform::errors::InvalidArgument(
-            "The min_average_window > "
-            "max_average_window is not right, min_average_window is %ld, "
-            "max_average_window is %ld.",
-            min_average_window,
-            max_average_window));
-
-    // Get inputs
-    auto* param = ctx.Input<Tensor>("param");
-    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
-    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
-    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
-    auto param_tensor = framework::EigenVector<T>::Flatten(*param);
-    auto in_sum_1_tensor = framework::EigenVector<T>::Flatten(*in_sum_1);
-    auto in_sum_2_tensor = framework::EigenVector<T>::Flatten(*in_sum_2);
-    auto in_sum_3_tensor = framework::EigenVector<T>::Flatten(*in_sum_3);
-
-    // Get outputs
-    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
-    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
-    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
-    auto out_sum_1_tensor = framework::EigenVector<T>::Flatten(*out_sum_1);
-    auto out_sum_2_tensor = framework::EigenVector<T>::Flatten(*out_sum_2);
-    auto out_sum_3_tensor = framework::EigenVector<T>::Flatten(*out_sum_3);
-
-    // Compute
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    phi::funcs::SetConstant<DeviceContext, T> constant_functor;
-    ++num_updates;
-    ++num_accumulates;
-    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
-    out_sum_2_tensor.device(place) = in_sum_2_tensor;
-    out_sum_3_tensor.device(place) = in_sum_3_tensor;
-    if (num_updates % kMaxNumAccumulates == 0) {
-      // Move the sum to a different buffer to avoid loss of precision due to
-      // too many sums.
-      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
-      constant_functor(
-          ctx.template device_context<DeviceContext>(), out_sum_1, 0.0);
-    }
-    if (num_accumulates >= min_average_window &&
-        num_accumulates >= std::min<int64_t>(max_average_window,
-                                             num_updates * average_window)) {
-      // Now the average window is too long, discard the old sum.
-      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
-      constant_functor(
-          ctx.template device_context<DeviceContext>(), out_sum_1, 0.0);
-      constant_functor(
-          ctx.template device_context<DeviceContext>(), out_sum_2, 0.0);
-      old_num_accumulates = num_accumulates;
-      num_accumulates = 0;
-    }
-
-    // Set accumulators to output
-    SetAccumulators<DeviceContext>(
-        ctx, num_updates, num_accumulates, old_num_accumulates);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 62aa990ca7bc82..69206b53c7cf95 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -396,7 +396,6 @@ register_unity_group(
   conv_transpose_op.cu
   cos_sim_op.cu
   crop_op.cu
-  average_accumulates_op.cu
   conj_op.cu
   correlation_op.cu)
 register_unity_group(
diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py
index c62456eae388be..b7d499f77292ec 100644
--- a/python/paddle/incubate/optimizer/modelaverage.py
+++ b/python/paddle/incubate/optimizer/modelaverage.py
@@ -238,7 +238,6 @@ def _append_optimize_op(self, block, param_and_grad):
                 param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates,
                 old_num_accumulates, num_updates, self.average_window,
                 self.max_average_window, self.min_average_window)
-
             return None
         elif framework._non_static_mode():
             _, _, _, _, _, _ = _C_ops.average_accumulates(
From 339be4b11f503fe7142bab0eb17ce41f73a3dab4 Mon Sep 17 00:00:00 2001
From: wwbitejotunn
Date: Tue, 26 Jul 2022 09:01:28 +0000
Subject: [PATCH 12/12] restore cmake

---
 paddle/fluid/operators/unity_build_rule.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 69206b53c7cf95..62aa990ca7bc82 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -396,6 +396,7 @@ register_unity_group(
   conv_transpose_op.cu
   cos_sim_op.cu
   crop_op.cu
+  average_accumulates_op.cu
   conj_op.cu
   correlation_op.cu)
 register_unity_group(