From 29e7c6047f6ea1f2620572cfffc36cf840556f54 Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Wed, 20 Jul 2022 02:39:45 +0000 Subject: [PATCH 01/12] move average_accumulates op to phi kernel --- .../phi/kernels/average_accumulates_kernel.h | 57 ++++++++ .../kernels/cpu/average_accumulates_kernel.cc | 138 ++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 paddle/phi/kernels/average_accumulates_kernel.h create mode 100644 paddle/phi/kernels/cpu/average_accumulates_kernel.cc diff --git a/paddle/phi/kernels/average_accumulates_kernel.h b/paddle/phi/kernels/average_accumulates_kernel.h new file mode 100644 index 00000000000000..7ecbfa8405d6e3 --- /dev/null +++ b/paddle/phi/kernels/average_accumulates_kernel.h @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void GetAccumulators(const Context& dev_ctx, + const DenseTensor& in_num_accumulates, + const DenseTensor& in_old_num_accumulates, + const DenseTensor& in_num_updates, + int64_t* num_updates, + int64_t* num_accumulates, + int64_t* old_num_accumulates); + +template +void SetAccumulators(const Context& ctx, + int64_t num_updates, + int64_t num_accumulates, + int64_t old_num_accumulates, + DenseTensor* out_num_accumulates, + DenseTensor* out_old_num_accumulates, + DenseTensor* out_num_updates); + + +template +void AverageAccumulatesKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& in_sum_1, + const DenseTensor& in_sum_2, + const DenseTensor& in_sum_3, + const DenseTensor& in_num_accumulates, + const DenseTensor& in_old_num_accumulates, + const DenseTensor& in_num_updates, + float average_window, + int64_t max_average_window, + int64_t min_average_window, + DenseTensor* out_sum_1, + DenseTensor* out_sum_2, + DenseTensor* out_sum_3, + DenseTensor* out_num_accumulates, + DenseTensor* out_old_num_accumulates, + DenseTensor* out_num_updates + ); +} \ No newline at end of file diff --git a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc new file mode 100644 index 00000000000000..a9786391bae7f1 --- /dev/null +++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc @@ -0,0 +1,138 @@ +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/average_accumulates_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi{ + +template <> +void GetAccumulators(const CPUContext& dev_ctx, + const DenseTensor& in_num_accumulates, + const DenseTensor& in_old_num_accumulates, + const DenseTensor& in_num_updates, + int64_t* num_updates, + int64_t* num_accumulates, + int64_t* old_num_accumulates) { + *old_num_accumulates = in_old_num_accumulates.data()[0]; + *num_accumulates = in_num_accumulates.data()[0]; + *num_updates = 
in_num_updates.data()[0]; +} + +template <> +void SetAccumulators(const CPUContext& dev_ctx, + int64_t num_updates, + int64_t num_accumulates, + int64_t old_num_accumulates, + DenseTensor* out_num_accumulates, + DenseTensor* out_old_num_accumulates, + DenseTensor* out_num_updates) { + out_old_num_accumulates->data()[0] = old_num_accumulates; + out_num_accumulates->data()[0] = num_accumulates; + out_num_updates->data()[0] = num_updates; +} + +template +void AverageAccumulatesKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& in_sum_1, + const DenseTensor& in_sum_2, + const DenseTensor& in_sum_3, + const DenseTensor& in_num_accumulates, + const DenseTensor& in_old_num_accumulates, + const DenseTensor& in_num_updates, + float average_window, + int64_t max_average_window, + int64_t min_average_window, + DenseTensor* out_sum_1, + DenseTensor* out_sum_2, + DenseTensor* out_sum_3, + DenseTensor* out_num_accumulates, + DenseTensor* out_old_num_accumulates, + DenseTensor* out_num_updates){ + // It is used to avoid loss of precision + static const int64_t kMaxNumAccumulates = 16384; + // Get accumulators from input + int64_t num_updates = 0; + int64_t num_accumulates = 0; + int64_t old_num_accumulates = 0; + GetAccumulators( + dev_ctx, in_num_accumulates, in_old_num_accumulates, in_num_updates, &num_updates, &num_accumulates, &old_num_accumulates); + + // Get attrs + // float average_window = ctx.Attr("average_window"); + // int64_t max_average_window = ctx.Attr("max_average_window"); + // int64_t min_average_window = ctx.Attr("min_average_window"); + PADDLE_ENFORCE_LE( + min_average_window, + max_average_window, + errors::InvalidArgument( + "The min_average_window > " + "max_average_window is not right, min_average_window is %ld, " + "max_average_window is %ld.", + min_average_window, + max_average_window)); + + // Get inputs + //auto* param = ctx.Input("param"); + //auto* in_sum_1 = ctx.Input("in_sum_1"); + //auto* in_sum_2 = ctx.Input("in_sum_2"); + //auto* in_sum_3 = ctx.Input("in_sum_3"); + auto param_tensor = EigenVector::Flatten(param); + auto in_sum_1_tensor = EigenVector::Flatten(in_sum_1); + auto in_sum_2_tensor = EigenVector::Flatten(in_sum_2); + auto in_sum_3_tensor = EigenVector::Flatten(in_sum_3); + + // Get outputs + //auto* out_sum_1 = ctx.Output("out_sum_1"); + //auto* out_sum_2 = ctx.Output("out_sum_2"); + //auto* out_sum_3 = ctx.Output("out_sum_3"); + auto out_sum_1_tensor = EigenVector::Flatten(*out_sum_1); + auto out_sum_2_tensor = EigenVector::Flatten(*out_sum_2); + auto out_sum_3_tensor = EigenVector::Flatten(*out_sum_3); + + // Compute + //auto& place = *ctx.template device_context().eigen_device(); + + auto& place = *dev_ctx.eigen_device(); + + funcs::SetConstant constant_functor; + ++num_updates; + ++num_accumulates; + out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor; + out_sum_2_tensor.device(place) = in_sum_2_tensor; + out_sum_3_tensor.device(place) = in_sum_3_tensor; + if (num_updates % kMaxNumAccumulates == 0) { + // Move the sum to a different buffer to avoid loss of precision due to + // too many sums. + out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; + constant_functor( + dev_ctx, out_sum_1, 0.0); + } + if (num_accumulates >= min_average_window && + num_accumulates >= std::min(max_average_window, + num_updates * average_window)) { + // Now the average window is too long, discard the old sum. 
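+    // Hedged numeric illustration (not in the original source): with
+    // average_window = 0.5 and min_average_window = 20, this reset fires
+    // once num_accumulates reaches min(max_average_window, num_updates * 0.5),
+    // so the retained window never exceeds roughly half the update history.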
+ out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; + constant_functor( + dev_ctx, out_sum_1, 0.0); + constant_functor( + dev_ctx, out_sum_2, 0.0); + old_num_accumulates = num_accumulates; + num_accumulates = 0; + } + + // Set accumulators to output + SetAccumulators( + dev_ctx, num_updates, num_accumulates, old_num_accumulates, out_num_accumulates, out_old_num_accumulates, out_num_updates); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + average_accumulates, + CPU, + ALL_LAYOUT, + phi::AverageAccumulatesKernel, + float, + double){} \ No newline at end of file From b60fc04431faaeac693ac70074fda5619c28b8ac Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Wed, 20 Jul 2022 07:14:31 +0000 Subject: [PATCH 02/12] move infer meta --- .../fluid/operators/average_accumulates_op.cc | 7 ++ paddle/phi/infermeta/multiary.cc | 25 +++++ paddle/phi/infermeta/multiary.h | 17 +++ .../kernels/cpu/average_accumulates_kernel.cc | 102 +----------------- .../kernels/gpu/average_accumulates_kernel.cu | 80 ++++++++++++++ .../impl/average_accumulates_kernel_impl.h | 102 ++++++++++++++++++ 6 files changed, 234 insertions(+), 99 deletions(-) create mode 100644 paddle/phi/kernels/gpu/average_accumulates_kernel.cu create mode 100644 paddle/phi/kernels/impl/average_accumulates_kernel_impl.h diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index 856a703fd2b068..b9ff61cf40e7db 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/average_accumulates_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -208,7 +210,12 @@ And for a mini-batch in training, accumulators were computed as below steps: } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(average_accumulates, + AverageAccumulatesInferShapeFunctor, + PD_INFER_META(phi::AverageAccumulatesMeta)); + namespace ops = paddle::operators; + REGISTER_OPERATOR( average_accumulates, ops::AverageAccumulatesOp, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 3369b0c392ec33..e056695ea992b7 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -434,6 +434,31 @@ void AucInferMeta(const MetaTensor& input, } } +void AverageAccumulatesMeta(const MetaTensor& param, + const MetaTensor& in_sum_1, + const MetaTensor& in_sum_2, + const MetaTensor& in_sum_3, + const MetaTensor& in_num_accumulates, + const MetaTensor& in_old_num_accumulates, + const MetaTensor& in_num_updates, + float average_window, + int64_t max_average_window, + int64_t min_average_window, + MetaTensor* out_sum_1, + MetaTensor* out_sum_2, + MetaTensor* out_sum_3, + MetaTensor* out_num_accumulates, + MetaTensor* out_old_num_accumulates, + MetaTensor* out_num_updates){ + auto in_dim = param.dims(); + out_sum_1->set_dims(in_dim); + out_sum_2->set_dims(in_dim); + out_sum_3->set_dims(in_dim); + out_num_accumulates->set_dims({1}); + out_old_num_accumulates->set_dims({1}); + out_num_updates->set_dims({1}); +} + void BatchNormInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 0ec71e86893c3c..24f3d7898dfd55 100644 --- 
a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -134,6 +134,23 @@ void AucInferMeta(const MetaTensor& input, MetaTensor* stat_neg_out, MetaConfig config = MetaConfig()); +void AverageAccumulatesMeta(const MetaTensor& param, + const MetaTensor& in_sum_1, + const MetaTensor& in_sum_2, + const MetaTensor& in_sum_3, + const MetaTensor& in_num_accumulates, + const MetaTensor& in_old_num_accumulates, + const MetaTensor& in_num_updates, + float average_window, + int64_t max_average_window, + int64_t min_average_window, + MetaTensor* out_sum_1, + MetaTensor* out_sum_2, + MetaTensor* out_sum_3, + MetaTensor* out_num_accumulates, + MetaTensor* out_old_num_accumulates, + MetaTensor* out_num_updates); + void BatchNormInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc index a9786391bae7f1..54bb124d0732a2 100644 --- a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc +++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc @@ -1,13 +1,12 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/average_accumulates_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h" namespace phi{ template <> -void GetAccumulators(const CPUContext& dev_ctx, +void GetAccumulators(const phi::CPUContext& dev_ctx, const DenseTensor& in_num_accumulates, const DenseTensor& in_old_num_accumulates, const DenseTensor& in_num_updates, @@ -20,7 +19,7 @@ void GetAccumulators(const CPUContext& dev_ctx, } template <> -void SetAccumulators(const CPUContext& dev_ctx, +void SetAccumulators(const phi::CPUContext& dev_ctx, int64_t num_updates, int64_t num_accumulates, int64_t old_num_accumulates, @@ -32,101 +31,6 @@ void SetAccumulators(const CPUContext& dev_ctx, out_num_updates->data()[0] = num_updates; } -template -void AverageAccumulatesKernel(const Context& dev_ctx, - const DenseTensor& param, - const DenseTensor& in_sum_1, - const DenseTensor& in_sum_2, - const DenseTensor& in_sum_3, - const DenseTensor& in_num_accumulates, - const DenseTensor& in_old_num_accumulates, - const DenseTensor& in_num_updates, - float average_window, - int64_t max_average_window, - int64_t min_average_window, - DenseTensor* out_sum_1, - DenseTensor* out_sum_2, - DenseTensor* out_sum_3, - DenseTensor* out_num_accumulates, - DenseTensor* out_old_num_accumulates, - DenseTensor* out_num_updates){ - // It is used to avoid loss of precision - static const int64_t kMaxNumAccumulates = 16384; - // Get accumulators from input - int64_t num_updates = 0; - int64_t num_accumulates = 0; - int64_t old_num_accumulates = 0; - GetAccumulators( - dev_ctx, in_num_accumulates, in_old_num_accumulates, in_num_updates, &num_updates, &num_accumulates, &old_num_accumulates); - - // Get attrs - // float average_window = ctx.Attr("average_window"); - // int64_t max_average_window = ctx.Attr("max_average_window"); - // int64_t min_average_window = ctx.Attr("min_average_window"); - PADDLE_ENFORCE_LE( - min_average_window, - max_average_window, - errors::InvalidArgument( - "The min_average_window > " - "max_average_window is not right, min_average_window is %ld, " - "max_average_window is %ld.", - min_average_window, - max_average_window)); - - // Get inputs - //auto* param = ctx.Input("param"); - //auto* 
in_sum_1 = ctx.Input("in_sum_1"); - //auto* in_sum_2 = ctx.Input("in_sum_2"); - //auto* in_sum_3 = ctx.Input("in_sum_3"); - auto param_tensor = EigenVector::Flatten(param); - auto in_sum_1_tensor = EigenVector::Flatten(in_sum_1); - auto in_sum_2_tensor = EigenVector::Flatten(in_sum_2); - auto in_sum_3_tensor = EigenVector::Flatten(in_sum_3); - - // Get outputs - //auto* out_sum_1 = ctx.Output("out_sum_1"); - //auto* out_sum_2 = ctx.Output("out_sum_2"); - //auto* out_sum_3 = ctx.Output("out_sum_3"); - auto out_sum_1_tensor = EigenVector::Flatten(*out_sum_1); - auto out_sum_2_tensor = EigenVector::Flatten(*out_sum_2); - auto out_sum_3_tensor = EigenVector::Flatten(*out_sum_3); - - // Compute - //auto& place = *ctx.template device_context().eigen_device(); - - auto& place = *dev_ctx.eigen_device(); - - funcs::SetConstant constant_functor; - ++num_updates; - ++num_accumulates; - out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor; - out_sum_2_tensor.device(place) = in_sum_2_tensor; - out_sum_3_tensor.device(place) = in_sum_3_tensor; - if (num_updates % kMaxNumAccumulates == 0) { - // Move the sum to a different buffer to avoid loss of precision due to - // too many sums. - out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; - constant_functor( - dev_ctx, out_sum_1, 0.0); - } - if (num_accumulates >= min_average_window && - num_accumulates >= std::min(max_average_window, - num_updates * average_window)) { - // Now the average window is too long, discard the old sum. - out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; - constant_functor( - dev_ctx, out_sum_1, 0.0); - constant_functor( - dev_ctx, out_sum_2, 0.0); - old_num_accumulates = num_accumulates; - num_accumulates = 0; - } - - // Set accumulators to output - SetAccumulators( - dev_ctx, num_updates, num_accumulates, old_num_accumulates, out_num_accumulates, out_old_num_accumulates, out_num_updates); -} - } // namespace phi PD_REGISTER_KERNEL( diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu new file mode 100644 index 00000000000000..7142c8318c189d --- /dev/null +++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu @@ -0,0 +1,80 @@ +#include "paddle/phi/backends/gpu/gpu_context.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/average_accumulates_kernel.h" +#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h" + +namespace phi { + +template<> +void GetAccumulators( + const phi::GPUContext& dev_ctx, + const DenseTensor& in_num_accumulates, + const DenseTensor& in_old_num_accumulates, + const DenseTensor& in_num_updates, + int64_t* num_updates, + int64_t* num_accumulates, + int64_t* old_num_accumulates) { + auto stream = dev_ctx.stream(); + auto cuda_place = in_old_num_accumulates.place(); + paddle::memory::Copy(paddle::platform::CPUPlace(), + old_num_accumulates, + cuda_place, + in_old_num_accumulates.data(), + sizeof(int64_t), + stream); + paddle::memory::Copy(paddle::platform::CPUPlace(), + num_accumulates, + cuda_place, + in_num_accumulates.data(), + sizeof(int64_t), + stream); + paddle::memory::Copy(paddle::platform::CPUPlace(), + num_updates, + cuda_place, + in_num_updates.data(), + sizeof(int64_t), + stream); +} + +template <> +void SetAccumulators( + const phi::GPUContext& dev_ctx, + int64_t num_updates, + int64_t num_accumulates, + int64_t old_num_accumulates, + DenseTensor* out_num_accumulates, + DenseTensor* out_old_num_accumulates, + DenseTensor* out_num_updates) { 
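+  // Note: the counters are 1-element tensors resident in device memory, so
+  // host-side values must travel through explicit copies on the context
+  // stream below; dereferencing the device pointers directly from host code
+  // would be invalid.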
+ auto stream = dev_ctx.stream(); + + auto cuda_place = out_old_num_accumulates->place(); + + paddle::memory::Copy(cuda_place, + out_old_num_accumulates->data(), + paddle::platform::CPUPlace(), + &old_num_accumulates, + sizeof(int64_t), + stream); + paddle::memory::Copy(cuda_place, + out_num_accumulates->data(), + paddle::platform::CPUPlace(), + &num_accumulates, + sizeof(int64_t), + stream); + paddle::memory::Copy(cuda_place, + out_num_updates->data(), + paddle::platform::CPUPlace(), + &num_updates, + sizeof(int64_t), + stream); +} + +} // namespace phi + +PD_REGISTER_KERNEL(average_accumulates, + GPU, + ALL_LAYOUT, + phi::AverageAccumulatesKernel, + float, + double){} \ No newline at end of file diff --git a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h new file mode 100644 index 00000000000000..20f33c7033b1af --- /dev/null +++ b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h @@ -0,0 +1,102 @@ +#pragma once +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/average_accumulates_kernel.h" + +namespace phi{ + +template +void AverageAccumulatesKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& in_sum_1, + const DenseTensor& in_sum_2, + const DenseTensor& in_sum_3, + const DenseTensor& in_num_accumulates, + const DenseTensor& in_old_num_accumulates, + const DenseTensor& in_num_updates, + float average_window, + int64_t max_average_window, + int64_t min_average_window, + DenseTensor* out_sum_1, + DenseTensor* out_sum_2, + DenseTensor* out_sum_3, + DenseTensor* out_num_accumulates, + DenseTensor* out_old_num_accumulates, + DenseTensor* out_num_updates){ + // It is used to avoid loss of precision + static const int64_t kMaxNumAccumulates = 16384; + // Get accumulators from input + int64_t num_updates = 0; + int64_t num_accumulates = 0; + int64_t old_num_accumulates = 0; + GetAccumulators( + dev_ctx, in_num_accumulates, in_old_num_accumulates, in_num_updates, &num_updates, &num_accumulates, &old_num_accumulates); + + // Get attrs + // float average_window = ctx.Attr("average_window"); + // int64_t max_average_window = ctx.Attr("max_average_window"); + // int64_t min_average_window = ctx.Attr("min_average_window"); + PADDLE_ENFORCE_LE( + min_average_window, + max_average_window, + errors::InvalidArgument( + "The min_average_window > " + "max_average_window is not right, min_average_window is %ld, " + "max_average_window is %ld.", + min_average_window, + max_average_window)); + + // Get inputs + //auto* param = ctx.Input("param"); + //auto* in_sum_1 = ctx.Input("in_sum_1"); + //auto* in_sum_2 = ctx.Input("in_sum_2"); + //auto* in_sum_3 = ctx.Input("in_sum_3"); + auto param_tensor = EigenVector::Flatten(param); + auto in_sum_1_tensor = EigenVector::Flatten(in_sum_1); + auto in_sum_2_tensor = EigenVector::Flatten(in_sum_2); + auto in_sum_3_tensor = EigenVector::Flatten(in_sum_3); + + // Get outputs + //auto* out_sum_1 = ctx.Output("out_sum_1"); + //auto* out_sum_2 = ctx.Output("out_sum_2"); + //auto* out_sum_3 = ctx.Output("out_sum_3"); + auto out_sum_1_tensor = EigenVector::Flatten(*out_sum_1); + auto out_sum_2_tensor = EigenVector::Flatten(*out_sum_2); + auto out_sum_3_tensor = EigenVector::Flatten(*out_sum_3); + + // Compute + //auto& place = *ctx.template device_context().eigen_device(); + + auto& place = *dev_ctx.eigen_device(); + + funcs::SetConstant constant_functor; + ++num_updates; + 
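+  // Counter semantics: num_updates tracks every optimizer step since
+  // training began, while num_accumulates (incremented next) tracks only
+  // the steps inside the current averaging window.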
++num_accumulates; + out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor; + out_sum_2_tensor.device(place) = in_sum_2_tensor; + out_sum_3_tensor.device(place) = in_sum_3_tensor; + if (num_updates % kMaxNumAccumulates == 0) { + // Move the sum to a different buffer to avoid loss of precision due to + // too many sums. + out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; + constant_functor( + dev_ctx, out_sum_1, 0.0); + } + if (num_accumulates >= min_average_window && + num_accumulates >= std::min(max_average_window, + num_updates * average_window)) { + // Now the average window is too long, discard the old sum. + out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; + constant_functor( + dev_ctx, out_sum_1, 0.0); + constant_functor( + dev_ctx, out_sum_2, 0.0); + old_num_accumulates = num_accumulates; + num_accumulates = 0; + } + + // Set accumulators to output + SetAccumulators( + dev_ctx, num_updates, num_accumulates, old_num_accumulates, out_num_accumulates, out_old_num_accumulates, out_num_updates); +} +}// namespace phi \ No newline at end of file From 5d8c057e2fa0fbbafec49fe46ee66ad055c927d6 Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Thu, 21 Jul 2022 01:41:16 +0000 Subject: [PATCH 03/12] yaml for average accumulates --- .../fluid/operators/average_accumulates_op.cc | 2 +- paddle/phi/api/yaml/legacy_api.yaml | 10 ++++++ paddle/phi/infermeta/multiary.cc | 2 +- paddle/phi/infermeta/multiary.h | 2 +- .../phi/kernels/average_accumulates_kernel.h | 2 +- .../kernels/cpu/average_accumulates_kernel.cc | 20 +++++++++-- .../kernels/gpu/average_accumulates_kernel.cu | 18 +++++----- .../impl/average_accumulates_kernel_impl.h | 33 +++++++++++++------ .../phi/ops/compat/average_accumulates_sig.cc | 26 +++++++++++++++ .../paddle/incubate/optimizer/modelaverage.py | 15 ++++++++- 10 files changed, 104 insertions(+), 26 deletions(-) create mode 100644 paddle/phi/ops/compat/average_accumulates_sig.cc diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index b9ff61cf40e7db..f3b85e7a770ca5 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -212,7 +212,7 @@ And for a mini-batch in training, accumulators were computed as below steps: DECLARE_INFER_SHAPE_FUNCTOR(average_accumulates, AverageAccumulatesInferShapeFunctor, - PD_INFER_META(phi::AverageAccumulatesMeta)); + PD_INFER_META(phi::AverageAccumulatesInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index b9e7361abea7da..3f662ee04a0aa5 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -247,6 +247,16 @@ kernel : func : auc +#average_accumulates +- api : average_accumulates + args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window, int max_average_window, int min_average_window) + output : Tensor(out_sum_1), Tensor(out_sum_2), Tensor(out_sum_3), Tensor(out_num_accumulates), Tensor(out_old_num_accumulates), Tensor(out_num_updates) + infer_meta: + func : AverageAccumulatesInferMeta + kernel : + func : average_accumulates + data_type : param + # batch_norm - api : batch_norm args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool 
trainable_statistics, bool fuse_with_relu) diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index e056695ea992b7..d8959569af105d 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -434,7 +434,7 @@ void AucInferMeta(const MetaTensor& input, } } -void AverageAccumulatesMeta(const MetaTensor& param, +void AverageAccumulatesInferMeta(const MetaTensor& param, const MetaTensor& in_sum_1, const MetaTensor& in_sum_2, const MetaTensor& in_sum_3, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 24f3d7898dfd55..cf28a65a8d764f 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -134,7 +134,7 @@ void AucInferMeta(const MetaTensor& input, MetaTensor* stat_neg_out, MetaConfig config = MetaConfig()); -void AverageAccumulatesMeta(const MetaTensor& param, +void AverageAccumulatesInferMeta(const MetaTensor& param, const MetaTensor& in_sum_1, const MetaTensor& in_sum_2, const MetaTensor& in_sum_3, diff --git a/paddle/phi/kernels/average_accumulates_kernel.h b/paddle/phi/kernels/average_accumulates_kernel.h index 7ecbfa8405d6e3..5809aace4f860b 100644 --- a/paddle/phi/kernels/average_accumulates_kernel.h +++ b/paddle/phi/kernels/average_accumulates_kernel.h @@ -26,7 +26,7 @@ void GetAccumulators(const Context& dev_ctx, int64_t* old_num_accumulates); template -void SetAccumulators(const Context& ctx, +void SetAccumulators(const Context& dev_ctx, int64_t num_updates, int64_t num_accumulates, int64_t old_num_accumulates, diff --git a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc index 54bb124d0732a2..1837a6a3194fd6 100644 --- a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc +++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc @@ -1,8 +1,24 @@ -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + #include "paddle/phi/kernels/average_accumulates_kernel.h" #include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + + namespace phi{ template <> diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu index 7142c8318c189d..0fe7012ea245c0 100644 --- a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu +++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu @@ -1,9 +1,9 @@ -#include "paddle/phi/backends/gpu/gpu_context.h" - -#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/average_accumulates_kernel.h" #include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + namespace phi { template<> @@ -17,19 +17,19 @@ void GetAccumulators( int64_t* old_num_accumulates) { auto stream = dev_ctx.stream(); auto cuda_place = in_old_num_accumulates.place(); - paddle::memory::Copy(paddle::platform::CPUPlace(), + paddle::memory::Copy(phi::CPUPlace(), old_num_accumulates, cuda_place, in_old_num_accumulates.data(), sizeof(int64_t), stream); - paddle::memory::Copy(paddle::platform::CPUPlace(), + paddle::memory::Copy(phi::CPUPlace(), num_accumulates, cuda_place, in_num_accumulates.data(), sizeof(int64_t), stream); - paddle::memory::Copy(paddle::platform::CPUPlace(), + paddle::memory::Copy(phi::CPUPlace(), num_updates, cuda_place, in_num_updates.data(), @@ -52,19 +52,19 @@ void SetAccumulators( paddle::memory::Copy(cuda_place, out_old_num_accumulates->data(), - paddle::platform::CPUPlace(), + phi::CPUPlace(), &old_num_accumulates, sizeof(int64_t), stream); paddle::memory::Copy(cuda_place, out_num_accumulates->data(), - paddle::platform::CPUPlace(), + phi::CPUPlace(), &num_accumulates, sizeof(int64_t), stream); paddle::memory::Copy(cuda_place, out_num_updates->data(), - paddle::platform::CPUPlace(), + phi::CPUPlace(), &num_updates, sizeof(int64_t), stream); diff --git a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h index 20f33c7033b1af..63bdab594cb65e 100644 --- a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h +++ b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h @@ -1,7 +1,24 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + #pragma once +#include "paddle/phi/kernels/average_accumulates_kernel.h" + +#include + #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/average_accumulates_kernel.h" namespace phi{ @@ -66,7 +83,6 @@ void AverageAccumulatesKernel(const Context& dev_ctx, // Compute //auto& place = *ctx.template device_context().eigen_device(); - auto& place = *dev_ctx.eigen_device(); funcs::SetConstant constant_functor; @@ -79,24 +95,21 @@ void AverageAccumulatesKernel(const Context& dev_ctx, // Move the sum to a different buffer to avoid loss of precision due to // too many sums. out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; - constant_functor( - dev_ctx, out_sum_1, 0.0); + constant_functor(dev_ctx, out_sum_1, static_cast(0)); } if (num_accumulates >= min_average_window && num_accumulates >= std::min(max_average_window, num_updates * average_window)) { // Now the average window is too long, discard the old sum. out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; - constant_functor( - dev_ctx, out_sum_1, 0.0); - constant_functor( - dev_ctx, out_sum_2, 0.0); + constant_functor(dev_ctx, out_sum_1, static_cast(0)); + constant_functor(dev_ctx, out_sum_2, static_cast(0)); old_num_accumulates = num_accumulates; num_accumulates = 0; } // Set accumulators to output - SetAccumulators( - dev_ctx, num_updates, num_accumulates, old_num_accumulates, out_num_accumulates, out_old_num_accumulates, out_num_updates); + SetAccumulators(dev_ctx, num_updates, num_accumulates, old_num_accumulates, out_num_accumulates, out_old_num_accumulates, out_num_updates); } + }// namespace phi \ No newline at end of file diff --git a/paddle/phi/ops/compat/average_accumulates_sig.cc b/paddle/phi/ops/compat/average_accumulates_sig.cc new file mode 100644 index 00000000000000..69a6263c150386 --- /dev/null +++ b/paddle/phi/ops/compat/average_accumulates_sig.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + KernelSignature AverageAccumulatesOpArgumentMapping(const ArgumentMappingContext& ctx){ + return KernelSignature("average_accumulates", + {"param","in_sum_1","in_sum_2","in_sum_3","in_num_accumulates","in_old_num_accumulates","in_num_updates"}, + {"average_window","max_average_window","min_average_window"}, + {"out_sum_1","out_sum_2","out_sum_3","out_num_accumulates","out_old_num_accumulates","out_num_updates"}); + + } +}// namespace phi +PD_REGISTER_ARG_MAPPING_FN(average_accumulates,phi::AverageAccumulatesOpArgumentMapping) \ No newline at end of file diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index 361827ba48de25..2e53607311bf85 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -21,6 +21,7 @@ from paddle.fluid.dygraph import base as imperative_base from paddle.fluid.wrapped_decorator import signature_safe_contextmanager from paddle import _C_ops +from paddle.fluid.framework import in_dygraph_mode __all__ = [] @@ -231,7 +232,19 @@ def _append_optimize_op(self, block, param_and_grad): old_num_accumulates = self._get_accumulator('old_num_accumulates', param_and_grad[0]) num_updates = self._get_accumulator('num_updates', param_and_grad[0]) - if framework._non_static_mode(): + + + if in_dygraph_mode(): + _, _, _, _, _, _ =_C_ops.final_state_average_accumulates( + param_and_grad[0], sum_1, sum_2, sum_3, + num_accumulates, old_num_accumulates, num_updates, + self.average_window, + self.max_average_window, + self.min_average_window, + sum_1, sum_2, sum_3, + num_accumulates, old_num_accumulates, num_updates) + return None + elif framework._non_static_mode(): _, _, _, _, _, _ = _C_ops.average_accumulates( param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, old_num_accumulates, num_updates, sum_1, sum_2, sum_3, From 84f813433ae2315357adee65dddb177237779fa0 Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Fri, 22 Jul 2022 09:51:38 +0000 Subject: [PATCH 04/12] mutable problem fix, add inplace in yaml --- .../fluid/operators/average_accumulates_op.cu | 15 +++--- paddle/phi/api/yaml/legacy_api.yaml | 7 +-- .../phi/kernels/average_accumulates_kernel.h | 7 +-- .../kernels/cpu/average_accumulates_kernel.cc | 1 - .../kernels/gpu/average_accumulates_kernel.cu | 41 +++++++++++---- .../impl/average_accumulates_kernel_impl.h | 51 ++++++++++++++----- .../paddle/incubate/optimizer/modelaverage.py | 9 ++-- 7 files changed, 88 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index d793d528a5b18c..48c47858c1c760 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -28,19 +28,20 @@ void GetAccumulators( auto* in_num_updates = ctx.Input("in_num_updates"); auto stream = ctx.cuda_device_context().stream(); auto cuda_place = in_old_num_accumulates->place(); - memory::Copy(platform::CPUPlace(), + paddle::memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place, in_old_num_accumulates->data(), sizeof(int64_t), stream); - memory::Copy(platform::CPUPlace(), + + paddle::memory::Copy(platform::CPUPlace(), num_accumulates_, cuda_place, in_num_accumulates->data(), sizeof(int64_t), stream); - memory::Copy(platform::CPUPlace(), + paddle::memory::Copy(platform::CPUPlace(), num_updates_, cuda_place, in_num_updates->data(), @@ -60,19 +61,19 @@ void 
SetAccumulators( auto* out_num_updates = ctx.Output("out_num_updates"); auto cuda_place = out_old_num_accumulates->place(); - memory::Copy(cuda_place, + paddle::memory::Copy(cuda_place, out_old_num_accumulates->data(), platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t), stream); - memory::Copy(cuda_place, + paddle::memory::Copy(cuda_place, out_num_accumulates->data(), platform::CPUPlace(), &num_accumulates_, sizeof(int64_t), stream); - memory::Copy(cuda_place, + paddle::memory::Copy(cuda_place, out_num_updates->data(), platform::CPUPlace(), &num_updates_, @@ -85,6 +86,6 @@ void SetAccumulators( namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - average_accumulates, + average_accumulates_, ops::AverageAccumulatesKernel, ops::AverageAccumulatesKernel); diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index 3f662ee04a0aa5..941e65ced5bc2e 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -248,14 +248,15 @@ func : auc #average_accumulates -- api : average_accumulates - args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window, int max_average_window, int min_average_window) +- api : average_accumulates_ + args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window, int64_t max_average_window, int64_t min_average_window) output : Tensor(out_sum_1), Tensor(out_sum_2), Tensor(out_sum_3), Tensor(out_num_accumulates), Tensor(out_old_num_accumulates), Tensor(out_num_updates) infer_meta: func : AverageAccumulatesInferMeta kernel : - func : average_accumulates + func : average_accumulates {dense, dense, dense, dense, dense ,dense, dense -> dense, dense, dense, dense, dense, dense} data_type : param + inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) # batch_norm - api : batch_norm diff --git a/paddle/phi/kernels/average_accumulates_kernel.h b/paddle/phi/kernels/average_accumulates_kernel.h index 5809aace4f860b..a6f5176c83131b 100644 --- a/paddle/phi/kernels/average_accumulates_kernel.h +++ b/paddle/phi/kernels/average_accumulates_kernel.h @@ -13,7 +13,9 @@ // limitations under the License. #pragma once + #include "paddle/phi/core/dense_tensor.h" + namespace phi { template @@ -52,6 +54,5 @@ void AverageAccumulatesKernel(const Context& dev_ctx, DenseTensor* out_sum_3, DenseTensor* out_num_accumulates, DenseTensor* out_old_num_accumulates, - DenseTensor* out_num_updates - ); -} \ No newline at end of file + DenseTensor* out_num_updates); +} // namespace phi \ No newline at end of file diff --git a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc index 1837a6a3194fd6..9135a810844305 100644 --- a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc +++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" - namespace phi{ template <> diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu index 0fe7012ea245c0..9a87ddfd83ee35 100644 --- a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu +++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu @@ -1,7 +1,21 @@ -#include "paddle/phi/kernels/average_accumulates_kernel.h" -#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h" +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" + +#include "paddle/phi/kernels/average_accumulates_kernel.h" +#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -46,24 +60,31 @@ void SetAccumulators( DenseTensor* out_num_accumulates, DenseTensor* out_old_num_accumulates, DenseTensor* out_num_updates) { + + int64_t* out_num_accumulates_ptr=dev_ctx.template Alloc(out_num_accumulates); + int64_t* out_old_num_accumulates_ptr=dev_ctx.template Alloc(out_old_num_accumulates); + int64_t* out_num_updates_ptr=dev_ctx.template Alloc(out_num_updates); + + auto stream = dev_ctx.stream(); auto cuda_place = out_old_num_accumulates->place(); - - paddle::memory::Copy(cuda_place, - out_old_num_accumulates->data(), + paddle::memory::Copy(dev_ctx.GetPlace(), + out_num_accumulates_ptr, phi::CPUPlace(), - &old_num_accumulates, + &num_accumulates, sizeof(int64_t), stream); - paddle::memory::Copy(cuda_place, - out_num_accumulates->data(), + + paddle::memory::Copy(dev_ctx.GetPlace(), + out_old_num_accumulates_ptr, phi::CPUPlace(), - &num_accumulates, + &old_num_accumulates, sizeof(int64_t), stream); + paddle::memory::Copy(cuda_place, - out_num_updates->data(), + out_num_updates_ptr, phi::CPUPlace(), &num_updates, sizeof(int64_t), diff --git a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h index 63bdab594cb65e..96438f08711454 100644 --- a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h +++ b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h @@ -43,12 +43,21 @@ void AverageAccumulatesKernel(const Context& dev_ctx, // It is used to avoid loss of precision static const int64_t kMaxNumAccumulates = 16384; // Get accumulators from input - int64_t num_updates = 0; - int64_t num_accumulates = 0; - int64_t old_num_accumulates = 0; - GetAccumulators( - dev_ctx, in_num_accumulates, in_old_num_accumulates, in_num_updates, &num_updates, &num_accumulates, &old_num_accumulates); + // int64_t num_updates = 0; + // int64_t num_accumulates = 0; + // int64_t old_num_accumulates = 0; + + auto num_updates_cpu=paddle::memory::Alloc(phi::CPUPlace(),sizeof(int64_t)); + int64_t* num_updates_cpu_ptr=reinterpret_cast(num_updates_cpu->ptr()); + + auto num_accumulates_cpu=paddle::memory::Alloc(phi::CPUPlace(),sizeof(int64_t)); + 
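+  // Staging the counters through host memory keeps a single code path for
+  // both backends: a GPU context copies them device-to-host in
+  // GetAccumulators, mutates them on the CPU, and SetAccumulators writes
+  // them back at the end.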
int64_t* num_accumulates_cpu_ptr=reinterpret_cast(num_accumulates_cpu->ptr()); + auto old_num_accumulates_cpu=paddle::memory::Alloc(phi::CPUPlace(),sizeof(int64_t)); + int64_t* old_num_accumulates_cpu_ptr=reinterpret_cast(old_num_accumulates_cpu->ptr()); + + GetAccumulators( + dev_ctx, in_num_accumulates, in_old_num_accumulates, in_num_updates, num_updates_cpu_ptr, num_accumulates_cpu_ptr, old_num_accumulates_cpu_ptr); // Get attrs // float average_window = ctx.Attr("average_window"); // int64_t max_average_window = ctx.Attr("max_average_window"); @@ -77,6 +86,11 @@ void AverageAccumulatesKernel(const Context& dev_ctx, //auto* out_sum_1 = ctx.Output("out_sum_1"); //auto* out_sum_2 = ctx.Output("out_sum_2"); //auto* out_sum_3 = ctx.Output("out_sum_3"); + dev_ctx.template Alloc(out_sum_1); + dev_ctx.template Alloc(out_sum_2); + dev_ctx.template Alloc(out_sum_3); + + auto out_sum_1_tensor = EigenVector::Flatten(*out_sum_1); auto out_sum_2_tensor = EigenVector::Flatten(*out_sum_2); auto out_sum_3_tensor = EigenVector::Flatten(*out_sum_3); @@ -86,30 +100,39 @@ void AverageAccumulatesKernel(const Context& dev_ctx, auto& place = *dev_ctx.eigen_device(); funcs::SetConstant constant_functor; - ++num_updates; - ++num_accumulates; + ++(*num_updates_cpu_ptr); + ++(*num_accumulates_cpu_ptr); out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor; out_sum_2_tensor.device(place) = in_sum_2_tensor; out_sum_3_tensor.device(place) = in_sum_3_tensor; - if (num_updates % kMaxNumAccumulates == 0) { + if ((*num_updates_cpu_ptr) % kMaxNumAccumulates == 0) { // Move the sum to a different buffer to avoid loss of precision due to // too many sums. out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; constant_functor(dev_ctx, out_sum_1, static_cast(0)); } - if (num_accumulates >= min_average_window && - num_accumulates >= std::min(max_average_window, - num_updates * average_window)) { + if ((*num_accumulates_cpu_ptr) >= min_average_window && + (*num_accumulates_cpu_ptr) >= std::min(max_average_window, + (*num_updates_cpu_ptr) * average_window)) { // Now the average window is too long, discard the old sum. 
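     // sum_3 then holds a snapshot of (sum_1 + sum_2) taken at the reset
     // point; a hedged reading of this series is that the ModelAverage
     // consumer rebuilds the averaged parameter as
     // (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates),
     // which this kernel itself does not enforce.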
out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; constant_functor(dev_ctx, out_sum_1, static_cast(0)); constant_functor(dev_ctx, out_sum_2, static_cast(0)); - old_num_accumulates = num_accumulates; - num_accumulates = 0; + (*old_num_accumulates_cpu_ptr) = (*num_accumulates_cpu_ptr); + (*num_accumulates_cpu_ptr) = 0; } // Set accumulators to output - SetAccumulators(dev_ctx, num_updates, num_accumulates, old_num_accumulates, out_num_accumulates, out_old_num_accumulates, out_num_updates); + VLOG(3)<<"@@@@@num_accumulates : "<<*num_accumulates_cpu_ptr; + + SetAccumulators(dev_ctx, + *num_updates_cpu_ptr, + *num_accumulates_cpu_ptr, + *old_num_accumulates_cpu_ptr, + out_num_accumulates, + out_old_num_accumulates, + out_num_updates); + } }// namespace phi \ No newline at end of file diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index 2e53607311bf85..7b1e1d8ca144cc 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -235,19 +235,18 @@ def _append_optimize_op(self, block, param_and_grad): if in_dygraph_mode(): - _, _, _, _, _, _ =_C_ops.final_state_average_accumulates( + _, _, _, _, _, _ = _C_ops.final_state_average_accumulates( param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, old_num_accumulates, num_updates, self.average_window, self.max_average_window, - self.min_average_window, - sum_1, sum_2, sum_3, - num_accumulates, old_num_accumulates, num_updates) + self.min_average_window) + return None elif framework._non_static_mode(): _, _, _, _, _, _ = _C_ops.average_accumulates( param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, - old_num_accumulates, num_updates, sum_1, sum_2, sum_3, + old_num_accumulates, num_updates, sum_1, sum_2, sum_3, num_accumulates, old_num_accumulates, num_updates, 'average_window', self.average_window, 'min_average_window', self.min_average_window, 'max_average_window', From 40fb5f600c3c8e3c56311ae8904213c4d1cdc9f8 Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Fri, 22 Jul 2022 09:56:47 +0000 Subject: [PATCH 05/12] polish --- paddle/fluid/operators/average_accumulates_op.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu index 48c47858c1c760..1ce5f84e75ad35 100644 --- a/paddle/fluid/operators/average_accumulates_op.cu +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -28,20 +28,20 @@ void GetAccumulators( auto* in_num_updates = ctx.Input("in_num_updates"); auto stream = ctx.cuda_device_context().stream(); auto cuda_place = in_old_num_accumulates->place(); - paddle::memory::Copy(platform::CPUPlace(), + memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place, in_old_num_accumulates->data(), sizeof(int64_t), stream); - paddle::memory::Copy(platform::CPUPlace(), + memory::Copy(platform::CPUPlace(), num_accumulates_, cuda_place, in_num_accumulates->data(), sizeof(int64_t), stream); - paddle::memory::Copy(platform::CPUPlace(), + memory::Copy(platform::CPUPlace(), num_updates_, cuda_place, in_num_updates->data(), @@ -61,19 +61,19 @@ void SetAccumulators( auto* out_num_updates = ctx.Output("out_num_updates"); auto cuda_place = out_old_num_accumulates->place(); - paddle::memory::Copy(cuda_place, + memory::Copy(cuda_place, out_old_num_accumulates->data(), platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t), stream); - paddle::memory::Copy(cuda_place, + 
memory::Copy(cuda_place, out_num_accumulates->data(), platform::CPUPlace(), &num_accumulates_, sizeof(int64_t), stream); - paddle::memory::Copy(cuda_place, + memory::Copy(cuda_place, out_num_updates->data(), platform::CPUPlace(), &num_updates_, @@ -86,6 +86,6 @@ void SetAccumulators( namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - average_accumulates_, + average_accumulates, ops::AverageAccumulatesKernel, ops::AverageAccumulatesKernel); From c6b78d2356e5c1cb83f36ca9d9a932ef63c1cb79 Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Sun, 24 Jul 2022 17:05:15 +0000 Subject: [PATCH 06/12] bug fix --- .../final_state_generator/python_c_gen.py | 2 + paddle/phi/infermeta/multiary.cc | 46 +++++++++++-------- .../phi/ops/compat/average_accumulates_sig.cc | 31 +++++++++---- .../paddle/incubate/optimizer/modelaverage.py | 15 +++--- 4 files changed, 56 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 9d5706f65bdf0c..b7229278a8ddf5 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -57,6 +57,8 @@ def SkipAPIGeneration(forward_api_name): 'adam', 'adamw_', 'adamw', + 'average_accumulates', + 'average_accumulates_', 'decayed_adagrad_', 'decayed_adagrad', 'dgc_momentum_', diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index d8959569af105d..f4d6a00bb67cbf 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -435,28 +435,34 @@ void AucInferMeta(const MetaTensor& input, } void AverageAccumulatesInferMeta(const MetaTensor& param, - const MetaTensor& in_sum_1, - const MetaTensor& in_sum_2, - const MetaTensor& in_sum_3, - const MetaTensor& in_num_accumulates, - const MetaTensor& in_old_num_accumulates, - const MetaTensor& in_num_updates, - float average_window, - int64_t max_average_window, - int64_t min_average_window, - MetaTensor* out_sum_1, - MetaTensor* out_sum_2, - MetaTensor* out_sum_3, - MetaTensor* out_num_accumulates, - MetaTensor* out_old_num_accumulates, - MetaTensor* out_num_updates){ - auto in_dim = param.dims(); - out_sum_1->set_dims(in_dim); - out_sum_2->set_dims(in_dim); - out_sum_3->set_dims(in_dim); + const MetaTensor& in_sum_1, + const MetaTensor& in_sum_2, + const MetaTensor& in_sum_3, + const MetaTensor& in_num_accumulates, + const MetaTensor& in_old_num_accumulates, + const MetaTensor& in_num_updates, + float average_window, + int64_t max_average_window, + int64_t min_average_window, + MetaTensor* out_sum_1, + MetaTensor* out_sum_2, + MetaTensor* out_sum_3, + MetaTensor* out_num_accumulates, + MetaTensor* out_old_num_accumulates, + MetaTensor* out_num_updates) { + // auto in_dim = param.dims; + out_sum_1->set_dims(in_sum_1.dims()); + out_sum_1->set_dtype(in_sum_1.dtype()); + out_sum_2->set_dims(in_sum_2.dims()); + out_sum_2->set_dtype(in_sum_2.dtype()); + out_sum_3->set_dims(in_sum_3.dims()); + out_sum_3->set_dtype(in_sum_3.dtype()); out_num_accumulates->set_dims({1}); + out_num_accumulates->set_dtype(in_num_accumulates.dtype()); out_old_num_accumulates->set_dims({1}); - out_num_updates->set_dims({1}); + out_old_num_accumulates->set_dtype(in_old_num_accumulates.dtype()); + out_num_updates->set_dims({1}); + out_num_updates->set_dtype(in_num_updates.dtype()); } void BatchNormInferMeta(const MetaTensor& x, diff --git 
a/paddle/phi/ops/compat/average_accumulates_sig.cc b/paddle/phi/ops/compat/average_accumulates_sig.cc index 69a6263c150386..c14e8ab3575531 100644 --- a/paddle/phi/ops/compat/average_accumulates_sig.cc +++ b/paddle/phi/ops/compat/average_accumulates_sig.cc @@ -15,12 +15,25 @@ limitations under the License. */ #include "paddle/phi/core/compat/op_utils.h" namespace phi { - KernelSignature AverageAccumulatesOpArgumentMapping(const ArgumentMappingContext& ctx){ - return KernelSignature("average_accumulates", - {"param","in_sum_1","in_sum_2","in_sum_3","in_num_accumulates","in_old_num_accumulates","in_num_updates"}, - {"average_window","max_average_window","min_average_window"}, - {"out_sum_1","out_sum_2","out_sum_3","out_num_accumulates","out_old_num_accumulates","out_num_updates"}); - - } -}// namespace phi -PD_REGISTER_ARG_MAPPING_FN(average_accumulates,phi::AverageAccumulatesOpArgumentMapping) \ No newline at end of file +KernelSignature AverageAccumulatesOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "average_accumulates", + {"param", + "in_sum_1", + "in_sum_2", + "in_sum_3", + "in_num_accumulates", + "in_old_num_accumulates", + "in_num_updates"}, + {"average_window", "max_average_window", "min_average_window"}, + {"out_sum_1", + "out_sum_2", + "out_sum_3", + "out_num_accumulates", + "out_old_num_accumulates", + "out_num_updates"}); +} +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(average_accumulates, + phi::AverageAccumulatesOpArgumentMapping); diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index 7b1e1d8ca144cc..c62456eae388be 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -232,21 +232,18 @@ def _append_optimize_op(self, block, param_and_grad): old_num_accumulates = self._get_accumulator('old_num_accumulates', param_and_grad[0]) num_updates = self._get_accumulator('num_updates', param_and_grad[0]) - - + if in_dygraph_mode(): - _, _, _, _, _, _ = _C_ops.final_state_average_accumulates( - param_and_grad[0], sum_1, sum_2, sum_3, - num_accumulates, old_num_accumulates, num_updates, - self.average_window, - self.max_average_window, - self.min_average_window) + _, _, _, _, _, _ = _C_ops.final_state_average_accumulates_( + param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, + old_num_accumulates, num_updates, self.average_window, + self.max_average_window, self.min_average_window) return None elif framework._non_static_mode(): _, _, _, _, _, _ = _C_ops.average_accumulates( param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, - old_num_accumulates, num_updates, sum_1, sum_2, sum_3, + old_num_accumulates, num_updates, sum_1, sum_2, sum_3, num_accumulates, old_num_accumulates, num_updates, 'average_window', self.average_window, 'min_average_window', self.min_average_window, 'max_average_window', From 7325cb64807affc69c96b9203bf74e732ebe3b2e Mon Sep 17 00:00:00 2001 From: wwbitejotunn Date: Sun, 24 Jul 2022 17:15:50 +0000 Subject: [PATCH 07/12] code style improve --- .../impl/average_accumulates_kernel_impl.h | 204 +++++++++--------- 1 file changed, 106 insertions(+), 98 deletions(-) diff --git a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h index 96438f08711454..8731316317d477 100644 --- a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h +++ b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h @@ -17,12 +17,12 @@ limitations 
From 7325cb64807affc69c96b9203bf74e732ebe3b2e Mon Sep 17 00:00:00 2001
From: wwbitejotunn
Date: Sun, 24 Jul 2022 17:15:50 +0000
Subject: [PATCH 07/12] code style improve

---
 .../impl/average_accumulates_kernel_impl.h    | 204 +++++++++---------
 1 file changed, 106 insertions(+), 98 deletions(-)

diff --git a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h
index 96438f08711454..8731316317d477 100644
--- a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h
+++ b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h
@@ -17,12 +17,12 @@ limitations under the License. */
 
 #include <algorithm>
 
-#include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
 
-namespace phi{
+namespace phi {
 
-template <typename T,typename Context>
+template <typename T, typename Context>
 void AverageAccumulatesKernel(const Context& dev_ctx,
                               const DenseTensor& param,
                               const DenseTensor& in_sum_1,
@@ -39,100 +39,108 @@ void AverageAccumulatesKernel(const Context& dev_ctx,
                               DenseTensor* out_sum_3,
                               DenseTensor* out_num_accumulates,
                               DenseTensor* out_old_num_accumulates,
-                              DenseTensor* out_num_updates){
-  // It is used to avoid loss of precision
-  static const int64_t kMaxNumAccumulates = 16384;
-  // Get accumulators from input
-  // int64_t num_updates = 0;
-  // int64_t num_accumulates = 0;
-  // int64_t old_num_accumulates = 0;
-
-  auto num_updates_cpu=paddle::memory::Alloc(phi::CPUPlace(),sizeof(int64_t));
-  int64_t* num_updates_cpu_ptr=reinterpret_cast<int64_t*>(num_updates_cpu->ptr());
-
-  auto num_accumulates_cpu=paddle::memory::Alloc(phi::CPUPlace(),sizeof(int64_t));
-  int64_t* num_accumulates_cpu_ptr=reinterpret_cast<int64_t*>(num_accumulates_cpu->ptr());
-
-  auto old_num_accumulates_cpu=paddle::memory::Alloc(phi::CPUPlace(),sizeof(int64_t));
-  int64_t* old_num_accumulates_cpu_ptr=reinterpret_cast<int64_t*>(old_num_accumulates_cpu->ptr());
-
-  GetAccumulators<Context>(
-      dev_ctx, in_num_accumulates, in_old_num_accumulates, in_num_updates, num_updates_cpu_ptr, num_accumulates_cpu_ptr, old_num_accumulates_cpu_ptr);
-  // Get attrs
-  // float average_window = ctx.Attr<float>("average_window");
-  // int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
-  // int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
-  PADDLE_ENFORCE_LE(
-      min_average_window,
-      max_average_window,
-      errors::InvalidArgument(
-          "The min_average_window > "
-          "max_average_window is not right, min_average_window is %ld, "
-          "max_average_window is %ld.",
-          min_average_window,
-          max_average_window));
-
-  // Get inputs
-  //auto* param = ctx.Input<Tensor>("param");
-  //auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
-  //auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
-  //auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
-  auto param_tensor = EigenVector<T>::Flatten(param);
-  auto in_sum_1_tensor = EigenVector<T>::Flatten(in_sum_1);
-  auto in_sum_2_tensor = EigenVector<T>::Flatten(in_sum_2);
-  auto in_sum_3_tensor = EigenVector<T>::Flatten(in_sum_3);
-
-  // Get outputs
-  //auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
-  //auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
-  //auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
-  dev_ctx.template Alloc<T>(out_sum_1);
-  dev_ctx.template Alloc<T>(out_sum_2);
-  dev_ctx.template Alloc<T>(out_sum_3);
-
-
-  auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
-  auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
-  auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
-
-  // Compute
-  //auto& place = *ctx.template device_context<Context>().eigen_device();
-  auto& place = *dev_ctx.eigen_device();
-
-  funcs::SetConstant<Context, T> constant_functor;
-  ++(*num_updates_cpu_ptr);
-  ++(*num_accumulates_cpu_ptr);
-  out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
-  out_sum_2_tensor.device(place) = in_sum_2_tensor;
-  out_sum_3_tensor.device(place) = in_sum_3_tensor;
-  if ((*num_updates_cpu_ptr) % kMaxNumAccumulates == 0) {
-    // Move the sum to a different buffer to avoid loss of precision due to
-    // too many sums.
-    out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
-    constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
-  }
-  if ((*num_accumulates_cpu_ptr) >= min_average_window &&
-      (*num_accumulates_cpu_ptr) >= std::min<int64_t>(max_average_window,
-                                        (*num_updates_cpu_ptr) * average_window)) {
-    // Now the average window is too long, discard the old sum.
-    out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
-    constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
-    constant_functor(dev_ctx, out_sum_2, static_cast<T>(0));
-    (*old_num_accumulates_cpu_ptr) = (*num_accumulates_cpu_ptr);
-    (*num_accumulates_cpu_ptr) = 0;
-  }
-
-  // Set accumulators to output
-  VLOG(3)<<"@@@@@num_accumulates : "<<*num_accumulates_cpu_ptr;
-
-  SetAccumulators<Context>(dev_ctx,
-                 *num_updates_cpu_ptr,
-                 *num_accumulates_cpu_ptr,
-                 *old_num_accumulates_cpu_ptr,
-                 out_num_accumulates,
-                 out_old_num_accumulates,
-                 out_num_updates);
-
+                              DenseTensor* out_num_updates) {
+  // It is used to avoid loss of precision
+  static const int64_t kMaxNumAccumulates = 16384;
+  // Get accumulators from input
+  // int64_t num_updates = 0;
+  // int64_t num_accumulates = 0;
+  // int64_t old_num_accumulates = 0;
+
+  auto num_updates_cpu =
+      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+  int64_t* num_updates_cpu_ptr =
+      reinterpret_cast<int64_t*>(num_updates_cpu->ptr());
+
+  auto num_accumulates_cpu =
+      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+  int64_t* num_accumulates_cpu_ptr =
+      reinterpret_cast<int64_t*>(num_accumulates_cpu->ptr());
+
+  auto old_num_accumulates_cpu =
+      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+  int64_t* old_num_accumulates_cpu_ptr =
+      reinterpret_cast<int64_t*>(old_num_accumulates_cpu->ptr());
+
+  GetAccumulators<Context>(dev_ctx,
+                           in_num_accumulates,
+                           in_old_num_accumulates,
+                           in_num_updates,
+                           num_updates_cpu_ptr,
+                           num_accumulates_cpu_ptr,
+                           old_num_accumulates_cpu_ptr);
+  // Get attrs
+  // float average_window = ctx.Attr<float>("average_window");
+  // int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
+  // int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
+  PADDLE_ENFORCE_LE(
+      min_average_window,
+      max_average_window,
+      errors::InvalidArgument(
+          "The min_average_window > "
+          "max_average_window is not right, min_average_window is %ld, "
+          "max_average_window is %ld.",
+          min_average_window,
+          max_average_window));
+
+  // Get inputs
+  // auto* param = ctx.Input<Tensor>("param");
+  // auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
+  // auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
+  // auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
+  auto param_tensor = EigenVector<T>::Flatten(param);
+  auto in_sum_1_tensor = EigenVector<T>::Flatten(in_sum_1);
+  auto in_sum_2_tensor = EigenVector<T>::Flatten(in_sum_2);
+  auto in_sum_3_tensor = EigenVector<T>::Flatten(in_sum_3);
+
+  // Get outputs
+  // auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
+  // auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
+  // auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
+  dev_ctx.template Alloc<T>(out_sum_1);
+  dev_ctx.template Alloc<T>(out_sum_2);
+  dev_ctx.template Alloc<T>(out_sum_3);
+
+  auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
+  auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
+  auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
+
+  // Compute
+  // auto& place = *ctx.template device_context<Context>().eigen_device();
+  auto& place = *dev_ctx.eigen_device();
+
+  funcs::SetConstant<Context, T> constant_functor;
+  ++(*num_updates_cpu_ptr);
+  ++(*num_accumulates_cpu_ptr);
+  out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
+  out_sum_2_tensor.device(place) = in_sum_2_tensor;
+  out_sum_3_tensor.device(place) = in_sum_3_tensor;
+  if ((*num_updates_cpu_ptr) % kMaxNumAccumulates == 0) {
+    // Move the sum to a different buffer to avoid loss of precision due to
+    // too many sums.
+    out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
+    constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
+  }
+  if ((*num_accumulates_cpu_ptr) >= min_average_window &&
+      (*num_accumulates_cpu_ptr) >=
+          std::min<int64_t>(max_average_window,
+                            (*num_updates_cpu_ptr) * average_window)) {
+    // Now the average window is too long, discard the old sum.
+    out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
+    constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
+    constant_functor(dev_ctx, out_sum_2, static_cast<T>(0));
+    (*old_num_accumulates_cpu_ptr) = (*num_accumulates_cpu_ptr);
+    (*num_accumulates_cpu_ptr) = 0;
+  }
+
+  // Set accumulators to output
+  SetAccumulators<Context>(dev_ctx,
+                           *num_updates_cpu_ptr,
+                           *num_accumulates_cpu_ptr,
+                           *old_num_accumulates_cpu_ptr,
+                           out_num_accumulates,
+                           out_old_num_accumulates,
+                           out_num_updates);
 }
-}// namespace phi
\ No newline at end of file
+}  // namespace phi
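The kernel body above maintains three sum buffers plus three scalar counters: in_sum_1 collects recent parameter snapshots, in_sum_2 receives a fold of in_sum_1 every kMaxNumAccumulates updates (so no single buffer accumulates too many terms and loses floating-point precision), and in_sum_3 archives everything once the averaging window grows past min(max_average_window, num_updates * average_window). The following self-contained scalar simulation mirrors that update rule so the windowing is easier to follow; the window settings are made-up values and a double stands in for the flattened tensors — a sketch, not the phi kernel:

// --- standalone sketch, not part of the patch ---
// Scalar simulation of the three-buffer accumulation with assumed attrs.
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const int64_t kMaxNumAccumulates = 16384;
  const float average_window = 0.5f;     // assumed attribute values
  const int64_t min_average_window = 10000;
  const int64_t max_average_window = 20000;

  double sum_1 = 0, sum_2 = 0, sum_3 = 0;
  int64_t num_updates = 0, num_accumulates = 0, old_num_accumulates = 0;

  for (int step = 0; step < 50000; ++step) {
    double param = 1.0;  // stand-in for the flattened parameter tensor
    ++num_updates;
    ++num_accumulates;
    sum_1 += param;
    if (num_updates % kMaxNumAccumulates == 0) {
      sum_2 += sum_1;  // fold so sum_1 never holds too many terms
      sum_1 = 0;
    }
    if (num_accumulates >= min_average_window &&
        num_accumulates >=
            std::min<int64_t>(max_average_window,
                              static_cast<int64_t>(num_updates *
                                                   average_window))) {
      sum_3 = sum_1 + sum_2;  // window too long: archive and restart
      sum_1 = sum_2 = 0;
      old_num_accumulates = num_accumulates;
      num_accumulates = 0;
    }
  }
  // The averaged parameter is total sum over total count, which is how the
  // ModelAverage optimizer combines the three buffers at apply() time.
  double avg = (sum_1 + sum_2 + sum_3) /
               static_cast<double>(num_accumulates + old_num_accumulates);
  std::cout << "average = " << avg << "\n";  // ~1.0 for a constant parameter
}

Note the mixed arithmetic in the window test: num_updates * average_window multiplies an int64_t by a float, and std::min<int64_t> then truncates the product back to an integer, exactly as in the kernel above.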
From c5a7f9554eb7ec38a525af490a6d26b4e23737cd Mon Sep 17 00:00:00 2001
From: wwbitejotunn
Date: Sun, 24 Jul 2022 17:55:05 +0000
Subject: [PATCH 08/12] code style improve

---
 .../kernels/gpu/average_accumulates_kernel.cu | 131 +++++++++---------
 1 file changed, 65 insertions(+), 66 deletions(-)

diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu
index 9a87ddfd83ee35..98a6699d9754f1 100644
--- a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu
+++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu
@@ -12,90 +12,89 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/phi/backends/gpu/gpu_context.h"
-
 #include "paddle/phi/kernels/average_accumulates_kernel.h"
 #include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
 namespace phi {
 
-template<>
-void GetAccumulators<phi::GPUContext>(
-    const phi::GPUContext& dev_ctx,
-    const DenseTensor& in_num_accumulates,
-    const DenseTensor& in_old_num_accumulates,
-    const DenseTensor& in_num_updates,
-    int64_t* num_updates,
-    int64_t* num_accumulates,
-    int64_t* old_num_accumulates) {
-  auto stream = dev_ctx.stream();
-  auto cuda_place = in_old_num_accumulates.place();
-  paddle::memory::Copy(phi::CPUPlace(),
-                       old_num_accumulates,
-                       cuda_place,
-                       in_old_num_accumulates.data<int64_t>(),
-                       sizeof(int64_t),
-                       stream);
-  paddle::memory::Copy(phi::CPUPlace(),
-                       num_accumulates,
-                       cuda_place,
-                       in_num_accumulates.data<int64_t>(),
-                       sizeof(int64_t),
-                       stream);
-  paddle::memory::Copy(phi::CPUPlace(),
-                       num_updates,
-                       cuda_place,
-                       in_num_updates.data<int64_t>(),
-                       sizeof(int64_t),
-                       stream);
+template <>
+void GetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
+                                      const DenseTensor& in_num_accumulates,
+                                      const DenseTensor& in_old_num_accumulates,
+                                      const DenseTensor& in_num_updates,
+                                      int64_t* num_updates,
+                                      int64_t* num_accumulates,
+                                      int64_t* old_num_accumulates) {
+  auto stream = dev_ctx.stream();
+  auto cuda_place = in_old_num_accumulates.place();
+  paddle::memory::Copy(phi::CPUPlace(),
+                       old_num_accumulates,
+                       cuda_place,
+                       in_old_num_accumulates.data<int64_t>(),
+                       sizeof(int64_t),
+                       stream);
+  paddle::memory::Copy(phi::CPUPlace(),
+                       num_accumulates,
+                       cuda_place,
+                       in_num_accumulates.data<int64_t>(),
+                       sizeof(int64_t),
+                       stream);
+  paddle::memory::Copy(phi::CPUPlace(),
+                       num_updates,
+                       cuda_place,
+                       in_num_updates.data<int64_t>(),
+                       sizeof(int64_t),
+                       stream);
 }
 
 template <>
-void SetAccumulators<phi::GPUContext>(
-    const phi::GPUContext& dev_ctx,
-    int64_t num_updates,
-    int64_t num_accumulates,
-    int64_t old_num_accumulates,
-    DenseTensor* out_num_accumulates,
-    DenseTensor* out_old_num_accumulates,
-    DenseTensor* out_num_updates) {
-
-  int64_t* out_num_accumulates_ptr=dev_ctx.template Alloc<int64_t>(out_num_accumulates);
-  int64_t* out_old_num_accumulates_ptr=dev_ctx.template Alloc<int64_t>(out_old_num_accumulates);
-  int64_t* out_num_updates_ptr=dev_ctx.template Alloc<int64_t>(out_num_updates);
-
-  auto stream = dev_ctx.stream();
-
-  auto cuda_place = out_old_num_accumulates->place();
-  paddle::memory::Copy(dev_ctx.GetPlace(),
-                       out_num_accumulates_ptr,
-                       phi::CPUPlace(),
-                       &num_accumulates,
-                       sizeof(int64_t),
-                       stream);
-
-  paddle::memory::Copy(dev_ctx.GetPlace(),
-                       out_old_num_accumulates_ptr,
-                       phi::CPUPlace(),
-                       &old_num_accumulates,
-                       sizeof(int64_t),
-                       stream);
-
-  paddle::memory::Copy(cuda_place,
-                       out_num_updates_ptr,
-                       phi::CPUPlace(),
-                       &num_updates,
-                       sizeof(int64_t),
-                       stream);
+void SetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
+                                      int64_t num_updates,
+                                      int64_t num_accumulates,
+                                      int64_t old_num_accumulates,
+                                      DenseTensor* out_num_accumulates,
+                                      DenseTensor* out_old_num_accumulates,
+                                      DenseTensor* out_num_updates) {
+  int64_t* out_num_accumulates_ptr =
+      dev_ctx.template Alloc<int64_t>(out_num_accumulates);
+  int64_t* out_old_num_accumulates_ptr =
+      dev_ctx.template Alloc<int64_t>(out_old_num_accumulates);
+  int64_t* out_num_updates_ptr =
+      dev_ctx.template Alloc<int64_t>(out_num_updates);
+  auto stream = dev_ctx.stream();
 
+  auto cuda_place = out_old_num_accumulates->place();
+  paddle::memory::Copy(dev_ctx.GetPlace(),
+                       out_num_accumulates_ptr,
+                       phi::CPUPlace(),
+                       &num_accumulates,
+                       sizeof(int64_t),
+                       stream);
 
+  paddle::memory::Copy(dev_ctx.GetPlace(),
+                       out_old_num_accumulates_ptr,
+                       phi::CPUPlace(),
+                       &old_num_accumulates,
+                       sizeof(int64_t),
+                       stream);
 
+  paddle::memory::Copy(cuda_place,
+                       out_num_updates_ptr,
+                       phi::CPUPlace(),
+                       &num_updates,
+                       sizeof(int64_t),
+                       stream);
 }
 
-} // namespace phi
+}  // namespace phi
 
 PD_REGISTER_KERNEL(average_accumulates,
                    GPU,
                    ALL_LAYOUT,
                    phi::AverageAccumulatesKernel,
                    float,
-                   double){}
\ No newline at end of file
+                   double) {}
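Because the counters are one-element tensors living in device memory while the window comparisons run on the host, the GPU specializations above must round-trip the scalars across the host/device boundary: GetAccumulators copies device-to-host before the control flow, SetAccumulators copies host-to-device afterwards. A plain-C++ mock of that round trip, with memcpy standing in for the stream-ordered paddle::memory::Copy calls (the types here are hypothetical stand-ins; on a real device each copy would be an async D2H/H2D transfer on the context's stream):

// --- standalone sketch, not part of the patch ---
#include <cstdint>
#include <cstring>
#include <iostream>

struct MockDeviceBuffer { int64_t value; };  // pretend this is GPU memory

void get_accumulators(const MockDeviceBuffer& d_num_updates,
                      int64_t* h_num_updates) {
  // D2H copy; the real kernel uses paddle::memory::Copy(..., stream)
  std::memcpy(h_num_updates, &d_num_updates.value, sizeof(int64_t));
}

void set_accumulators(int64_t h_num_updates, MockDeviceBuffer* d_num_updates) {
  // H2D copy of the host-side result back into the output tensor
  std::memcpy(&d_num_updates->value, &h_num_updates, sizeof(int64_t));
}

int main() {
  MockDeviceBuffer d_updates{41};
  int64_t h_updates = 0;
  get_accumulators(d_updates, &h_updates);  // read counter to host
  ++h_updates;                              // host-side control flow
  set_accumulators(h_updates, &d_updates);  // write result back
  std::cout << d_updates.value << "\n";     // prints 42
}

The CPU specialization needs none of this: it reads and writes the counter tensors' data pointers directly, which is why the two backends share the templated kernel body but provide their own Get/SetAccumulators.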
From 9004b45a0c78c0e1becb8c290c812cec642bdf6c Mon Sep 17 00:00:00 2001
From: wwbitejotunn
Date: Mon, 25 Jul 2022 05:24:47 +0000
Subject: [PATCH 09/12] polish

---
 paddle/phi/infermeta/multiary.h               | 30 +++++++++----------
 .../phi/kernels/average_accumulates_kernel.h  |  5 ++--
 .../kernels/cpu/average_accumulates_kernel.cc | 17 +++++------
 3 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index cf28a65a8d764f..ef5b0a15ee5069 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -135,21 +135,21 @@ void AucInferMeta(const MetaTensor& input,
                   MetaConfig config = MetaConfig());
 
 void AverageAccumulatesInferMeta(const MetaTensor& param,
-                         const MetaTensor& in_sum_1,
-                         const MetaTensor& in_sum_2,
-                         const MetaTensor& in_sum_3,
-                         const MetaTensor& in_num_accumulates,
-                         const MetaTensor& in_old_num_accumulates,
-                         const MetaTensor& in_num_updates,
-                         float average_window,
-                         int64_t max_average_window,
-                         int64_t min_average_window,
-                         MetaTensor* out_sum_1,
-                         MetaTensor* out_sum_2,
-                         MetaTensor* out_sum_3,
-                         MetaTensor* out_num_accumulates,
-                         MetaTensor* out_old_num_accumulates,
-                         MetaTensor* out_num_updates);
+                                 const MetaTensor& in_sum_1,
+                                 const MetaTensor& in_sum_2,
+                                 const MetaTensor& in_sum_3,
+                                 const MetaTensor& in_num_accumulates,
+                                 const MetaTensor& in_old_num_accumulates,
+                                 const MetaTensor& in_num_updates,
+                                 float average_window,
+                                 int64_t max_average_window,
+                                 int64_t min_average_window,
+                                 MetaTensor* out_sum_1,
+                                 MetaTensor* out_sum_2,
+                                 MetaTensor* out_sum_3,
+                                 MetaTensor* out_num_accumulates,
+                                 MetaTensor* out_old_num_accumulates,
+                                 MetaTensor* out_num_updates);
 
 void BatchNormInferMeta(const MetaTensor& x,
                         const MetaTensor& scale,
diff --git a/paddle/phi/kernels/average_accumulates_kernel.h b/paddle/phi/kernels/average_accumulates_kernel.h
index a6f5176c83131b..63f2b362cfde3a 100644
--- a/paddle/phi/kernels/average_accumulates_kernel.h
+++ b/paddle/phi/kernels/average_accumulates_kernel.h
@@ -31,12 +31,11 @@ template <typename Context>
 void SetAccumulators(const Context& dev_ctx,
                      int64_t num_updates,
                      int64_t num_accumulates,
-                     int64_t old_num_accumulates, 
+                     int64_t old_num_accumulates,
                      DenseTensor* out_num_accumulates,
                      DenseTensor* out_old_num_accumulates,
                      DenseTensor* out_num_updates);
-
-template <typename T,typename Context>
+template <typename T, typename Context>
 void AverageAccumulatesKernel(const Context& dev_ctx,
@@ -55,4 +54,4 @@ void AverageAccumulatesKernel(const Context& dev_ctx,
                               DenseTensor* out_num_accumulates,
                               DenseTensor* out_old_num_accumulates,
                               DenseTensor* out_num_updates);
-}  // namespace phi
\ No newline at end of file
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc
index 9135a810844305..14eb38d5b99b6e 100644
--- a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc
+++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 
-namespace phi{
+namespace phi {
 
 template <>
 void GetAccumulators<phi::CPUContext>(const phi::CPUContext& dev_ctx,
@@ -46,12 +46,11 @@ void SetAccumulators<phi::CPUContext>(const phi::CPUContext& dev_ctx,
   out_num_updates->data<int64_t>()[0] = num_updates;
 }
 
-} // namespace phi
+}  // namespace phi
 
-PD_REGISTER_KERNEL(
-    average_accumulates,
-    CPU,
-    ALL_LAYOUT,
-    phi::AverageAccumulatesKernel,
-    float,
-    double){}
\ No newline at end of file
+PD_REGISTER_KERNEL(average_accumulates,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::AverageAccumulatesKernel,
+                   float,
+                   double) {}
From d0053544410e1065c9930d8bd48ab569b7467a1e Mon Sep 17 00:00:00 2001
From: wwbitejotunn
Date: Mon, 25 Jul 2022 07:28:37 +0000
Subject: [PATCH 10/12] add paddle enforce

---
 paddle/phi/infermeta/multiary.cc | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index f4d6a00bb67cbf..03f2e6f47187e4 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -451,6 +451,37 @@ void AverageAccumulatesInferMeta(const MetaTensor& param,
                                  MetaTensor* out_old_num_accumulates,
                                  MetaTensor* out_num_updates) {
   // auto in_dim = param.dims;
+  PADDLE_ENFORCE_NE(
+      out_sum_1,
+      nullptr,
+      errors::NotFound(
+          "Output(out_sum_1) of AverageAccumulates should not be null."));
+  PADDLE_ENFORCE_NE(
+      out_sum_2,
+      nullptr,
+      errors::NotFound(
+          "Output(out_sum_2) of AverageAccumulates should not be null."));
+  PADDLE_ENFORCE_NE(
+      out_sum_3,
+      nullptr,
+      errors::NotFound(
+          "Output(out_sum_3) of AverageAccumulates should not be null."));
+  PADDLE_ENFORCE_NE(out_num_accumulates,
+                    nullptr,
+                    errors::NotFound("Output(out_num_accumulates) of "
+                                     "AverageAccumulates should not be null."));
+
+  PADDLE_ENFORCE_NE(out_old_num_accumulates,
+                    nullptr,
+                    errors::NotFound("Output(out_old_num_accumulates) of "
+                                     "AverageAccumulates should not be null."));
+
+  PADDLE_ENFORCE_NE(
+      out_num_updates,
+      nullptr,
+      errors::NotFound(
+          "Output(out_num_updates) of AverageAccumulates should not be null."));
+
   out_sum_1->set_dims(in_sum_1.dims());
   out_sum_1->set_dtype(in_sum_1.dtype());
   out_sum_2->set_dims(in_sum_2.dims());
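The PADDLE_ENFORCE_NE guards added above turn a null output pointer into a diagnosable error before InferMeta dereferences it in set_dims/set_dtype, matching the checks the old fluid InferShape performed with OP_INOUT_CHECK. A minimal standalone analogue of that pattern, using toy types and an exception where the real macro produces a richer, source-located error (sketch only):

// --- standalone sketch, not part of the patch ---
#include <stdexcept>
#include <string>

struct ToyMetaTensor { int dims = 0; };

void check_not_null(const void* p, const std::string& name) {
  if (p == nullptr) {
    throw std::invalid_argument("Output(" + name +
                                ") of AverageAccumulates should not be null.");
  }
}

void toy_infer_meta(const ToyMetaTensor& in_sum_1, ToyMetaTensor* out_sum_1) {
  check_not_null(out_sum_1, "out_sum_1");
  out_sum_1->dims = in_sum_1.dims;  // only reached when the check passes
}

int main() {
  ToyMetaTensor in{3}, out;
  toy_infer_meta(in, &out);  // fine
  // toy_infer_meta(in, nullptr) would report the missing output by name
  // instead of segfaulting.
}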
From e280426ed45cd0fbcc5aa11fb25ce2745f516b8f Mon Sep 17 00:00:00 2001
From: wwbitejotunn
Date: Tue, 26 Jul 2022 08:01:03 +0000
Subject: [PATCH 11/12] clean infershape and old compute function

---
 .../fluid/operators/average_accumulates_op.cc |  97 +-------------
 .../fluid/operators/average_accumulates_op.cu |  91 --------------
 .../fluid/operators/average_accumulates_op.h  | 119 ------------------
 paddle/fluid/operators/unity_build_rule.cmake |   1 -
 .../paddle/incubate/optimizer/modelaverage.py |   1 -
 5 files changed, 6 insertions(+), 303 deletions(-)
 delete mode 100644 paddle/fluid/operators/average_accumulates_op.cu
 delete mode 100644 paddle/fluid/operators/average_accumulates_op.h

diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc
index f3b85e7a770ca5..9f8f295c249353 100644
--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -12,101 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/average_accumulates_op.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/phi/infermeta/multiary.h"
 
 namespace paddle {
 namespace operators {
 
-template <>
-void GetAccumulators<paddle::platform::CPUDeviceContext>(
-    const framework::ExecutionContext& ctx,
-    int64_t* num_updates,
-    int64_t* num_accumulates,
-    int64_t* old_num_accumulates) {
-  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
-  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
-  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
-
-  *old_num_accumulates = in_old_num_accumulates->data<int64_t>()[0];
-  *num_accumulates = in_num_accumulates->data<int64_t>()[0];
-  *num_updates = in_num_updates->data<int64_t>()[0];
-}
-
-template <>
-void SetAccumulators<paddle::platform::CPUDeviceContext>(
-    const framework::ExecutionContext& ctx,
-    int64_t num_updates,
-    int64_t num_accumulates,
-    int64_t old_num_accumulates) {
-  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
-  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
-  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
-
-  out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates;
-  out_num_accumulates->data<int64_t>()[0] = num_accumulates;
-  out_num_updates->data<int64_t>()[0] = num_updates;
-}
-
 class AverageAccumulatesOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(
-        ctx->HasInput("param"), "Input", "param", "AverageAccumulates");
-    OP_INOUT_CHECK(
-        ctx->HasInput("in_sum_1"), "Input", "in_sum_1", "AverageAccumulates");
-    OP_INOUT_CHECK(
-        ctx->HasInput("in_sum_2"), "Input", "in_sum_2", "AverageAccumulates");
-    OP_INOUT_CHECK(
-        ctx->HasInput("in_sum_3"), "Input", "in_sum_3", "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasInput("in_num_accumulates"),
-                   "Input",
-                   "in_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasInput("in_old_num_accumulates"),
-                   "Input",
-                   "in_old_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasInput("in_num_updates"),
-                   "Input",
-                   "in_num_updates",
-                   "AverageAccumulates");
-
-    OP_INOUT_CHECK(ctx->HasOutput("out_sum_1"),
-                   "Output",
-                   "out_sum_1",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_sum_2"),
-                   "Output",
-                   "out_sum_2",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_sum_3"),
-                   "Output",
-                   "out_sum_3",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_num_accumulates"),
-                   "Output",
-                   "out_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_old_num_accumulates"),
-                   "Output",
-                   "out_old_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_num_updates"),
-                   "Output",
-                   "out_num_updates",
-                   "AverageAccumulates");
-    auto in_dim = ctx->GetInputDim("param");
-
-    ctx->SetOutputDim("out_sum_1", in_dim);
-    ctx->SetOutputDim("out_sum_2", in_dim);
-    ctx->SetOutputDim("out_sum_3", in_dim);
-    ctx->SetOutputDim("out_num_accumulates", {1});
-    ctx->SetOutputDim("out_old_num_accumulates", {1});
-    ctx->SetOutputDim("out_num_updates", {1});
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -210,18 +128,15 @@ And for a mini-batch in training, accumulators were computed as below steps:
 }  // namespace operators
 }  // namespace paddle
 
+namespace ops = paddle::operators;
 DECLARE_INFER_SHAPE_FUNCTOR(average_accumulates,
                             AverageAccumulatesInferShapeFunctor,
                             PD_INFER_META(phi::AverageAccumulatesInferMeta));
-namespace ops = paddle::operators;
-
 REGISTER_OPERATOR(
     average_accumulates,
     ops::AverageAccumulatesOp,
    ops::AverageAccumulatesOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(
-    average_accumulates,
-    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    AverageAccumulatesInferShapeFunctor);
diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu
deleted file mode 100644
index 1ce5f84e75ad35..00000000000000
--- a/paddle/fluid/operators/average_accumulates_op.cu
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/average_accumulates_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-template <>
-void GetAccumulators<platform::CUDADeviceContext>(
-    const framework::ExecutionContext& ctx,
-    int64_t* num_updates_,
-    int64_t* num_accumulates_,
-    int64_t* old_num_accumulates_) {
-  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
-  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
-  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
-  auto stream = ctx.cuda_device_context().stream();
-  auto cuda_place = in_old_num_accumulates->place();
-  memory::Copy(platform::CPUPlace(),
-               old_num_accumulates_,
-               cuda_place,
-               in_old_num_accumulates->data<int64_t>(),
-               sizeof(int64_t),
-               stream);
-
-  memory::Copy(platform::CPUPlace(),
-               num_accumulates_,
-               cuda_place,
-               in_num_accumulates->data<int64_t>(),
-               sizeof(int64_t),
-               stream);
-  memory::Copy(platform::CPUPlace(),
-               num_updates_,
-               cuda_place,
-               in_num_updates->data<int64_t>(),
-               sizeof(int64_t),
-               stream);
-}
-
-template <>
-void SetAccumulators<platform::CUDADeviceContext>(
-    const framework::ExecutionContext& ctx,
-    int64_t num_updates_,
-    int64_t num_accumulates_,
-    int64_t old_num_accumulates_) {
-  auto stream = ctx.cuda_device_context().stream();
-  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
-  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
-  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
-  auto cuda_place = out_old_num_accumulates->place();
-
-  memory::Copy(cuda_place,
-               out_old_num_accumulates->data<int64_t>(),
-               platform::CPUPlace(),
-               &old_num_accumulates_,
-               sizeof(int64_t),
-               stream);
-  memory::Copy(cuda_place,
-               out_num_accumulates->data<int64_t>(),
-               platform::CPUPlace(),
-               &num_accumulates_,
-               sizeof(int64_t),
-               stream);
-  memory::Copy(cuda_place,
-               out_num_updates->data<int64_t>(),
-               platform::CPUPlace(),
-               &num_updates_,
-               sizeof(int64_t),
-               stream);
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    average_accumulates,
-    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
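The deletions above, together with the header removed next, retire the fluid-style kernel, in which a kernel class pulls every tensor out of an ExecutionContext by string name inside Compute(); the phi kernels registered earlier in this series instead receive typed inputs and outputs as explicit function parameters. A toy contrast with made-up types (sketch only, not the framework's actual classes):

// --- standalone sketch, not part of the patch ---
#include <iostream>
#include <map>
#include <string>

// fluid style: stringly-typed lookup at run time inside Compute()
struct ToyExecutionContext {
  std::map<std::string, double> inputs;
  double Input(const std::string& name) const { return inputs.at(name); }
};
struct ToyFluidKernel {
  void Compute(const ToyExecutionContext& ctx) const {
    std::cout << "fluid: " << ctx.Input("param") << "\n";
  }
};

// phi style: every input is a typed parameter; binding happens once,
// through the argument-mapping signature, not on every Compute() call
void ToyPhiKernel(double param) { std::cout << "phi: " << param << "\n"; }

int main() {
  ToyFluidKernel{}.Compute(ToyExecutionContext{{{"param", 3.14}}});
  ToyPhiKernel(3.14);
}

Besides removing the per-call name lookups, the functional form lets the same templated body serve every backend, with only Get/SetAccumulators specialized per device.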
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
deleted file mode 100644
index afa43f8c240c58..00000000000000
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext>
-void GetAccumulators(const framework::ExecutionContext& ctx,
-                     int64_t* num_updates,
-                     int64_t* num_accumulates,
-                     int64_t* old_num_accumulates);
-
-template <typename DeviceContext>
-void SetAccumulators(const framework::ExecutionContext& ctx,
-                     int64_t num_updates,
-                     int64_t num_accumulates,
-                     int64_t old_num_accumulates);
-
-template <typename DeviceContext, typename T>
-class AverageAccumulatesKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // It is used to avoid loss of precision
-    static const int64_t kMaxNumAccumulates = 16384;
-    // Get accumulators from input
-    int64_t num_updates = 0;
-    int64_t num_accumulates = 0;
-    int64_t old_num_accumulates = 0;
-    GetAccumulators<DeviceContext>(
-        ctx, &num_updates, &num_accumulates, &old_num_accumulates);
-
-    // Get attrs
-    float average_window = ctx.Attr<float>("average_window");
-    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
-    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
-    PADDLE_ENFORCE_LE(
-        min_average_window,
-        max_average_window,
-        platform::errors::InvalidArgument(
-            "The min_average_window > "
-            "max_average_window is not right, min_average_window is %ld, "
-            "max_average_window is %ld.",
-            min_average_window,
-            max_average_window));
-
-    // Get inputs
-    auto* param = ctx.Input<Tensor>("param");
-    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
-    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
-    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
-    auto param_tensor = framework::EigenVector<T>::Flatten(*param);
-    auto in_sum_1_tensor = framework::EigenVector<T>::Flatten(*in_sum_1);
-    auto in_sum_2_tensor = framework::EigenVector<T>::Flatten(*in_sum_2);
-    auto in_sum_3_tensor = framework::EigenVector<T>::Flatten(*in_sum_3);
-
-    // Get outputs
-    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
-    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
-    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
-    auto out_sum_1_tensor = framework::EigenVector<T>::Flatten(*out_sum_1);
-    auto out_sum_2_tensor = framework::EigenVector<T>::Flatten(*out_sum_2);
-    auto out_sum_3_tensor = framework::EigenVector<T>::Flatten(*out_sum_3);
-
-    // Compute
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    phi::funcs::SetConstant<DeviceContext, T> constant_functor;
-    ++num_updates;
-    ++num_accumulates;
-    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
-    out_sum_2_tensor.device(place) = in_sum_2_tensor;
-    out_sum_3_tensor.device(place) = in_sum_3_tensor;
-    if (num_updates % kMaxNumAccumulates == 0) {
-      // Move the sum to a different buffer to avoid loss of precision due to
-      // too many sums.
-      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
-      constant_functor(
-          ctx.template device_context<DeviceContext>(), out_sum_1, 0.0);
-    }
-    if (num_accumulates >= min_average_window &&
-        num_accumulates >= std::min<int64_t>(max_average_window,
-                                             num_updates * average_window)) {
-      // Now the average window is too long, discard the old sum.
-      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
-      constant_functor(
-          ctx.template device_context<DeviceContext>(), out_sum_1, 0.0);
-      constant_functor(
-          ctx.template device_context<DeviceContext>(), out_sum_2, 0.0);
-      old_num_accumulates = num_accumulates;
-      num_accumulates = 0;
-    }
-
-    // Set accumulators to output
-    SetAccumulators<DeviceContext>(
-        ctx, num_updates, num_accumulates, old_num_accumulates);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 62aa990ca7bc82..69206b53c7cf95 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -396,7 +396,6 @@ register_unity_group(
   conv_transpose_op.cu
   cos_sim_op.cu
   crop_op.cu
-  average_accumulates_op.cu
   conj_op.cu
   correlation_op.cu)
 register_unity_group(
diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py
index c62456eae388be..b7d499f77292ec 100644
--- a/python/paddle/incubate/optimizer/modelaverage.py
+++ b/python/paddle/incubate/optimizer/modelaverage.py
@@ -238,7 +238,6 @@ def _append_optimize_op(self, block, param_and_grad):
                 param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates,
                 old_num_accumulates, num_updates, self.average_window,
                 self.max_average_window, self.min_average_window)
-
             return None
         elif framework._non_static_mode():
             _, _, _, _, _, _ = _C_ops.average_accumulates(
From 339be4b11f503fe7142bab0eb17ce41f73a3dab4 Mon Sep 17 00:00:00 2001
From: wwbitejotunn
Date: Tue, 26 Jul 2022 09:01:28 +0000
Subject: [PATCH 12/12] restore cmake

---
 paddle/fluid/operators/unity_build_rule.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 69206b53c7cf95..62aa990ca7bc82 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -396,6 +396,7 @@ register_unity_group(
   conv_transpose_op.cu
   cos_sim_op.cu
   crop_op.cu
+  average_accumulates_op.cu
   conj_op.cu
   correlation_op.cu)
 register_unity_group(