diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc
index cb519a28ea38bd..997a2501512d0a 100644
--- a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc
+++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc
@@ -359,6 +359,7 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
     std::vector<int64_t> skip_layernorm_x_shape =
         skip_layernorm_x->Var()->GetShape();
+    check_flag = true;
     if (skip_layernorm_x_shape.size() != multihead_matmul_input_shape.size()) {
       check_flag = false;
       VLOG(3) << "Transformer model remove_padding shape check failed, return "
@@ -395,6 +396,7 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
     GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc_op, fc);
 
     std::vector<int64_t> fc_input_shape = fc_input->Var()->GetShape();
+    check_flag = true;
     if ((fc_input_shape.size() != multihead_matmul_input_shape.size()) ||
         (fc_input_shape.size() != 3)) {
       check_flag = false;
@@ -446,11 +448,13 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
 
     std::vector<int64_t> activation_input_shape =
         activation_input->Var()->GetShape();
+    check_flag = true;
     if ((activation_input_shape.size() !=
          multihead_matmul_input_shape.size()) ||
         (activation_input_shape.size() != 3)) {
       check_flag = false;
-      VLOG(3) << "Transformer model remove_padding shape check failed, return "
+      VLOG(3) << "Activation: Transformer model remove_padding "
+                 "shape(activation_input_shape.size()) check failed, return "
                 "remove_padding pass.";
       return;
     }
@@ -465,7 +469,8 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
       check_flag = false;
     }
     if (!check_flag) {
-      VLOG(3) << "Transformer model remove_padding shape check failed, return "
+      VLOG(3) << "Activation: Transformer model remove_padding "
+                 "shape(activation_input_shape[i]) check failed, return "
                 "remove_padding pass.";
       return;
     }
@@ -530,6 +535,7 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
     std::vector<int64_t> skip_layernorm_x_shape =
         preln_skip_layernorm_x->Var()->GetShape();
+    check_flag = true;
     if (skip_layernorm_x_shape.size() != multihead_matmul_input_shape.size()) {
       check_flag = false;
       VLOG(3) << "Transformer model remove_padding shape check failed, return "
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index 81c34ae29c05a7..022ba1483b955d 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -88,6 +88,11 @@ void Tensor::ReshapeStrings(const size_t &shape) {
 
 template <typename T>
 T *Tensor::mutable_data(PlaceType place) {
+#ifdef PADDLE_WITH_ONNXRUNTIME
+  if (is_ort_tensor_) {
+    return ORTGetMutableData<T>();
+  }
+#endif
   EAGER_GET_TENSOR(paddle::framework::LoDTensor);
   PADDLE_ENFORCE_GT(
       tensor->numel(),
@@ -720,6 +725,17 @@ void Tensor::SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding) {
   binding_ = binding;
 }
 
+template <typename T>
+T *Tensor::ORTGetMutableData() {
+  auto binding = binding_.lock();
+  PADDLE_ENFORCE_NOT_NULL(binding,
+                          paddle::platform::errors::PreconditionNotMet(
+                              "output tensor [%s] has no binding ptr", name_));
+  std::vector<Ort::Value> outputs = binding->GetOutputValues();
+  Ort::Value &value = outputs[idx_];
+  return value.GetTensorMutableData<T>();
+}
+
 template <typename T>
 void Tensor::ORTCopyToCpu(T *data) const {
   auto binding = binding_.lock();
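Note on the new ORT path: with the `is_ort_tensor_` branch above, `mutable_data<T>` hands back the ONNX Runtime output buffer directly instead of a LoDTensor allocation. A minimal consumer-side sketch, assuming a predictor already configured with the ONNX Runtime backend (the output name and setup are illustrative, not part of this diff):

```cpp
// Sketch only: exercise Tensor::mutable_data<T>() on an ORT-backed output.
auto output = predictor->GetOutputHandle("fetch_name");  // hypothetical name
float* raw = output->mutable_data<float>(paddle_infer::PlaceType::kCPU);
// With is_ort_tensor_ set, `raw` aliases the Ort::Value's buffer via
// ORTGetMutableData<float>(), so no extra copy is made (contrast with
// ORTCopyToCpu, which copies into a caller-provided buffer).
```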
diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h
index d96148abd3b560..b10f051d6e44e4 100644
--- a/paddle/fluid/inference/api/paddle_tensor.h
+++ b/paddle/fluid/inference/api/paddle_tensor.h
@@ -198,6 +198,9 @@ class PD_INFER_DECL Tensor {
 
   void SetOrtBinding(const std::shared_ptr<Ort::IoBinding> binding);
 
+  template <typename T>
+  T* ORTGetMutableData();
+
   template <typename T>
   void ORTCopyFromCpu(const T* data);
 
diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
index cb6797c4e2a71d..2044c8acdd4b00 100644
--- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -60,6 +60,50 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
           std::vector<std::string>{word_id_name, pos_id_name, sent_id_name};
       emb_names =
           std::vector<std::string>{word_emb_name, pos_emb_name, sent_emb_name};
+
+      auto mask_id_tensor = engine_->GetITensor("mask_id");
+      auto mask_dims = mask_id_tensor->getDimensions();
+      auto slice_start_dims = mask_dims;
+      auto slice_stride_dims = mask_dims;
+
+      for (int i = 0; i < mask_dims.nbDims; i++) {
+        slice_start_dims.d[i] = 0;
+        slice_stride_dims.d[i] = 1;
+      }
+
+      auto* shape_tensor = Shape(mask_id_tensor);
+      std::vector<nvinfer1::ITensor*> size_vec_tensor;
+      for (int i = 0; i < mask_dims.nbDims; i++) {
+        size_vec_tensor.push_back(Add1DConstantLayer(1));
+      }
+      size_vec_tensor[1] = GetEleTensorOfShape(shape_tensor, 1);
+      auto size_tensor = Concat(size_vec_tensor);
+
+      auto slice_layer =
+          TRT_ENGINE_ADD_LAYER(engine_,
+                               Slice,
+                               *mask_id_tensor,
+                               slice_start_dims,
+                               slice_start_dims,
+                               slice_stride_dims);  // the second
+                                                    // slice_start_dims is a
+                                                    // placeholder size; the
+                                                    // real size is set below
+                                                    // via setInput(2)
+      slice_layer->setInput(2, *size_tensor);
+      slice_layer->setName(
+          ("Embeltwise_slice_layer (Output: slice_max_seqlen " +
+           op_desc.Output("Out")[0] + ")")
+              .c_str());
+      engine_->SetTensorDynamicRange(slice_layer->getOutput(0), 1.0f);
+
+      auto* reshape_layer =
+          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *slice_layer->getOutput(0));
+      nvinfer1::Dims shape_dim;
+      shape_dim.nbDims = 1;
+      shape_dim.d[0] = -1;
+      reshape_layer->setReshapeDimensions(shape_dim);
+      reshape_layer->setName(("Embeltwise_reshape_layer (Output: max_seqlen " +
+                              op_desc.Output("Out")[0] + ")")
+                                 .c_str());
+      engine_->SetTensorDynamicRange(reshape_layer->getOutput(0), 1.0f);
+      engine_->SetITensor("max_seqlen_tensor", reshape_layer->getOutput(0));
     } else {
       id_names = op_desc.Input("Ids");
       emb_names = op_desc.Input("Embs");
@@ -192,20 +236,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
         plugin_inputs.emplace_back(
             engine_->GetITensor(pos_id_name));  // cu_seqlens,
                                                 // eval_placeholder_2
-        auto max_seqlen_tensor = engine_->GetITensor(mask_id_name);
-        auto* shuffle_layer =
-            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor);
-        nvinfer1::Dims shape_dim;
-        shape_dim.nbDims = 1;
-        shape_dim.d[0] = -1;
-        shuffle_layer->setReshapeDimensions(shape_dim);
-        shuffle_layer->setName(
-            ("Embeltwise_Shuffle_reshape (Output: max_seqlen " +
-             op_desc.Output("Out")[0] + ")")
-                .c_str());
-        engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f);
-        plugin_inputs.emplace_back(
-            shuffle_layer->getOutput(0));  // max_seqlen, eval_placeholder_3
+        plugin_inputs.emplace_back(engine_->GetITensor(
+            "max_seqlen_tensor"));  // max_seqlen, eval_placeholder_3
 
         auto creator = GetPluginRegistry()->getPluginCreator(
             "CustomEmbLayerNormPluginDynamic", "2");
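The Slice + Shuffle pair added above replaces the per-converter reshape of `mask_id`: it cuts `mask_id` (shape `[batch, seq_len]`) down to a single `[1, seq_len]` row and flattens it into the 1-D `max_seqlen_tensor` that later varseqlen converters fetch by name. A standalone sketch of the shape bookkeeping (plain C++, not TensorRT; names are illustrative):

```cpp
#include <vector>

// mask_id [B, S] -> slice(start=0, size=[1, S], stride=1) -> [1, S]
// -> reshape(-1) -> 1-D tensor of length S ("max_seqlen_tensor").
std::vector<int> MaxSeqlenShape(const std::vector<int>& mask_dims) {
  std::vector<int> sliced(mask_dims.size(), 1);  // size 1 on every axis...
  sliced[1] = mask_dims[1];                      // ...except the seq axis
  int numel = 1;
  for (int d : sliced) numel *= d;               // the Shuffle flattens to 1-D
  return {numel};                                // e.g. [8, 128] -> [128]
}
```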
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 298c551f3d45a2..9a3a9e3fc42829 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -1,11 +1,8 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
 http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc b/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc
index 92a74e65adb931..dba0d003c0fe55 100644
--- a/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc
@@ -23,7 +23,6 @@ class FusedTokenPruneOpConverter : public OpConverter {
                   bool test_mode) override {
     framework::OpDesc op_desc(op, nullptr);
     nvinfer1::ILayer* layer = nullptr;
-
     auto* Attn = engine_->GetITensor(op_desc.Input("Attn").front());
     auto* X = engine_->GetITensor(op_desc.Input("X").front());
     auto* Mask = engine_->GetITensor(op_desc.Input("Mask").front());
@@ -36,28 +35,54 @@ class FusedTokenPruneOpConverter : public OpConverter {
             op_desc.HasAttr("keep_order")
                 ? PADDLE_GET_CONST(bool, op_desc.GetAttr("keep_order"))
                 : false;
-
-    std::vector<nvinfer1::ITensor*> itensors = {Attn, X, Mask, NewMask};
-
     auto output_name = op_desc.Output("SlimmedX")[0];
     auto out_inds_name = op_desc.Output("CLSInds")[0];
     if (engine_->with_dynamic_shape()) {
-#if IS_TRT_VERSION_GE(6000)
       bool with_fp16 =
           engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
       if (engine_->precision() == AnalysisConfig::Precision::kInt8) {
         with_fp16 = true;
       }
+      bool flag_varseqlen = engine_->use_varseqlen();
       plugin::FusedTokenPrunePluginDynamic* plugin =
           new plugin::FusedTokenPrunePluginDynamic(
-              with_fp16, keep_first_token, keep_order);
-      layer = engine_->AddDynamicPlugin(itensors.data(), 4, plugin);
-#else
-      PADDLE_THROW(platform::errors::Fatal(
-          "You are running the TRT Dynamic Shape mode, need to confirm that "
-          "your TRT version is no less than 6.0"));
-#endif
+              with_fp16, keep_first_token, keep_order, flag_varseqlen);
+      if (flag_varseqlen) {
+        auto* word_id = engine_->GetITensor("word_id");
+        auto* pos_id = engine_->GetITensor("pos_id");
+        auto* mask_id = engine_->GetITensor("mask_id");
+        std::vector<nvinfer1::ITensor*> itensors = {
+            Attn, X, Mask, NewMask, word_id, pos_id, mask_id};
+        layer = engine_->AddDynamicPlugin(itensors.data(), 7, plugin);
+
+        layer->getOutput(0)->setName(output_name.c_str());
+        engine_->SetITensor(output_name, layer->getOutput(0));
+
+        layer->getOutput(1)->setName(out_inds_name.c_str());
+        engine_->SetITensor(out_inds_name, layer->getOutput(1));
+
+        engine_->DeleteITensor("word_id", word_id);
+        layer->getOutput(2)->setName("word_id_after_token_prune");
+        engine_->SetITensor("word_id", layer->getOutput(2));
+
+        engine_->DeleteITensor("pos_id", pos_id);
+        layer->getOutput(3)->setName("pos_id_after_token_prune");
+        engine_->SetITensor("pos_id", layer->getOutput(3));
+
+        engine_->DeleteITensor("mask_id", mask_id);
+        layer->getOutput(4)->setName("mask_id_after_token_prune");
+        engine_->SetITensor("mask_id", layer->getOutput(4));
+      } else {
+        std::vector<nvinfer1::ITensor*> itensors = {Attn, X, Mask, NewMask};
+        layer = engine_->AddDynamicPlugin(itensors.data(), 4, plugin);
+        layer->getOutput(0)->setName(output_name.c_str());
+        engine_->SetITensor(output_name, layer->getOutput(0));
+        layer->getOutput(1)->setName(out_inds_name.c_str());
+        engine_->SetITensor(out_inds_name, layer->getOutput(1));
+      }
+      layer->setName(
+          ("fused_token_prune(Output: " + output_name + ")").c_str());
     } else {
       PADDLE_THROW(platform::errors::Fatal(
           "You are running the Ernie(Bert) model in static shape mode, which "
@@ -65,8 +90,6 @@ class FusedTokenPruneOpConverter : public OpConverter {
           "You can use the config.SetTRTDynamicShapeInfo(...) interface to set "
          "the shape information to run the dynamic shape mode."));
     }
-    RreplenishLayerAndOutput(
-        layer, "fused_token_prune", {output_name, out_inds_name}, test_mode);
   }
 };
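For orientation, this is the binding layout the varseqlen branch above establishes; the enums are illustrative only (my reading of the converter, not part of the plugin API):

```cpp
// Slot order of FusedTokenPrunePluginDynamic in varseqlen mode.
enum VarseqlenInput { kAttn, kX, kMask, kNewMask, kWordId, kPosId, kMaskId };
enum VarseqlenOutput {
  kSlimmedX,   // pruned X
  kClsInds,    // indices of the kept tokens
  kWordIdOut,  // re-registered with the engine as "word_id"
  kPosIdOut,   // re-registered as "pos_id"
  kMaskIdOut   // re-registered as "mask_id"
};
```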
diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
index 5579844e1acf86..ce678bd4915697 100644
--- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -94,6 +94,8 @@ class MultiheadMatMulOpConverter : public OpConverter {
         nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT,
                                static_cast<void*>(bias_data),
                                static_cast<int32_t>(bias_t->numel())};
+        auto max_seqlen_tensor = engine_->GetITensor("max_seqlen_tensor");
+        auto pos_id_tensor = engine_->GetITensor("pos_id");
         if (engine_->with_interleaved()) {
           VLOG(4) << "fused multihead_matmul op: use_varseqlen and "
                      "with_interleaved";
@@ -154,31 +156,9 @@ class MultiheadMatMulOpConverter : public OpConverter {
 
           std::vector<nvinfer1::ITensor*> plugin_inputs;
           plugin_inputs.emplace_back(fc_layer->getOutput(0));
-          if (engine_->Has("ernie_pos_name")) {
-            plugin_inputs.emplace_back(engine_->GetITensor(
-                engine_->Get<std::string>("ernie_pos_name")));
-          } else {
-            plugin_inputs.emplace_back(engine_->GetITensor(
-                engine_->network()
-                    ->getInput(2)
-                    ->getName()));  // cu_seqlens, eval_placeholder_2
-          }
-          auto max_seqlen_tensor =
-              engine_->GetITensor(engine_->network()->getInput(3)->getName());
-          engine_->SetTensorDynamicRange(max_seqlen_tensor, 1.0f);
-          auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(
-              engine_,
-              Shuffle,
-              *const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
-          nvinfer1::Dims shape_dim;
-          shape_dim.nbDims = 1;
-          shape_dim.d[0] = -1;
-          shuffle_layer->setReshapeDimensions(shape_dim);
-          engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f);
+          plugin_inputs.emplace_back(pos_id_tensor);
           plugin_inputs.emplace_back(
-              shuffle_layer->getOutput(0));  // max_seqlen, eval_placeholder_3
-          shuffle_layer->setName(
-              ("Multihead: Shuffle: (Output: " + output_name + ")").c_str());
+              max_seqlen_tensor);  // max_seqlen, eval_placeholder_3
           auto plugin_layer = engine_->network()->addPluginV2(
               plugin_inputs.data(), plugin_inputs.size(), *plugin);
           layer = plugin_layer;
@@ -299,20 +279,9 @@ class MultiheadMatMulOpConverter : public OpConverter {
           std::vector<nvinfer1::ITensor*> plugin_inputs;
           plugin_inputs.emplace_back(fc_layer->getOutput(0));
           plugin_inputs.emplace_back(engine_->GetITensor("qkv_plugin_mask"));
-          plugin_inputs.emplace_back(engine_->GetITensor("pos_id"));
-
-          auto max_seqlen_tensor = engine_->GetITensor("mask_id");
-          auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(
-              engine_,
-              Shuffle,
-              *const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
-          nvinfer1::Dims shape_dim;
-          shape_dim.nbDims = 1;
-          shape_dim.d[0] = -1;
-          shuffle_layer->setReshapeDimensions(shape_dim);
-          engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f);
+          plugin_inputs.emplace_back(pos_id_tensor);
           plugin_inputs.emplace_back(
-              shuffle_layer->getOutput(0));  // max_seqlen, eval_placeholder_3
+              max_seqlen_tensor);  // max_seqlen, eval_placeholder_3
 
           auto plugin_layer = engine_->network()->addPluginV2(
               plugin_inputs.data(), plugin_inputs.size(), *plugin);
diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
index 38a811b8f521b1..8079915b9e01ce 100644
--- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc
@@ -157,20 +157,47 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter {
         plugin_inputs.emplace_back(
             engine_->GetITensor(pos_id_name));  // cu_seqlens,
                                                 // eval_placeholder_2
-        auto max_seqlen_tensor = engine_->GetITensor(mask_id_name);
-        auto* shuffle_layer =
-            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor);
+        auto mask_id_tensor = engine_->GetITensor("mask_id");
+        auto mask_dims = mask_id_tensor->getDimensions();
+        auto slice_start_dims = mask_dims;
+        auto slice_size_dims = mask_dims;
+        auto slice_stride_dims = mask_dims;
+
+        for (int i = 0; i < mask_dims.nbDims; i++) {
+          slice_start_dims.d[i] = 0;
+          slice_size_dims.d[i] = 1;
+          slice_stride_dims.d[i] = 1;
+        }
+        slice_size_dims.d[1] = mask_dims.d[1];
+        auto* slice_size_tensor = Add1DConstantLayer(slice_size_dims);
+        auto slice_layer =
+            TRT_ENGINE_ADD_LAYER(engine_,
+                                 Slice,
+                                 *mask_id_tensor,
+                                 slice_start_dims,
+                                 slice_start_dims,
+                                 slice_stride_dims);  // the second
+                                                      // slice_start_dims is a
+                                                      // placeholder size; the
+                                                      // real size is set below
+                                                      // via setInput(2)
+        slice_layer->setInput(2, *slice_size_tensor);
+        slice_layer->setName(
+            ("PrelnEmbeltwise_slice_layer (Output: slice_max_seqlen " +
+             op_desc.Output("Out")[0] + ")")
+                .c_str());
+        engine_->SetTensorDynamicRange(slice_layer->getOutput(0), 1.0f);
+
+        auto* reshape_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *slice_layer->getOutput(0));
         nvinfer1::Dims shape_dim;
         shape_dim.nbDims = 1;
         shape_dim.d[0] = -1;
-        shuffle_layer->setReshapeDimensions(shape_dim);
-        shuffle_layer->setName(
-            ("PrelnEmbeltwise_Shuffle_reshape (Output: max_seqlen " +
-             op_desc.Output("Out_0")[0] + ")")
+        reshape_layer->setReshapeDimensions(shape_dim);
+        reshape_layer->setName(
+            ("PrelnEmbeltwise_reshape_layer (Output: max_seqlen " +
+             op_desc.Output("Out")[0] + ")")
                 .c_str());
-        engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f);
+        engine_->SetTensorDynamicRange(reshape_layer->getOutput(0), 1.0f);
+        engine_->SetITensor("max_seqlen_tensor", reshape_layer->getOutput(0));
         plugin_inputs.emplace_back(
-            shuffle_layer->getOutput(0));  // max_seqlen, eval_placeholder_3
+            reshape_layer->getOutput(0));  // max_seqlen, eval_placeholder_3
 
         auto creator = GetPluginRegistry()->getPluginCreator(
             "CustomEmbLayerNormPluginDynamic", "3");
diff --git a/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc
index 086cae495b522f..dab1bc2c91023b 100644
--- a/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc
@@ -111,6 +111,8 @@ class SparseMultiheadMatMulOpConverter : public OpConverter {
           nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT,
                                  static_cast<void*>(bias_data),
                                  static_cast<int32_t>(bias_t->numel())};
+          auto max_seqlen_tensor = engine_->GetITensor("max_seqlen_tensor");
+          auto pos_id_tensor = engine_->GetITensor("pos_id");
           if (engine_->with_interleaved()) {
             VLOG(4) << "fused multihead_matmul op: use_varseqlen and "
                        "with_interleaved";
@@ -171,31 +173,9 @@ class SparseMultiheadMatMulOpConverter : public OpConverter {
 
            std::vector<nvinfer1::ITensor*> plugin_inputs;
            plugin_inputs.emplace_back(fc_layer->getOutput(0));
-           if (engine_->Has("ernie_pos_name")) {
-             plugin_inputs.emplace_back(engine_->GetITensor(
-                 engine_->Get<std::string>("ernie_pos_name")));
-           } else {
-             plugin_inputs.emplace_back(engine_->GetITensor(
-                 engine_->network()
-                     ->getInput(2)
-                     ->getName()));  // cu_seqlens, eval_placeholder_2
-           }
-           auto max_seqlen_tensor =
-               engine_->GetITensor(engine_->network()->getInput(3)->getName());
-           engine_->SetTensorDynamicRange(max_seqlen_tensor, 1.0f);
-           auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(
-               engine_,
-               Shuffle,
-               *const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
-           nvinfer1::Dims shape_dim;
-           shape_dim.nbDims = 1;
-           shape_dim.d[0] = -1;
-           shuffle_layer->setReshapeDimensions(shape_dim);
-           engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f);
+           plugin_inputs.emplace_back(pos_id_tensor);
           plugin_inputs.emplace_back(
-               shuffle_layer->getOutput(0));  // max_seqlen, eval_placeholder_3
-           shuffle_layer->setName(
-               ("Multihead: Shuffle: (Output: " + output_name + ")").c_str());
+               max_seqlen_tensor);  // max_seqlen, eval_placeholder_3
            auto plugin_layer = engine_->network()->addPluginV2(
                plugin_inputs.data(), plugin_inputs.size(), *plugin);
            layer = plugin_layer;
@@ -316,21 +296,9 @@ class SparseMultiheadMatMulOpConverter : public OpConverter {
           std::vector<nvinfer1::ITensor*> plugin_inputs;
           plugin_inputs.emplace_back(fc_layer->getOutput(0));
           plugin_inputs.emplace_back(engine_->GetITensor("qkv_plugin_mask"));
-          plugin_inputs.emplace_back(engine_->GetITensor("pos_id"));
-
-          auto max_seqlen_tensor = engine_->GetITensor("mask_id");
-          auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(
-              engine_,
-              Shuffle,
-              *const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
-          nvinfer1::Dims shape_dim;
-          shape_dim.nbDims = 1;
-          shape_dim.d[0] = -1;
-          shuffle_layer->setReshapeDimensions(shape_dim);
-          engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f);
+          plugin_inputs.emplace_back(pos_id_tensor);
           plugin_inputs.emplace_back(
-              shuffle_layer->getOutput(0));  // max_seqlen, eval_placeholder_3
-
+              max_seqlen_tensor);  // max_seqlen, eval_placeholder_3
           auto plugin_layer = engine_->network()->addPluginV2(
               plugin_inputs.data(), plugin_inputs.size(), *plugin);
           layer = plugin_layer;
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index a4d373e83b3552..0bba420a2083af 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -410,6 +410,19 @@ void TensorRTEngine::DeclareOutput(const std::string &name) {
                         name));
   network()->markOutput(*output);
 }
+void TensorRTEngine::DeleteITensor(const std::string &name,
+                                   nvinfer1::ITensor *tensor) {
+  PADDLE_ENFORCE_NOT_NULL(
+      tensor,
+      platform::errors::InvalidArgument(
+          "Tensor named %s of TRT engine should not be null.", name));
+  PADDLE_ENFORCE_EQ(
+      true,
+      itensor_map_.count(name),
+      platform::errors::InvalidArgument(
+          "Tensor named %s is not registered in the TRT engine.", name));
+  itensor_map_.erase(name);
+}
 
 void TensorRTEngine::SetITensor(const std::string &name,
                                 nvinfer1::ITensor *tensor) {
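`DeleteITensor` only removes the name from `itensor_map_`; paired with `SetITensor` it lets a converter re-point a well-known name at a new tensor, which is exactly how the fused_token_prune converter rebinds the id tensors above. A hedged fragment of that pattern (assumes converter context, i.e. `engine_` and `layer` in scope):

```cpp
// Rebinding sketch: downstream converters that call GetITensor("pos_id")
// will now receive the pruned ids produced by this layer.
auto* stale_pos_id = engine_->GetITensor("pos_id");
engine_->DeleteITensor("pos_id", stale_pos_id);      // drop the old mapping
engine_->SetITensor("pos_id", layer->getOutput(3));  // register the new one
```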
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 73506eb8f6244d..fcd28ec749cd8f 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -278,6 +278,7 @@ class TensorRTEngine {
   void DeclareOutput(const std::string& name);
   void ClearTensorMap() { itensor_map_.clear(); }
 
+  void DeleteITensor(const std::string& name, nvinfer1::ITensor* tensor);
   void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
   // Get an ITensor called name.
   nvinfer1::ITensor* GetITensor(const std::string& name);
diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu
index c10ab7277e7888..f52ef0c52ff0e0 100644
--- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu
@@ -177,22 +177,75 @@ __global__ void TakeAlongAxis(const T* src,
   }
 }
 
+__global__ void pos_id_prune_kernel(const int32_t* src,
+                                    int32_t* dst,
+                                    int pos_nums,
+                                    float scale) {
+  dst[0] = 0;
+  for (int i = 1; i < pos_nums; i++) {
+    dst[i] =
+        dst[i - 1] + max(static_cast<int>((src[i] - src[i - 1]) * scale), 2);
+  }
+}
+
 nvinfer1::DimsExprs FusedTokenPrunePluginDynamic::getOutputDimensions(
     int output_index,
     const nvinfer1::DimsExprs* inputs,
     int nb_inputs,
     nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT {
   auto x_dims = inputs[1], new_mask_dims = inputs[3];
-  if (output_index == 0) {
-    nvinfer1::DimsExprs ret = x_dims;
-    ret.d[1] = new_mask_dims.d[2];
-    return ret;
+  if (flag_varseqlen_) {
+    if (output_index == 0) {
+      nvinfer1::DimsExprs ret = x_dims;
+      ret.d[1] = new_mask_dims.d[2];
+      return ret;
+    } else if (output_index == 1) {
+      nvinfer1::DimsExprs ret;
+      ret.nbDims = 2;
+      ret.d[0] = new_mask_dims.d[0];
+      ret.d[1] = new_mask_dims.d[2];
+      return ret;
+    } else if (output_index == 2) {
+      // word id
+      nvinfer1::DimsExprs ret;
+      ret.nbDims = 1;
+      // max sum of seqlen: pre_seqlen * new_mask[2] / mask[1] + 2 * batches
+      const auto* two = expr_builder.constant(2);
+      ret.d[0] = expr_builder.operation(
+          nvinfer1::DimensionOperation::kSUM,
+          *expr_builder.operation(
+              nvinfer1::DimensionOperation::kFLOOR_DIV,
+              *expr_builder.operation(nvinfer1::DimensionOperation::kPROD,
+                                      *inputs[4].d[0],
+                                      *new_mask_dims.d[2]),
+              *inputs[6].d[1]),
+          *expr_builder.operation(
+              nvinfer1::DimensionOperation::kPROD, *two, *inputs[6].d[0]));
+      return ret;
+    } else if (output_index == 3) {
+      // pos id
+      nvinfer1::DimsExprs ret = inputs[5];
+      return ret;
+    } else if (output_index == 4) {
+      // mask id
+      nvinfer1::DimsExprs ret;
+      ret.nbDims = 2;
+      ret.d[0] = inputs[6].d[0];
+      ret.d[1] = new_mask_dims.d[2];
+      return ret;
+    }
   } else {
-    nvinfer1::DimsExprs ret;
-    ret.nbDims = 2;
-    ret.d[0] = new_mask_dims.d[0];
-    ret.d[1] = new_mask_dims.d[2];
-    return ret;
+    if (output_index == 0) {
+      nvinfer1::DimsExprs ret = x_dims;
+      ret.d[1] = new_mask_dims.d[2];
+      return ret;
+    } else {
+      nvinfer1::DimsExprs ret;
+      ret.nbDims = 2;
+      ret.d[0] = new_mask_dims.d[0];
+      ret.d[1] = new_mask_dims.d[2];
+      return ret;
+    }
   }
 }
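`pos_id_prune_kernel` rewrites the cumulative `pos_id` (cu_seqlens) offsets after pruning: each per-sequence length is scaled by the keep ratio with a floor of 2 tokens, then re-accumulated. A host-side reference with a worked example (my reading of the kernel above, for illustration only):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Host mirror of pos_id_prune_kernel; src holds cumulative sequence offsets.
std::vector<int32_t> PosIdPruneRef(const std::vector<int32_t>& src,
                                   float scale) {
  std::vector<int32_t> dst(src.size());
  dst[0] = 0;
  for (size_t i = 1; i < src.size(); ++i) {
    int32_t gap = static_cast<int32_t>((src[i] - src[i - 1]) * scale);
    dst[i] = dst[i - 1] + std::max(gap, 2);  // keep at least 2 tokens
  }
  return dst;
}
// e.g. src = {0, 10, 16}, scale = 0.5  ->  dst = {0, 5, 8}
```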
@@ -215,26 +268,53 @@ bool FusedTokenPrunePluginDynamic::supportsFormatCombination(
                         nb_inputs + nb_outputs));
   const nvinfer1::PluginTensorDesc& in = in_out[pos];
-  if (pos == 0) {
-    if (with_fp16_) {
+  if (flag_varseqlen_) {
+    if (pos == 0) {
+      if (with_fp16_) {
 #ifdef TRT_PLUGIN_FP16_AVALIABLE
-      return (in.type == nvinfer1::DataType::kFLOAT ||
-              in.type == nvinfer1::DataType::kHALF) &&
-             (in.format == nvinfer1::TensorFormat::kLINEAR);
+        return (in.type == nvinfer1::DataType::kFLOAT ||
+                in.type == nvinfer1::DataType::kHALF) &&
+               (in.format == nvinfer1::TensorFormat::kLINEAR);
 #else
-      return (in.type == nvinfer1::DataType::kFLOAT) &&
-             (in.format == nvinfer1::TensorFormat::kLINEAR);
+        return (in.type == nvinfer1::DataType::kFLOAT) &&
+               (in.format == nvinfer1::TensorFormat::kLINEAR);
 #endif
+      } else {
+        return (in.type == nvinfer1::DataType::kFLOAT) &&
+               (in.format == nvinfer1::TensorFormat::kLINEAR);
+      }
+    } else if (pos <= 3 || pos == 7) {
+      const nvinfer1::PluginTensorDesc& prev = in_out[0];
+      return in.type == prev.type && in.format == prev.format;
+    } else if (pos == 6 || pos == 11) {  // mask_id, mask_id_out
+      return in.type == nvinfer1::DataType::kFLOAT &&
+             in.format == nvinfer1::TensorFormat::kLINEAR;
     } else {
-      return (in.type == nvinfer1::DataType::kFLOAT) &&
-             (in.format == nvinfer1::TensorFormat::kLINEAR);
+      return in.type == nvinfer1::DataType::kINT32 &&
+             in.format == nvinfer1::TensorFormat::kLINEAR;
     }
-  } else if (pos <= 4) {
-    const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1];
-    return in.type == prev.type && in.format == prev.format;
   } else {
-    const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1];
-    return in.type == nvinfer1::DataType::kINT32 && in.format == prev.format;
+    if (pos == 0) {
+      if (with_fp16_) {
+#ifdef TRT_PLUGIN_FP16_AVALIABLE
+        return (in.type == nvinfer1::DataType::kFLOAT ||
+                in.type == nvinfer1::DataType::kHALF) &&
+               (in.format == nvinfer1::TensorFormat::kLINEAR);
+#else
+        return (in.type == nvinfer1::DataType::kFLOAT) &&
+               (in.format == nvinfer1::TensorFormat::kLINEAR);
+#endif
+      } else {
+        return (in.type == nvinfer1::DataType::kFLOAT) &&
+               (in.format == nvinfer1::TensorFormat::kLINEAR);
+      }
+    } else if (pos <= 4) {
+      const nvinfer1::PluginTensorDesc& prev = in_out[0];
+      return in.type == prev.type && in.format == prev.format;
+    } else {
+      return in.type == nvinfer1::DataType::kINT32 &&
+             in.format == nvinfer1::TensorFormat::kLINEAR;
+    }
   }
 }
@@ -242,10 +322,22 @@ nvinfer1::DataType FusedTokenPrunePluginDynamic::getOutputDataType(
     int index,
     const nvinfer1::DataType* input_types,
     int nb_inputs) const TRT_NOEXCEPT {
-  if (index == 0) {
-    return input_types[1];
-  } else if (index == 1) {
-    return nvinfer1::DataType::kINT32;
+  if (flag_varseqlen_) {
+    if (index == 0) {
+      return input_types[1];
+    } else if (index == 4) {
+      return nvinfer1::DataType::kFLOAT;
+    } else {
+      // index = 1,2,3
+      return nvinfer1::DataType::kINT32;
+    }
+  } else {
+    if (index == 0) {
+      return input_types[1];
+    } else {
+      // index = 1
+      return nvinfer1::DataType::kINT32;
+    }
   }
 }
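Read together with the converter, the varseqlen checks above form a fixed position-to-dtype table over 7 inputs followed by 5 outputs (positions 0-11). A condensed, hypothetical restatement in plain C++:

```cpp
enum class Expected { kAttnDType, kFloat, kInt32 };

// Inputs 0-6: Attn/X/Mask/NewMask/word_id/pos_id/mask_id;
// outputs 7-11: SlimmedX/CLSInds/word_id'/pos_id'/mask_id'.
inline Expected ExpectedType(int pos) {
  if (pos <= 3 || pos == 7) return Expected::kAttnDType;  // follows in_out[0]
  if (pos == 6 || pos == 11) return Expected::kFloat;     // mask_id slots
  return Expected::kInt32;                                // ids and indices
}
```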
@@ -273,15 +365,16 @@ size_t FusedTokenPrunePluginDynamic::getWorkspaceSize(
 }
 
 template <typename T>
-int FusedTokenPrunePluginDynamic::enqueueImpl(
-    const nvinfer1::PluginTensorDesc* input_desc,
-    const nvinfer1::PluginTensorDesc* output_desc,
-    const void* const* inputs,
-    void* const* outputs,
-    void* workspace_ptr,
-    cudaStream_t stream,
-    int device_id,
-    T max_value) {
+inline void enqueueImpl(const nvinfer1::PluginTensorDesc* input_desc,
+                        const nvinfer1::PluginTensorDesc* output_desc,
+                        const void* const* inputs,
+                        void* const* outputs,
+                        void* workspace_ptr,
+                        cudaStream_t stream,
+                        int device_id,
+                        T max_value,
+                        bool keep_first_token_,
+                        bool keep_order_) {
   // Dims
   auto attn_dims = input_desc[0].dims;
   auto x_dims = input_desc[1].dims;
@@ -462,8 +555,14 @@
                                             slimmed_x_len,
                                             c);
   }
+}
 
-  return cudaGetLastError() != cudaSuccess;
+inline void pos_id_prune(const int32_t* input,
+                         int32_t* output,
+                         int pos_nums,
+                         float scale,
+                         cudaStream_t stream) {
+  pos_id_prune_kernel<<<1, 1, 0, stream>>>(input, output, pos_nums, scale);
 }
 
 int FusedTokenPrunePluginDynamic::enqueue(
@@ -485,14 +584,16 @@
 
     float max = std::numeric_limits<float>::max();
 
-    return enqueueImpl<float>(input_desc,
-                              output_desc,
-                              inputs,
-                              outputs,
-                              workspace,
-                              stream,
-                              device_id,
-                              max);
+    enqueueImpl<float>(input_desc,
+                       output_desc,
+                       inputs,
+                       outputs,
+                       workspace,
+                       stream,
+                       device_id,
+                       max,
+                       keep_first_token_,
+                       keep_order_);
 
   } else if (input_type == nvinfer1::DataType::kHALF) {
 #ifdef TRT_PLUGIN_FP16_AVALIABLE
@@ -500,14 +601,16 @@
 
     half max = 65504.0;
 
-    return enqueueImpl<half>(input_desc,
-                             output_desc,
-                             inputs,
-                             outputs,
-                             workspace,
-                             stream,
-                             device_id,
-                             max);
+    enqueueImpl<half>(input_desc,
+                      output_desc,
+                      inputs,
+                      outputs,
+                      workspace,
+                      stream,
+                      device_id,
+                      max,
+                      keep_first_token_,
+                      keep_order_);
 
 #else
     PADDLE_THROW(platform::errors::Fatal(
@@ -522,6 +625,17 @@
         platform::errors::Fatal("The FusedTokenPrune TRT Plugin's input type "
                                 "should be float or half."));
   }
+  if (flag_varseqlen_) {
+    float scale =
+        static_cast<float>(input_desc[3].dims.d[2]) / input_desc[6].dims.d[1];
+    // outputs[2]=inputs[4];  // word_id
+    const int32_t* inputs5 = static_cast<const int32_t*>(inputs[5]);
+    int32_t* outputs3 = static_cast<int32_t*>(outputs[3]);
+    pos_id_prune(
+        inputs5, outputs3, input_desc[5].dims.d[0], scale, stream);  // pos_id
+    // outputs[4]=inputs[6];  // new_mask
+  }
+  return cudaGetLastError() != cudaSuccess;
 }
 
 #endif
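The `scale` passed to `pos_id_prune` is the keep ratio: the pruned sequence length from `new_mask` over the original maximum sequence length from `mask_id`. A tiny self-contained check with illustrative numbers:

```cpp
#include <cassert>

int main() {
  int kept_tokens = 64;    // input_desc[3].dims.d[2] (new_mask)
  int orig_seq_len = 128;  // input_desc[6].dims.d[1] (mask_id)
  float scale = static_cast<float>(kept_tokens) / orig_seq_len;
  assert(scale == 0.5f);   // each sequence keeps roughly half its tokens
  return 0;
}
```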
diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h
index fcd91522ca39cb..0b32e8a552bb7e 100644
--- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h
@@ -28,34 +28,45 @@ class FusedTokenPrunePluginDynamic : public DynamicPluginTensorRT {
  public:
   explicit FusedTokenPrunePluginDynamic(bool with_fp16,
                                         bool keep_first_token,
-                                        bool keep_order)
-      : keep_first_token_(keep_first_token), keep_order_(keep_order) {
+                                        bool keep_order,
+                                        bool flag_varseqlen)
+      : keep_first_token_(keep_first_token),
+        keep_order_(keep_order),
+        flag_varseqlen_(flag_varseqlen) {
     with_fp16_ = with_fp16;
   }
   FusedTokenPrunePluginDynamic(void const* serial_data, size_t serial_length) {
     DeserializeValue(&serial_data, &serial_length, &with_fp16_);
     DeserializeValue(&serial_data, &serial_length, &keep_first_token_);
     DeserializeValue(&serial_data, &serial_length, &keep_order_);
+    DeserializeValue(&serial_data, &serial_length, &flag_varseqlen_);
   }
   nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
     return new FusedTokenPrunePluginDynamic(
-        with_fp16_, keep_first_token_, keep_order_);
+        with_fp16_, keep_first_token_, keep_order_, flag_varseqlen_);
   }
 
   const char* getPluginType() const TRT_NOEXCEPT override {
     return "fused_token_prune_plugin_dynamic";
   }
-  int getNbOutputs() const TRT_NOEXCEPT override { return 2; }
+  int getNbOutputs() const TRT_NOEXCEPT override {
+    if (flag_varseqlen_) {
+      return 5;
+    } else {
+      return 2;
+    }
+  }
   int initialize() TRT_NOEXCEPT override { return 0; }
 
   size_t getSerializationSize() const TRT_NOEXCEPT override {
     return SerializedSize(with_fp16_) + SerializedSize(keep_first_token_) +
-           SerializedSize(keep_order_);
+           SerializedSize(keep_order_) + SerializedSize(flag_varseqlen_);
   }
 
   void serialize(void* buffer) const TRT_NOEXCEPT override {
     SerializeValue(&buffer, with_fp16_);
     SerializeValue(&buffer, keep_first_token_);
     SerializeValue(&buffer, keep_order_);
+    SerializeValue(&buffer, flag_varseqlen_);
   }
 
   nvinfer1::DimsExprs getOutputDimensions(
@@ -95,17 +106,9 @@ class FusedTokenPrunePluginDynamic : public DynamicPluginTensorRT {
   void destroy() TRT_NOEXCEPT override { delete this; }
 
  private:
-  template <typename T>
-  int enqueueImpl(const nvinfer1::PluginTensorDesc* input_desc,
-                  const nvinfer1::PluginTensorDesc* output_desc,
-                  const void* const* inputs,
-                  void* const* outputs,
-                  void* workspace,
-                  cudaStream_t stream,
-                  int device_id,
-                  T max_value);
   bool keep_first_token_;
   bool keep_order_;
+  bool flag_varseqlen_;
 };
 
 class FusedTokenPrunePluginDynamicCreator : public nvinfer1::IPluginCreator {
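Since deserialization replays the fields in write order, `flag_varseqlen_` must stay appended last, and engines serialized before this change are not loadable by the new plugin (the stream is now one bool longer). Order sketch, using the same helpers as the code above:

```cpp
// Field order shared by serialize() and the deserializing constructor.
void SerializeFields(void* buf, bool fp16, bool first, bool order, bool vs) {
  SerializeValue(&buf, fp16);   // with_fp16_
  SerializeValue(&buf, first);  // keep_first_token_
  SerializeValue(&buf, order);  // keep_order_
  SerializeValue(&buf, vs);     // flag_varseqlen_ (new, must stay last)
}
```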
diff --git a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu
index 12bb579731e565..3963b48a26c6c7 100644
--- a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu
@@ -72,12 +72,16 @@ bool RecoverPaddingPlugin::supportsFormatCombination(
       platform::errors::InvalidArgument("Must have 1 output, "
                                         "but got %d output(s). ",
                                         nbOutputs));
-  if (pos == 1) {  // PosId, MaxSeqlen
+  if (pos == 1) {  // PosId
     return inOut[pos].type == nvinfer1::DataType::kINT32 &&
            inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
+  } else if (pos == 2) {  // mask_id
+    return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
+           inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
+  } else {
+    return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
+           inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
   }
-  return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
-         inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
   // return (inOut[pos].type == nvinfer1::DataType::kFLOAT && inOut[pos].format
   // == nvinfer1::TensorFormat::kLINEAR)||
   // (inOut[pos].type == nvinfer1::DataType::kHALF && inOut[pos].format ==
diff --git a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu
index 9625dc74b21610..418ecb015784fe 100644
--- a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu
@@ -72,9 +72,10 @@ bool RemovePaddingPlugin::supportsFormatCombination(
   if (pos == 1 || pos == 2) {  // pos_id, word_id
     return inOut[pos].type == nvinfer1::DataType::kINT32 &&
            inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
+  } else {
+    return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
+           inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
   }
-  return inOut[pos].type == nvinfer1::DataType::kFLOAT &&
-         inOut[pos].format == nvinfer1::TensorFormat::kLINEAR;
   // return (inOut[pos].type == nvinfer1::DataType::kFLOAT && inOut[pos].format
   // == nvinfer1::TensorFormat::kLINEAR)||
   // (inOut[pos].type == nvinfer1::DataType::kHALF && inOut[pos].format ==
diff --git a/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc
index 131ce46d89a661..4cc20c43659757 100644
--- a/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc
+++ b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc
@@ -22,8 +22,10 @@
 namespace tensorrt {
 namespace plugin {
 
 TEST(fused_token_prune_op_plugin, test_plugin) {
-  FusedTokenPrunePluginDynamic plugin(
-      true, /*keep_first_token*/ false, /*keep_order*/ true);
+  FusedTokenPrunePluginDynamic plugin(true,
+                                      /*keep_first_token*/ false,
+                                      /*keep_order*/ true,
+                                      /*flag_varseqlen*/ false);
   plugin.configurePlugin(nullptr, 4, nullptr, 2);
   plugin.initialize();
   plugin.getPluginType();
diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc
index 844086846c0257..6ac23e32856bec 100644
--- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc
@@ -293,8 +293,10 @@ TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) {
   auto *new_mask = engine_->DeclareInput(
       "new_mask", nvinfer1::DataType::kHALF, nvinfer1::Dims4{-1, 1, 2, 2});
   plugin::FusedTokenPrunePluginDynamic *plugin =
-      new plugin::FusedTokenPrunePluginDynamic(
-          true, /*keep_first_token*/ false, /*keep_order*/ true);
+      new plugin::FusedTokenPrunePluginDynamic(true,
+                                               /*keep_first_token*/ false,
+                                               /*keep_order*/ true,
+                                               /*flag_varseqlen*/ false);
   std::vector<nvinfer1::ITensor *> itensors = {attn, x, mask, new_mask};
   auto *layer = engine_->AddDynamicPlugin(itensors.data(), 4, plugin);
   PADDLE_ENFORCE_NOT_NULL(layer,
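The remaining files migrate hand-written `InferShape` bodies to phi `InferMeta` functions. The recurring pattern, shown here with placeholder names (prior_box and lu_unpack below are the real instances):

```cpp
// Declare a functor wrapping the phi InferMeta, then pass it to
// REGISTER_OPERATOR in place of an InferShape override.
DECLARE_INFER_SHAPE_FUNCTOR(my_op,  // hypothetical op
                            MyOpInferShapeFunctor,
                            PD_INFER_META(phi::MyOpInferMeta));
// REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker, MyOpInferShapeFunctor);
```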
diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc
index a6f2e03f14fd02..03733e34ec670b 100644
--- a/paddle/fluid/operators/detection/prior_box_op.cc
+++ b/paddle/fluid/operators/detection/prior_box_op.cc
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection/prior_box_op.h"
-
 #include <string>
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/infermeta/binary.h"
 
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -28,79 +29,6 @@ class PriorBoxOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "PriorBoxOp");
-    OP_INOUT_CHECK(ctx->HasInput("Image"), "Input", "Image", "PriorBoxOp");
-
-    auto image_dims = ctx->GetInputDim("Image");
-    auto input_dims = ctx->GetInputDim("Input");
-
-    PADDLE_ENFORCE_EQ(
-        image_dims.size(),
-        4,
-        platform::errors::InvalidArgument(
-            "The Input(Image) of Op(PriorBoxOp) should be a 4-D Tensor "
-            "and data format is NCHW. But received Image's dimensions = %d, "
-            "shape = [%s].",
-            image_dims.size(),
-            image_dims));
-    PADDLE_ENFORCE_EQ(
-        input_dims.size(),
-        4,
-        platform::errors::InvalidArgument(
-            "The Input(Input) of Op(PriorBoxOp) should be a 4-D Tensor "
-            "and data format is NCHW. But received Input's dimensions = %d, "
-            "shape = [%s].",
-            input_dims.size(),
-            input_dims));
-
-    auto min_sizes = ctx->Attrs().Get<std::vector<float>>("min_sizes");
-    auto max_sizes = ctx->Attrs().Get<std::vector<float>>("max_sizes");
-    auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
-    auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
-    bool flip = ctx->Attrs().Get<bool>("flip");
-
-    std::vector<float> aspect_ratios_vec;
-    ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec);
-
-    size_t num_priors = aspect_ratios_vec.size() * min_sizes.size();
-    if (max_sizes.size() > 0) {
-      PADDLE_ENFORCE_EQ(
-          max_sizes.size(),
-          min_sizes.size(),
-          platform::errors::InvalidArgument(
-              "The length of min_size and "
-              "max_size must be equal. But received: min_size's length is %d, "
-              "max_size's length is %d.",
-              min_sizes.size(),
-              max_sizes.size()));
-      num_priors += max_sizes.size();
-      for (size_t i = 0; i < max_sizes.size(); ++i) {
-        PADDLE_ENFORCE_GT(
-            max_sizes[i],
-            min_sizes[i],
-            platform::errors::InvalidArgument(
-                "max_size[%d] must be greater "
-                "than min_size[%d]. But received: max_size[%d] is %f, "
-                "min_size[%d] is %f.",
-                i,
-                i,
-                i,
-                max_sizes[i],
-                i,
-                min_sizes[i]));
-      }
-    }
-
-    std::vector<int64_t> dim_vec(4);
-    dim_vec[0] = input_dims[2];
-    dim_vec[1] = input_dims[3];
-    dim_vec[2] = num_priors;
-    dim_vec[3] = 4;
-    ctx->SetOutputDim("Boxes", phi::make_ddim(dim_vec));
-    ctx->SetOutputDim("Variances", phi::make_ddim(dim_vec));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -274,17 +202,18 @@
 }  // namespace operators
 }  // namespace paddle
 
+DECLARE_INFER_SHAPE_FUNCTOR(prior_box,
+                            PriorBoxInferShapeFunctor,
+                            PD_INFER_META(phi::PriorBoxInferMeta));
+
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(
     prior_box,
     ops::PriorBoxOp,
     ops::PriorBoxOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-
-REGISTER_OP_CPU_KERNEL(prior_box,
-                       ops::PriorBoxOpKernel<float>,
-                       ops::PriorBoxOpKernel<double>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    PriorBoxInferShapeFunctor);
 
 REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(prior_box,
                                     MKLDNN,
diff --git a/paddle/fluid/operators/detection/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu
index af246047529fcd..1cdf7691338294 100644
--- a/paddle/fluid/operators/detection/prior_box_op.cu
+++ b/paddle/fluid/operators/detection/prior_box_op.cu
@@ -193,8 +193,3 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
 
 }  // namespace operators
 }  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(prior_box,
-                        ops::PriorBoxOpCUDAKernel<float>,
-                        ops::PriorBoxOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/lu_op.cc b/paddle/fluid/operators/lu_op.cc
index 67bc9ba4fe774d..c6831f975c47a4 100644
--- a/paddle/fluid/operators/lu_op.cc
+++ b/paddle/fluid/operators/lu_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/lu_op.h"
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h
deleted file mode 100644
index 1122937b6efe36..00000000000000
--- a/paddle/fluid/operators/lu_op.h
+++ /dev/null
@@ -1,528 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/phi_utils.h"
-#include "paddle/fluid/operators/set_value_op.h"
-#include "paddle/fluid/operators/svd_helper.h"
-#include "paddle/phi/kernels/elementwise_add_kernel.h"
-#include "paddle/phi/kernels/elementwise_subtract_kernel.h"
-#include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
-#include "paddle/phi/kernels/funcs/tril_triu_compute.h"
-#include "paddle/phi/kernels/triangular_solve_kernel.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensorArray = framework::LoDTensorArray;
-
-template <typename DeviceContext, typename T, size_t D>
-void SetValueCompute(const framework::ExecutionContext& ctx,
-                     framework::Tensor* in,
-                     framework::Tensor* value_tensor,
-                     framework::Tensor* out,
-                     const std::vector<int64_t>& axes,
-                     std::vector<int64_t>* starts,
-                     std::vector<int64_t>* ends,
-                     const std::vector<int64_t>& shape) {
-  std::vector<int64_t> steps = {1, 1};
-  std::vector<int64_t> decrease_axes = {};
-  std::vector<int64_t> none_axes = {};
-
-  auto dtype = framework::TransToProtoVarType(in->dtype());
-
-  auto in_dims = in->dims();
-  phi::funcs::CheckAndUpdateSliceAttrs<int64_t>(
-      in_dims, axes, starts, ends, &steps);
-  auto slice_dims =
-      phi::funcs::GetSliceDims(in_dims, axes, *starts, *ends, &steps);
-  auto decrease_slice_dims =
-      phi::funcs::GetDecreasedDims(slice_dims, decrease_axes);
-
-  auto slice_dims_for_assign = decrease_slice_dims;
-  if (!none_axes.empty()) {
-    std::vector<int64_t> slice_dims_with_none;
-
-    size_t none_axes_cur = 0, decrease_axes_cur = 0;
-    for (int i = 0; i < slice_dims.size(); ++i) {
-      while (none_axes_cur < none_axes.size() &&
-             none_axes[none_axes_cur] <= i) {
-        slice_dims_with_none.push_back(1);
-        none_axes_cur++;
-      }
-      if (decrease_axes_cur < decrease_axes.size() &&
-          decrease_axes[decrease_axes_cur] == i) {
-        decrease_axes_cur++;
-      } else {
-        slice_dims_with_none.push_back(slice_dims[i]);
-      }
-    }
-    while (none_axes_cur < none_axes.size()) {
-      slice_dims_with_none.push_back(1);
-      none_axes_cur++;
-    }
-
-    slice_dims_for_assign = phi::make_ddim(slice_dims_with_none);
-  }
-
-  auto place = ctx.GetPlace();
-  auto& eigen_place =
-      *ctx.template device_context<DeviceContext>().eigen_device();
-
-  // Here copy data from input to avoid data loss at PE and Graph level.
-  // TODO(liym27): Speed up in the future version.
-  // - Q: Why don't call ShareDataWith to speed up?
-  // - A: Because it's not supported to ShareDataWith on OP's input and output
-  // https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-and-ShareBufferWith-are-prohibited-in-OP
-  // - Q: Why don't delete Input, after all, the input and output are the same
-  // Tensor at program level?
-  // - A: If deleting Input, the graph will be complex, such as there will
-  // be two ops points to the output in graph: op1 -> output <- set_value.
-  // In this case, we have to find a way to handle the running order of
-  // set_value is what we want.
-  paddle::framework::TensorCopy(*in, place, out);
-
-  Tensor slice_tensor(framework::TransToPhiDataType(dtype)),
-      pad_tensor(framework::TransToPhiDataType(dtype));
-  slice_tensor.mutable_data<T>(slice_dims, place);
-  pad_tensor.mutable_data<T>(in_dims, place);
-
-  auto pad_e = framework::EigenTensor<T, D>::From(pad_tensor, in_dims);
-  auto out_e = framework::EigenTensor<T, D>::From(*out);
-  auto slice_e = framework::EigenTensor<T, D>::From(slice_tensor, slice_dims);
-
-  // Step 1: Set the value of out at `_index` to zero
-  slice_e.device(eigen_place) = slice_e.constant(T(0));
-
-  auto starts_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-  auto ends_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-  auto strides_indices = Eigen::DSizes<Eigen::DenseIndex, D>();
-
-  for (size_t i = 0; i < D; ++i) {
-    starts_indices[i] = 0;
-    ends_indices[i] = slice_dims[i];
-    strides_indices[i] = 1;
-  }
-  for (size_t i = 0; i < axes.size(); i++) {
-    int axis_index = axes[i];
-    starts_indices[axis_index] = (*starts)[i];
-    ends_indices[axis_index] = (*ends)[i];
-    strides_indices[axis_index] = steps[i];
-    if ((*starts)[i] ==
-        (*ends)[i]) {  // slice is empty, data will not be changed
-      return;
-    }
-  }
-
-  out_e.stridedSlice(starts_indices, ends_indices, strides_indices)
-      .device(eigen_place) = slice_e;
-
-  // Step 2: Set a tensor with the same shape as out tensor. And its data at
-  // '_index' is the same as value_tensor, and data out of '_index' to zero
-
-  // - Step 2.1 Set slice tensor with value
-
-  // NOTE(liym27): [ Why resize slice_tensor here? ]
-  // A: When do broadcasting on slice_tensor and value_tensor, the shape of
-  // slice_tensor should be decreased dims.
-  // e.g.
-  //  x[:,0] = value_tensor
-  // x's shape = [3, 4], value_tensor's shape = [3]
-  // We get slice_dims = [3, 1], decrease_slice_dims = [3]
-  // If do broadcasting on Tensor with shape [3, 1] and [3], the result's
-  // shape is [3, 3], which cross the border;
-  // If do broadcasting on Tensor with shape [3] and [3], the result's shape
-  // is [3], which is right.
-
-  slice_tensor.Resize(slice_dims_for_assign);
-  if (value_tensor != nullptr) {
-    CheckIsDimsMatch(slice_dims_for_assign, value_tensor->dims());
-    // ElementwiseComputeEx can do broadcasting
-    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-        ctx, &slice_tensor, value_tensor, -1, SubFunctor<T>(), &slice_tensor);
-  } else {
-    Tensor value_t(framework::TransToPhiDataType(dtype));
-    auto value_dims = phi::make_ddim(shape);
-    CheckIsDimsMatch(slice_dims_for_assign, value_dims);
-
-    value_t.mutable_data<T>(value_dims, place);
-    auto value_name = GetValueName(dtype);
-    CopyVectorToTensor<T>(value_name.c_str(), &value_t, ctx);
-    value_t.Resize(value_dims);
-    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(
-        ctx, &slice_tensor, &value_t, -1, SubFunctor<T>(), &slice_tensor);
-  }
-  slice_tensor.Resize(slice_dims);
-
-  // - Step 2.2 Pad slice tensor with 0
-  pad_e.device(eigen_place) = pad_e.constant(T(0));
-  pad_e.stridedSlice(starts_indices, ends_indices, strides_indices)
-      .device(eigen_place) = slice_e;
-
-  // Step 3: Set out tensor with value_tensor
-  out_e.device(eigen_place) = out_e - pad_e;
-}
-
-template <typename DeviceContext, typename T>
-void SetValueCompute_dispatch(const framework::ExecutionContext& ctx,
-                              framework::Tensor* in,
-                              framework::Tensor* value_tensor,
-                              framework::Tensor* out,
-                              const std::vector<int64_t>& axes,
-                              std::vector<int64_t>* starts,
-                              std::vector<int64_t>* ends,
-                              const std::vector<int64_t>& shape,
-                              int rank) {
-  switch (rank) {
-    case 1:
-      SetValueCompute<DeviceContext, T, 1>(
-          ctx, in, value_tensor, out, axes, starts, ends, shape);
-      break;
-    case 2:
-      SetValueCompute<DeviceContext, T, 2>(
-          ctx, in, value_tensor, out, axes, starts, ends, shape);
-      break;
-    case 3:
-      SetValueCompute<DeviceContext, T, 3>(
-          ctx, in, value_tensor, out, axes, starts, ends, shape);
-      break;
-    case 4:
-      SetValueCompute<DeviceContext, T, 4>(
-          ctx, in, value_tensor, out, axes, starts, ends, shape);
-      break;
-    case 5:
-      SetValueCompute<DeviceContext, T, 5>(
-          ctx, in, value_tensor, out, axes, starts, ends, shape);
-      break;
-    case 6:
-      SetValueCompute<DeviceContext, T, 6>(
-          ctx, in, value_tensor, out, axes, starts, ends, shape);
-      break;
-    default:
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "The rank of input should be less than 7, but received %d.", rank));
-  }
-}
-
-template <typename DeviceContext, typename T>
-void Tensor_Conj(const DeviceContext& dev_ctx,
-                 const framework::Tensor& tensor,
-                 framework::Tensor* out) {
-  out->Resize(tensor.dims());
-  platform::ForRange<DeviceContext> out_for_range(dev_ctx, tensor.numel());
-  phi::funcs::ConjFunctor<T> out_functor(
-      tensor.data<T>(),
-      tensor.numel(),
-      out->mutable_data<T>(dev_ctx.GetPlace()));
-  out_for_range(out_functor);
-}
-
-template <typename DeviceContext, typename T>
-void Tensor_Add(const DeviceContext& dev_ctx,
-                const framework::Tensor& src1,
-                const framework::Tensor& src2,
-                framework::Tensor* out) {
-  out->Resize(src1.dims());
-  out->mutable_data<T>(dev_ctx.GetPlace());
-
-  phi::AddRawKernel<
-      T,
-      typename paddle::framework::ConvertToPhiContext<DeviceContext>::TYPE>(
-      static_cast<const typename paddle::framework::ConvertToPhiContext<
-          DeviceContext>::TYPE&>(dev_ctx),
-      src1,
-      src2,
-      -1,
-      out);
-}
-
-template <typename DeviceContext, typename T>
-void Tensor_Sub(const DeviceContext& dev_ctx,
-                const framework::Tensor& src1,
-                const framework::Tensor& src2,
-                framework::Tensor* out) {
-  out->Resize(src1.dims());
-  out->mutable_data<T>(dev_ctx.GetPlace());
-
-  phi::SubtractRawKernel<
-      T,
-      typename paddle::framework::ConvertToPhiContext<DeviceContext>::TYPE>(
-      static_cast<const typename paddle::framework::ConvertToPhiContext<
-          DeviceContext>::TYPE&>(dev_ctx),
-      src1,
-      src2,
-      -1,
-      out);
-}
-
-template <typename DeviceContext, typename T, size_t D>
-void SliceCompute(const framework::ExecutionContext& ctx,
-                  const framework::Tensor* in,
-                  framework::Tensor* out,
-                  const std::vector<int>& axes_int,
-                  const std::vector<int>& starts_int,
-                  const std::vector<int>& ends_int) {
-  std::vector<int64_t> axes(axes_int.begin(), axes_int.end());
-  std::vector<int64_t> starts(starts_int.begin(), starts_int.end());
-  std::vector<int64_t> ends(ends_int.begin(), ends_int.end());
-
-  std::vector<int> decrease_axis = {};
-  std::vector<int> infer_flags = {};
-
-  PADDLE_ENFORCE_EQ(
-      starts.size(),
-      axes.size(),
-      platform::errors::InvalidArgument(
-          "The size of starts must be equal to the size of axes."));
-  PADDLE_ENFORCE_EQ(ends.size(),
-                    axes.size(),
-                    platform::errors::InvalidArgument(
-                        "The size of ends must be equal to the size of axes."));
-
-  // Step 2: Compute output
-
-  auto in_dims = in->dims();
-  auto out_dims = out->dims();
-  auto slice_dims = out_dims;
-
-  // 2.1 Infer output dims
-  for (size_t i = 0; i < axes.size(); ++i) {
-    // when start == -1 && end == start+1
-    if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) {
-      auto ret = std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]);
-      if (ret != decrease_axis.end()) {
-        ends[i] = in_dims[axes[i]];
-      }
-    }
-  }
-
-  phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends);
-  slice_dims = phi::funcs::GetSliceDims<int64_t>(
-      in_dims, axes, starts, ends, nullptr, nullptr);
-  out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis);
-
-  // 2.2 Get output
-  auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
-  auto extents = Eigen::DSizes<Eigen::DenseIndex, D>();
-
-  for (size_t i = 0; i < D; ++i) {
-    offsets[i] = 0;
-    extents[i] = slice_dims[i];
-  }
-  for (size_t i = 0; i < axes.size(); ++i) {
-    offsets[axes[i]] = starts[i];
-  }
-
-  out->Resize(slice_dims);
-  out->mutable_data<T>(ctx.GetPlace());
-
-  auto in_t = framework::EigenTensor<T, D>::From(*in, in_dims);
-  auto out_t = framework::EigenTensor<T, D>::From(*out, slice_dims);
-  auto& eigen_place =
-      *ctx.template device_context<DeviceContext>().eigen_device();
-
-  if (in->numel() <= Eigen::NumTraits<int>::highest()) {
-    // similar to tf.slice:
-    // if element number less than INT_MAX, change the type of index to int
-    Eigen::DSizes<int, D> offsets_32bit, extents_32bit;
-    for (size_t i = 0; i < D; i++) {
-      offsets_32bit[i] = offsets[i];
-      extents_32bit[i] = extents[i];
-    }
-    EigenSlice<std::decay_t<decltype(eigen_place)>, T, D>::Eval(
-        eigen_place,
-        framework::To32BitIndex(out_t),
-        framework::To32BitIndex(in_t),
-        offsets_32bit,
-        extents_32bit);
-  } else {
-    EigenSlice<std::decay_t<decltype(eigen_place)>, T, D>::Eval(
-        eigen_place, out_t, in_t, offsets, extents);
-  }
-
-  out->Resize(out_dims);
-  out->mutable_data<T>(ctx.GetPlace());
-}
-
-template <typename DeviceContext, typename T>
-void Tensor_narrow(const framework::ExecutionContext& ctx,
-                   const framework::Tensor* src,
-                   framework::Tensor* out,
-                   int row_s,
-                   int row_e,
-                   int col_s,
-                   int col_e) {
-  auto rank = src->dims().size();
-  std::vector<int> axes_int = {rank - 2, rank - 1};
-  std::vector<int> starts_int = {row_s, col_s};
-  std::vector<int> ends_int = {row_e, col_e};
-  switch (rank) {
-    case 1:
-      SliceCompute<DeviceContext, T, 1>(
-          ctx, src, out, axes_int, starts_int, ends_int);
-      break;
-    case 2:
-      SliceCompute<DeviceContext, T, 2>(
-          ctx, src, out, axes_int, starts_int, ends_int);
-      break;
-    case 3:
-      SliceCompute<DeviceContext, T, 3>(
-          ctx, src, out, axes_int, starts_int, ends_int);
-      break;
-    case 4:
-      SliceCompute<DeviceContext, T, 4>(
-          ctx, src, out, axes_int, starts_int, ends_int);
-      break;
-    case 5:
-      SliceCompute<DeviceContext, T, 5>(
-          ctx, src, out, axes_int, starts_int, ends_int);
-      break;
-    case 6:
-      SliceCompute<DeviceContext, T, 6>(
-          ctx, src, out, axes_int, starts_int, ends_int);
-      break;
-    default:
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "The rank of input should be less than 7, but received %d.", rank));
-  }
-}
-
-template <typename DeviceContext, typename T>
-void arange(const DeviceContext& dev_ctx,
-            framework::Tensor* tmp,
-            int w,
-            int batchsize = 1,
-            int h = 1) {
-  tmp->Resize(phi::make_ddim({batchsize * w}));
-  platform::CPUPlace cpu;
-  auto tmpdata = tmp->mutable_data<T>(cpu);
-  for (int b = 0; b < batchsize; b++) {
-    for (int i = 0; i < w; i++) {
-      tmpdata[b * w + i] = static_cast<T>(b * h + i);
-    }
-  }
-}
-
-template <typename T>
-struct OneFunctor {
-  OneFunctor(T* output, int* idtptr, int w, int dim)
-      : output_(output), idtptr_(idtptr), w_(w), dim_(dim) {}
-
-  HOSTDEVICE void operator()(size_t idx) const {
-    output_[w_ * idtptr_[idx] + idx % dim_] = static_cast<T>(1);
-  }
-
-  T* output_;
-  int* idtptr_;
-  int w_;
-  int dim_;
-};
-
-template <typename DeviceContext, typename T>
-void LU_Unpack(const DeviceContext& dev_ctx,
-               const framework::Tensor* LU,
-               framework::Tensor* L,
-               framework::Tensor* U) {
-  const auto udims = LU->dims();
-  L->Resize(udims);
-  U->Resize(udims);
-  const auto H = udims[udims.size() - 2];
-  const auto W = udims[udims.size() - 1];
-  auto L_dataptr = L->mutable_data<T>(dev_ctx.GetPlace());
-  platform::ForRange<DeviceContext> x_for_range(dev_ctx, LU->numel());
-  phi::funcs::TrilTriuCompute<T> tril_computer(
-      LU->data<T>(), -1, true, H, W, L_dataptr);
-  x_for_range(tril_computer);
-
-  phi::funcs::TrilTriuCompute<T> triu_computer(
-      LU->data<T>(), 0, false, H, W, U->mutable_data<T>(dev_ctx.GetPlace()));
-  x_for_range(triu_computer);
-
-  // set L's diagonal 1
-  auto dim = std::min(H, W);
-  framework::Tensor rowtensor, rt_dev;
-  auto batchsize = product(phi::slice_ddim(udims, 0, udims.size() - 2));
-  batchsize = std::max(static_cast<int>(batchsize), 1);
-  arange<DeviceContext, int32_t>(dev_ctx, &rowtensor, dim, batchsize, H);
-  auto idtptr = rowtensor.data<int32_t>();
-  if (platform::is_gpu_place(dev_ctx.GetPlace())) {
-    framework::TensorCopy(rowtensor, dev_ctx.GetPlace(), &rt_dev);
-    idtptr = rt_dev.data<int32_t>();
-  }
-
-  platform::ForRange<DeviceContext> for_range(dev_ctx, rowtensor.numel());
-  OneFunctor<T> functor(L_dataptr, idtptr, W, dim);
-  for_range(functor);
-}
-
-template <typename DeviceContext, typename T>
-void scatterpivot(const DeviceContext& dev_ctx,
-                  T* out_data,
-                  framework::Tensor* idlst,
-                  int w,
-                  int dim) {
-  framework::Tensor idlst_tmp;
-  idlst_tmp.Resize(idlst->dims());
-  idlst_tmp.mutable_data<int32_t>(dev_ctx.GetPlace());
-  framework::TensorCopy(*idlst, dev_ctx.GetPlace(), &idlst_tmp);
-  auto idtptr = idlst_tmp.data<int32_t>();
-
-  platform::ForRange<DeviceContext> for_range(dev_ctx, idlst_tmp.numel());
-  OneFunctor<T> functor(out_data, idtptr, w, dim);
-  for_range(functor);
-}
-
-template <typename DeviceContext, typename T>
-void Unpack_Pivot(const DeviceContext& dev_ctx,
-                  const framework::Tensor& Pivot,
-                  framework::Tensor* P,
-                  int h,
-                  int w) {
-  auto dims = Pivot.dims();
-  auto Pdimvec = vectorize(dims);
-  auto prank = Pdimvec.size();
-  auto Pnum = dims[prank - 1];
-  framework::Tensor Pivot_cpu;
-  platform::CPUPlace cpu;
-  framework::TensorCopy(Pivot, cpu, &Pivot_cpu);
-  auto pdataptr = Pivot_cpu.data<int32_t>();
-  Pdimvec[prank - 1] = h;
-  Pdimvec.emplace_back(h);
-  auto Pdim = phi::make_ddim(Pdimvec);
-  P->Resize(Pdim);
-  auto pdata = P->mutable_data<T>(dev_ctx.GetPlace());
-  phi::funcs::SetConstant<DeviceContext, T> setter;
-  setter(dev_ctx, P, static_cast<T>(0));
-
-  auto batchsize = product(phi::slice_ddim(dims, 0, prank - 1));
-  batchsize = std::max(static_cast<int>(batchsize), 1);
-  framework::Tensor idt;
-  for (int i = 0; i < batchsize; i++) {
-    arange<DeviceContext, int32_t>(dev_ctx, &idt, h);
-    auto idlst = idt.data<int32_t>();
-    for (int j = 0; j < Pnum; j++) {
-      if (idlst[pdataptr[i * Pnum + j] - 1] == idlst[j]) continue;
-      auto temp = idlst[j];
-      idlst[j] = idlst[pdataptr[i * Pnum + j] - 1];
-      idlst[pdataptr[i * Pnum + j] - 1] = temp;
-    }
-    scatterpivot(dev_ctx, &(pdata[i * h * h]), &idt, h, h);
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle
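The deleted lu_op.h helpers are presumably superseded by phi kernels (the new includes in lu_op.cc point at the phi infershape machinery). For reference, the pivot-application step removed above behaves like the following sketch: LAPACK-style 1-based pivots are applied as successive row swaps, and `Unpack_Pivot` then scatters the resulting permutation into a one-hot matrix:

```cpp
#include <utility>
#include <vector>

// Reference for the swap loop in the removed Unpack_Pivot: exchange entry j
// with entry pivots[j]-1 (pivots are 1-based). Row i of the permutation
// matrix P then gets a 1 in column perm[i] (done by scatterpivot).
std::vector<int> PivotsToPermutation(const std::vector<int>& pivots, int h) {
  std::vector<int> perm(h);
  for (int i = 0; i < h; ++i) perm[i] = i;
  for (size_t j = 0; j < pivots.size(); ++j) {
    std::swap(perm[j], perm[pivots[j] - 1]);
  }
  return perm;
}
```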
-12,7 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/lu_unpack_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" + +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -42,44 +46,6 @@ class LU_UnpackOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "LU_Unpack"); - OP_INOUT_CHECK(context->HasInput("Pivots"), "Input", "Pivots", "LU_Unpack"); - OP_INOUT_CHECK(context->HasOutput("L"), "Output", "L", "LU_Unpack"); - OP_INOUT_CHECK(context->HasOutput("U"), "Output", "U", "LU_Unpack"); - OP_INOUT_CHECK(context->HasOutput("Pmat"), "Output", "Pmat", "LU_Unpack"); - bool unpack_ludata = context->Attrs().Get("unpack_ludata"); - bool unpack_pivots = context->Attrs().Get("unpack_pivots"); - - auto x_dims = context->GetInputDim("X"); - int x_rank = x_dims.size(); - PADDLE_ENFORCE_GE(x_rank, - 2, - platform::errors::InvalidArgument( - "the rank of input must greater than 2")); - - // context->SetOutputDim("Out", x_dims); - int m = x_dims[x_rank - 1]; - int n = x_dims[x_rank - 2]; - int min_mn = std::min(m, n); - if (unpack_ludata) { - auto ldims = x_dims; - auto udims = x_dims; - if (m >= n) { - udims[x_rank - 2] = min_mn; - } else { - ldims[x_rank - 1] = min_mn; - } - context->SetOutputDim("U", udims); - context->SetOutputDim("L", ldims); - } - if (unpack_pivots) { - auto pdims = x_dims; - pdims[x_rank - 1] = m; - context->SetOutputDim("Pmat", pdims); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -143,25 +109,6 @@ class LU_UnpackGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lu_unpack"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("L")), - "Input", - "L@GRAD", - "lu_unpack"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("U")), - "Input", - "U@GRAD", - "lu_unpack"); - - auto x_dims = ctx->GetInputDim("X"); - auto x_grad_name = framework::GradVarName("X"); - - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -175,19 +122,21 @@ class LU_UnpackGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(lu_unpack, + LUUnpackInferMetaFunctor, + PD_INFER_META(phi::LUUnpackInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(lu_unpack_grad, + LUUnpackGradInferMetaFunctor, + PD_INFER_META(phi::LUUnpackGradInferMeta)); + REGISTER_OPERATOR(lu_unpack, ops::LU_UnpackOp, ops::LU_UnpackOpMaker, ops::LU_UnpackOpVarTypeInference, ops::LU_UnpackOpGradMaker, - ops::LU_UnpackOpGradMaker); + ops::LU_UnpackOpGradMaker, + LUUnpackInferMetaFunctor); REGISTER_OPERATOR(lu_unpack_grad, ops::LU_UnpackGradOp, - ops::LU_UnpackGradOpVarTypeInference); - -REGISTER_OP_CPU_KERNEL(lu_unpack, - ops::LU_UnpackKernel, - ops::LU_UnpackKernel); 
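Aside: the shape rule that phi::LUUnpackInferMeta (later in this diff) takes over from the InferShape deleted above is compact enough to state standalone. A minimal sketch in plain C++ — not Paddle code — following the InferMeta naming, where m is the last extent and n the second-to-last:

#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

// If m >= n, U shrinks to [..., min_mn, m]; otherwise L shrinks to
// [..., n, min_mn]; the other factor keeps the packed input's shape.
std::pair<std::vector<int>, std::vector<int>> LuUnpackShapes(
    const std::vector<int>& x_dims) {
  assert(x_dims.size() >= 2);
  const int rank = static_cast<int>(x_dims.size());
  const int m = x_dims[rank - 1];
  const int n = x_dims[rank - 2];
  const int min_mn = std::min(m, n);
  std::vector<int> ldims = x_dims;
  std::vector<int> udims = x_dims;
  if (m >= n) {
    udims[rank - 2] = min_mn;
  } else {
    ldims[rank - 1] = min_mn;
  }
  return {ldims, udims};
}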
-REGISTER_OP_CPU_KERNEL(lu_unpack_grad, - ops::LU_UnpackGradKernel, - ops::LU_UnpackGradKernel); + ops::LU_UnpackGradOpVarTypeInference, + LUUnpackGradInferMetaFunctor); diff --git a/paddle/fluid/operators/lu_unpack_op.h b/paddle/fluid/operators/lu_unpack_op.h deleted file mode 100644 index 559c13c9ee6e2a..00000000000000 --- a/paddle/fluid/operators/lu_unpack_op.h +++ /dev/null @@ -1,159 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/lu_op.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/tril_triu_compute.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensorArray = framework::LoDTensorArray; - -template -class LU_UnpackKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto xin = ctx.Input("X"); - auto P = ctx.Input("Pivots"); - - auto ltensor = ctx.Output("L"); - auto utensor = ctx.Output("U"); - auto ptensor = ctx.Output("Pmat"); - - auto unpack_ludata = ctx.Attr("unpack_ludata"); - auto unpack_pivots = ctx.Attr("unpack_pivots"); - - const auto& dev_ctx = ctx.template device_context(); - - auto xdims = xin->dims(); - int xrank = xdims.size(); - int64_t m = xdims[xrank - 2]; - int64_t n = xdims[xrank - 1]; - int64_t k = std::min(m, n); - - if (unpack_ludata) { - ltensor->mutable_data(ctx.GetPlace()); - utensor->mutable_data(ctx.GetPlace()); - - framework::Tensor L, U; - LU_Unpack(dev_ctx, xin, &L, &U); - - if (m >= n) { - framework::TensorCopy(L, ctx.GetPlace(), ltensor); - Tensor_narrow(ctx, &U, utensor, 0, k, 0, k); - } else { - framework::TensorCopy(U, ctx.GetPlace(), utensor); - Tensor_narrow(ctx, &L, ltensor, 0, k, 0, k); - } - } - - if (unpack_pivots) { - ptensor->mutable_data(ctx.GetPlace()); - Unpack_Pivot(dev_ctx, *P, ptensor, m, k); - } - } -}; - -template -class LU_UnpackGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto dl = ctx.Input(framework::GradVarName("L")); - auto du = ctx.Input(framework::GradVarName("U")); - auto dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - const auto& dev_ctx = ctx.template device_context(); - - framework::Tensor dl_tril, du_triu; - const auto ldims = dl->dims(); - dl_tril.Resize(ldims); - auto H = ldims[ldims.size() - 2]; - auto W = ldims[ldims.size() - 1]; - auto L_dataptr = dl_tril.mutable_data(dev_ctx.GetPlace()); - platform::ForRange l_for_range(dev_ctx, dl->numel()); - phi::funcs::TrilTriuCompute tril_computer( - dl->data(), -1, true, H, W, L_dataptr); - l_for_range(tril_computer); - - const auto udims = du->dims(); - du_triu.Resize(udims); - H = udims[udims.size() - 2]; - W = udims[udims.size() - 1]; - auto U_dataptr = du_triu.mutable_data(dev_ctx.GetPlace()); - platform::ForRange u_for_range(dev_ctx, 
du->numel()); - phi::funcs::TrilTriuCompute triu_computer( - du->data(), 0, false, H, W, U_dataptr); - u_for_range(triu_computer); - - auto xdims = dx->dims(); - int xrank = xdims.size(); - int64_t m = xdims[xrank - 2]; - int64_t n = xdims[xrank - 1]; - int64_t k = std::min(m, n); - - std::vector axes = {xrank - 2, xrank - 1}; - std::vector slice_starts(2, 0); - std::vector slice_ends(2, 0); - auto valuedims = vectorize(xdims); - - phi::funcs::SetConstant setter; - setter(dev_ctx, dx, static_cast(0)); - if (m <= n) { - slice_starts[0] = 0; - slice_starts[1] = 0; - slice_ends[0] = k; - slice_ends[1] = k; - valuedims[xrank - 2] = k; - valuedims[xrank - 1] = k; - SetValueCompute_dispatch(ctx, - dx, - &dl_tril, - dx, - axes, - &slice_starts, - &slice_ends, - valuedims, - xrank); - - Tensor_Add(dev_ctx, *dx, du_triu, dx); - } else { - slice_starts[0] = 0; - slice_starts[1] = 0; - slice_ends[0] = k; - slice_ends[1] = k; - valuedims[xrank - 2] = k; - valuedims[xrank - 1] = k; - SetValueCompute_dispatch(ctx, - dx, - &du_triu, - dx, - axes, - &slice_starts, - &slice_ends, - valuedims, - xrank); - - Tensor_Add(dev_ctx, *dx, dl_tril, dx); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc index 36776cebfcd46d..61f238f19d1378 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc @@ -11,20 +11,28 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceAMaxOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_amax"; } + virtual std::string GetOpType() const { return "Reduce reduce_amax"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_amax, + ReduceAMaxInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); -REGISTER_REDUCE_OP(reduce_amax); -REGISTER_OP_CPU_KERNEL( +REGISTER_OPERATOR( reduce_amax, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL( - reduce_amax_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops:: - ReduceGradKernel); + ops::ReduceOp, + ReduceAMaxOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceAMaxInferShapeFunctor); +REGISTER_OPERATOR(reduce_amax_grad, ops::ReduceGradOp) diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.kps b/paddle/fluid/operators/reduce_ops/reduce_amax_op.kps deleted file mode 100644 index 09987279184694..00000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.kps +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
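For intuition about the LU_UnpackGradKernel deleted above: in the square k x k case its assembly reduces to dX = tril(dL, -1) + triu(dU, 0), since L's unit diagonal carries no gradient. A self-contained numeric sketch in plain C++, with hypothetical all-ones inputs:

#include <cstdio>
#include <vector>

int main() {
  const int k = 3;
  std::vector<double> dl(k * k, 1.0), du(k * k, 1.0), dx(k * k, 0.0);
  for (int i = 0; i < k; ++i) {
    for (int j = 0; j < k; ++j) {
      const double strict_lower = (i > j) ? dl[i * k + j] : 0.0;  // tril(dL, -1)
      const double upper = (i <= j) ? du[i * k + j] : 0.0;        // triu(dU, 0)
      dx[i * k + j] = strict_lower + upper;
    }
  }
  for (int i = 0; i < k; ++i) {
    for (int j = 0; j < k; ++j) std::printf("%.0f ", dx[i * k + j]);
    std::printf("\n");  // prints an all-ones matrix for these inputs
  }
  return 0;
}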
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef PADDLE_WITH_XPU_KP -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#endif - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_XPU_KP -REGISTER_OP_KERNEL( - reduce_amax, KP, plat::XPUPlace, - ops::ReduceCudaKernel); -#else -REGISTER_OP_CUDA_KERNEL( - reduce_amax, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); -#endif diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc index bb99ca9b17e7ea..aac8414ac197d1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc @@ -11,20 +11,28 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceAMinOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_amin"; } + virtual std::string GetOpType() const { return "Reduce reduce_amin"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_amin, + ReduceAMinInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); -REGISTER_REDUCE_OP(reduce_amin); -REGISTER_OP_CPU_KERNEL( +REGISTER_OPERATOR( reduce_amin, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL( - reduce_amin_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops:: - ReduceGradKernel); + ops::ReduceOp, + ReduceAMinOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceAMinInferShapeFunctor); +REGISTER_OPERATOR(reduce_amin_grad, ops::ReduceGradOp) diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.kps b/paddle/fluid/operators/reduce_ops/reduce_amin_op.kps deleted file mode 100644 index 5e1139396d90cb..00000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.kps +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
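As with reduce_amax, the reduce_amin rewrite above keeps the operator but delegates shape inference to phi::ReduceInferMetaBase and drops the hand-registered kernels. The forward semantics being preserved are a plain extreme-value reduction; a reference sketch in plain C++ (not Paddle API), reducing a row-major matrix along dim 1:

#include <algorithm>
#include <cassert>
#include <vector>

std::vector<double> AmaxDim1(const std::vector<double>& x, int rows, int cols) {
  assert(static_cast<int>(x.size()) == rows * cols);
  std::vector<double> out(rows);
  for (int r = 0; r < rows; ++r) {
    // keep_dim would only change the reported shape ([rows, 1] vs [rows]),
    // never the reduced values themselves.
    out[r] = *std::max_element(x.begin() + r * cols,
                               x.begin() + (r + 1) * cols);
  }
  return out;
}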
- -#ifndef PADDLE_WITH_XPU_KP -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#endif - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_XPU_KP -REGISTER_OP_KERNEL( - reduce_amin, KP, plat::XPUPlace, - ops::ReduceCudaKernel); -#else -REGISTER_OP_CUDA_KERNEL( - reduce_amin, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); -#endif diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_max_op.h b/paddle/fluid/operators/reduce_ops/reduce_min_max_op.h index 3d0f7bd08f9301..a458dd09f4aaa4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_max_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_min_max_op.h @@ -55,120 +55,5 @@ struct MaxOrMinGradFunctor { } }; -#define HANDLE_AXIS_DIM(BROADCAST_DIM, AXIS_DIM) \ - if (broadcast_dim_size == BROADCAST_DIM && rank == AXIS_DIM) { \ - AMaxOrAMinAxisIsListGradFunctor( \ - place, x, y, dx, dy, dim, axis_dim); \ - } - -template -void AMaxOrAMinAxisIsListGradFunctor(const DeviceContext& place, - X* x, - Y* y, - DX* dx, - DY* dy, - const Dim& dim, - const std::vector& axis_dim) { - // R is x->dimensions().size(); - // D is axis_dim->dimensions().size(); - auto axis = Eigen::array(); - auto reshape_x = Eigen::array(); - auto reshape_y = Eigen::array(); - - for (int i = 0; i < D; i++) axis[i] = axis_dim[i]; - for (int i = 0; i < R; i++) { - reshape_x[i] = x->dimensions()[i]; - reshape_y[i] = y->dimensions()[i]; - } - - auto equals = (*x) == y->broadcast(dim); - auto ones = dx->constant(1); - auto zeros = dx->constant(0); - auto mask = equals.select(ones, zeros); - dx->device(place) = - dy->broadcast(dim) * mask / - mask.reshape(reshape_x).sum(axis).reshape(reshape_y).broadcast(dim); -} - -struct AMaxOrAMinGradFunctor { - template - void operator()(const DeviceContext& place, - X* x, - Y* y, - DX* dx, - DY* dy, - const Dim& dim, - int size) { - auto equals = (*x) == y->broadcast(dim); - auto ones = dx->constant(1); - auto zeros = dx->constant(0); - auto mask = equals.select(ones, zeros); - - // If there are multiple minimum or maximum elements, - // we evenly distribute gradient between these equal values - size_t x_numel = 1; - for (size_t i = 0; i < x->dimensions().size(); i++) - x_numel *= x->dimensions()[i]; - // reduce_all - if (size == static_cast(x_numel)) { - auto equal_number = mask.sum() - .reshape(Eigen::array({1})) - .broadcast(Eigen::array({size})); - dx->device(place) = dy->broadcast(dim) * mask / equal_number; - return; - } - - // compute forward reduce axis_dim by dim (which is broadcast_dim) - std::vector axis_dim; - int broadcast_dim_size = static_cast(dim.size()); - for (int i = 0; i < broadcast_dim_size; i++) { - if (dim[i] > 1) { - axis_dim.push_back(i); - } - } - - int rank = static_cast(axis_dim.size()); - // axis is a int element - if (rank == 1) { - auto axis = Eigen::array({axis_dim[0]}); - dx->device(place) = - dy->broadcast(dim) * mask / - mask.sum(axis).reshape(dy->dimensions()).broadcast(dim); - return; - } - // axis is list, HANDLE_AXIS_DIM(broadcast_dim_size, rank) - HANDLE_AXIS_DIM(3, 2); - HANDLE_AXIS_DIM(4, 2); - HANDLE_AXIS_DIM(4, 3); - // comments for accelerating compiling temporarily. 
- // HANDLE_AXIS_DIM(5, 2); - // HANDLE_AXIS_DIM(5, 3); - // HANDLE_AXIS_DIM(5, 4); - // HANDLE_AXIS_DIM(6, 2); - // HANDLE_AXIS_DIM(6, 3); - // HANDLE_AXIS_DIM(6, 4); - // HANDLE_AXIS_DIM(6, 5); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index e9bc3905a22ee3..9e53a6b56de5ca 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -838,87 +838,6 @@ struct DivideFunctor { inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } }; - -template class TransformOp> -class ReduceCudaAMaxAMinGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - std::vector dims = context.Attr>("dim"); - auto* in_x = context.Input("X"); - auto* out_y = context.Input("Out"); - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - auto out_dtype = context.Attr("in_dtype"); - auto pt_out_dtype = framework::TransToPhiDataType( - static_cast(out_dtype)); - // get reduce_dim and reduce_num for reduce_mean_grad - int dim_size = in_x->dims().size(); - std::vector reduce_dims = GetReduceDim(dims, dim_size, reduce_all); - auto update_dims = vectorize(d_x->dims()); - int reduce_num = 1; - for (auto i : reduce_dims) { - reduce_num *= (in_x->dims())[i]; - update_dims[i] = 1; - } - auto& dev_ctx = context.cuda_device_context(); - - // make new tensor reduce_out - phi::DenseTensor new_y(out_y->type()); - new_y.ShareDataWith(*out_y); - new_y.Resize(phi::make_ddim(update_dims)); - - // make new tensor d_out - phi::DenseTensor new_dout(d_out->type()); - new_dout.ShareDataWith(*d_out); - new_dout.Resize(phi::make_ddim(update_dims)); - d_x->mutable_data(dev_ctx.GetPlace(), d_out->dtype()); - - auto new_in = paddle::experimental::MakePhiDenseTensor(*in_x); - auto new_in_tensor = new_in.get(); - - auto new_dx = paddle::experimental::MakePhiDenseTensor(*d_x); - auto new_dx_tensor = new_dx.get(); - - // make equal_out - phi::DenseTensor* equal_out = new phi::DenseTensor(); - equal_out->Resize(in_x->dims()); - dev_ctx.template Alloc(equal_out); - auto equal_out_tensor = *equal_out; - - // make new tensor equal_count - phi::DenseTensor* equal_count = new phi::DenseTensor(); - equal_count->Resize(phi::make_ddim(update_dims)); - dev_ctx.template Alloc(equal_count); - - // compute - // 1. equal_out = Equal(x, y) - std::vector equal_inputs = {&new_y, new_in_tensor}; - std::vector equal_outputs = {&equal_out_tensor}; - phi::funcs::BroadcastKernel( - dev_ctx, equal_inputs, &equal_outputs, 0, EqualFunctor()); - // 2. equal_count = reduceSum(equal_out) - using MPType = typename kps::details::MPTypeTrait::Type; - phi::funcs:: - ReduceKernel>( - dev_ctx, - equal_out_tensor, - equal_count, - kps::IdentityFunctor(), - reduce_dims, - false); - - // 3. 
dx = Div(dout, equal_out) - std::vector grad_inputs = {&equal_out_tensor, - equal_count}; - std::vector grad_outputs = {new_dx_tensor}; - phi::funcs::BroadcastKernel( - dev_ctx, grad_inputs, &grad_outputs, 0, DivideFunctor()); - delete equal_out; - delete equal_count; - } -}; #endif #endif diff --git a/paddle/phi/api/yaml/generator/api_base.py b/paddle/phi/api/yaml/generator/api_base.py index 9d5b630bcd498b..88903763d805d7 100644 --- a/paddle/phi/api/yaml/generator/api_base.py +++ b/paddle/phi/api/yaml/generator/api_base.py @@ -141,7 +141,7 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): 'DataLayout': 'DataLayout', 'DataType': 'DataType', 'int64_t[]': 'const std::vector&', - 'int[]': 'const std::vector&' + 'int[]': 'const std::vector&', } optional_types_trans = { 'Tensor': 'const paddle::optional&', diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index ff80469eedaf94..25cdd37ddea9d9 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -129,6 +129,24 @@ kernel : func : allclose +- api : amax + args : (Tensor x, int64_t[] dims={}, bool keep_dim=false) + output : Tensor(out) + infer_meta : + func : ReduceInferMeta + kernel : + func : amax + backward : amax_grad + +- api : amin + args : (Tensor x, int64_t[] dims={}, bool keep_dim=false) + output : Tensor(out) + infer_meta : + func : ReduceInferMeta + kernel : + func : amin + backward : amin_grad + - api : angle args : (Tensor x) output : Tensor @@ -1443,6 +1461,16 @@ func : lu backward : lu_grad +- api : lu_unpack + args : (Tensor x, Tensor pivots, bool unpack_ludata, bool unpack_pivots) + output : Tensor(pmat), Tensor(l), Tensor(u) + infer_meta : + func : LUUnpackInferMeta + kernel : + func : lu_unpack + data_type : x + backward : lu_unpack_grad + # masked_select - api : masked_select args : (Tensor x, Tensor mask) @@ -1791,6 +1819,14 @@ func : prelu backward : prelu_grad +- api : prior_box + args : (Tensor input, Tensor image, float[] min_sizes, float[] aspect_ratios, float[] variances, float[] max_sizes = {}, bool flip=true, bool clip=true, float step_w=0.0, float step_h=0.0, float offset=0.5, bool min_max_aspect_ratios_order=false) + output : Tensor(out), Tensor(var) + infer_meta : + func : PriorBoxInferMeta + kernel : + func : prior_box + - api : psroi_pool args : (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, int output_channels, float spatial_scale) output : Tensor diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index e7167b17637315..c00d9fd9a627b1 100644 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -92,6 +92,26 @@ kernel : func : addmm_grad +- backward_api : amax_grad + forward: amax (Tensor x, int64_t[] dims={}, bool keep_dim=false) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : amax_grad + +- backward_api : amin_grad + forward: amin (Tensor x, int64_t[] dims={}, bool keep_dim=false) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : amin_grad + - backward_api : angle_grad forward : angle (Tensor x) -> Tensor(out) args : (Tensor x, Tensor 
out_grad) @@ -1254,6 +1274,15 @@ kernel : func : lu_grad +- backward_api : lu_unpack_grad + forward : lu_unpack (Tensor x, Tensor pivots, bool unpack_ludata, bool unpack_pivots) -> Tensor(pmat), Tensor(l), Tensor(u) + args : (Tensor x, Tensor pivots, Tensor l, Tensor u, Tensor pmat, Tensor l_grad, Tensor u_grad, bool unpack_ludata, bool unpack_pivots) + output : Tensor(x_grad) + infer_meta : + func : LUUnpackGradInferMeta + kernel : + func : lu_unpack_grad + - backward_api : masked_select_grad forward : masked_select (Tensor x, Tensor mask) -> Tensor(out) args : (Tensor x, Tensor mask, Tensor out_grad) diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index a0f71f3b689b18..26e578107206e8 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -456,6 +456,24 @@ void LUGradInferMeta(const MetaTensor& x, } } +void LUUnpackGradInferMeta(const MetaTensor& x, + const MetaTensor& pivots, + const MetaTensor& l, + const MetaTensor& u, + const MetaTensor& pmat, + const MetaTensor& l_grad, + const MetaTensor& u_grad, + bool unpack_ludata, + bool unpack_pivots, + MetaTensor* x_grad) { + auto x_dims = x.dims(); + + if (x_grad) { + x_grad->set_dims(x_dims); + x_grad->set_dtype(x.dtype()); + } +} + void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, const MetaTensor& mask, const MetaTensor& dout, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index ae04764d105a80..bc89d84cf2203c 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -207,6 +207,17 @@ void LUGradInferMeta(const MetaTensor& x, bool pivot, MetaTensor* x_grad); +void LUUnpackGradInferMeta(const MetaTensor& x, + const MetaTensor& pivots, + const MetaTensor& l, + const MetaTensor& u, + const MetaTensor& pmat, + const MetaTensor& l_grad, + const MetaTensor& u_grad, + bool unpack_ludata, + bool unpack_pivots, + MetaTensor* x_grad); + void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, const MetaTensor& mask, const MetaTensor& dout, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 4d72e1b60d6ee1..ebcc6e28b4589d 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -1486,6 +1486,52 @@ void LogLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void LUUnpackInferMeta(const MetaTensor& x, + const MetaTensor& pivots, + bool unpack_ludata, + bool unpack_pivots, + MetaTensor* pmat, + MetaTensor* l, + MetaTensor* u) { + PADDLE_ENFORCE_NOT_NULL( + pmat, + phi::errors::InvalidArgument("Output(Pmat) should not be nullptr.")); + PADDLE_ENFORCE_NOT_NULL( + l, phi::errors::InvalidArgument("Output(L) should not be nullptr.")); + PADDLE_ENFORCE_NOT_NULL( + u, phi::errors::InvalidArgument("Output(U) should not be nullptr.")); + + auto x_dims = x.dims(); + int x_rank = x_dims.size(); + PADDLE_ENFORCE_GE( + x_rank, + 2, + phi::errors::InvalidArgument( + "The rank of input must be greater than or equal to 2.")); + + int m = x_dims[x_rank - 1]; + int n = x_dims[x_rank - 2]; + int min_mn = std::min(m, n); + if (unpack_ludata) { + auto ldims = x_dims; + auto udims = x_dims; + if (m >= n) { + udims[x_rank - 2] = min_mn; + } else { + ldims[x_rank - 1] = min_mn; + } + u->set_dims(udims); + u->set_dtype(x.dtype()); + l->set_dims(ldims); + l->set_dtype(x.dtype()); + } + if (unpack_pivots) { + auto pdims = x_dims; + pdims[x_rank - 1] = m; + pmat->set_dims(pdims); + pmat->set_dtype(x.dtype()); + } +} + void MaskedSelectInferMeta(const MetaTensor& x, const MetaTensor& mask, MetaTensor*
out) { @@ -1809,6 +1855,110 @@ void PReluInferMeta(const MetaTensor& x, out->share_lod(x); } +inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, + bool flip, + std::vector* output_aspect_ratior) { + constexpr float epsilon = 1e-6; + output_aspect_ratior->clear(); + output_aspect_ratior->push_back(1.0f); + for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { + float ar = input_aspect_ratior[i]; + bool already_exist = false; + for (size_t j = 0; j < output_aspect_ratior->size(); ++j) { + if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) { + already_exist = true; + break; + } + } + if (!already_exist) { + output_aspect_ratior->push_back(ar); + if (flip) { + output_aspect_ratior->push_back(1.0f / ar); + } + } + } +} + +void PriorBoxInferMeta(const MetaTensor& input, + const MetaTensor& image, + const std::vector& min_sizes, + const std::vector& aspect_ratios, + const std::vector& variances, + const std::vector& max_sizes, + bool flip, + bool clip, + float step_w, + float step_h, + float offset, + bool min_max_aspect_ratios_order, + MetaTensor* out, + MetaTensor* var) { + auto image_dims = image.dims(); + auto input_dims = input.dims(); + + PADDLE_ENFORCE_EQ( + image_dims.size(), + 4, + phi::errors::InvalidArgument( + "The Input(Image) of Op(PriorBoxOp) should be a 4-D Tensor " + "and data format is NCHW. But received Image's dimensions = %d, " + "shape = [%s].", + image_dims.size(), + image_dims)); + PADDLE_ENFORCE_EQ( + input_dims.size(), + 4, + phi::errors::InvalidArgument( + "The Input(Input) of Op(PriorBoxOp) should be a 4-D Tensor " + "and data format is NCHW. But received Input's dimensions = %d, " + "shape = [%s].", + input_dims.size(), + input_dims)); + + std::vector aspect_ratios_vec; + ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec); + + size_t num_priors = aspect_ratios_vec.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + PADDLE_ENFORCE_EQ( + max_sizes.size(), + min_sizes.size(), + phi::errors::InvalidArgument( + "The length of min_size and " + "max_size must be equal. But received: min_size's length is %d, " + "max_size's length is %d.", + min_sizes.size(), + max_sizes.size())); + num_priors += max_sizes.size(); + for (size_t i = 0; i < max_sizes.size(); ++i) { + PADDLE_ENFORCE_GT( + max_sizes[i], + min_sizes[i], + phi::errors::InvalidArgument( + "max_size[%d] must be greater " + "than min_size[%d]. 
But received: max_size[%d] is %f, " + "min_size[%d] is %f.", + i, + i, + i, + max_sizes[i], + i, + min_sizes[i])); + } + } + + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + + out->set_dtype(input.dtype()); + var->set_dtype(input.dtype()); + out->set_dims(phi::make_ddim(dim_vec)); + var->set_dims(phi::make_ddim(dim_vec)); +} + void SearchsortedInferMeta(const MetaTensor& sorted_sequence, const MetaTensor& value, bool out_int32, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 53d6c12e8862fb..3662a7c3166539 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -225,6 +225,14 @@ void LogLossInferMeta(const MetaTensor& input, MetaTensor* out, MetaConfig config = MetaConfig()); +void LUUnpackInferMeta(const MetaTensor& x, + const MetaTensor& pivots, + bool unpack_ludata, + bool unpack_pivots, + MetaTensor* pmat, + MetaTensor* l, + MetaTensor* u); + void MaskedSelectInferMeta(const MetaTensor& x, const MetaTensor& mask, MetaTensor* out); @@ -256,6 +264,21 @@ void PReluInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void PriorBoxInferMeta(const MetaTensor& input, + const MetaTensor& image, + const std::vector& min_sizes, + const std::vector& aspect_ratios, + const std::vector& variances, + const std::vector& max_sizes, + bool flip, + bool clip, + float step_w, + float step_h, + float offset, + bool min_max_aspect_ratios_order, + MetaTensor* out, + MetaTensor* var); + void SearchsortedInferMeta(const MetaTensor& sorted_sequence, const MetaTensor& value, bool out_int32, diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu b/paddle/phi/kernels/cpu/lu_unpack_grad_kernel.cc similarity index 52% rename from paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu rename to paddle/phi/kernels/cpu/lu_unpack_grad_kernel.cc index f5580d784b5896..712c43e97ef1fa 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu +++ b/paddle/phi/kernels/cpu/lu_unpack_grad_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,13 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
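To make the PriorBoxInferMeta shape rule above concrete: aspect_ratios {2.0} with flip=true expand (via ExpandAspectRatios) to {1.0, 2.0, 0.5}, so with two min_sizes and two max_sizes, num_priors = 3 * 2 + 2 = 8 and the output is [feat_h, feat_w, 8, 4]. A worked sketch with assumed sizes:

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> ars = {1.0f, 2.0f, 0.5f};  // after ExpandAspectRatios
  std::vector<float> min_sizes = {32.0f, 64.0f};
  std::vector<float> max_sizes = {64.0f, 128.0f};
  const size_t num_priors = ars.size() * min_sizes.size() + max_sizes.size();
  const int feat_h = 16, feat_w = 16;  // input_dims[2], input_dims[3]
  std::printf("out dims: [%d, %d, %zu, 4]\n", feat_h, feat_w, num_priors);
  return 0;
}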
-#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" -template -using CUDAReduceMinGradKernel = - ops::ReduceCudaAMaxAMinGradKernel; -REGISTER_OP_CUDA_KERNEL(reduce_amin_grad, - CUDAReduceMinGradKernel, - CUDAReduceMinGradKernel, - CUDAReduceMinGradKernel, - CUDAReduceMinGradKernel); +#include "paddle/phi/kernels/impl/lu_unpack_grad_kernel_impl.h" +#include "paddle/phi/kernels/lu_unpack_grad_kernel.h" + +PD_REGISTER_KERNEL( + lu_unpack_grad, CPU, ALL_LAYOUT, phi::LUUnpackGradKernel, float, double) {} diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu b/paddle/phi/kernels/cpu/lu_unpack_kernel.cc similarity index 52% rename from paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu rename to paddle/phi/kernels/cpu/lu_unpack_kernel.cc index d19819f17dc775..bed7da328ffacb 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu +++ b/paddle/phi/kernels/cpu/lu_unpack_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,13 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" -template -using CUDAReduceMaxGradKernel = - ops::ReduceCudaAMaxAMinGradKernel; -REGISTER_OP_CUDA_KERNEL(reduce_amax_grad, - CUDAReduceMaxGradKernel, - CUDAReduceMaxGradKernel, - CUDAReduceMaxGradKernel, - CUDAReduceMaxGradKernel); +#include "paddle/phi/kernels/impl/lu_unpack_kernel_impl.h" +#include "paddle/phi/kernels/lu_unpack_kernel.h" + +PD_REGISTER_KERNEL( + lu_unpack, CPU, ALL_LAYOUT, phi::LUUnpackKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/prior_box_kernel.cc b/paddle/phi/kernels/cpu/prior_box_kernel.cc new file mode 100644 index 00000000000000..018b18006d3a70 --- /dev/null +++ b/paddle/phi/kernels/cpu/prior_box_kernel.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/prior_box_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void PriorBoxKernel(const Context& ctx, + const DenseTensor& input, + const DenseTensor& image, + const std::vector& min_sizes, + const std::vector& aspect_ratios, + const std::vector& variances, + const std::vector& max_sizes, + bool flip, + bool clip, + float step_w, + float step_h, + float offset, + bool min_max_aspect_ratios_order, + DenseTensor* out, + DenseTensor* var) { + std::vector new_aspect_ratios; + ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios); + + T new_step_w = static_cast(step_w); + T new_step_h = static_cast(step_h); + T new_offset = static_cast(offset); + + auto img_width = image.dims()[3]; + auto img_height = image.dims()[2]; + + auto feature_width = input.dims()[3]; + auto feature_height = input.dims()[2]; + + T step_width, step_height; + if (new_step_w == 0 || new_step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = new_step_w; + step_height = new_step_h; + } + + int num_priors = new_aspect_ratios.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + num_priors += max_sizes.size(); + } + + ctx.template Alloc(out); + ctx.template Alloc(var); + + T* b_t = out->data(); + for (int h = 0; h < feature_height; ++h) { + for (int w = 0; w < feature_width; ++w) { + T center_x = (w + new_offset) * step_width; + T center_y = (h + new_offset) * step_height; + T box_width, box_height; + for (size_t s = 0; s < min_sizes.size(); ++s) { + auto min_size = min_sizes[s]; + if (min_max_aspect_ratios_order) { + box_width = box_height = min_size / 2.; + b_t[0] = (center_x - box_width) / img_width; + b_t[1] = (center_y - box_height) / img_height; + b_t[2] = (center_x + box_width) / img_width; + b_t[3] = (center_y + box_height) / img_height; + b_t += 4; + if (max_sizes.size() > 0) { + auto max_size = max_sizes[s]; + // square prior with size sqrt(minSize * maxSize) + box_width = box_height = sqrt(min_size * max_size) / 2.; + b_t[0] = (center_x - box_width) / img_width; + b_t[1] = (center_y - box_height) / img_height; + b_t[2] = (center_x + box_width) / img_width; + b_t[3] = (center_y + box_height) / img_height; + b_t += 4; + } + // priors with different aspect ratios + for (size_t r = 0; r < new_aspect_ratios.size(); ++r) { + float ar = new_aspect_ratios[r]; + if (fabs(ar - 1.) 
< 1e-6) { + continue; + } + box_width = min_size * sqrt(ar) / 2.; + box_height = min_size / sqrt(ar) / 2.; + b_t[0] = (center_x - box_width) / img_width; + b_t[1] = (center_y - box_height) / img_height; + b_t[2] = (center_x + box_width) / img_width; + b_t[3] = (center_y + box_height) / img_height; + b_t += 4; + } + } else { + // priors with different aspect ratios + for (size_t r = 0; r < new_aspect_ratios.size(); ++r) { + float ar = new_aspect_ratios[r]; + box_width = min_size * sqrt(ar) / 2.; + box_height = min_size / sqrt(ar) / 2.; + b_t[0] = (center_x - box_width) / img_width; + b_t[1] = (center_y - box_height) / img_height; + b_t[2] = (center_x + box_width) / img_width; + b_t[3] = (center_y + box_height) / img_height; + b_t += 4; + } + if (max_sizes.size() > 0) { + auto max_size = max_sizes[s]; + // square prior with size sqrt(minSize * maxSize) + box_width = box_height = sqrt(min_size * max_size) / 2.; + b_t[0] = (center_x - box_width) / img_width; + b_t[1] = (center_y - box_height) / img_height; + b_t[2] = (center_x + box_width) / img_width; + b_t[3] = (center_y + box_height) / img_height; + b_t += 4; + } + } + } + } + } + + if (clip) { + T* dt = out->data(); + std::transform(dt, dt + out->numel(), dt, [](T v) -> T { + return std::min(std::max(v, 0.), 1.); + }); + } + + DenseTensor var_t; + var_t.Resize(phi::make_ddim({1, static_cast(variances.size())})); + ctx.template Alloc(&var_t); + auto var_et = EigenTensor::From(var_t); + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (size_t i = 0; i < variances.size(); ++i) { + var_et(0, i) = variances[i]; + } + + int box_num = feature_height * feature_width * num_priors; + auto var_dim = var->dims(); + var->Resize({box_num, static_cast(variances.size())}); + + auto e_vars = EigenMatrix::From(*var); + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < box_num; ++i) { + for (size_t j = 0; j < variances.size(); ++j) { + e_vars(i, j) = variances[j]; + } + } + var->Resize(var_dim); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + prior_box, CPU, ALL_LAYOUT, phi::PriorBoxKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/reduce_amax_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_amax_grad_kernel.cc new file mode 100644 index 00000000000000..ffe9133d6d94c9 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_amax_grad_kernel.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
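A numeric sketch of one box from the CPU loop above, showing why the final clip to [0, 1] matters for cells near the image border (plain C++, assumed sizes):

#include <algorithm>
#include <cstdio>

int main() {
  const float img_w = 300.0f, img_h = 300.0f;
  const float step = 8.0f, offset = 0.5f, min_size = 32.0f;
  const int w = 0, h = 0;  // top-left feature-map cell
  const float cx = (w + offset) * step, cy = (h + offset) * step;  // (4, 4)
  const float bw = min_size / 2.0f, bh = min_size / 2.0f;
  auto clip01 = [](float v) { return std::min(std::max(v, 0.0f), 1.0f); };
  // xmin = (4 - 16) / 300 = -0.04 before clipping, 0 after.
  std::printf("box: [%.3f, %.3f, %.3f, %.3f]\n",
              clip01((cx - bw) / img_w), clip01((cy - bh) / img_h),
              clip01((cx + bw) / img_w), clip01((cy + bh) / img_h));
  return 0;
}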
+ +#include "paddle/phi/kernels/reduce_amax_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void ReduceAMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad) { + ReduceGradKernel( + dev_ctx, x, out, out_grad, dims, keep_dim, reduce_all, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(amax_grad, + CPU, + ALL_LAYOUT, + phi::ReduceAMaxGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_amax_kernel.cc b/paddle/phi/kernels/cpu/reduce_amax_kernel.cc new file mode 100644 index 00000000000000..ac3b5ce762e293 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_amax_kernel.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_amax_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void AMaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(amax_raw, + CPU, + ALL_LAYOUT, + phi::AMaxRawKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_amin_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_amin_grad_kernel.cc new file mode 100644 index 00000000000000..6bb0e5061cc20a --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_amin_grad_kernel.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
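Note the *_raw naming: AMaxRawKernel takes an explicit reduce_all flag, while the yaml-level amax api exposes only dims and keep_dim. The usual derivation — an assumption about the convention, not code from this diff — is that an empty or full dims list means reduce over everything:

#include <cstdint>
#include <vector>

// Hypothetical helper illustrating the assumed api -> raw-kernel mapping.
bool DeriveReduceAll(const std::vector<int64_t>& dims, int rank) {
  return dims.empty() || static_cast<int>(dims.size()) == rank;
}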
+ +#include "paddle/phi/kernels/reduce_amin_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void ReduceAMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad) { + ReduceGradKernel( + dev_ctx, x, out, out_grad, dims, keep_dim, reduce_all, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(amin_grad, + CPU, + ALL_LAYOUT, + phi::ReduceAMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_amin_kernel.cc b/paddle/phi/kernels/cpu/reduce_amin_kernel.cc new file mode 100644 index 00000000000000..d8f090f93ffd3a --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_amin_kernel.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_amin_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void AMinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(amin_raw, + CPU, + ALL_LAYOUT, + phi::AMinRawKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index 9bf1bfecabbf22..34032e153c0496 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -14,6 +14,9 @@ #pragma once +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + namespace phi { namespace funcs { @@ -178,5 +181,120 @@ struct MaxOrMinGradFunctor { } }; +#define HANDLE_AXIS_DIM(BROADCAST_DIM, AXIS_DIM) \ + if (broadcast_dim_size == BROADCAST_DIM && rank == AXIS_DIM) { \ + AMaxOrAMinAxisIsListGradFunctor( \ + place, x, y, dx, dy, dim, axis_dim); \ + } + +template +void AMaxOrAMinAxisIsListGradFunctor(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + const std::vector& axis_dim) { + // R is x->dimensions().size(); + // D is axis_dim->dimensions().size(); + auto axis = Eigen::array(); + auto reshape_x = Eigen::array(); + auto reshape_y = Eigen::array(); + + for (int i = 0; i < D; i++) axis[i] = axis_dim[i]; + for (int i = 0; i < R; i++) { + reshape_x[i] = x->dimensions()[i]; + reshape_y[i] = y->dimensions()[i]; + } + + auto equals = (*x) == y->broadcast(dim); + auto ones = dx->constant(1); + auto zeros = dx->constant(0); + auto mask = 
equals.select(ones, zeros); + dx->device(place) = + dy->broadcast(dim) * mask / + mask.reshape(reshape_x).sum(axis).reshape(reshape_y).broadcast(dim); +} + +struct AMaxOrAMinGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + auto equals = (*x) == y->broadcast(dim); + auto ones = dx->constant(1); + auto zeros = dx->constant(0); + auto mask = equals.select(ones, zeros); + + // If there are multiple minimum or maximum elements, + // we evenly distribute gradient between these equal values + size_t x_numel = 1; + for (size_t i = 0; i < x->dimensions().size(); i++) + x_numel *= x->dimensions()[i]; + // reduce_all + if (size == static_cast(x_numel)) { + auto equal_number = mask.sum() + .reshape(Eigen::array({1})) + .broadcast(Eigen::array({size})); + dx->device(place) = dy->broadcast(dim) * mask / equal_number; + return; + } + + // compute forward reduce axis_dim by dim (which is broadcast_dim) + std::vector axis_dim; + int broadcast_dim_size = static_cast(dim.size()); + for (int i = 0; i < broadcast_dim_size; i++) { + if (dim[i] > 1) { + axis_dim.push_back(i); + } + } + + int rank = static_cast(axis_dim.size()); + // axis is a int element + if (rank == 1) { + auto axis = Eigen::array({axis_dim[0]}); + dx->device(place) = + dy->broadcast(dim) * mask / + mask.sum(axis).reshape(dy->dimensions()).broadcast(dim); + return; + } + // axis is list, HANDLE_AXIS_DIM(broadcast_dim_size, rank) + HANDLE_AXIS_DIM(3, 2); + HANDLE_AXIS_DIM(4, 2); + HANDLE_AXIS_DIM(4, 3); + // comments for accelerating compiling temporarily. + // HANDLE_AXIS_DIM(5, 2); + // HANDLE_AXIS_DIM(5, 3); + // HANDLE_AXIS_DIM(5, 4); + // HANDLE_AXIS_DIM(6, 2); + // HANDLE_AXIS_DIM(6, 3); + // HANDLE_AXIS_DIM(6, 4); + // HANDLE_AXIS_DIM(6, 5); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/lu_unpack_grad_kernel.cu b/paddle/phi/kernels/gpu/lu_unpack_grad_kernel.cu new file mode 100644 index 00000000000000..779c4f3facaf39 --- /dev/null +++ b/paddle/phi/kernels/gpu/lu_unpack_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/phi/kernels/impl/lu_unpack_grad_kernel_impl.h" +#include "paddle/phi/kernels/lu_unpack_grad_kernel.h" + +PD_REGISTER_KERNEL( + lu_unpack_grad, GPU, ALL_LAYOUT, phi::LUUnpackGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/lu_unpack_kernel.cu b/paddle/phi/kernels/gpu/lu_unpack_kernel.cu new file mode 100644 index 00000000000000..01a9212a59303f --- /dev/null +++ b/paddle/phi/kernels/gpu/lu_unpack_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
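The even split described in the comments above ("we evenly distribute gradient between these equal values") can be checked on a tiny input. A self-contained numeric example in plain C++: for x = {2, 5, 5} with y = amax(x) = 5 and dy = 1, the expected dx is {0, 0.5, 0.5}:

#include <cstdio>
#include <vector>

int main() {
  const std::vector<double> x = {2.0, 5.0, 5.0};
  const double y = 5.0, dy = 1.0;
  double ties = 0.0;
  for (double v : x) ties += (v == y) ? 1.0 : 0.0;  // mask.sum() == 2
  for (double v : x) std::printf("%.2f ", (v == y) ? dy / ties : 0.0);
  std::printf("\n");
  return 0;
}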
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/phi/kernels/impl/lu_unpack_kernel_impl.h" +#include "paddle/phi/kernels/lu_unpack_kernel.h" + +PD_REGISTER_KERNEL( + lu_unpack, GPU, ALL_LAYOUT, phi::LUUnpackKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/prior_box_kernel.cu b/paddle/phi/kernels/gpu/prior_box_kernel.cu new file mode 100644 index 00000000000000..317f2a3231a642 --- /dev/null +++ b/paddle/phi/kernels/gpu/prior_box_kernel.cu @@ -0,0 +1,201 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prior_box_kernel.h" + +#include +#include + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__device__ inline T clip(T in) { + return min(max(in, 0.), 1.); +} + +template +__global__ void GenPriorBox(T* out, + const T* aspect_ratios, + const int height, + const int width, + const int im_height, + const int im_width, + const int as_num, + const T offset, + const T step_width, + const T step_height, + const T* min_sizes, + const T* max_sizes, + const int min_num, + bool is_clip, + bool min_max_aspect_ratios_order) { + int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num; + int box_num = height * width * num_priors; + CUDA_KERNEL_LOOP(i, box_num) { + int h = i / (num_priors * width); + int w = (i / num_priors) % width; + int p = i % num_priors; + int m = max_sizes ? 
p / (as_num + 1) : p / as_num; + T cx = (w + offset) * step_width; + T cy = (h + offset) * step_height; + T bw, bh; + T min_size = min_sizes[m]; + if (max_sizes) { + int s = p % (as_num + 1); + if (!min_max_aspect_ratios_order) { + if (s < as_num) { + T ar = aspect_ratios[s]; + bw = min_size * sqrt(ar) / 2.; + bh = min_size / sqrt(ar) / 2.; + } else { + T max_size = max_sizes[m]; + bw = sqrt(min_size * max_size) / 2.; + bh = bw; + } + } else { + if (s == 0) { + bw = bh = min_size / 2.; + } else if (s == 1) { + T max_size = max_sizes[m]; + bw = sqrt(min_size * max_size) / 2.; + bh = bw; + } else { + T ar = aspect_ratios[s - 1]; + bw = min_size * sqrt(ar) / 2.; + bh = min_size / sqrt(ar) / 2.; + } + } + } else { + int s = p % as_num; + T ar = aspect_ratios[s]; + bw = min_size * sqrt(ar) / 2.; + bh = min_size / sqrt(ar) / 2.; + } + T xmin = (cx - bw) / im_width; + T ymin = (cy - bh) / im_height; + T xmax = (cx + bw) / im_width; + T ymax = (cy + bh) / im_height; + out[i * 4] = is_clip ? clip(xmin) : xmin; + out[i * 4 + 1] = is_clip ? clip(ymin) : ymin; + out[i * 4 + 2] = is_clip ? clip(xmax) : xmax; + out[i * 4 + 3] = is_clip ? clip(ymax) : ymax; + } +} + +template +__global__ void SetVariance(T* out, + const T* var, + const int vnum, + const int num) { + CUDA_KERNEL_LOOP(i, num) { out[i] = var[i % vnum]; } +} + +template +void PriorBoxKernel(const Context& ctx, + const DenseTensor& input, + const DenseTensor& image, + const std::vector& min_sizes, + const std::vector& aspect_ratios, + const std::vector& variances, + const std::vector& max_sizes, + bool flip, + bool clip, + float step_w, + float step_h, + float offset, + bool min_max_aspect_ratios_order, + DenseTensor* out, + DenseTensor* var) { + std::vector new_aspect_ratios; + ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios); + + T new_step_w = static_cast(step_w); + T new_step_h = static_cast(step_h); + T new_offset = static_cast(offset); + + auto im_width = image.dims()[3]; + auto im_height = image.dims()[2]; + + auto width = input.dims()[3]; + auto height = input.dims()[2]; + + T step_width, step_height; + if (new_step_w == 0 || new_step_h == 0) { + step_width = static_cast(im_width) / width; + step_height = static_cast(im_height) / height; + } else { + step_width = new_step_w; + step_height = new_step_h; + } + + int num_priors = new_aspect_ratios.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + num_priors += max_sizes.size(); + } + int min_num = static_cast(min_sizes.size()); + int box_num = width * height * num_priors; + + int block = 512; + int grid = (box_num + block - 1) / block; + + auto stream = ctx.stream(); + + ctx.template Alloc(out); + ctx.template Alloc(var); + + DenseTensor r; + paddle::framework::TensorFromVector(new_aspect_ratios, ctx, &r); + + DenseTensor min; + paddle::framework::TensorFromVector(min_sizes, ctx, &min); + + T* max_data = nullptr; + DenseTensor max; + if (max_sizes.size() > 0) { + paddle::framework::TensorFromVector(max_sizes, ctx, &max); + max_data = max.data(); + } + + GenPriorBox<<>>(out->data(), + r.data(), + height, + width, + im_height, + im_width, + new_aspect_ratios.size(), + new_offset, + step_width, + step_height, + min.data(), + max_data, + min_num, + clip, + min_max_aspect_ratios_order); + + DenseTensor v; + paddle::framework::TensorFromVector(variances, ctx, &v); + grid = (box_num * 4 + block - 1) / block; + SetVariance<<>>( + var->data(), v.data(), variances.size(), box_num * 4); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + prior_box, GPU, ALL_LAYOUT, 
phi::PriorBoxKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/reduce_amax_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_amax_grad_kernel.cu new file mode 100644 index 00000000000000..a75ef42889da2e --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_amax_grad_kernel.cu @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce_amin_amax_common.h" +#include "paddle/phi/kernels/reduce_max_grad_kernel.h" + +namespace phi { + +template +void ReduceAMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad) { + ReduceCudaAMaxAMinGrad( + dev_ctx, x, out, out_grad, dims, keep_dim, reduce_all, x_grad); +} +} // namespace phi + +PD_REGISTER_KERNEL(amax_grad, + GPU, + ALL_LAYOUT, + phi::ReduceAMaxGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_amin_amax_common.h b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h new file mode 100644 index 00000000000000..fe3cd89d5bc974 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h @@ -0,0 +1,103 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
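The shared header introduced next implements the backward pass for both amax and amin. The rule it realizes is that tied extrema split the upstream gradient evenly: a mask of elements equal to the reduced result is built with an Equal broadcast, the number of ties with a sum reduction, and the quotient with a Divide broadcast. A minimal NumPy sketch of that rule for a full reduction (an illustration of the math, not the CUDA path):

    import numpy as np

    def amax_grad(x, dout):
        # dout: upstream gradient of amax(x) reduced over all elements
        mask = (x == x.max()).astype(x.dtype)  # 1 where x ties the maximum
        return dout * mask / mask.sum()        # split evenly among ties

    print(amax_grad(np.array([1., 2., 2.]), 1.0))  # [0.  0.5 0.5]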
+#pragma once + +#include "paddle/phi/api/lib/utils/tensor_utils.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" + +namespace phi { + +template +void ReduceCudaAMaxAMinGrad(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad) { + auto* in_x = &x; + auto* out_y = &out; + auto* d_out = &out_grad; + auto* d_x = x_grad; + // get reduce_dim and reduce_num for reduce_mean_grad + int dim_size = in_x->dims().size(); + auto reduce_dims = funcs::details::GetReduceDim(dims, dim_size, reduce_all); + auto update_dims = vectorize(d_x->dims()); + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (in_x->dims())[i]; + update_dims[i] = 1; + } + + // make new tensor reduce_out + phi::DenseTensor new_y(out_y->type()); + new_y.ShareDataWith(*out_y); + new_y.Resize(phi::make_ddim(update_dims)); + + // make new tensor d_out + phi::DenseTensor new_dout(d_out->type()); + new_dout.ShareDataWith(*d_out); + new_dout.Resize(phi::make_ddim(update_dims)); + dev_ctx.Alloc(d_x, d_out->dtype()); + + auto new_in = paddle::experimental::MakePhiDenseTensor(*in_x); + auto new_in_tensor = new_in.get(); + + auto new_dx = paddle::experimental::MakePhiDenseTensor(*d_x); + auto new_dx_tensor = new_dx.get(); + + // make equal_out + phi::DenseTensor* equal_out = new phi::DenseTensor(); + equal_out->Resize(in_x->dims()); + dev_ctx.template Alloc(equal_out); + auto equal_out_tensor = *equal_out; + + // make new tensor equal_count + phi::DenseTensor* equal_count = new phi::DenseTensor(); + equal_count->Resize(phi::make_ddim(update_dims)); + dev_ctx.template Alloc(equal_count); + + // compute + // 1. equal_out = Equal(x, y) + std::vector equal_inputs = {&new_y, new_in_tensor}; + std::vector equal_outputs = {&equal_out_tensor}; + funcs::BroadcastKernel( + dev_ctx, equal_inputs, &equal_outputs, 0, funcs::EqualFunctor()); + // 2. equal_count = reduceSum(equal_out) + using MPType = typename kps::details::MPTypeTrait::Type; + phi::funcs:: + ReduceKernel>( + dev_ctx, + equal_out_tensor, + equal_count, + kps::IdentityFunctor(), + reduce_dims, + false); + + // 3. dx = Div(dout, equal_out) + std::vector grad_inputs = {&equal_out_tensor, + equal_count}; + std::vector grad_outputs = {new_dx_tensor}; + funcs::BroadcastKernel( + dev_ctx, grad_inputs, &grad_outputs, 0, funcs::DivideFunctor()); + delete equal_out; + delete equal_count; +} +} // namespace phi diff --git a/paddle/phi/kernels/gpu/reduce_amin_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_amin_grad_kernel.cu new file mode 100644 index 00000000000000..152ef494b4c130 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_amin_grad_kernel.cu @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_amin_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce_amin_amax_common.h" + +namespace phi { + +template +void ReduceAMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad) { + ReduceCudaAMaxAMinGrad( + dev_ctx, x, out, out_grad, dims, keep_dim, reduce_all, x_grad); +} +} // namespace phi + +PD_REGISTER_KERNEL(amin_grad, + GPU, + ALL_LAYOUT, + phi::ReduceAMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/impl/lu_unpack_grad_kernel_impl.h b/paddle/phi/kernels/impl/lu_unpack_grad_kernel_impl.h new file mode 100644 index 00000000000000..648e12bb26a48c --- /dev/null +++ b/paddle/phi/kernels/impl/lu_unpack_grad_kernel_impl.h @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
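The grad kernel that follows assembles x_grad for lu_unpack: the strictly lower triangle of l_grad and the upper triangle (diagonal included) of u_grad are written into the leading k x k block of x_grad, with k = min(m, n). A NumPy sketch of the square case (illustrative names, not Paddle API):

    import numpy as np

    def lu_unpack_grad(l_grad, u_grad):
        # LU packs L's subdiagonal and U's upper triangle into one matrix,
        # so dX is just the two masked gradients added back together.
        dl_tril = np.tril(l_grad, k=-1)  # strictly lower, as TrilTriuCompute(-1, true)
        du_triu = np.triu(u_grad, k=0)   # upper incl. diagonal, as TrilTriuCompute(0, false)
        return dl_tril + du_triu

    print(lu_unpack_grad(np.ones((3, 3)), 2 * np.ones((3, 3))))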
+ +#pragma once + +#include "paddle/phi/kernels/impl/lu_kernel_impl.h" + +namespace phi { + +template +void LUUnpackGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& pivots, + const DenseTensor& l, + const DenseTensor& u, + const DenseTensor& pmat, + const DenseTensor& l_grad, + const DenseTensor& u_grad, + bool unpack_ludata, + bool unpack_pivots, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + + DenseTensor dl_tril, du_triu; + const auto ldims = l_grad.dims(); + dl_tril.Resize(ldims); + auto H = ldims[ldims.size() - 2]; + auto W = ldims[ldims.size() - 1]; + dev_ctx.template Alloc(&dl_tril); + auto L_dataptr = dl_tril.data(); + phi::funcs::ForRange l_for_range(dev_ctx, l_grad.numel()); + phi::funcs::TrilTriuCompute tril_computer( + l_grad.data(), -1, true, H, W, L_dataptr); + l_for_range(tril_computer); + + const auto udims = u_grad.dims(); + du_triu.Resize(udims); + H = udims[udims.size() - 2]; + W = udims[udims.size() - 1]; + dev_ctx.template Alloc(&du_triu); + auto U_dataptr = du_triu.data(); + phi::funcs::ForRange u_for_range(dev_ctx, u_grad.numel()); + phi::funcs::TrilTriuCompute triu_computer( + u_grad.data(), 0, false, H, W, U_dataptr); + u_for_range(triu_computer); + + auto xdims = x_grad->dims(); + int xrank = xdims.size(); + int64_t m = xdims[xrank - 2]; + int64_t n = xdims[xrank - 1]; + int64_t k = std::min(m, n); + + std::vector axes = {xrank - 2, xrank - 1}; + std::vector slice_starts(2, 0); + std::vector slice_ends(2, 0); + auto valuedims = vectorize(xdims); + + phi::funcs::SetConstant setter; + setter(dev_ctx, x_grad, static_cast(0)); + if (m <= n) { + slice_starts[0] = 0; + slice_starts[1] = 0; + slice_ends[0] = k; + slice_ends[1] = k; + valuedims[xrank - 2] = k; + valuedims[xrank - 1] = k; + SetValueCompute_dispatch(dev_ctx, + x_grad, + &dl_tril, + x_grad, + axes, + &slice_starts, + &slice_ends, + valuedims, + xrank); + + Tensor_Add(dev_ctx, *x_grad, du_triu, x_grad); + } else { + slice_starts[0] = 0; + slice_starts[1] = 0; + slice_ends[0] = k; + slice_ends[1] = k; + valuedims[xrank - 2] = k; + valuedims[xrank - 1] = k; + SetValueCompute_dispatch(dev_ctx, + x_grad, + &du_triu, + x_grad, + axes, + &slice_starts, + &slice_ends, + valuedims, + xrank); + + Tensor_Add(dev_ctx, *x_grad, dl_tril, x_grad); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/lu_unpack_kernel_impl.h b/paddle/phi/kernels/impl/lu_unpack_kernel_impl.h new file mode 100644 index 00000000000000..7e77fdd1719945 --- /dev/null +++ b/paddle/phi/kernels/impl/lu_unpack_kernel_impl.h @@ -0,0 +1,58 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
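For the forward kernel below, LU_Unpack splits the packed factorization into a unit-diagonal L and an upper-triangular U (narrowed to the leading k = min(m, n) block on the wide side), and Unpack_Pivot turns the pivot sequence into a permutation matrix. A NumPy sketch of those semantics, assuming 1-based sequential row-swap pivots in the LAPACK style:

    import numpy as np

    def unpack_lu(lu):
        L = np.tril(lu, k=-1) + np.eye(lu.shape[0])  # unit diagonal
        U = np.triu(lu, k=0)
        return L, U

    def unpack_pivots(pivots, m):
        # pivots[i] == j (1-based) means rows i and j-1 were swapped at step i
        P = np.eye(m)
        for i, piv in enumerate(pivots):
            P[[i, piv - 1]] = P[[piv - 1, i]]
        return P.T  # chosen so that A == P @ L @ U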
+ +#pragma once + +#include "paddle/phi/kernels/impl/lu_kernel_impl.h" + +namespace phi { + +template +void LUUnpackKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& pivots, + bool unpack_ludata, + bool unpack_pivots, + DenseTensor* pmat, + DenseTensor* l, + DenseTensor* u) { + auto xdims = x.dims(); + int xrank = xdims.size(); + int64_t m = xdims[xrank - 2]; + int64_t n = xdims[xrank - 1]; + int64_t k = std::min(m, n); + + if (unpack_ludata) { + dev_ctx.template Alloc(l); + dev_ctx.template Alloc(u); + + DenseTensor L, U; + LU_Unpack(dev_ctx, &x, &L, &U); + + if (m >= n) { + phi::Copy(dev_ctx, L, dev_ctx.GetPlace(), false, l); + Tensor_narrow(dev_ctx, &U, u, 0, k, 0, k); + } else { + phi::Copy(dev_ctx, U, dev_ctx.GetPlace(), false, u); + Tensor_narrow(dev_ctx, &L, l, 0, k, 0, k); + } + } + + if (unpack_pivots) { + dev_ctx.template Alloc(pmat); + Unpack_Pivot(dev_ctx, pivots, pmat, m, k); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/kps/reduce_amax_kernel.cu b/paddle/phi/kernels/kps/reduce_amax_kernel.cu new file mode 100644 index 00000000000000..57197fd9d5b8a2 --- /dev/null +++ b/paddle/phi/kernels/kps/reduce_amax_kernel.cu @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/reduce_amin_kernel.h" + +namespace phi { + +template +void AMaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(amax_raw, KPS, ALL_LAYOUT, phi::AMaxRawKernel, float) {} +#else +PD_REGISTER_KERNEL(amax_raw, + KPS, + ALL_LAYOUT, + phi::AMaxRawKernel, + float, + double, + int, + int64_t) {} +#endif diff --git a/paddle/phi/kernels/kps/reduce_amin_kernel.cu b/paddle/phi/kernels/kps/reduce_amin_kernel.cu new file mode 100644 index 00000000000000..230adcc8294418 --- /dev/null +++ b/paddle/phi/kernels/kps/reduce_amin_kernel.cu @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
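As with amax above, amin is registered in a raw variant that still carries the legacy reduce_all attribute; the wrapper kernels later in this patch fix reduce_all to false, and the final-state Python API expresses a full reduction by listing every axis instead (see the math.py hunk at the end). Roughly:

    import paddle

    x = paddle.rand([2, 3, 4])
    # reduce_all in the legacy op corresponds to reducing over every axis
    assert float(paddle.amin(x)) == float(paddle.amin(x, axis=[0, 1, 2]))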
+ +#include "paddle/phi/kernels/reduce_amin_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void AMinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(amin_raw, KPS, ALL_LAYOUT, phi::AMinRawKernel, float) {} +#else +PD_REGISTER_KERNEL(amin_raw, + KPS, + ALL_LAYOUT, + phi::AMinRawKernel, + float, + double, + int, + int64_t) {} +#endif diff --git a/paddle/phi/kernels/lu_unpack_grad_kernel.h b/paddle/phi/kernels/lu_unpack_grad_kernel.h new file mode 100644 index 00000000000000..056f2096d96e58 --- /dev/null +++ b/paddle/phi/kernels/lu_unpack_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LUUnpackGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& pivots, + const DenseTensor& l, + const DenseTensor& u, + const DenseTensor& pmat, + const DenseTensor& l_grad, + const DenseTensor& u_grad, + bool unpack_ludata, + bool unpack_pivots, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/lu_unpack_kernel.h b/paddle/phi/kernels/lu_unpack_kernel.h new file mode 100644 index 00000000000000..48acc3cc566ebf --- /dev/null +++ b/paddle/phi/kernels/lu_unpack_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LUUnpackKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& pivots, + bool unpack_ludata, + bool unpack_pivots, + DenseTensor* pmat, + DenseTensor* l, + DenseTensor* u); + +} // namespace phi diff --git a/paddle/phi/kernels/prior_box_kernel.h b/paddle/phi/kernels/prior_box_kernel.h new file mode 100644 index 00000000000000..7a25b7d8e6d463 --- /dev/null +++ b/paddle/phi/kernels/prior_box_kernel.h @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PriorBoxKernel(const Context& ctx, + const DenseTensor& input, + const DenseTensor& image, + const std::vector& min_sizes, + const std::vector& aspect_ratios, + const std::vector& variances, + const std::vector& max_sizes, + bool flip, + bool clip, + float step_w, + float step_h, + float offset, + bool min_max_aspect_ratios_order, + DenseTensor* out, + DenseTensor* var); + +inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, + bool flip, + std::vector* output_aspect_ratior) { + constexpr float epsilon = 1e-6; + output_aspect_ratior->clear(); + output_aspect_ratior->push_back(1.0f); + for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { + float ar = input_aspect_ratior[i]; + bool already_exist = false; + for (size_t j = 0; j < output_aspect_ratior->size(); ++j) { + if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) { + already_exist = true; + break; + } + } + if (!already_exist) { + output_aspect_ratior->push_back(ar); + if (flip) { + output_aspect_ratior->push_back(1.0f / ar); + } + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_amax_grad_kernel.h b/paddle/phi/kernels/reduce_amax_grad_kernel.h new file mode 100644 index 00000000000000..82518c11675c39 --- /dev/null +++ b/paddle/phi/kernels/reduce_amax_grad_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ReduceAMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_amax_kernel.cc b/paddle/phi/kernels/reduce_amax_kernel.cc new file mode 100644 index 00000000000000..acec25d83db6a2 --- /dev/null +++ b/paddle/phi/kernels/reduce_amax_kernel.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_amax_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void AMaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + AMaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + amax, CPU, ALL_LAYOUT, phi::AMaxKernel, float, double, int, int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL( + amax, GPU, ALL_LAYOUT, phi::AMaxKernel, float, double, int, int64_t) {} +#endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(amax, KPS, ALL_LAYOUT, phi::AMaxKernel, float) {} +#endif diff --git a/paddle/phi/kernels/reduce_amax_kernel.h b/paddle/phi/kernels/reduce_amax_kernel.h new file mode 100644 index 00000000000000..79a287b4871364 --- /dev/null +++ b/paddle/phi/kernels/reduce_amax_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AMaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AMaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_amin_grad_kernel.h b/paddle/phi/kernels/reduce_amin_grad_kernel.h new file mode 100644 index 00000000000000..96f157e2038628 --- /dev/null +++ b/paddle/phi/kernels/reduce_amin_grad_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
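The grad-kernel headers here are what make amax/amin differ from max/min under autograd: equal extrema share the gradient instead of one element taking it all. A quick dygraph check (the values follow from the even split implemented in ReduceCudaAMaxAMinGrad):

    import paddle

    x = paddle.to_tensor([1., 2., 2.], stop_gradient=False)
    paddle.amax(x).backward()
    print(x.grad)  # [0., 0.5, 0.5] -- the two tied maxima split the gradient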
+ +#pragma once + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ReduceAMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_amin_kernel.cc b/paddle/phi/kernels/reduce_amin_kernel.cc new file mode 100644 index 00000000000000..28e6e587f40201 --- /dev/null +++ b/paddle/phi/kernels/reduce_amin_kernel.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_amin_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void AMinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + AMinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + amin, CPU, ALL_LAYOUT, phi::AMinKernel, float, double, int, int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL( + amin, GPU, ALL_LAYOUT, phi::AMinKernel, float, double, int, int64_t) {} +#endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(amin, KPS, ALL_LAYOUT, phi::AMinKernel, float) {} +#endif diff --git a/paddle/phi/kernels/reduce_amin_kernel.h b/paddle/phi/kernels/reduce_amin_kernel.h new file mode 100644 index 00000000000000..b36351dd5258f8 --- /dev/null +++ b/paddle/phi/kernels/reduce_amin_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AMinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AMinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/lu_unpack_sig.cc b/paddle/phi/ops/compat/lu_unpack_sig.cc new file mode 100644 index 00000000000000..8baafe4fcb23ac --- /dev/null +++ b/paddle/phi/ops/compat/lu_unpack_sig.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LUUnpackOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("lu_unpack", + {"X", "Pivots"}, + {"unpack_ludata", "unpack_pivots"}, + {"Pmat", "L", "U"}); +} + +KernelSignature LUUnpackGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("lu_unpack_grad", + {"X", "Pivots", "L", "U", "Pmat", "L@GRAD", "U@GRAD"}, + {"unpack_ludata", "unpack_pivots"}, + {"X@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(lu_unpack, phi::LUUnpackOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lu_unpack_grad, phi::LUUnpackGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/prior_box_sig.cc b/paddle/phi/ops/compat/prior_box_sig.cc new file mode 100644 index 00000000000000..5d4cd5164305f8 --- /dev/null +++ b/paddle/phi/ops/compat/prior_box_sig.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
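The lu_unpack mapping above wires the legacy op's X/Pivots inputs and Pmat/L/U outputs to the new phi kernels. End to end, the Python API touched by this patch behaves as follows (factor, unpack, recompose):

    import paddle

    x = paddle.to_tensor([[4., 3.], [6., 3.]])
    lu, pivots = paddle.linalg.lu(x)
    P, L, U = paddle.linalg.lu_unpack(lu, pivots)
    # as the linalg.py docstring notes, one can verify X = P @ L @ U
    print(paddle.allclose(x, P @ L @ U))  # True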
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PriorBoxOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("prior_box", + {"Input", "Image"}, + {"min_sizes", + "aspect_ratios", + "variances", + "max_sizes", + "flip", + "clip", + "step_w", + "step_h", + "offset", + "min_max_aspect_ratios_order"}, + {"Boxes", "Variances"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(prior_box, phi::PriorBoxOpArgumentMapping); diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index a0ba07f5e8e2cf..e796307c0c9b3a 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -83,6 +83,22 @@ KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("unregistered", {}, {}, {}); } +KernelSignature ReduceAMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "max_raw" KernelSignature. + // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // the "max_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "amax_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("amax", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + KernelSignature ReduceMinOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("X")) { bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); @@ -99,6 +115,22 @@ KernelSignature ReduceMinOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("unregistered", {}, {}, {}); } +KernelSignature ReduceAMinOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "min_raw" KernelSignature. + // And the InferMeta function(i.e. 
ReduceInferMetaBase) is accordance with + // the "min_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "amin_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("amin", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + KernelSignature ReduceAnyOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("X")) { bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); @@ -151,6 +183,14 @@ KernelSignature ReduceMaxGradOpArgumentMapping( {"X@GRAD"}); } +KernelSignature ReduceAMaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("amax_grad", + {"X", "Out", "Out@GRAD"}, + {"dim", "keep_dim", "reduce_all"}, + {"X@GRAD"}); +} + KernelSignature ReduceMinGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("min_grad", @@ -159,6 +199,14 @@ KernelSignature ReduceMinGradOpArgumentMapping( {"X@GRAD"}); } +KernelSignature ReduceAMinGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("amin_grad", + {"X", "Out", "Out@GRAD"}, + {"dim", "keep_dim", "reduce_all"}, + {"X@GRAD"}); +} + KernelSignature ReduceProdGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("prod_grad", @@ -173,6 +221,8 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max); PD_REGISTER_BASE_KERNEL_NAME(reduce_min, min); +PD_REGISTER_BASE_KERNEL_NAME(reduce_amax, amax); +PD_REGISTER_BASE_KERNEL_NAME(reduce_amin, amin); PD_REGISTER_BASE_KERNEL_NAME(reduce_prod, prod); PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all); PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any); @@ -182,12 +232,16 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_mean_grad, mean_grad); PD_REGISTER_BASE_KERNEL_NAME(reduce_prod_grad, prod_grad); PD_REGISTER_BASE_KERNEL_NAME(reduce_max_grad, max_grad); PD_REGISTER_BASE_KERNEL_NAME(reduce_min_grad, min_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_amax_grad, amax_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_amin_grad, amin_grad); PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_max, phi::ReduceMaxOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_amax, phi::ReduceAMaxOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_min, phi::ReduceMinOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_amin, phi::ReduceAMinOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_all, phi::ReduceAllOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_any, phi::ReduceAnyOpArgumentMapping); @@ -199,5 +253,9 @@ PD_REGISTER_ARG_MAPPING_FN(reduce_prod_grad, phi::ReduceProdGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_max_grad, phi::ReduceMaxGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_amax_grad, + phi::ReduceAMaxGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_min_grad, phi::ReduceMinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_amin_grad, + phi::ReduceAMinGradOpArgumentMapping); diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index f8691cc156f940..3540f69c049739 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -22,7 +22,7 @@ from 
.layer_function_generator import generate_layer_fn from .layer_function_generator import autodoc, templatedoc from ..layer_helper import LayerHelper -from ..framework import Variable, _non_static_mode, static_only +from ..framework import Variable, _non_static_mode, static_only, in_dygraph_mode from .. import core from .loss import softmax_with_cross_entropy from . import tensor @@ -1794,18 +1794,20 @@ def __reshape_to_2d(var): return loss -def prior_box(input, - image, - min_sizes, - max_sizes=None, - aspect_ratios=[1.], - variance=[0.1, 0.1, 0.2, 0.2], - flip=False, - clip=False, - steps=[0.0, 0.0], - offset=0.5, - name=None, - min_max_aspect_ratios_order=False): +def prior_box( + input, + image, + min_sizes, + max_sizes=None, + aspect_ratios=[1.], + variance=[0.1, 0.1, 0.2, 0.2], + flip=False, + clip=False, + steps=[0.0, 0.0], + offset=0.5, + name=None, + min_max_aspect_ratios_order=False, +): """ This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm. @@ -1905,6 +1907,15 @@ def prior_box(input, # [6L, 9L, 1L, 4L] """ + + if in_dygraph_mode(): + step_w, step_h = steps + if max_sizes is None: + max_sizes = [] + return _C_ops.final_state_prior_box(input, image, min_sizes, + aspect_ratios, variance, max_sizes, + flip, clip, step_w, step_h, offset, + min_max_aspect_ratios_order) helper = LayerHelper("prior_box", **locals()) dtype = helper.input_dtype() check_variable_and_dtype(input, 'input', diff --git a/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py b/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py index 97773c70e177a4..246587fba71515 100644 --- a/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py +++ b/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py @@ -120,6 +120,8 @@ def set_output(self, A): def setUp(self): self.op_type = "lu_unpack" + self.python_api = paddle.tensor.linalg.lu_unpack + self.python_out_sig = ["Pmat", "L", "U"] self.config() x = np.random.random(self.x_shape).astype(self.dtype) if paddle.in_dynamic_mode(): @@ -156,10 +158,10 @@ def setUp(self): } def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], ['L', 'U']) + self.check_grad(['X'], ['L', 'U'], check_eager=True) # m = n diff --git a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py index 651d9b5ea6860b..bdcc302de548f8 100644 --- a/python/paddle/fluid/tests/unittests/test_pairwise_distance.py +++ b/python/paddle/fluid/tests/unittests/test_pairwise_distance.py @@ -20,24 +20,64 @@ import unittest -def pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False): - return np.linalg.norm(x - y, ord=p, axis=1, keepdims=keepdim) +def np_pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False): + distance = np.linalg.norm(x - y + epsilon, ord=p, axis=-1, keepdims=keepdim) + # Paddle does not support 0-d Tensors yet, so even if keepdim is False + # and neither x nor y is batched, a Tensor of shape (1,) is returned + if distance.ndim == 0: + distance = np.expand_dims(distance, axis=0) + return distance -def test_static(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False): +def call_pairwise_distance_layer(x, y, p=2., epsilon=1e-6, keepdim=False): + pairwise_distance = paddle.nn.PairwiseDistance(p=p, + epsilon=epsilon, + keepdim=keepdim) + distance = pairwise_distance(x=x, y=y) + return distance + + +def call_pairwise_distance_functional(x, + y, + p=2., + epsilon=1e-6, + keepdim=False): + 
distance = paddle.nn.functional.pairwise_distance(x=x, + y=y, + p=p, + epsilon=epsilon, + keepdim=keepdim) + return distance + + +def test_static(place, + x_np, + y_np, + p=2.0, + epsilon=1e-6, + keepdim=False, + functional=False): prog = paddle.static.Program() startup_prog = paddle.static.Program() - place = fluid.CUDAPlace( 0) if paddle.fluid.core.is_compiled_with_cuda() else fluid.CPUPlace() - + paddle.enable_static() with paddle.static.program_guard(prog, startup_prog): x = paddle.fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) y = paddle.fluid.data(name='y', shape=y_np.shape, dtype=x_np.dtype) - dist = paddle.nn.layer.distance.PairwiseDistance(p=p, + + if functional: + distance = call_pairwise_distance_functional(x=x, + y=y, + p=p, epsilon=epsilon, keepdim=keepdim) - distance = dist(x, y) + else: + distance = call_pairwise_distance_layer(x=x, + y=y, + p=p, + epsilon=epsilon, + keepdim=keepdim) exe = paddle.static.Executor(place) static_ret = exe.run(prog, feed={ @@ -46,69 +86,279 @@ def test_static(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False): }, fetch_list=[distance]) static_ret = static_ret[0] + paddle.disable_static() return static_ret -def test_dygraph(x_np, y_np, p=2.0, epsilon=1e-6, keepdim=False): - paddle.disable_static() +def test_dygraph(place, + x_np, + y_np, + p=2.0, + epsilon=1e-6, + keepdim=False, + functional=False): x = paddle.to_tensor(x_np) y = paddle.to_tensor(y_np) - dist = paddle.nn.layer.distance.PairwiseDistance(p=p, - epsilon=epsilon, - keepdim=keepdim) - distance = dist(x, y) - dygraph_ret = distance.numpy() - paddle.enable_static() + if functional: + dy_distance = call_pairwise_distance_functional(x=x, + y=y, + p=p, + epsilon=epsilon, + keepdim=keepdim) + else: + dy_distance = call_pairwise_distance_layer(x=x, + y=y, + p=p, + epsilon=epsilon, + keepdim=keepdim) + dygraph_ret = dy_distance.numpy() return dygraph_ret +def test_legacy_dygraph(place, + x_np, + y_np, + p=2.0, + epsilon=1e-6, + keepdim=False, + functional=False): + paddle.fluid.framework._enable_legacy_dygraph() + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) + if functional: + legacy_distance = call_pairwise_distance_functional(x=x, + y=y, + p=p, + epsilon=epsilon, + keepdim=keepdim) + else: + legacy_distance = call_pairwise_distance_layer(x=x, + y=y, + p=p, + epsilon=epsilon, + keepdim=keepdim) + legacy_ret = legacy_distance.numpy() + paddle.fluid.framework._disable_legacy_dygraph() + return legacy_ret + + class TestPairwiseDistance(unittest.TestCase): def test_pairwise_distance(self): - all_shape = [[100, 100], [4, 5, 6, 7]] + epsilon = 1e-6 + all_shape = [[5], [100, 100]] dtypes = ['float32', 'float64'] + p_list = [-1, 0, 1, 2, np.inf, -np.inf] + places = [paddle.CPUPlace()] + if paddle.device.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) keeps = [False, True] - for shape in all_shape: - for dtype in dtypes: - for keepdim in keeps: - x_np = np.random.random(shape).astype(dtype) - y_np = np.random.random(shape).astype(dtype) - - static_ret = test_static(x_np, y_np, keepdim=keepdim) - dygraph_ret = test_dygraph(x_np, y_np, keepdim=keepdim) - excepted_value = pairwise_distance(x_np, + for place in places: + for shape in all_shape: + for dtype in dtypes: + for p in p_list: + for keepdim in keeps: + x_np = np.random.random(shape).astype(dtype) + y_np = np.random.random(shape).astype(dtype) + + static_ret = test_static(place, + x_np, + y_np, + p, + epsilon=epsilon, + keepdim=keepdim) + dygraph_ret = test_dygraph(place, + x_np, y_np, + p, + epsilon=epsilon, 
keepdim=keepdim) + legacy_ret = test_legacy_dygraph(place, + x_np, + y_np, + p, + epsilon=epsilon, + keepdim=keepdim) + excepted_value = np_pairwise_distance( + x_np, y_np, p, epsilon=epsilon, keepdim=keepdim) + + self.assertEqual(static_ret.shape, + excepted_value.shape) + self.assertEqual(dygraph_ret.shape, + excepted_value.shape) + self.assertEqual(legacy_ret.shape, + excepted_value.shape) + + self.assertTrue( + np.allclose(static_ret, excepted_value)) + self.assertTrue( + np.allclose(dygraph_ret, excepted_value)) + self.assertTrue( + np.allclose(legacy_ret, excepted_value)) + + static_functional_ret = test_static(place, + x_np, + y_np, + p, + epsilon=epsilon, + keepdim=keepdim) + dygraph_functional_ret = test_dygraph( + place, + x_np, + y_np, + p, + epsilon=epsilon, + keepdim=keepdim) + legacy_functional_ret = test_legacy_dygraph( + place, + x_np, + y_np, + p, + epsilon=epsilon, + keepdim=keepdim) - self.assertTrue(np.allclose(static_ret, dygraph_ret)) - self.assertTrue(np.allclose(static_ret, excepted_value)) - self.assertTrue(np.allclose(dygraph_ret, excepted_value)) + self.assertEqual(static_functional_ret.shape, + excepted_value.shape) + self.assertEqual(dygraph_functional_ret.shape, + excepted_value.shape) + self.assertEqual(legacy_functional_ret.shape, + excepted_value.shape) - def test_pairwise_distance_broadcast(self): + self.assertTrue( + np.allclose(static_functional_ret, + excepted_value)) + self.assertTrue( + np.allclose(dygraph_functional_ret, + excepted_value)) + self.assertTrue( + np.allclose(legacy_functional_ret, + excepted_value)) + + def test_pairwise_distance_broadcast_1(self): shape_x = [100, 100] shape_y = [100, 1] + epsilon = 1e-6 keepdim = False + place = paddle.CPUPlace() x_np = np.random.random(shape_x).astype('float32') y_np = np.random.random(shape_y).astype('float32') - static_ret = test_static(x_np, y_np, keepdim=keepdim) - dygraph_ret = test_dygraph(x_np, y_np, keepdim=keepdim) - excepted_value = pairwise_distance(x_np, y_np, keepdim=keepdim) - self.assertTrue(np.allclose(static_ret, dygraph_ret)) + static_ret = test_static(place=place, + x_np=x_np, + y_np=y_np, + epsilon=epsilon, + keepdim=keepdim) + dygraph_ret = test_dygraph(place=place, + x_np=x_np, + y_np=y_np, + epsilon=epsilon, + keepdim=keepdim) + legacy_ret = test_legacy_dygraph(place=place, + x_np=x_np, + y_np=y_np, + epsilon=epsilon, + keepdim=keepdim) + excepted_value = np_pairwise_distance(x_np, + y_np, + epsilon=epsilon, + keepdim=keepdim) + + self.assertEqual(static_ret.shape, excepted_value.shape) + self.assertEqual(dygraph_ret.shape, excepted_value.shape) + self.assertEqual(legacy_ret.shape, excepted_value.shape) + self.assertTrue(np.allclose(static_ret, excepted_value)) self.assertTrue(np.allclose(dygraph_ret, excepted_value)) + self.assertTrue(np.allclose(legacy_ret, excepted_value)) + + static_functional_ret = test_static(place=place, + x_np=x_np, + y_np=y_np, + epsilon=epsilon, + keepdim=keepdim, + functional=True) + dygraph_functional_ret = test_dygraph(place=place, + x_np=x_np, + y_np=y_np, + epsilon=epsilon, + keepdim=keepdim, + functional=True) + legacy_functional_ret = test_legacy_dygraph(place=place, + x_np=x_np, + y_np=y_np, + epsilon=epsilon, + keepdim=keepdim, + functional=True) + + self.assertEqual(static_functional_ret.shape, excepted_value.shape) + self.assertEqual(dygraph_functional_ret.shape, excepted_value.shape) + self.assertEqual(legacy_functional_ret.shape, excepted_value.shape) - def test_pairwise_distance_different_p(self): - shape = [100, 100] + 
self.assertTrue(np.allclose(static_functional_ret, excepted_value)) + self.assertTrue(np.allclose(dygraph_functional_ret, excepted_value)) + self.assertTrue(np.allclose(legacy_functional_ret, excepted_value)) + + def test_pairwise_distance_broadcast_2(self): + shape_x = [100, 100] + shape_y = [100] + epsilon = 1e-6 keepdim = False - p = 3.0 - x_np = np.random.random(shape).astype('float32') - y_np = np.random.random(shape).astype('float32') - static_ret = test_static(x_np, y_np, p=p, keepdim=keepdim) - dygraph_ret = test_dygraph(x_np, y_np, p=p, keepdim=keepdim) - excepted_value = pairwise_distance(x_np, y_np, p=p, keepdim=keepdim) - self.assertTrue(np.allclose(static_ret, dygraph_ret)) + place = paddle.CPUPlace() + x_np = np.random.random(shape_x).astype('float32') + y_np = np.random.random(shape_y).astype('float32') + static_ret = test_static(place=place, + x_np=x_np, + y_np=y_np, + epsilon=epsilon, + keepdim=keepdim) + dygraph_ret = test_dygraph(place=place, + x_np=x_np, + y_np=y_np, + epsilon=epsilon, + keepdim=keepdim) + legacy_ret = test_legacy_dygraph(place=place, + x_np=x_np, + y_np=y_np, + epsilon=epsilon, + keepdim=keepdim) + excepted_value = np_pairwise_distance(x_np, + y_np, + epsilon=epsilon, + keepdim=keepdim) + + self.assertEqual(static_ret.shape, excepted_value.shape) + self.assertEqual(dygraph_ret.shape, excepted_value.shape) + self.assertEqual(legacy_ret.shape, excepted_value.shape) + self.assertTrue(np.allclose(static_ret, excepted_value)) self.assertTrue(np.allclose(dygraph_ret, excepted_value)) + self.assertTrue(np.allclose(legacy_ret, excepted_value)) + + static_functional_ret = test_static(place=place, + x_np=x_np, + y_np=y_np, + epsilon=epsilon, + keepdim=keepdim, + functional=True) + dygraph_functional_ret = test_dygraph(place=place, + x_np=x_np, + y_np=y_np, + epsilon=epsilon, + keepdim=keepdim, + functional=True) + legacy_functional_ret = test_legacy_dygraph(place=place, + x_np=x_np, + y_np=y_np, + epsilon=epsilon, + keepdim=keepdim, + functional=True) + + self.assertEqual(static_functional_ret.shape, excepted_value.shape) + self.assertEqual(dygraph_functional_ret.shape, excepted_value.shape) + self.assertEqual(legacy_functional_ret.shape, excepted_value.shape) + + self.assertTrue(np.allclose(static_functional_ret, excepted_value)) + self.assertTrue(np.allclose(dygraph_functional_ret, excepted_value)) + self.assertTrue(np.allclose(legacy_functional_ret, excepted_value)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_prior_box_op.py index b0aaaec246f676..0b57e8d00f761e 100644 --- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py @@ -19,6 +19,35 @@ import sys import math from op_test import OpTest +import paddle + + +def python_prior_box(input, + image, + min_sizes, + aspect_ratios=[1.], + variances=[0.1, 0.1, 0.2, 0.2], + max_sizes=None, + flip=False, + clip=False, + step_w=0, + step_h=0, + offset=0.5, + min_max_aspect_ratios_order=False, + name=None): + return paddle.fluid.layers.detection.prior_box( + input, + image, + min_sizes=min_sizes, + max_sizes=max_sizes, + aspect_ratios=aspect_ratios, + variance=variances, + flip=flip, + clip=clip, + steps=[step_w, step_h], + offset=offset, + name=name, + min_max_aspect_ratios_order=min_max_aspect_ratios_order) class TestPriorBoxOp(OpTest): @@ -35,10 +64,10 @@ def set_data(self): 'variances': self.variances, 'flip': self.flip, 'clip': self.clip, - 
'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order, 'step_w': self.step_w, 'step_h': self.step_h, - 'offset': self.offset + 'offset': self.offset, + 'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order, } if len(self.max_sizes) > 0: self.attrs['max_sizes'] = self.max_sizes @@ -46,10 +75,11 @@ def set_data(self): self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def setUp(self): self.op_type = "prior_box" + self.python_api = python_prior_box self.set_data() def set_max_sizes(self): @@ -191,4 +221,5 @@ def set_min_max_aspect_ratios_order(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index b5d2d6f5beb9de..701997e0d0ab59 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -69,6 +69,7 @@ from .conv import conv2d_transpose # noqa: F401 from .conv import conv3d # noqa: F401 from .conv import conv3d_transpose # noqa: F401 +from .distance import pairwise_distance # noqa: F401 from .extension import diag_embed # noqa: F401 from .extension import sequence_mask from .loss import binary_cross_entropy # noqa: F401 @@ -137,6 +138,7 @@ 'conv2d_transpose', 'conv3d', 'conv3d_transpose', + 'pairwise_distance', 'elu', 'elu_', 'gelu', diff --git a/python/paddle/nn/functional/distance.py b/python/paddle/nn/functional/distance.py new file mode 100644 index 00000000000000..8c672ffc69fd2f --- /dev/null +++ b/python/paddle/nn/functional/distance.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from ...fluid.data_feeder import check_variable_and_dtype, check_type +from ...fluid.layer_helper import LayerHelper +from paddle import _C_ops +from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph + +__all__ = [] + + +def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None): + r""" + It computes the pairwise distance between two vectors. The + distance is calculated by p-order norm: + + .. math:: + + \Vert x \Vert _p = \left( \sum_{i=1}^n \vert x_i \vert ^ p \right) ^ {1/p}. + + Parameters: + x (Tensor): Tensor, shape is :math:`[N, D]` or :math:`[D]`, where :math:`N` + is batch size, :math:`D` is the dimension of vector. Available dtype is + float32, float64. + y (Tensor): Tensor, shape is :math:`[N, D]` or :math:`[D]`, where :math:`N` + is batch size, :math:`D` is the dimension of vector. Available dtype is + float32, float64. + p (float, optional): The order of norm. Default: :math:`2.0`. + epsilon (float, optional): Add small value to avoid division by zero. + Default: :math:`1e-6`. + keepdim (bool, optional): Whether to reserve the reduced dimension + in the output Tensor. The result tensor is one dimension less than + the result of ``|x-y|`` unless :attr:`keepdim` is True. Default: False. 
+ name (str, optional): For details, please refer to :ref:`api_guide_Name`. + Generally, no setting is required. Default: None. + + Returns: + Tensor, the dtype is the same as the input tensor. + - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, + depending on whether the input has data shaped as :math:`[N, D]`. + - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, + depending on whether the input has data shaped as :math:`[N, D]`. + + Examples: + .. code-block:: python + + import paddle + x = paddle.to_tensor([[1., 3.], [3., 5.]], dtype=paddle.float64) + y = paddle.to_tensor([[5., 6.], [7., 8.]], dtype=paddle.float64) + distance = paddle.nn.functional.pairwise_distance(x, y) + print(distance.numpy()) # [5. 5.] + + """ + check_type(p, 'porder', (float, int), 'PairwiseDistance') + check_type(epsilon, 'epsilon', (float), 'PairwiseDistance') + check_type(keepdim, 'keepdim', (bool), 'PairwiseDistance') + if in_dygraph_mode(): + sub = _C_ops.elementwise_sub(x, y) + # The p_norm op does not use epsilon, so add it to the input instead. + if epsilon != 0.0: + epsilon = paddle.fluid.dygraph.base.to_variable([epsilon], + dtype=sub.dtype) + sub = _C_ops.elementwise_add(sub, epsilon) + return _C_ops.final_state_p_norm(sub, p, -1, 0., keepdim, False) + + if _in_legacy_dygraph(): + sub = _C_ops.elementwise_sub(x, y) + if epsilon != 0.0: + epsilon = paddle.fluid.dygraph.base.to_variable([epsilon], + dtype=sub.dtype) + sub = _C_ops.elementwise_add(sub, epsilon) + return _C_ops.p_norm(sub, 'axis', -1, 'porder', p, 'keepdim', keepdim, + 'epsilon', 0.) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'PairwiseDistance') + check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'PairwiseDistance') + sub = paddle.subtract(x, y) + if epsilon != 0.0: + epsilon_var = paddle.full(shape=[1], + fill_value=epsilon, + dtype=sub.dtype) + sub = paddle.add(sub, epsilon_var) + helper = LayerHelper("PairwiseDistance", name=name) + attrs = { + 'axis': -1, + 'porder': p, + 'keepdim': keepdim, + 'epsilon': 0., + } + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op(type='p_norm', + inputs={'X': sub}, + outputs={'Out': out}, + attrs=attrs) + + return out diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py index 7c08e358fcc765..a7a488c833d7ff 100644 --- a/python/paddle/nn/layer/distance.py +++ b/python/paddle/nn/layer/distance.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,22 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np - -import paddle from .. import Layer -from ...fluid.data_feeder import check_variable_and_dtype, check_type -from ...fluid.layer_helper import LayerHelper -from paddle import _C_ops -from paddle import in_dynamic_mode -from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph +from .. import functional as F __all__ = [] class PairwiseDistance(Layer): r""" - This operator computes the pairwise distance between two vectors. The + It computes the pairwise distance between two vectors. The distance is calculated by p-order norm: .. 
diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py
index 7c08e358fcc765..a7a488c833d7ff 100644
--- a/python/paddle/nn/layer/distance.py
+++ b/python/paddle/nn/layer/distance.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,22 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
-
-import paddle
 from .. import Layer
-from ...fluid.data_feeder import check_variable_and_dtype, check_type
-from ...fluid.layer_helper import LayerHelper
-from paddle import _C_ops
-from paddle import in_dynamic_mode
-from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph
+from .. import functional as F
 
 __all__ = []
 
 
 class PairwiseDistance(Layer):
     r"""
-    This operator computes the pairwise distance between two vectors. The
+    It computes the pairwise distance between two vectors. The
     distance is calculated by the p-order norm:
 
     .. math::
@@ -35,33 +28,31 @@ class PairwiseDistance(Layer):
         \Vert x \Vert _p = \left( \sum_{i=1}^n \vert x_i \vert ^ p \right) ^ {1/p}.
 
     Parameters:
-        p (float): The order of norm. The default value is 2.
-        epsilon (float, optional): Add small value to avoid division by zero,
-            default value is 1e-6.
+        p (float, optional): The order of norm. Default: :math:`2.0`.
+        epsilon (float, optional): Add a small value to avoid division by zero.
+            Default: :math:`1e-6`.
         keepdim (bool, optional): Whether to reserve the reduced dimension
             in the output Tensor. The result tensor is one dimension less than
-            the result of ``'x-y'`` unless :attr:`keepdim` is True, default
-            value is False.
-        name (str, optional): Name for the operation (optional, default is None).
-            For more information, please refer to :ref:`api_guide_Name`.
+            the result of ``|x-y|`` unless :attr:`keepdim` is True. Default: False.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`.
+            Generally, no setting is required. Default: None.
 
     Shape:
-        x: :math:`[N, D]` where `D` is the dimension of vector, available dtype
-            is float32, float64.
-        y: :math:`[N, D]`, y have the same shape and dtype as x.
-        out: :math:`[N]`. If :attr:`keepdim` is ``True``, the out shape is :math:`[N, 1]`.
-            The same dtype as input tensor.
+        x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D`
+            is the dimension of the data. Available data type is float32, float64.
+        y: :math:`[N, D]` or :math:`[D]`, y has the same dtype as x.
+        output: The same dtype as the input tensors.
+            - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
+              depending on whether the inputs are shaped as :math:`[N, D]` or :math:`[D]`.
+            - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
+              depending on whether the inputs are shaped as :math:`[N, D]` or :math:`[D]`.
 
     Examples:
         .. code-block:: python
 
             import paddle
-            import numpy as np
-            paddle.disable_static()
-            x_np = np.array([[1., 3.], [3., 5.]]).astype(np.float64)
-            y_np = np.array([[5., 6.], [7., 8.]]).astype(np.float64)
-            x = paddle.to_tensor(x_np)
-            y = paddle.to_tensor(y_np)
+            x = paddle.to_tensor([[1., 3.], [3., 5.]], dtype=paddle.float64)
+            y = paddle.to_tensor([[5., 6.], [7., 8.]], dtype=paddle.float64)
             dist = paddle.nn.PairwiseDistance()
             distance = dist(x, y)
             print(distance.numpy())  # [5. 5.]
@@ -74,41 +65,11 @@ def __init__(self, p=2., epsilon=1e-6, keepdim=False, name=None):
         self.epsilon = epsilon
         self.keepdim = keepdim
         self.name = name
-        check_type(self.p, 'porder', (float, int), 'PairwiseDistance')
-        check_type(self.epsilon, 'epsilon', (float), 'PairwiseDistance')
-        check_type(self.keepdim, 'keepdim', (bool), 'PairwiseDistance')
 
     def forward(self, x, y):
-        if in_dygraph_mode():
-            sub = _C_ops.elementwise_sub(x, y)
-            return _C_ops.final_state_p_norm(sub, self.p, 1, self.epsilon,
-                                             self.keepdim, False)
-
-        if _in_legacy_dygraph():
-            sub = _C_ops.elementwise_sub(x, y)
-            return _C_ops.p_norm(sub, 'axis', 1, 'porder', self.p, 'keepdim',
-                                 self.keepdim, 'epsilon', self.epsilon)
-
-        check_variable_and_dtype(x, 'x', ['float32', 'float64'],
-                                 'PairwiseDistance')
-        check_variable_and_dtype(y, 'y', ['float32', 'float64'],
-                                 'PairwiseDistance')
-        sub = paddle.subtract(x, y)
-
-        helper = LayerHelper("PairwiseDistance", name=self.name)
-        attrs = {
-            'axis': 1,
-            'porder': self.p,
-            'keepdim': self.keepdim,
-            'epsilon': self.epsilon,
-        }
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-        helper.append_op(type='p_norm',
-                         inputs={'X': sub},
-                         outputs={'Out': out},
-                         attrs=attrs)
-        return out
+        return F.pairwise_distance(x, y, self.p, self.epsilon, self.keepdim,
+                                   self.name)
 
     def extra_repr(self):
         main_str = 'p={p}'
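
With forward reduced to a single call into the functional API, the layer and the function are interchangeable. A quick sanity check, again assuming the patch is applied:

    import paddle
    import paddle.nn.functional as F

    x = paddle.to_tensor([[1., 3.], [3., 5.]], dtype=paddle.float64)
    y = paddle.to_tensor([[5., 6.], [7., 8.]], dtype=paddle.float64)

    # The layer only stores p/epsilon/keepdim/name and forwards them,
    # so both paths must agree.
    layer_out = paddle.nn.PairwiseDistance(p=2.)(x, y)
    func_out = F.pairwise_distance(x, y, p=2.)
    print(bool(paddle.allclose(layer_out, func_out)))  # True
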
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 8f6a4f8e1122ab..23720b1921922e 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -2200,6 +2200,11 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None):
 
             # one can verify : X = P @ L @ U ;
     """
+    if in_dygraph_mode():
+        P, L, U = _C_ops.final_state_lu_unpack(x, y, unpack_ludata,
+                                               unpack_pivots)
+        return P, L, U
+
     if paddle.in_dynamic_mode():
         P, L, U = _C_ops.lu_unpack(x, y, 'unpack_ludata', unpack_ludata,
                                    'unpack_pivots', unpack_pivots)
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index c85d9226e67bd8..f70c9a0c410116 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -29,7 +29,7 @@
 
 import paddle
 from ..static import Variable
-from ..framework import core, in_dygraph_mode, _non_static_mode, LayerHelper
+from ..framework import core, in_dygraph_mode, _non_static_mode, LayerHelper, _in_legacy_dygraph
 from ..fluid.framework import _in_legacy_dygraph
 from ..framework import _varbase_creator, convert_np_dtype_to_dtype_
 from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
@@ -2334,7 +2334,11 @@ def amax(x, axis=None, keepdim=False, name=None):
     """
     reduce_all, axis = _get_reduce_all_value(axis)
-    if paddle.in_dynamic_mode():
+    if in_dygraph_mode():
+        if reduce_all:
+            axis = range(len(x.shape))
+        return _C_ops.final_state_amax(x, axis, keepdim)
+    if _in_legacy_dygraph():
         return _C_ops.reduce_amax(x, 'dim', axis, 'keep_dim', keepdim,
                                   'reduce_all', reduce_all)
 
     helper = LayerHelper('amax', **locals())
@@ -2446,9 +2450,12 @@ def amin(x, axis=None, keepdim=False, name=None):
     """
     reduce_all, axis = _get_reduce_all_value(axis)
-    if paddle.in_dynamic_mode():
+    if in_dygraph_mode():
+        if reduce_all:
+            axis = range(len(x.shape))
+        return _C_ops.final_state_amin(x, axis, keepdim)
+    elif _in_legacy_dygraph():
         return _C_ops.reduce_amin(x, 'dim', axis, 'keep_dim', keepdim,
                                   'reduce_all', reduce_all)
-
     helper = LayerHelper('amin', **locals())
     check_variable_and_dtype(
         x, 'x', ['float32', 'float64', 'int32', 'int64'], 'amin')
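
One subtlety in the amax/amin hunks above: the legacy reduce_amax/reduce_amin ops take a reduce_all flag, while the final-state kernels expect an explicit axis list, so axis=None is translated to range(len(x.shape)). User-visible behavior is unchanged; a sketch with illustrative values (not taken from the patch):

    import paddle

    x = paddle.to_tensor([[0.1, 0.9], [0.7, 0.7]])

    # axis=None reduces over all elements; under the final-state API this
    # intent is expressed by listing every axis, as the branch above does.
    print(paddle.amax(x).numpy())          # 0.9, the global maximum
    print(paddle.amax(x, axis=0).numpy())  # [0.7 0.9], column-wise
    print(paddle.amin(x, axis=1).numpy())  # [0.1 0.7], row-wise
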
diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py
index 2073e241a3b183..1bf2df89855ae4 100644
--- a/python/paddle/tensor/stat.py
+++ b/python/paddle/tensor/stat.py
@@ -440,8 +440,7 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False):
     Compute the quantile of the input along the specified axis.
 
     Args:
-        Args:
-        x (Tensor): The input Tensor, it's data type can be float32, float64.
+        x (Tensor): The input Tensor, its data type can be float32, float64, int32, int64.
         q (int|float|list): The q for calculate quantile, which should be in range [0, 1]. If q is a list,
             each q will be calculated and the first dimension of output is same to the number of ``q`` .
         axis (int|list, optional): The axis along which to calculate quantile. ``axis`` should be int or list of int.
@@ -525,7 +524,7 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False):
         if ignore_nan:
             indices.append(q_num * (valid_counts - 1))
         else:
-            # TODO(Asthestarsfalll): Use paddle.index_fill instead of where
+            # TODO: Use paddle.index_fill instead of where
             index = q_num * (valid_counts - 1)
             last_index = x.shape[axis] - 1
             nums = paddle.full_like(index, fill_value=last_index)
@@ -569,7 +568,7 @@ def quantile(x, q, axis=None, keepdim=False):
     If any values in a reduced row are NaN, then the quantiles for that reduction will be NaN.
 
     Args:
-        x (Tensor): The input Tensor, it's data type can be float32, float64.
+        x (Tensor): The input Tensor, its data type can be float32, float64, int32, int64.
         q (int|float|list): The q for calculate quantile, which should be in range [0, 1]. If q is a list,
             each q will be calculated and the first dimension of output is same to the number of ``q`` .
         axis (int|list, optional): The axis along which to calculate quantile. ``axis`` should be int or list of int.
@@ -629,7 +628,7 @@ def nanquantile(x, q, axis=None, keepdim=False):
     If all values in a reduced row are NaN, then the quantiles for that reduction will be NaN.
 
     Args:
-        x (Tensor): The input Tensor, it's data type can be float32, float64.
+        x (Tensor): The input Tensor, its data type can be float32, float64, int32, int64.
         q (int|float|list): The q for calculate quantile, which should be in range [0, 1]. If q is a list,
             each q will be calculated and the first dimension of output is same to the number of ``q`` .
         axis (int|list, optional): The axis along which to calculate quantile. ``axis`` should be int or list of int.
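
The stat.py hunks widen the documented dtypes of quantile and nanquantile to include int32 and int64; the enabling kernel change is assumed to live elsewhere in this PR. A usage sketch under that assumption:

    import paddle

    x = paddle.to_tensor([[1., 2., 3., 4.], [5., 6., 7., 8.]])
    print(paddle.quantile(x, q=0.5, axis=1).numpy())   # [2.5 6.5]

    # Per the updated docstrings, integer tensors are now accepted too.
    xi = paddle.to_tensor([[1, 2, 3, 4], [5, 6, 7, 8]], dtype='int64')
    print(paddle.quantile(xi, q=0.5, axis=1).numpy())  # [2.5 6.5]
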