diff --git a/AUTHORS.md b/AUTHORS.md index 6a5156183a517..b0a31b30295c6 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -83,6 +83,7 @@ This is an incomplete list of authors of [Paddle](https://github.com/PaddlePaddl | xushaoyong | Shao-Yong Xu | | Yancey1989 | Xu Yan | | zhaopu7 | Pu Zhao | +| zhiqiu | Qiu-Liang Chen | | zhouxiao-coder | Xiao Zhou | | Zrachel | Rui-Qing Zhang | | jeng1220 | Bai-Cheng(Ryan) Jeng (NVIDIA) | diff --git a/CMakeLists.txt b/CMakeLists.txt index 20f6413dfa7d2..52cef45b3a9d1 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,7 @@ find_package(CUDA QUIET) find_package(MKL CONFIG QUIET) option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) +option(WITH_MPI "Compile PaddlePaddle with MPI" OFF) option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF) @@ -485,9 +486,6 @@ if(WITH_DISTRIBUTE) ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE) endif() - set(WITH_MPI - ON - CACHE STRING "Enable MPI when compiling WITH_DISTRIBUTE=ON." FORCE) if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC) # disable WITH_PSCORE for NPU before include third_party message( diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 0201d1131eb4a..ef76aa39604c6 100755 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -42,7 +42,9 @@ set(DISTRIBUTE_COMPILE_FLAGS if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() - +if(LINUX) + add_subdirectory(rpc) +endif() add_subdirectory(common) add_subdirectory(ps) add_subdirectory(test) diff --git a/paddle/fluid/distributed/rpc/CMakeLists.txt b/paddle/fluid/distributed/rpc/CMakeLists.txt new file mode 100644 index 0000000000000..655a28d4f7616 --- /dev/null +++ b/paddle/fluid/distributed/rpc/CMakeLists.txt @@ -0,0 +1,13 @@ +set(PADDLE_RPC_SRCS python_rpc_handler.cc rpc_agent.cc) + +set_source_files_properties( + python_rpc_handler.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(rpc_agent.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) + +set(PADDLE_RPC_DEPS brpc protobuf glog pybind) +proto_library(paddle_rpc_proto SRCS rpc.proto) +cc_library( + paddle_rpc + SRCS ${PADDLE_RPC_SRCS} + DEPS ${PADDLE_RPC_DEPS} paddle_rpc_proto) diff --git a/paddle/fluid/distributed/rpc/future_wrapper.h b/paddle/fluid/distributed/rpc/future_wrapper.h new file mode 100644 index 0000000000000..6592442f46e75 --- /dev/null +++ b/paddle/fluid/distributed/rpc/future_wrapper.h @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include <future>
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/distributed/rpc/python_rpc_handler.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace py = pybind11;
+namespace paddle {
+namespace distributed {
+class FutureWrapper {
+ public:
+  FutureWrapper() {}
+  explicit FutureWrapper(std::future<std::string> fut) : fut_(std::move(fut)) {}
+  py::object wait() {
+    // The GIL must be released here; otherwise the blocking fut_.get() would
+    // prevent the service from processing RPC requests, leading to deadlock.
+    PADDLE_ENFORCE_EQ(
+        PyGILState_Check(),
+        false,
+        platform::errors::Fatal(
+            "GIL must be released before fut.wait(), otherwise fut_.get() "
+            "blocking will cause the service to fail to "
+            "process RPC requests, leading to deadlock"));
+    auto s = fut_.get();
+    py::gil_scoped_acquire ag;
+    std::shared_ptr<PythonRpcHandler> python_handler =
+        PythonRpcHandler::GetInstance();
+    py::object obj = python_handler->Deserialize(py::bytes(s));
+    return obj;
+  }
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(FutureWrapper);
+  std::future<std::string> fut_;
+};
+}  // namespace distributed
+}  // namespace paddle
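Reviewer note on the GIL discipline `FutureWrapper::wait()` enforces: the thread that fulfills `fut_` is a service thread that itself needs the GIL (to run the Python callable and serialize the result), so a caller that blocks on the future while holding the GIL deadlocks the service. A minimal caller-side sketch of the complementary pattern, using only standard pybind11 facilities (the helper name is hypothetical):

```cpp
#include <pybind11/pybind11.h>

#include <future>
#include <string>

namespace py = pybind11;

// Hypothetical helper: never hold the GIL across fut.get(); the thread that
// must set the future may be waiting on the GIL we would be holding.
std::string BlockingWait(std::future<std::string> fut) {
  py::gil_scoped_release release;  // drop the GIL before blocking
  return fut.get();                // GIL is re-acquired when `release` dies
}
```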
diff --git a/paddle/fluid/distributed/rpc/python_rpc_handler.cc b/paddle/fluid/distributed/rpc/python_rpc_handler.cc
new file mode 100644
index 0000000000000..13322114def64
--- /dev/null
+++ b/paddle/fluid/distributed/rpc/python_rpc_handler.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/rpc/python_rpc_handler.h"
+
+namespace paddle {
+namespace distributed {
+constexpr auto kInternalModule = "paddle.distributed.rpc.internal";
+
+py::object getFunction(const py::object& module, const char* name) {
+  py::object fn = module.attr(name);
+  return fn;
+}
+
+PythonRpcHandler::PythonRpcHandler() {
+  py::gil_scoped_acquire ag;
+  // import python module
+  py::object rpc_internal = py::module::import(kInternalModule);
+  py_run_function_ = getFunction(rpc_internal, "_run_py_func");
+  py_serialize_ = getFunction(rpc_internal, "_serialize");
+  py_deserialize_ = getFunction(rpc_internal, "_deserialize");
+}
+
+py::object PythonRpcHandler::RunPythonFunc(const py::object& python_func) {
+  py::gil_scoped_acquire ag;
+  return py_run_function_(python_func);
+}
+
+std::string PythonRpcHandler::Serialize(const py::object& obj) {
+  py::gil_scoped_acquire ag;
+  py::object res = py_serialize_(obj);
+  return res.cast<std::string>();
+}
+
+py::object PythonRpcHandler::Deserialize(const std::string& obj) {
+  py::gil_scoped_acquire ag;
+  return py_deserialize_(py::bytes(obj));
+}
+
+std::shared_ptr<PythonRpcHandler> PythonRpcHandler::python_rpc_handler_ =
+    nullptr;
+std::mutex PythonRpcHandler::lock_;
+
+std::shared_ptr<PythonRpcHandler> PythonRpcHandler::GetInstance() {
+  if (python_rpc_handler_ == nullptr) {
+    std::lock_guard<std::mutex> guard(lock_);
+    if (python_rpc_handler_ == nullptr) {
+      python_rpc_handler_ = std::make_shared<PythonRpcHandler>();
+      return python_rpc_handler_;
+    }
+  }
+  return python_rpc_handler_;
+}
+
+}  // namespace distributed
+}  // namespace paddle
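`GetInstance()` above is a hand-rolled double-checked-locking singleton. For readers who want the same lazy, thread-safe initialization without reasoning about the unsynchronized first read, a sketch of an equivalent formulation with `std::call_once` (illustrative only, not the code above):

```cpp
#include <memory>
#include <mutex>

struct Handler {};  // stand-in for PythonRpcHandler

std::shared_ptr<Handler> GetInstance() {
  static std::once_flag flag;
  static std::shared_ptr<Handler> instance;
  // call_once guarantees exactly one initialization, even under contention.
  std::call_once(flag, [] { instance = std::make_shared<Handler>(); });
  return instance;
}
```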
diff --git a/paddle/fluid/distributed/rpc/python_rpc_handler.h b/paddle/fluid/distributed/rpc/python_rpc_handler.h
new file mode 100644
index 0000000000000..2c5221f53d57b
--- /dev/null
+++ b/paddle/fluid/distributed/rpc/python_rpc_handler.h
@@ -0,0 +1,62 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include <memory>
+#include <mutex>
+#include <string>
+
+#include "paddle/fluid/platform/macros.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace distributed {
+
+class PYBIND11_EXPORT PythonRpcHandler {
+ public:
+  PythonRpcHandler();
+  ~PythonRpcHandler() = default;
+  static std::shared_ptr<PythonRpcHandler> GetInstance();
+  // Run a pickled Python function and return the resulting py::object.
+  py::object RunPythonFunc(const py::object& python_func);
+
+  // Serialize a py::object into a string.
+  std::string Serialize(const py::object& obj);
+
+  // Deserialize a string into a py::object.
+  py::object Deserialize(const std::string& obj);
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(PythonRpcHandler);
+
+  static std::shared_ptr<PythonRpcHandler> python_rpc_handler_;
+  // Ref to `paddle.distributed.rpc.internal._run_py_func`.
+  py::object py_run_function_;
+
+  // Ref to `paddle.distributed.rpc.internal._serialize`.
+  py::object py_serialize_;
+
+  // Ref to `paddle.distributed.rpc.internal._deserialize`.
+  py::object py_deserialize_;
+
+  // Lock to protect initialization.
+  static std::mutex lock_;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/rpc/rpc.proto b/paddle/fluid/distributed/rpc/rpc.proto
new file mode 100644
index 0000000000000..2da9e37ae88d9
--- /dev/null
+++ b/paddle/fluid/distributed/rpc/rpc.proto
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+syntax="proto2";
+package paddle.distributed;
+
+option cc_generic_services = true;
+option cc_enable_arenas = true;
+
+message RpcRequest {
+  required bytes message = 1;
+};
+
+message RpcResponse {
+  required bytes message = 1;
+};
+
+service RpcBaseService {
+  rpc Send(RpcRequest) returns (RpcResponse);
+  rpc InvokeRpc(RpcRequest) returns (RpcResponse);
+};
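Since the proto sets `cc_generic_services = true`, protoc generates an abstract `RpcBaseService` (implemented by `RpcService` below) plus an `RpcBaseService_Stub` for callers. A sketch of a single blocking call through that stub, following the standard brpc idiom (endpoint and payload are placeholders; the real client path is the asynchronous `RpcAgent::InvokeRpc` below):

```cpp
#include <brpc/channel.h>

#include <string>

#include "paddle/fluid/distributed/rpc/rpc.pb.h"

// Passing nullptr as `done` turns the generated stub call into a synchronous
// one: it returns only after the response (or an error) arrives.
int CallOnce(const std::string& pickled_func) {
  brpc::Channel channel;
  brpc::ChannelOptions options;
  options.protocol = "baidu_std";
  if (channel.Init("127.0.0.1:8002", &options) != 0) return -1;  // placeholder

  paddle::distributed::RpcBaseService_Stub stub(&channel);
  paddle::distributed::RpcRequest request;
  paddle::distributed::RpcResponse response;
  brpc::Controller cntl;
  request.set_message(pickled_func);
  stub.InvokeRpc(&cntl, &request, &response, /*done=*/nullptr);
  return cntl.Failed() ? -1 : 0;
}
```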
diff --git a/paddle/fluid/distributed/rpc/rpc_agent.cc b/paddle/fluid/distributed/rpc/rpc_agent.cc
new file mode 100644
index 0000000000000..18fa2aba841e5
--- /dev/null
+++ b/paddle/fluid/distributed/rpc/rpc_agent.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/rpc/rpc_agent.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace distributed {
+
+const int kTimeoutMs = 500000;
+const int kConnectTimeoutMs = 10000;
+const int kMaxRetry = 5;
+const int kCloseWaitMs = 1000;
+std::shared_ptr<RpcAgent> RpcAgent::rpc_agent_instance_ = nullptr;
+
+RpcAgent::RpcAgent(std::string name, std::vector<WorkerInfo> infos) {
+  name_ = std::move(name);
+  for (auto info : infos) {
+    name_to_infos_.insert({info.name_, info});
+    id_to_infos_.insert({info.id_, info});
+  }
+  this->infos_ = std::move(infos);
+  auto it = name_to_infos_.find(name_);
+  this->rank_ = it->second.id_;
+  rpc_service_ = std::make_shared<RpcService>();
+  PADDLE_ENFORCE_EQ(
+      server_.AddService(rpc_service_.get(), brpc::SERVER_DOESNT_OWN_SERVICE),
+      0,
+      platform::errors::Fatal("Fail to add service: %s", name));
+}
+
+int RpcAgent::StartWorker() {
+  auto info = GetWorkerInfo(name_);
+  // Start the server.
+  int port = info.port_;
+  brpc::ServerOptions options;
+  PADDLE_ENFORCE_EQ(server_.Start(port, &options),
+                    0,
+                    platform::errors::Fatal("Fail to start worker: %s", name_));
+  VLOG(0) << "Start worker : " << name_;
+  return 0;
+}
+
+int RpcAgent::StartClient() {
+  // Initialize the channel, NULL means using default options.
+  brpc::ChannelOptions channel_options;
+  channel_options.protocol = "baidu_std";
+  channel_options.timeout_ms = kTimeoutMs;
+  channel_options.connection_type = "pooled";
+  channel_options.connect_timeout_ms = kConnectTimeoutMs;
+  channel_options.max_retry = kMaxRetry;
+  channels_.resize(name_to_infos_.size());
+  // build a connection from this client to every server
+  for (std::size_t i = 0; i < channels_.size(); i++) {
+    auto info = id_to_infos_.find(i)->second;
+    channels_[i].reset(new brpc::Channel());
+    PADDLE_ENFORCE_EQ(
+        channels_[i]->Init(info.ip_.c_str(), info.port_, &channel_options),
+        0,
+        platform::errors::Fatal(
+            "Fail to initialize channel: %d, ip: %s, port: %d",
+            i,
+            info.ip_,
+            info.port_));
+  }
+  VLOG(0) << "Init Channels: " << name_;
+  return 0;
+}
+
+int RpcAgent::Stop() {
+  VLOG(0) << "Worker: " << name_ << " is going to stop.";
+  server_.Stop(kCloseWaitMs);
+  server_.Join();
+  rpc_agent_instance_ = nullptr;
+  VLOG(0) << "Worker: " << name_ << " has stopped";
+  return 0;
+}
+void OnRpcDone::Run() {
+  // delete this after Run
+  std::unique_ptr<OnRpcDone> self_guard(this);
+  PADDLE_ENFORCE_EQ(
+      cntl_.Failed(), false, platform::errors::Fatal(cntl_.ErrorText()));
+  promise_->set_value(response_.message());
+  VLOG(2) << "Received response from " << cntl_.remote_side() << " to "
+          << cntl_.local_side() << " (attached=" << cntl_.response_attachment()
+          << ")"
+          << " latency=" << cntl_.latency_us() << "us";
+}
+
+std::future<std::string> RpcAgent::InvokeRpc(const std::string &py_func,
+                                             const std::string &to,
+                                             int timeout_ms = kTimeoutMs) {
+  auto it = name_to_infos_.find(to);
+  PADDLE_ENFORCE_NE(
+      it,
+      name_to_infos_.end(),
+      platform::errors::OutOfRange("Worker %s doesn't exist!", to));
+  uint32_t id = it->second.id_;
+  auto channel = channels_[id];
+  // `done` must be allocated on the heap because its lifetime extends beyond
+  // this call; it deletes itself inside done->Run().
+  OnRpcDone *done = new OnRpcDone;
+  done->cntl_.set_timeout_ms(timeout_ms);
+  done->request_.set_message(py_func);
+  std::future<std::string> fut = done->GetFuture();
+  RpcBaseService_Stub stub(channel.get());
+  stub.InvokeRpc(&done->cntl_, &done->request_, &done->response_, done);
+  return fut;
+}
+
+std::shared_ptr<RpcAgent> RpcAgent::RpcAgentInstance() {
+  PADDLE_ENFORCE_NE(rpc_agent_instance_,
+                    nullptr,
+                    platform::errors::Fatal(
+                        "RpcAgent is not set, please call "
+                        "paddle.distributed.rpc.init_rpc() to initialize the "
+                        "rpc agent."));
+  return rpc_agent_instance_;
+}
+void RpcAgent::SetAgentInstance(std::shared_ptr<RpcAgent> agent) {
+  PADDLE_ENFORCE_EQ(
+      rpc_agent_instance_,
+      nullptr,
+      platform::errors::Fatal(
+          "RpcAgent has been set, please don't set the rpc agent "
+          "repeatedly."));
+  rpc_agent_instance_ = agent;
+}
+}  // namespace distributed
+}  // namespace paddle
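End-to-end, the agent pairs one brpc server with a channel to every peer. A hedged usage sketch of the lifecycle (worker names, ports, and the pickled payload are hypothetical; in Paddle the payload comes from `PythonRpcHandler::Serialize`, and the GIL must be released before `fut.get()`):

```cpp
#include <future>
#include <memory>
#include <string>
#include <vector>

#include "paddle/fluid/distributed/rpc/rpc_agent.h"

using paddle::distributed::RpcAgent;
using paddle::distributed::WorkerInfo;

int RunWorker0(const std::string& pickled_payload) {
  std::vector<WorkerInfo> infos = {WorkerInfo("worker0", 0, "127.0.0.1", 8002),
                                   WorkerInfo("worker1", 1, "127.0.0.1", 8003)};
  auto agent = std::make_shared<RpcAgent>("worker0", infos);
  RpcAgent::SetAgentInstance(agent);
  agent->StartWorker();  // serve incoming InvokeRpc requests on port 8002
  agent->StartClient();  // open one channel per worker, self included
  std::future<std::string> fut =
      agent->InvokeRpc(pickled_payload, "worker1", /*timeout_ms=*/5000);
  std::string reply = fut.get();  // pickled result produced on worker1
  return reply.empty() ? -1 : agent->Stop();
}
```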
diff --git a/paddle/fluid/distributed/rpc/rpc_agent.h b/paddle/fluid/distributed/rpc/rpc_agent.h
new file mode 100644
index 0000000000000..e6c5a7d099c1b
--- /dev/null
+++ b/paddle/fluid/distributed/rpc/rpc_agent.h
@@ -0,0 +1,111 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <future>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "brpc/channel.h"
+#include "brpc/server.h"
+#include "paddle/fluid/distributed/rpc/python_rpc_handler.h"
+#include "paddle/fluid/distributed/rpc/rpc.pb.h"
+#include "paddle/fluid/distributed/rpc/rpc_service.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace distributed {
+struct WorkerInfo {
+  std::string name_;
+  uint32_t id_;
+  std::string ip_;
+  uint32_t port_;
+  WorkerInfo(std::string name, uint32_t id, std::string ip, uint32_t port)
+      : name_(std::move(name)), id_(id), ip_(std::move(ip)), port_(port) {}
+
+  std::string to_string() const {
+    std::string info = "{name: " + name_ + ", rank: " + std::to_string(id_) +
+                       ", ip: " + ip_ + ", port: " + std::to_string(port_) +
+                       "}";
+    return info;
+  }
+};
+
+class OnRpcDone : public google::protobuf::Closure {
+ public:
+  OnRpcDone() { promise_ = std::make_shared<std::promise<std::string>>(); }
+  // process callback of response
+  void Run();
+  std::future<std::string> GetFuture() {
+    return std::future<std::string>(promise_->get_future());
+  }
+  RpcResponse response_;
+  RpcRequest request_;
+  brpc::Controller cntl_;
+  std::shared_ptr<std::promise<std::string>> promise_;
+};
+
+class RpcAgent {
+ public:
+  static std::shared_ptr<RpcAgent> RpcAgentInstance();
+  static void SetAgentInstance(std::shared_ptr<RpcAgent> agent);
+  // init RpcAgent instance and get information of all services
+  RpcAgent(std::string name, std::vector<WorkerInfo> infos);
+  ~RpcAgent() {}
+
+  const WorkerInfo &GetWorkerInfo(const std::string &name) const {
+    auto it = name_to_infos_.find(name);
+    return it->second;
+  }
+  const WorkerInfo &GetWorkerInfoById(uint32_t id) const {
+    auto it = id_to_infos_.find(id);
+    return it->second;
+  }
+  const WorkerInfo &GetCurrentWorkerInfo() const {
+    return GetWorkerInfo(name_);
+  }
+  const std::vector<WorkerInfo> &GetAllWorkerInfos() const {
+    return this->infos_;
+  }
+
+  uint32_t Rank() { return this->rank_; }
+
+  uint32_t WorldSize() { return infos_.size(); }
+
+  int StartWorker();
+  // build connection from client to all servers
+  int StartClient();
+  int Stop();
+
+  std::future<std::string> InvokeRpc(const std::string &msg,
+                                     const std::string &to,
+                                     int timeout_ms);
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(RpcAgent);
+  static std::shared_ptr<RpcAgent> rpc_agent_instance_;
+  brpc::Server server_;
+  std::shared_ptr<RpcService> rpc_service_;
+  std::vector<std::shared_ptr<brpc::Channel>> channels_;
+  std::string name_;
+  uint32_t rank_;
+  std::unordered_map<std::string, WorkerInfo> name_to_infos_;
+  std::unordered_map<uint32_t, WorkerInfo> id_to_infos_;
+  std::vector<WorkerInfo> infos_;
+};
+}  // namespace distributed
+}  // namespace paddle
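`OnRpcDone` combines two idioms worth calling out: a protobuf closure that deletes itself after its single `Run()` invocation (brpc calls `Run()` exactly once per RPC), and a `promise`/`future` pair that hands the response to whichever thread blocks on `InvokeRpc`'s return value. The pattern reduced to its core (a sketch, not the class above):

```cpp
#include <google/protobuf/stubs/callback.h>

#include <future>
#include <memory>
#include <string>

class SelfDeletingDone : public google::protobuf::Closure {
 public:
  std::future<std::string> GetFuture() { return promise_.get_future(); }
  void Run() override {
    std::unique_ptr<SelfDeletingDone> self(this);  // reclaim on the way out
    promise_.set_value("response bytes");          // wake the waiting future
  }  // `self` destroys this object here

 private:
  std::promise<std::string> promise_;
};
```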
diff --git a/paddle/fluid/distributed/rpc/rpc_service.h b/paddle/fluid/distributed/rpc/rpc_service.h
new file mode 100644
index 0000000000000..74d4ab0fe0d58
--- /dev/null
+++ b/paddle/fluid/distributed/rpc/rpc_service.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <brpc/server.h>
+
+#include <string>
+
+#include "paddle/fluid/distributed/rpc/python_rpc_handler.h"
+#include "paddle/fluid/distributed/rpc/rpc.pb.h"
+
+namespace paddle {
+namespace distributed {
+class RpcService : public RpcBaseService {
+ public:
+  RpcService() {}
+  virtual ~RpcService() {}
+
+  virtual void InvokeRpc(google::protobuf::RpcController *cntl_base,
+                         const RpcRequest *request,
+                         RpcResponse *response,
+                         google::protobuf::Closure *done) {
+    // This object helps you to call done->Run() in RAII style. If you need
+    // to process the request asynchronously, pass done_guard.release().
+    brpc::ClosureGuard done_guard(done);
+
+    brpc::Controller *cntl = static_cast<brpc::Controller *>(cntl_base);
+    VLOG(2) << "InvokeRpc API: Received request[log_id=" << cntl->log_id()
+            << "] from " << cntl->remote_side() << " to " << cntl->local_side()
+            << ": "
+            << " (attached=" << cntl->request_attachment() << ")";
+    std::string py_func_str = request->message();
+    std::shared_ptr<PythonRpcHandler> python_handler =
+        PythonRpcHandler::GetInstance();
+    // acquire gil, because native Python objects are used
+    py::gil_scoped_acquire ag;
+    py::object py_func_obj = python_handler->Deserialize(py_func_str);
+    py::object res = python_handler->RunPythonFunc(py_func_obj);
+    std::string res_str = python_handler->Serialize(res);
+    response->set_message(res_str);
+  }
+};
+}  // namespace distributed
+}  // namespace paddle
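For orientation, hosting `RpcService` is plain brpc boilerplate; this sketch is essentially what the `RpcAgent` constructor plus `StartWorker()` do, minus the worker bookkeeping (the port is a placeholder):

```cpp
#include <brpc/server.h>

#include "paddle/fluid/distributed/rpc/rpc_service.h"

int ServeForever() {
  paddle::distributed::RpcService service;
  brpc::Server server;
  // SERVER_DOESNT_OWN_SERVICE: `service` lives on this stack frame, so the
  // server must not delete it on shutdown.
  if (server.AddService(&service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) {
    return -1;
  }
  brpc::ServerOptions options;
  if (server.Start(8002, &options) != 0) return -1;  // placeholder port
  server.RunUntilAskedToQuit();  // block until asked to quit
  return 0;
}
```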
diff --git a/paddle/fluid/framework/details/nan_inf_utils.h b/paddle/fluid/framework/details/nan_inf_utils.h
index f955d3df44ddd..ef2a7d8f0f1e0 100644
--- a/paddle/fluid/framework/details/nan_inf_utils.h
+++ b/paddle/fluid/framework/details/nan_inf_utils.h
@@ -27,7 +27,7 @@ namespace framework {
 namespace details {
 // assert false when meets NAN or inf
 void CheckVarHasNanOrInf(const std::string& op_type,
-                         const framework::ScopeBase& scope,
+                         const framework::Scope& scope,
                          const std::string& var_name,
                          const platform::Place& place);
@@ -37,7 +37,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
                          const platform::Place& place);
 
 void CheckOpHasNanOrInf(const framework::OperatorBase& op,
-                        const framework::ScopeBase& scope,
+                        const framework::Scope& scope,
                         const platform::Place& place);
 
 template <typename Tensor>
@@ -56,7 +56,7 @@ void CheckOpHasNanOrInfInDygraph(const std::string& op_type,
 
 #ifdef PADDLE_WITH_ASCEND_CL
 void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
-                                 const framework::ScopeBase& scope,
+                                 const framework::Scope& scope,
                                  const platform::Place& place);
 #endif
 
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index deb138f7847d7..bca61ddae69e7 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -450,7 +450,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
 }
 
 void CheckVarHasNanOrInf(const std::string& op_type,
-                         const framework::ScopeBase& scope,
+                         const framework::Scope& scope,
                          const std::string& var_name,
                          const platform::Place& place) {
   auto* var = scope.FindVar(var_name);
@@ -486,7 +486,7 @@ static phi::DenseTensor& npu_float_status() {
 }
 
 void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
-                                 const framework::ScopeBase& scope,
+                                 const framework::Scope& scope,
                                  const platform::Place& place) {
   if (!platform::is_npu_place(place)) return;
@@ -555,7 +555,7 @@ void PrintNpuVarInfo(const std::string& op_type,
 }
 
 void PrintNPUOpValueInfo(const framework::OperatorBase& op,
-                         const framework::ScopeBase& scope,
+                         const framework::Scope& scope,
                          const platform::Place& place) {
   LOG(WARNING) << "There are `nan` or `inf` in operator (" << op.Type()
                << "), here we print some tensor value info of this op.";
@@ -573,7 +573,7 @@ void PrintNPUOpValueInfo(const framework::OperatorBase& op,
 }
 
 static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
-                                  const framework::ScopeBase& scope,
+                                  const framework::Scope& scope,
                                   const platform::Place& place) {
   if (!platform::is_npu_place(place)) return;
@@ -609,7 +609,7 @@ static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
 #endif
 
 void CheckOpHasNanOrInf(const framework::OperatorBase& op,
-                        const framework::ScopeBase& exec_scope,
+                        const framework::Scope& exec_scope,
                         const platform::Place& place) {
   std::call_once(white_list_init_flag, InitWhiteListFormEnv);
 
diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index 3fd7a994a62fb..25f6ff8355d73 100755
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -123,6 +123,7 @@ message BuildStrategy {
   optional bool allow_cuda_graph_capture = 14 [ default = false ];
   optional int32 reduce_strategy = 15 [ default = 0 ];
   optional bool fuse_gemm_epilogue = 16 [ default = false ];
+  optional string debug_graphviz_path = 17;
 }
 
 message ExecutionStrategy {
diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc
index de9f6a4745fd0..59355c942047e 100644
--- a/paddle/fluid/framework/infershape_utils.cc
+++ b/paddle/fluid/framework/infershape_utils.cc
@@ -490,9 +490,18 @@ std::vector<phi::MetaTensor*> CompatInferMetaContext::MutableOutputBetween(
     size_t start, size_t end) {
   std::vector<phi::MetaTensor*> result;
   result.reserve(end - start);
+  bool has_meta_tensor = false;
+
   for (size_t i = start; i < end; ++i) {
     auto& out = compat_outputs_.at(i);
     result.emplace_back(out.initialized() ? &out : nullptr);
+    if (!has_meta_tensor && out.initialized()) {
+      has_meta_tensor = true;
+    }
+  }
+
+  if (!has_meta_tensor) {
+    result.clear();
   }
   return result;
 }
diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc
old mode 100644
new mode 100755
index df19bc9ade8d5..c416ebf200df6
--- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc
@@ -19,7 +19,6 @@
 #include <cfloat>
 
 #include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
@@ -68,7 +67,7 @@ std::vector<float> ComputePropagateScalesMkldnnPass::GetScales(
   for (int i = 0; i < columns; i++) {
     float max_value = FLT_MIN;
     for (int j = 0; j < rows; j++) {
-      max_value = std::max(max_value, std::abs(data[i + j * columns]));
+      max_value = std::max(max_value, std::abs(data[j + i * rows]));
     }
     max_value = 1.0 / max_value;
     if (std::isinf(max_value) || std::isnan(max_value)) {
@@ -394,8 +393,13 @@ std::unordered_set<std::string> ComputePropagateScalesMkldnnPass::UpdateScales(
     auto out_iter = var_quant_scales->find(op_node->Op()->Output("Out")[0]);
     if (out_iter != var_quant_scales->end()) {
       std::vector<std::string> input_names = op_node->Op()->Input("X");
-      for (auto input_name : input_names)
-        (*var_quant_scales)[input_name] = out_iter->second;
+      for (auto input_name : input_names) {
+        auto concat_in_iter = var_quant_scales->find(input_name);
+        if (concat_in_iter == var_quant_scales->end())
+          (*var_quant_scales)[input_name] = out_iter->second;
+        else
+          (*var_quant_scales)[input_name].second = out_iter->second.second;
+      }
     }
   } else if (op_name == "scale") {
     const std::string output_name = op_node->Op()->Output("Out")[0];
@@ -409,6 +413,40 @@ std::unordered_set<std::string> ComputePropagateScalesMkldnnPass::UpdateScales(
   }
   return waiting_for_scale;
 }
+void ComputePropagateScalesMkldnnPass::UpdateReluOutputScales(
+    ir::Graph* graph, StringPairMap* var_quant_scales) const {
+  for (auto* op_node :
+       ir::TopologyVarientSort(*graph, static_cast<ir::SortKind>(0))) {
+    if (!op_node->IsOp()) continue;
+    auto op = op_node->Op();
+    bool is_unsigned = false;
+    std::string output_name = "Out";
+    std::string act_name;
+    if (op->Type() == "relu") {
+      is_unsigned = true;
+    } else {
+      if (op->Type() == "conv2d") {
+        act_name = "fuse_activation";
+        output_name = "Output";
+      } else if (op->Type() == "fc") {
+        act_name = "activation_type";
+      }
+      if (!act_name.empty()) {
+        auto act = op->GetAttrIfExists<std::string>(act_name);
+        if (act == "relu" || act == "relu6") {
+          is_unsigned = true;
+        }
+      }
+    }
+    if (is_unsigned) {
+      std::string output_var_name = op->Output(output_name)[0];
+      auto out_iter = var_quant_scales->find(output_var_name);
+      if (out_iter != var_quant_scales->end()) {
+        (*var_quant_scales)[output_var_name].first = true;
+      }
+    }
+  }
+}
 
 void ComputePropagateScalesMkldnnPass::PropagateScales(
     ir::Graph* graph,
@@ -427,21 +465,6 @@ void ComputePropagateScalesMkldnnPass::PropagateScales(
   }
 }
 
-void ComputePropagateScalesMkldnnPass::ConvertStringPairMap(
-    const StringPairMap& var_quant_scales,
-    std::unordered_map<std::string, std::vector<float>>* info_map) const {
-  for (auto iter = var_quant_scales.begin(); iter != var_quant_scales.end();
-       iter++) {
-    auto* data = iter->second.second.data<float>();
-    std::vector<float> data_v;
-    for (int i = 0; i < iter->second.second.numel(); i++) {
-      data_v.push_back(data[i]);
-    }
-
-    info_map->insert(std::make_pair(iter->first, data_v));
-  }
-}
-
 void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Convert paddle model to mkldnn quantized model.";
   const std::string pattern_name = "compute_propagate_scales_mkldnn_pass";
@@ -461,13 +484,13 @@ void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const {
   auto* scope = param_scope();
   GetQuantInfo(graph, &var_quant_scales);
   ComputeWeightScales(graph, scope, &var_quant_scales);
+  UpdateReluOutputScales(graph, &var_quant_scales);
   PropagateScales(graph, &var_quant_scales, scale_immutable_ops);
 
   // save var_quant_scales in the first op's attr
   // for cpu_quantize_pass
-  std::unordered_map<std::string, std::vector<float>> info_map;
-  ConvertStringPairMap(var_quant_scales, &info_map);
-  SaveInfoInTheFirstOp(graph, "has_quant_info", "var_quant_scales", info_map);
+  SaveInfoInTheFirstOp(
+      graph, "has_quant_info", "var_quant_scales", var_quant_scales);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h
index ecc3ad16a54e6..bae810746ae2d 100644
--- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h
@@ -17,14 +17,12 @@
 #include <string>
 
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
-using StringPairMap =
-    std::unordered_map<std::string, std::pair<bool, phi::DenseTensor>>;
-
 class ComputePropagateScalesMkldnnPass : public FusePassBase {
  public:
   ComputePropagateScalesMkldnnPass() = default;
@@ -78,6 +76,9 @@ class ComputePropagateScalesMkldnnPass : public FusePassBase {
                            Scope* scope,
                            StringPairMap* var_quant_scales) const;
 
+  void UpdateReluOutputScales(ir::Graph* graph,
+                              StringPairMap* var_quant_scales) const;
+
   void UpdateScaleOpInScale(Node* op_node,
                             const std::string& input_name,
                             const std::string& output_name,
@@ -92,10 +93,6 @@ class ComputePropagateScalesMkldnnPass : public FusePassBase {
       ir::Graph* graph,
       StringPairMap* var_quant_scales,
       const std::unordered_set<std::string>& scale_immutable_ops) const;
-
-  void ConvertStringPairMap(
-      const StringPairMap& var_quant_scales,
-      std::unordered_map<std::string, std::vector<float>>* info_map) const;
 };
 }  // namespace ir
 }  // namespace framework
diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc
index 03c01507ca27d..39ecfd2c0e79a 100644
--- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
#include +#include #include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h" #include "paddle/fluid/framework/naive_executor.h" @@ -91,11 +92,16 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { graph, scope, wx_name, wh_name, var_quant_scales); } + void UpdateReluOutputScales(ir::Graph* graph, + StringPairMap* var_quant_scales) const { + pass->UpdateReluOutputScales(graph, var_quant_scales); + } + void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, const std::string& var_name) { auto x = scope->Var(var_name); - auto tensor = x->GetMutable(); + auto tensor = x->GetMutable(); auto tensor_size = 1; if (var_name == "filter") { tensor_size = positive_and_negative_values.size(); @@ -124,7 +130,6 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { } void ComputeRnnWeightScalesTest(const std::string& type, - const std::initializer_list& ops, const framework::ProgramDesc& prog, std::vector scales) { ir::Graph* graph(new ir::Graph(prog)); @@ -140,7 +145,7 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { StringPairMap var_quant_scales; auto* wx_var = scope.FindVar(wx_var_names); - auto* wx_tensor = wx_var->GetMutable(); + auto* wx_tensor = wx_var->GetMutable(); wx_tensor->Resize(phi::make_dim(wx.size(), wx[0].size())); for (size_t i = 0; i < wx.size(); i++) std::copy(begin(wx[i]), @@ -149,7 +154,7 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { i * wx[0].size()); auto* wh_var = scope.FindVar(wh_var_names); - auto* wh_tensor = wh_var->GetMutable(); + auto* wh_tensor = wh_var->GetMutable(); wh_tensor->Resize(phi::make_dim(wh.size(), wh[0].size())); for (size_t i = 0; i < wh.size(); i++) std::copy(begin(wh[i]), @@ -174,6 +179,24 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { } } + void UpdateReluOutputScaleTest( + const framework::ProgramDesc& prog, + StringPairMap* var_quant_scales, + const std::initializer_list& variable_names) { + ir::Graph* graph(new ir::Graph(prog)); + Scope scope; + + PrepareGraph(graph, prog, &scope, conv_variable_names); + + UpdateReluOutputScales(graph, var_quant_scales); + + for (auto& var_name : variable_names) { + auto iter = var_quant_scales->find(var_name); + ASSERT_NE(iter, var_quant_scales->end()); + ASSERT_EQ((*var_quant_scales)[var_name].first, true); + } + } + private: std::unique_ptr pass; }; @@ -182,11 +205,15 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, const std::vector& inputs, - const std::vector& outputs) { + const std::vector& outputs, + const std::unordered_map& attrs = {}) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); op->SetAttr("use_mkldnn", true); op->SetAttr("name", name); + if (!attrs.empty()) + for (auto& attr : attrs) op->SetAttr(attr.first, attr.second); + if (type == "conv2d") { op->SetInput("Input", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]}); @@ -211,6 +238,23 @@ ProgramDesc BuildConv2dProgramDesc() { return prog; } +ProgramDesc BuildConv2dReluProgramDesc() { + ProgramDesc prog; + for (auto& v : conv_variable_names) { + prog.MutableBlock(0)->Var(v); + } + std::unordered_map attrs = { + {"fuse_activation", "relu"}}; + SetOp(&prog, + "conv2d", + "Conv2d", + {"conv_in", "filter", "bias"}, + {"conv_out"}, + attrs); + + return prog; +} + ProgramDesc BuildFusionGruProgramDesc() { ProgramDesc prog; for (auto& v : rnn_variable_names) { @@ -262,7 +306,7 @@ TEST_F(ComputePropagateScalesMkldnnPassTest, 
compute_var_scales) { StringPairMap var_quant_scales; auto* var = scope.FindVar(weight_var_name); - auto* weight_tensor = var->GetMutable(); + auto* weight_tensor = var->GetMutable(); weight_tensor->Resize(phi::make_dim(1, values.size())); std::copy(begin(values), end(values), @@ -283,15 +327,24 @@ TEST_F(ComputePropagateScalesMkldnnPassTest, compute_var_scales) { } TEST_F(ComputePropagateScalesMkldnnPassTest, compute_gru_weight_scales) { - ComputeRnnWeightScalesTest("gru", - {"fusion_gru", "multi_gru"}, - BuildFusionGruProgramDesc(), - gru_scales); + ComputeRnnWeightScalesTest("gru", BuildFusionGruProgramDesc(), gru_scales); } TEST_F(ComputePropagateScalesMkldnnPassTest, compute_lstm_weight_scales) { - ComputeRnnWeightScalesTest( - "lstm", {"fusion_lstm"}, BuildFusionLstmProgramDesc(), lstm_scales); + ComputeRnnWeightScalesTest("lstm", BuildFusionLstmProgramDesc(), lstm_scales); +} + +TEST_F(ComputePropagateScalesMkldnnPassTest, update_relu_output_scales) { + StringPairMap var_quant_scales; + for (auto& var_name : conv_variable_names) { + phi::DenseTensor tensor; + auto* data = tensor.mutable_data({1}, platform::CPUPlace()); + data[0] = 10; + auto pair = std::make_pair(false, tensor); + var_quant_scales.insert(std::make_pair(var_name, pair)); + } + UpdateReluOutputScaleTest( + BuildConv2dReluProgramDesc(), &var_quant_scales, {"conv_out"}); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 5ec22e2e88a1e..3161eeeb4b499 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -229,6 +229,7 @@ void CPUQuantizePass::DequantizeOutput(Graph* g, std::vector({dequantize_in_node->Name()})); deq_desc.SetOutput("Output", std::vector({output->Name()})); deq_desc.SetAttr("Scale", scale); + deq_desc.SetAttr("is_negative_input", !is_unsigned); auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. 
// update op's output @@ -332,20 +333,8 @@ bool CPUQuantizePass::IsOpQuantized(const Node* node) const { } void CPUQuantizePass::GetQuantInfo(Graph* graph) const { - std::unordered_map> info_map{}; - GetInfoFromTheFirstOp(graph, "has_quant_info", "var_quant_scales", &info_map); - - for (auto iter = info_map.begin(); iter != info_map.end(); iter++) { - LoDTensor tensor; - const int size = static_cast(iter->second.size()); - auto* data = tensor.mutable_data({size}, platform::CPUPlace()); - for (int i = 0; i < size; i++) { - data[i] = static_cast(iter->second[i]); - } - - auto pair = std::make_pair(false, tensor); - var_quant_scales_->insert(std::make_pair(iter->first, pair)); - } + GetInfoFromTheFirstOp( + graph, "has_quant_info", "var_quant_scales", var_quant_scales_); } void CPUQuantizePass::QuantizeConv(Graph* graph, @@ -422,7 +411,16 @@ void CPUQuantizePass::QuantizeConv(Graph* graph, auto filter_scale_tensor = GetScaleTensorForNode(conv_filter); EigenVectorArrayMap eigen_tensor{filter_scale_tensor.data(), filter_scale_tensor.numel()}; - eigen_tensor *= static_cast(S8_MAX); + + // If the scale value of a weight is already multiplied by S8_MAX, it does + // not need to be multiplied again + if (std::find(change_weight_->begin(), + change_weight_->end(), + conv_filter->Name()) == change_weight_->end()) { + eigen_tensor *= static_cast(S8_MAX); + change_weight_->push_back(conv_filter->Name()); + } + std::vector filter_scale{ filter_scale_tensor.data(), filter_scale_tensor.data() + filter_scale_tensor.numel()}; @@ -593,6 +591,20 @@ void CPUQuantizePass::QuantizeConcat(Graph* graph) const { return; } + bool are_all_inputs_unsigned{true}; + // if all inputs were unsigned, then the output was set to unsigned + // during the scale calculation step + auto inputs = concat_op->inputs; + for (size_t i = 0; i < inputs.size(); i++) { + if (AreScalesPresentForVarNames({inputs[i]->Name()})) { + auto scale_data = GetScaleDataByName(inputs[i]->Name()); + if (scale_data.first == false) { + are_all_inputs_unsigned = false; + break; + } + } + } + GET_IR_NODE_FROM_SUBGRAPH(concat_out, concat_out, concat_pattern); if (!AreScalesPresentForNodes({concat_out})) { @@ -601,17 +613,12 @@ void CPUQuantizePass::QuantizeConcat(Graph* graph) const { return; } - // if all inputs were unsigned, then the output was set to unsigned - // during the scale calculation step - bool are_all_inputs_unsigned{false}; - auto output_scale = - GetScaleValueForNode(concat_out, &are_all_inputs_unsigned); + auto output_scale = GetScaleValueForNode(concat_out); QuantizeInputs(g, concat_op, "X", are_all_inputs_unsigned); DequantizeOutput( g, concat_op, concat_out, "Out", output_scale, are_all_inputs_unsigned); - ++quantize_concat_count; }; @@ -695,6 +702,13 @@ void CPUQuantizePass::QuantizeImmutable(Graph* graph, return; } + // skip if the dtype of immutable_in is not float32 + auto dtype = immutable_in->Var()->GetDataType(); + if (dtype != proto::VarType::FP32) { + MarkAndLogCannotQuantizeOp(immutable_op, "The input dtype is not float."); + return; + } + if (!AreScalesPresentForNodes({immutable_out})) { MarkAndLogCannotQuantizeOp(immutable_op, "No scale available for the operator"); @@ -1166,7 +1180,6 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeImmutable(graph, "reshape2", "X"); QuantizeImmutable(graph, "transpose2", "X"); QuantizeImmutable(graph, "slice", "Input"); - QuantizeImmutable(graph, "shape", "Input"); QuantizeImmutable(graph, "nearest_interp", "X"); QuantizeImmutable(graph, "nearest_interp_v2", "X"); 
QuantizeElementwise(graph, "elementwise_add"); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index f26d8bfc84c15..ded113dfdc12d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -110,6 +110,11 @@ class CPUQuantizePass : public FusePassBase { VarQuantScale string_pair_map = {}; VarQuantScale* const var_quant_scales_ = &string_pair_map; + // Save the scale values of which weights have been processed to avoid + // secondary processing + std::vector change_weight = {}; + std::vector* const change_weight_ = &change_weight; + void GetQuantInfo(Graph* graph) const; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc old mode 100644 new mode 100755 index 4dabdd6bed0bd..70623214503d8 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -66,7 +66,7 @@ void SetOp(ProgramDesc* prog, type == "nearest_interp" || type == "nearest_interp_v2") { op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); - } else if (type == "slice" || type == "shape") { + } else if (type == "slice") { op->SetInput("Input", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); } else if (type == "dropout") { @@ -467,7 +467,7 @@ static const std::initializer_list variable_names_immutable_ops = { void TestImmutableOp(const std::string tested_op) { ProgramDesc prog; for (auto& v : variable_names_immutable_ops) { - prog.MutableBlock(0)->Var(v); + prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32); } SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); SetOp(&prog, tested_op, tested_op, {"b"}, {"c"}, true, "int8"); @@ -520,7 +520,7 @@ void TestImmutableOpBetweenNonQuantizedOp(const std::string tested_op) { void TestImmutableOpWithManyOutputs(const std::string tested_op) { ProgramDesc prog; for (auto& v : variable_names_immutable_ops) { - prog.MutableBlock(0)->Var(v); + prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32); } SetOp(&prog, "dropout", "Dropout1", {"a"}, {"b"}, true, "float32"); @@ -556,12 +556,8 @@ void TestImmutableOpWithManyOutputs(const std::string tested_op) { SCALE * S8_MAX); } -const std::vector immutables = {"reshape2", - "transpose2", - "slice", - "shape", - "nearest_interp", - "nearest_interp_v2"}; +const std::vector immutables = { + "reshape2", "transpose2", "slice", "nearest_interp", "nearest_interp_v2"}; class TestImmutables : public testing::TestWithParam {}; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 933d60b0a2739..e0a64b2036bb7 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -105,51 +105,24 @@ void CPUQuantizeSquashPass::FindNodesToKeep( AddStatis(found_count); } -bool CPUQuantizeSquashPass::IsDequantizeInputUint8( - const Node* dequant_in) const { - PADDLE_ENFORCE_EQ( - dequant_in->inputs.size(), - 1, - platform::errors::InvalidArgument( - "Dequantize (id: %f) should have only one input.", dequant_in->id())); - if (dequant_in->inputs[0]->IsOp()) { - auto prev_op = dequant_in->inputs[0]->Op(); - std::string act_name; - if (prev_op->Type() == "relu") { - return true; - } else { - if (prev_op->Type() == "conv2d") { - act_name = "fuse_activation"; - } 
else if (prev_op->Type() == "fc") { - act_name = "activation_type"; - } - if (!act_name.empty()) { - auto act = prev_op->GetAttrIfExists(act_name); - if (act == "relu" || act == "relu6") { - return true; - } - } - } - } - return false; -} - bool CPUQuantizeSquashPass::IsDequantizeQuantizeIncompatible( - Node* quant_op, Node* dequant_in, Node* next_op) const { - bool is_concat_signed = + Node* quant_op, Node* dequant_op, Node* next_op) const { + bool is_next_op_signed = quant_op->Op()->GetAttrIfExists("is_negative_input"); - bool is_input_unsigned = IsDequantizeInputUint8(dequant_in); + bool is_input_signed = + dequant_op->Op()->GetAttrIfExists("is_negative_input"); + /* TODO(sfraczek): remove elementwise from this condition when BinaryMKLDNN kernel will support two different input data types */ bool is_next_op_concat_or_elementwise = next_op->Op()->Type() == "concat" || next_op->Op()->Type().find("elementwise") == 0; - if (is_next_op_concat_or_elementwise && is_concat_signed && - is_input_unsigned) { + if (is_next_op_concat_or_elementwise && + (is_next_op_signed ^ is_input_signed)) { VLOG(4) << "Do not squash dequant-quant, because " << "next_op is: " << next_op->Op()->Type() - << ", is_concat_signed: " << is_concat_signed - << ", is_input_unsigned: " << is_input_unsigned << "."; + << ", is_next_op_signed: " << is_next_op_signed + << ", is_input_signed: " << is_input_signed << "."; return true; } return false; @@ -174,7 +147,7 @@ void CPUQuantizeSquashPass::DequantQuantSquash( GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern); GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern); - if (IsDequantizeQuantizeIncompatible(quant_op, dequant_in, next_op)) { + if (IsDequantizeQuantizeIncompatible(quant_op, dequant_op, next_op)) { return; } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h index 5207cc519c698..3aed54609d451 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h @@ -43,11 +43,6 @@ class CPUQuantizeSquashPass : public FusePassBase { Graph* graph, std::unordered_map* nodes_keep_counter) const; - /* - * Check if input to dequantize is uint8 - */ - bool IsDequantizeInputUint8(const Node* dequant_in) const; - /* * Don't squash unsigned dequantize with signed quantize. * This is important for concat and elementwise ops. @@ -55,7 +50,7 @@ class CPUQuantizeSquashPass : public FusePassBase { * elementwise assumes first input type. 
*/ bool IsDequantizeQuantizeIncompatible(Node* quant_op, - Node* dequant_in, + Node* dequant_op, Node* next_op) const; /* diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc index 655cc95bf28a0..cd71ff153d601 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -68,15 +68,11 @@ void SetOp(ProgramDesc* prog, op->SetAttr("padding_algorithm", std::string("EXPLICIT")); op->SetAttr("data_format", std::string("NCHW")); op->SetAttr("force_fp32_output", false); - } else if (type == "quantize") { + } else if (type == "quantize" || type == "dequantize") { op->SetInput("Input", {inputs[0]}); op->SetOutput("Output", {outputs[0]}); op->SetAttr("Scale", scale[0]); op->SetAttr("is_negative_input", is_negative_input); - } else if (type == "dequantize") { - op->SetInput("Input", {inputs[0]}); - op->SetOutput("Output", {outputs[0]}); - op->SetAttr("Scale", scale[0]); } else if (type == "requantize") { op->SetInput("Input", {inputs[0]}); op->SetOutput("Output", {outputs[0]}); @@ -303,31 +299,22 @@ ProgramDesc BuildConvMultiRequantProgramDesc(bool use_mkldnn, return prog; } -/* a->relu->b->Dequant->c(u8)->Quant->d-\ - * e->relu->f->Dequant->g(u8)->Quant->h--Concat1->x - * i->relu->j->Dequant->k(u8)->Quant->l-/ +/* a->relu->b->Dequant(u8)->c->Quant(u8)->d-\ + * e->relu->f->Dequant(u8)->g->Quant(u8)->h--Concat1->i */ -ProgramDesc BuildU8U8U8ConcatProgramDesc(float scale_out, float scale) { +ProgramDesc BuildU8U8ConcatProgramDesc(float scale_out, float scale) { ProgramDesc prog; for (auto& v : variable_names) { prog.MutableBlock(0)->Var(v); } SetOp(&prog, "relu", "Relu1", {"a"}, {"b"}, true, {scale, scale_out}); SetOp(&prog, "relu", "Relu2", {"e"}, {"f"}, true, {scale, scale_out}); - SetOp(&prog, "relu", "Relu3", {"i"}, {"j"}, true, {scale, scale_out}); - - SetOp( - &prog, "dequantize", "Dequant1", {"b"}, {"c"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant2", {"f"}, {"g"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant3", {"j"}, {"k"}, true, {scale, scale_out}); SetOp(&prog, - "quantize", - "Quant1", + "dequantize", + "Dequant1", + {"b"}, {"c"}, - {"d"}, true, {scale, scale_out}, 0.0f, @@ -336,10 +323,23 @@ ProgramDesc BuildU8U8U8ConcatProgramDesc(float scale_out, float scale) { 1, false); // is_negative_input = false SetOp(&prog, - "quantize", - "Quant2", + "dequantize", + "Dequant2", + {"f"}, {"g"}, - {"h"}, + true, + {scale, scale_out}, + 0.0f, + "float32", + false, + 1, + false); // is_negative_input = false + + SetOp(&prog, + "quantize", + "Quant1", + {"c"}, + {"d"}, true, {scale, scale_out}, 0.0f, @@ -349,9 +349,9 @@ ProgramDesc BuildU8U8U8ConcatProgramDesc(float scale_out, float scale) { false); // is_negative_input = false SetOp(&prog, "quantize", - "Quant3", - {"k"}, - {"l"}, + "Quant2", + {"g"}, + {"h"}, true, {scale, scale_out}, 0.0f, @@ -360,27 +360,47 @@ ProgramDesc BuildU8U8U8ConcatProgramDesc(float scale_out, float scale) { 1, false); // is_negative_input = false - SetOp(&prog, "concat", "Concat1", {"d", "h", "l"}, {"x"}, true); + SetOp(&prog, "concat", "Concat1", {"d", "h"}, {"i"}, true); return prog; } -/* a->relu->b->Dequant->c(u8)->Quant->d-\ - * e->relu->f->Dequant->g(u8)->Quant->h--Concat1->x - * i->pool2d->j->Dequant->k(s8)->Quant->l-/ +/* a->relu->b->Dequant(u8)->c->Quant(s8)->d-\ + * e->relu->f->Dequant(u8)->g->Quant(s8)->h--Concat1->x + * 
i->pool2d->j->Dequant(s8)->k->Quant(s8)->l-/ */ ProgramDesc BuildU8U8S8ConcatProgramDesc(float scale_out, float scale) { ProgramDesc prog; for (auto& v : variable_names) { prog.MutableBlock(0)->Var(v); } - SetOp(&prog, "relu", "Pool2d1", {"a"}, {"b"}, true, {scale, scale_out}); - SetOp(&prog, "relu", "Relu1", {"e"}, {"f"}, true, {scale, scale_out}); + SetOp(&prog, "relu", "Relu1", {"a"}, {"b"}, true, {scale, scale_out}); + SetOp(&prog, "relu", "Relu2", {"e"}, {"f"}, true, {scale, scale_out}); SetOp(&prog, "pool2d", "Pool2d2", {"i"}, {"j"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant1", {"b"}, {"c"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant2", {"f"}, {"g"}, true, {scale, scale_out}); + SetOp(&prog, + "dequantize", + "Dequant1", + {"b"}, + {"c"}, + true, + {scale, scale_out}, + 0.0f, + "float32", + false, + 1, + false); // is_negative_input = false + SetOp(&prog, + "dequantize", + "Dequant2", + {"f"}, + {"g"}, + true, + {scale, scale_out}, + 0.0f, + "float32", + false, + 1, + false); // is_negative_input = false SetOp( &prog, "dequantize", "Dequant3", {"j"}, {"k"}, true, {scale, scale_out}); @@ -392,9 +412,9 @@ ProgramDesc BuildU8U8S8ConcatProgramDesc(float scale_out, float scale) { return prog; } -/* a->pool2d->b->Dequant->c(s8)->Quant->d-\ - * e->relu->f->Dequant->g(u8)->Quant->h--Concat1->x - * i->pool2d->j->Dequant->k(s8)->Quant->l-/ +/* a->pool2d->b->Dequant(s8)->c->Quant(s8)->d-\ + * e->relu->f->Dequant(u8)->g->Quant(s8)->h--Concat1->x + * i->pool2d->j->Dequant(s8)->k->Quant(s8)->l-/ */ ProgramDesc BuildS8U8S8ConcatProgramDesc(float scale_out, float scale) { ProgramDesc prog; @@ -407,8 +427,18 @@ ProgramDesc BuildS8U8S8ConcatProgramDesc(float scale_out, float scale) { SetOp( &prog, "dequantize", "Dequant1", {"b"}, {"c"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant2", {"f"}, {"g"}, true, {scale, scale_out}); + SetOp(&prog, + "dequantize", + "Dequant2", + {"f"}, + {"g"}, + true, + {scale, scale_out}, + 0.0f, + "float32", + false, + 1, + false); // is_negative_input = false SetOp( &prog, "dequantize", "Dequant3", {"j"}, {"k"}, true, {scale, scale_out}); @@ -1141,13 +1171,12 @@ TEST(CpuQuantizeSquashPass, squash_all_s8_input_to_concat1) { } TEST(CpuQuantizeSquashPass, squash_all_u8_input_to_concat2) { - // removed 3 x 4 (dequantize_op, dequantize_out, quantize, quantize_out) - auto remove_nodes = 12; + // removed 2 x 4 (dequantize_op, dequantize_out, quantize, quantize_out) + auto remove_nodes = 8; std::unordered_map expected_operators = { - {"concat", 1}, {"quantize", 0}, {"dequantize", 0}, {"relu", 3}}; - CheckNodesTest(BuildU8U8U8ConcatProgramDesc(1.2f, 1.2f), - expected_operators, - remove_nodes); + {"concat", 1}, {"quantize", 0}, {"dequantize", 0}, {"relu", 2}}; + CheckNodesTest( + BuildU8U8ConcatProgramDesc(1.2f, 1.2f), expected_operators, remove_nodes); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h index a714f236c4616..6899a7202da9c 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h @@ -22,6 +22,9 @@ namespace paddle { namespace framework { namespace ir { +using StringPairMap = + std::unordered_map>; + static void SaveInfoInTheFirstOp( ir::Graph* graph, const std::string& flag, @@ -44,6 +47,31 @@ static void SaveInfoInTheFirstOp( } } +static void SaveInfoInTheFirstOp(ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + 
const StringPairMap& info_map) { + VLOG(3) << "save variables in the first op's attr"; + + const std::string suffix = "_" + key_suffix + "_" + flag; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + op_node->Op()->SetAttr(flag, true); + for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { + auto* data = iter->second.second.data(); + std::vector data_v(data, data + iter->second.second.numel()); + op_node->Op()->SetAttr(iter->first + suffix + "_unsigned", + iter->second.first); + op_node->Op()->SetAttr(iter->first + suffix, data_v); + } + break; + } +} + static void GetInfoFromTheFirstOp( ir::Graph* graph, const std::string& flag, @@ -77,6 +105,54 @@ static void GetInfoFromTheFirstOp( } } +static void GetInfoFromTheFirstOp(ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + StringPairMap* info_map) { + VLOG(3) << "get variables from the first op's attr"; + const std::string unsigned_flag = "_unsigned"; + const std::string suffix = "_" + key_suffix + "_" + flag; + const std::string suffix_is_unsigned = suffix + unsigned_flag; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + auto* op_desc = op_node->Op(); + if (op_desc->GetAttrIfExists(flag)) { + op_desc->RemoveAttr(flag); + std::vector attr_names = op_desc->AttrNames(); + for (auto fake_name : attr_names) { + auto is_unsigned = false; + size_t pos = fake_name.find(suffix_is_unsigned); + + if (pos != std::string::npos) { + std::string unsigned_var_name = fake_name; + is_unsigned = + PADDLE_GET_CONST(bool, op_desc->GetAttr(unsigned_var_name)); + + std::string var_name = fake_name.substr(0, pos); + size_t unsigned_pos = fake_name.find(unsigned_flag); + std::string vector_name = + fake_name.erase(unsigned_pos, unsigned_flag.length()); + auto scales_vector = PADDLE_GET_CONST(std::vector, + op_desc->GetAttr(vector_name)); + phi::DenseTensor tensor; + const int size = static_cast(scales_vector.size()); + auto data = tensor.mutable_data({size}, platform::CPUPlace()); + std::copy(scales_vector.begin(), scales_vector.end(), data); + auto pair = std::make_pair(is_unsigned, tensor); + info_map->insert(std::make_pair(var_name, pair)); + op_desc->RemoveAttr(unsigned_var_name); + op_desc->RemoveAttr(vector_name); + } + } + break; + } + } +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc index 177309376e825..b1a0aaa830e6a 100644 --- a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc @@ -52,36 +52,25 @@ bool HasBias(ir::Node* conv_op) { conv_op->Op()->Input("Bias").size() > 0; } -bool ShouldSkipConv(ir::Node* conv_op, Scope* scope, ir::Node* conv_filter) { - if (!platform::HasOpINT8DataType(conv_op->Op())) { - VLOG(4) << "Skipping non-int8 convolution (id: " << conv_op->id() << ")."; - return true; - } - - auto filter_var = scope->GetVar(conv_filter->Name()); - if (filter_var->Get().dtype() != phi::DataType::FLOAT32) { - VLOG(4) << "Skipping convolution (id: " << conv_op->id() - << ") because it's a bug that it is detected again."; - return true; - } - - VLOG(4) << "Not skipping 
convolution (id: " << conv_op->id() << ")"; - return false; -} - template void QuantizeConvInput(Scope* scope, ir::Graph* g, ir::Node* conv_op, const std::string& input_name, const std::string& scales_attr_name) { - const auto scales = - conv_op->Op()->GetAttrIfExists>(scales_attr_name); - - auto* tensor = scope->GetVar(input_name)->GetMutable(); - QuantizeParams(tensor, scales); - - conv_op->Op()->SetAttr(scales_attr_name, std::vector(1, 1)); + auto var = scope->GetVar(input_name); + if (var->Get().dtype() != phi::DataType::FLOAT32) { + VLOG(0) << "Skipping convolution filter: " << input_name + << " because it is detected again."; + conv_op->Op()->SetAttr(scales_attr_name, std::vector(1, 1)); + } else { + const auto scales = + conv_op->Op()->GetAttrIfExists>(scales_attr_name); + + auto* tensor = scope->GetVar(input_name)->GetMutable(); + QuantizeParams(tensor, scales); + conv_op->Op()->SetAttr(scales_attr_name, std::vector(1, 1)); + } } } // namespace @@ -151,7 +140,8 @@ void ParamsQuantizationMkldnnPass::QuantizeConv(ir::Graph* graph, PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - if (ShouldSkipConv(conv_op, scope, conv_filter)) { + // If not a quantized OP + if (!platform::HasOpINT8DataType(conv_op->Op())) { return; } diff --git a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc old mode 100644 new mode 100755 index 507f25d92d8bc..e04cf388ac0d7 --- a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc @@ -89,8 +89,14 @@ struct ProgramStrategy { virtual void CheckOp(const OpDesc& op) const = 0; - VarDesc* AddInput(OpDesc* op, std::string input_name, const Data& data) { - const std::string var_name = input_name + "_var"; + VarDesc* AddInput(OpDesc* op, + std::string input_name, + const Data& data, + const std::string user_var_name = "") { + std::string var_name = user_var_name; + if (var_name.empty()) { + var_name = input_name + "_var"; + } op->SetInput(input_name, {var_name}); auto var = program.MutableBlock(0)->Var(var_name); var->SetShape(data.getShape()); @@ -98,8 +104,14 @@ struct ProgramStrategy { return var; } - void AddOutput(OpDesc* op, std::string output_name, const Data& data) { - const std::string var_name = output_name + "_var"; + void AddOutput(OpDesc* op, + std::string output_name, + const Data& data, + const std::string user_var_name = "") { + std::string var_name = user_var_name; + if (var_name.empty()) { + var_name = output_name + "_var"; + } op->SetOutput(output_name, {var_name}); program.MutableBlock(0)->Var(var_name); test_scope.CreateTensor(var_name, data); @@ -117,21 +129,23 @@ struct ConvProgramStrategy : public ProgramStrategy { std::vector&& scale_weights, int groups = 1, Data&& bias = Data(), - std::vector&& scale_bias = {}) + std::vector&& scale_bias = {}, + bool share_weight = false) : input(std::move(input)), filter(std::move(filter)), output(std::move(output)), scale_weights(std::move(scale_weights)), groups(std::move(groups)), bias(std::move(bias)), - scale_bias(std::move(scale_bias)) {} + scale_bias(std::move(scale_bias)), + share_weight(std::move(share_weight)) {} protected: - OpDesc* CreateBasicConvOp() { + OpDesc* CreateBasicConvOp(const std::string conv_name = "Conv1") { auto op = program.MutableBlock(0)->AppendOp(); op->SetType("conv2d"); op->SetAttr("use_mkldnn", true); - op->SetAttr("name", 
std::string{"Conv1"}); + op->SetAttr("name", conv_name); op->SetAttr("mkldnn_data_type", std::string{"int8"}); op->SetAttr("data_format", std::string{"NCHW"}); op->SetAttr("dilations", std::vector({1, 1})); @@ -155,6 +169,20 @@ struct ConvProgramStrategy : public ProgramStrategy { AddInput(op, "Bias", bias); op->SetAttr("Bias_scales", scale_bias); } + + if (share_weight) { + OpDesc* op2 = CreateBasicConvOp("Conv2"); + AddInput(op2, "Input", input); + AddInput(op2, "Filter", filter)->SetPersistable(true); + AddOutput(op2, "Output", output, "output2"); + op2->SetAttr("Scale_weights", scale_weights); + op2->SetAttr("Scale_in", 1.0f); + op2->SetAttr("groups", groups); + if (HasBias()) { + AddInput(op2, "Bias", bias, "Bias2"); + op2->SetAttr("Bias_scales", scale_bias); + } + } } void CheckOp(const OpDesc& op) const override { @@ -210,9 +238,9 @@ struct ConvProgramStrategy : public ProgramStrategy { const Data output; const std::vector scale_weights; const int groups; - const Data bias; const std::vector scale_bias; + const bool share_weight; }; struct ParamsQuantizationMkldnnPassTestFixture : public ::testing::Test { @@ -340,6 +368,19 @@ TEST_F(ParamsQuantizationMkldnnPassTestFixture, conv_with_bias_2g2o2i1h1w) { RunPassTest(std::move(program)); } +TEST_F(ParamsQuantizationMkldnnPassTestFixture, conv_with_bias_2g2o2i1h1ws) { + auto program = std::make_unique( + GenericInput(), + Data({2, 2, 2, 1, 1}, {1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f}), + GenericOutput(), + std::vector{2.f, 2.f, 4.f, 4.f}, + 2, + Data({2, 2, 1, 1, 1}, {1.5f, 1.5f, 1.5f, 1.5f}), + std::vector{2.f, 2.f, 4.f, 4.f}, + true); + RunPassTest(std::move(program)); +} + } // namespace } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 65c64af464281..7ba71b619d106 100755 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -109,27 +109,34 @@ void QuantDequantMkldnnPass::CollectWeightScalesInfoFromONNXFormatDequantize( if (op_node->Name() == "dequantize_linear") { auto* op_desc = op_node->Op(); + + auto scale_name = op_desc->Input("Scale")[0]; + auto* var = scope->FindVar(scale_name); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::NotFound( + "The Scales variable [%s] of dequantize op is not found.", var)); + + auto* scale_tensor = var->GetMutable(); + auto* scale_data = scale_tensor->data(); + auto x_var_name = op_desc->Input("X")[0]; auto* weight_var = scope->FindVar(x_var_name); if (!weight_var) { auto out_var_name = op_desc->Output("Y")[0]; - if (var_quant_scales->count(x_var_name) && - !var_quant_scales->count(out_var_name)) { - std::vector scale_v = var_quant_scales->at(x_var_name); + float scale = 1.0 / scale_data[0]; + if (std::isinf(scale) || std::isnan(scale)) { + scale = 0.0; + } + std::vector scale_v = {scale}; + if (!var_quant_scales->count(out_var_name)) { var_quant_scales->insert(std::make_pair(out_var_name, scale_v)); } + if (!var_quant_scales->count(x_var_name)) { + var_quant_scales->insert(std::make_pair(x_var_name, scale_v)); + } } else { *onnx_format_quantize_model = true; - auto scale_name = op_desc->Input("Scale")[0]; - auto* var = scope->FindVar(scale_name); - PADDLE_ENFORCE_NOT_NULL( - var, - platform::errors::NotFound( - "The Scales variable [%s] of dequantize op is not found.", - var)); - - auto* scale_tensor = var->GetMutable(); - auto* scale_data = scale_tensor->data(); 
std::vector thresholds(scale_data, scale_data + scale_tensor->numel()); weight_thresholds->insert(std::make_pair(x_var_name, thresholds)); @@ -182,7 +189,7 @@ void QuantDequantMkldnnPass::CollectInputScalesFromQuantize( auto* scale_data = scale_tensor->data(); float scale = 1.0 / scale_data[0]; if (std::isinf(scale) || std::isnan(scale)) { - scale = 0.0; + continue; } if (!var_quant_scales->count(x_var_name)) { @@ -520,12 +527,10 @@ void QuantDequantMkldnnPass::ConvertFromINT8ToFP32( int step_c = step_n / size; for (int i = 0; i < weight_dims[0]; i++) { int begin_n = i * step_n; - for (int j = begin_n; j < begin_n + step_n; j++) { - for (int k = 0; k < size; k++) { - int begin_c = k * step_c; - for (int m = begin_c; m < begin_c + step_c; m++) { - weight_data[m] *= scales[k]; - } + for (int j = 0; j < size; j++) { + int begin_c = begin_n + j * step_c; + for (int k = 0; k < step_c; k++) { + weight_data[begin_c + k] *= scales[j]; } } } @@ -588,7 +593,8 @@ void QuantDequantMkldnnPass::DequantizeOpWeightsFromONNXFormat( Scope* scope, const std::string& weight_name, const std::unordered_map>& - weight_thresholds) const { + weight_thresholds, + std::vector* dequantized_weights_names) const { auto* op_desc = op_node->Op(); std::string weight_var_name = op_desc->Input(weight_name)[0]; @@ -596,6 +602,13 @@ void QuantDequantMkldnnPass::DequantizeOpWeightsFromONNXFormat( auto iter = weight_thresholds.find(weight_var_name); if (iter != weight_thresholds.end()) { scales = iter->second; + auto name_iter = std::find(dequantized_weights_names->begin(), + dequantized_weights_names->end(), + weight_var_name); + // Has been dequantized + if (name_iter != dequantized_weights_names->end()) { + return; + } } else { if (!IsInt8Weight(op_node, scope, weight_name)) { return; @@ -605,7 +618,7 @@ void QuantDequantMkldnnPass::DequantizeOpWeightsFromONNXFormat( "the model is correct.", weight_var_name)); } - + dequantized_weights_names->push_back(weight_var_name); auto* var = scope->FindVar(weight_var_name); PADDLE_ENFORCE_NOT_NULL( var, @@ -634,14 +647,17 @@ void QuantDequantMkldnnPass::DequantizeWeights( << "No need to dequantize weights because weight_thresholds is empty."; return; } - + std::vector dequantized_weights_names; for (auto* op_node : ir::TopologyVarientSort(*graph, static_cast(0))) { if (!op_node->IsOp()) continue; if (op_node->Name() == "conv2d" || op_node->Name() == "depthwise_conv2d") { if (onnx_format_quantize_model) { - DequantizeOpWeightsFromONNXFormat( - op_node, scope, "Filter", weight_thresholds); + DequantizeOpWeightsFromONNXFormat(op_node, + scope, + "Filter", + weight_thresholds, + &dequantized_weights_names); } else if (IsInt8Weight(op_node, scope, "Filter")) { DequantizeOpWeights( op_node, scope, "Filter", "Output", weight_thresholds); @@ -650,7 +666,7 @@ void QuantDequantMkldnnPass::DequantizeWeights( op_node->Name() == "matmul_v2") { if (onnx_format_quantize_model) { DequantizeOpWeightsFromONNXFormat( - op_node, scope, "Y", weight_thresholds); + op_node, scope, "Y", weight_thresholds, &dequantized_weights_names); } else if (IsInt8Weight(op_node, scope, "Y")) { DequantizeOpWeights(op_node, scope, "Y", "Out", weight_thresholds); } diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h old mode 100644 new mode 100755 index deb9072e04a49..3095cf4d05b15 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h @@ -125,7 +125,8 @@ class 
QuantDequantMkldnnPass : public FusePassBase { Scope* scope, const std::string& weight_name, const std::unordered_map>& - weight_thresholds) const; + weight_thresholds, + std::vector* dequantized_weights_names) const; void DequantizeWeights( ir::Graph* graph, diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 273a7ee8bc48c..f41cda93bf9cc 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -535,7 +535,8 @@ void BuildOpFuncList(const platform::Place& place, if (op_with_kernel->PhiKernel()->IsValid()) { run_phi_kernel = true; } else { - if (!op_with_kernel->SupportsKernelType(expected_kernel_key)) { + if (!op_with_kernel->SupportsKernelType(expected_kernel_key, + exec_ctx)) { auto phi_cpu_kernel_key = FallBackToCpu( expected_kernel_key, phi_kernel_key, *op_with_kernel); op_with_kernel->ResetPhiKernel( diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 67e7293877846..a483de6f21bed 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -140,29 +140,31 @@ ProgramDesc GetLmMainProgram() { return main_prog; } -// TEST(StandaloneExecutor, run) { -// auto place = platform::CUDAPlace(0); -// ProgramDesc test_prog = load_from_file("lm_startup_program"); -// ProgramDesc main_prog = GetLmMainProgram(); - -// Scope scope; -// StandaloneExecutor exec(place, test_prog, main_prog, &scope); -// exec.Run({}, {}, {}); -// auto start = std::chrono::steady_clock::now(); +TEST(StandaloneExecutor, run) { + auto place = platform::CUDAPlace(0); + ProgramDesc startup_prog = load_from_file("lm_startup_program"); + ProgramDesc main_prog = GetLmMainProgram(); -// for (size_t i = 0; i < 10; ++i) { -// if (i % 200 == 0) { -// std::cout << i << std::endl; -// } + Scope scope; + StandaloneExecutor startup_exec(place, startup_prog); + startup_exec.Run(&scope, {}, {}); + StandaloneExecutor exec(place, main_prog); + exec.Run(&scope, {}, {}); + auto start = std::chrono::steady_clock::now(); + + for (size_t i = 0; i < 10; ++i) { + if (i % 200 == 0) { + std::cout << i << std::endl; + } -// exec.Run({}, {}, {}); -// } + exec.Run(&scope, {}, {}); + } -// auto end = std::chrono::steady_clock::now(); -// std::chrono::duration diff = end - start; + auto end = std::chrono::steady_clock::now(); + std::chrono::duration diff = end - start; -// std::cout << "time cost " << diff.count() << std::endl; -// } + std::cout << "time cost " << diff.count() << std::endl; +} TEST(InterpreterCore, skip_gc_vars) { auto place = platform::CUDAPlace(0); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f32995ae41704..d8f0eb5324b66 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -50,6 +50,7 @@ class DenseTensor; #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_op_list.h" #endif #ifdef PADDLE_WITH_MLU @@ -72,7 +73,7 @@ std::vector> kKernelPriority = { std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), }; -static DDim GetDimsDebug(const ScopeBase& scope, +static DDim GetDimsDebug(const Scope& scope, const std::string& name, bool get_actual_dim = false) { Variable* var = scope.FindVar(name); @@ -96,13 +97,13 @@ static DDim 
GetDimsDebug(const ScopeBase& scope, } } -static bool VarInited(const ScopeBase& scope, const std::string& name) { +static bool VarInited(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); if (var == nullptr) return false; return var->IsInitialized(); } -static std::string GetDtype(const ScopeBase& scope, const std::string& name) { +static std::string GetDtype(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); if (var == nullptr) { return ""; @@ -128,7 +129,7 @@ static std::string GetDtype(const ScopeBase& scope, const std::string& name) { } } -static std::string GetPlace(const ScopeBase& scope, const std::string& name) { +static std::string GetPlace(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); if (var == nullptr) { return ""; @@ -157,7 +158,7 @@ static std::string GetPlace(const ScopeBase& scope, const std::string& name) { } } -static int GetRowSize(const ScopeBase& scope, const std::string& name) { +static int GetRowSize(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); if (var == nullptr) { return -1; @@ -170,7 +171,7 @@ static int GetRowSize(const ScopeBase& scope, const std::string& name) { return -1; } -static LoD GetLoDDebug(const ScopeBase& scope, const std::string& name) { +static LoD GetLoDDebug(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); auto default_lod = LoD({{}}); @@ -348,7 +349,7 @@ const std::vector& OperatorBase::Outputs( return it->second; } -std::string OperatorBase::DebugStringEx(const ScopeBase* scope) const { +std::string OperatorBase::DebugStringEx(const Scope* scope) const { std::stringstream ss; ss << "Op(" << type_ << "), inputs:{"; @@ -1352,7 +1353,7 @@ bool OperatorWithKernel::SupportsMKLDNN( } bool OperatorWithKernel::SupportsKernelType( - const OpKernelType& kernel_type) const { + const OpKernelType& kernel_type, const ExecutionContext& exe_ctx) const { auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); if (kernels_iter == all_op_kernels.end()) return false; @@ -1386,16 +1387,38 @@ bool OperatorWithKernel::SupportsKernelType( } #endif +// NOTE(jiahongyu): If MKLDNN can be used, the function SupportsKernelType needs +// to check whether current op supports MKLDNN kernel. There are three +// statements in if condition: The first statement checks whether library_type_ +// are changed by other high priority backends; the second checks whether this +// op has specific implementation; the third checks whether mkldnn kernel can be +// used. +#ifdef PADDLE_WITH_MKLDNN + if (kernel_type.library_type_ == framework::LibraryType::kPlain && + !paddle::platform::in_mkldnn_white_list(type_) && + this->CanMKLDNNBeUsed(exe_ctx, kernel_type.data_type_)) { + auto tmp_kernel_type = kernel_type; + tmp_kernel_type.library_type_ = framework::LibraryType::kMKLDNN; + tmp_kernel_type.data_layout_ = framework::DataLayout::kMKLDNN; + return kernels.find(tmp_kernel_type) != kernels.end(); + } +#endif + return kernel_iter != kernels.end(); } bool OperatorWithKernel::CanMKLDNNBeUsed(const framework::ExecutionContext& ctx, proto::VarType::Type data_type) const { + // NOTE(jiahongyu): Only mkldnn kernels need to check "use_mkldnn" attribute, + // hence we first call function SupportsMKLDNN. If we check "use_mkldnn" + // attribute first, it will cause error because some codes add "use_mkldnn" + // attribute to non-mkldnn ops. 
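// Reduced to a self-contained toy (the context type below is an
// illustrative stand-in, not the real framework class), the reordered
// predicate reads: consult the kernel registry first, and only then the
// "use_mkldnn" attribute, so ops that merely carry the attribute without
// owning an MKLDNN kernel are rejected up front.
#include <string>
#include <unordered_map>
struct ToyExecutionContext {
  std::unordered_map<std::string, bool> bool_attrs;
  bool on_cpu_place = true;
  bool HasAttr(const std::string& n) const { return bool_attrs.count(n) > 0; }
  bool BoolAttr(const std::string& n) const { return bool_attrs.at(n); }
};
bool ToyCanMKLDNNBeUsed(bool supports_mkldnn_kernel,
                        const ToyExecutionContext& ctx) {
  if (!supports_mkldnn_kernel) return false;  // kernel registry check first
  return ctx.HasAttr("use_mkldnn") && ctx.BoolAttr("use_mkldnn") &&
         ctx.on_cpu_place;
}
// The production version of the same ordering follows: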
+ if (!this->SupportsMKLDNN(data_type)) { + return false; + } const std::string use_mkldnn_attr = "use_mkldnn"; - bool use_mkldnn_ctx = ctx.HasAttr(use_mkldnn_attr) && - ctx.Attr(use_mkldnn_attr) && - platform::is_cpu_place(ctx.GetPlace()); - return use_mkldnn_ctx && this->SupportsMKLDNN(data_type); + return ctx.HasAttr(use_mkldnn_attr) && ctx.Attr(use_mkldnn_attr) && + platform::is_cpu_place(ctx.GetPlace()); } void OperatorWithKernel::InferShape(InferShapeContext* ctx) const { @@ -1544,6 +1567,23 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } else { phi_kernel_name = kernel_signature_->name; + +// NOTE(jiahongyu): The registered MKLDNN kernel have library_type = +// LibraryType::kMKLDNN and data_layout_ = DataLayout::kMKLDNN. But the default +// values are kPlain, so we need to modify the library_type and data_layout_ +// here. There are three statements in if condition: The first statement checks +// whether library_type_ are changed by other high priority backends; the second +// checks whether this op has specific implementation; the third checks whether +// mkldnn kernel can be used. +#ifdef PADDLE_WITH_MKLDNN + if (kernel_type_->library_type_ == framework::LibraryType::kPlain && + !paddle::platform::in_mkldnn_white_list(type_) && + this->CanMKLDNNBeUsed(exe_ctx, kernel_type_->data_type_)) { + kernel_type_->library_type_ = framework::LibraryType::kMKLDNN; + kernel_type_->data_layout_ = framework::DataLayout::kMKLDNN; + } +#endif + // NOTE(Liu-xiandong):In my ctest, this branch do not be executed, // I can't understand it, it's really confusing. // But we still need to keep this to avoid errors. @@ -1771,6 +1811,23 @@ void OperatorWithKernel::RunImpl(const Scope& scope, OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( const ExecutionContext& ctx) const { auto expected_kernel_key = this->GetExpectedKernelType(ctx); + +// NOTE(jiahongyu): PADDLE_WITH_MKLDNN codes are moved outside function +// GetExpectedKernelType, so that if MKLDNN can be used, the library_type_ and +// data_layout_ of expected_kernel_key need to be adjusted. There are three +// statements in if condition: The first statement checks whether library_type_ +// are changed by other high priority backends; the second checks whether this +// op has specific implementation; the third checks whether mkldnn kernel can be +// used. 
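// The three conditions named in the NOTE, condensed into a standalone
// sketch (the enums and helper are simplified stand-ins, not the real
// framework types):
enum class ToyLibrary { kPlain, kMKLDNN };
enum class ToyLayout { kAnyLayout, kMKLDNN };
struct ToyKernelKey {
  ToyLibrary library = ToyLibrary::kPlain;
  ToyLayout layout = ToyLayout::kAnyLayout;
};
ToyKernelKey PromoteToMKLDNN(ToyKernelKey key,
                             bool op_keeps_own_mkldnn_handling,
                             bool can_mkldnn_be_used) {
  if (key.library == ToyLibrary::kPlain &&  // 1) not already claimed by a
                                            //    higher-priority backend
      !op_keeps_own_mkldnn_handling &&      // 2) op is not on the mkldnn
                                            //    white list
      can_mkldnn_be_used) {                 // 3) a usable MKLDNN kernel exists
    key.library = ToyLibrary::kMKLDNN;
    key.layout = ToyLayout::kMKLDNN;
  }
  return key;
}
// The same promotion, applied to expected_kernel_key: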
+#ifdef PADDLE_WITH_MKLDNN + if (expected_kernel_key.library_type_ == framework::LibraryType::kPlain && + !paddle::platform::in_mkldnn_white_list(type_) && + this->CanMKLDNNBeUsed(ctx, expected_kernel_key.data_type_)) { + expected_kernel_key.library_type_ = framework::LibraryType::kMKLDNN; + expected_kernel_key.data_layout_ = framework::DataLayout::kMKLDNN; + } +#endif + if (HasAttr("op_device")) { if (Attr("op_device") == "cpu") { expected_kernel_key.place_ = platform::CPUPlace(); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e649bf2fc7e95..a8a0cd863ee10 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -168,7 +168,7 @@ class OperatorBase { virtual void Stop() {} /// if scope is not null, also show dimensions of arguments - virtual std::string DebugStringEx(const ScopeBase* scope) const; + virtual std::string DebugStringEx(const Scope* scope) const; std::string DebugString() const { return DebugStringEx(nullptr); } virtual bool SupportGPU() const { return false; } @@ -323,10 +323,16 @@ class ExecutionContext { virtual const Attribute& GetAttr(const std::string& name) const { auto iter = op_.Attrs().find(name); if (iter == op_.Attrs().end()) { - return op_.RuntimeAttrs().at(name); - } else { - return iter->second; + iter = op_.RuntimeAttrs().find(name); + PADDLE_ENFORCE_NE( + iter, + op_.RuntimeAttrs().end(), + platform::errors::NotFound("(%s) is not found in AttributeMap and " + "RuntimeAttributeMap of (%s) operator.", + name, + op_.Type())); } + return iter->second; } virtual bool HasInput(const std::string& name) const; @@ -621,7 +627,8 @@ class OperatorWithKernel : public OperatorBase { bool SupportsMKLDNN(proto::VarType::Type data_type) const; - bool SupportsKernelType(const OpKernelType& kernel_type) const; + bool SupportsKernelType(const OpKernelType& kernel_type, + const ExecutionContext& exe_ctx) const; bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx, proto::VarType::Type data_type) const; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.cc b/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.cc index 26416269c9e1f..dc36f40d9c6a3 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.cc @@ -50,46 +50,16 @@ std::unordered_set GetConsumerOps(Node* node) { return consumers; } -struct Hasher { - size_t operator()(const CinnSubGraphPtr& subgraph) const noexcept { - return std::hash()(reinterpret_cast(subgraph.get())); - } -}; -struct Comparator { - bool operator()(const CinnSubGraphPtr& first, - const CinnSubGraphPtr& second) const noexcept { - return first.get() == second.get(); - } -}; - -struct CinnSubGraph { - using CinnSubGraphPtr = std::shared_ptr; - // construct function - CinnSubGraph() {} - // construct function - CinnSubGraph(Node* op, bool subst) : substitute(subst) { Insert(op); } +void CinnSubGraph::Insert(Node* op) { + nodes.push_back(op); + node_set.insert(op); - void Insert(Node* op) { - nodes.push_back(op); - node_set.insert(op); - - auto producers = GetProducerOps(op); - for (auto producer : producers) { - input_nodes.insert(producer); - } - input_nodes.erase(op); + auto producers = GetProducerOps(op); + for (auto producer : producers) { + input_nodes.insert(producer); } - - int depth{0}; - int max_depth{0}, min_depth{INT_MAX}; - bool substitute{true}; - std::vector nodes; - std::unordered_set node_set; - std::unordered_set input_nodes; - - std::unordered_set 
producers; - std::unordered_set consumers; -}; + input_nodes.erase(op); +} void CinnSubgraphDetector::DoOpFusion() { // sort node from input to output @@ -183,7 +153,7 @@ void CinnSubgraphDetector::DoSubGraphFusion() { continue; } // do fusion - update |= FuseSubGraph(&subgraph); + update |= FuseSubGraph(subgraph); } if (!update) { break; @@ -191,8 +161,8 @@ void CinnSubgraphDetector::DoSubGraphFusion() { } } -bool CinnSubgraphDetector::FuseSubGraph(CinnSubGraphPtr* subgraph_ptr) { - auto producer = *subgraph_ptr; +bool CinnSubgraphDetector::FuseSubGraph(CinnSubGraphPtr subgraph_ptr) { + auto producer = subgraph_ptr; auto& consumers = producer->consumers; std::vector candidates; for (auto& consumer : consumers) { @@ -276,11 +246,11 @@ bool CinnSubgraphDetector::FuseSubGraph(CinnSubGraphPtr* subgraph_ptr) { bool CinnSubgraphDetector::IsDependency( const CinnSubGraphPtr& producer_g, const CinnSubGraphPtr& consumer, - const std::unordered_set& consumers) { + const std::unordered_set& consumers) { std::queue candidates; candidates.push(consumer); - std::unordered_set visited_set; + std::unordered_set visited_set; while (!candidates.empty()) { auto& candidate = candidates.front(); candidates.pop(); @@ -303,12 +273,12 @@ bool CinnSubgraphDetector::IsDependency( bool CinnSubgraphDetector::IsDependencySimplify( const CinnSubGraphPtr& producer_g, const CinnSubGraphPtr& consumer, - const std::unordered_set& consumers) { + const std::unordered_set& consumers) { std::queue candidates; candidates.push(consumer); // check upper bound. int check_upper_depth = producer_g->max_depth; - std::unordered_set visited_set; + std::unordered_set visited_set; while (!candidates.empty()) { auto& candidate = candidates.front(); candidates.pop(); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.h b/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.h index 1eb3ebbe62fca..e8ff3915c8511 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.h @@ -31,10 +31,32 @@ namespace paddle2cinn { using Node = ir::Node; using Graph = ir::Graph; -struct Hasher; -struct Comparator; +/* + * + * + */ struct CinnSubGraph; using CinnSubGraphPtr = std::shared_ptr; + +struct CinnSubGraph { + // construct function + CinnSubGraph() {} + // construct function + CinnSubGraph(Node *op, bool subst) : substitute(subst) { Insert(op); } + void Insert(Node *op); + + int depth{0}; + int max_depth{0}; + int min_depth{INT_MAX}; + bool substitute{true}; + std::vector nodes; + std::unordered_set node_set; + std::unordered_set input_nodes; + + std::unordered_set producers; + std::unordered_set consumers; +}; + /* * Detect the nodes in a subgraph that meet some conditions. This class doesn't * modify the graph. @@ -55,16 +77,14 @@ class CinnSubgraphDetector { void BuildSubGraph(); // SubGraph Fusion void DoSubGraphFusion(); - bool FuseSubGraph(CinnSubGraphPtr *); + bool FuseSubGraph(CinnSubGraphPtr); // check exist depency. 
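// The dependency test declared just below is a breadth-first search over
// subgraph consumers: fusing producer into consumer is only safe if the
// producer cannot be reached again through some other consumer chain,
// since that would introduce a cycle. A minimal sketch over a simplified
// node type (an assumption, not the real CinnSubGraph):
#include <memory>
#include <queue>
#include <unordered_set>
struct ToySubGraph {
  std::unordered_set<std::shared_ptr<ToySubGraph>> consumers;
};
using ToySubGraphPtr = std::shared_ptr<ToySubGraph>;
bool ToyHasPath(const ToySubGraphPtr& from, const ToySubGraphPtr& target) {
  std::queue<ToySubGraphPtr> pending;
  // std::hash and operator== on shared_ptr already give pointer identity,
  // which is why the custom Hasher/Comparator could be deleted above.
  std::unordered_set<ToySubGraphPtr> visited;
  pending.push(from);
  while (!pending.empty()) {
    ToySubGraphPtr cur = pending.front();
    pending.pop();
    if (!visited.insert(cur).second) continue;  // already explored
    for (const auto& next : cur->consumers) {
      if (next == target) return true;
      pending.push(next);
    }
  }
  return false;
}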
- bool IsDependency( - const CinnSubGraphPtr &, - const CinnSubGraphPtr &, - const std::unordered_set &); - bool IsDependencySimplify( - const CinnSubGraphPtr &, - const CinnSubGraphPtr &, - const std::unordered_set &); + bool IsDependency(const CinnSubGraphPtr &, + const CinnSubGraphPtr &, + const std::unordered_set &); + bool IsDependencySimplify(const CinnSubGraphPtr &, + const CinnSubGraphPtr &, + const std::unordered_set &); private: Graph *graph_; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 7f08fc9b4e22c..b87a294878051 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -38,17 +38,6 @@ class Variable; namespace paddle { namespace framework { - -// TODO(zhiqiu): add more function in base class -class ScopeBase { - public: - /// Find a variable in the scope or any of its ancestors. Returns - /// nullptr if cannot find. - /// Caller doesn't own the returned Variable. - virtual Variable* FindVar(const std::string& name) const = 0; - virtual ~ScopeBase() {} -}; - /** * @brief Scope that manage all variables. * @@ -57,7 +46,7 @@ class ScopeBase { * One net can run in different scopes and update different variable in the * scope. */ -class Scope : public ScopeBase { +class Scope { public: Scope() {} ~Scope(); diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index f5ae14bbf6109..18cca8739ec0f 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -79,7 +79,7 @@ class BKCLCommunicator; namespace framework { class LoDRankTable; -class ScopeBase; +class Scope; class ReaderHolder; class Scope; } // namespace framework diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index 7ed6e93ec7c7c..6d4f7c347b097 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -102,7 +102,10 @@ class DygraphExecutionContext : public framework::ExecutionContext { } bool HasAttr(const std::string& name) const override { - return attrs_.count(name) != 0 || default_attrs_.count(name) != 0; + if (attrs_.find(name) == attrs_.end()) { + return default_attrs_.find(name) != default_attrs_.end(); + } + return true; } const framework::AttributeMap& Attrs() const override { return attrs_; } diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 30cf0e82e9ff4..1f70bcf4f428a 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -25,6 +25,9 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_op_list.h" +#endif #include "paddle/fluid/framework/library_type.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" @@ -185,13 +188,29 @@ PreparedOp PrepareImpl( phi::KernelSignature kernel_signature; phi::KernelKey phi_kernel_key; std::string phi_kernel_name; + +// NOTE(jiahongyu): The registered MKLDNN kernel have library_type = +// LibraryType::kMKLDNN and data_layout_ = DataLayout::kMKLDNN. But the default +// values are kPlain, so we need to modify the library_type and data_layout_ +// here. 
There are three statements in if condition: The first statement checks +// whether library_type_ are changed by other high priority backends; the second +// checks whether this op has specific implementation; the third checks whether +// mkldnn kernel can be used. +#ifdef PADDLE_WITH_MKLDNN + if (expected_kernel_key.library_type_ == framework::LibraryType::kPlain && + !paddle::platform::in_mkldnn_white_list(op.Type()) && + op.CanMKLDNNBeUsed(dygraph_exe_ctx, expected_kernel_key.data_type_)) { + expected_kernel_key.library_type_ = framework::LibraryType::kMKLDNN; + expected_kernel_key.data_layout_ = framework::DataLayout::kMKLDNN; + } +#endif + #if defined(PADDLE_WITH_XPU) bool is_xpu_unsupport = paddle::platform::is_xpu_place(expected_kernel_key.place_) && !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) || paddle::platform::is_in_xpu_black_list(op.Type()); - #endif bool has_phi_kernel = false; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 42126b5048e68..4834039d64f15 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2174,6 +2174,7 @@ USE_TRT_CONVERTER(flatten); USE_TRT_CONVERTER(flatten_contiguous_range); USE_TRT_CONVERTER(matmul); USE_TRT_CONVERTER(matmul_v2); +USE_TRT_CONVERTER(bmm); USE_TRT_CONVERTER(conv2d); USE_TRT_CONVERTER(relu); USE_TRT_CONVERTER(exp); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index ed6508929ca1f..5e9e6d8f2c4f1 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -4,6 +4,7 @@ list( CONVERT_FILES matmul_op.cc matmul_v2_op.cc + bmm_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/bmm_op.cc b/paddle/fluid/inference/tensorrt/convert/bmm_op.cc new file mode 100644 index 0000000000000..4f4751d8ca977 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/bmm_op.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class BMMOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + nvinfer1::ILayer* layer = nullptr; + + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); + auto output_name = op_desc.Output("Out")[0]; + + layer = TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input1, + nvinfer1::MatrixOperation::kNONE, + *input2, + nvinfer1::MatrixOperation::kNONE); + + RreplenishLayerAndOutput(layer, "bmm", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(bmm, BMMOpConverter); diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc index bba2e84e32b9f..4c5944e79451c 100644 --- a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc @@ -105,10 +105,287 @@ nvinfer1::DimsExprs InstanceNormInferMeta( return x_dims; } +inline const nvinfer1::IDimensionExpr* CalcOutputSize( + const nvinfer1::IDimensionExpr* input_size, + const nvinfer1::IDimensionExpr* filter_size, + const nvinfer1::IDimensionExpr* dilation, + const nvinfer1::IDimensionExpr* padding1, + const nvinfer1::IDimensionExpr* padding2, + const nvinfer1::IDimensionExpr* stride, + nvinfer1::IExprBuilder& expr_builder // NOLINT +) { + // dkernel = dilation * (filter_size - 1) + 1; + const nvinfer1::IDimensionExpr* dkernel = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kPROD, + *dilation, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUB, + *filter_size, + *expr_builder.constant(1))), + *expr_builder.constant(1)); + + // output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1; + const nvinfer1::IDimensionExpr* tmp = expr_builder.operation( + nvinfer1::DimensionOperation::kSUB, + *expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, *input_size, *padding1), + *padding2), + *dkernel); + + const nvinfer1::IDimensionExpr* output_size = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, *tmp, *stride), + *expr_builder.constant(1)); + return output_size; +} + +nvinfer1::DimsExprs UnflodInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + PADDLE_ENFORCE_EQ( + nb_inputs, + 1, + phi::errors::InvalidArgument("inputs of unfold should be equal to 1, " + "But received (%s)", + nb_inputs)); + + const nvinfer1::DimsExprs in_dims = inputs[0]; + std::vector out_dims; + out_dims.push_back(in_dims.d[0]); + + auto kernel_sizes = + PADDLE_GET_CONST(std::vector, op_desc.GetAttr("kernel_sizes")); + auto dilations = + PADDLE_GET_CONST(std::vector, op_desc.GetAttr("dilations")); + auto paddings = + PADDLE_GET_CONST(std::vector, 
op_desc.GetAttr("paddings")); + auto strides = PADDLE_GET_CONST(std::vector, op_desc.GetAttr("strides")); + + // output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; + const nvinfer1::IDimensionExpr* output_channels = expr_builder.operation( + nvinfer1::DimensionOperation::kPROD, + *in_dims.d[1], + *expr_builder.operation(nvinfer1::DimensionOperation::kPROD, + *expr_builder.constant(kernel_sizes[0]), + *expr_builder.constant(kernel_sizes[1]))); + out_dims.push_back(output_channels); + + const nvinfer1::IDimensionExpr* output_height = + CalcOutputSize(in_dims.d[2], + expr_builder.constant(kernel_sizes[0]), + expr_builder.constant(dilations[0]), + expr_builder.constant(paddings[0]), + expr_builder.constant(paddings[2]), + expr_builder.constant(strides[0]), + expr_builder); + const nvinfer1::IDimensionExpr* output_width = + CalcOutputSize(in_dims.d[3], + expr_builder.constant(kernel_sizes[1]), + expr_builder.constant(dilations[1]), + expr_builder.constant(paddings[1]), + expr_builder.constant(paddings[3]), + expr_builder.constant(strides[1]), + expr_builder); + + const nvinfer1::IDimensionExpr* output_col_length = expr_builder.operation( + nvinfer1::DimensionOperation::kPROD, *output_height, *output_width); + + out_dims.push_back(output_col_length); + nvinfer1::DimsExprs output; + output.nbDims = out_dims.size(); + for (size_t i = 0; i < out_dims.size(); i++) output.d[i] = out_dims[i]; + return output; +} + +nvinfer1::DimsExprs ScatterNdAddInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + PADDLE_ENFORCE_EQ(nb_inputs, + 3, + phi::errors::InvalidArgument( + "inputs of scatter_nd_add should be equal to 3, " + "But received (%s)", + nb_inputs)); + const nvinfer1::DimsExprs ref_dims = inputs[0]; + return ref_dims; +} + +nvinfer1::DimsExprs UnchangedInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + PADDLE_ENFORCE_EQ(nb_inputs, + 1, + phi::errors::InvalidArgument( + "inputs of UnchangedInferMeta should be equal to 1, " + "But received (%s)", + nb_inputs)); + return inputs[0]; +} + +nvinfer1::DimsExprs Pad3dInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + const nvinfer1::DimsExprs x_dim = inputs[0]; + + nvinfer1::DimsExprs out_dims; + out_dims.nbDims = x_dim.nbDims; + + out_dims.d[0] = x_dim.d[0]; + + auto paddings = + PADDLE_GET_CONST(std::vector, op_desc.GetAttr("paddings")); + auto data_format = + PADDLE_GET_CONST(std::string, op_desc.GetAttr("data_format")); + + if (data_format == "NCDHW") { + out_dims.d[1] = x_dim.d[1]; + } else { + out_dims.d[4] = x_dim.d[4]; + } + + if (data_format == "NCDHW") { + // depth + out_dims.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[2], + *expr_builder.constant(paddings[4])), + *expr_builder.constant(paddings[5])); + // height + out_dims.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[3], + *expr_builder.constant(paddings[2])), + *expr_builder.constant(paddings[3])); + // width + out_dims.d[4] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + 
*expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[4], + *expr_builder.constant(paddings[0])), + *expr_builder.constant(paddings[1])); + } else { // NDHWC + // depth + out_dims.d[1] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[1], + *expr_builder.constant(paddings[4])), + *expr_builder.constant(paddings[5])); + // height + out_dims.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[2], + *expr_builder.constant(paddings[2])), + *expr_builder.constant(paddings[3])); + // width + out_dims.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[3], + *expr_builder.constant(paddings[0])), + *expr_builder.constant(paddings[1])); + } + return out_dims; +} + +nvinfer1::DimsExprs PNormInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + const nvinfer1::DimsExprs x_dim = inputs[0]; + std::vector reduce_dims; + std::vector keep_dims; + + bool asvector = PADDLE_GET_CONST(bool, op_desc.GetAttr("asvector")); + bool keepdim = PADDLE_GET_CONST(bool, op_desc.GetAttr("keepdim")); + int axis = PADDLE_GET_CONST(int, op_desc.GetAttr("axis")); + + if (asvector) { + reduce_dims.emplace_back(expr_builder.constant(1)); + keep_dims.emplace_back(expr_builder.constant(1)); + if (keepdim) { + for (int i = 1; i < x_dim.nbDims; ++i) { + keep_dims.emplace_back(expr_builder.constant(1)); + } + } + } else { + if (axis < 0) axis = x_dim.nbDims + axis; + for (int i = 0; i < x_dim.nbDims; ++i) { + if (i != axis) reduce_dims.emplace_back(x_dim.d[i]); + } + if (reduce_dims.size() == 0) { + reduce_dims.emplace_back(expr_builder.constant(1)); + } + } + keep_dims[axis] = expr_builder.constant(1); + + nvinfer1::DimsExprs output; + if (keepdim) { + output.nbDims = keep_dims.size(); + for (int i = 0; i < output.nbDims; i++) output.d[i] = keep_dims[i]; + } else { + output.nbDims = reduce_dims.size(); + for (int i = 0; i < output.nbDims; i++) output.d[i] = reduce_dims[i]; + } + return output; +} + +nvinfer1::DimsExprs GridSamplerInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + const nvinfer1::DimsExprs x_dims = inputs[0]; + const nvinfer1::DimsExprs grid_dims = inputs[1]; + + nvinfer1::DimsExprs output; + if (grid_dims.nbDims == 4) { + output.nbDims = 4; + output.d[0] = x_dims.d[0]; + output.d[1] = x_dims.d[1]; + output.d[2] = grid_dims.d[1]; + output.d[3] = grid_dims.d[2]; + } else { + output.nbDims = 4; + output.d[0] = x_dims.d[0]; + output.d[1] = x_dims.d[1]; + output.d[2] = grid_dims.d[1]; + output.d[3] = grid_dims.d[2]; + output.d[4] = grid_dims.d[3]; + } + return output; +} + PD_REGISTER_DYNAMIC_INFER_META_FN(gather_nd, GatherNdInferMeta); PD_REGISTER_DYNAMIC_INFER_META_FN(yolo_box, YoloBoxInferMeta); PD_REGISTER_DYNAMIC_INFER_META_FN(instance_norm, InstanceNormInferMeta); - +PD_REGISTER_DYNAMIC_INFER_META_FN(unfold, UnflodInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(scatter_nd_add, ScatterNdAddInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(inverse, UnchangedInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(pad3d, Pad3dInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(grid_sampler, GridSamplerInferMeta); } 
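// For reference, the scalar arithmetic that CalcOutputSize above encodes
// with IDimensionExpr nodes is the standard convolution output-size
// formula (the helper below is an illustrative sketch, not part of this
// diff):
#include <cassert>
int CalcOutputSizeScalar(int input_size, int filter_size, int dilation,
                         int padding1, int padding2, int stride) {
  assert(stride > 0);
  int dkernel = dilation * (filter_size - 1) + 1;
  return (input_size + padding1 + padding2 - dkernel) / stride + 1;
}
// Example: input 224, 3x3 kernel, dilation 1, symmetric padding 1,
// stride 2 -> (224 + 1 + 1 - 3) / 2 + 1 = 112.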
// namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h index 0bc2ff78b68df..c0ddaf5d983ef 100644 --- a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h @@ -23,6 +23,11 @@ namespace tensorrt { USE_TRT_DYNAMIC_INFER_META_FN(gather_nd); USE_TRT_DYNAMIC_INFER_META_FN(yolo_box); USE_TRT_DYNAMIC_INFER_META_FN(instance_norm); +USE_TRT_DYNAMIC_INFER_META_FN(unfold); +USE_TRT_DYNAMIC_INFER_META_FN(scatter_nd_add); +USE_TRT_DYNAMIC_INFER_META_FN(pad3d); +USE_TRT_DYNAMIC_INFER_META_FN(inverse); +USE_TRT_DYNAMIC_INFER_META_FN(grid_sampler); } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 904768d179d32..5fea48604ae11 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -327,6 +327,12 @@ struct SimpleOpTypeSetTeller : public Teller { } } + if (op_type == "bmm") { + if (!with_dynamic_shape) { + return false; + } + } + if (op_type == "matmul_v2") { if (!with_dynamic_shape) { return false; @@ -2115,6 +2121,7 @@ struct SimpleOpTypeSetTeller : public Teller { "mul", "matmul", "matmul_v2", + "bmm", "conv2d", "conv2d_fusion", "pool2d", @@ -2227,6 +2234,7 @@ struct SimpleOpTypeSetTeller : public Teller { "mul", "matmul", "matmul_v2", + "bmm", "conv2d", "conv2d_fusion", "pool2d", @@ -2353,6 +2361,14 @@ struct GenericPluginTeller : public Teller { if (!desc.HasAttr("iou_aware") && !desc.HasAttr("iou_aware_factor")) return false; } + if (op_type == "pad3d") { + auto pad3d_inputs = desc.Inputs(); + if (pad3d_inputs.find("Paddings") != pad3d_inputs.end()) { + if (desc.Input("Paddings").size() >= 1) { + return false; + } + } + } if (use_no_calib_int8) { return false; } else { diff --git a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu index d9afa475bff6a..e083e9633dc29 100644 --- a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu @@ -290,6 +290,10 @@ bool GenericPlugin::supportsFormatCombination( if (op_desc_.Type() == "gather_nd" || op_desc_.Type() == "yolo_box") { if (pos == 0) return in_out[pos].type == nvinfer1::DataType::kFLOAT; if (pos == 1) return in_out[pos].type == nvinfer1::DataType::kINT32; + } else if (op_desc_.Type() == "scatter_nd_add") { + if (pos == 0) return in_out[pos].type == nvinfer1::DataType::kFLOAT; + if (pos == 1) return in_out[pos].type == nvinfer1::DataType::kINT32; + if (pos == 2) return in_out[pos].type == nvinfer1::DataType::kFLOAT; } else { return in_out[pos].type == nvinfer1::DataType::kFLOAT; } diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc index 727d2576e57f7..db1f2953c742f 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -40,7 +40,12 @@ T GetValueFromStream(std::stringstream &ss); template <> std::string GetValueFromStream(std::stringstream &ss); -TEST(Analyzer_bert, profile) { profile(); } +TEST(Analyzer_bert, profile) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif + profile(); +} #ifdef PADDLE_WITH_MKLDNN 
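// The setenv pattern added to the tests above, factored out as a sketch
// (the helper name is hypothetical; the tests inline this directly).
// NVIDIA libraries honor NVIDIA_TF32_OVERRIDE=0 by disabling TF32 math on
// Ampere-class GPUs, which keeps FP32 accuracy comparisons deterministic;
// setenv() is POSIX-only, hence the _WIN32 guard.
#include <cstdlib>
static void DisableTF32ForTest() {
#if !defined(_WIN32)
  setenv("NVIDIA_TF32_OVERRIDE", "0", /*overwrite=*/1);
#endif
}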
TEST(Analyzer_bert, profile_mkldnn) { @@ -57,6 +62,9 @@ TEST(Analyzer_bert, profile_mkldnn_bf16) { // Check the fuse status TEST(Analyzer_bert, fuse_statis) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif auto cfg(SetConfig()); int num_ops; auto predictor = CreatePaddlePredictor(cfg); @@ -65,7 +73,12 @@ TEST(Analyzer_bert, fuse_statis) { LOG(INFO) << "num_ops: " << num_ops; } -TEST(Analyzer_bert, compare) { CompareNativeAndAnalysisWrapper(); } +TEST(Analyzer_bert, compare) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif + CompareNativeAndAnalysisWrapper(); +} #ifdef PADDLE_WITH_MKLDNN TEST(Analyzer_bert, compare_mkldnn) { auto use_mkldnn = true; @@ -75,6 +88,9 @@ TEST(Analyzer_bert, compare_mkldnn) { // Compare Deterministic result TEST(Analyzer_bert, compare_determine) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif auto cfg(SetConfig()); auto inputs = LoadInputData(); @@ -83,6 +99,9 @@ TEST(Analyzer_bert, compare_determine) { } TEST(Analyzer_bert, transfer_scope_cache) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif auto config(SetConfig()); std::vector input, output; diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc index 529bc0a8194ba..1efbe7cecdde4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc @@ -39,18 +39,31 @@ void profile(bool use_mkldnn = false, bool use_gpu = false) { FLAGS_num_threads); } -TEST(Analyzer_ernie, profile) { profile(); } +TEST(Analyzer_ernie, profile) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif + profile(); +} #ifdef PADDLE_WITH_MKLDNN TEST(Analyzer_ernie, profile_mkldnn) { profile(true, false); } #endif // Check the model by gpu #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -TEST(Analyzer_ernie, profile_gpu) { profile(false, true); } +TEST(Analyzer_ernie, profile_gpu) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif + profile(false, true); +} #endif // Check the fuse status TEST(Analyzer_Ernie, fuse_statis) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif AnalysisConfig cfg; SetConfig(&cfg); @@ -85,13 +98,21 @@ void compare(bool use_mkldnn = false) { reinterpret_cast(&cfg), inputs); } -TEST(Analyzer_ernie, compare) { compare(); } +TEST(Analyzer_ernie, compare) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif + compare(); +} #ifdef PADDLE_WITH_MKLDNN TEST(Analyzer_ernie, compare_mkldnn) { compare(true /* use_mkldnn */); } #endif // Compare Deterministic result TEST(Analyzer_Ernie, compare_determine) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif AnalysisConfig cfg; SetConfig(&cfg); auto pass_builder = cfg.pass_builder(); @@ -104,6 +125,9 @@ TEST(Analyzer_Ernie, compare_determine) { // Compare results TEST(Analyzer_Ernie, compare_results) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif AnalysisConfig cfg; SetConfig(&cfg); auto pass_builder = cfg.pass_builder(); @@ -150,6 +174,9 @@ TEST(Analyzer_Ernie_ipu, ipu_compare_determine) { // IPU: Compare results TEST(Analyzer_Ernie_ipu, ipu_compare_results) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif AnalysisConfig cfg; SetIpuConfig(&cfg); diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc 
b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc index 870c73fe1e6f6..aeefcf1059243 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc @@ -23,6 +23,9 @@ namespace paddle { namespace inference { void run(const AnalysisConfig& config, std::vector* out_data, int bs) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif auto predictor = CreatePaddlePredictor(config); auto input_names = predictor->GetInputNames(); @@ -222,6 +225,9 @@ std::shared_ptr InitPredictor() { } void run(paddle_infer::Predictor* predictor, std::vector* out_data) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif const int run_batch = 2; const int run_seq_len = 71; const int max_seq_len = 128; diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index d8fd433c0417c..3310bdbbe8254 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -21,9 +21,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif namespace paddle { namespace operators { @@ -36,15 +33,6 @@ class AbsOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -86,15 +74,6 @@ class AbsGradOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 5160071486244..6f59e44d546fb 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -82,27 +82,18 @@ class ActivationGradOpMaker : public framework::SingleGradOpMaker { framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, const framework::OperatorWithKernel& oper, const std::string& name) { - framework::LibraryType library{framework::LibraryType::kPlain}; - framework::DataLayout layout = framework::DataLayout::kAnyLayout; auto data_type = oper.IndicateVarDataType(ctx, name); -// FIXME(liuwei1031) temporarily disable the code to unblock users -// TODO(liuwei1031) figure out the reason behind -// https://github.com/PaddlePaddle/Paddle/issues/16096 -// and re-enable this in the future -// #ifdef PADDLE_WITH_CUDA -// auto it1 = oper.Attrs().find("use_cudnn"); -// if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) { -// library = framework::LibraryType::kCUDNN; -// } -// #endif -#ifdef PADDLE_WITH_MKLDNN - if (library == framework::LibraryType::kPlain && - oper.CanMKLDNNBeUsed(ctx, data_type)) { - library = 
framework::LibraryType::kMKLDNN; - layout = framework::DataLayout::kMKLDNN; - } -#endif - return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + // FIXME(liuwei1031) temporarily disable the code to unblock users + // TODO(liuwei1031) figure out the reason behind + // https://github.com/PaddlePaddle/Paddle/issues/16096 + // and re-enable this in the future + // #ifdef PADDLE_WITH_CUDA + // auto it1 = oper.Attrs().find("use_cudnn"); + // if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) { + // library = framework::LibraryType::kCUDNN; + // } + // #endif + return framework::OpKernelType(data_type, ctx.GetPlace()); } class ActivationOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 4979ab0345200..4f134ff974637 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -197,16 +197,6 @@ framework::OpKernelType BatchNormOp::GetExpectedKernelType( platform::errors::InvalidArgument( "Variance input should be of float type")); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -396,18 +386,7 @@ framework::OpKernelType BatchNormGradOp::GetExpectedKernelType( platform::errors::InvalidArgument("gradient variable of Y is empty")); } - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index 7994dacf08794..997c017d3129c 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -30,15 +30,6 @@ class ClipOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -98,15 +89,6 @@ class ClipOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index a875f1fc8df9e..ae65930b86ac0 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -24,10 +24,6 @@ limitations under 
the License. */ #include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" -#ifdef PADDLE_WITH_MKLDNN -#include -#endif - namespace paddle { namespace operators { using Tensor = phi::DenseTensor; @@ -53,14 +49,6 @@ class ConcatOp : public framework::OperatorWithKernel { PADDLE_THROW(platform::errors::InvalidArgument( "All Inputs of Concat OP are Empty!")); } -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -127,19 +115,6 @@ class ConcatOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); - -#ifdef PADDLE_WITH_MKLDNN - // extra checking if attr "use_mkldnn" exist is needed because - // test_reverse_op is calling concat_grad kernel without setting - // "use_mkldnn" to any value - if (ctx.HasAttr("use_mkldnn") && - this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 42e5eb2a43820..c80cc2dc734c6 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -49,15 +49,6 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( } } #endif -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index 6f46e5bf2f8ed..4d620b5181ccb 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -18,9 +18,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/data_layout.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -199,15 +196,6 @@ class DataNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "bias input should be of float type")); } - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -508,18 +496,7 @@ class DataNormGradOp : public framework::OperatorWithKernel { "Y@GRAD can not be found for computation")); } - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index b1f0817539f17..c8289ab098a3a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -45,15 +45,6 @@ class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Out"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index afc06b0d9981b..23271352f6b7c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -32,15 +32,6 @@ class ElementwiseMulOp : public ElementwiseOp { const framework::ExecutionContext& ctx) const override { auto input_data_type = OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index a8e1da9f7945d..70bdd11977b21 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -156,15 +156,6 @@ class ElementwiseOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return 
framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -317,15 +308,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -371,15 +353,6 @@ class ElementwiseOpDoubleGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DOut"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -432,15 +405,6 @@ class ElementwiseOpDoubleGradWithoutDXDY input_data_type = OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "DDX", "DDY"); } - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -493,15 +457,6 @@ class ElementwiseOpTripleGrad : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { framework::proto::VarType::Type input_data_type; input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "D_DDOut"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 772ef09219817..09dc0f68cce2a 100644 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -24,6 +24,13 @@ namespace operators { class ExpandAsV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.device_context()); + } }; class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index fb82f0b6524ba..6bf40fd3bb6b8 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -37,15 +37,6 @@ class ExpandV2Op : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, 
input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -163,15 +154,6 @@ class ExpandV2GradOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 85aadcb07ad32..82c6b89063bea 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -104,15 +104,6 @@ class FillConstantOp : public framework::OperatorWithKernel { } } -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return kt; } }; diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 888e447d798f9..679256b47ca00 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -153,14 +153,6 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FusionGRUOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index b09c6a2959b94..93507160a5072 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -176,14 +176,6 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index e2ee27f2561e1..f418e48f7d9c8 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -60,16 +60,6 @@ class GaussianRandomOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto data_type = static_cast(ctx.Attr("dtype")); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.device_context(), 
- framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(data_type, ctx.device_context()); } diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index 15b0a04ab2f67..eb3c55711641e 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -36,14 +36,6 @@ class GeluOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } }; @@ -76,14 +68,6 @@ class GeluGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index ac50da83e6b78..257e513700b85 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -340,20 +340,6 @@ class InterpolateOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - // TODO(danqing): support other interp_method - // (https://github.com/PaddlePaddle/Paddle/pull/30016/files) - // NOTE(jiahy0825): currently only support interp_method = nearest or - // interp_method = bilinear - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index e9d0d718b9fb7..a3e7f46fecafe 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -444,20 +444,6 @@ class InterpolateV2Op : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - // TODO(danqing): support other interp_method - // (https://github.com/PaddlePaddle/Paddle/pull/30016/files) - // NOTE(jiahy0825): currently only support interp_method = nearest or - // interp_method = bilinear - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc index a4286aea07842..99da0b08af75b 100644 --- 
a/paddle/fluid/operators/log_softmax_op.cc +++ b/paddle/fluid/operators/log_softmax_op.cc @@ -33,15 +33,6 @@ class LogSoftmaxOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index b2ef8f0370e37..a9cfadf3b6455 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -225,16 +225,6 @@ class LRNOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } @@ -359,16 +349,6 @@ class LRNOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index f2900bea21c26..024aa7731c9c6 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -697,15 +697,6 @@ class MatMulOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -889,15 +880,6 @@ class MatMulOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 876a90e7b9674..21537b70a4dc8 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -135,15 +135,6 @@ class MatMulV2Op : public 
framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -210,15 +201,6 @@ class MatMulV2OpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index f998ca8a5ec0f..ee70a441a754f 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -36,15 +36,6 @@ class PReluOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -127,15 +118,6 @@ class PReluGradOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index cab04e43e8681..7416269e33dd2 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -31,15 +31,6 @@ class ScaleOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index b191f7cfa0011..445514ab9b050 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -30,15 +30,6 @@ class ShapeOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "Input"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - 
ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index ba96e92d3030b..7e98514cde370 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -39,15 +39,6 @@ class ShuffleChannelOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 6c63b2719f409..7cb76dc56cb8a 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -53,7 +53,6 @@ class SoftmaxOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "float16 can only be used on GPU/NPU/XPU/MLU and custom place")); } - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::CanCUDNNBeUsed(ctx)) { return framework::OpKernelType(input_data_type, @@ -62,15 +61,6 @@ class SoftmaxOp : public framework::OperatorWithKernel { framework::LibraryType::kCUDNN); } #endif -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_); } }; @@ -158,15 +148,6 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { framework::LibraryType::kCUDNN); } #endif -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_); } }; diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc index e9706f00ce889..d30320f9952ee 100644 --- a/paddle/fluid/operators/stack_op.cc +++ b/paddle/fluid/operators/stack_op.cc @@ -35,15 +35,6 @@ class StackOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index df62da0b56546..48024cbb3ca17 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -98,14 +98,6 @@ class TransposeOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef 
PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif auto &data_format = ctx.Attr<std::string>("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType(data_type, ctx.GetPlace(), layout_); @@ -202,14 +194,6 @@ class TransposeOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif std::string data_format = ctx.Attr<std::string>("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType(data_type, ctx.GetPlace(), layout_); @@ -360,14 +344,6 @@ class Transpose2OpGrad : public framework::OperatorWithKernel { framework::proto::VarType::Type data_type = OperatorWithKernel::IndicateVarDataType(ctx, framework::GradVarName("Out")); -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif std::string data_format = ctx.Attr<std::string>("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType(data_type, ctx.GetPlace(), layout_); diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index 8b0fe26eeaa30..a533b17fc175d 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include <vector> #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -32,6 +33,9 @@ inline void TransCompute(const int dim, phi::DenseTensor* out, const std::vector<int>& axis) { switch (dim) { + case 0: + phi::Copy(dev_ctx, in, dev_ctx.GetPlace(), false, out); + break; case 1: phi::funcs::Transpose<DeviceContext, T, 1> trans1; trans1(dev_ctx, in, out, axis); diff --git a/paddle/fluid/platform/mkldnn_op_list.h b/paddle/fluid/platform/mkldnn_op_list.h new file mode 100644 index 0000000000000..76f0a2affcc74 --- /dev/null +++ b/paddle/fluid/platform/mkldnn_op_list.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef PADDLE_WITH_MKLDNN + +#include <unordered_set> + +namespace paddle { +namespace platform { + +// NOTE(jiahongyu): Below ops have specific PADDLE_WITH_MKLDNN hard codes within +// the function GetExpectedKernelType, so we need to handle them through +// mkldnn_white_list and solve them one-by-one in the future. +// TODO(jiahongyu): Delete mkldnn_white_list and fully support +// PADDLE_WITH_MKLDNN of GetExpectedKernelType. +static const std::unordered_set<std::string> mkldnn_white_list = { + "cast", + "transfer_dtype", + "layer_norm", + "pad2d", + "pad3d", + "pool2d", + "pool2d_grad", + "slice", + "slice_grad", + "split", + "sum", + "sgd", + // NOTE(jiahongyu): squeeze MKLDNN kernels are disabled + // (https://github.com/PaddlePaddle/Paddle/pull/35781). If these MKLDNN + // kernels and codes are deleted in the future, attributes `use_mkldnn` + // should be removed from function declaration + "squeeze", + "squeeze_grad", + "squeeze2", + "squeeze2_grad", + // NOTE(jiahongyu): reshape and flatten have attribute use_mkldnn and they + // are registered in paddle, but they didn't change the ExpectedKernelType + // of tensor. Actually, mkldnn kernel of squeeze, reshape, and flatten + // should never be called. + "reshape", + "reshape_grad", + "reshape2", + "reshape2_grad", + "flatten", + "flatten_grad", + "flatten2", + "flatten2_grad", + // NOTE(jiahongyu): After fixing GetExpectedKernelType in ReduceOp, reduce + // series hard code can be deleted together. + "reduce_max", + "reduce_mean", + "reduce_mean_grad", + "reduce_min", + "reduce_sum", + "reduce_sum_grad", + // NOTE(jiahongyu): Below ops register kernel with customized_type_value, we + // need to analyze and solve them one-by-one. + "conv2d", + "conv2d_grad", + "depthwise_conv2d", + "depthwise_conv2d_grad", + "conv3d", + "conv3d_grad", + "prior_box", + "fc", + "mul", + "mul_grad", + "transpose2"}; + +inline bool in_mkldnn_white_list(const std::string& op_name) { + return mkldnn_white_list.find(op_name) != mkldnn_white_list.end(); +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc index 694f701b5ad9b..ec8561996681d 100644 --- a/paddle/fluid/platform/os_info.cc +++ b/paddle/fluid/platform/os_info.cc @@ -27,6 +27,7 @@ limitations under the License. */ #else #include #endif +#include "glog/logging.h" #include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h" #include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN @@ -115,6 +116,7 @@ bool SetCurrentThreadName(const std::string& name) { return false; } instance.SetCurrentThreadData(name); + VLOG(4) << __func__ << " " << name; return true; } diff --git a/paddle/fluid/platform/os_info.h b/paddle/fluid/platform/os_info.h index ef894fd3dc281..a827d063c152e 100644 --- a/paddle/fluid/platform/os_info.h +++ b/paddle/fluid/platform/os_info.h @@ -57,7 +57,7 @@ ThreadId GetCurrentThreadId(); // create/destory when using it. std::unordered_map<uint64_t, ThreadId> GetAllThreadIds(); -static constexpr const char* kDefaultThreadName = "unset"; +static constexpr const char* kDefaultThreadName = "unnamed"; // Returns kDefaultThreadName if SetCurrentThreadName is never called. 
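
Aside for reviewers (not part of the patch): with the per-op MKLDNN branches above deleted, ops that are *not* on mkldnn_white_list can get their MKLDNN kernel choice from one framework-side dispatch point. A minimal sketch of that pattern follows; the helper name ChooseKernelType and its signature are illustrative assumptions, and only in_mkldnn_white_list, CanMKLDNNBeUsed, and OpKernelType come from code actually touched by this diff.

#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_op_list.h"
#endif

// Illustrative only: one centralized check replacing the deleted per-op
// "#ifdef PADDLE_WITH_MKLDNN" blocks. Whitelisted ops keep their own
// handling inside their GetExpectedKernelType.
framework::OpKernelType ChooseKernelType(
    const framework::OperatorWithKernel& op,
    const framework::ExecutionContext& ctx,
    framework::proto::VarType::Type data_type) {
#ifdef PADDLE_WITH_MKLDNN
  if (!platform::in_mkldnn_white_list(op.Type()) &&
      op.CanMKLDNNBeUsed(ctx, data_type)) {
    return framework::OpKernelType(data_type,
                                   ctx.GetPlace(),
                                   framework::DataLayout::kMKLDNN,
                                   framework::LibraryType::kMKLDNN);
  }
#endif
  return framework::OpKernelType(data_type, ctx.GetPlace());
}
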
std::string GetCurrentThreadName(); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index c5540fa94ebf7..03a7b97af4df7 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -49,6 +49,14 @@ if(WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} graph_gpu_wrapper) endif() endif() +if(WITH_DISTRIBUTE + AND LINUX + AND NOT WITH_ASCEND_CL + AND NOT WITH_XPU + AND NOT WITH_CINN + AND NOT WITH_ROCM) + set(PYBIND_DEPS ${PYBIND_DEPS} paddle_rpc) +endif() if(WITH_GPU OR WITH_ROCM) set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard) @@ -218,6 +226,29 @@ if(WITH_PSCORE) set(PYBIND_SRCS fleet_py.cc ${PYBIND_SRCS}) endif() +if(WITH_DISTRIBUTE + AND LINUX + AND NOT WITH_ASCEND_CL + AND NOT WITH_XPU + AND NOT WITH_CINN + AND NOT WITH_ROCM) + if(WITH_ARM_BRPC) + set(DISTRIBUTE_COMPILE_FLAGS + "-faligned-new -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result" + ) + else() + set(DISTRIBUTE_COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result" + ) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() + endif() + set_source_files_properties(rpc.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) + set(PYBIND_SRCS rpc.cc ${PYBIND_SRCS}) +endif() + if(WITH_NCCL OR WITH_RCCL) list(APPEND PYBIND_SRCS nccl_wrapper_py.cc) endif() diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index f6ace5a9fefdb..dc85c17e6d6c5 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -195,8 +195,8 @@ static PyObject* tensor__add__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__add__", 0); { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -292,8 +292,8 @@ static PyObject* tensor__sub__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__sub__", 0); { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -385,8 +385,8 @@ static PyObject* tensor__rsub__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__rsub__", 0); { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -480,11 +480,12 @@ static PyObject* tensor__mul__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__mul__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = 
full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -581,11 +582,12 @@ static PyObject* tensor__div__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__div__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -696,11 +698,12 @@ static PyObject* tensor__rdiv__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__rdiv__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -809,11 +812,12 @@ static PyObject* tensor__gt__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__gt__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -894,11 +898,12 @@ static PyObject* tensor__ge__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__ge__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -974,17 +979,18 @@ static PyObject* tensor__mod__method(TensorObject* self, other_tensor = full_ad_func(self_tensor.shape(), phi::Scalar(other_float), self_tensor.dtype(), - place); + self_tensor.place()); } else if (!PyCheckTensor(other_obj)) { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__mod__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { 
eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -1056,17 +1062,21 @@ static PyObject* tensor__matmul__method(TensorObject* self, paddle::experimental::Tensor other_tensor; if (has_other_float) { eager_gil_scoped_release guard; - other_tensor = - full_ad_func({1}, phi::Scalar(other_float), self_tensor.dtype(), place); + other_tensor = full_ad_func({1}, + phi::Scalar(other_float), + self_tensor.dtype(), + self_tensor.place()); } else if (!PyCheckTensor(other_obj)) { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__matmul__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, self_tensor.dtype(), place); + other_tensor = + full_ad_func({1}, value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -1159,17 +1169,18 @@ static PyObject* tensor__lt__method(TensorObject* self, other_tensor = full_ad_func(self_tensor.shape(), phi::Scalar(other_float), self_tensor.dtype(), - place); + self_tensor.place()); } else if (!PyCheckTensor(other_obj)) { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__lt__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -1244,17 +1255,18 @@ static PyObject* tensor__le__method(TensorObject* self, other_tensor = full_ad_func(self_tensor.shape(), phi::Scalar(other_float), self_tensor.dtype(), - place); + self_tensor.place()); } else if (!PyCheckTensor(other_obj)) { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__le__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 7486ce9402d75..71f1cf818428e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -182,6 +182,12 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" #endif +#if defined(__linux__) && !defined(PADDLE_WITH_XPU) && \ + !defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_CINN) && \ + !defined(PADDLE_WITH_HIP) +#include "paddle/fluid/pybind/rpc.h" +#endif + #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/fluid/pybind/eager_utils.h" @@ -848,6 +854,8 @@ PYBIND11_MODULE(libpaddle, m) { m.def("_set_paddle_lib_path", &paddle::platform::dynload::SetPaddleLibPath); + m.def("set_current_thread_name", &paddle::platform::SetCurrentThreadName); + m.def("_promote_types_if_complex_exists", &paddle::framework::PromoteTypesIfComplexExists); @@ -2602,6 +2610,21 @@ All parameter, weight, gradient are variables in Paddle. BindGraphGpuWrapper(&m); #endif #endif +#if defined(__linux__) && !defined(PADDLE_WITH_XPU) && \ + !defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_CINN) && \ + !defined(PADDLE_WITH_HIP) + BindWorkerInfo(&m); + BindFuture(&m); + InitAndSetAgentInstance(&m); + InvokeRpc(&m); + StartWorker(&m); + StartClient(&m); + StopWorker(&m); + GetWorkerInfo(&m); + GetWorkerInfoByRank(&m); + GetCurrentWorkerInfo(&m); + GetAllWorkerInfos(&m); +#endif } } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/rpc.cc b/paddle/fluid/pybind/rpc.cc new file mode 100644 index 0000000000000..ee35e9c3a4164 --- /dev/null +++ b/paddle/fluid/pybind/rpc.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
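
Aside (not part of the patch): the rpc.cc bindings that follow drive a C++ RpcAgent whose lifecycle, reconstructed from the binding lambdas below, looks roughly like this sketch. The WorkerInfo constructor order and the endpoint values are assumptions inferred from the bound readonly fields; treat every name here as illustrative except those that appear verbatim in the bindings.

#include <future>
#include <memory>
#include <string>
#include <vector>

#include "paddle/fluid/distributed/rpc/rpc_agent.h"

using paddle::distributed::RpcAgent;
using paddle::distributed::WorkerInfo;

void RunWorker0Sketch() {
  // Assumed ctor order (name, rank, ip, port), mirroring WorkerInfo's
  // readonly fields name_/id_/ip_/port_ bound below.
  std::vector<WorkerInfo> infos = {WorkerInfo("worker0", 0, "127.0.0.1", 8002),
                                   WorkerInfo("worker1", 1, "127.0.0.1", 8003)};
  auto agent = std::make_shared<RpcAgent>("worker0", infos);
  agent->SetAgentInstance(agent);  // what init_and_set_agent_instance does
  agent->StartWorker();            // begin serving incoming requests
  // InvokeRpc(serialized python callable, target worker name, timeout),
  // returning the std::future<std::string> that FutureWrapper wraps:
  std::future<std::string> fut =
      agent->InvokeRpc("<pickled fn>", "worker1", /*timeout_ms=*/5000);
  fut.wait();  // Python-side Future.wait() does this with the GIL released
  agent->Stop();
}
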
+ +#include "paddle/fluid/pybind/rpc.h" +#include "paddle/fluid/distributed/rpc/future_wrapper.h" +#include "paddle/fluid/distributed/rpc/python_rpc_handler.h" +#include "paddle/fluid/distributed/rpc/rpc_agent.h" + +namespace py = pybind11; +using paddle::distributed::FutureWrapper; +using paddle::distributed::PythonRpcHandler; +using paddle::distributed::RpcAgent; +using paddle::distributed::WorkerInfo; +namespace paddle { +namespace pybind { + +void BindWorkerInfo(py::module* m) { + py::class_<WorkerInfo>(*m, "WorkerInfo") + .def(py::init<std::string, uint32_t, std::string, uint32_t>()) + .def_readonly("name", &WorkerInfo::name_) + .def_readonly("rank", &WorkerInfo::id_) + .def_readonly("ip", &WorkerInfo::ip_) + .def_readonly("port", &WorkerInfo::port_) + .def("__str__", &WorkerInfo::to_string) + .def("__repr__", &WorkerInfo::to_string); +} +void BindFuture(py::module* m) { + py::class_<FutureWrapper, std::shared_ptr<FutureWrapper>>(*m, "Future") + .def(py::init<>()) + .def("wait", + &FutureWrapper::wait, + py::call_guard<py::gil_scoped_release>()); +} +void InitAndSetAgentInstance(py::module* m) { + m->def( + "init_and_set_agent_instance", + [](const std::string& name, const std::vector<WorkerInfo>& infos) { + auto instance = std::make_shared<RpcAgent>(name, infos); + instance->SetAgentInstance(instance); + }, + py::call_guard<py::gil_scoped_release>(), + py::arg("name"), + py::arg("infos")); +} +void InvokeRpc(py::module* m) { + m->def( + "invoke_rpc", + [](const std::string& name, const std::string& py_func, int timeout_ms) { + auto instance = RpcAgent::RpcAgentInstance(); + return std::make_shared<FutureWrapper>( + instance->InvokeRpc(py_func, name, timeout_ms)); + }, + py::call_guard<py::gil_scoped_release>(), + py::arg("to"), + py::arg("py_func"), + py::arg("timeout_ms")); +} +void StartWorker(py::module* m) { + m->def( + "rpc_start_worker", + []() { + auto instance = RpcAgent::RpcAgentInstance(); + instance->StartWorker(); + }, + py::call_guard<py::gil_scoped_release>()); +} +void StartClient(py::module* m) { + m->def( + "rpc_start_client", + []() { + auto instance = RpcAgent::RpcAgentInstance(); + instance->StartClient(); + }, + py::call_guard<py::gil_scoped_release>()); +} +void StopWorker(py::module* m) { + m->def( + "rpc_stop_worker", + []() { + auto instance = RpcAgent::RpcAgentInstance(); + instance->Stop(); + }, + py::call_guard<py::gil_scoped_release>()); +} +void GetWorkerInfo(py::module* m) { + m->def( + "rpc_get_worker_info", + [](const std::string& name) { + auto instance = RpcAgent::RpcAgentInstance(); + return instance->GetWorkerInfo(name); + }, + py::call_guard<py::gil_scoped_release>(), + py::arg("name")); +} +void GetWorkerInfoByRank(py::module* m) { + m->def( + "rpc_get_worker_info_by_rank", + [](uint32_t rank) { + auto instance = RpcAgent::RpcAgentInstance(); + return instance->GetWorkerInfoById(rank); + }, + py::call_guard<py::gil_scoped_release>(), + py::arg("rank")); +} +void GetCurrentWorkerInfo(py::module* m) { + m->def( + "rpc_get_current_worker_info", + []() { + auto instance = RpcAgent::RpcAgentInstance(); + return instance->GetCurrentWorkerInfo(); + }, + py::call_guard<py::gil_scoped_release>()); +} +void GetAllWorkerInfos(py::module* m) { + m->def( + "rpc_get_all_worker_infos", + []() { + auto instance = RpcAgent::RpcAgentInstance(); + return instance->GetAllWorkerInfos(); + }, + py::call_guard<py::gil_scoped_release>()); +} +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/rpc.h b/paddle/fluid/pybind/rpc.h new file mode 100644 index 0000000000000..7bd331387439e --- /dev/null +++ b/paddle/fluid/pybind/rpc.h @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindWorkerInfo(py::module* m); +void BindFuture(py::module* m); +void InitAndSetAgentInstance(py::module* m); +void InvokeRpc(py::module* m); +void StartWorker(py::module* m); +void StartClient(py::module* m); +void StopWorker(py::module* m); +void GetWorkerInfo(py::module* m); +void GetWorkerInfoByRank(py::module* m); +void GetCurrentWorkerInfo(py::module* m); +void GetAllWorkerInfos(py::module* m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index 015c1be57370a..176713b71bbcf 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -99,7 +99,6 @@ struct KernelKeyParser : ArgsIterator { inline void AssignKernelKeySet(const phi::TensorBase& tensor) { key_set.backend_set = key_set.backend_set | detail::GetTensorBackendSet(tensor); - // TODO(chenweihang): select multi layout and dtype phi::DataLayout tensor_layout = tensor.layout(); key_set.layout = tensor_layout > key_set.layout ? tensor_layout : key_set.layout; diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index b578afa7c2b85..10b859fdac260 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -40,7 +40,7 @@ const std::unordered_set standard_kernel_suffixs({ * after 2.0, and can no longer be occupied by the previously abandoned ops. * They are marked here uniformly. 
*/ -const std::unordered_set<std::string> deprecated_op_names( +static const std::unordered_set<std::string> deprecated_op_names( {"diag", "flatten", "flatten_grad", diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 71256bdabaa67..480882550dbca 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #include "paddle/phi/core/compat/convert_utils.h" #endif +#include "paddle/phi/core/compat/op_utils.h" DECLARE_bool(enable_api_kernel_fallback); @@ -45,6 +46,17 @@ KernelFactory& KernelFactory::Instance() { return g_op_kernel_factory; } +bool KernelFactory::HasCompatiblePhiKernel(const std::string& op_type) const { + if (deprecated_op_names.find(op_type) == deprecated_op_names.end()) { + if (phi::OpUtilsMap::Instance().Contains(op_type)) { + return true; + } else if (kernels_.find(op_type) != kernels_.end()) { + return true; + } + } + return false; +} + const Kernel& KernelFactory::SelectKernel(const std::string& kernel_name, const KernelKey& kernel_key) const { auto iter = kernels_.find(kernel_name); diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index 8e98c276646d9..ed9280fa475bf 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -272,9 +272,7 @@ class KernelFactory { KernelNameMap& kernels() { return kernels_; } - bool HasCompatiblePhiKernel(const std::string& op_type) const { - return kernels_.find(TransToPhiKernelName(op_type)) != kernels_.end(); - } + bool HasCompatiblePhiKernel(const std::string& op_type) const; KernelResult SelectKernelOrThrowError(const std::string& kernel_name, const KernelKey& kernel_key, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 8d085a05a4c91..6cdf0b9345fe0 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -3713,7 +3713,7 @@ void TileInferMeta(const MetaTensor& x, repeat_times_data.size())); PADDLE_ENFORCE_GE( repeat_times_data.size(), - 1, + 0, errors::InvalidArgument( "The size of the shape of input 'repeat_times' for tile op " "must be positive integers, but the value received is %d.", @@ -3746,7 +3746,7 @@ void TileInferMeta(const MetaTensor& x, } out->set_dims(phi::make_ddim(out_shape)); - if (out_shape[0] == x_dims[0]) { + if (out_rank > 0 && (out_shape[0] == x_dims[0])) { out->share_lod(x); } out->set_dtype(x.dtype()); diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc index 050c61596fee5..a4d43ef8fbe89 100644 --- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc @@ -97,14 +97,14 @@ inline void ModulatedDeformableCol2imCPUKernel( width); *(grad_im + cur_bottom_grad_pos) = - *(grad_im + cur_bottom_grad_pos) + (weight * cur_top_grad); + *(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad; } } } } } -template <typename T, typename MT, typename Context> +template <typename T, typename Context> void ModulatedDeformableCol2im(const Context& dev_ctx, const T* data_col, const T* data_offset, @@ -116,7 +116,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, const std::vector<int>& stride, const std::vector<int>& dilation, const int deformable_group, - MT* grad_im) { + T* grad_im) { int channel_per_deformable_group = im_shape[0] / deformable_group; int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; @@ -222,22 +222,22 @@ void ModulatedDeformableCol2imCoordCPUKernel( if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || 
inv_w >= width) { inv_h = inv_w = -2; } else { - mval += data_col_ptr[col_pos] * funcs::DmcnIm2colBilinear( - data_im_ptr + cnt * height * width, - width, - height, - width, - inv_h, - inv_w); + mval += data_col_ptr[col_pos] * + funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); } const T weight = - DmcnGetCoordinateWeight(inv_h, - inv_w, - height, - width, - data_im_ptr + cnt * height * width, - width, - bp_dir); + DmcnGetCoordinateWeight(inv_h, + inv_w, + height, + width, + data_im_ptr + cnt * height * width, + width, + bp_dir); if (data_mask_ptr) { const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); diff --git a/paddle/phi/kernels/cpu/transpose_kernel.cc b/paddle/phi/kernels/cpu/transpose_kernel.cc index a2f5aa2a29795..583df78cc25f3 100644 --- a/paddle/phi/kernels/cpu/transpose_kernel.cc +++ b/paddle/phi/kernels/cpu/transpose_kernel.cc @@ -35,6 +35,9 @@ void TransposeKernel(const Context& ctx, } int rank = axis.size(); switch (rank) { + case 0: + phi::Copy(ctx, x, ctx.GetPlace(), false, out); + break; case 1: funcs::Transpose trans1; trans1(ctx, x, out, axis); diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc index 253a66adfc6a2..48858fa59390e 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" + #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" namespace phi { namespace funcs { @@ -82,8 +82,8 @@ inline void ModulatedDeformableIm2colCPUKernel( const T h_im = h_in + i * dilation_h + offset_h; const T w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { - val = DmcnIm2colBilinear( - data_im_ptr, width, height, width, h_im, w_im); + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); } *data_col_ptr = val; if (data_mask_ptr) { diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu index 0d5076a4937c3..48105d1f517e9 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu @@ -14,9 +14,6 @@ #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/device_context.h" namespace phi { namespace funcs { @@ -54,8 +51,6 @@ __global__ void ModulatedDeformableIm2colGpuKernel( T* data_col) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; - - using MT = typename phi::dtype::MPTypeTrait::Type; for (size_t i = index; i < nthreads; i += offset) { const int w_col = i % width_col; const int h_col = (i / width_col) % height_col; @@ -90,22 +85,22 @@ __global__ void ModulatedDeformableIm2colGpuKernel( ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; - const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); - const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); - MT val = static_cast(0); - const MT h_im = h_in + i * dilation_h + offset_h; - const MT w_im = w_in + j * dilation_w + offset_w; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T 
offset_w = data_offset_ptr[data_offset_w_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { - val = DmcnIm2colBilinear( - data_im_ptr, width, height, width, h_im, w_im); + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); } + *data_col_ptr = val; if (data_mask_ptr) { const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; - const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); - val *= mask; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + *data_col_ptr *= mask; } - *data_col_ptr = static_cast(val); data_col_ptr += batch_size * height_col * width_col; } } @@ -169,20 +164,6 @@ template void ModulatedDeformableIm2col( const int deformable_groups, float* data_col); -template void ModulatedDeformableIm2col( - const phi::GPUContext& dev_ctx, - const phi::dtype::float16* data_im, - const phi::dtype::float16* data_offset, - const phi::dtype::float16* data_mask, - const std::vector& im_shape, - const std::vector& col_shape, - const std::vector& filter_shape, - const std::vector& paddings, - const std::vector& strides, - const std::vector& dilations, - const int deformable_groups, - phi::dtype::float16* data_col); - template void ModulatedDeformableIm2col( const phi::GPUContext& dev_ctx, const double* data_im, diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.h b/paddle/phi/kernels/funcs/deformable_conv_functor.h index 62e42cd58334f..eecda72927510 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.h +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.h @@ -14,47 +14,44 @@ #pragma once -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { namespace funcs { -template -HOSTDEVICE MT DmcnIm2colBilinear(const T* bottom_data, - const int data_width, - const int height, - const int width, - MT h, - MT w) { +template +HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, + const int data_width, + const int height, + const int width, + T h, + T w) { int h_low = floor(h); int w_low = floor(w); int h_high = h_low + 1; int w_high = w_low + 1; - MT lh = h - h_low; - MT lw = w - w_low; - MT hh = 1 - lh; - MT hw = 1 - lw; + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh; + T hw = 1 - lw; - MT v1 = (h_low >= 0 && w_low >= 0) - ? static_cast(bottom_data[h_low * data_width + w_low]) - : 0; - MT v2 = (h_low >= 0 && w_high <= width - 1) - ? static_cast(bottom_data[h_low * data_width + w_high]) - : 0; - MT v3 = (h_high <= height - 1 && w_low >= 0) - ? static_cast(bottom_data[h_high * data_width + w_low]) - : 0; - MT v4 = (h_high <= height - 1 && w_high <= width - 1) - ? static_cast(bottom_data[h_high * data_width + w_high]) - : 0; + T v1 = + (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0; + T v2 = (h_low >= 0 && w_high <= width - 1) + ? bottom_data[h_low * data_width + w_high] + : 0; + T v3 = (h_high <= height - 1 && w_low >= 0) + ? bottom_data[h_high * data_width + w_low] + : 0; + T v4 = (h_high <= height - 1 && w_high <= width - 1) + ? 
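For reference, the `DmcnIm2colBilinear` helper being reverted from the mixed-precision `MT` type back to plain `T` in this hunk is ordinary bilinear sampling with zero padding outside the feature map. A NumPy sketch of the same computation (the single `corner` bounds check is a compact restatement of the four per-corner conditions in the C++ code):

```python
import numpy as np

def dmcn_im2col_bilinear(bottom_data, data_width, height, width, h, w):
    # Bilinear sample of a flattened (height x width) map at fractional (h, w).
    h_low, w_low = int(np.floor(h)), int(np.floor(w))
    h_high, w_high = h_low + 1, w_low + 1
    lh, lw = h - h_low, w - w_low        # fractional parts
    hh, hw = 1 - lh, 1 - lw

    def corner(r, c):
        inside = 0 <= r < height and 0 <= c < width
        return bottom_data[r * data_width + c] if inside else 0.0

    return (hh * hw * corner(h_low, w_low) + hh * lw * corner(h_low, w_high)
            + lh * hw * corner(h_high, w_low) + lh * lw * corner(h_high, w_high))
```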
bottom_data[h_high * data_width + w_high] + : 0; - MT w1 = hh * hw; - MT w2 = hh * lw; - MT w3 = lh * hw; - MT w4 = lh * lw; + T w1 = hh * hw; + T w2 = hh * lw; + T w3 = lh * hw; + T w4 = lh * lw; return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; } diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index 9c40fdd3f2778..76201a1077edb 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -170,7 +170,7 @@ __global__ void CrossEntropySoftLabel(T* loss, /* Hard label cross entropy. */ -template +template __global__ void CrossEntropyHardLabel(T* loss, const T* softmax, const LabelT* labels, @@ -185,22 +185,17 @@ __global__ void CrossEntropyHardLabel(T* loss, // thread ids compute loss[ids] using softmax[idx] if (ids < n * d) { auto lbl = static_cast(labels[ids]); - assert(lbl >= 0 && lbl < dim || lbl == ignore_idx); - if (lbl < 0 || lbl >= dim) { // label is out of bound + PADDLE_ENFORCE(lbl >= 0 && lbl < dim || lbl == ignore_idx, + "The value of label is expected to be >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + dim, + ignore_idx, + lbl); + if (lbl == ignore_idx) { loss[ids] = static_cast(0.0); } else { int64_t idx = idx_n * dim * d + lbl * d + idx_d; - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (lbl == ignore_idx) { - loss[ids] = static_cast(0.0); - } else { - loss[ids] = -Log(softmax[idx]); - } - } else { - // IgnoreIndex is false - loss[ids] = -Log(softmax[idx]); - } + loss[ids] = -Log(softmax[idx]); } } } @@ -210,7 +205,7 @@ __global__ void CrossEntropyHardLabel(T* loss, Input: log softmax Output: loss and exp(input) */
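With the `IgnoreIndex` template parameter gone, the hard-label kernels in this file handle `ignore_index` on a single code path: an ignored position contributes exactly zero loss, and any other out-of-range label now trips a device-side `PADDLE_ENFORCE` instead of a debug-only `assert`. A sketch of the user-visible contract through the public API (hypothetical tensor values, assuming a build with this patch):

```python
import paddle
import paddle.nn.functional as F

logits = paddle.rand([4, 10])
labels = paddle.to_tensor([1, 9, -100, 3])  # -100 marks a padding position
loss = F.cross_entropy(logits, labels, ignore_index=-100, reduction='none')
print(loss)  # the third entry is exactly 0.0; a label outside [0, 10) would now raise
```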
-template +template __global__ void CrossEntropyExpHardLabel(T* loss, T* softmax, const LabelT* labels, @@ -226,24 +221,17 @@ __global__ void CrossEntropyExpHardLabel(T* loss, if (idx < n * dim * d) { auto lbl = static_cast(labels[ids]); - assert(lbl >= 0 && lbl < dim || lbl == ignore_idx); - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (idx_dim == lbl) { - if (lbl == ignore_idx) { - loss[ids] = static_cast(0.0); - } else { - loss[ids] = -softmax[idx]; - } - } + PADDLE_ENFORCE(lbl >= 0 && lbl < dim || lbl == ignore_idx, + "The value of label is expected to be >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + dim, + ignore_idx, + lbl); + if (lbl == ignore_idx) { + loss[ids] = static_cast(0.0); } else { - // IgnoreIndex is false - if (lbl >= 0 && lbl < dim) { - if (lbl == idx_dim) { - loss[ids] = -softmax[idx]; - } - } else { - loss[ids] = static_cast(0.0); + if (lbl == idx_dim) { + loss[ids] = -softmax[idx]; } } softmax[idx] = Exp(softmax[idx]); @@ -292,7 +280,7 @@ __device__ __forceinline__ AccT ThreadReduce(const T* input, return val; } -template +template __device__ __forceinline__ void ComputeLoss(T* loss, const T loss_value, const int label_id, @@ -302,14 +290,8 @@ __device__ __forceinline__ void ComputeLoss(T* loss, const int offset, const int ignore_index) { int loss_id = vec_size * tid + offset; - if (IgnoreIndex) { - if (label_value == loss_id) { - if (label_value == ignore_index) { - loss[label_id] = static_cast(0.0f); - } else { - loss[label_id] = loss_value; - } - } + if (label_value == ignore_index) { + loss[label_id] = static_cast(0.0f); } else { if (label_value == loss_id) { loss[label_id] = loss_value; @@ -317,11 +299,7 @@ __device__ __forceinline__ void ComputeLoss(T* loss, } } -template +template __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( T* loss, T* softmax, @@ -335,8 +313,13 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( int tid = threadIdx.x; int label_id = blockIdx.x; auto label_value = static_cast(label[label_id]); - assert(label_value >= 0 && label_value < size || label_value == ignore_index); - const bool label_valid = label_value >= 0 && label_value < size; + PADDLE_ENFORCE( + label_value >= 0 && label_value < size || label_value == ignore_index, + "The value of label is expected to be >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + size, + ignore_index, + label_value); int loss_id_offset = 0; if (offset > 0) { @@ -348,16 +331,14 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( AccT log_softmax = func(static_cast(logits[tid])); softmax[tid] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - 1, - loss_id_offset, - ignore_index); - } + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + 1, + loss_id_offset, + ignore_index); } size -= blockDim.x; logits += blockDim.x; @@ -383,16 +364,14 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( outs[i] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - VecSize, - loss_id_offset + i, - ignore_index); - } + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + VecSize, + loss_id_offset + i, + ignore_index); } // write @@ -406,29 +385,18 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( softmax[tid] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - 1, - loss_id_offset, - ignore_index); - } - } - - // invalid label, write once - if (!label_valid && threadIdx.x == 0) { - loss[label_id] = static_cast(0.0f); + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + 1, + loss_id_offset, + ignore_index); } } -template +template __device__ __forceinline__ void ScalarSoftmaxForwardImpl( T* loss, T* softmax, @@ -441,8 +409,13 @@ __device__ __forceinline__ void ScalarSoftmaxForwardImpl( int remain = size % (VecSize * blockDim.x); int
label_id = blockIdx.x; auto label_value = static_cast(label[label_id]); - assert(label_value >= 0 && label_value < size || label_value == ignore_index); - const bool label_valid = label_value >= 0 && label_value < size; + PADDLE_ENFORCE( + label_value >= 0 && label_value < size || label_value == ignore_index, + "The value of label is expected to be >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + size, + ignore_index, + label_value); // main part for (; tid < (size - remain); tid += VecSize * blockDim.x) { @@ -457,16 +430,14 @@ __device__ __forceinline__ void ScalarSoftmaxForwardImpl( AccT log_softmax = func(static_cast(ins[i])); softmax[tid + i * blockDim.x] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - VecSize, - i, - ignore_index); - } + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + VecSize, + i, + ignore_index); } } @@ -475,29 +446,18 @@ __device__ __forceinline__ void ScalarSoftmaxForwardImpl( AccT log_softmax = func(static_cast(logits[tid])); softmax[tid] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - 1, - 0, - ignore_index); - } - } - - // invalid label, write once - if (!label_valid && threadIdx.x == 0) { - loss[label_id] = static_cast(0.0f); + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + 1, + 0, + ignore_index); } } -template +template __global__ void VectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, @@ -537,17 +497,16 @@ __global__ void VectorizedSoftmaxForward(T* loss, // 3. softmax phi::LogSoftmaxForwardFunctor func(max, sum); if (input_offset == output_offset) { - VectorizedSoftmaxForwardImpl( - loss, - softmax, - logits, - label, - mid_dim, - input_offset, - func, - ignore_index); + VectorizedSoftmaxForwardImpl(loss, + softmax, + logits, + label, + mid_dim, + input_offset, + func, + ignore_index); } else { - ScalarSoftmaxForwardImpl( + ScalarSoftmaxForwardImpl( loss, softmax, logits, label, mid_dim, func, ignore_index); } } @@ -560,8 +519,8 @@ The computation includes - Compute: sum of - sum_{j}{ label_{i,j} * (src_{i,j} - maxvalue_{i} - log(sum[i]))} One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). -For reduction max (sum), firstly compute max (sum) to one warp, then use shuffle -api to compute max (sum) in one warp. +For reduction max (sum), first reduce max (sum) into one warp, then use the +shuffle API to compute max (sum) within the warp. */
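The warp reduction described in the comment above can be emulated on the host: each of the 32 lanes repeatedly combines its value with a partner whose lane id differs by a shrinking power-of-two offset, the butterfly pattern behind shuffle-XOR intrinsics. A NumPy sketch (the XOR butterfly is an assumption about the shuffle primitive used; the comment itself only says the shuffle API is involved):

```python
import numpy as np

vals = np.random.rand(32)                   # one value per lane of a warp
offset = 16
while offset > 0:
    partner = np.arange(32) ^ offset        # lane id XOR offset
    vals = np.maximum(vals, vals[partner])  # max reduction; use + for a sum
    offset //= 2
print(vals)  # after log2(32) = 5 steps every lane holds the warp-wide max
```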
template __global__ void WarpSoftmaxForwardSoftLabel(T* loss, @@ -880,8 +839,7 @@ template + SoftmaxMode mode> __global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, @@ -1033,24 +991,21 @@ __global__ void WarpSoftmaxForward(T* loss, // label int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize; auto lbl = static_cast(label[first_batch + i]); - assert(lbl >= 0 && lbl < element_count || lbl == ignore_index); - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (lbl == loss_idx) { - if (lbl != ignore_index) { - loss[first_batch + i] = -logsoftmax; - } else { - loss[first_batch + i] = static_cast(0.0); - } - } + if (lbl == ignore_index) { + loss[first_batch + i] = static_cast(0.0); } else { - // IgnoreIndex is false if (lbl >= 0 && lbl < element_count) { if (lbl == loss_idx) { loss[first_batch + i] = -logsoftmax; } } else { - loss[first_batch + i] = static_cast(0.0); + PADDLE_ENFORCE( + false, + "The value of label is expected to be >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + element_count, + ignore_index, + lbl); } } } else { // softmax @@ -1077,20 +1032,21 @@ __global__ void WarpSoftmaxForward(T* loss, // label int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize + s; auto lbl = static_cast(label[first_batch + i]); - assert(lbl >= 0 && lbl < element_count || lbl == ignore_index); - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (lbl == loss_idx && lbl != ignore_index) { - loss[first_batch + i] = -logsoftmax; - } + if (lbl == ignore_index) { + loss[first_batch + i] = static_cast(0.0); } else { - // IgnoreIndex is false if (lbl >= 0 && lbl < element_count) { if (lbl == loss_idx) { loss[first_batch + i] = -logsoftmax; } } else { - loss[first_batch + i] = static_cast(0.0); + PADDLE_ENFORCE( + false, + "The value of label is expected to be >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + element_count, + ignore_index, + lbl); } } } else { // softmax @@ -1107,23 +1063,23 @@ __global__ void WarpSoftmaxForward(T* loss, } } -#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, LabelT, VecT, AccT) \ - case Log2Elements: \ - WarpSoftmaxForward \ - <<>>(loss, \ - softmax, \ - src, \ - label, \ - batch_size, \ - stride, \ - element_count, \ - ignore_index); \ +#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, LabelT, VecT, AccT) \ + case Log2Elements: \ + WarpSoftmaxForward \ + <<>>(loss, \ + softmax, \ + src, \ + label, \ + batch_size, \ + stride, \ + element_count, \ + ignore_index); \ break; /* Wrapper of softmax with cross entropy forward hard label. 
*/ -template +template void SwitchWarpSoftmaxForward(T* loss, T* softmax, const T* src, @@ -1162,7 +1118,7 @@ void SwitchWarpSoftmaxForward(T* loss, } } -template +template void LaunchVectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, @@ -1186,7 +1142,7 @@ void LaunchVectorizedSoftmaxForward(T* loss, block_size = std::max(block_size, kps::details::kWarpSize); dim3 grids(high_dim); dim3 blocks(block_size); - VectorizedSoftmaxForward + VectorizedSoftmaxForward <<>>( loss, softmax, logits, label, high_dim, mid_dim, ignore_index); } @@ -1197,7 +1153,7 @@ void LaunchVectorizedSoftmaxForward(T* loss, - LaunchVectorizedSoftmaxForward for large size when axis == -1 - cudnn function for axis != -1 */ -template +template static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, int rank, int axis, @@ -1214,24 +1170,24 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, if (D == 1) { if (dim <= max_dim) { // small size const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; - SwitchWarpSoftmaxForward(loss_data, - softmax_data, - logits_data, - labels_data, - N, - dim, - dim, - ignore_index, - stream); + SwitchWarpSoftmaxForward(loss_data, + softmax_data, + logits_data, + labels_data, + N, + dim, + dim, + ignore_index, + stream); } else { // large size - LaunchVectorizedSoftmaxForward(loss_data, - softmax_data, - logits_data, - labels_data, - N, - dim, - ignore_index, - stream); + LaunchVectorizedSoftmaxForward(loss_data, + softmax_data, + logits_data, + labels_data, + N, + dim, + ignore_index, + stream); } } else { ScopedTensorDescriptor desc; @@ -1275,9 +1231,8 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, int threads = 128; int blocks = (N * dim * D + threads - 1) / threads; // compute cross entropy, input is log softmax - CrossEntropyExpHardLabel - <<>>( - loss_data, softmax_data, labels_data, N, dim, D, ignore_index); + CrossEntropyExpHardLabel<<>>( + loss_data, softmax_data, labels_data, N, dim, D, ignore_index); } } @@ -1373,25 +1328,14 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx, auto* labels_data = labels.data(); int threads = 128; int blocks = (n * d / axis_dim + threads - 1) / threads; - if (ignore_index >= 0 && ignore_index < axis_dim) { - CrossEntropyHardLabel - <<>>(loss_data, - logits_data, - labels_data, - n, - axis_dim, - d / axis_dim, - ignore_index); - } else { - CrossEntropyHardLabel - <<>>(loss_data, - logits_data, - labels_data, - n, - axis_dim, - d / axis_dim, - ignore_index); - } + CrossEntropyHardLabel + <<>>(loss_data, + logits_data, + labels_data, + n, + axis_dim, + d / axis_dim, + ignore_index); } // cause of input is softmax @@ -1456,31 +1400,17 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx, } else { auto* logits_data = logits.data(); auto* labels_data = label.data(); - if (ignore_index >= 0 && ignore_index < axis_dim) { - SoftmaxWithCrossEntropyHardLabel(dev_ctx, - rank, - axis_v, - logits_data, - labels_data, - loss_data, - softmax_data, - n, - axis_dim, - d / axis_dim, - ignore_index); - } else { - SoftmaxWithCrossEntropyHardLabel(dev_ctx, - rank, - axis_v, - logits_data, - labels_data, - loss_data, - softmax_data, - n, - axis_dim, - d / axis_dim, - ignore_index); - } + SoftmaxWithCrossEntropyHardLabel(dev_ctx, + rank, + axis_v, + logits_data, + labels_data, + loss_data, + softmax_data, + n, + axis_dim, + d / axis_dim, + ignore_index); } } } diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu 
b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu index b65bdc8e44f74..b46f1f4a3314d 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu @@ -29,7 +29,7 @@ static inline int NumBlocks(const int N) { kNumMaximumNumBlocks); } -template +template __global__ void ModulatedDeformableCol2imGpuKernel( const int nthreads, const T* data_col, @@ -51,7 +51,7 @@ __global__ void ModulatedDeformableCol2imGpuKernel( const int deformable_group, const int height_col, const int width_col, - MT* grad_im) { + T* grad_im) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; for (size_t thread = index; thread < nthreads; thread += offset) { @@ -78,17 +78,17 @@ __global__ void ModulatedDeformableCol2imGpuKernel( ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; - const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); - const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); - const MT cur_inv_h_data = h_in + i * dilation_h + offset_h; - const MT cur_inv_w_data = w_in + j * dilation_w + offset_w; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; - MT cur_top_grad = static_cast(data_col[thread]); + T cur_top_grad = data_col[thread]; if (data_mask) { const T* data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; - const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); + const T mask = data_mask_ptr[data_mask_hw_ptr]; cur_top_grad *= mask; } const int cur_h = static_cast(cur_inv_h_data); @@ -100,12 +100,13 @@ __global__ void ModulatedDeformableCol2imGpuKernel( abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; - MT weight = DmcnGetGradientWeight(cur_inv_h_data, - cur_inv_w_data, - cur_h + dy, - cur_w + dx, - height, - width); + T weight = DmcnGetGradientWeight(cur_inv_h_data, + cur_inv_w_data, + cur_h + dy, + cur_w + dx, + height, + width); + paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); } @@ -114,7 +115,7 @@ __global__ void ModulatedDeformableCol2imGpuKernel( } } -template +template void ModulatedDeformableCol2im(const Context& dev_ctx, const T* data_col, const T* data_offset, @@ -126,13 +127,13 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, const std::vector& stride, const std::vector& dilation, const int deformable_group, - MT* grad_im) { + T* grad_im) { int channel_per_deformable_group = im_shape[0] / deformable_group; int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; int blocks = NumBlocks(num_kernels); int threads = kNumCUDAThreads; - ModulatedDeformableCol2imGpuKernel + ModulatedDeformableCol2imGpuKernel <<>>(num_kernels, data_col, data_offset, @@ -184,9 +185,8 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( T* grad_mask) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; - using MT = typename phi::dtype::MPTypeTrait::Type; for (size_t i = index; i < nthreads; i += offset) { - MT val = 0, mval = 0; + T val = 0, mval = 0; const int w = i % width_col; const int h = (i / width_col) % 
height_col; const int c = (i / width_col / height_col) % offset_channels; @@ -231,42 +231,40 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); - - const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); - const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); - MT inv_h = h_in + i * dilation_h + offset_h; - MT inv_w = w_in + j * dilation_w + offset_w; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + T inv_h = h_in + i * dilation_h + offset_h; + T inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { - mval += - static_cast(data_col_ptr[col_pos]) * - funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, - width, - height, - width, - inv_h, - inv_w); + mval += data_col_ptr[col_pos] * + funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); } - const MT weight = - DmcnGetCoordinateWeight(inv_h, - inv_w, - height, - width, - data_im_ptr + cnt * height * width, - width, - bp_dir); + const T weight = + DmcnGetCoordinateWeight(inv_h, + inv_w, + height, + width, + data_im_ptr + cnt * height * width, + width, + bp_dir); if (data_mask_ptr) { const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); - const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); - val += weight * static_cast(data_col_ptr[col_pos]) * mask; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + val += weight * data_col_ptr[col_pos] * mask; } else { - val += weight * static_cast(data_col_ptr[col_pos]); + val += weight * data_col_ptr[col_pos]; } cnt += 1; } - grad_offset[i] = static_cast(val); + grad_offset[i] = val; if (grad_mask && offset_c % 2 == 0) grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + @@ -361,5 +359,4 @@ PD_REGISTER_KERNEL(deformable_conv_grad, ALL_LAYOUT, phi::DeformableConvGradKernel, float, - double, - paddle::platform::float16) {} + double) {} diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu index 021791ca93061..2476dcbafb984 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu @@ -23,5 +23,4 @@ PD_REGISTER_KERNEL(deformable_conv, ALL_LAYOUT, phi::DeformableConvKernel, float, - double, - phi::dtype::float16) {} + double) {} diff --git a/paddle/phi/kernels/gpu/logsumexp_grad_kernel.cu b/paddle/phi/kernels/gpu/logsumexp_grad_kernel.cu index 490b3e9404561..0599c7e4f06a0 100644 --- a/paddle/phi/kernels/gpu/logsumexp_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/logsumexp_grad_kernel.cu @@ -15,8 +15,16 @@ #include "paddle/phi/kernels/logsumexp_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/logsumexp_grad_kernel_impl.h" -PD_REGISTER_KERNEL( - logsumexp_grad, GPU, ALL_LAYOUT, phi::LogsumexpGradKernel, float, double) {} +using float16 = phi::dtype::float16; + +PD_REGISTER_KERNEL(logsumexp_grad, + GPU, + ALL_LAYOUT, + phi::LogsumexpGradKernel, + float, + double, + float16) {} diff --git a/paddle/phi/kernels/gpu/logsumexp_kernel.cu b/paddle/phi/kernels/gpu/logsumexp_kernel.cu index 18249ff3bafb0..7963808476ded 100644 --- 
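With the registrations above dropping `paddle::platform::float16` from `deformable_conv` and `deformable_conv_grad`, the op is again limited to `float32`/`float64` on GPU, consistent with the kernels reverting from the mixed-precision `MT` type to plain `T` arithmetic. A minimal float32 call through the public API, with hypothetical shapes:

```python
import paddle
from paddle.vision.ops import deform_conv2d

x = paddle.rand([1, 3, 8, 8])                # NCHW input, float32 by default
offset = paddle.zeros([1, 2 * 3 * 3, 8, 8])  # 2 * kh * kw offset channels
weight = paddle.rand([4, 3, 3, 3])           # float16 weights would no longer dispatch
out = deform_conv2d(x, offset, weight, padding=1)
print(out.shape)                             # [1, 4, 8, 8]
```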
a/paddle/phi/kernels/gpu/logsumexp_kernel.cu +++ b/paddle/phi/kernels/gpu/logsumexp_kernel.cu @@ -15,8 +15,11 @@ #include "paddle/phi/kernels/logsumexp_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/logsumexp_kernel_impl.h" +using float16 = phi::dtype::float16; + PD_REGISTER_KERNEL( - logsumexp, GPU, ALL_LAYOUT, phi::LogsumexpKernel, float, double) {} + logsumexp, GPU, ALL_LAYOUT, phi::LogsumexpKernel, float, double, float16) {} diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu index 3f3760a4890a2..9b895adb0a3be 100644 --- a/paddle/phi/kernels/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/gpu/transpose_kernel.cu @@ -35,6 +35,10 @@ void TransposeKernel(const Context& ctx, if (out->numel() == 0) { return; } + if (axis.size() == 0) { + phi::Copy(ctx, x, ctx.GetPlace(), false, out); + return; + } paddle::operators::TransposeGPUKernelDriver(ctx, x, axis, out); } diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 75dfc8514abf8..a079827ca9fd7 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -346,28 +346,41 @@ template
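Conversely, the `logsumexp` and `logsumexp_grad` hunks above add a `float16` GPU registration. A short sketch, assuming a CUDA device (the CPU kernels remain float/double only):

```python
import paddle

paddle.set_device('gpu')
x = paddle.rand([4, 8]).astype('float16')
y = paddle.logsumexp(x, axis=-1)  # dispatches to the newly registered float16 kernel
print(y.dtype, y.shape)           # paddle.float16 [4]
```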