diff --git a/AUTHORS.md b/AUTHORS.md index 6a5156183a517..b0a31b30295c6 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -83,6 +83,7 @@ This is an incomplete list of authors of [Paddle](https://github.com/PaddlePaddl | xushaoyong | Shao-Yong Xu | | Yancey1989 | Xu Yan | | zhaopu7 | Pu Zhao | +| zhiqiu | Qiu-Liang Chen | | zhouxiao-coder | Xiao Zhou | | Zrachel | Rui-Qing Zhang | | jeng1220 | Bai-Cheng(Ryan) Jeng (NVIDIA) | diff --git a/CMakeLists.txt b/CMakeLists.txt index 20f6413dfa7d2..52cef45b3a9d1 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,6 +47,7 @@ find_package(CUDA QUIET) find_package(MKL CONFIG QUIET) option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) +option(WITH_MPI "Compile PaddlePaddle with MPI" OFF) option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF) @@ -485,9 +486,6 @@ if(WITH_DISTRIBUTE) ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE) endif() - set(WITH_MPI - ON - CACHE STRING "Enable MPI when compiling WITH_DISTRIBUTE=ON." FORCE) if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC) # disable WITH_PSCORE for NPU before include third_party message( diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 0201d1131eb4a..ef76aa39604c6 100755 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -42,7 +42,9 @@ set(DISTRIBUTE_COMPILE_FLAGS if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() - +if(LINUX) + add_subdirectory(rpc) +endif() add_subdirectory(common) add_subdirectory(ps) add_subdirectory(test) diff --git a/paddle/fluid/distributed/rpc/CMakeLists.txt b/paddle/fluid/distributed/rpc/CMakeLists.txt new file mode 100644 index 0000000000000..655a28d4f7616 --- /dev/null +++ b/paddle/fluid/distributed/rpc/CMakeLists.txt @@ -0,0 +1,13 @@ +set(PADDLE_RPC_SRCS python_rpc_handler.cc rpc_agent.cc) + +set_source_files_properties( + python_rpc_handler.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(rpc_agent.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) + +set(PADDLE_RPC_DEPS brpc protobuf glog pybind) +proto_library(paddle_rpc_proto SRCS rpc.proto) +cc_library( + paddle_rpc + SRCS ${PADDLE_RPC_SRCS} + DEPS ${PADDLE_RPC_DEPS} paddle_rpc_proto) diff --git a/paddle/fluid/distributed/rpc/future_wrapper.h b/paddle/fluid/distributed/rpc/future_wrapper.h new file mode 100644 index 0000000000000..6592442f46e75 --- /dev/null +++ b/paddle/fluid/distributed/rpc/future_wrapper.h @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include <future>
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/distributed/rpc/python_rpc_handler.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace py = pybind11;
+namespace paddle {
+namespace distributed {
+class FutureWrapper {
+ public:
+  FutureWrapper() {}
+  explicit FutureWrapper(std::future<std::string> fut) : fut_(std::move(fut)) {}
+  py::object wait() {
+    // The GIL must be released here; otherwise the blocking fut_.get() would
+    // prevent the service from processing RPC requests, leading to deadlock.
+    PADDLE_ENFORCE_EQ(
+        PyGILState_Check(),
+        false,
+        platform::errors::Fatal(
+            "GIL must be released before fut.wait(), otherwise fut_.get() "
+            "blocking will cause the service to fail to "
+            "process RPC requests, leading to deadlock"));
+    auto s = fut_.get();
+    py::gil_scoped_acquire ag;
+    std::shared_ptr<PythonRpcHandler> python_handler =
+        PythonRpcHandler::GetInstance();
+    py::object obj = python_handler->Deserialize(py::bytes(s));
+    return obj;
+  }
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(FutureWrapper);
+  std::future<std::string> fut_;
+};
+}  // namespace distributed
+}  // namespace paddle
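Reviewer note on the GIL discipline `FutureWrapper::wait()` enforces: the thread that fulfills `fut_` is a service thread that itself needs the GIL (to run the Python callable and serialize the result), so a caller that blocks on the future while holding the GIL deadlocks the service. A minimal caller-side sketch of the complementary pattern, using only standard pybind11 facilities (the helper name is hypothetical):

```cpp
#include <pybind11/pybind11.h>

#include <future>
#include <string>

namespace py = pybind11;

// Hypothetical helper: never hold the GIL across fut.get(); the thread that
// must set the future may be waiting on the GIL we would be holding.
std::string BlockingWait(std::future<std::string> fut) {
  py::gil_scoped_release release;  // drop the GIL before blocking
  return fut.get();                // GIL is re-acquired when `release` dies
}
```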
diff --git a/paddle/fluid/distributed/rpc/python_rpc_handler.cc b/paddle/fluid/distributed/rpc/python_rpc_handler.cc
new file mode 100644
index 0000000000000..13322114def64
--- /dev/null
+++ b/paddle/fluid/distributed/rpc/python_rpc_handler.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/rpc/python_rpc_handler.h"
+
+namespace paddle {
+namespace distributed {
+constexpr auto kInternalModule = "paddle.distributed.rpc.internal";
+
+py::object getFunction(const py::object& module, const char* name) {
+  py::object fn = module.attr(name);
+  return fn;
+}
+
+PythonRpcHandler::PythonRpcHandler() {
+  py::gil_scoped_acquire ag;
+  // import python module
+  py::object rpc_internal = py::module::import(kInternalModule);
+  py_run_function_ = getFunction(rpc_internal, "_run_py_func");
+  py_serialize_ = getFunction(rpc_internal, "_serialize");
+  py_deserialize_ = getFunction(rpc_internal, "_deserialize");
+}
+
+py::object PythonRpcHandler::RunPythonFunc(const py::object& python_func) {
+  py::gil_scoped_acquire ag;
+  return py_run_function_(python_func);
+}
+
+std::string PythonRpcHandler::Serialize(const py::object& obj) {
+  py::gil_scoped_acquire ag;
+  py::object res = py_serialize_(obj);
+  return res.cast<std::string>();
+}
+
+py::object PythonRpcHandler::Deserialize(const std::string& obj) {
+  py::gil_scoped_acquire ag;
+  return py_deserialize_(py::bytes(obj));
+}
+
+std::shared_ptr<PythonRpcHandler> PythonRpcHandler::python_rpc_handler_ =
+    nullptr;
+std::mutex PythonRpcHandler::lock_;
+
+std::shared_ptr<PythonRpcHandler> PythonRpcHandler::GetInstance() {
+  if (python_rpc_handler_ == nullptr) {
+    std::lock_guard<std::mutex> guard(lock_);
+    if (python_rpc_handler_ == nullptr) {
+      python_rpc_handler_ = std::make_shared<PythonRpcHandler>();
+      return python_rpc_handler_;
+    }
+  }
+  return python_rpc_handler_;
+}
+
+}  // namespace distributed
+}  // namespace paddle
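`GetInstance()` above is a hand-rolled double-checked-locking singleton. For readers who want the same lazy, thread-safe initialization without reasoning about the unsynchronized first read, a sketch of an equivalent formulation with `std::call_once` (illustrative only, not the code above):

```cpp
#include <memory>
#include <mutex>

struct Handler {};  // stand-in for PythonRpcHandler

std::shared_ptr<Handler> GetInstance() {
  static std::once_flag flag;
  static std::shared_ptr<Handler> instance;
  // call_once guarantees exactly one initialization, even under contention.
  std::call_once(flag, [] { instance = std::make_shared<Handler>(); });
  return instance;
}
```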
diff --git a/paddle/fluid/distributed/rpc/python_rpc_handler.h b/paddle/fluid/distributed/rpc/python_rpc_handler.h
new file mode 100644
index 0000000000000..2c5221f53d57b
--- /dev/null
+++ b/paddle/fluid/distributed/rpc/python_rpc_handler.h
@@ -0,0 +1,62 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include <memory>
+#include <mutex>
+#include <string>
+
+#include "paddle/fluid/platform/macros.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace distributed {
+
+class PYBIND11_EXPORT PythonRpcHandler {
+ public:
+  PythonRpcHandler();
+  ~PythonRpcHandler() = default;
+  static std::shared_ptr<PythonRpcHandler> GetInstance();
+  // Run a pickled Python function and return the resulting py::object.
+  py::object RunPythonFunc(const py::object& python_func);
+
+  // Serialize a py::object into a string.
+  std::string Serialize(const py::object& obj);
+
+  // Deserialize a string into a py::object.
+  py::object Deserialize(const std::string& obj);
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(PythonRpcHandler);
+
+  static std::shared_ptr<PythonRpcHandler> python_rpc_handler_;
+  // Ref to `paddle.distributed.rpc.internal._run_py_func`.
+  py::object py_run_function_;
+
+  // Ref to `paddle.distributed.rpc.internal._serialize`.
+  py::object py_serialize_;
+
+  // Ref to `paddle.distributed.rpc.internal._deserialize`.
+  py::object py_deserialize_;
+
+  // Lock to protect initialization.
+  static std::mutex lock_;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/rpc/rpc.proto b/paddle/fluid/distributed/rpc/rpc.proto
new file mode 100644
index 0000000000000..2da9e37ae88d9
--- /dev/null
+++ b/paddle/fluid/distributed/rpc/rpc.proto
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+syntax="proto2";
+package paddle.distributed;
+
+option cc_generic_services = true;
+option cc_enable_arenas = true;
+
+message RpcRequest {
+  required bytes message = 1;
+};
+
+message RpcResponse {
+  required bytes message = 1;
+};
+
+service RpcBaseService {
+  rpc Send(RpcRequest) returns (RpcResponse);
+  rpc InvokeRpc(RpcRequest) returns (RpcResponse);
+};
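Since the proto sets `cc_generic_services = true`, protoc generates an abstract `RpcBaseService` (implemented by `RpcService` below) plus an `RpcBaseService_Stub` for callers. A sketch of a single blocking call through that stub, following the standard brpc idiom (endpoint and payload are placeholders; the real client path is the asynchronous `RpcAgent::InvokeRpc` below):

```cpp
#include <brpc/channel.h>

#include <string>

#include "paddle/fluid/distributed/rpc/rpc.pb.h"

// Passing nullptr as `done` turns the generated stub call into a synchronous
// one: it returns only after the response (or an error) arrives.
int CallOnce(const std::string& pickled_func) {
  brpc::Channel channel;
  brpc::ChannelOptions options;
  options.protocol = "baidu_std";
  if (channel.Init("127.0.0.1:8002", &options) != 0) return -1;  // placeholder

  paddle::distributed::RpcBaseService_Stub stub(&channel);
  paddle::distributed::RpcRequest request;
  paddle::distributed::RpcResponse response;
  brpc::Controller cntl;
  request.set_message(pickled_func);
  stub.InvokeRpc(&cntl, &request, &response, /*done=*/nullptr);
  return cntl.Failed() ? -1 : 0;
}
```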
diff --git a/paddle/fluid/distributed/rpc/rpc_agent.cc b/paddle/fluid/distributed/rpc/rpc_agent.cc
new file mode 100644
index 0000000000000..18fa2aba841e5
--- /dev/null
+++ b/paddle/fluid/distributed/rpc/rpc_agent.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/rpc/rpc_agent.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace distributed {
+
+const int kTimeoutMs = 500000;
+const int kConnectTimeoutMs = 10000;
+const int kMaxRetry = 5;
+const int kCloseWaitMs = 1000;
+std::shared_ptr<RpcAgent> RpcAgent::rpc_agent_instance_ = nullptr;
+
+RpcAgent::RpcAgent(std::string name, std::vector<WorkerInfo> infos) {
+  name_ = std::move(name);
+  for (auto info : infos) {
+    name_to_infos_.insert({info.name_, info});
+    id_to_infos_.insert({info.id_, info});
+  }
+  this->infos_ = std::move(infos);
+  auto it = name_to_infos_.find(name_);
+  this->rank_ = it->second.id_;
+  rpc_service_ = std::make_shared<RpcService>();
+  PADDLE_ENFORCE_EQ(
+      server_.AddService(rpc_service_.get(), brpc::SERVER_DOESNT_OWN_SERVICE),
+      0,
+      platform::errors::Fatal("Fail to add service: %s", name));
+}
+
+int RpcAgent::StartWorker() {
+  auto info = GetWorkerInfo(name_);
+  // Start the server.
+  int port = info.port_;
+  brpc::ServerOptions options;
+  PADDLE_ENFORCE_EQ(server_.Start(port, &options),
+                    0,
+                    platform::errors::Fatal("Fail to start worker: %s", name_));
+  VLOG(0) << "Start worker : " << name_;
+  return 0;
+}
+
+int RpcAgent::StartClient() {
+  // Initialize the channel, NULL means using default options.
+  brpc::ChannelOptions channel_options;
+  channel_options.protocol = "baidu_std";
+  channel_options.timeout_ms = kTimeoutMs;
+  channel_options.connection_type = "pooled";
+  channel_options.connect_timeout_ms = kConnectTimeoutMs;
+  channel_options.max_retry = kMaxRetry;
+  channels_.resize(name_to_infos_.size());
+  // build a connection from this client to every server
+  for (std::size_t i = 0; i < channels_.size(); i++) {
+    auto info = id_to_infos_.find(i)->second;
+    channels_[i].reset(new brpc::Channel());
+    PADDLE_ENFORCE_EQ(
+        channels_[i]->Init(info.ip_.c_str(), info.port_, &channel_options),
+        0,
+        platform::errors::Fatal(
+            "Fail to initialize channel: %d, ip: %s, port: %d",
+            i,
+            info.ip_,
+            info.port_));
+  }
+  VLOG(0) << "Init Channels: " << name_;
+  return 0;
+}
+
+int RpcAgent::Stop() {
+  VLOG(0) << "Worker: " << name_ << " is going to stop.";
+  server_.Stop(kCloseWaitMs);
+  server_.Join();
+  rpc_agent_instance_ = nullptr;
+  VLOG(0) << "Worker: " << name_ << " has stopped";
+  return 0;
+}
+void OnRpcDone::Run() {
+  // delete this after Run
+  std::unique_ptr<OnRpcDone> self_guard(this);
+  PADDLE_ENFORCE_EQ(
+      cntl_.Failed(), false, platform::errors::Fatal(cntl_.ErrorText()));
+  promise_->set_value(response_.message());
+  VLOG(2) << "Received response from " << cntl_.remote_side() << " to "
+          << cntl_.local_side() << " (attached=" << cntl_.response_attachment()
+          << ")"
+          << " latency=" << cntl_.latency_us() << "us";
+}
+
+std::future<std::string> RpcAgent::InvokeRpc(const std::string &py_func,
+                                             const std::string &to,
+                                             int timeout_ms = kTimeoutMs) {
+  auto it = name_to_infos_.find(to);
+  PADDLE_ENFORCE_NE(
+      it,
+      name_to_infos_.end(),
+      platform::errors::OutOfRange("Worker %s doesn't exist!", to));
+  uint32_t id = it->second.id_;
+  auto channel = channels_[id];
+  // `done` must be allocated on the heap because its lifetime extends beyond
+  // this call; it deletes itself inside done->Run().
+  OnRpcDone *done = new OnRpcDone;
+  done->cntl_.set_timeout_ms(timeout_ms);
+  done->request_.set_message(py_func);
+  std::future<std::string> fut = done->GetFuture();
+  RpcBaseService_Stub stub(channel.get());
+  stub.InvokeRpc(&done->cntl_, &done->request_, &done->response_, done);
+  return fut;
+}
+
+std::shared_ptr<RpcAgent> RpcAgent::RpcAgentInstance() {
+  PADDLE_ENFORCE_NE(rpc_agent_instance_,
+                    nullptr,
+                    platform::errors::Fatal(
+                        "RpcAgent is not set, please call "
+                        "paddle.distributed.rpc.init_rpc() to initialize the "
+                        "rpc agent."));
+  return rpc_agent_instance_;
+}
+void RpcAgent::SetAgentInstance(std::shared_ptr<RpcAgent> agent) {
+  PADDLE_ENFORCE_EQ(
+      rpc_agent_instance_,
+      nullptr,
+      platform::errors::Fatal(
+          "RpcAgent has been set, please don't set the rpc agent "
+          "repeatedly."));
+  rpc_agent_instance_ = agent;
+}
+}  // namespace distributed
+}  // namespace paddle
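End-to-end, the agent pairs one brpc server with a channel to every peer. A hedged usage sketch of the lifecycle (worker names, ports, and the pickled payload are hypothetical; in Paddle the payload comes from `PythonRpcHandler::Serialize`, and the GIL must be released before `fut.get()`):

```cpp
#include <future>
#include <memory>
#include <string>
#include <vector>

#include "paddle/fluid/distributed/rpc/rpc_agent.h"

using paddle::distributed::RpcAgent;
using paddle::distributed::WorkerInfo;

int RunWorker0(const std::string& pickled_payload) {
  std::vector<WorkerInfo> infos = {WorkerInfo("worker0", 0, "127.0.0.1", 8002),
                                   WorkerInfo("worker1", 1, "127.0.0.1", 8003)};
  auto agent = std::make_shared<RpcAgent>("worker0", infos);
  RpcAgent::SetAgentInstance(agent);
  agent->StartWorker();  // serve incoming InvokeRpc requests on port 8002
  agent->StartClient();  // open one channel per worker, self included
  std::future<std::string> fut =
      agent->InvokeRpc(pickled_payload, "worker1", /*timeout_ms=*/5000);
  std::string reply = fut.get();  // pickled result produced on worker1
  return reply.empty() ? -1 : agent->Stop();
}
```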
diff --git a/paddle/fluid/distributed/rpc/rpc_agent.h b/paddle/fluid/distributed/rpc/rpc_agent.h
new file mode 100644
index 0000000000000..e6c5a7d099c1b
--- /dev/null
+++ b/paddle/fluid/distributed/rpc/rpc_agent.h
@@ -0,0 +1,111 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <future>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "brpc/channel.h"
+#include "brpc/server.h"
+#include "paddle/fluid/distributed/rpc/python_rpc_handler.h"
+#include "paddle/fluid/distributed/rpc/rpc.pb.h"
+#include "paddle/fluid/distributed/rpc/rpc_service.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace distributed {
+struct WorkerInfo {
+  std::string name_;
+  uint32_t id_;
+  std::string ip_;
+  uint32_t port_;
+  WorkerInfo(std::string name, uint32_t id, std::string ip, uint32_t port)
+      : name_(std::move(name)), id_(id), ip_(std::move(ip)), port_(port) {}
+
+  std::string to_string() const {
+    std::string info = "{name: " + name_ + ", rank: " + std::to_string(id_) +
+                       ", ip: " + ip_ + ", port: " + std::to_string(port_) +
+                       "}";
+    return info;
+  }
+};
+
+class OnRpcDone : public google::protobuf::Closure {
+ public:
+  OnRpcDone() { promise_ = std::make_shared<std::promise<std::string>>(); }
+  // process callback of response
+  void Run();
+  std::future<std::string> GetFuture() {
+    return std::future<std::string>(promise_->get_future());
+  }
+  RpcResponse response_;
+  RpcRequest request_;
+  brpc::Controller cntl_;
+  std::shared_ptr<std::promise<std::string>> promise_;
+};
+
+class RpcAgent {
+ public:
+  static std::shared_ptr<RpcAgent> RpcAgentInstance();
+  static void SetAgentInstance(std::shared_ptr<RpcAgent> agent);
+  // init RpcAgent instance and get information of all services
+  RpcAgent(std::string name, std::vector<WorkerInfo> infos);
+  ~RpcAgent() {}
+
+  const WorkerInfo &GetWorkerInfo(const std::string &name) const {
+    auto it = name_to_infos_.find(name);
+    return it->second;
+  }
+  const WorkerInfo &GetWorkerInfoById(uint32_t id) const {
+    auto it = id_to_infos_.find(id);
+    return it->second;
+  }
+  const WorkerInfo &GetCurrentWorkerInfo() const {
+    return GetWorkerInfo(name_);
+  }
+  const std::vector<WorkerInfo> &GetAllWorkerInfos() const {
+    return this->infos_;
+  }
+
+  uint32_t Rank() { return this->rank_; }
+
+  uint32_t WorldSize() { return infos_.size(); }
+
+  int StartWorker();
+  // build connection from client to all servers
+  int StartClient();
+  int Stop();
+
+  std::future<std::string> InvokeRpc(const std::string &msg,
+                                     const std::string &to,
+                                     int timeout_ms);
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(RpcAgent);
+  static std::shared_ptr<RpcAgent> rpc_agent_instance_;
+  brpc::Server server_;
+  std::shared_ptr<RpcService> rpc_service_;
+  std::vector<std::shared_ptr<brpc::Channel>> channels_;
+  std::string name_;
+  uint32_t rank_;
+  std::unordered_map<std::string, WorkerInfo> name_to_infos_;
+  std::unordered_map<uint32_t, WorkerInfo> id_to_infos_;
+  std::vector<WorkerInfo> infos_;
+};
+}  // namespace distributed
+}  // namespace paddle
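`OnRpcDone` combines two idioms worth calling out: a protobuf closure that deletes itself after its single `Run()` invocation (brpc calls `Run()` exactly once per RPC), and a `promise`/`future` pair that hands the response to whichever thread blocks on `InvokeRpc`'s return value. The pattern reduced to its core (a sketch, not the class above):

```cpp
#include <google/protobuf/stubs/callback.h>

#include <future>
#include <memory>
#include <string>

class SelfDeletingDone : public google::protobuf::Closure {
 public:
  std::future<std::string> GetFuture() { return promise_.get_future(); }
  void Run() override {
    std::unique_ptr<SelfDeletingDone> self(this);  // reclaim on the way out
    promise_.set_value("response bytes");          // wake the waiting future
  }  // `self` destroys this object here

 private:
  std::promise<std::string> promise_;
};
```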
diff --git a/paddle/fluid/distributed/rpc/rpc_service.h b/paddle/fluid/distributed/rpc/rpc_service.h
new file mode 100644
index 0000000000000..74d4ab0fe0d58
--- /dev/null
+++ b/paddle/fluid/distributed/rpc/rpc_service.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <brpc/server.h>
+
+#include <string>
+
+#include "paddle/fluid/distributed/rpc/python_rpc_handler.h"
+#include "paddle/fluid/distributed/rpc/rpc.pb.h"
+
+namespace paddle {
+namespace distributed {
+class RpcService : public RpcBaseService {
+ public:
+  RpcService() {}
+  virtual ~RpcService() {}
+
+  virtual void InvokeRpc(google::protobuf::RpcController *cntl_base,
+                         const RpcRequest *request,
+                         RpcResponse *response,
+                         google::protobuf::Closure *done) {
+    // This object helps you to call done->Run() in RAII style. If you need
+    // to process the request asynchronously, pass done_guard.release().
+    brpc::ClosureGuard done_guard(done);
+
+    brpc::Controller *cntl = static_cast<brpc::Controller *>(cntl_base);
+    VLOG(2) << "InvokeRpc API: Received request[log_id=" << cntl->log_id()
+            << "] from " << cntl->remote_side() << " to " << cntl->local_side()
+            << ": "
+            << " (attached=" << cntl->request_attachment() << ")";
+    std::string py_func_str = request->message();
+    std::shared_ptr<PythonRpcHandler> python_handler =
+        PythonRpcHandler::GetInstance();
+    // acquire gil, because native Python objects are used
+    py::gil_scoped_acquire ag;
+    py::object py_func_obj = python_handler->Deserialize(py_func_str);
+    py::object res = python_handler->RunPythonFunc(py_func_obj);
+    std::string res_str = python_handler->Serialize(res);
+    response->set_message(res_str);
+  }
+};
+}  // namespace distributed
+}  // namespace paddle
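For orientation, hosting `RpcService` is plain brpc boilerplate; this sketch is essentially what the `RpcAgent` constructor plus `StartWorker()` do, minus the worker bookkeeping (the port is a placeholder):

```cpp
#include <brpc/server.h>

#include "paddle/fluid/distributed/rpc/rpc_service.h"

int ServeForever() {
  paddle::distributed::RpcService service;
  brpc::Server server;
  // SERVER_DOESNT_OWN_SERVICE: `service` lives on this stack frame, so the
  // server must not delete it on shutdown.
  if (server.AddService(&service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) {
    return -1;
  }
  brpc::ServerOptions options;
  if (server.Start(8002, &options) != 0) return -1;  // placeholder port
  server.RunUntilAskedToQuit();  // block until asked to quit
  return 0;
}
```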
diff --git a/paddle/fluid/framework/details/nan_inf_utils.h b/paddle/fluid/framework/details/nan_inf_utils.h
index f955d3df44ddd..ef2a7d8f0f1e0 100644
--- a/paddle/fluid/framework/details/nan_inf_utils.h
+++ b/paddle/fluid/framework/details/nan_inf_utils.h
@@ -27,7 +27,7 @@ namespace framework {
 namespace details {
 // assert false when meets NAN or inf
 void CheckVarHasNanOrInf(const std::string& op_type,
-                         const framework::ScopeBase& scope,
+                         const framework::Scope& scope,
                          const std::string& var_name,
                          const platform::Place& place);
@@ -37,7 +37,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
                          const platform::Place& place);
 
 void CheckOpHasNanOrInf(const framework::OperatorBase& op,
-                        const framework::ScopeBase& scope,
+                        const framework::Scope& scope,
                         const platform::Place& place);
 
 template <typename Tensor>
@@ -56,7 +56,7 @@ void CheckOpHasNanOrInfInDygraph(const std::string& op_type,
 
 #ifdef PADDLE_WITH_ASCEND_CL
 void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
-                                 const framework::ScopeBase& scope,
+                                 const framework::Scope& scope,
                                  const platform::Place& place);
 #endif
 
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index deb138f7847d7..bca61ddae69e7 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -450,7 +450,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
 }
 
 void CheckVarHasNanOrInf(const std::string& op_type,
-                         const framework::ScopeBase& scope,
+                         const framework::Scope& scope,
                          const std::string& var_name,
                          const platform::Place& place) {
   auto* var = scope.FindVar(var_name);
@@ -486,7 +486,7 @@ static phi::DenseTensor& npu_float_status() {
 }
 
 void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
-                                 const framework::ScopeBase& scope,
+                                 const framework::Scope& scope,
                                  const platform::Place& place) {
   if (!platform::is_npu_place(place)) return;
@@ -555,7 +555,7 @@ void PrintNpuVarInfo(const std::string& op_type,
 }
 
 void PrintNPUOpValueInfo(const framework::OperatorBase& op,
-                         const framework::ScopeBase& scope,
+                         const framework::Scope& scope,
                          const platform::Place& place) {
   LOG(WARNING) << "There are `nan` or `inf` in operator (" << op.Type()
                << "), here we print some tensor value info of this op.";
@@ -573,7 +573,7 @@ void PrintNPUOpValueInfo(const framework::OperatorBase& op,
 }
 
 static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
-                                  const framework::ScopeBase& scope,
+                                  const framework::Scope& scope,
                                   const platform::Place& place) {
   if (!platform::is_npu_place(place)) return;
@@ -609,7 +609,7 @@ static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
 #endif
 
 void CheckOpHasNanOrInf(const framework::OperatorBase& op,
-                        const framework::ScopeBase& exec_scope,
+                        const framework::Scope& exec_scope,
                         const platform::Place& place) {
   std::call_once(white_list_init_flag, InitWhiteListFormEnv);
 
diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index 3fd7a994a62fb..25f6ff8355d73 100755
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -123,6 +123,7 @@ message BuildStrategy {
   optional bool allow_cuda_graph_capture = 14 [ default = false ];
   optional int32 reduce_strategy = 15 [ default = 0 ];
   optional bool fuse_gemm_epilogue = 16 [ default = false ];
+  optional string debug_graphviz_path = 17;
 }
 
 message ExecutionStrategy {
diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc
index de9f6a4745fd0..59355c942047e 100644
--- a/paddle/fluid/framework/infershape_utils.cc
+++ b/paddle/fluid/framework/infershape_utils.cc
@@ -490,9 +490,18 @@ std::vector<phi::MetaTensor*> CompatInferMetaContext::MutableOutputBetween(
     size_t start, size_t end) {
   std::vector<phi::MetaTensor*> result;
   result.reserve(end - start);
+  bool has_meta_tensor = false;
+
   for (size_t i = start; i < end; ++i) {
     auto& out = compat_outputs_.at(i);
     result.emplace_back(out.initialized() ? &out : nullptr);
+    if (!has_meta_tensor && out.initialized()) {
+      has_meta_tensor = true;
+    }
+  }
+
+  if (!has_meta_tensor) {
+    result.clear();
   }
   return result;
 }
diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc
old mode 100644
new mode 100755
index df19bc9ade8d5..c416ebf200df6
--- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc
@@ -19,7 +19,6 @@
 #include <cfloat>
 
 #include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
@@ -68,7 +67,7 @@ std::vector<float> ComputePropagateScalesMkldnnPass::GetScales(
   for (int i = 0; i < columns; i++) {
     float max_value = FLT_MIN;
     for (int j = 0; j < rows; j++) {
-      max_value = std::max(max_value, std::abs(data[i + j * columns]));
+      max_value = std::max(max_value, std::abs(data[j + i * rows]));
     }
     max_value = 1.0 / max_value;
     if (std::isinf(max_value) || std::isnan(max_value)) {
@@ -394,8 +393,13 @@ std::unordered_set<std::string> ComputePropagateScalesMkldnnPass::UpdateScales(
     auto out_iter = var_quant_scales->find(op_node->Op()->Output("Out")[0]);
     if (out_iter != var_quant_scales->end()) {
       std::vector<std::string> input_names = op_node->Op()->Input("X");
-      for (auto input_name : input_names)
-        (*var_quant_scales)[input_name] = out_iter->second;
+      for (auto input_name : input_names) {
+        auto concat_in_iter = var_quant_scales->find(input_name);
+        if (concat_in_iter == var_quant_scales->end())
+          (*var_quant_scales)[input_name] = out_iter->second;
+        else
+          (*var_quant_scales)[input_name].second = out_iter->second.second;
+      }
     }
   } else if (op_name == "scale") {
     const std::string output_name = op_node->Op()->Output("Out")[0];
@@ -409,6 +413,40 @@ std::unordered_set<std::string> ComputePropagateScalesMkldnnPass::UpdateScales(
   }
   return waiting_for_scale;
 }
+void ComputePropagateScalesMkldnnPass::UpdateReluOutputScales(
+    ir::Graph* graph, StringPairMap* var_quant_scales) const {
+  for (auto* op_node :
+       ir::TopologyVarientSort(*graph, static_cast<ir::SortKind>(0))) {
+    if (!op_node->IsOp()) continue;
+    auto op = op_node->Op();
+    bool is_unsigned = false;
+    std::string output_name = "Out";
+    std::string act_name;
+    if (op->Type() == "relu") {
+      is_unsigned = true;
+    } else {
+      if (op->Type() == "conv2d") {
+        act_name = "fuse_activation";
+        output_name = "Output";
+      } else if (op->Type() == "fc") {
+        act_name = "activation_type";
+      }
+      if (!act_name.empty()) {
+        auto act = op->GetAttrIfExists<std::string>(act_name);
+        if (act == "relu" || act == "relu6") {
+          is_unsigned = true;
+        }
+      }
+    }
+    if (is_unsigned) {
+      std::string output_var_name = op->Output(output_name)[0];
+      auto out_iter = var_quant_scales->find(output_var_name);
+      if (out_iter != var_quant_scales->end()) {
+        (*var_quant_scales)[output_var_name].first = true;
+      }
+    }
+  }
+}
 
 void ComputePropagateScalesMkldnnPass::PropagateScales(
     ir::Graph* graph,
@@ -427,21 +465,6 @@ void ComputePropagateScalesMkldnnPass::PropagateScales(
   }
 }
 
-void ComputePropagateScalesMkldnnPass::ConvertStringPairMap(
-    const StringPairMap& var_quant_scales,
-    std::unordered_map<std::string, std::vector<float>>* info_map) const {
-  for (auto iter = var_quant_scales.begin(); iter != var_quant_scales.end();
-       iter++) {
-    auto* data = iter->second.second.data<float>();
-    std::vector<float> data_v;
-    for (int i = 0; i < iter->second.second.numel(); i++) {
-      data_v.push_back(data[i]);
-    }
-
-    info_map->insert(std::make_pair(iter->first, data_v));
-  }
-}
-
 void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Convert paddle model to mkldnn quantized model.";
   const std::string pattern_name = "compute_propagate_scales_mkldnn_pass";
@@ -461,13 +484,13 @@ void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const {
   auto* scope = param_scope();
   GetQuantInfo(graph, &var_quant_scales);
   ComputeWeightScales(graph, scope, &var_quant_scales);
+  UpdateReluOutputScales(graph, &var_quant_scales);
   PropagateScales(graph, &var_quant_scales, scale_immutable_ops);
 
   // save var_quant_scales in the first op's attr
   // for cpu_quantize_pass
-  std::unordered_map<std::string, std::vector<float>> info_map;
-  ConvertStringPairMap(var_quant_scales, &info_map);
-  SaveInfoInTheFirstOp(graph, "has_quant_info", "var_quant_scales", info_map);
+  SaveInfoInTheFirstOp(
+      graph, "has_quant_info", "var_quant_scales", var_quant_scales);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h
index ecc3ad16a54e6..bae810746ae2d 100644
--- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h
@@ -17,14 +17,12 @@
 #include <string>
 
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
-using StringPairMap =
-    std::unordered_map<std::string, std::pair<bool, phi::DenseTensor>>;
-
 class ComputePropagateScalesMkldnnPass : public FusePassBase {
  public:
   ComputePropagateScalesMkldnnPass() = default;
@@ -78,6 +76,9 @@ class ComputePropagateScalesMkldnnPass : public FusePassBase {
                            Scope* scope,
                            StringPairMap* var_quant_scales) const;
 
+  void UpdateReluOutputScales(ir::Graph* graph,
+                              StringPairMap* var_quant_scales) const;
+
   void UpdateScaleOpInScale(Node* op_node,
                             const std::string& input_name,
                             const std::string& output_name,
@@ -92,10 +93,6 @@ class ComputePropagateScalesMkldnnPass : public FusePassBase {
       ir::Graph* graph,
       StringPairMap* var_quant_scales,
       const std::unordered_set<std::string>& scale_immutable_ops) const;
-
-  void ConvertStringPairMap(
-      const StringPairMap& var_quant_scales,
-      std::unordered_map<std::string, std::vector<float>>* info_map) const;
 };
 }  // namespace ir
 }  // namespace framework
diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc
index 03c01507ca27d..39ecfd2c0e79a 100644
--- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
#include +#include #include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h" #include "paddle/fluid/framework/naive_executor.h" @@ -91,11 +92,16 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { graph, scope, wx_name, wh_name, var_quant_scales); } + void UpdateReluOutputScales(ir::Graph* graph, + StringPairMap* var_quant_scales) const { + pass->UpdateReluOutputScales(graph, var_quant_scales); + } + void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, const std::string& var_name) { auto x = scope->Var(var_name); - auto tensor = x->GetMutable(); + auto tensor = x->GetMutable(); auto tensor_size = 1; if (var_name == "filter") { tensor_size = positive_and_negative_values.size(); @@ -124,7 +130,6 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { } void ComputeRnnWeightScalesTest(const std::string& type, - const std::initializer_list& ops, const framework::ProgramDesc& prog, std::vector scales) { ir::Graph* graph(new ir::Graph(prog)); @@ -140,7 +145,7 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { StringPairMap var_quant_scales; auto* wx_var = scope.FindVar(wx_var_names); - auto* wx_tensor = wx_var->GetMutable(); + auto* wx_tensor = wx_var->GetMutable(); wx_tensor->Resize(phi::make_dim(wx.size(), wx[0].size())); for (size_t i = 0; i < wx.size(); i++) std::copy(begin(wx[i]), @@ -149,7 +154,7 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { i * wx[0].size()); auto* wh_var = scope.FindVar(wh_var_names); - auto* wh_tensor = wh_var->GetMutable(); + auto* wh_tensor = wh_var->GetMutable(); wh_tensor->Resize(phi::make_dim(wh.size(), wh[0].size())); for (size_t i = 0; i < wh.size(); i++) std::copy(begin(wh[i]), @@ -174,6 +179,24 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { } } + void UpdateReluOutputScaleTest( + const framework::ProgramDesc& prog, + StringPairMap* var_quant_scales, + const std::initializer_list& variable_names) { + ir::Graph* graph(new ir::Graph(prog)); + Scope scope; + + PrepareGraph(graph, prog, &scope, conv_variable_names); + + UpdateReluOutputScales(graph, var_quant_scales); + + for (auto& var_name : variable_names) { + auto iter = var_quant_scales->find(var_name); + ASSERT_NE(iter, var_quant_scales->end()); + ASSERT_EQ((*var_quant_scales)[var_name].first, true); + } + } + private: std::unique_ptr pass; }; @@ -182,11 +205,15 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, const std::vector& inputs, - const std::vector& outputs) { + const std::vector& outputs, + const std::unordered_map& attrs = {}) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); op->SetAttr("use_mkldnn", true); op->SetAttr("name", name); + if (!attrs.empty()) + for (auto& attr : attrs) op->SetAttr(attr.first, attr.second); + if (type == "conv2d") { op->SetInput("Input", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]}); @@ -211,6 +238,23 @@ ProgramDesc BuildConv2dProgramDesc() { return prog; } +ProgramDesc BuildConv2dReluProgramDesc() { + ProgramDesc prog; + for (auto& v : conv_variable_names) { + prog.MutableBlock(0)->Var(v); + } + std::unordered_map attrs = { + {"fuse_activation", "relu"}}; + SetOp(&prog, + "conv2d", + "Conv2d", + {"conv_in", "filter", "bias"}, + {"conv_out"}, + attrs); + + return prog; +} + ProgramDesc BuildFusionGruProgramDesc() { ProgramDesc prog; for (auto& v : rnn_variable_names) { @@ -262,7 +306,7 @@ TEST_F(ComputePropagateScalesMkldnnPassTest, 
compute_var_scales) { StringPairMap var_quant_scales; auto* var = scope.FindVar(weight_var_name); - auto* weight_tensor = var->GetMutable(); + auto* weight_tensor = var->GetMutable(); weight_tensor->Resize(phi::make_dim(1, values.size())); std::copy(begin(values), end(values), @@ -283,15 +327,24 @@ TEST_F(ComputePropagateScalesMkldnnPassTest, compute_var_scales) { } TEST_F(ComputePropagateScalesMkldnnPassTest, compute_gru_weight_scales) { - ComputeRnnWeightScalesTest("gru", - {"fusion_gru", "multi_gru"}, - BuildFusionGruProgramDesc(), - gru_scales); + ComputeRnnWeightScalesTest("gru", BuildFusionGruProgramDesc(), gru_scales); } TEST_F(ComputePropagateScalesMkldnnPassTest, compute_lstm_weight_scales) { - ComputeRnnWeightScalesTest( - "lstm", {"fusion_lstm"}, BuildFusionLstmProgramDesc(), lstm_scales); + ComputeRnnWeightScalesTest("lstm", BuildFusionLstmProgramDesc(), lstm_scales); +} + +TEST_F(ComputePropagateScalesMkldnnPassTest, update_relu_output_scales) { + StringPairMap var_quant_scales; + for (auto& var_name : conv_variable_names) { + phi::DenseTensor tensor; + auto* data = tensor.mutable_data({1}, platform::CPUPlace()); + data[0] = 10; + auto pair = std::make_pair(false, tensor); + var_quant_scales.insert(std::make_pair(var_name, pair)); + } + UpdateReluOutputScaleTest( + BuildConv2dReluProgramDesc(), &var_quant_scales, {"conv_out"}); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 5ec22e2e88a1e..3161eeeb4b499 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -229,6 +229,7 @@ void CPUQuantizePass::DequantizeOutput(Graph* g, std::vector({dequantize_in_node->Name()})); deq_desc.SetOutput("Output", std::vector({output->Name()})); deq_desc.SetAttr("Scale", scale); + deq_desc.SetAttr("is_negative_input", !is_unsigned); auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. 
// update op's output @@ -332,20 +333,8 @@ bool CPUQuantizePass::IsOpQuantized(const Node* node) const { } void CPUQuantizePass::GetQuantInfo(Graph* graph) const { - std::unordered_map> info_map{}; - GetInfoFromTheFirstOp(graph, "has_quant_info", "var_quant_scales", &info_map); - - for (auto iter = info_map.begin(); iter != info_map.end(); iter++) { - LoDTensor tensor; - const int size = static_cast(iter->second.size()); - auto* data = tensor.mutable_data({size}, platform::CPUPlace()); - for (int i = 0; i < size; i++) { - data[i] = static_cast(iter->second[i]); - } - - auto pair = std::make_pair(false, tensor); - var_quant_scales_->insert(std::make_pair(iter->first, pair)); - } + GetInfoFromTheFirstOp( + graph, "has_quant_info", "var_quant_scales", var_quant_scales_); } void CPUQuantizePass::QuantizeConv(Graph* graph, @@ -422,7 +411,16 @@ void CPUQuantizePass::QuantizeConv(Graph* graph, auto filter_scale_tensor = GetScaleTensorForNode(conv_filter); EigenVectorArrayMap eigen_tensor{filter_scale_tensor.data(), filter_scale_tensor.numel()}; - eigen_tensor *= static_cast(S8_MAX); + + // If the scale value of a weight is already multiplied by S8_MAX, it does + // not need to be multiplied again + if (std::find(change_weight_->begin(), + change_weight_->end(), + conv_filter->Name()) == change_weight_->end()) { + eigen_tensor *= static_cast(S8_MAX); + change_weight_->push_back(conv_filter->Name()); + } + std::vector filter_scale{ filter_scale_tensor.data(), filter_scale_tensor.data() + filter_scale_tensor.numel()}; @@ -593,6 +591,20 @@ void CPUQuantizePass::QuantizeConcat(Graph* graph) const { return; } + bool are_all_inputs_unsigned{true}; + // if all inputs were unsigned, then the output was set to unsigned + // during the scale calculation step + auto inputs = concat_op->inputs; + for (size_t i = 0; i < inputs.size(); i++) { + if (AreScalesPresentForVarNames({inputs[i]->Name()})) { + auto scale_data = GetScaleDataByName(inputs[i]->Name()); + if (scale_data.first == false) { + are_all_inputs_unsigned = false; + break; + } + } + } + GET_IR_NODE_FROM_SUBGRAPH(concat_out, concat_out, concat_pattern); if (!AreScalesPresentForNodes({concat_out})) { @@ -601,17 +613,12 @@ void CPUQuantizePass::QuantizeConcat(Graph* graph) const { return; } - // if all inputs were unsigned, then the output was set to unsigned - // during the scale calculation step - bool are_all_inputs_unsigned{false}; - auto output_scale = - GetScaleValueForNode(concat_out, &are_all_inputs_unsigned); + auto output_scale = GetScaleValueForNode(concat_out); QuantizeInputs(g, concat_op, "X", are_all_inputs_unsigned); DequantizeOutput( g, concat_op, concat_out, "Out", output_scale, are_all_inputs_unsigned); - ++quantize_concat_count; }; @@ -695,6 +702,13 @@ void CPUQuantizePass::QuantizeImmutable(Graph* graph, return; } + // skip if the dtype of immutable_in is not float32 + auto dtype = immutable_in->Var()->GetDataType(); + if (dtype != proto::VarType::FP32) { + MarkAndLogCannotQuantizeOp(immutable_op, "The input dtype is not float."); + return; + } + if (!AreScalesPresentForNodes({immutable_out})) { MarkAndLogCannotQuantizeOp(immutable_op, "No scale available for the operator"); @@ -1166,7 +1180,6 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeImmutable(graph, "reshape2", "X"); QuantizeImmutable(graph, "transpose2", "X"); QuantizeImmutable(graph, "slice", "Input"); - QuantizeImmutable(graph, "shape", "Input"); QuantizeImmutable(graph, "nearest_interp", "X"); QuantizeImmutable(graph, "nearest_interp_v2", "X"); 
QuantizeElementwise(graph, "elementwise_add"); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index f26d8bfc84c15..ded113dfdc12d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -110,6 +110,11 @@ class CPUQuantizePass : public FusePassBase { VarQuantScale string_pair_map = {}; VarQuantScale* const var_quant_scales_ = &string_pair_map; + // Save the scale values of which weights have been processed to avoid + // secondary processing + std::vector change_weight = {}; + std::vector* const change_weight_ = &change_weight; + void GetQuantInfo(Graph* graph) const; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc old mode 100644 new mode 100755 index 4dabdd6bed0bd..70623214503d8 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -66,7 +66,7 @@ void SetOp(ProgramDesc* prog, type == "nearest_interp" || type == "nearest_interp_v2") { op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); - } else if (type == "slice" || type == "shape") { + } else if (type == "slice") { op->SetInput("Input", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); } else if (type == "dropout") { @@ -467,7 +467,7 @@ static const std::initializer_list variable_names_immutable_ops = { void TestImmutableOp(const std::string tested_op) { ProgramDesc prog; for (auto& v : variable_names_immutable_ops) { - prog.MutableBlock(0)->Var(v); + prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32); } SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); SetOp(&prog, tested_op, tested_op, {"b"}, {"c"}, true, "int8"); @@ -520,7 +520,7 @@ void TestImmutableOpBetweenNonQuantizedOp(const std::string tested_op) { void TestImmutableOpWithManyOutputs(const std::string tested_op) { ProgramDesc prog; for (auto& v : variable_names_immutable_ops) { - prog.MutableBlock(0)->Var(v); + prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32); } SetOp(&prog, "dropout", "Dropout1", {"a"}, {"b"}, true, "float32"); @@ -556,12 +556,8 @@ void TestImmutableOpWithManyOutputs(const std::string tested_op) { SCALE * S8_MAX); } -const std::vector immutables = {"reshape2", - "transpose2", - "slice", - "shape", - "nearest_interp", - "nearest_interp_v2"}; +const std::vector immutables = { + "reshape2", "transpose2", "slice", "nearest_interp", "nearest_interp_v2"}; class TestImmutables : public testing::TestWithParam {}; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 933d60b0a2739..e0a64b2036bb7 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -105,51 +105,24 @@ void CPUQuantizeSquashPass::FindNodesToKeep( AddStatis(found_count); } -bool CPUQuantizeSquashPass::IsDequantizeInputUint8( - const Node* dequant_in) const { - PADDLE_ENFORCE_EQ( - dequant_in->inputs.size(), - 1, - platform::errors::InvalidArgument( - "Dequantize (id: %f) should have only one input.", dequant_in->id())); - if (dequant_in->inputs[0]->IsOp()) { - auto prev_op = dequant_in->inputs[0]->Op(); - std::string act_name; - if (prev_op->Type() == "relu") { - return true; - } else { - if (prev_op->Type() == "conv2d") { - act_name = "fuse_activation"; - } 
else if (prev_op->Type() == "fc") { - act_name = "activation_type"; - } - if (!act_name.empty()) { - auto act = prev_op->GetAttrIfExists(act_name); - if (act == "relu" || act == "relu6") { - return true; - } - } - } - } - return false; -} - bool CPUQuantizeSquashPass::IsDequantizeQuantizeIncompatible( - Node* quant_op, Node* dequant_in, Node* next_op) const { - bool is_concat_signed = + Node* quant_op, Node* dequant_op, Node* next_op) const { + bool is_next_op_signed = quant_op->Op()->GetAttrIfExists("is_negative_input"); - bool is_input_unsigned = IsDequantizeInputUint8(dequant_in); + bool is_input_signed = + dequant_op->Op()->GetAttrIfExists("is_negative_input"); + /* TODO(sfraczek): remove elementwise from this condition when BinaryMKLDNN kernel will support two different input data types */ bool is_next_op_concat_or_elementwise = next_op->Op()->Type() == "concat" || next_op->Op()->Type().find("elementwise") == 0; - if (is_next_op_concat_or_elementwise && is_concat_signed && - is_input_unsigned) { + if (is_next_op_concat_or_elementwise && + (is_next_op_signed ^ is_input_signed)) { VLOG(4) << "Do not squash dequant-quant, because " << "next_op is: " << next_op->Op()->Type() - << ", is_concat_signed: " << is_concat_signed - << ", is_input_unsigned: " << is_input_unsigned << "."; + << ", is_next_op_signed: " << is_next_op_signed + << ", is_input_signed: " << is_input_signed << "."; return true; } return false; @@ -174,7 +147,7 @@ void CPUQuantizeSquashPass::DequantQuantSquash( GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern); GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern); - if (IsDequantizeQuantizeIncompatible(quant_op, dequant_in, next_op)) { + if (IsDequantizeQuantizeIncompatible(quant_op, dequant_op, next_op)) { return; } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h index 5207cc519c698..3aed54609d451 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h @@ -43,11 +43,6 @@ class CPUQuantizeSquashPass : public FusePassBase { Graph* graph, std::unordered_map* nodes_keep_counter) const; - /* - * Check if input to dequantize is uint8 - */ - bool IsDequantizeInputUint8(const Node* dequant_in) const; - /* * Don't squash unsigned dequantize with signed quantize. * This is important for concat and elementwise ops. @@ -55,7 +50,7 @@ class CPUQuantizeSquashPass : public FusePassBase { * elementwise assumes first input type. 
*/ bool IsDequantizeQuantizeIncompatible(Node* quant_op, - Node* dequant_in, + Node* dequant_op, Node* next_op) const; /* diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc index 655cc95bf28a0..cd71ff153d601 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -68,15 +68,11 @@ void SetOp(ProgramDesc* prog, op->SetAttr("padding_algorithm", std::string("EXPLICIT")); op->SetAttr("data_format", std::string("NCHW")); op->SetAttr("force_fp32_output", false); - } else if (type == "quantize") { + } else if (type == "quantize" || type == "dequantize") { op->SetInput("Input", {inputs[0]}); op->SetOutput("Output", {outputs[0]}); op->SetAttr("Scale", scale[0]); op->SetAttr("is_negative_input", is_negative_input); - } else if (type == "dequantize") { - op->SetInput("Input", {inputs[0]}); - op->SetOutput("Output", {outputs[0]}); - op->SetAttr("Scale", scale[0]); } else if (type == "requantize") { op->SetInput("Input", {inputs[0]}); op->SetOutput("Output", {outputs[0]}); @@ -303,31 +299,22 @@ ProgramDesc BuildConvMultiRequantProgramDesc(bool use_mkldnn, return prog; } -/* a->relu->b->Dequant->c(u8)->Quant->d-\ - * e->relu->f->Dequant->g(u8)->Quant->h--Concat1->x - * i->relu->j->Dequant->k(u8)->Quant->l-/ +/* a->relu->b->Dequant(u8)->c->Quant(u8)->d-\ + * e->relu->f->Dequant(u8)->g->Quant(u8)->h--Concat1->i */ -ProgramDesc BuildU8U8U8ConcatProgramDesc(float scale_out, float scale) { +ProgramDesc BuildU8U8ConcatProgramDesc(float scale_out, float scale) { ProgramDesc prog; for (auto& v : variable_names) { prog.MutableBlock(0)->Var(v); } SetOp(&prog, "relu", "Relu1", {"a"}, {"b"}, true, {scale, scale_out}); SetOp(&prog, "relu", "Relu2", {"e"}, {"f"}, true, {scale, scale_out}); - SetOp(&prog, "relu", "Relu3", {"i"}, {"j"}, true, {scale, scale_out}); - - SetOp( - &prog, "dequantize", "Dequant1", {"b"}, {"c"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant2", {"f"}, {"g"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant3", {"j"}, {"k"}, true, {scale, scale_out}); SetOp(&prog, - "quantize", - "Quant1", + "dequantize", + "Dequant1", + {"b"}, {"c"}, - {"d"}, true, {scale, scale_out}, 0.0f, @@ -336,10 +323,23 @@ ProgramDesc BuildU8U8U8ConcatProgramDesc(float scale_out, float scale) { 1, false); // is_negative_input = false SetOp(&prog, - "quantize", - "Quant2", + "dequantize", + "Dequant2", + {"f"}, {"g"}, - {"h"}, + true, + {scale, scale_out}, + 0.0f, + "float32", + false, + 1, + false); // is_negative_input = false + + SetOp(&prog, + "quantize", + "Quant1", + {"c"}, + {"d"}, true, {scale, scale_out}, 0.0f, @@ -349,9 +349,9 @@ ProgramDesc BuildU8U8U8ConcatProgramDesc(float scale_out, float scale) { false); // is_negative_input = false SetOp(&prog, "quantize", - "Quant3", - {"k"}, - {"l"}, + "Quant2", + {"g"}, + {"h"}, true, {scale, scale_out}, 0.0f, @@ -360,27 +360,47 @@ ProgramDesc BuildU8U8U8ConcatProgramDesc(float scale_out, float scale) { 1, false); // is_negative_input = false - SetOp(&prog, "concat", "Concat1", {"d", "h", "l"}, {"x"}, true); + SetOp(&prog, "concat", "Concat1", {"d", "h"}, {"i"}, true); return prog; } -/* a->relu->b->Dequant->c(u8)->Quant->d-\ - * e->relu->f->Dequant->g(u8)->Quant->h--Concat1->x - * i->pool2d->j->Dequant->k(s8)->Quant->l-/ +/* a->relu->b->Dequant(u8)->c->Quant(s8)->d-\ + * e->relu->f->Dequant(u8)->g->Quant(s8)->h--Concat1->x + * 
i->pool2d->j->Dequant(s8)->k->Quant(s8)->l-/ */ ProgramDesc BuildU8U8S8ConcatProgramDesc(float scale_out, float scale) { ProgramDesc prog; for (auto& v : variable_names) { prog.MutableBlock(0)->Var(v); } - SetOp(&prog, "relu", "Pool2d1", {"a"}, {"b"}, true, {scale, scale_out}); - SetOp(&prog, "relu", "Relu1", {"e"}, {"f"}, true, {scale, scale_out}); + SetOp(&prog, "relu", "Relu1", {"a"}, {"b"}, true, {scale, scale_out}); + SetOp(&prog, "relu", "Relu2", {"e"}, {"f"}, true, {scale, scale_out}); SetOp(&prog, "pool2d", "Pool2d2", {"i"}, {"j"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant1", {"b"}, {"c"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant2", {"f"}, {"g"}, true, {scale, scale_out}); + SetOp(&prog, + "dequantize", + "Dequant1", + {"b"}, + {"c"}, + true, + {scale, scale_out}, + 0.0f, + "float32", + false, + 1, + false); // is_negative_input = false + SetOp(&prog, + "dequantize", + "Dequant2", + {"f"}, + {"g"}, + true, + {scale, scale_out}, + 0.0f, + "float32", + false, + 1, + false); // is_negative_input = false SetOp( &prog, "dequantize", "Dequant3", {"j"}, {"k"}, true, {scale, scale_out}); @@ -392,9 +412,9 @@ ProgramDesc BuildU8U8S8ConcatProgramDesc(float scale_out, float scale) { return prog; } -/* a->pool2d->b->Dequant->c(s8)->Quant->d-\ - * e->relu->f->Dequant->g(u8)->Quant->h--Concat1->x - * i->pool2d->j->Dequant->k(s8)->Quant->l-/ +/* a->pool2d->b->Dequant(s8)->c->Quant(s8)->d-\ + * e->relu->f->Dequant(u8)->g->Quant(s8)->h--Concat1->x + * i->pool2d->j->Dequant(s8)->k->Quant(s8)->l-/ */ ProgramDesc BuildS8U8S8ConcatProgramDesc(float scale_out, float scale) { ProgramDesc prog; @@ -407,8 +427,18 @@ ProgramDesc BuildS8U8S8ConcatProgramDesc(float scale_out, float scale) { SetOp( &prog, "dequantize", "Dequant1", {"b"}, {"c"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant2", {"f"}, {"g"}, true, {scale, scale_out}); + SetOp(&prog, + "dequantize", + "Dequant2", + {"f"}, + {"g"}, + true, + {scale, scale_out}, + 0.0f, + "float32", + false, + 1, + false); // is_negative_input = false SetOp( &prog, "dequantize", "Dequant3", {"j"}, {"k"}, true, {scale, scale_out}); @@ -1141,13 +1171,12 @@ TEST(CpuQuantizeSquashPass, squash_all_s8_input_to_concat1) { } TEST(CpuQuantizeSquashPass, squash_all_u8_input_to_concat2) { - // removed 3 x 4 (dequantize_op, dequantize_out, quantize, quantize_out) - auto remove_nodes = 12; + // removed 2 x 4 (dequantize_op, dequantize_out, quantize, quantize_out) + auto remove_nodes = 8; std::unordered_map expected_operators = { - {"concat", 1}, {"quantize", 0}, {"dequantize", 0}, {"relu", 3}}; - CheckNodesTest(BuildU8U8U8ConcatProgramDesc(1.2f, 1.2f), - expected_operators, - remove_nodes); + {"concat", 1}, {"quantize", 0}, {"dequantize", 0}, {"relu", 2}}; + CheckNodesTest( + BuildU8U8ConcatProgramDesc(1.2f, 1.2f), expected_operators, remove_nodes); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h index a714f236c4616..6899a7202da9c 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h @@ -22,6 +22,9 @@ namespace paddle { namespace framework { namespace ir { +using StringPairMap = + std::unordered_map>; + static void SaveInfoInTheFirstOp( ir::Graph* graph, const std::string& flag, @@ -44,6 +47,31 @@ static void SaveInfoInTheFirstOp( } } +static void SaveInfoInTheFirstOp(ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + 
const StringPairMap& info_map) { + VLOG(3) << "save variables in the first op's attr"; + + const std::string suffix = "_" + key_suffix + "_" + flag; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + op_node->Op()->SetAttr(flag, true); + for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { + auto* data = iter->second.second.data(); + std::vector data_v(data, data + iter->second.second.numel()); + op_node->Op()->SetAttr(iter->first + suffix + "_unsigned", + iter->second.first); + op_node->Op()->SetAttr(iter->first + suffix, data_v); + } + break; + } +} + static void GetInfoFromTheFirstOp( ir::Graph* graph, const std::string& flag, @@ -77,6 +105,54 @@ static void GetInfoFromTheFirstOp( } } +static void GetInfoFromTheFirstOp(ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + StringPairMap* info_map) { + VLOG(3) << "get variables from the first op's attr"; + const std::string unsigned_flag = "_unsigned"; + const std::string suffix = "_" + key_suffix + "_" + flag; + const std::string suffix_is_unsigned = suffix + unsigned_flag; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + auto* op_desc = op_node->Op(); + if (op_desc->GetAttrIfExists(flag)) { + op_desc->RemoveAttr(flag); + std::vector attr_names = op_desc->AttrNames(); + for (auto fake_name : attr_names) { + auto is_unsigned = false; + size_t pos = fake_name.find(suffix_is_unsigned); + + if (pos != std::string::npos) { + std::string unsigned_var_name = fake_name; + is_unsigned = + PADDLE_GET_CONST(bool, op_desc->GetAttr(unsigned_var_name)); + + std::string var_name = fake_name.substr(0, pos); + size_t unsigned_pos = fake_name.find(unsigned_flag); + std::string vector_name = + fake_name.erase(unsigned_pos, unsigned_flag.length()); + auto scales_vector = PADDLE_GET_CONST(std::vector, + op_desc->GetAttr(vector_name)); + phi::DenseTensor tensor; + const int size = static_cast(scales_vector.size()); + auto data = tensor.mutable_data({size}, platform::CPUPlace()); + std::copy(scales_vector.begin(), scales_vector.end(), data); + auto pair = std::make_pair(is_unsigned, tensor); + info_map->insert(std::make_pair(var_name, pair)); + op_desc->RemoveAttr(unsigned_var_name); + op_desc->RemoveAttr(vector_name); + } + } + break; + } + } +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc index 177309376e825..b1a0aaa830e6a 100644 --- a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc @@ -52,36 +52,25 @@ bool HasBias(ir::Node* conv_op) { conv_op->Op()->Input("Bias").size() > 0; } -bool ShouldSkipConv(ir::Node* conv_op, Scope* scope, ir::Node* conv_filter) { - if (!platform::HasOpINT8DataType(conv_op->Op())) { - VLOG(4) << "Skipping non-int8 convolution (id: " << conv_op->id() << ")."; - return true; - } - - auto filter_var = scope->GetVar(conv_filter->Name()); - if (filter_var->Get().dtype() != phi::DataType::FLOAT32) { - VLOG(4) << "Skipping convolution (id: " << conv_op->id() - << ") because it's a bug that it is detected again."; - return true; - } - - VLOG(4) << "Not skipping 
convolution (id: " << conv_op->id() << ")"; - return false; -} - template void QuantizeConvInput(Scope* scope, ir::Graph* g, ir::Node* conv_op, const std::string& input_name, const std::string& scales_attr_name) { - const auto scales = - conv_op->Op()->GetAttrIfExists>(scales_attr_name); - - auto* tensor = scope->GetVar(input_name)->GetMutable(); - QuantizeParams(tensor, scales); - - conv_op->Op()->SetAttr(scales_attr_name, std::vector(1, 1)); + auto var = scope->GetVar(input_name); + if (var->Get().dtype() != phi::DataType::FLOAT32) { + VLOG(0) << "Skipping convolution filter: " << input_name + << " because it is detected again."; + conv_op->Op()->SetAttr(scales_attr_name, std::vector(1, 1)); + } else { + const auto scales = + conv_op->Op()->GetAttrIfExists>(scales_attr_name); + + auto* tensor = scope->GetVar(input_name)->GetMutable(); + QuantizeParams(tensor, scales); + conv_op->Op()->SetAttr(scales_attr_name, std::vector(1, 1)); + } } } // namespace @@ -151,7 +140,8 @@ void ParamsQuantizationMkldnnPass::QuantizeConv(ir::Graph* graph, PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - if (ShouldSkipConv(conv_op, scope, conv_filter)) { + // If not a quantized OP + if (!platform::HasOpINT8DataType(conv_op->Op())) { return; } diff --git a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc old mode 100644 new mode 100755 index 507f25d92d8bc..e04cf388ac0d7 --- a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc @@ -89,8 +89,14 @@ struct ProgramStrategy { virtual void CheckOp(const OpDesc& op) const = 0; - VarDesc* AddInput(OpDesc* op, std::string input_name, const Data& data) { - const std::string var_name = input_name + "_var"; + VarDesc* AddInput(OpDesc* op, + std::string input_name, + const Data& data, + const std::string user_var_name = "") { + std::string var_name = user_var_name; + if (var_name.empty()) { + var_name = input_name + "_var"; + } op->SetInput(input_name, {var_name}); auto var = program.MutableBlock(0)->Var(var_name); var->SetShape(data.getShape()); @@ -98,8 +104,14 @@ struct ProgramStrategy { return var; } - void AddOutput(OpDesc* op, std::string output_name, const Data& data) { - const std::string var_name = output_name + "_var"; + void AddOutput(OpDesc* op, + std::string output_name, + const Data& data, + const std::string user_var_name = "") { + std::string var_name = user_var_name; + if (var_name.empty()) { + var_name = output_name + "_var"; + } op->SetOutput(output_name, {var_name}); program.MutableBlock(0)->Var(var_name); test_scope.CreateTensor(var_name, data); @@ -117,21 +129,23 @@ struct ConvProgramStrategy : public ProgramStrategy { std::vector&& scale_weights, int groups = 1, Data&& bias = Data(), - std::vector&& scale_bias = {}) + std::vector&& scale_bias = {}, + bool share_weight = false) : input(std::move(input)), filter(std::move(filter)), output(std::move(output)), scale_weights(std::move(scale_weights)), groups(std::move(groups)), bias(std::move(bias)), - scale_bias(std::move(scale_bias)) {} + scale_bias(std::move(scale_bias)), + share_weight(std::move(share_weight)) {} protected: - OpDesc* CreateBasicConvOp() { + OpDesc* CreateBasicConvOp(const std::string conv_name = "Conv1") { auto op = program.MutableBlock(0)->AppendOp(); op->SetType("conv2d"); op->SetAttr("use_mkldnn", true); - op->SetAttr("name", 
std::string{"Conv1"}); + op->SetAttr("name", conv_name); op->SetAttr("mkldnn_data_type", std::string{"int8"}); op->SetAttr("data_format", std::string{"NCHW"}); op->SetAttr("dilations", std::vector({1, 1})); @@ -155,6 +169,20 @@ struct ConvProgramStrategy : public ProgramStrategy { AddInput(op, "Bias", bias); op->SetAttr("Bias_scales", scale_bias); } + + if (share_weight) { + OpDesc* op2 = CreateBasicConvOp("Conv2"); + AddInput(op2, "Input", input); + AddInput(op2, "Filter", filter)->SetPersistable(true); + AddOutput(op2, "Output", output, "output2"); + op2->SetAttr("Scale_weights", scale_weights); + op2->SetAttr("Scale_in", 1.0f); + op2->SetAttr("groups", groups); + if (HasBias()) { + AddInput(op2, "Bias", bias, "Bias2"); + op2->SetAttr("Bias_scales", scale_bias); + } + } } void CheckOp(const OpDesc& op) const override { @@ -210,9 +238,9 @@ struct ConvProgramStrategy : public ProgramStrategy { const Data output; const std::vector scale_weights; const int groups; - const Data bias; const std::vector scale_bias; + const bool share_weight; }; struct ParamsQuantizationMkldnnPassTestFixture : public ::testing::Test { @@ -340,6 +368,19 @@ TEST_F(ParamsQuantizationMkldnnPassTestFixture, conv_with_bias_2g2o2i1h1w) { RunPassTest(std::move(program)); } +TEST_F(ParamsQuantizationMkldnnPassTestFixture, conv_with_bias_2g2o2i1h1ws) { + auto program = std::make_unique( + GenericInput(), + Data({2, 2, 2, 1, 1}, {1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f}), + GenericOutput(), + std::vector{2.f, 2.f, 4.f, 4.f}, + 2, + Data({2, 2, 1, 1, 1}, {1.5f, 1.5f, 1.5f, 1.5f}), + std::vector{2.f, 2.f, 4.f, 4.f}, + true); + RunPassTest(std::move(program)); +} + } // namespace } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 65c64af464281..7ba71b619d106 100755 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -109,27 +109,34 @@ void QuantDequantMkldnnPass::CollectWeightScalesInfoFromONNXFormatDequantize( if (op_node->Name() == "dequantize_linear") { auto* op_desc = op_node->Op(); + + auto scale_name = op_desc->Input("Scale")[0]; + auto* var = scope->FindVar(scale_name); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::NotFound( + "The Scales variable [%s] of dequantize op is not found.", var)); + + auto* scale_tensor = var->GetMutable(); + auto* scale_data = scale_tensor->data(); + auto x_var_name = op_desc->Input("X")[0]; auto* weight_var = scope->FindVar(x_var_name); if (!weight_var) { auto out_var_name = op_desc->Output("Y")[0]; - if (var_quant_scales->count(x_var_name) && - !var_quant_scales->count(out_var_name)) { - std::vector scale_v = var_quant_scales->at(x_var_name); + float scale = 1.0 / scale_data[0]; + if (std::isinf(scale) || std::isnan(scale)) { + scale = 0.0; + } + std::vector scale_v = {scale}; + if (!var_quant_scales->count(out_var_name)) { var_quant_scales->insert(std::make_pair(out_var_name, scale_v)); } + if (!var_quant_scales->count(x_var_name)) { + var_quant_scales->insert(std::make_pair(x_var_name, scale_v)); + } } else { *onnx_format_quantize_model = true; - auto scale_name = op_desc->Input("Scale")[0]; - auto* var = scope->FindVar(scale_name); - PADDLE_ENFORCE_NOT_NULL( - var, - platform::errors::NotFound( - "The Scales variable [%s] of dequantize op is not found.", - var)); - - auto* scale_tensor = var->GetMutable(); - auto* scale_data = scale_tensor->data(); 
std::vector thresholds(scale_data, scale_data + scale_tensor->numel()); weight_thresholds->insert(std::make_pair(x_var_name, thresholds)); @@ -182,7 +189,7 @@ void QuantDequantMkldnnPass::CollectInputScalesFromQuantize( auto* scale_data = scale_tensor->data(); float scale = 1.0 / scale_data[0]; if (std::isinf(scale) || std::isnan(scale)) { - scale = 0.0; + continue; } if (!var_quant_scales->count(x_var_name)) { @@ -520,12 +527,10 @@ void QuantDequantMkldnnPass::ConvertFromINT8ToFP32( int step_c = step_n / size; for (int i = 0; i < weight_dims[0]; i++) { int begin_n = i * step_n; - for (int j = begin_n; j < begin_n + step_n; j++) { - for (int k = 0; k < size; k++) { - int begin_c = k * step_c; - for (int m = begin_c; m < begin_c + step_c; m++) { - weight_data[m] *= scales[k]; - } + for (int j = 0; j < size; j++) { + int begin_c = begin_n + j * step_c; + for (int k = 0; k < step_c; k++) { + weight_data[begin_c + k] *= scales[j]; } } } @@ -588,7 +593,8 @@ void QuantDequantMkldnnPass::DequantizeOpWeightsFromONNXFormat( Scope* scope, const std::string& weight_name, const std::unordered_map>& - weight_thresholds) const { + weight_thresholds, + std::vector* dequantized_weights_names) const { auto* op_desc = op_node->Op(); std::string weight_var_name = op_desc->Input(weight_name)[0]; @@ -596,6 +602,13 @@ void QuantDequantMkldnnPass::DequantizeOpWeightsFromONNXFormat( auto iter = weight_thresholds.find(weight_var_name); if (iter != weight_thresholds.end()) { scales = iter->second; + auto name_iter = std::find(dequantized_weights_names->begin(), + dequantized_weights_names->end(), + weight_var_name); + // Has been dequantized + if (name_iter != dequantized_weights_names->end()) { + return; + } } else { if (!IsInt8Weight(op_node, scope, weight_name)) { return; @@ -605,7 +618,7 @@ void QuantDequantMkldnnPass::DequantizeOpWeightsFromONNXFormat( "the model is correct.", weight_var_name)); } - + dequantized_weights_names->push_back(weight_var_name); auto* var = scope->FindVar(weight_var_name); PADDLE_ENFORCE_NOT_NULL( var, @@ -634,14 +647,17 @@ void QuantDequantMkldnnPass::DequantizeWeights( << "No need to dequantize weights because weight_thresholds is empty."; return; } - + std::vector dequantized_weights_names; for (auto* op_node : ir::TopologyVarientSort(*graph, static_cast(0))) { if (!op_node->IsOp()) continue; if (op_node->Name() == "conv2d" || op_node->Name() == "depthwise_conv2d") { if (onnx_format_quantize_model) { - DequantizeOpWeightsFromONNXFormat( - op_node, scope, "Filter", weight_thresholds); + DequantizeOpWeightsFromONNXFormat(op_node, + scope, + "Filter", + weight_thresholds, + &dequantized_weights_names); } else if (IsInt8Weight(op_node, scope, "Filter")) { DequantizeOpWeights( op_node, scope, "Filter", "Output", weight_thresholds); @@ -650,7 +666,7 @@ void QuantDequantMkldnnPass::DequantizeWeights( op_node->Name() == "matmul_v2") { if (onnx_format_quantize_model) { DequantizeOpWeightsFromONNXFormat( - op_node, scope, "Y", weight_thresholds); + op_node, scope, "Y", weight_thresholds, &dequantized_weights_names); } else if (IsInt8Weight(op_node, scope, "Y")) { DequantizeOpWeights(op_node, scope, "Y", "Out", weight_thresholds); } diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h old mode 100644 new mode 100755 index deb9072e04a49..3095cf4d05b15 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h @@ -125,7 +125,8 @@ class 
QuantDequantMkldnnPass : public FusePassBase { Scope* scope, const std::string& weight_name, const std::unordered_map>& - weight_thresholds) const; + weight_thresholds, + std::vector* dequantized_weights_names) const; void DequantizeWeights( ir::Graph* graph, diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 273a7ee8bc48c..f41cda93bf9cc 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -535,7 +535,8 @@ void BuildOpFuncList(const platform::Place& place, if (op_with_kernel->PhiKernel()->IsValid()) { run_phi_kernel = true; } else { - if (!op_with_kernel->SupportsKernelType(expected_kernel_key)) { + if (!op_with_kernel->SupportsKernelType(expected_kernel_key, + exec_ctx)) { auto phi_cpu_kernel_key = FallBackToCpu( expected_kernel_key, phi_kernel_key, *op_with_kernel); op_with_kernel->ResetPhiKernel( diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 67e7293877846..a483de6f21bed 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -140,29 +140,31 @@ ProgramDesc GetLmMainProgram() { return main_prog; } -// TEST(StandaloneExecutor, run) { -// auto place = platform::CUDAPlace(0); -// ProgramDesc test_prog = load_from_file("lm_startup_program"); -// ProgramDesc main_prog = GetLmMainProgram(); - -// Scope scope; -// StandaloneExecutor exec(place, test_prog, main_prog, &scope); -// exec.Run({}, {}, {}); -// auto start = std::chrono::steady_clock::now(); +TEST(StandaloneExecutor, run) { + auto place = platform::CUDAPlace(0); + ProgramDesc startup_prog = load_from_file("lm_startup_program"); + ProgramDesc main_prog = GetLmMainProgram(); -// for (size_t i = 0; i < 10; ++i) { -// if (i % 200 == 0) { -// std::cout << i << std::endl; -// } + Scope scope; + StandaloneExecutor startup_exec(place, startup_prog); + startup_exec.Run(&scope, {}, {}); + StandaloneExecutor exec(place, main_prog); + exec.Run(&scope, {}, {}); + auto start = std::chrono::steady_clock::now(); + + for (size_t i = 0; i < 10; ++i) { + if (i % 200 == 0) { + std::cout << i << std::endl; + } -// exec.Run({}, {}, {}); -// } + exec.Run(&scope, {}, {}); + } -// auto end = std::chrono::steady_clock::now(); -// std::chrono::duration diff = end - start; + auto end = std::chrono::steady_clock::now(); + std::chrono::duration diff = end - start; -// std::cout << "time cost " << diff.count() << std::endl; -// } + std::cout << "time cost " << diff.count() << std::endl; +} TEST(InterpreterCore, skip_gc_vars) { auto place = platform::CUDAPlace(0); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f32995ae41704..d8f0eb5324b66 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -50,6 +50,7 @@ class DenseTensor; #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_op_list.h" #endif #ifdef PADDLE_WITH_MLU @@ -72,7 +73,7 @@ std::vector> kKernelPriority = { std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), }; -static DDim GetDimsDebug(const ScopeBase& scope, +static DDim GetDimsDebug(const Scope& scope, const std::string& name, bool get_actual_dim = false) { Variable* var = scope.FindVar(name); @@ -96,13 +97,13 @@ static DDim 
GetDimsDebug(const ScopeBase& scope, } } -static bool VarInited(const ScopeBase& scope, const std::string& name) { +static bool VarInited(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); if (var == nullptr) return false; return var->IsInitialized(); } -static std::string GetDtype(const ScopeBase& scope, const std::string& name) { +static std::string GetDtype(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); if (var == nullptr) { return ""; @@ -128,7 +129,7 @@ static std::string GetDtype(const ScopeBase& scope, const std::string& name) { } } -static std::string GetPlace(const ScopeBase& scope, const std::string& name) { +static std::string GetPlace(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); if (var == nullptr) { return ""; @@ -157,7 +158,7 @@ static std::string GetPlace(const ScopeBase& scope, const std::string& name) { } } -static int GetRowSize(const ScopeBase& scope, const std::string& name) { +static int GetRowSize(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); if (var == nullptr) { return -1; @@ -170,7 +171,7 @@ static int GetRowSize(const ScopeBase& scope, const std::string& name) { return -1; } -static LoD GetLoDDebug(const ScopeBase& scope, const std::string& name) { +static LoD GetLoDDebug(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); auto default_lod = LoD({{}}); @@ -348,7 +349,7 @@ const std::vector& OperatorBase::Outputs( return it->second; } -std::string OperatorBase::DebugStringEx(const ScopeBase* scope) const { +std::string OperatorBase::DebugStringEx(const Scope* scope) const { std::stringstream ss; ss << "Op(" << type_ << "), inputs:{"; @@ -1352,7 +1353,7 @@ bool OperatorWithKernel::SupportsMKLDNN( } bool OperatorWithKernel::SupportsKernelType( - const OpKernelType& kernel_type) const { + const OpKernelType& kernel_type, const ExecutionContext& exe_ctx) const { auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); if (kernels_iter == all_op_kernels.end()) return false; @@ -1386,16 +1387,38 @@ bool OperatorWithKernel::SupportsKernelType( } #endif +// NOTE(jiahongyu): If MKLDNN can be used, the function SupportsKernelType needs +// to check whether current op supports MKLDNN kernel. There are three +// statements in if condition: The first statement checks whether library_type_ +// are changed by other high priority backends; the second checks whether this +// op has specific implementation; the third checks whether mkldnn kernel can be +// used. +#ifdef PADDLE_WITH_MKLDNN + if (kernel_type.library_type_ == framework::LibraryType::kPlain && + !paddle::platform::in_mkldnn_white_list(type_) && + this->CanMKLDNNBeUsed(exe_ctx, kernel_type.data_type_)) { + auto tmp_kernel_type = kernel_type; + tmp_kernel_type.library_type_ = framework::LibraryType::kMKLDNN; + tmp_kernel_type.data_layout_ = framework::DataLayout::kMKLDNN; + return kernels.find(tmp_kernel_type) != kernels.end(); + } +#endif + return kernel_iter != kernels.end(); } bool OperatorWithKernel::CanMKLDNNBeUsed(const framework::ExecutionContext& ctx, proto::VarType::Type data_type) const { + // NOTE(jiahongyu): Only mkldnn kernels need to check "use_mkldnn" attribute, + // hence we first call function SupportsMKLDNN. If we check "use_mkldnn" + // attribute first, it will cause error because some codes add "use_mkldnn" + // attribute to non-mkldnn ops. 
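// Reduced to a self-contained toy (the context type below is an
// illustrative stand-in, not the real framework class), the reordered
// predicate reads: consult the kernel registry first, and only then the
// "use_mkldnn" attribute, so ops that merely carry the attribute without
// owning an MKLDNN kernel are rejected up front.
#include <string>
#include <unordered_map>
struct ToyExecutionContext {
  std::unordered_map<std::string, bool> bool_attrs;
  bool on_cpu_place = true;
  bool HasAttr(const std::string& n) const { return bool_attrs.count(n) > 0; }
  bool BoolAttr(const std::string& n) const { return bool_attrs.at(n); }
};
bool ToyCanMKLDNNBeUsed(bool supports_mkldnn_kernel,
                        const ToyExecutionContext& ctx) {
  if (!supports_mkldnn_kernel) return false;  // kernel registry check first
  return ctx.HasAttr("use_mkldnn") && ctx.BoolAttr("use_mkldnn") &&
         ctx.on_cpu_place;
}
// The production version of the same ordering follows: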
+ if (!this->SupportsMKLDNN(data_type)) { + return false; + } const std::string use_mkldnn_attr = "use_mkldnn"; - bool use_mkldnn_ctx = ctx.HasAttr(use_mkldnn_attr) && - ctx.Attr(use_mkldnn_attr) && - platform::is_cpu_place(ctx.GetPlace()); - return use_mkldnn_ctx && this->SupportsMKLDNN(data_type); + return ctx.HasAttr(use_mkldnn_attr) && ctx.Attr(use_mkldnn_attr) && + platform::is_cpu_place(ctx.GetPlace()); } void OperatorWithKernel::InferShape(InferShapeContext* ctx) const { @@ -1544,6 +1567,23 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } else { phi_kernel_name = kernel_signature_->name; + +// NOTE(jiahongyu): The registered MKLDNN kernel have library_type = +// LibraryType::kMKLDNN and data_layout_ = DataLayout::kMKLDNN. But the default +// values are kPlain, so we need to modify the library_type and data_layout_ +// here. There are three statements in if condition: The first statement checks +// whether library_type_ are changed by other high priority backends; the second +// checks whether this op has specific implementation; the third checks whether +// mkldnn kernel can be used. +#ifdef PADDLE_WITH_MKLDNN + if (kernel_type_->library_type_ == framework::LibraryType::kPlain && + !paddle::platform::in_mkldnn_white_list(type_) && + this->CanMKLDNNBeUsed(exe_ctx, kernel_type_->data_type_)) { + kernel_type_->library_type_ = framework::LibraryType::kMKLDNN; + kernel_type_->data_layout_ = framework::DataLayout::kMKLDNN; + } +#endif + // NOTE(Liu-xiandong):In my ctest, this branch do not be executed, // I can't understand it, it's really confusing. // But we still need to keep this to avoid errors. @@ -1771,6 +1811,23 @@ void OperatorWithKernel::RunImpl(const Scope& scope, OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( const ExecutionContext& ctx) const { auto expected_kernel_key = this->GetExpectedKernelType(ctx); + +// NOTE(jiahongyu): PADDLE_WITH_MKLDNN codes are moved outside function +// GetExpectedKernelType, so that if MKLDNN can be used, the library_type_ and +// data_layout_ of expected_kernel_key need to be adjusted. There are three +// statements in if condition: The first statement checks whether library_type_ +// are changed by other high priority backends; the second checks whether this +// op has specific implementation; the third checks whether mkldnn kernel can be +// used. 
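// The three conditions named in the NOTE, condensed into a standalone
// sketch (the enums and helper are simplified stand-ins, not the real
// framework types):
enum class ToyLibrary { kPlain, kMKLDNN };
enum class ToyLayout { kAnyLayout, kMKLDNN };
struct ToyKernelKey {
  ToyLibrary library = ToyLibrary::kPlain;
  ToyLayout layout = ToyLayout::kAnyLayout;
};
ToyKernelKey PromoteToMKLDNN(ToyKernelKey key,
                             bool op_keeps_own_mkldnn_handling,
                             bool can_mkldnn_be_used) {
  if (key.library == ToyLibrary::kPlain &&  // 1) not already claimed by a
                                            //    higher-priority backend
      !op_keeps_own_mkldnn_handling &&      // 2) op is not on the mkldnn
                                            //    white list
      can_mkldnn_be_used) {                 // 3) a usable MKLDNN kernel exists
    key.library = ToyLibrary::kMKLDNN;
    key.layout = ToyLayout::kMKLDNN;
  }
  return key;
}
// The same promotion, applied to expected_kernel_key: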
+#ifdef PADDLE_WITH_MKLDNN + if (expected_kernel_key.library_type_ == framework::LibraryType::kPlain && + !paddle::platform::in_mkldnn_white_list(type_) && + this->CanMKLDNNBeUsed(ctx, expected_kernel_key.data_type_)) { + expected_kernel_key.library_type_ = framework::LibraryType::kMKLDNN; + expected_kernel_key.data_layout_ = framework::DataLayout::kMKLDNN; + } +#endif + if (HasAttr("op_device")) { if (Attr("op_device") == "cpu") { expected_kernel_key.place_ = platform::CPUPlace(); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e649bf2fc7e95..a8a0cd863ee10 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -168,7 +168,7 @@ class OperatorBase { virtual void Stop() {} /// if scope is not null, also show dimensions of arguments - virtual std::string DebugStringEx(const ScopeBase* scope) const; + virtual std::string DebugStringEx(const Scope* scope) const; std::string DebugString() const { return DebugStringEx(nullptr); } virtual bool SupportGPU() const { return false; } @@ -323,10 +323,16 @@ class ExecutionContext { virtual const Attribute& GetAttr(const std::string& name) const { auto iter = op_.Attrs().find(name); if (iter == op_.Attrs().end()) { - return op_.RuntimeAttrs().at(name); - } else { - return iter->second; + iter = op_.RuntimeAttrs().find(name); + PADDLE_ENFORCE_NE( + iter, + op_.RuntimeAttrs().end(), + platform::errors::NotFound("(%s) is not found in AttributeMap and " + "RuntimeAttributeMap of (%s) operator.", + name, + op_.Type())); } + return iter->second; } virtual bool HasInput(const std::string& name) const; @@ -621,7 +627,8 @@ class OperatorWithKernel : public OperatorBase { bool SupportsMKLDNN(proto::VarType::Type data_type) const; - bool SupportsKernelType(const OpKernelType& kernel_type) const; + bool SupportsKernelType(const OpKernelType& kernel_type, + const ExecutionContext& exe_ctx) const; bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx, proto::VarType::Type data_type) const; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.cc b/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.cc index 26416269c9e1f..dc36f40d9c6a3 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.cc @@ -50,46 +50,16 @@ std::unordered_set GetConsumerOps(Node* node) { return consumers; } -struct Hasher { - size_t operator()(const CinnSubGraphPtr& subgraph) const noexcept { - return std::hash()(reinterpret_cast(subgraph.get())); - } -}; -struct Comparator { - bool operator()(const CinnSubGraphPtr& first, - const CinnSubGraphPtr& second) const noexcept { - return first.get() == second.get(); - } -}; - -struct CinnSubGraph { - using CinnSubGraphPtr = std::shared_ptr; - // construct function - CinnSubGraph() {} - // construct function - CinnSubGraph(Node* op, bool subst) : substitute(subst) { Insert(op); } +void CinnSubGraph::Insert(Node* op) { + nodes.push_back(op); + node_set.insert(op); - void Insert(Node* op) { - nodes.push_back(op); - node_set.insert(op); - - auto producers = GetProducerOps(op); - for (auto producer : producers) { - input_nodes.insert(producer); - } - input_nodes.erase(op); + auto producers = GetProducerOps(op); + for (auto producer : producers) { + input_nodes.insert(producer); } - - int depth{0}; - int max_depth{0}, min_depth{INT_MAX}; - bool substitute{true}; - std::vector nodes; - std::unordered_set node_set; - std::unordered_set input_nodes; - - std::unordered_set 
producers; - std::unordered_set consumers; -}; + input_nodes.erase(op); +} void CinnSubgraphDetector::DoOpFusion() { // sort node from input to output @@ -183,7 +153,7 @@ void CinnSubgraphDetector::DoSubGraphFusion() { continue; } // do fusion - update |= FuseSubGraph(&subgraph); + update |= FuseSubGraph(subgraph); } if (!update) { break; @@ -191,8 +161,8 @@ void CinnSubgraphDetector::DoSubGraphFusion() { } } -bool CinnSubgraphDetector::FuseSubGraph(CinnSubGraphPtr* subgraph_ptr) { - auto producer = *subgraph_ptr; +bool CinnSubgraphDetector::FuseSubGraph(CinnSubGraphPtr subgraph_ptr) { + auto producer = subgraph_ptr; auto& consumers = producer->consumers; std::vector candidates; for (auto& consumer : consumers) { @@ -276,11 +246,11 @@ bool CinnSubgraphDetector::FuseSubGraph(CinnSubGraphPtr* subgraph_ptr) { bool CinnSubgraphDetector::IsDependency( const CinnSubGraphPtr& producer_g, const CinnSubGraphPtr& consumer, - const std::unordered_set& consumers) { + const std::unordered_set& consumers) { std::queue candidates; candidates.push(consumer); - std::unordered_set visited_set; + std::unordered_set visited_set; while (!candidates.empty()) { auto& candidate = candidates.front(); candidates.pop(); @@ -303,12 +273,12 @@ bool CinnSubgraphDetector::IsDependency( bool CinnSubgraphDetector::IsDependencySimplify( const CinnSubGraphPtr& producer_g, const CinnSubGraphPtr& consumer, - const std::unordered_set& consumers) { + const std::unordered_set& consumers) { std::queue candidates; candidates.push(consumer); // check upper bound. int check_upper_depth = producer_g->max_depth; - std::unordered_set visited_set; + std::unordered_set visited_set; while (!candidates.empty()) { auto& candidate = candidates.front(); candidates.pop(); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.h b/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.h index 1eb3ebbe62fca..e8ff3915c8511 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.h @@ -31,10 +31,32 @@ namespace paddle2cinn { using Node = ir::Node; using Graph = ir::Graph; -struct Hasher; -struct Comparator; +/* + * + * + */ struct CinnSubGraph; using CinnSubGraphPtr = std::shared_ptr; + +struct CinnSubGraph { + // construct function + CinnSubGraph() {} + // construct function + CinnSubGraph(Node *op, bool subst) : substitute(subst) { Insert(op); } + void Insert(Node *op); + + int depth{0}; + int max_depth{0}; + int min_depth{INT_MAX}; + bool substitute{true}; + std::vector nodes; + std::unordered_set node_set; + std::unordered_set input_nodes; + + std::unordered_set producers; + std::unordered_set consumers; +}; + /* * Detect the nodes in a subgraph that meet some conditions. This class doesn't * modify the graph. @@ -55,16 +77,14 @@ class CinnSubgraphDetector { void BuildSubGraph(); // SubGraph Fusion void DoSubGraphFusion(); - bool FuseSubGraph(CinnSubGraphPtr *); + bool FuseSubGraph(CinnSubGraphPtr); // check exist depency. 
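// The dependency test declared just below is a breadth-first search over
// subgraph consumers: fusing producer into consumer is only safe if the
// producer cannot be reached again through some other consumer chain,
// since that would introduce a cycle. A minimal sketch over a simplified
// node type (an assumption, not the real CinnSubGraph):
#include <memory>
#include <queue>
#include <unordered_set>
struct ToySubGraph {
  std::unordered_set<std::shared_ptr<ToySubGraph>> consumers;
};
using ToySubGraphPtr = std::shared_ptr<ToySubGraph>;
bool ToyHasPath(const ToySubGraphPtr& from, const ToySubGraphPtr& target) {
  std::queue<ToySubGraphPtr> pending;
  // std::hash and operator== on shared_ptr already give pointer identity,
  // which is why the custom Hasher/Comparator could be deleted above.
  std::unordered_set<ToySubGraphPtr> visited;
  pending.push(from);
  while (!pending.empty()) {
    ToySubGraphPtr cur = pending.front();
    pending.pop();
    if (!visited.insert(cur).second) continue;  // already explored
    for (const auto& next : cur->consumers) {
      if (next == target) return true;
      pending.push(next);
    }
  }
  return false;
}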
- bool IsDependency( - const CinnSubGraphPtr &, - const CinnSubGraphPtr &, - const std::unordered_set &); - bool IsDependencySimplify( - const CinnSubGraphPtr &, - const CinnSubGraphPtr &, - const std::unordered_set &); + bool IsDependency(const CinnSubGraphPtr &, + const CinnSubGraphPtr &, + const std::unordered_set &); + bool IsDependencySimplify(const CinnSubGraphPtr &, + const CinnSubGraphPtr &, + const std::unordered_set &); private: Graph *graph_; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 7f08fc9b4e22c..b87a294878051 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -38,17 +38,6 @@ class Variable; namespace paddle { namespace framework { - -// TODO(zhiqiu): add more function in base class -class ScopeBase { - public: - /// Find a variable in the scope or any of its ancestors. Returns - /// nullptr if cannot find. - /// Caller doesn't own the returned Variable. - virtual Variable* FindVar(const std::string& name) const = 0; - virtual ~ScopeBase() {} -}; - /** * @brief Scope that manage all variables. * @@ -57,7 +46,7 @@ class ScopeBase { * One net can run in different scopes and update different variable in the * scope. */ -class Scope : public ScopeBase { +class Scope { public: Scope() {} ~Scope(); diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index f5ae14bbf6109..18cca8739ec0f 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -79,7 +79,7 @@ class BKCLCommunicator; namespace framework { class LoDRankTable; -class ScopeBase; +class Scope; class ReaderHolder; class Scope; } // namespace framework diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index 7ed6e93ec7c7c..6d4f7c347b097 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -102,7 +102,10 @@ class DygraphExecutionContext : public framework::ExecutionContext { } bool HasAttr(const std::string& name) const override { - return attrs_.count(name) != 0 || default_attrs_.count(name) != 0; + if (attrs_.find(name) == attrs_.end()) { + return default_attrs_.find(name) != default_attrs_.end(); + } + return true; } const framework::AttributeMap& Attrs() const override { return attrs_; } diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 30cf0e82e9ff4..1f70bcf4f428a 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -25,6 +25,9 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_op_list.h" +#endif #include "paddle/fluid/framework/library_type.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" @@ -185,13 +188,29 @@ PreparedOp PrepareImpl( phi::KernelSignature kernel_signature; phi::KernelKey phi_kernel_key; std::string phi_kernel_name; + +// NOTE(jiahongyu): The registered MKLDNN kernel have library_type = +// LibraryType::kMKLDNN and data_layout_ = DataLayout::kMKLDNN. But the default +// values are kPlain, so we need to modify the library_type and data_layout_ +// here. 
There are three statements in if condition: The first statement checks +// whether library_type_ are changed by other high priority backends; the second +// checks whether this op has specific implementation; the third checks whether +// mkldnn kernel can be used. +#ifdef PADDLE_WITH_MKLDNN + if (expected_kernel_key.library_type_ == framework::LibraryType::kPlain && + !paddle::platform::in_mkldnn_white_list(op.Type()) && + op.CanMKLDNNBeUsed(dygraph_exe_ctx, expected_kernel_key.data_type_)) { + expected_kernel_key.library_type_ = framework::LibraryType::kMKLDNN; + expected_kernel_key.data_layout_ = framework::DataLayout::kMKLDNN; + } +#endif + #if defined(PADDLE_WITH_XPU) bool is_xpu_unsupport = paddle::platform::is_xpu_place(expected_kernel_key.place_) && !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) || paddle::platform::is_in_xpu_black_list(op.Type()); - #endif bool has_phi_kernel = false; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 42126b5048e68..4834039d64f15 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2174,6 +2174,7 @@ USE_TRT_CONVERTER(flatten); USE_TRT_CONVERTER(flatten_contiguous_range); USE_TRT_CONVERTER(matmul); USE_TRT_CONVERTER(matmul_v2); +USE_TRT_CONVERTER(bmm); USE_TRT_CONVERTER(conv2d); USE_TRT_CONVERTER(relu); USE_TRT_CONVERTER(exp); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index ed6508929ca1f..5e9e6d8f2c4f1 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -4,6 +4,7 @@ list( CONVERT_FILES matmul_op.cc matmul_v2_op.cc + bmm_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/bmm_op.cc b/paddle/fluid/inference/tensorrt/convert/bmm_op.cc new file mode 100644 index 0000000000000..4f4751d8ca977 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/bmm_op.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class BMMOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + nvinfer1::ILayer* layer = nullptr; + + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); + auto output_name = op_desc.Output("Out")[0]; + + layer = TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input1, + nvinfer1::MatrixOperation::kNONE, + *input2, + nvinfer1::MatrixOperation::kNONE); + + RreplenishLayerAndOutput(layer, "bmm", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(bmm, BMMOpConverter); diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc index bba2e84e32b9f..4c5944e79451c 100644 --- a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc @@ -105,10 +105,287 @@ nvinfer1::DimsExprs InstanceNormInferMeta( return x_dims; } +inline const nvinfer1::IDimensionExpr* CalcOutputSize( + const nvinfer1::IDimensionExpr* input_size, + const nvinfer1::IDimensionExpr* filter_size, + const nvinfer1::IDimensionExpr* dilation, + const nvinfer1::IDimensionExpr* padding1, + const nvinfer1::IDimensionExpr* padding2, + const nvinfer1::IDimensionExpr* stride, + nvinfer1::IExprBuilder& expr_builder // NOLINT +) { + // dkernel = dilation * (filter_size - 1) + 1; + const nvinfer1::IDimensionExpr* dkernel = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kPROD, + *dilation, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUB, + *filter_size, + *expr_builder.constant(1))), + *expr_builder.constant(1)); + + // output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1; + const nvinfer1::IDimensionExpr* tmp = expr_builder.operation( + nvinfer1::DimensionOperation::kSUB, + *expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, *input_size, *padding1), + *padding2), + *dkernel); + + const nvinfer1::IDimensionExpr* output_size = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, *tmp, *stride), + *expr_builder.constant(1)); + return output_size; +} + +nvinfer1::DimsExprs UnflodInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + PADDLE_ENFORCE_EQ( + nb_inputs, + 1, + phi::errors::InvalidArgument("inputs of unfold should be equal to 1, " + "But received (%s)", + nb_inputs)); + + const nvinfer1::DimsExprs in_dims = inputs[0]; + std::vector out_dims; + out_dims.push_back(in_dims.d[0]); + + auto kernel_sizes = + PADDLE_GET_CONST(std::vector, op_desc.GetAttr("kernel_sizes")); + auto dilations = + PADDLE_GET_CONST(std::vector, op_desc.GetAttr("dilations")); + auto paddings = + PADDLE_GET_CONST(std::vector, 
op_desc.GetAttr("paddings")); + auto strides = PADDLE_GET_CONST(std::vector, op_desc.GetAttr("strides")); + + // output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; + const nvinfer1::IDimensionExpr* output_channels = expr_builder.operation( + nvinfer1::DimensionOperation::kPROD, + *in_dims.d[1], + *expr_builder.operation(nvinfer1::DimensionOperation::kPROD, + *expr_builder.constant(kernel_sizes[0]), + *expr_builder.constant(kernel_sizes[1]))); + out_dims.push_back(output_channels); + + const nvinfer1::IDimensionExpr* output_height = + CalcOutputSize(in_dims.d[2], + expr_builder.constant(kernel_sizes[0]), + expr_builder.constant(dilations[0]), + expr_builder.constant(paddings[0]), + expr_builder.constant(paddings[2]), + expr_builder.constant(strides[0]), + expr_builder); + const nvinfer1::IDimensionExpr* output_width = + CalcOutputSize(in_dims.d[3], + expr_builder.constant(kernel_sizes[1]), + expr_builder.constant(dilations[1]), + expr_builder.constant(paddings[1]), + expr_builder.constant(paddings[3]), + expr_builder.constant(strides[1]), + expr_builder); + + const nvinfer1::IDimensionExpr* output_col_length = expr_builder.operation( + nvinfer1::DimensionOperation::kPROD, *output_height, *output_width); + + out_dims.push_back(output_col_length); + nvinfer1::DimsExprs output; + output.nbDims = out_dims.size(); + for (size_t i = 0; i < out_dims.size(); i++) output.d[i] = out_dims[i]; + return output; +} + +nvinfer1::DimsExprs ScatterNdAddInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + PADDLE_ENFORCE_EQ(nb_inputs, + 3, + phi::errors::InvalidArgument( + "inputs of scatter_nd_add should be equal to 3, " + "But received (%s)", + nb_inputs)); + const nvinfer1::DimsExprs ref_dims = inputs[0]; + return ref_dims; +} + +nvinfer1::DimsExprs UnchangedInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + PADDLE_ENFORCE_EQ(nb_inputs, + 1, + phi::errors::InvalidArgument( + "inputs of UnchangedInferMeta should be equal to 1, " + "But received (%s)", + nb_inputs)); + return inputs[0]; +} + +nvinfer1::DimsExprs Pad3dInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + const nvinfer1::DimsExprs x_dim = inputs[0]; + + nvinfer1::DimsExprs out_dims; + out_dims.nbDims = x_dim.nbDims; + + out_dims.d[0] = x_dim.d[0]; + + auto paddings = + PADDLE_GET_CONST(std::vector, op_desc.GetAttr("paddings")); + auto data_format = + PADDLE_GET_CONST(std::string, op_desc.GetAttr("data_format")); + + if (data_format == "NCDHW") { + out_dims.d[1] = x_dim.d[1]; + } else { + out_dims.d[4] = x_dim.d[4]; + } + + if (data_format == "NCDHW") { + // depth + out_dims.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[2], + *expr_builder.constant(paddings[4])), + *expr_builder.constant(paddings[5])); + // height + out_dims.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[3], + *expr_builder.constant(paddings[2])), + *expr_builder.constant(paddings[3])); + // width + out_dims.d[4] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + 
*expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[4], + *expr_builder.constant(paddings[0])), + *expr_builder.constant(paddings[1])); + } else { // NDHWC + // depth + out_dims.d[1] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[1], + *expr_builder.constant(paddings[4])), + *expr_builder.constant(paddings[5])); + // height + out_dims.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[2], + *expr_builder.constant(paddings[2])), + *expr_builder.constant(paddings[3])); + // width + out_dims.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[3], + *expr_builder.constant(paddings[0])), + *expr_builder.constant(paddings[1])); + } + return out_dims; +} + +nvinfer1::DimsExprs PNormInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + const nvinfer1::DimsExprs x_dim = inputs[0]; + std::vector reduce_dims; + std::vector keep_dims; + + bool asvector = PADDLE_GET_CONST(bool, op_desc.GetAttr("asvector")); + bool keepdim = PADDLE_GET_CONST(bool, op_desc.GetAttr("keepdim")); + int axis = PADDLE_GET_CONST(int, op_desc.GetAttr("axis")); + + if (asvector) { + reduce_dims.emplace_back(expr_builder.constant(1)); + keep_dims.emplace_back(expr_builder.constant(1)); + if (keepdim) { + for (int i = 1; i < x_dim.nbDims; ++i) { + keep_dims.emplace_back(expr_builder.constant(1)); + } + } + } else { + if (axis < 0) axis = x_dim.nbDims + axis; + for (int i = 0; i < x_dim.nbDims; ++i) { + if (i != axis) reduce_dims.emplace_back(x_dim.d[i]); + } + if (reduce_dims.size() == 0) { + reduce_dims.emplace_back(expr_builder.constant(1)); + } + } + keep_dims[axis] = expr_builder.constant(1); + + nvinfer1::DimsExprs output; + if (keepdim) { + output.nbDims = keep_dims.size(); + for (int i = 0; i < output.nbDims; i++) output.d[i] = keep_dims[i]; + } else { + output.nbDims = reduce_dims.size(); + for (int i = 0; i < output.nbDims; i++) output.d[i] = reduce_dims[i]; + } + return output; +} + +nvinfer1::DimsExprs GridSamplerInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + const nvinfer1::DimsExprs x_dims = inputs[0]; + const nvinfer1::DimsExprs grid_dims = inputs[1]; + + nvinfer1::DimsExprs output; + if (grid_dims.nbDims == 4) { + output.nbDims = 4; + output.d[0] = x_dims.d[0]; + output.d[1] = x_dims.d[1]; + output.d[2] = grid_dims.d[1]; + output.d[3] = grid_dims.d[2]; + } else { + output.nbDims = 4; + output.d[0] = x_dims.d[0]; + output.d[1] = x_dims.d[1]; + output.d[2] = grid_dims.d[1]; + output.d[3] = grid_dims.d[2]; + output.d[4] = grid_dims.d[3]; + } + return output; +} + PD_REGISTER_DYNAMIC_INFER_META_FN(gather_nd, GatherNdInferMeta); PD_REGISTER_DYNAMIC_INFER_META_FN(yolo_box, YoloBoxInferMeta); PD_REGISTER_DYNAMIC_INFER_META_FN(instance_norm, InstanceNormInferMeta); - +PD_REGISTER_DYNAMIC_INFER_META_FN(unfold, UnflodInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(scatter_nd_add, ScatterNdAddInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(inverse, UnchangedInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(pad3d, Pad3dInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(grid_sampler, GridSamplerInferMeta); } 
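// For reference, the scalar arithmetic that CalcOutputSize above encodes
// with IDimensionExpr nodes is the standard convolution output-size
// formula (the helper below is an illustrative sketch, not part of this
// diff):
#include <cassert>
int CalcOutputSizeScalar(int input_size, int filter_size, int dilation,
                         int padding1, int padding2, int stride) {
  assert(stride > 0);
  int dkernel = dilation * (filter_size - 1) + 1;
  return (input_size + padding1 + padding2 - dkernel) / stride + 1;
}
// Example: input 224, 3x3 kernel, dilation 1, symmetric padding 1,
// stride 2 -> (224 + 1 + 1 - 3) / 2 + 1 = 112.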
// namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h index 0bc2ff78b68df..c0ddaf5d983ef 100644 --- a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h @@ -23,6 +23,11 @@ namespace tensorrt { USE_TRT_DYNAMIC_INFER_META_FN(gather_nd); USE_TRT_DYNAMIC_INFER_META_FN(yolo_box); USE_TRT_DYNAMIC_INFER_META_FN(instance_norm); +USE_TRT_DYNAMIC_INFER_META_FN(unfold); +USE_TRT_DYNAMIC_INFER_META_FN(scatter_nd_add); +USE_TRT_DYNAMIC_INFER_META_FN(pad3d); +USE_TRT_DYNAMIC_INFER_META_FN(inverse); +USE_TRT_DYNAMIC_INFER_META_FN(grid_sampler); } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 904768d179d32..5fea48604ae11 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -327,6 +327,12 @@ struct SimpleOpTypeSetTeller : public Teller { } } + if (op_type == "bmm") { + if (!with_dynamic_shape) { + return false; + } + } + if (op_type == "matmul_v2") { if (!with_dynamic_shape) { return false; @@ -2115,6 +2121,7 @@ struct SimpleOpTypeSetTeller : public Teller { "mul", "matmul", "matmul_v2", + "bmm", "conv2d", "conv2d_fusion", "pool2d", @@ -2227,6 +2234,7 @@ struct SimpleOpTypeSetTeller : public Teller { "mul", "matmul", "matmul_v2", + "bmm", "conv2d", "conv2d_fusion", "pool2d", @@ -2353,6 +2361,14 @@ struct GenericPluginTeller : public Teller { if (!desc.HasAttr("iou_aware") && !desc.HasAttr("iou_aware_factor")) return false; } + if (op_type == "pad3d") { + auto pad3d_inputs = desc.Inputs(); + if (pad3d_inputs.find("Paddings") != pad3d_inputs.end()) { + if (desc.Input("Paddings").size() >= 1) { + return false; + } + } + } if (use_no_calib_int8) { return false; } else { diff --git a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu index d9afa475bff6a..e083e9633dc29 100644 --- a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu @@ -290,6 +290,10 @@ bool GenericPlugin::supportsFormatCombination( if (op_desc_.Type() == "gather_nd" || op_desc_.Type() == "yolo_box") { if (pos == 0) return in_out[pos].type == nvinfer1::DataType::kFLOAT; if (pos == 1) return in_out[pos].type == nvinfer1::DataType::kINT32; + } else if (op_desc_.Type() == "scatter_nd_add") { + if (pos == 0) return in_out[pos].type == nvinfer1::DataType::kFLOAT; + if (pos == 1) return in_out[pos].type == nvinfer1::DataType::kINT32; + if (pos == 2) return in_out[pos].type == nvinfer1::DataType::kFLOAT; } else { return in_out[pos].type == nvinfer1::DataType::kFLOAT; } diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc index 727d2576e57f7..db1f2953c742f 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -40,7 +40,12 @@ T GetValueFromStream(std::stringstream &ss); template <> std::string GetValueFromStream(std::stringstream &ss); -TEST(Analyzer_bert, profile) { profile(); } +TEST(Analyzer_bert, profile) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif + profile(); +} #ifdef PADDLE_WITH_MKLDNN 
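// The setenv pattern added to the tests above, factored out as a sketch
// (the helper name is hypothetical; the tests inline this directly).
// NVIDIA libraries honor NVIDIA_TF32_OVERRIDE=0 by disabling TF32 math on
// Ampere-class GPUs, which keeps FP32 accuracy comparisons deterministic;
// setenv() is POSIX-only, hence the _WIN32 guard.
#include <cstdlib>
static void DisableTF32ForTest() {
#if !defined(_WIN32)
  setenv("NVIDIA_TF32_OVERRIDE", "0", /*overwrite=*/1);
#endif
}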
TEST(Analyzer_bert, profile_mkldnn) { @@ -57,6 +62,9 @@ TEST(Analyzer_bert, profile_mkldnn_bf16) { // Check the fuse status TEST(Analyzer_bert, fuse_statis) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif auto cfg(SetConfig()); int num_ops; auto predictor = CreatePaddlePredictor(cfg); @@ -65,7 +73,12 @@ TEST(Analyzer_bert, fuse_statis) { LOG(INFO) << "num_ops: " << num_ops; } -TEST(Analyzer_bert, compare) { CompareNativeAndAnalysisWrapper(); } +TEST(Analyzer_bert, compare) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif + CompareNativeAndAnalysisWrapper(); +} #ifdef PADDLE_WITH_MKLDNN TEST(Analyzer_bert, compare_mkldnn) { auto use_mkldnn = true; @@ -75,6 +88,9 @@ TEST(Analyzer_bert, compare_mkldnn) { // Compare Deterministic result TEST(Analyzer_bert, compare_determine) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif auto cfg(SetConfig()); auto inputs = LoadInputData(); @@ -83,6 +99,9 @@ TEST(Analyzer_bert, compare_determine) { } TEST(Analyzer_bert, transfer_scope_cache) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif auto config(SetConfig()); std::vector input, output; diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc index 529bc0a8194ba..1efbe7cecdde4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.cc @@ -39,18 +39,31 @@ void profile(bool use_mkldnn = false, bool use_gpu = false) { FLAGS_num_threads); } -TEST(Analyzer_ernie, profile) { profile(); } +TEST(Analyzer_ernie, profile) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif + profile(); +} #ifdef PADDLE_WITH_MKLDNN TEST(Analyzer_ernie, profile_mkldnn) { profile(true, false); } #endif // Check the model by gpu #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -TEST(Analyzer_ernie, profile_gpu) { profile(false, true); } +TEST(Analyzer_ernie, profile_gpu) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif + profile(false, true); +} #endif // Check the fuse status TEST(Analyzer_Ernie, fuse_statis) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif AnalysisConfig cfg; SetConfig(&cfg); @@ -85,13 +98,21 @@ void compare(bool use_mkldnn = false) { reinterpret_cast(&cfg), inputs); } -TEST(Analyzer_ernie, compare) { compare(); } +TEST(Analyzer_ernie, compare) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif + compare(); +} #ifdef PADDLE_WITH_MKLDNN TEST(Analyzer_ernie, compare_mkldnn) { compare(true /* use_mkldnn */); } #endif // Compare Deterministic result TEST(Analyzer_Ernie, compare_determine) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif AnalysisConfig cfg; SetConfig(&cfg); auto pass_builder = cfg.pass_builder(); @@ -104,6 +125,9 @@ TEST(Analyzer_Ernie, compare_determine) { // Compare results TEST(Analyzer_Ernie, compare_results) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif AnalysisConfig cfg; SetConfig(&cfg); auto pass_builder = cfg.pass_builder(); @@ -150,6 +174,9 @@ TEST(Analyzer_Ernie_ipu, ipu_compare_determine) { // IPU: Compare results TEST(Analyzer_Ernie_ipu, ipu_compare_results) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif AnalysisConfig cfg; SetIpuConfig(&cfg); diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc 
b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc index 870c73fe1e6f6..aeefcf1059243 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc @@ -23,6 +23,9 @@ namespace paddle { namespace inference { void run(const AnalysisConfig& config, std::vector* out_data, int bs) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif auto predictor = CreatePaddlePredictor(config); auto input_names = predictor->GetInputNames(); @@ -222,6 +225,9 @@ std::shared_ptr InitPredictor() { } void run(paddle_infer::Predictor* predictor, std::vector* out_data) { +#if !defined(_WIN32) + setenv("NVIDIA_TF32_OVERRIDE", "0", 1); +#endif const int run_batch = 2; const int run_seq_len = 71; const int max_seq_len = 128; diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index d8fd433c0417c..3310bdbbe8254 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -21,9 +21,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/unary.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif namespace paddle { namespace operators { @@ -36,15 +33,6 @@ class AbsOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -86,15 +74,6 @@ class AbsGradOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 5160071486244..6f59e44d546fb 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -82,27 +82,18 @@ class ActivationGradOpMaker : public framework::SingleGradOpMaker { framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, const framework::OperatorWithKernel& oper, const std::string& name) { - framework::LibraryType library{framework::LibraryType::kPlain}; - framework::DataLayout layout = framework::DataLayout::kAnyLayout; auto data_type = oper.IndicateVarDataType(ctx, name); -// FIXME(liuwei1031) temporarily disable the code to unblock users -// TODO(liuwei1031) figure out the reason behind -// https://github.com/PaddlePaddle/Paddle/issues/16096 -// and re-enable this in the future -// #ifdef PADDLE_WITH_CUDA -// auto it1 = oper.Attrs().find("use_cudnn"); -// if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) { -// library = framework::LibraryType::kCUDNN; -// } -// #endif -#ifdef PADDLE_WITH_MKLDNN - if (library == framework::LibraryType::kPlain && - oper.CanMKLDNNBeUsed(ctx, data_type)) { - library = 
framework::LibraryType::kMKLDNN; - layout = framework::DataLayout::kMKLDNN; - } -#endif - return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + // FIXME(liuwei1031) temporarily disable the code to unblock users + // TODO(liuwei1031) figure out the reason behind + // https://github.com/PaddlePaddle/Paddle/issues/16096 + // and re-enable this in the future + // #ifdef PADDLE_WITH_CUDA + // auto it1 = oper.Attrs().find("use_cudnn"); + // if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) { + // library = framework::LibraryType::kCUDNN; + // } + // #endif + return framework::OpKernelType(data_type, ctx.GetPlace()); } class ActivationOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 4979ab0345200..4f134ff974637 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -197,16 +197,6 @@ framework::OpKernelType BatchNormOp::GetExpectedKernelType( platform::errors::InvalidArgument( "Variance input should be of float type")); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -396,18 +386,7 @@ framework::OpKernelType BatchNormGradOp::GetExpectedKernelType( platform::errors::InvalidArgument("gradient variable of Y is empty")); } - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index 7994dacf08794..997c017d3129c 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -30,15 +30,6 @@ class ClipOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -98,15 +89,6 @@ class ClipOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index a875f1fc8df9e..ae65930b86ac0 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -24,10 +24,6 @@ limitations under 
the License. */ #include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" -#ifdef PADDLE_WITH_MKLDNN -#include -#endif - namespace paddle { namespace operators { using Tensor = phi::DenseTensor; @@ -53,14 +49,6 @@ class ConcatOp : public framework::OperatorWithKernel { PADDLE_THROW(platform::errors::InvalidArgument( "All Inputs of Concat OP are Empty!")); } -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -127,19 +115,6 @@ class ConcatOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); - -#ifdef PADDLE_WITH_MKLDNN - // extra checking if attr "use_mkldnn" exist is needed because - // test_reverse_op is calling concat_grad kernel without setting - // "use_mkldnn" to any value - if (ctx.HasAttr("use_mkldnn") && - this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 42e5eb2a43820..c80cc2dc734c6 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -49,15 +49,6 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( } } #endif -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index 6f46e5bf2f8ed..4d620b5181ccb 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -18,9 +18,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/data_layout.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -199,15 +196,6 @@ class DataNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "bias input should be of float type")); } - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -508,18 +496,7 @@ class DataNormGradOp : public framework::OperatorWithKernel { "Y@GRAD can not be found for computation")); } - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index b1f0817539f17..c8289ab098a3a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -45,15 +45,6 @@ class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Out"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index afc06b0d9981b..23271352f6b7c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -32,15 +32,6 @@ class ElementwiseMulOp : public ElementwiseOp { const framework::ExecutionContext& ctx) const override { auto input_data_type = OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index a8e1da9f7945d..70bdd11977b21 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -156,15 +156,6 @@ class ElementwiseOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return 
framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -317,15 +308,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -371,15 +353,6 @@ class ElementwiseOpDoubleGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DOut"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -432,15 +405,6 @@ class ElementwiseOpDoubleGradWithoutDXDY input_data_type = OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "DDX", "DDY"); } - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -493,15 +457,6 @@ class ElementwiseOpTripleGrad : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { framework::proto::VarType::Type input_data_type; input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "D_DDOut"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 772ef09219817..09dc0f68cce2a 100644 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -24,6 +24,13 @@ namespace operators { class ExpandAsV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.device_context()); + } }; class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index fb82f0b6524ba..6bf40fd3bb6b8 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -37,15 +37,6 @@ class ExpandV2Op : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, 
input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -163,15 +154,6 @@ class ExpandV2GradOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 85aadcb07ad32..82c6b89063bea 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -104,15 +104,6 @@ class FillConstantOp : public framework::OperatorWithKernel { } } -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return kt; } }; diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 888e447d798f9..679256b47ca00 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -153,14 +153,6 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FusionGRUOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index b09c6a2959b94..93507160a5072 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -176,14 +176,6 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index e2ee27f2561e1..f418e48f7d9c8 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -60,16 +60,6 @@ class GaussianRandomOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto data_type = static_cast(ctx.Attr("dtype")); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.device_context(), 
- framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(data_type, ctx.device_context()); } diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index 15b0a04ab2f67..eb3c55711641e 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -36,14 +36,6 @@ class GeluOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } }; @@ -76,14 +68,6 @@ class GeluGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index ac50da83e6b78..257e513700b85 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -340,20 +340,6 @@ class InterpolateOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - // TODO(danqing): support other interp_method - // (https://github.com/PaddlePaddle/Paddle/pull/30016/files) - // NOTE(jiahy0825): currently only support interp_method = nearest or - // interp_method = bilinear - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index e9d0d718b9fb7..a3e7f46fecafe 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -444,20 +444,6 @@ class InterpolateV2Op : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - // TODO(danqing): support other interp_method - // (https://github.com/PaddlePaddle/Paddle/pull/30016/files) - // NOTE(jiahy0825): currently only support interp_method = nearest or - // interp_method = bilinear - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc index a4286aea07842..99da0b08af75b 100644 --- 
a/paddle/fluid/operators/log_softmax_op.cc +++ b/paddle/fluid/operators/log_softmax_op.cc @@ -33,15 +33,6 @@ class LogSoftmaxOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index b2ef8f0370e37..a9cfadf3b6455 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -225,16 +225,6 @@ class LRNOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } @@ -359,16 +349,6 @@ class LRNOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index f2900bea21c26..024aa7731c9c6 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -697,15 +697,6 @@ class MatMulOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -889,15 +880,6 @@ class MatMulOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 876a90e7b9674..21537b70a4dc8 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -135,15 +135,6 @@ class MatMulV2Op : public 
framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -210,15 +201,6 @@ class MatMulV2OpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index f998ca8a5ec0f..ee70a441a754f 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -36,15 +36,6 @@ class PReluOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -127,15 +118,6 @@ class PReluGradOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index cab04e43e8681..7416269e33dd2 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -31,15 +31,6 @@ class ScaleOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index b191f7cfa0011..445514ab9b050 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -30,15 +30,6 @@ class ShapeOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "Input"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - 
ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index ba96e92d3030b..7e98514cde370 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -39,15 +39,6 @@ class ShuffleChannelOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 6c63b2719f409..7cb76dc56cb8a 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -53,7 +53,6 @@ class SoftmaxOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "float16 can only be used on GPU/NPU/XPU/MLU and custom place")); } - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::CanCUDNNBeUsed(ctx)) { return framework::OpKernelType(input_data_type, @@ -62,15 +61,6 @@ class SoftmaxOp : public framework::OperatorWithKernel { framework::LibraryType::kCUDNN); } #endif -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_); } }; @@ -158,15 +148,6 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { framework::LibraryType::kCUDNN); } #endif -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_); } }; diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc index e9706f00ce889..d30320f9952ee 100644 --- a/paddle/fluid/operators/stack_op.cc +++ b/paddle/fluid/operators/stack_op.cc @@ -35,15 +35,6 @@ class StackOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index df62da0b56546..48024cbb3ca17 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -98,14 +98,6 @@ class TransposeOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef 
PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif auto &data_format = ctx.Attr<std::string>("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType(data_type, ctx.GetPlace(), layout_); @@ -202,14 +194,6 @@ class TransposeOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif std::string data_format = ctx.Attr<std::string>("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType(data_type, ctx.GetPlace(), layout_); @@ -360,14 +344,6 @@ class Transpose2OpGrad : public framework::OperatorWithKernel { framework::proto::VarType::Type data_type = OperatorWithKernel::IndicateVarDataType(ctx, framework::GradVarName("Out")); -#ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, data_type)) { - return framework::OpKernelType(data_type, - ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif std::string data_format = ctx.Attr<std::string>("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType(data_type, ctx.GetPlace(), layout_); diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index 8b0fe26eeaa30..a533b17fc175d 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include <vector> #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -32,6 +33,9 @@ inline void TransCompute(const int dim, phi::DenseTensor* out, const std::vector<int>& axis) { switch (dim) { + case 0: + phi::Copy(dev_ctx, in, dev_ctx.GetPlace(), false, out); + break; case 1: phi::funcs::Transpose<DeviceContext, T, 1> trans1; trans1(dev_ctx, in, out, axis); diff --git a/paddle/fluid/platform/mkldnn_op_list.h b/paddle/fluid/platform/mkldnn_op_list.h new file mode 100644 index 0000000000000..76f0a2affcc74 --- /dev/null +++ b/paddle/fluid/platform/mkldnn_op_list.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef PADDLE_WITH_MKLDNN + +#include <unordered_set> + +namespace paddle { +namespace platform { + +// NOTE(jiahongyu): Below ops have specific PADDLE_WITH_MKLDNN hard codes within +// the function GetExpectedKernelType, so we need to handle them through +// mkldnn_white_list and solve them one-by-one in the future. +// TODO(jiahongyu): Delete mkldnn_white_list and fully support +// PADDLE_WITH_MKLDNN of GetExpectedKernelType. +static const std::unordered_set<std::string> mkldnn_white_list = { + "cast", + "transfer_dtype", + "layer_norm", + "pad2d", + "pad3d", + "pool2d", + "pool2d_grad", + "slice", + "slice_grad", + "split", + "sum", + "sgd", + // NOTE(jiahongyu): squeeze MKLDNN kernels are disabled + // (https://github.com/PaddlePaddle/Paddle/pull/35781). If these MKLDNN + // kernels and codes are deleted in the future, attributes `use_mkldnn` + // should be removed from function declaration + "squeeze", + "squeeze_grad", + "squeeze2", + "squeeze2_grad", + // NOTE(jiahongyu): reshape and flatten have attribute use_mkldnn and they + // are registered in paddle, but they didn't change the ExpectedKernelType + // of tensor. Actually, mkldnn kernel of squeeze, reshape, and flatten + // should never be called. + "reshape", + "reshape_grad", + "reshape2", + "reshape2_grad", + "flatten", + "flatten_grad", + "flatten2", + "flatten2_grad", + // NOTE(jiahongyu): After fixing GetExpectedKernelType in ReduceOp, reduce + // series hard code can be deleted together. + "reduce_max", + "reduce_mean", + "reduce_mean_grad", + "reduce_min", + "reduce_sum", + "reduce_sum_grad", + // NOTE(jiahongyu): Below ops register kernel with customized_type_value, we + // need to analyze and solve them one-by-one. + "conv2d", + "conv2d_grad", + "depthwise_conv2d", + "depthwise_conv2d_grad", + "conv3d", + "conv3d_grad", + "prior_box", + "fc", + "mul", + "mul_grad", + "transpose2"}; + +inline bool in_mkldnn_white_list(const std::string& op_name) { + return mkldnn_white_list.find(op_name) != mkldnn_white_list.end(); +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc index 694f701b5ad9b..ec8561996681d 100644 --- a/paddle/fluid/platform/os_info.cc +++ b/paddle/fluid/platform/os_info.cc @@ -27,6 +27,7 @@ limitations under the License. */ #else #include #endif +#include "glog/logging.h" #include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h" #include "paddle/fluid/platform/macros.h" // import DISABLE_COPY_AND_ASSIGN @@ -115,6 +116,7 @@ bool SetCurrentThreadName(const std::string& name) { return false; } instance.SetCurrentThreadData(name); + VLOG(4) << __func__ << " " << name; return true; } diff --git a/paddle/fluid/platform/os_info.h b/paddle/fluid/platform/os_info.h index ef894fd3dc281..a827d063c152e 100644 --- a/paddle/fluid/platform/os_info.h +++ b/paddle/fluid/platform/os_info.h @@ -57,7 +57,7 @@ ThreadId GetCurrentThreadId(); // create/destory when using it. std::unordered_map<uint64_t, ThreadId> GetAllThreadIds(); -static constexpr const char* kDefaultThreadName = "unset"; +static constexpr const char* kDefaultThreadName = "unnamed"; // Returns kDefaultThreadName if SetCurrentThreadName is never called. 
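
Aside for reviewers (not part of the patch): with the per-op MKLDNN branches above deleted, ops that are *not* on mkldnn_white_list can get their MKLDNN kernel choice from one framework-side dispatch point. A minimal sketch of that pattern follows; the helper name ChooseKernelType and its signature are illustrative assumptions, and only in_mkldnn_white_list, CanMKLDNNBeUsed, and OpKernelType come from code actually touched by this diff.

#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_op_list.h"
#endif

// Illustrative only: one centralized check replacing the deleted per-op
// "#ifdef PADDLE_WITH_MKLDNN" blocks. Whitelisted ops keep their own
// handling inside their GetExpectedKernelType.
framework::OpKernelType ChooseKernelType(
    const framework::OperatorWithKernel& op,
    const framework::ExecutionContext& ctx,
    framework::proto::VarType::Type data_type) {
#ifdef PADDLE_WITH_MKLDNN
  if (!platform::in_mkldnn_white_list(op.Type()) &&
      op.CanMKLDNNBeUsed(ctx, data_type)) {
    return framework::OpKernelType(data_type,
                                   ctx.GetPlace(),
                                   framework::DataLayout::kMKLDNN,
                                   framework::LibraryType::kMKLDNN);
  }
#endif
  return framework::OpKernelType(data_type, ctx.GetPlace());
}
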
std::string GetCurrentThreadName(); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index c5540fa94ebf7..03a7b97af4df7 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -49,6 +49,14 @@ if(WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} graph_gpu_wrapper) endif() endif() +if(WITH_DISTRIBUTE + AND LINUX + AND NOT WITH_ASCEND_CL + AND NOT WITH_XPU + AND NOT WITH_CINN + AND NOT WITH_ROCM) + set(PYBIND_DEPS ${PYBIND_DEPS} paddle_rpc) +endif() if(WITH_GPU OR WITH_ROCM) set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard) @@ -218,6 +226,29 @@ if(WITH_PSCORE) set(PYBIND_SRCS fleet_py.cc ${PYBIND_SRCS}) endif() +if(WITH_DISTRIBUTE + AND LINUX + AND NOT WITH_ASCEND_CL + AND NOT WITH_XPU + AND NOT WITH_CINN + AND NOT WITH_ROCM) + if(WITH_ARM_BRPC) + set(DISTRIBUTE_COMPILE_FLAGS + "-faligned-new -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result" + ) + else() + set(DISTRIBUTE_COMPILE_FLAGS + "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result" + ) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() + endif() + set_source_files_properties(rpc.cc PROPERTIES COMPILE_FLAGS + ${DISTRIBUTE_COMPILE_FLAGS}) + set(PYBIND_SRCS rpc.cc ${PYBIND_SRCS}) +endif() + if(WITH_NCCL OR WITH_RCCL) list(APPEND PYBIND_SRCS nccl_wrapper_py.cc) endif() diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index f6ace5a9fefdb..dc85c17e6d6c5 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -195,8 +195,8 @@ static PyObject* tensor__add__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__add__", 0); { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -292,8 +292,8 @@ static PyObject* tensor__sub__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__sub__", 0); { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -385,8 +385,8 @@ static PyObject* tensor__rsub__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__rsub__", 0); { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -480,11 +480,12 @@ static PyObject* tensor__mul__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__mul__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = 
full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -581,11 +582,12 @@ static PyObject* tensor__div__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__div__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -696,11 +698,12 @@ static PyObject* tensor__rdiv__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__rdiv__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -809,11 +812,12 @@ static PyObject* tensor__gt__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__gt__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -894,11 +898,12 @@ static PyObject* tensor__ge__method(TensorObject* self, CastPyArg2Scalar(other_obj, "__ge__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -974,17 +979,18 @@ static PyObject* tensor__mod__method(TensorObject* self, other_tensor = full_ad_func(self_tensor.shape(), phi::Scalar(other_float), self_tensor.dtype(), - place); + self_tensor.place()); } else if (!PyCheckTensor(other_obj)) { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__mod__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { 
eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -1056,17 +1062,21 @@ static PyObject* tensor__matmul__method(TensorObject* self, paddle::experimental::Tensor other_tensor; if (has_other_float) { eager_gil_scoped_release guard; - other_tensor = - full_ad_func({1}, phi::Scalar(other_float), self_tensor.dtype(), place); + other_tensor = full_ad_func({1}, + phi::Scalar(other_float), + self_tensor.dtype(), + self_tensor.place()); } else if (!PyCheckTensor(other_obj)) { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__matmul__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, self_tensor.dtype(), place); + other_tensor = + full_ad_func({1}, value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -1159,17 +1169,18 @@ static PyObject* tensor__lt__method(TensorObject* self, other_tensor = full_ad_func(self_tensor.shape(), phi::Scalar(other_float), self_tensor.dtype(), - place); + self_tensor.place()); } else if (!PyCheckTensor(other_obj)) { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__lt__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); @@ -1244,17 +1255,18 @@ static PyObject* tensor__le__method(TensorObject* self, other_tensor = full_ad_func(self_tensor.shape(), phi::Scalar(other_float), self_tensor.dtype(), - place); + self_tensor.place()); } else if (!PyCheckTensor(other_obj)) { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__le__", 0); if (PyComplex_Check(other_obj)) { eager_gil_scoped_release guard; - other_tensor = full_ad_func({1}, value, DataType::COMPLEX64, place); + other_tensor = + full_ad_func({1}, value, DataType::COMPLEX64, self_tensor.place()); } else { eager_gil_scoped_release guard; - other_tensor = - full_ad_func(self_tensor.shape(), value, self_tensor.dtype(), place); + other_tensor = full_ad_func( + self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } } else { other_tensor = CastPyArg2Tensor(other_obj, 0); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 7486ce9402d75..71f1cf818428e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -182,6 +182,12 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" #endif +#if defined(__linux__) && !defined(PADDLE_WITH_XPU) && \ + !defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_CINN) && \ + !defined(PADDLE_WITH_HIP) +#include "paddle/fluid/pybind/rpc.h" +#endif + #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/fluid/pybind/eager_utils.h" @@ -848,6 +854,8 @@ PYBIND11_MODULE(libpaddle, m) { m.def("_set_paddle_lib_path", &paddle::platform::dynload::SetPaddleLibPath); + m.def("set_current_thread_name", &paddle::platform::SetCurrentThreadName); + m.def("_promote_types_if_complex_exists", &paddle::framework::PromoteTypesIfComplexExists); @@ -2602,6 +2610,21 @@ All parameter, weight, gradient are variables in Paddle. BindGraphGpuWrapper(&m); #endif #endif +#if defined(__linux__) && !defined(PADDLE_WITH_XPU) && \ + !defined(PADDLE_WITH_ASCEND_CL) && !defined(PADDLE_WITH_CINN) && \ + !defined(PADDLE_WITH_HIP) + BindWorkerInfo(&m); + BindFuture(&m); + InitAndSetAgentInstance(&m); + InvokeRpc(&m); + StartWorker(&m); + StartClient(&m); + StopWorker(&m); + GetWorkerInfo(&m); + GetWorkerInfoByRank(&m); + GetCurrentWorkerInfo(&m); + GetAllWorkerInfos(&m); +#endif } } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/rpc.cc b/paddle/fluid/pybind/rpc.cc new file mode 100644 index 0000000000000..ee35e9c3a4164 --- /dev/null +++ b/paddle/fluid/pybind/rpc.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
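
Aside (not part of the patch): the rpc.cc bindings that follow drive a C++ RpcAgent whose lifecycle, reconstructed from the binding lambdas below, looks roughly like this sketch. The WorkerInfo constructor order and the endpoint values are assumptions inferred from the bound readonly fields; treat every name here as illustrative except those that appear verbatim in the bindings.

#include <future>
#include <memory>
#include <string>
#include <vector>

#include "paddle/fluid/distributed/rpc/rpc_agent.h"

using paddle::distributed::RpcAgent;
using paddle::distributed::WorkerInfo;

void RunWorker0Sketch() {
  // Assumed ctor order (name, rank, ip, port), mirroring WorkerInfo's
  // readonly fields name_/id_/ip_/port_ bound below.
  std::vector<WorkerInfo> infos = {WorkerInfo("worker0", 0, "127.0.0.1", 8002),
                                   WorkerInfo("worker1", 1, "127.0.0.1", 8003)};
  auto agent = std::make_shared<RpcAgent>("worker0", infos);
  agent->SetAgentInstance(agent);  // what init_and_set_agent_instance does
  agent->StartWorker();            // begin serving incoming requests
  // InvokeRpc(serialized python callable, target worker name, timeout),
  // returning the std::future<std::string> that FutureWrapper wraps:
  std::future<std::string> fut =
      agent->InvokeRpc("<pickled fn>", "worker1", /*timeout_ms=*/5000);
  fut.wait();  // Python-side Future.wait() does this with the GIL released
  agent->Stop();
}
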
+ +#include "paddle/fluid/pybind/rpc.h" +#include "paddle/fluid/distributed/rpc/future_wrapper.h" +#include "paddle/fluid/distributed/rpc/python_rpc_handler.h" +#include "paddle/fluid/distributed/rpc/rpc_agent.h" + +namespace py = pybind11; +using paddle::distributed::FutureWrapper; +using paddle::distributed::PythonRpcHandler; +using paddle::distributed::RpcAgent; +using paddle::distributed::WorkerInfo; +namespace paddle { +namespace pybind { + +void BindWorkerInfo(py::module* m) { + py::class_<WorkerInfo>(*m, "WorkerInfo") + .def(py::init<std::string, uint32_t, std::string, uint32_t>()) + .def_readonly("name", &WorkerInfo::name_) + .def_readonly("rank", &WorkerInfo::id_) + .def_readonly("ip", &WorkerInfo::ip_) + .def_readonly("port", &WorkerInfo::port_) + .def("__str__", &WorkerInfo::to_string) + .def("__repr__", &WorkerInfo::to_string); +} +void BindFuture(py::module* m) { + py::class_<FutureWrapper, std::shared_ptr<FutureWrapper>>(*m, "Future") + .def(py::init<>()) + .def("wait", + &FutureWrapper::wait, + py::call_guard<py::gil_scoped_release>()); +} +void InitAndSetAgentInstance(py::module* m) { + m->def( + "init_and_set_agent_instance", + [](const std::string& name, const std::vector<WorkerInfo>& infos) { + auto instance = std::make_shared<RpcAgent>(name, infos); + instance->SetAgentInstance(instance); + }, + py::call_guard<py::gil_scoped_release>(), + py::arg("name"), + py::arg("infos")); +} +void InvokeRpc(py::module* m) { + m->def( + "invoke_rpc", + [](const std::string& name, const std::string& py_func, int timeout_ms) { + auto instance = RpcAgent::RpcAgentInstance(); + return std::make_shared<FutureWrapper>( + instance->InvokeRpc(py_func, name, timeout_ms)); + }, + py::call_guard<py::gil_scoped_release>(), + py::arg("to"), + py::arg("py_func"), + py::arg("timeout_ms")); +} +void StartWorker(py::module* m) { + m->def( + "rpc_start_worker", + []() { + auto instance = RpcAgent::RpcAgentInstance(); + instance->StartWorker(); + }, + py::call_guard<py::gil_scoped_release>()); +} +void StartClient(py::module* m) { + m->def( + "rpc_start_client", + []() { + auto instance = RpcAgent::RpcAgentInstance(); + instance->StartClient(); + }, + py::call_guard<py::gil_scoped_release>()); +} +void StopWorker(py::module* m) { + m->def( + "rpc_stop_worker", + []() { + auto instance = RpcAgent::RpcAgentInstance(); + instance->Stop(); + }, + py::call_guard<py::gil_scoped_release>()); +} +void GetWorkerInfo(py::module* m) { + m->def( + "rpc_get_worker_info", + [](const std::string& name) { + auto instance = RpcAgent::RpcAgentInstance(); + return instance->GetWorkerInfo(name); + }, + py::call_guard<py::gil_scoped_release>(), + py::arg("name")); +} +void GetWorkerInfoByRank(py::module* m) { + m->def( + "rpc_get_worker_info_by_rank", + [](uint32_t rank) { + auto instance = RpcAgent::RpcAgentInstance(); + return instance->GetWorkerInfoById(rank); + }, + py::call_guard<py::gil_scoped_release>(), + py::arg("rank")); +} +void GetCurrentWorkerInfo(py::module* m) { + m->def( + "rpc_get_current_worker_info", + []() { + auto instance = RpcAgent::RpcAgentInstance(); + return instance->GetCurrentWorkerInfo(); + }, + py::call_guard<py::gil_scoped_release>()); +} +void GetAllWorkerInfos(py::module* m) { + m->def( + "rpc_get_all_worker_infos", + []() { + auto instance = RpcAgent::RpcAgentInstance(); + return instance->GetAllWorkerInfos(); + }, + py::call_guard<py::gil_scoped_release>()); +} +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/rpc.h b/paddle/fluid/pybind/rpc.h new file mode 100644 index 0000000000000..7bd331387439e --- /dev/null +++ b/paddle/fluid/pybind/rpc.h @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindWorkerInfo(py::module* m); +void BindFuture(py::module* m); +void InitAndSetAgentInstance(py::module* m); +void InvokeRpc(py::module* m); +void StartWorker(py::module* m); +void StartClient(py::module* m); +void StopWorker(py::module* m); +void GetWorkerInfo(py::module* m); +void GetWorkerInfoByRank(py::module* m); +void GetCurrentWorkerInfo(py::module* m); +void GetAllWorkerInfos(py::module* m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index 015c1be57370a..176713b71bbcf 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -99,7 +99,6 @@ struct KernelKeyParser : ArgsIterator { inline void AssignKernelKeySet(const phi::TensorBase& tensor) { key_set.backend_set = key_set.backend_set | detail::GetTensorBackendSet(tensor); - // TODO(chenweihang): select multi layout and dtype phi::DataLayout tensor_layout = tensor.layout(); key_set.layout = tensor_layout > key_set.layout ? tensor_layout : key_set.layout; diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index b578afa7c2b85..10b859fdac260 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -40,7 +40,7 @@ const std::unordered_set standard_kernel_suffixs({ * after 2.0, and can no longer be occupied by the previously abandoned ops. * They are marked here uniformly. 
*/ -const std::unordered_set<std::string> deprecated_op_names( +static const std::unordered_set<std::string> deprecated_op_names( {"diag", "flatten", "flatten_grad", diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 71256bdabaa67..480882550dbca 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #include "paddle/phi/core/compat/convert_utils.h" #endif +#include "paddle/phi/core/compat/op_utils.h" DECLARE_bool(enable_api_kernel_fallback); @@ -45,6 +46,17 @@ KernelFactory& KernelFactory::Instance() { return g_op_kernel_factory; } +bool KernelFactory::HasCompatiblePhiKernel(const std::string& op_type) const { + if (deprecated_op_names.find(op_type) == deprecated_op_names.end()) { + if (phi::OpUtilsMap::Instance().Contains(op_type)) { + return true; + } else if (kernels_.find(op_type) != kernels_.end()) { + return true; + } + } + return false; +} + const Kernel& KernelFactory::SelectKernel(const std::string& kernel_name, const KernelKey& kernel_key) const { auto iter = kernels_.find(kernel_name); diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index 8e98c276646d9..ed9280fa475bf 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -272,9 +272,7 @@ class KernelFactory { KernelNameMap& kernels() { return kernels_; } - bool HasCompatiblePhiKernel(const std::string& op_type) const { - return kernels_.find(TransToPhiKernelName(op_type)) != kernels_.end(); - } + bool HasCompatiblePhiKernel(const std::string& op_type) const; KernelResult SelectKernelOrThrowError(const std::string& kernel_name, const KernelKey& kernel_key, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 8d085a05a4c91..6cdf0b9345fe0 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -3713,7 +3713,7 @@ void TileInferMeta(const MetaTensor& x, repeat_times_data.size())); PADDLE_ENFORCE_GE( repeat_times_data.size(), - 1, + 0, errors::InvalidArgument( "The size of the shape of input 'repeat_times' for tile op " "must be positive integers, but the value received is %d.", @@ -3746,7 +3746,7 @@ void TileInferMeta(const MetaTensor& x, } out->set_dims(phi::make_ddim(out_shape)); - if (out_shape[0] == x_dims[0]) { + if (out_rank > 0 && (out_shape[0] == x_dims[0])) { out->share_lod(x); } out->set_dtype(x.dtype()); diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc index 050c61596fee5..a4d43ef8fbe89 100644 --- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc @@ -97,14 +97,14 @@ inline void ModulatedDeformableCol2imCPUKernel( width); *(grad_im + cur_bottom_grad_pos) = - *(grad_im + cur_bottom_grad_pos) + (weight * cur_top_grad); + *(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad; } } } } } -template <typename T, typename MT, typename Context> +template <typename T, typename Context> void ModulatedDeformableCol2im(const Context& dev_ctx, const T* data_col, const T* data_offset, @@ -116,7 +116,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, const std::vector<int>& stride, const std::vector<int>& dilation, const int deformable_group, - MT* grad_im) { + T* grad_im) { int channel_per_deformable_group = im_shape[0] / deformable_group; int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; @@ -222,22 +222,22 @@ void ModulatedDeformableCol2imCoordCPUKernel( if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || 
inv_w >= width) { inv_h = inv_w = -2; } else { - mval += data_col_ptr[col_pos] * funcs::DmcnIm2colBilinear( - data_im_ptr + cnt * height * width, - width, - height, - width, - inv_h, - inv_w); + mval += data_col_ptr[col_pos] * + funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); } const T weight = - DmcnGetCoordinateWeight(inv_h, - inv_w, - height, - width, - data_im_ptr + cnt * height * width, - width, - bp_dir); + DmcnGetCoordinateWeight(inv_h, + inv_w, + height, + width, + data_im_ptr + cnt * height * width, + width, + bp_dir); if (data_mask_ptr) { const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); diff --git a/paddle/phi/kernels/cpu/transpose_kernel.cc b/paddle/phi/kernels/cpu/transpose_kernel.cc index a2f5aa2a29795..583df78cc25f3 100644 --- a/paddle/phi/kernels/cpu/transpose_kernel.cc +++ b/paddle/phi/kernels/cpu/transpose_kernel.cc @@ -35,6 +35,9 @@ void TransposeKernel(const Context& ctx, } int rank = axis.size(); switch (rank) { + case 0: + phi::Copy(ctx, x, ctx.GetPlace(), false, out); + break; case 1: funcs::Transpose trans1; trans1(ctx, x, out, axis); diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc index 253a66adfc6a2..48858fa59390e 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" + #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" namespace phi { namespace funcs { @@ -82,8 +82,8 @@ inline void ModulatedDeformableIm2colCPUKernel( const T h_im = h_in + i * dilation_h + offset_h; const T w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { - val = DmcnIm2colBilinear( - data_im_ptr, width, height, width, h_im, w_im); + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); } *data_col_ptr = val; if (data_mask_ptr) { diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu index 0d5076a4937c3..48105d1f517e9 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu @@ -14,9 +14,6 @@ #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/device_context.h" namespace phi { namespace funcs { @@ -54,8 +51,6 @@ __global__ void ModulatedDeformableIm2colGpuKernel( T* data_col) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; - - using MT = typename phi::dtype::MPTypeTrait::Type; for (size_t i = index; i < nthreads; i += offset) { const int w_col = i % width_col; const int h_col = (i / width_col) % height_col; @@ -90,22 +85,22 @@ __global__ void ModulatedDeformableIm2colGpuKernel( ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; - const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); - const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); - MT val = static_cast(0); - const MT h_im = h_in + i * dilation_h + offset_h; - const MT w_im = w_in + j * dilation_w + offset_w; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T 
offset_w = data_offset_ptr[data_offset_w_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { - val = DmcnIm2colBilinear( - data_im_ptr, width, height, width, h_im, w_im); + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); } + *data_col_ptr = val; if (data_mask_ptr) { const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; - const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); - val *= mask; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + *data_col_ptr *= mask; } - *data_col_ptr = static_cast(val); data_col_ptr += batch_size * height_col * width_col; } } @@ -169,20 +164,6 @@ template void ModulatedDeformableIm2col( const int deformable_groups, float* data_col); -template void ModulatedDeformableIm2col( - const phi::GPUContext& dev_ctx, - const phi::dtype::float16* data_im, - const phi::dtype::float16* data_offset, - const phi::dtype::float16* data_mask, - const std::vector& im_shape, - const std::vector& col_shape, - const std::vector& filter_shape, - const std::vector& paddings, - const std::vector& strides, - const std::vector& dilations, - const int deformable_groups, - phi::dtype::float16* data_col); - template void ModulatedDeformableIm2col( const phi::GPUContext& dev_ctx, const double* data_im, diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.h b/paddle/phi/kernels/funcs/deformable_conv_functor.h index 62e42cd58334f..eecda72927510 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.h +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.h @@ -14,47 +14,44 @@ #pragma once -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { namespace funcs { -template -HOSTDEVICE MT DmcnIm2colBilinear(const T* bottom_data, - const int data_width, - const int height, - const int width, - MT h, - MT w) { +template +HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, + const int data_width, + const int height, + const int width, + T h, + T w) { int h_low = floor(h); int w_low = floor(w); int h_high = h_low + 1; int w_high = w_low + 1; - MT lh = h - h_low; - MT lw = w - w_low; - MT hh = 1 - lh; - MT hw = 1 - lw; + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh; + T hw = 1 - lw; - MT v1 = (h_low >= 0 && w_low >= 0) - ? static_cast(bottom_data[h_low * data_width + w_low]) - : 0; - MT v2 = (h_low >= 0 && w_high <= width - 1) - ? static_cast(bottom_data[h_low * data_width + w_high]) - : 0; - MT v3 = (h_high <= height - 1 && w_low >= 0) - ? static_cast(bottom_data[h_high * data_width + w_low]) - : 0; - MT v4 = (h_high <= height - 1 && w_high <= width - 1) - ? static_cast(bottom_data[h_high * data_width + w_high]) - : 0; + T v1 = + (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0; + T v2 = (h_low >= 0 && w_high <= width - 1) + ? bottom_data[h_low * data_width + w_high] + : 0; + T v3 = (h_high <= height - 1 && w_low >= 0) + ? bottom_data[h_high * data_width + w_low] + : 0; + T v4 = (h_high <= height - 1 && w_high <= width - 1) + ? 
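For reference, the `DmcnIm2colBilinear` helper being reverted from the mixed-precision `MT` type back to plain `T` in this hunk is ordinary bilinear sampling with zero padding outside the feature map. A NumPy sketch of the same computation (the single `corner` bounds check is a compact restatement of the four per-corner conditions in the C++ code):

```python
import numpy as np

def dmcn_im2col_bilinear(bottom_data, data_width, height, width, h, w):
    # Bilinear sample of a flattened (height x width) map at fractional (h, w).
    h_low, w_low = int(np.floor(h)), int(np.floor(w))
    h_high, w_high = h_low + 1, w_low + 1
    lh, lw = h - h_low, w - w_low        # fractional parts
    hh, hw = 1 - lh, 1 - lw

    def corner(r, c):
        inside = 0 <= r < height and 0 <= c < width
        return bottom_data[r * data_width + c] if inside else 0.0

    return (hh * hw * corner(h_low, w_low) + hh * lw * corner(h_low, w_high)
            + lh * hw * corner(h_high, w_low) + lh * lw * corner(h_high, w_high))
```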
bottom_data[h_high * data_width + w_high] + : 0; - MT w1 = hh * hw; - MT w2 = hh * lw; - MT w3 = lh * hw; - MT w4 = lh * lw; + T w1 = hh * hw; + T w2 = hh * lw; + T w3 = lh * hw; + T w4 = lh * lw; return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; } diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index 9c40fdd3f2778..76201a1077edb 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -170,7 +170,7 @@ __global__ void CrossEntropySoftLabel(T* loss, /* Hard label cross entropy. */ -template +template __global__ void CrossEntropyHardLabel(T* loss, const T* softmax, const LabelT* labels, @@ -185,22 +185,17 @@ __global__ void CrossEntropyHardLabel(T* loss, // thread ids compute loss[ids] using softmax[idx] if (ids < n * d) { auto lbl = static_cast(labels[ids]); - assert(lbl >= 0 && lbl < dim || lbl == ignore_idx); - if (lbl < 0 || lbl >= dim) { // label is out of bound + PADDLE_ENFORCE(lbl >= 0 && lbl < dim || lbl == ignore_idx, + "The value of label is expected to be >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + dim, + ignore_idx, + lbl); + if (lbl == ignore_idx) { loss[ids] = static_cast(0.0); } else { int64_t idx = idx_n * dim * d + lbl * d + idx_d; - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (lbl == ignore_idx) { - loss[ids] = static_cast(0.0); - } else { - loss[ids] = -Log(softmax[idx]); - } - } else { - // IgnoreIndex is false - loss[ids] = -Log(softmax[idx]); - } + loss[ids] = -Log(softmax[idx]); } } } @@ -210,7 +205,7 @@ __global__ void CrossEntropyHardLabel(T* loss, Input: log softmax Output: loss and exp(input) */
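With the `IgnoreIndex` template parameter gone, the hard-label kernels in this file handle `ignore_index` on a single code path: an ignored position contributes exactly zero loss, and any other out-of-range label now trips a device-side `PADDLE_ENFORCE` instead of a debug-only `assert`. A sketch of the user-visible contract through the public API (hypothetical tensor values, assuming a build with this patch):

```python
import paddle
import paddle.nn.functional as F

logits = paddle.rand([4, 10])
labels = paddle.to_tensor([1, 9, -100, 3])  # -100 marks a padding position
loss = F.cross_entropy(logits, labels, ignore_index=-100, reduction='none')
print(loss)  # the third entry is exactly 0.0; a label outside [0, 10) would now raise
```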
-template +template __global__ void CrossEntropyExpHardLabel(T* loss, T* softmax, const LabelT* labels, @@ -226,24 +221,17 @@ __global__ void CrossEntropyExpHardLabel(T* loss, if (idx < n * dim * d) { auto lbl = static_cast(labels[ids]); - assert(lbl >= 0 && lbl < dim || lbl == ignore_idx); - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (idx_dim == lbl) { - if (lbl == ignore_idx) { - loss[ids] = static_cast(0.0); - } else { - loss[ids] = -softmax[idx]; - } - } + PADDLE_ENFORCE(lbl >= 0 && lbl < dim || lbl == ignore_idx, + "The value of label is expected to be >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + dim, + ignore_idx, + lbl); + if (lbl == ignore_idx) { + loss[ids] = static_cast(0.0); } else { - // IgnoreIndex is false - if (lbl >= 0 && lbl < dim) { - if (lbl == idx_dim) { - loss[ids] = -softmax[idx]; - } - } else { - loss[ids] = static_cast(0.0); + if (lbl == idx_dim) { + loss[ids] = -softmax[idx]; } } softmax[idx] = Exp(softmax[idx]); @@ -292,7 +280,7 @@ __device__ __forceinline__ AccT ThreadReduce(const T* input, return val; } -template +template __device__ __forceinline__ void ComputeLoss(T* loss, const T loss_value, const int label_id, @@ -302,14 +290,8 @@ __device__ __forceinline__ void ComputeLoss(T* loss, const int offset, const int ignore_index) { int loss_id = vec_size * tid + offset; - if (IgnoreIndex) { - if (label_value == loss_id) { - if (label_value == ignore_index) { - loss[label_id] = static_cast(0.0f); - } else { - loss[label_id] = loss_value; - } - } + if (label_value == ignore_index) { + loss[label_id] = static_cast(0.0f); } else { if (label_value == loss_id) { loss[label_id] = loss_value; @@ -317,11 +299,7 @@ __device__ __forceinline__ void ComputeLoss(T* loss, } } -template +template __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( T* loss, T* softmax, @@ -335,8 +313,13 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( int tid = threadIdx.x; int label_id = blockIdx.x; auto label_value = static_cast(label[label_id]); - assert(label_value >= 0 && label_value < size || label_value == ignore_index); - const bool label_valid = label_value >= 0 && label_value < size; + PADDLE_ENFORCE( + label_value >= 0 && label_value < size || label_value == ignore_index, + "The value of label is expected to be >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + size, + ignore_index, + label_value); int loss_id_offset = 0; if (offset > 0) { @@ -348,16 +331,14 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( AccT log_softmax = func(static_cast(logits[tid])); softmax[tid] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - 1, - loss_id_offset, - ignore_index); - } + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + 1, + loss_id_offset, + ignore_index); } size -= blockDim.x; logits += blockDim.x; @@ -383,16 +364,14 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( outs[i] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - VecSize, - loss_id_offset + i, - ignore_index); - } + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + VecSize, + loss_id_offset + i, + ignore_index); } // write @@ -406,29 +385,18 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( softmax[tid] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - 1, - loss_id_offset, - ignore_index); - } - } - - // invalid label, write once - if (!label_valid && threadIdx.x == 0) { - loss[label_id] = static_cast(0.0f); + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + 1, + loss_id_offset, + ignore_index); } } -template +template __device__ __forceinline__ void ScalarSoftmaxForwardImpl( T* loss, T* softmax, @@ -441,8 +409,13 @@ __device__ __forceinline__ void ScalarSoftmaxForwardImpl( int remain = size % (VecSize * blockDim.x); int
label_id = blockIdx.x; auto label_value = static_cast(label[label_id]); - assert(label_value >= 0 && label_value < size || label_value == ignore_index); - const bool label_valid = label_value >= 0 && label_value < size; + PADDLE_ENFORCE( + label_value >= 0 && label_value < size || label_value == ignore_index, + "The value of label is expected to be >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + size, + ignore_index, + label_value); // main part for (; tid < (size - remain); tid += VecSize * blockDim.x) { @@ -457,16 +430,14 @@ __device__ __forceinline__ void ScalarSoftmaxForwardImpl( AccT log_softmax = func(static_cast(ins[i])); softmax[tid + i * blockDim.x] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - VecSize, - i, - ignore_index); - } + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + VecSize, + i, + ignore_index); } } @@ -475,29 +446,18 @@ __device__ __forceinline__ void ScalarSoftmaxForwardImpl( AccT log_softmax = func(static_cast(logits[tid])); softmax[tid] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - 1, - 0, - ignore_index); - } - } - - // invalid label, write once - if (!label_valid && threadIdx.x == 0) { - loss[label_id] = static_cast(0.0f); + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + 1, + 0, + ignore_index); } } -template +template __global__ void VectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, @@ -537,17 +497,16 @@ __global__ void VectorizedSoftmaxForward(T* loss, // 3. softmax phi::LogSoftmaxForwardFunctor func(max, sum); if (input_offset == output_offset) { - VectorizedSoftmaxForwardImpl( - loss, - softmax, - logits, - label, - mid_dim, - input_offset, - func, - ignore_index); + VectorizedSoftmaxForwardImpl(loss, + softmax, + logits, + label, + mid_dim, + input_offset, + func, + ignore_index); } else { - ScalarSoftmaxForwardImpl( + ScalarSoftmaxForwardImpl( loss, softmax, logits, label, mid_dim, func, ignore_index); } } @@ -560,8 +519,8 @@ The computation includes - Compute: sum of - sum_{j}{ label_{i,j} * (src_{i,j} - maxvalue_{i} - log(sum[i]))} One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). -For reduction max (sum), firstly compute max (sum) to one warp, then use shuffle -api to compute max (sum) in one warp. +For reduction max (sum), first reduce max (sum) into one warp, then use the +shuffle API to compute max (sum) within the warp. */
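The warp reduction described in the comment above can be emulated on the host: each of the 32 lanes repeatedly combines its value with a partner whose lane id differs by a shrinking power-of-two offset, the butterfly pattern behind shuffle-XOR intrinsics. A NumPy sketch (the XOR butterfly is an assumption about the shuffle primitive used; the comment itself only says the shuffle API is involved):

```python
import numpy as np

vals = np.random.rand(32)                   # one value per lane of a warp
offset = 16
while offset > 0:
    partner = np.arange(32) ^ offset        # lane id XOR offset
    vals = np.maximum(vals, vals[partner])  # max reduction; use + for a sum
    offset //= 2
print(vals)  # after log2(32) = 5 steps every lane holds the warp-wide max
```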
template __global__ void WarpSoftmaxForwardSoftLabel(T* loss, @@ -880,8 +839,7 @@ template + SoftmaxMode mode> __global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, @@ -1033,24 +991,21 @@ __global__ void WarpSoftmaxForward(T* loss, // label int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize; auto lbl = static_cast(label[first_batch + i]); - assert(lbl >= 0 && lbl < element_count || lbl == ignore_index); - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (lbl == loss_idx) { - if (lbl != ignore_index) { - loss[first_batch + i] = -logsoftmax; - } else { - loss[first_batch + i] = static_cast(0.0); - } - } + if (lbl == ignore_index) { + loss[first_batch + i] = static_cast(0.0); } else { - // IgnoreIndex is false if (lbl >= 0 && lbl < element_count) { if (lbl == loss_idx) { loss[first_batch + i] = -logsoftmax; } } else { - loss[first_batch + i] = static_cast(0.0); + PADDLE_ENFORCE( + false, + "The value of label is expected to be >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + element_count, + ignore_index, + lbl); } } } else { // softmax @@ -1077,20 +1032,21 @@ __global__ void WarpSoftmaxForward(T* loss, // label int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize + s; auto lbl = static_cast(label[first_batch + i]); - assert(lbl >= 0 && lbl < element_count || lbl == ignore_index); - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (lbl == loss_idx && lbl != ignore_index) { - loss[first_batch + i] = -logsoftmax; - } + if (lbl == ignore_index) { + loss[first_batch + i] = static_cast(0.0); } else { - // IgnoreIndex is false if (lbl >= 0 && lbl < element_count) { if (lbl == loss_idx) { loss[first_batch + i] = -logsoftmax; } } else { - loss[first_batch + i] = static_cast(0.0); + PADDLE_ENFORCE( + false, + "The value of label is expected to be >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + element_count, + ignore_index, + lbl); } } } else { // softmax @@ -1107,23 +1063,23 @@ __global__ void WarpSoftmaxForward(T* loss, } } -#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, LabelT, VecT, AccT) \ - case Log2Elements: \ - WarpSoftmaxForward \ - <<>>(loss, \ - softmax, \ - src, \ - label, \ - batch_size, \ - stride, \ - element_count, \ - ignore_index); \ +#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, LabelT, VecT, AccT) \ + case Log2Elements: \ + WarpSoftmaxForward \ + <<>>(loss, \ + softmax, \ + src, \ + label, \ + batch_size, \ + stride, \ + element_count, \ + ignore_index); \ break; /* Wrapper of softmax with cross entropy forward hard label. 
*/ -template +template void SwitchWarpSoftmaxForward(T* loss, T* softmax, const T* src, @@ -1162,7 +1118,7 @@ void SwitchWarpSoftmaxForward(T* loss, } } -template +template void LaunchVectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, @@ -1186,7 +1142,7 @@ void LaunchVectorizedSoftmaxForward(T* loss, block_size = std::max(block_size, kps::details::kWarpSize); dim3 grids(high_dim); dim3 blocks(block_size); - VectorizedSoftmaxForward + VectorizedSoftmaxForward <<>>( loss, softmax, logits, label, high_dim, mid_dim, ignore_index); } @@ -1197,7 +1153,7 @@ void LaunchVectorizedSoftmaxForward(T* loss, - LaunchVectorizedSoftmaxForward for large size when axis == -1 - cudnn function for axis != -1 */ -template +template static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, int rank, int axis, @@ -1214,24 +1170,24 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, if (D == 1) { if (dim <= max_dim) { // small size const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; - SwitchWarpSoftmaxForward(loss_data, - softmax_data, - logits_data, - labels_data, - N, - dim, - dim, - ignore_index, - stream); + SwitchWarpSoftmaxForward(loss_data, + softmax_data, + logits_data, + labels_data, + N, + dim, + dim, + ignore_index, + stream); } else { // large size - LaunchVectorizedSoftmaxForward(loss_data, - softmax_data, - logits_data, - labels_data, - N, - dim, - ignore_index, - stream); + LaunchVectorizedSoftmaxForward(loss_data, + softmax_data, + logits_data, + labels_data, + N, + dim, + ignore_index, + stream); } } else { ScopedTensorDescriptor desc; @@ -1275,9 +1231,8 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, int threads = 128; int blocks = (N * dim * D + threads - 1) / threads; // compute cross entropy, input is log softmax - CrossEntropyExpHardLabel - <<>>( - loss_data, softmax_data, labels_data, N, dim, D, ignore_index); + CrossEntropyExpHardLabel<<>>( + loss_data, softmax_data, labels_data, N, dim, D, ignore_index); } } @@ -1373,25 +1328,14 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx, auto* labels_data = labels.data(); int threads = 128; int blocks = (n * d / axis_dim + threads - 1) / threads; - if (ignore_index >= 0 && ignore_index < axis_dim) { - CrossEntropyHardLabel - <<>>(loss_data, - logits_data, - labels_data, - n, - axis_dim, - d / axis_dim, - ignore_index); - } else { - CrossEntropyHardLabel - <<>>(loss_data, - logits_data, - labels_data, - n, - axis_dim, - d / axis_dim, - ignore_index); - } + CrossEntropyHardLabel + <<>>(loss_data, + logits_data, + labels_data, + n, + axis_dim, + d / axis_dim, + ignore_index); } // cause of input is softmax @@ -1456,31 +1400,17 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx, } else { auto* logits_data = logits.data(); auto* labels_data = label.data(); - if (ignore_index >= 0 && ignore_index < axis_dim) { - SoftmaxWithCrossEntropyHardLabel(dev_ctx, - rank, - axis_v, - logits_data, - labels_data, - loss_data, - softmax_data, - n, - axis_dim, - d / axis_dim, - ignore_index); - } else { - SoftmaxWithCrossEntropyHardLabel(dev_ctx, - rank, - axis_v, - logits_data, - labels_data, - loss_data, - softmax_data, - n, - axis_dim, - d / axis_dim, - ignore_index); - } + SoftmaxWithCrossEntropyHardLabel(dev_ctx, + rank, + axis_v, + logits_data, + labels_data, + loss_data, + softmax_data, + n, + axis_dim, + d / axis_dim, + ignore_index); } } } diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu 
b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu index b65bdc8e44f74..b46f1f4a3314d 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu @@ -29,7 +29,7 @@ static inline int NumBlocks(const int N) { kNumMaximumNumBlocks); } -template +template __global__ void ModulatedDeformableCol2imGpuKernel( const int nthreads, const T* data_col, @@ -51,7 +51,7 @@ __global__ void ModulatedDeformableCol2imGpuKernel( const int deformable_group, const int height_col, const int width_col, - MT* grad_im) { + T* grad_im) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; for (size_t thread = index; thread < nthreads; thread += offset) { @@ -78,17 +78,17 @@ __global__ void ModulatedDeformableCol2imGpuKernel( ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; - const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); - const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); - const MT cur_inv_h_data = h_in + i * dilation_h + offset_h; - const MT cur_inv_w_data = w_in + j * dilation_w + offset_w; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; - MT cur_top_grad = static_cast(data_col[thread]); + T cur_top_grad = data_col[thread]; if (data_mask) { const T* data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; - const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); + const T mask = data_mask_ptr[data_mask_hw_ptr]; cur_top_grad *= mask; } const int cur_h = static_cast(cur_inv_h_data); @@ -100,12 +100,13 @@ __global__ void ModulatedDeformableCol2imGpuKernel( abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; - MT weight = DmcnGetGradientWeight(cur_inv_h_data, - cur_inv_w_data, - cur_h + dy, - cur_w + dx, - height, - width); + T weight = DmcnGetGradientWeight(cur_inv_h_data, + cur_inv_w_data, + cur_h + dy, + cur_w + dx, + height, + width); + paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); } @@ -114,7 +115,7 @@ __global__ void ModulatedDeformableCol2imGpuKernel( } } -template +template void ModulatedDeformableCol2im(const Context& dev_ctx, const T* data_col, const T* data_offset, @@ -126,13 +127,13 @@ void ModulatedDeformableCol2im(const Context& dev_ctx, const std::vector& stride, const std::vector& dilation, const int deformable_group, - MT* grad_im) { + T* grad_im) { int channel_per_deformable_group = im_shape[0] / deformable_group; int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; int blocks = NumBlocks(num_kernels); int threads = kNumCUDAThreads; - ModulatedDeformableCol2imGpuKernel + ModulatedDeformableCol2imGpuKernel <<>>(num_kernels, data_col, data_offset, @@ -184,9 +185,8 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( T* grad_mask) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; - using MT = typename phi::dtype::MPTypeTrait::Type; for (size_t i = index; i < nthreads; i += offset) { - MT val = 0, mval = 0; + T val = 0, mval = 0; const int w = i % width_col; const int h = (i / width_col) % 
height_col; const int c = (i / width_col / height_col) % offset_channels; @@ -231,42 +231,40 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel( const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); - - const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); - const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); - MT inv_h = h_in + i * dilation_h + offset_h; - MT inv_w = w_in + j * dilation_w + offset_w; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + T inv_h = h_in + i * dilation_h + offset_h; + T inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { inv_h = inv_w = -2; } else { - mval += - static_cast(data_col_ptr[col_pos]) * - funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, - width, - height, - width, - inv_h, - inv_w); + mval += data_col_ptr[col_pos] * + funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); } - const MT weight = - DmcnGetCoordinateWeight(inv_h, - inv_w, - height, - width, - data_im_ptr + cnt * height * width, - width, - bp_dir); + const T weight = + DmcnGetCoordinateWeight(inv_h, + inv_w, + height, + width, + data_im_ptr + cnt * height * width, + width, + bp_dir); if (data_mask_ptr) { const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); - const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); - val += weight * static_cast(data_col_ptr[col_pos]) * mask; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + val += weight * data_col_ptr[col_pos] * mask; } else { - val += weight * static_cast(data_col_ptr[col_pos]); + val += weight * data_col_ptr[col_pos]; } cnt += 1; } - grad_offset[i] = static_cast(val); + grad_offset[i] = val; if (grad_mask && offset_c % 2 == 0) grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + @@ -361,5 +359,4 @@ PD_REGISTER_KERNEL(deformable_conv_grad, ALL_LAYOUT, phi::DeformableConvGradKernel, float, - double, - paddle::platform::float16) {} + double) {} diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu index 021791ca93061..2476dcbafb984 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu @@ -23,5 +23,4 @@ PD_REGISTER_KERNEL(deformable_conv, ALL_LAYOUT, phi::DeformableConvKernel, float, - double, - phi::dtype::float16) {} + double) {} diff --git a/paddle/phi/kernels/gpu/logsumexp_grad_kernel.cu b/paddle/phi/kernels/gpu/logsumexp_grad_kernel.cu index 490b3e9404561..0599c7e4f06a0 100644 --- a/paddle/phi/kernels/gpu/logsumexp_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/logsumexp_grad_kernel.cu @@ -15,8 +15,16 @@ #include "paddle/phi/kernels/logsumexp_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/logsumexp_grad_kernel_impl.h" -PD_REGISTER_KERNEL( - logsumexp_grad, GPU, ALL_LAYOUT, phi::LogsumexpGradKernel, float, double) {} +using float16 = phi::dtype::float16; + +PD_REGISTER_KERNEL(logsumexp_grad, + GPU, + ALL_LAYOUT, + phi::LogsumexpGradKernel, + float, + double, + float16) {} diff --git a/paddle/phi/kernels/gpu/logsumexp_kernel.cu b/paddle/phi/kernels/gpu/logsumexp_kernel.cu index 18249ff3bafb0..7963808476ded 100644 --- 
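With the registrations above dropping `paddle::platform::float16` from `deformable_conv` and `deformable_conv_grad`, the op is again limited to `float32`/`float64` on GPU, consistent with the kernels reverting from the mixed-precision `MT` type to plain `T` arithmetic. A minimal float32 call through the public API, with hypothetical shapes:

```python
import paddle
from paddle.vision.ops import deform_conv2d

x = paddle.rand([1, 3, 8, 8])                # NCHW input, float32 by default
offset = paddle.zeros([1, 2 * 3 * 3, 8, 8])  # 2 * kh * kw offset channels
weight = paddle.rand([4, 3, 3, 3])           # float16 weights would no longer dispatch
out = deform_conv2d(x, offset, weight, padding=1)
print(out.shape)                             # [1, 4, 8, 8]
```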
a/paddle/phi/kernels/gpu/logsumexp_kernel.cu +++ b/paddle/phi/kernels/gpu/logsumexp_kernel.cu @@ -15,8 +15,11 @@ #include "paddle/phi/kernels/logsumexp_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/logsumexp_kernel_impl.h" +using float16 = phi::dtype::float16; + PD_REGISTER_KERNEL( - logsumexp, GPU, ALL_LAYOUT, phi::LogsumexpKernel, float, double) {} + logsumexp, GPU, ALL_LAYOUT, phi::LogsumexpKernel, float, double, float16) {} diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu index 3f3760a4890a2..9b895adb0a3be 100644 --- a/paddle/phi/kernels/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/gpu/transpose_kernel.cu @@ -35,6 +35,10 @@ void TransposeKernel(const Context& ctx, if (out->numel() == 0) { return; } + if (axis.size() == 0) { + phi::Copy(ctx, x, ctx.GetPlace(), false, out); + return; + } paddle::operators::TransposeGPUKernelDriver(ctx, x, axis, out); } diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 75dfc8514abf8..a079827ca9fd7 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -346,28 +346,41 @@ template
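Conversely, the `logsumexp` and `logsumexp_grad` hunks above add a `float16` GPU registration. A short sketch, assuming a CUDA device (the CPU kernels remain float/double only):

```python
import paddle

paddle.set_device('gpu')
x = paddle.rand([4, 8]).astype('float16')
y = paddle.logsumexp(x, axis=-1)  # dispatches to the newly registered float16 kernel
print(y.dtype, y.shape)           # paddle.float16 [4]
```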