Commit 68e358b
[OPENCL] Add write tuned param to file & improve lws generator. test=develop (PaddlePaddle#5485)
* [OPENCL] improve local work size generate. test=develop
* [OPENCL][API] add api for saving local work size to device
ysh329 authored Mar 3, 2021
1 parent 08dca04 commit 68e358b
Showing 20 changed files with 732 additions and 185 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -193,6 +193,7 @@ else()
     COMMAND ${CMAKE_COMMAND} -E copy_if_different "${CMAKE_SOURCE_DIR}/third-party/flatbuffers/pre-build/framework_generated.h" "${CMAKE_SOURCE_DIR}/lite/model_parser/flatbuffers/"
     COMMAND ${CMAKE_COMMAND} -E copy_if_different "${CMAKE_SOURCE_DIR}/third-party/flatbuffers/pre-build/param_generated.h" "${CMAKE_SOURCE_DIR}/lite/model_parser/flatbuffers/"
     COMMAND ${CMAKE_COMMAND} -E copy_if_different "${CMAKE_SOURCE_DIR}/third-party/flatbuffers/pre-build/cache_generated.h" "${CMAKE_SOURCE_DIR}/lite/backends/opencl/utils/"
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different "${CMAKE_SOURCE_DIR}/third-party/flatbuffers/pre-build/tune_cache_generated.h" "${CMAKE_SOURCE_DIR}/lite/backends/opencl/utils/"
   )
 endif()

4 changes: 3 additions & 1 deletion cmake/external/flatbuffers.cmake
@@ -107,12 +107,14 @@ set(FRAMEWORK_FBS_DIR "lite/model_parser/flatbuffers")
 set(FRAMEWORK_SCHEMA_PATH "lite/model_parser/flatbuffers/framework.fbs")
 set(PARAM_SCHEMA_PATH "lite/model_parser/flatbuffers/param.fbs")
 set(CL_CACHE_SCHEMA_PATH "lite/backends/opencl/utils/cache.fbs")
+set(CL_TUNE_CACHE_SCHEMA_PATH "lite/backends/opencl/utils/tune_cache.fbs")
 compile_flatbuffers_schema_to_cpp_opt(framework_fbs_header ${FRAMEWORK_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty")
 compile_flatbuffers_schema_to_cpp_opt(param_fbs_header ${PARAM_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty")
 compile_flatbuffers_schema_to_cpp_opt(cl_cache_fbs_header ${CL_CACHE_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty")
+compile_flatbuffers_schema_to_cpp_opt(cl_tune_cache_fbs_header ${CL_TUNE_CACHE_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty")
 
 # All header files generated by flatbuffers must be declared here to avoid compilation failure.
-add_custom_target(fbs_headers ALL DEPENDS framework_fbs_header param_fbs_header cl_cache_fbs_header)
+add_custom_target(fbs_headers ALL DEPENDS framework_fbs_header param_fbs_header cl_cache_fbs_header cl_tune_cache_fbs_header)
 
 file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.fbs_dummy.cc CONTENT "")
 add_library(fbs_headers_dummy STATIC ${CMAKE_CURRENT_BINARY_DIR}/.fbs_dummy.cc)
13 changes: 9 additions & 4 deletions lite/api/paddle_api.cc
@@ -287,15 +287,20 @@ void ConfigBase::set_opencl_binary_path_name(const std::string &path,
 #endif
 }
 
-void ConfigBase::set_opencl_tune(CLTuneMode tune_mode, size_t lws_repeats) {
+void ConfigBase::set_opencl_tune(CLTuneMode tune_mode,
+                                 const std::string &path,
+                                 const std::string &name,
+                                 size_t lws_repeats) {
 #ifdef LITE_WITH_OPENCL
   if (paddle::lite_api::IsOpenCLBackendValid()) {
     opencl_tune_mode_ = tune_mode;
-    paddle::lite::CLRuntime::Global()->set_auto_tune(opencl_tune_mode_,
-                                                     lws_repeats);
+    paddle::lite::CLRuntime::Global()->set_auto_tune(
+        opencl_tune_mode_, path, name, lws_repeats);
 #ifdef LITE_WITH_LOG
     LOG(INFO) << "set opencl_tune_mode: "
-              << CLTuneModeToStr(lite::CLRuntime::Global()->auto_tune());
+              << CLTuneModeToStr(lite::CLRuntime::Global()->auto_tune())
+              << ", lws_repeats:" << lws_repeats;
+    LOG(INFO) << "tuned file path & name:" << path << "/" << name;
 #endif
   }
 #endif
3 changes: 3 additions & 0 deletions lite/api/paddle_api.h
@@ -177,7 +177,10 @@ class LITE_API ConfigBase {
                                     const std::string& name);
   // set GPU opencl tune
   void set_opencl_tune(CLTuneMode tune_mode = CL_TUNE_NONE,
+                       const std::string& path = "",
+                       const std::string& name = "",
                        size_t lws_repeats = 4);
+
   // set GPU opencl precision
   void set_opencl_precision(CLPrecisionType p = CL_PRECISION_AUTO);
   // set subgraph_model_dir
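For reference, a minimal caller-side sketch of the extended tuning API. The set_opencl_tune signature and defaults come from this diff; MobileConfig, the model file, and the tuned-file path/name below are illustrative assumptions:

#include "paddle_api.h"

using namespace paddle::lite_api;  // NOLINT

int main() {
  MobileConfig config;
  config.set_model_from_file("mobilenet_v1.nb");  // illustrative model file
  // Generate and time LWS candidates per CL_TUNE_NORMAL, then (per the
  // commit message) persist the tuned parameters to <path>/<name>.
  // Path and file name here are illustrative.
  config.set_opencl_tune(CL_TUNE_NORMAL,
                         "/data/local/tmp",
                         "lws_tuned.bin",
                         /*lws_repeats=*/4);
  auto predictor = CreatePaddlePredictor<MobileConfig>(config);
  return 0;
}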
183 changes: 52 additions & 131 deletions lite/backends/opencl/cl_context.cc
@@ -106,155 +106,76 @@ cl::NDRange CLContext::DefaultGlobalWorkSize(const CLImage &image) {
   }
 }
 
-std::vector<cl::NDRange> CLContext::GenerateLocalWorkSizes(
-    cl::NDRange global_work_size, size_t max_work_size) {
-  size_t generate_lws_type = CLRuntime::Global()->auto_tune();
+std::set<cl::NDRange> CLContext::GenerateLocalWorkSizes(cl::NDRange gws,
+                                                        size_t max_ws) {
+  size_t tune_type = CLRuntime::Global()->auto_tune();
 
-  cl::NDRange tmp_lws = DefaultLocalWorkSize(
-      global_work_size, max_work_size, /*divisor=*/2, /*tune_reverse=*/false);
+  cl::NDRange tmp_lws =
+      DefaultLocalWorkSize(gws, max_ws, /*divisor=*/2, /*tune_reverse=*/false);
   cl::NDRange last_lws = cl::NDRange{
       static_cast<size_t>(0), static_cast<size_t>(0), static_cast<size_t>(0)};
 
-  std::vector<cl::NDRange> lwss{tmp_lws};
-  VLOG(1) << "generate_lws_type:" << generate_lws_type;
-  // 0 - None, 1 - Rapid, 2 - Normal, 3 - Exhaustive
-  if (generate_lws_type == lite_api::CL_TUNE_NONE) {
-    // 0 - None: nothing to do
-  } else if (generate_lws_type == lite_api::CL_TUNE_RAPID) {
-    // 1 - Rapid
-    for (auto tune_reverse : {true, false}) {
-      for (size_t divisor = 1; divisor < /*max_divisor=*/17; divisor *= 2) {
-        tmp_lws = DefaultLocalWorkSize(
-            global_work_size, max_work_size, divisor, tune_reverse);
-        if (last_lws[0] == tmp_lws[0] && last_lws[1] == tmp_lws[1] &&
-            last_lws[2] == tmp_lws[2]) {
-          // skip tuned lws
-          continue;
-        }
-        lwss.emplace_back(tmp_lws);
-      }
-    }
-  } else if (generate_lws_type == lite_api::CL_TUNE_NORMAL) {
-    // 2 - Normal
-    for (auto tune_reverse : {true, false}) {
-      for (size_t divisor = 1; divisor < /*max_divisor=*/15; divisor += 2) {
-        tmp_lws = DefaultLocalWorkSize(
-            global_work_size, max_work_size, divisor, tune_reverse);
-        if (last_lws[0] == tmp_lws[0] && last_lws[1] == tmp_lws[1] &&
-            last_lws[2] == tmp_lws[2]) {
-          // skip tuned lws
-          continue;
-        }
-        lwss.emplace_back(tmp_lws);
-      }
-    }
-  } else if (generate_lws_type == lite_api::CL_TUNE_EXHAUSTIVE) {
-    // 3 - Exhaustive
-    for (auto tune_reverse : {true, false}) {
-      for (size_t divisor = 1; divisor < /*max_divisor=*/15; divisor++) {
-        tmp_lws = DefaultLocalWorkSize(
-            global_work_size, max_work_size, divisor, tune_reverse);
-        if (last_lws[0] == tmp_lws[0] && last_lws[1] == tmp_lws[1] &&
-            last_lws[2] == tmp_lws[2]) {
-          // skip tuned lws
-          continue;
-        }
-        lwss.emplace_back(tmp_lws);
-      }
-    }
-  } else {
-    LOG(FATAL) << "Unsupported opencl tune type:" << generate_lws_type;
-  }
+  std::set<cl::NDRange> lwss{tmp_lws};
 
+  auto gen_lws = [&](const std::set<bool> &tune_reverses,
+                     const std::set<size_t> &divisors) {
+    for (bool tune_reverse : tune_reverses) {
+      for (size_t divisor : divisors) {
+        tmp_lws = DefaultLocalWorkSize(gws, max_ws, divisor, tune_reverse);
+        lwss.emplace(tmp_lws);
+      }
+    }
+  };
+
+  std::set<bool> tune_reverses{true, false};
+  std::set<size_t> divisors;
+  if (tune_type == lite_api::CL_TUNE_NONE) {
+    // do nothing
+  } else if (tune_type == lite_api::CL_TUNE_RAPID) {
+    divisors = {1, 2, 4, 8};
+  } else if (tune_type == lite_api::CL_TUNE_NORMAL) {
+    divisors = {1, 3, 5, 7, 9, 11, 13};
+  } else if (tune_type == lite_api::CL_TUNE_EXHAUSTIVE) {
+    divisors = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14};
+  } else {
+    LOG(FATAL) << "Unsupported opencl tune type:" << tune_type;
+  }
+
+  gen_lws(tune_reverses, divisors);
   return lwss;
 }

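The rewrite above reduces three near-identical tuning branches to data: each tune mode now only selects a divisor set, one lambda sweeps divisor x reverse, and std::set drops duplicate candidates instead of the manual last_lws comparison. A standalone mimic of the generator — a sketch assuming std::array<size_t, 3> stands in for cl::NDRange (so it compiles without OpenCL headers) and omitting the user-defined cap:

#include <array>
#include <cstddef>
#include <cstdio>
#include <set>

using Range = std::array<size_t, 3>;

// Mirrors the halving logic of CLContext::DefaultLocalWorkSize above.
Range DefaultLws(const Range &gws, size_t max_ws, size_t divisor, bool reverse) {
  size_t lx = reverse ? gws[2] : gws[0];
  size_t ly = gws[1];
  size_t lz = reverse ? gws[0] : gws[2];
  max_ws = divisor > 1 ? max_ws / divisor : max_ws;
  if (max_ws > 0) {
    while (ly > max_ws) ly = (ly & 0x01) ? 1 : ly >> 1;
    while (ly * lz > max_ws) lz = (lz & 0x01) ? 1 : lz >> 1;
    while (ly * lz * lx > max_ws) lx = (lx & 0x01) ? 1 : lx >> 1;
  }
  return reverse ? Range{lz, ly, lx} : Range{lx, ly, lz};
}

int main() {
  const Range gws{256, 16, 4};
  std::set<Range> lwss{DefaultLws(gws, 256, 2, false)};  // the default lws
  for (bool reverse : {true, false})
    for (size_t divisor : {1, 2, 4, 8})  // CL_TUNE_RAPID's divisor set
      lwss.emplace(DefaultLws(gws, 256, divisor, reverse));
  // The set's ordering dedupes the 2 x 4 trials down to unique candidates.
  for (const auto &lws : lwss)
    std::printf("lws = {%zu, %zu, %zu}\n", lws[0], lws[1], lws[2]);
  return 0;
}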
 cl::NDRange CLContext::DefaultLocalWorkSize(
-    cl::NDRange global_work_size,
-    size_t max_work_size,
-    int divisor /*=2*/,
-    bool tune_reverse /*=false*/,
-    size_t user_defined_max_work_size /*=0*/) {
-  int gws0 = global_work_size[0];
-  int gws1 = global_work_size[1];
-  int gws2 = global_work_size[2];
-
-  if (tune_reverse) {
-    gws2 = global_work_size[0];
-    gws1 = global_work_size[1];
-    gws0 = global_work_size[2];
-  }
-
-  if (divisor > 1) {
-    max_work_size /= divisor;
-  }
-  if (user_defined_max_work_size > 0 &&
-      user_defined_max_work_size <= max_work_size) {
-    max_work_size = user_defined_max_work_size;
-  }
-
-  while (gws1 > max_work_size && max_work_size > 0) {
-    gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1;
-  }
-  while (gws2 * gws1 > max_work_size && max_work_size > 0) {
-    gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1;
-  }
-  while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
-    gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
-  }
-
-  if (tune_reverse) {
-    return cl::NDRange{static_cast<size_t>(gws2),
-                       static_cast<size_t>(gws1),
-                       static_cast<size_t>(gws0)};
-  } else {
-    return cl::NDRange{static_cast<size_t>(gws0),
-                       static_cast<size_t>(gws1),
-                       static_cast<size_t>(gws2)};
-  }
+    const cl::NDRange &gws,
+    register size_t max_ws,
+    const int &divisor /*=2*/,
+    const bool &reverse /*=false*/,
+    const size_t &user_def_max_ws /*=0*/) {
+  register size_t lx = reverse ? gws[2] : gws[0];
+  register size_t ly = gws[1];
+  register size_t lz = reverse ? gws[0] : gws[2];
+
+  max_ws = (user_def_max_ws > 0 && user_def_max_ws <= max_ws) ? user_def_max_ws
+                                                              : max_ws;
+  max_ws = divisor > 1 ? max_ws / divisor : max_ws;
+
+  if (max_ws > 0) {
+    while (ly > max_ws) {
+      // replace mod with bit operate
+      ly = (ly & 0x01) ? 1 : ly >> 1;
+    }
+    while (ly * lz > max_ws) {
+      lz = (lz & 0x01) ? 1 : lz >> 1;
+    }
+    while (ly * lz * lx > max_ws) {
+      lx = (lx & 0x01) ? 1 : lx >> 1;
+    }
+  }
+
+  return reverse ? cl::NDRange{lz, ly, lx} : cl::NDRange{lx, ly, lz};
 }

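A worked trace of the halving above, for gws = {256, 16, 4} and max_ws = 256: with the default divisor = 2 the budget is 128; ly = 16 already fits, ly * lz = 64 fits, but ly * lz * lx = 16384 does not, so lx halves 256 -> 128 -> 64 -> 32 -> 16 -> 8 -> 4 -> 2, giving lws = {2, 16, 4} with exactly 128 work-items. With divisor = 1 the budget is 256 and the same sweep stops at lx = 4, giving {4, 16, 4}. This is how the divisor sweep in GenerateLocalWorkSizes yields differently shaped candidates for the tuner to race.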
 bool CLContext::IsArmMali() {
   return CLRuntime::Global()->GetGpuType() == GpuType::ARM_MALI;
 }
 
-bool CLContext::HasTunedLocalWorkSizeMap(const std::string &key,
-                                         cl::NDRange *lws) {
-  bool has = false;
-  auto it = tuned_lwss_map_.find(key);
-  if (it != tuned_lwss_map_.end()) {
-    *lws = it->second;
-    has = true;
-  }
-  return has;
-}
-
-void CLContext::SetTunedLocalWorkSizeMap(const std::string &key,
-                                         const cl::NDRange lws) {
-  auto it = tuned_lwss_map_.find(key);
-  if (it != tuned_lwss_map_.end()) {
-    auto lws_old = it->second;
-    LOG(FATAL) << "===> found lws_old with same key, please add more detailed "
-                  "info to key <==="
-               << "\n lws_old:" << lws_old[0] << "," << lws_old[1] << ","
-               << lws_old[2] << "\n lws_new:" << lws[0] << "," << lws[1] << ","
-               << lws[2];
-  }
-  tuned_lwss_map_.insert(std::pair<std::string, cl::NDRange>(key, lws));
-}
-
-std::map<std::string, cl::NDRange> CLContext::GetTunedLocalWorkSizeMap() {
-  return tuned_lwss_map_;
-}
-
-cl::NDRange CLContext::GetTunedLocalWorkSizeFromMap(const std::string &key) {
-  cl::NDRange lws = cl::NullRange;
-  auto it = tuned_lwss_map_.find(key);
-  if (it != tuned_lwss_map_.end()) {
-    lws = it->second;
-  }
-  return lws;
-}
-
 }  // namespace lite
 }  // namespace paddle
24 changes: 9 additions & 15 deletions lite/backends/opencl/cl_context.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <map>
 #include <memory>
+#include <set>
 #include <string>
 #include <vector>
 #include "lite/backends/opencl/cl_image.h"
@@ -57,24 +58,17 @@ class CLContext {
 
   cl::NDRange DefaultGlobalWorkSize(const CLImage &image);
 
-  cl::NDRange DefaultLocalWorkSize(cl::NDRange global_work_size,
-                                   size_t max_work_size,
-                                   int divitor = 2,
-                                   bool tune_reverse = false,
-                                   size_t user_defined_max_work_size = 0);
+  cl::NDRange DefaultLocalWorkSize(
+      const cl::NDRange &global_work_size,
+      register size_t max_work_size,
+      const int &divitor = 2,
+      const bool &tune_reverse = false,
+      const size_t &user_defined_max_work_size = 0);
 
-  std::vector<cl::NDRange> GenerateLocalWorkSizes(cl::NDRange global_work_size,
-                                                  size_t max_work_size);
+  std::set<cl::NDRange> GenerateLocalWorkSizes(cl::NDRange global_work_size,
+                                               size_t max_work_size);
   bool IsArmMali();
 
-  bool HasTunedLocalWorkSizeMap(const std::string &key, cl::NDRange *lws);
-
-  void SetTunedLocalWorkSizeMap(const std::string &key, const cl::NDRange lws);
-
-  std::map<std::string, cl::NDRange> GetTunedLocalWorkSizeMap();
-
-  cl::NDRange GetTunedLocalWorkSizeFromMap(const std::string &key);
-
  private:
   std::vector<std::shared_ptr<cl::Kernel>> kernels_;
   std::map<std::string, int> kernel_offset_;
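To see how these pieces are meant to compose, a hypothetical tuning loop (not code from this commit): race every candidate returned by GenerateLocalWorkSizes and keep the fastest. The sketch assumes the OpenCL C++ wrapper, a queue created with CL_QUEUE_PROFILING_ENABLE, and the ordering on cl::NDRange that the surrounding code must already provide for std::set to compile:

#include <limits>
#include <set>

cl::NDRange PickBestLws(cl::CommandQueue &queue,
                        cl::Kernel &kernel,
                        const cl::NDRange &gws,
                        const std::set<cl::NDRange> &candidates,
                        size_t lws_repeats) {
  cl::NDRange best = cl::NullRange;
  double best_ns = std::numeric_limits<double>::max();
  for (const auto &lws : candidates) {
    double total_ns = 0.0;
    // Time each candidate lws_repeats times to smooth run-to-run noise.
    for (size_t i = 0; i < lws_repeats; ++i) {
      cl::Event event;
      queue.enqueueNDRangeKernel(
          kernel, cl::NullRange, gws, lws, nullptr, &event);
      event.wait();
      total_ns += event.getProfilingInfo<CL_PROFILING_COMMAND_END>() -
                  event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    }
    if (total_ns < best_ns) {
      best_ns = total_ns;
      best = lws;
    }
  }
  return best;  // the tuned value a cache file would record for this kernel
}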