Adding RunOptions synchronization behaviour to C/C++ API (#14088)

### Description This exposes the already existing interface for asynchronous work submission of all CUDA-based EPs (CUDA + TensorRT). ### Motivation and Context This was requested in #12216. It enables users to build an efficient data pipeline with ONNXRuntime and CUDA pre-/post-processing. PCI traffic to the CUDA device can run during inference as soon as the postprocessing has consumed the input buffer and it can be overwritten. To do this, work has to be submitted asynchronously to the device. The screenshots below illustrate this using Nsight Systems. Async: <img width="1401" alt="image" src="https://user-images.githubusercontent.com/44298237/209894303-706460ed-cbdb-4be2-a2e4-0c111ec875dd.png"> Synchronous: <img width="1302" alt="image" src="https://user-images.githubusercontent.com/44298237/209894630-1ce40925-bbd5-470d-b888-46553ab75fb9.png"> Note the gap between the two inference runs, caused by issuing PCI traffic in between and by the CPU overhead of the active synchronization. --------- Co-authored-by: Chi Lo <[email protected]>
microsoft · Feb 8, 2023 · ddeec0a · ddeec0a
1 parent cb5749c
commit ddeec0a
Show file tree

Hide file tree

Showing 6 changed files with 12 additions and 9 deletions.
diff --git a/include/onnxruntime/core/framework/run_options.h b/include/onnxruntime/core/framework/run_options.h
@@ -27,10 +27,6 @@ struct OrtRunOptions {
   // So it is possible that only some of the nodes are executed.
   bool only_execute_path_to_fetches = false;
 
-  // Set to 'true' to synchronize execution providers with CPU at the end of session run.
-  // Taking CUDA EP as an example, it will trigger cudaStreamSynchronize on the compute stream.
-  bool synchronize_execution_providers = true;
-
 #ifdef ENABLE_TRAINING
   // Used by onnxruntime::training::TrainingSession. This class is now deprecated.
   // Delete training_mode when TrainingSession is deleted.

diff --git a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
@@ -25,3 +25,8 @@
 // Example usage: "cpu:0;gpu:0" (or) "gpu:0"
 // By default, the value for this key is empty (i.e.) no memory arenas are shrunk
 static const char* const kOrtRunOptionsConfigEnableMemoryArenaShrinkage = "memory.enable_memory_arena_shrinkage";
+
+// Set to '1' to not synchronize execution providers with CPU at the end of session run.
+// The default value is '0'.
+// Taking the CUDA EP as an example, this omits triggering cudaStreamSynchronize on the compute stream.
+static const char* const kOrtRunOptionsConfigDisableSynchronizeExecutionProviders = "disable_synchronize_execution_providers";
diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc
@@ -20,6 +20,8 @@
 #include "core/framework/tensorprotoutils.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/framework/TensorSeq.h"
+#include "core/framework/run_options.h"
+#include "core/session/onnxruntime_run_options_config_keys.h"
 #ifdef USE_AZURE
 #include "core/framework/cloud_executor.h"
 #endif
@@ -793,13 +795,14 @@ common::Status ExecuteGraph(const SessionState& session_state,
                                   logger);
   }
 #endif
+  bool synchronize_execution_providers = run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigDisableSynchronizeExecutionProviders, "0") == "0";
   return ExecuteGraph(session_state,
                       feeds_fetches_manager,
                       feeds, fetches,
                       execution_mode,
                       run_options.terminate,
                       logger,
-                      run_options.synchronize_execution_providers,
+                      synchronize_execution_providers,
                       run_options.only_execute_path_to_fetches);
 }
 

diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
@@ -1997,7 +1997,8 @@ Status InferenceSession::Run(const RunOptions& run_options,
 
     // info all execution providers InferenceSession:Run ended
     for (auto* xp : exec_providers_to_stop) {
-      auto status = xp->OnRunEnd(run_options.synchronize_execution_providers);
+      bool synchronize_execution_providers = run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigDisableSynchronizeExecutionProviders, "0") == "0";
+      auto status = xp->OnRunEnd(synchronize_execution_providers);
       ORT_CHECK_AND_SET_RETVAL(status);
     }
 

diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -1343,8 +1343,6 @@ RunOptions instance. The individual calls will exit gracefully and return an err
 #endif
       .def_readwrite("only_execute_path_to_fetches", &RunOptions::only_execute_path_to_fetches,
                      R"pbdoc(Only execute the nodes needed by fetch list)pbdoc")
-      .def_readwrite("synchronize_execution_providers", &RunOptions::synchronize_execution_providers,
-                     R"pbdoc(Synchronize execution providers after executing session.)pbdoc")
       .def(
           "add_run_config_entry",
           [](RunOptions* options, const char* config_key, const char* config_value) -> void {

diff --git a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py
@@ -392,7 +392,7 @@ def _run_onnx_session_with_ortvaluevector(
 
     _nvtx_range_push("run_with_ortvaluevector")
     run_options = onnxruntime.RunOptions()
-    run_options.synchronize_execution_providers = True
+    run_options.add_run_config_entry("disable_synchronize_execution_providers", "1")
     sess.run_with_ortvaluevector(run_options, input_names, ort_inputs, output_names, ort_outputs, output_devices)
     _nvtx_range_pop()