Enable Ep context with external data for CPU nodes (microsoft#23498)
### Description
When the user dumps the EP context model, nodes that are not partitioned to the EP but have external initializers still point to the old external data file. It does not make sense for the newly generated model to keep pointing to the old external data file.
Example: a model has nodes A, B, C, D, all with external initializers in ext.bin, so ext.bin contains the data for A, B, C, D.
After dumping the EP context model, node A is on CPU while nodes B, C, D are on the EP and are dumped as an EPContext node. If A's data is still in ext.bin, the newly generated model has to depend on the old ext.bin, which contains all the external data for the old model; that is a big overhead.

Fix:
For the newly generated model, the user should have the option to specify a new external data file, so that the generated model either packs all initializers into the Onnx model itself or keeps all initializers in the new external data file.
Add the option ep.context_model_external_initializers_file_name to specify the new external data file. All initializers will be placed inside the external data file if the option is specified; otherwise all initializers will be embedded inside the EP context Onnx model.
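For illustration, a minimal usage sketch of the new option (modeled on the qnn_ep_context_test.cc changes below; the QNN backend path, model names, and output paths are placeholders, not part of the change):

```cpp
#include <string>
#include <unordered_map>

#include "core/session/onnxruntime_cxx_api.h"
#include "core/session/onnxruntime_session_options_config_keys.h"

// Hedged sketch: dump an EP context model and redirect all initializers,
// including those of nodes that stay on CPU, into one new external data file.
int main() {
  Ort::Env env;
  Ort::SessionOptions so;
  so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
  so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, "./model_ctx.onnx");
  // New in this change: when set, every initializer of the generated model goes
  // into this file; when unset, every initializer is embedded in the Onnx model.
  so.AddConfigEntry(kOrtSessionOptionsEpContextModelExternalInitializersFileName,
                    "./model_ctx_ini.bin");
  std::unordered_map<std::string, std::string> qnn_options{{"backend_path", "QnnHtp.dll"}};
  so.AppendExecutionProvider("QNN", qnn_options);
  // Creating the session partitions the graph and dumps the EP context model.
  Ort::Session session(env, ORT_TSTR("model_with_ext_data.onnx"), so);
  return 0;
}
```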

### Motivation and Context
Fixes microsoft#23358.
HectorSVC authored Jan 29, 2025
1 parent bf023ab commit 80bc1d2
Showing 5 changed files with 117 additions and 16 deletions.
5 changes: 5 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -283,6 +283,11 @@ static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_
// Share EP related resources across EPs
static const char* const kOrtSessionOptionShareEpContexts = "ep.share_ep_contexts";

// Use this config when dumping the EP context model with an external initializers file.
// All initializers will be inside the external data file if it is specified; otherwise all will be inside the Onnx file
static const char* const kOrtSessionOptionsEpContextModelExternalInitializersFileName =
"ep.context_model_external_initializers_file_name";

// Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul.
// Option values:
// - "0": Gemm FastMath mode is not enabled. [DEFAULT]
22 changes: 19 additions & 3 deletions onnxruntime/core/framework/graph_partitioner.cc
@@ -16,6 +16,7 @@
#include "core/graph/function_utils.h"
#include "core/graph/graph_viewer.h"
#include "core/graph/model.h"
#include "core/graph/model_saving_options.h"
#include "core/session/onnxruntime_session_options_config_keys.h"

// uncomment this line to count non-CUDA ops in ONNX domain
@@ -645,6 +646,7 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_provide
static Status CreateEpContextModel(const ExecutionProviders& execution_providers,
const Graph& graph,
const std::filesystem::path& ep_context_path,
const std::filesystem::path& ep_context_ext_ini_path,
const logging::Logger& logger) {
InlinedVector<const Node*> all_ep_context_nodes;
for (const auto& ep : execution_providers) {
@@ -727,7 +729,20 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers
}
}

ORT_RETURN_IF_ERROR(Model::Save(ep_context_model, context_cache_path));
size_t ini_size_threshold = 0;
std::filesystem::path external_ini_path;
if (ep_context_ext_ini_path.empty()) {
// Set the threshold to the max so all initializers are forced into the Onnx file
ini_size_threshold = SIZE_MAX;
external_ini_path = "./model_ext_ini.bin";
} else {
// Set the threshold to 0 so all initializers are forced into the external file
ini_size_threshold = 0;
external_ini_path = ep_context_ext_ini_path;
}
ModelSavingOptions model_saving_options{ini_size_threshold};
ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(ep_context_model, context_cache_path,
external_ini_path, model_saving_options));

return Status::OK();
}
@@ -993,9 +1008,10 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, logger));

bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
if (ep_context_enabled) {
ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, logger));
std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
std::string external_ini_file_name = config_options.GetConfigOrDefault(kOrtSessionOptionsEpContextModelExternalInitializersFileName, "");
ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, external_ini_file_name, logger));
}
#else
ORT_UNUSED_PARAMETER(config_options);
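As context for the SIZE_MAX / 0 choice above, here is a hedged restatement of the rule that ModelSavingOptions' initializer_size_threshold encodes; only the comparison itself comes from the graph.cc hunk shown next, while the enum and helper are illustrative:

```cpp
#include <cstddef>

enum class Placement { kEmbeddedInOnnx, kExternalFile };

// Data strictly below the threshold stays inside the Onnx file; everything else
// is written to the external data file. The two extremes used above act as
// switches: SIZE_MAX embeds every initializer (the placeholder external file
// then stays empty and is removed by the graph.cc cleanup below), while 0
// sends every initializer to the external file.
Placement PlaceInitializer(std::size_t tensor_bytes_size,
                           std::size_t initializer_size_threshold) {
  // Mirrors `tensor_bytes_size < model_saving_options.initializer_size_threshold`
  // in Graph::AddExternalInitializersToGraphProtoImpl.
  return tensor_bytes_size < initializer_size_threshold ? Placement::kEmbeddedInOnnx
                                                        : Placement::kExternalFile;
}
```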
15 changes: 15 additions & 0 deletions onnxruntime/core/graph/graph.cc
@@ -4175,6 +4175,14 @@ Status Graph::AddExternalInitializersToGraphProtoImpl(
size_t tensor_bytes_size = raw_data.size();
if (tensor_bytes_size < model_saving_options.initializer_size_threshold) {
*output_proto = initializer;
// Data above the threshold is written into the new external initializer file.
// Data below the threshold should be kept inside the new model file
// rather than left in the old external initializer file used by the old Onnx file
if (initializer.data_location() == TensorProto_DataLocation_EXTERNAL) {
TensorShape shape(initializer.dims());
output_proto->set_raw_data(raw_data.data(), raw_data.size());
output_proto->clear_data_location();
}
if (process_prepacks) {
// These pre-packs will reside in memory
processed_weights.insert(initializer.name());
@@ -4263,6 +4271,7 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(

// Create the external file.
std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary);
auto const external_empty_pos = external_stream.tellp();
ORT_ENFORCE(external_stream.is_open(), "Failed to open for writing:", modified_external_file_path);
int64_t external_offset = 0;

@@ -4275,6 +4284,12 @@
ORT_THROW("Failed to flush file with external initializers: ", modified_external_file_path);
}

// Delete the external data file if nothing was written to it
if (external_empty_pos == external_stream.tellp()) {
external_stream.close();
std::remove(modified_external_file_path.string().c_str());
}

return result;
}

82 changes: 76 additions & 6 deletions onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
@@ -7,6 +7,7 @@
#include "core/session/onnxruntime_cxx_api.h"
#include "core/session/onnxruntime_session_options_config_keys.h"
#include "core/session/inference_session.h"
#include "core/graph/model_saving_options.h"

#include "test/providers/qnn/qnn_test_utils.h"

@@ -49,19 +50,19 @@ static const std::string& GetNodeAttr(const Node& node, const std::string& attr_
static GetTestModelFn BuildGraphWithQAndNonQ(bool single_ep_node = true) {
return [single_ep_node](ModelTestBuilder& builder) {
// Create non-quantized FusedMatMul node1
NodeArg* input1 = MakeTestInput(builder, TestInputDef<float>({2, 2}, false, {0, 1, 0, 1}));
NodeArg* add1_ini_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, {0, 0, 0, 0}));
std::vector<float> data(200 * 200, 1.0f);
NodeArg* input1 = MakeTestInput(builder, TestInputDef<float>({200, 200}, false, data));
NodeArg* add1_ini_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));

auto* add1_output = builder.MakeIntermediate();
builder.AddNode("FusedMatMul", {input1, add1_ini_input2}, {add1_output}, kMSDomain);

// Create quantized Add node2
std::vector<float> data = {0.0f, 0.0f, 1.0f, 0.0f};
gsl::span<float> data_range = gsl::make_span(data);
QuantParams<uint8_t> q_parameter = GetDataQuantParams<uint8_t>(data_range);
auto* add2_input1_qdq = AddQDQNodePair<uint8_t>(builder, add1_output, q_parameter.scale, q_parameter.zero_point);

NodeArg* add2_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, data));
NodeArg* add2_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));
auto* add2_input2_qdq = AddQDQNodePair<uint8_t>(builder, add2_input2, q_parameter.scale, q_parameter.zero_point);

auto* add2_output = builder.MakeIntermediate();
@@ -73,15 +74,15 @@ static GetTestModelFn BuildGraphWithQAndNonQ(bool single_ep_node = true) {
AddQDQNodePairWithOutputAsGraphOutput<uint8_t>(builder, add2_output, q_parameter.scale, q_parameter.zero_point);
} else {
auto* add3_input1_qdq = AddQDQNodePair<uint8_t>(builder, add2_output, q_parameter.scale, q_parameter.zero_point);
NodeArg* add3_ini_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, {0, 0, 0, 0}));
NodeArg* add3_ini_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));

auto* add3_output = builder.MakeIntermediate();
builder.AddNode("FusedMatMul", {add3_input1_qdq, add3_ini_input2}, {add3_output}, kMSDomain);

// Create quantized Add node4
auto* add4_input1_qdq = AddQDQNodePair<uint8_t>(builder, add3_output, q_parameter.scale, q_parameter.zero_point);

NodeArg* add4_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, data));
NodeArg* add4_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));
auto* add4_input2_qdq = AddQDQNodePair<uint8_t>(builder, add4_input2, q_parameter.scale, q_parameter.zero_point);

auto* add4_output = builder.MakeIntermediate();
@@ -179,6 +180,75 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryMultiPartitionSupport2) {
QnnContextBinaryMultiPartitionTestBody(single_ep_node);
}

void EpCtxCpuNodeWithExternalIniFileTestBody(bool expect_external_ini_file) {
ProviderOptions provider_options;
#if defined(_WIN32)
provider_options["backend_path"] = "QnnHtp.dll";
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif

const std::unordered_map<std::string, int> domain_to_version = {{"", 13}, {kMSDomain, 1}};

auto& logging_manager = DefaultLoggingManager();
logging_manager.SetDefaultLoggerSeverity(logging::Severity::kERROR);

onnxruntime::Model model("QNN_EP_TestModel", false, ModelMetaData(), PathString(),
IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
logging_manager.DefaultLogger());
Graph& graph = model.MainGraph();
ModelTestBuilder helper(graph);
BuildGraphWithQAndNonQ(true)(helper);
helper.SetGraphOutputs();
ASSERT_STATUS_OK(model.MainGraph().Resolve());
ModelSavingOptions model_saving_options{10};
const std::string model_with_ext = "model_external.onnx";
const std::string model_ext_file = "model_external.bin";
ASSERT_STATUS_OK(Model::SaveWithExternalInitializers(model, model_with_ext,
model_ext_file, model_saving_options));

EXPECT_TRUE(std::filesystem::exists(model_with_ext.c_str()));
EXPECT_TRUE(std::filesystem::exists(model_ext_file.c_str()));

Ort::SessionOptions so;
so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
so.AppendExecutionProvider("QNN", provider_options);
const std::string ep_context_model_file = "./qnn_ctx_part_external_ini_ctx.onnx";
so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, ep_context_model_file.c_str());
const std::string external_ini_file = "./qnn_ctx_part_external_ini.bin";
if (expect_external_ini_file) {
// Setting the external ini file name forces all initializers into the external file
so.AddConfigEntry(kOrtSessionOptionsEpContextModelExternalInitializersFileName, external_ini_file.c_str());
}  // otherwise all initializers stay in the Onnx file and no external data file is generated

Ort::Session session(*ort_env, ToPathString(model_with_ext).c_str(), so);

EXPECT_TRUE(std::filesystem::exists(ep_context_model_file.c_str()));
if (expect_external_ini_file) {
EXPECT_TRUE(std::filesystem::exists(external_ini_file.c_str()));
ASSERT_EQ(std::remove(external_ini_file.c_str()), 0);
} else {
EXPECT_FALSE(std::filesystem::exists(external_ini_file.c_str()));
}

// clean up
ASSERT_EQ(std::remove(model_with_ext.c_str()), 0);
ASSERT_EQ(std::remove(model_ext_file.c_str()), 0);
ASSERT_EQ(std::remove(ep_context_model_file.c_str()), 0);
}

// Specify the external initializers file name so FusedMatMul (which falls back to CPU)
// will dump its initializer data to the external file
TEST_F(QnnHTPBackendTests, QnnContextBinaryCpuNodeWithExternalWeights) {
EpCtxCpuNodeWithExternalIniFileTestBody(true);
}

// Leave the external initializers file name unset so FusedMatMul (which falls back to CPU)
// will NOT dump initializer data to an external file
TEST_F(QnnHTPBackendTests, QnnContextBinaryCpuNodeWithoutExternalWeights) {
EpCtxCpuNodeWithExternalIniFileTestBody(false);
}

// Create a model with Cast + Add (quantized)
// cast_input -> Cast -> Q -> DQ \
// Add -> Q -> DQ -> output
9 changes: 2 additions & 7 deletions onnxruntime/test/python/onnxruntime_test_python.py
@@ -183,7 +183,7 @@ def test_model_serialization_with_original_external_initializers_to_directory(se
so.add_session_config_entry(
"session.optimized_model_external_initializers_file_name", external_initializers_file
)
so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "100")
so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "20")
onnxrt.InferenceSession(get_name("model_with_orig_ext_data.onnx"), sess_options=so)
self.assertTrue(os.path.isfile(so.optimized_model_filepath))
self.assertTrue(os.path.isfile(os.path.join(directory, external_initializers_file)))
@@ -213,14 +213,10 @@ def test_model_serialization_with_original_external_initializers_to_current_dire
"session.optimized_model_external_initializers_file_name", external_initializers_file
)

# TODO(anyone): Set this to 100 will cause test error since some tensor below the threshold
# still refers to the original external data file. We shall fix this issue so that the
# optimized model only refers to one external data file.
so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "10")
so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "100")
session1 = onnxrt.InferenceSession(get_name("model_with_orig_ext_data.onnx"), sess_options=so)
del session1
self.assertTrue(os.path.isfile(optimized_model_filepath))
self.assertTrue(os.path.isfile(external_initializers_file))

so2 = onnxrt.SessionOptions()
so2.log_severity_level = 1
@@ -240,7 +236,6 @@

# Remove model 1 to make sure optimized model 2 can be loaded independently from model 1
os.remove(optimized_model_filepath)
os.remove(external_initializers_file)

session3 = onnxrt.InferenceSession(optimized_model_filepath_2, sess_options=onnxrt.SessionOptions())
del session3
