Enable Ep context with external data for CPU nodes (microsoft#23498)
### Description
When the user dumps the EP context model, nodes that are not partitioned to the EP but have external initializers still point to the old external data file. It does not make sense for the newly generated model to keep pointing to the old external data file.
Example: a model has nodes A, B, C, D, all with external initializers in ext.bin, so ext.bin contains the data for A, B, C, D.
After dumping the EP context model, node A is on CPU while nodes B, C, D are on the EP and are dumped as an EPContext node. If A's data is still in ext.bin, the newly generated model has to depend on the old ext.bin, which contains all the external data for the old model; that is a big overhead.

Fix:
For the newly generated model, the user should have the option to specify a new external data file, so that the generated model either packs all initializers into the Onnx model itself or keeps all initializers in the new external data file.
Add the option ep.context_model_external_initializers_file_name to specify the new external data file. All initializers will be placed inside the external data file if the option is specified; otherwise all initializers will be embedded inside the EP context Onnx model.
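For illustration, a minimal usage sketch of the new option (modeled on the qnn_ep_context_test.cc changes below; the QNN backend path, model names, and output paths are placeholders, not part of the change):

```cpp
#include <string>
#include <unordered_map>

#include "core/session/onnxruntime_cxx_api.h"
#include "core/session/onnxruntime_session_options_config_keys.h"

// Hedged sketch: dump an EP context model and redirect all initializers,
// including those of nodes that stay on CPU, into one new external data file.
int main() {
  Ort::Env env;
  Ort::SessionOptions so;
  so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
  so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, "./model_ctx.onnx");
  // New in this change: when set, every initializer of the generated model goes
  // into this file; when unset, every initializer is embedded in the Onnx model.
  so.AddConfigEntry(kOrtSessionOptionsEpContextModelExternalInitializersFileName,
                    "./model_ctx_ini.bin");
  std::unordered_map<std::string, std::string> qnn_options{{"backend_path", "QnnHtp.dll"}};
  so.AppendExecutionProvider("QNN", qnn_options);
  // Creating the session partitions the graph and dumps the EP context model.
  Ort::Session session(env, ORT_TSTR("model_with_ext_data.onnx"), so);
  return 0;
}
```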

### Motivation and Context
Fixes microsoft#23358.
HectorSVC authored Jan 29, 2025
1 parent bf023ab commit 80bc1d2
Showing 5 changed files with 117 additions and 16 deletions.
5 changes: 5 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -283,6 +283,11 @@ static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_
// Share EP related resources across EPs
static const char* const kOrtSessionOptionShareEpContexts = "ep.share_ep_contexts";

// Use this config when dumping the EP context model with an external initializers file.
// All initializers will be inside the external data file if it is specified; otherwise all will be inside the Onnx file
static const char* const kOrtSessionOptionsEpContextModelExternalInitializersFileName =
"ep.context_model_external_initializers_file_name";

// Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul.
// Option values:
// - "0": Gemm FastMath mode is not enabled. [DEFAULT]
22 changes: 19 additions & 3 deletions onnxruntime/core/framework/graph_partitioner.cc
@@ -16,6 +16,7 @@
#include "core/graph/function_utils.h"
#include "core/graph/graph_viewer.h"
#include "core/graph/model.h"
#include "core/graph/model_saving_options.h"
#include "core/session/onnxruntime_session_options_config_keys.h"

// uncomment this line to count non-CUDA ops in ONNX domain
@@ -645,6 +646,7 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_provide
static Status CreateEpContextModel(const ExecutionProviders& execution_providers,
const Graph& graph,
const std::filesystem::path& ep_context_path,
const std::filesystem::path& ep_context_ext_ini_path,
const logging::Logger& logger) {
InlinedVector<const Node*> all_ep_context_nodes;
for (const auto& ep : execution_providers) {
@@ -727,7 +729,20 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers
}
}

ORT_RETURN_IF_ERROR(Model::Save(ep_context_model, context_cache_path));
size_t ini_size_threshold = 0;
std::filesystem::path external_ini_path;
if (ep_context_ext_ini_path.empty()) {
// Set the threshold to the max so all initializers are forced into the Onnx file
ini_size_threshold = SIZE_MAX;
external_ini_path = "./model_ext_ini.bin";
} else {
// Set the threshold to 0 so all initializers are forced into the external file
ini_size_threshold = 0;
external_ini_path = ep_context_ext_ini_path;
}
ModelSavingOptions model_saving_options{ini_size_threshold};
ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(ep_context_model, context_cache_path,
external_ini_path, model_saving_options));

return Status::OK();
}
@@ -993,9 +1008,10 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, logger));

bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
if (ep_context_enabled) {
ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, logger));
std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
std::string external_ini_file_name = config_options.GetConfigOrDefault(kOrtSessionOptionsEpContextModelExternalInitializersFileName, "");
ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, external_ini_file_name, logger));
}
#else
ORT_UNUSED_PARAMETER(config_options);
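As context for the SIZE_MAX / 0 choice above, here is a hedged restatement of the rule that ModelSavingOptions' initializer_size_threshold encodes; only the comparison itself comes from the graph.cc hunk shown next, while the enum and helper are illustrative:

```cpp
#include <cstddef>

enum class Placement { kEmbeddedInOnnx, kExternalFile };

// Data strictly below the threshold stays inside the Onnx file; everything else
// is written to the external data file. The two extremes used above act as
// switches: SIZE_MAX embeds every initializer (the placeholder external file
// then stays empty and is removed by the graph.cc cleanup below), while 0
// sends every initializer to the external file.
Placement PlaceInitializer(std::size_t tensor_bytes_size,
                           std::size_t initializer_size_threshold) {
  // Mirrors `tensor_bytes_size < model_saving_options.initializer_size_threshold`
  // in Graph::AddExternalInitializersToGraphProtoImpl.
  return tensor_bytes_size < initializer_size_threshold ? Placement::kEmbeddedInOnnx
                                                        : Placement::kExternalFile;
}
```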
15 changes: 15 additions & 0 deletions onnxruntime/core/graph/graph.cc
@@ -4175,6 +4175,14 @@ Status Graph::AddExternalInitializersToGraphProtoImpl(
size_t tensor_bytes_size = raw_data.size();
if (tensor_bytes_size < model_saving_options.initializer_size_threshold) {
*output_proto = initializer;
// Data above the threshold is written into the new external initializer file.
// Data below the threshold should be kept inside the new model file
// rather than left in the old external initializer file used by the old Onnx file
if (initializer.data_location() == TensorProto_DataLocation_EXTERNAL) {
TensorShape shape(initializer.dims());
output_proto->set_raw_data(raw_data.data(), raw_data.size());
output_proto->clear_data_location();
}
if (process_prepacks) {
// These pre-packs will reside in memory
processed_weights.insert(initializer.name());
@@ -4263,6 +4271,7 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(

// Create the external file.
std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary);
auto const external_empty_pos = external_stream.tellp();
ORT_ENFORCE(external_stream.is_open(), "Failed to open for writing:", modified_external_file_path);
int64_t external_offset = 0;

@@ -4275,6 +4284,12 @@
ORT_THROW("Failed to flush file with external initializers: ", modified_external_file_path);
}

// Delete the external data file if nothing was written to it
if (external_empty_pos == external_stream.tellp()) {
external_stream.close();
std::remove(modified_external_file_path.string().c_str());
}

return result;
}

82 changes: 76 additions & 6 deletions onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
@@ -7,6 +7,7 @@
#include "core/session/onnxruntime_cxx_api.h"
#include "core/session/onnxruntime_session_options_config_keys.h"
#include "core/session/inference_session.h"
#include "core/graph/model_saving_options.h"

#include "test/providers/qnn/qnn_test_utils.h"

@@ -49,19 +50,19 @@ static const std::string& GetNodeAttr(const Node& node, const std::string& attr_
static GetTestModelFn BuildGraphWithQAndNonQ(bool single_ep_node = true) {
return [single_ep_node](ModelTestBuilder& builder) {
// Create non-quantized FusedMatMul node1
NodeArg* input1 = MakeTestInput(builder, TestInputDef<float>({2, 2}, false, {0, 1, 0, 1}));
NodeArg* add1_ini_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, {0, 0, 0, 0}));
std::vector<float> data(200 * 200, 1.0f);
NodeArg* input1 = MakeTestInput(builder, TestInputDef<float>({200, 200}, false, data));
NodeArg* add1_ini_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));

auto* add1_output = builder.MakeIntermediate();
builder.AddNode("FusedMatMul", {input1, add1_ini_input2}, {add1_output}, kMSDomain);

// Create quantized Add node2
std::vector<float> data = {0.0f, 0.0f, 1.0f, 0.0f};
gsl::span<float> data_range = gsl::make_span(data);
QuantParams<uint8_t> q_parameter = GetDataQuantParams<uint8_t>(data_range);
auto* add2_input1_qdq = AddQDQNodePair<uint8_t>(builder, add1_output, q_parameter.scale, q_parameter.zero_point);

NodeArg* add2_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, data));
NodeArg* add2_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));
auto* add2_input2_qdq = AddQDQNodePair<uint8_t>(builder, add2_input2, q_parameter.scale, q_parameter.zero_point);

auto* add2_output = builder.MakeIntermediate();
@@ -73,15 +74,15 @@ static GetTestModelFn BuildGraphWithQAndNonQ(bool single_ep_node = true) {
AddQDQNodePairWithOutputAsGraphOutput<uint8_t>(builder, add2_output, q_parameter.scale, q_parameter.zero_point);
} else {
auto* add3_input1_qdq = AddQDQNodePair<uint8_t>(builder, add2_output, q_parameter.scale, q_parameter.zero_point);
NodeArg* add3_ini_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, {0, 0, 0, 0}));
NodeArg* add3_ini_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));

auto* add3_output = builder.MakeIntermediate();
builder.AddNode("FusedMatMul", {add3_input1_qdq, add3_ini_input2}, {add3_output}, kMSDomain);

// Create quantized Add node4
auto* add4_input1_qdq = AddQDQNodePair<uint8_t>(builder, add3_output, q_parameter.scale, q_parameter.zero_point);

NodeArg* add4_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, data));
NodeArg* add4_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));
auto* add4_input2_qdq = AddQDQNodePair<uint8_t>(builder, add4_input2, q_parameter.scale, q_parameter.zero_point);

auto* add4_output = builder.MakeIntermediate();
@@ -179,6 +180,75 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryMultiPartitionSupport2) {
QnnContextBinaryMultiPartitionTestBody(single_ep_node);
}

void EpCtxCpuNodeWithExternalIniFileTestBody(bool expect_external_ini_file) {
ProviderOptions provider_options;
#if defined(_WIN32)
provider_options["backend_path"] = "QnnHtp.dll";
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif

const std::unordered_map<std::string, int> domain_to_version = {{"", 13}, {kMSDomain, 1}};

auto& logging_manager = DefaultLoggingManager();
logging_manager.SetDefaultLoggerSeverity(logging::Severity::kERROR);

onnxruntime::Model model("QNN_EP_TestModel", false, ModelMetaData(), PathString(),
IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
logging_manager.DefaultLogger());
Graph& graph = model.MainGraph();
ModelTestBuilder helper(graph);
BuildGraphWithQAndNonQ(true)(helper);
helper.SetGraphOutputs();
ASSERT_STATUS_OK(model.MainGraph().Resolve());
ModelSavingOptions model_saving_options{10};
const std::string model_with_ext = "model_external.onnx";
const std::string model_ext_file = "model_external.bin";
ASSERT_STATUS_OK(Model::SaveWithExternalInitializers(model, model_with_ext,
model_ext_file, model_saving_options));

EXPECT_TRUE(std::filesystem::exists(model_with_ext.c_str()));
EXPECT_TRUE(std::filesystem::exists(model_ext_file.c_str()));

Ort::SessionOptions so;
so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
so.AppendExecutionProvider("QNN", provider_options);
const std::string ep_context_model_file = "./qnn_ctx_part_external_ini_ctx.onnx";
so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, ep_context_model_file.c_str());
const std::string external_ini_file = "./qnn_ctx_part_external_ini.bin";
if (expect_external_ini_file) {
// Setting the external ini file name forces all initializers into the external file
so.AddConfigEntry(kOrtSessionOptionsEpContextModelExternalInitializersFileName, external_ini_file.c_str());
}  // otherwise all initializers stay in the Onnx file and no external data file is generated

Ort::Session session(*ort_env, ToPathString(model_with_ext).c_str(), so);

EXPECT_TRUE(std::filesystem::exists(ep_context_model_file.c_str()));
if (expect_external_ini_file) {
EXPECT_TRUE(std::filesystem::exists(external_ini_file.c_str()));
ASSERT_EQ(std::remove(external_ini_file.c_str()), 0);
} else {
EXPECT_FALSE(std::filesystem::exists(external_ini_file.c_str()));
}

// clean up
ASSERT_EQ(std::remove(model_with_ext.c_str()), 0);
ASSERT_EQ(std::remove(model_ext_file.c_str()), 0);
ASSERT_EQ(std::remove(ep_context_model_file.c_str()), 0);
}

// Specify the external initializers file name so FusedMatMul (which falls back to CPU)
// will dump its initializer data to the external file
TEST_F(QnnHTPBackendTests, QnnContextBinaryCpuNodeWithExternalWeights) {
EpCtxCpuNodeWithExternalIniFileTestBody(true);
}

// Leave the external initializers file name unset so FusedMatMul (which falls back to CPU)
// will NOT dump initializer data to an external file
TEST_F(QnnHTPBackendTests, QnnContextBinaryCpuNodeWithoutExternalWeights) {
EpCtxCpuNodeWithExternalIniFileTestBody(false);
}

// Create a model with Cast + Add (quantized)
// cast_input -> Cast -> Q -> DQ \
// Add -> Q -> DQ -> output
9 changes: 2 additions & 7 deletions onnxruntime/test/python/onnxruntime_test_python.py
@@ -183,7 +183,7 @@ def test_model_serialization_with_original_external_initializers_to_directory(se
so.add_session_config_entry(
"session.optimized_model_external_initializers_file_name", external_initializers_file
)
so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "100")
so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "20")
onnxrt.InferenceSession(get_name("model_with_orig_ext_data.onnx"), sess_options=so)
self.assertTrue(os.path.isfile(so.optimized_model_filepath))
self.assertTrue(os.path.isfile(os.path.join(directory, external_initializers_file)))
@@ -213,14 +213,10 @@ def test_model_serialization_with_original_external_initializers_to_current_dire
"session.optimized_model_external_initializers_file_name", external_initializers_file
)

# TODO(anyone): Set this to 100 will cause test error since some tensor below the threshold
# still refers to the original external data file. We shall fix this issue so that the
# optimized model only refers to one external data file.
so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "10")
so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "100")
session1 = onnxrt.InferenceSession(get_name("model_with_orig_ext_data.onnx"), sess_options=so)
del session1
self.assertTrue(os.path.isfile(optimized_model_filepath))
self.assertTrue(os.path.isfile(external_initializers_file))

so2 = onnxrt.SessionOptions()
so2.log_severity_level = 1
@@ -240,7 +236,6 @@

# Remove model 1 to make sure optimized model 2 can be loaded independently from model 1
os.remove(optimized_model_filepath)
os.remove(external_initializers_file)

session3 = onnxrt.InferenceSession(optimized_model_filepath_2, sess_options=onnxrt.SessionOptions())
del session3
