Revert "[CPU] Avoid shared weights double repacking in latency mode" (#…
Browse files Browse the repository at this point in the history
…25597)

This reverts commit 446b2f0.

### Details:
PR #24386 introduced issues with CPU plugin weights caching, which sporadically affected accuracy in a negative way. This PR reverts the #24386 changes in order to restore stability.

### Tickets:
 - *CVS-146939*
v-Golubev authored Jul 17, 2024
1 parent 9bc7676 commit ddf8241
Showing 7 changed files with 18 additions and 25 deletions.
@@ -49,8 +49,6 @@ class OPENVINO_RUNTIME_API CPUStreamsExecutor : public IStreamsExecutor {

int get_stream_id() override;

- int get_streams_num();

int get_numa_node_id() override;

int get_socket_id() override;
4 changes: 0 additions & 4 deletions src/inference/src/dev/threading/cpu_streams_executor.cpp
@@ -502,10 +502,6 @@ int CPUStreamsExecutor::get_stream_id() {
return stream->_streamId;
}

- int CPUStreamsExecutor::get_streams_num() {
- return _impl->_config.get_streams();
- }

int CPUStreamsExecutor::get_numa_node_id() {
auto stream = _impl->_streams.local();
return stream->_numaNodeId;
4 changes: 3 additions & 1 deletion src/plugins/intel_cpu/src/compiled_model.cpp
@@ -130,11 +130,13 @@ CompiledModel::GraphGuard::Lock CompiledModel::get_graph() const {
GraphContext::Ptr ctx;
{
std::lock_guard<std::mutex> lock{*m_mutex.get()};
+ // disable weights caching if graph was created only once
+ auto weightsCache = m_cfg.streamExecutorConfig.get_streams() != 1 ? m_socketWeights[socketId] : nullptr;
auto isQuantizedFlag =
(m_cfg.lpTransformsMode == Config::On) &&
ov::pass::low_precision::LowPrecision::isFunctionQuantized(m_model);

- ctx = std::make_shared<GraphContext>(m_cfg, m_socketWeights[socketId], isQuantizedFlag, streamsExecutor);
+ ctx = std::make_shared<GraphContext>(m_cfg, weightsCache, isQuantizedFlag, streamsExecutor);
}
const std::shared_ptr<const ov::Model> model = m_model;
graphLock._graph.CreateGraph(model, ctx);
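For readers skimming the diff: the restored logic passes a null weights cache when the executor is configured with a single stream, matching the restored comment "disable weights caching if graph was created only once". A minimal sketch of that rule under illustrative names (`WeightsCachePtr` and `select_weights_cache` are not the plugin's actual API):

```cpp
#include <memory>

// Illustrative stand-ins for the plugin's weights-sharing types.
struct WeightsSharing {};
using WeightsCachePtr = std::shared_ptr<WeightsSharing>;

// Restored rule (sketch): with a single stream the graph is built only once,
// so a shared per-socket weights cache offers no reuse and is disabled.
WeightsCachePtr select_weights_cache(int num_streams, const WeightsCachePtr& socket_cache) {
    return num_streams != 1 ? socket_cache : nullptr;
}
```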
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/src/node.cpp
@@ -895,7 +895,7 @@ MemoryPtr Node::prepareWeightMemory(DnnlMemoryDescPtr dstWeightDesc, DnnlMemoryD
MemoryPtr ptr;
const auto& format = dstWeightDesc->serializeFormat();

- OPENVINO_ASSERT(privateWeightCache, "privateWeightCache is nullptr");
+ assert(privateWeightCache);

auto itr = privateWeightCache->find(format);
if (privateWeightCache->end() != itr) {
@@ -32,7 +32,6 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr srcWeightDesc,
const auto& format = dstWeightDesc->serializeFormat();

const auto privateWeightCache = context->getPrivateWeighCache();
- OPENVINO_ASSERT(privateWeightCache, "privateWeightCache is nullptr");
if (privateWeightCache) {
auto itr = privateWeightCache->find(format);
if (privateWeightCache->end() != itr) {
@@ -87,7 +86,7 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr srcWeightDesc,
if (globalWeightCache &&
dnnl::memory::format_kind::blocked == dstWeightDesc->getDnnlDesc().get_format_kind()) {
const std::string string_hash = format + "_" + std::to_string(weightsMem->getSize()) + "_" +
- std::to_string(reinterpret_cast<uint64_t>(weightsMem->getData()));
+ std::to_string(*weightsMem->getDataAs<uint64_t>());
ptr = *globalWeightCache->findOrCreate(string_hash, create);
} else {
ptr = create();
@@ -51,7 +51,7 @@ static MemoryPtr prepareWeightMemory(const MemoryPtr weightsMemory,
if (weightCache != nullptr) {
std::string format = "gemm_mlas_" + std::to_string(N) + "_" + std::to_string(K);
const std::string string_hash = format + "_" + std::to_string(weightsMemory->getSize()) + "_" +
- std::to_string(reinterpret_cast<uint64_t>(weightsMemory->getData()));
+ std::to_string(*weightsMemory->getDataAs<uint64_t>());
DEBUG_LOG("MlasGemmExecutor: findOrCreate, string_hash: ", string_hash);
return *weightCache->findOrCreate(string_hash, create);
}
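Both weight-preparation hunks above (the DNNL `prepareWeightsMemory` helper and the MLAS GEMM executor) switch the global cache key back from the blob's buffer address to its first eight bytes of payload. A rough sketch of the restored key scheme, with an assumed helper name (`make_weights_cache_key` is not part of the plugin):

```cpp
#include <cstdint>
#include <cstring>
#include <string>

// Sketch: the key combines the serialized target format, the blob size and the
// first 8 bytes of the weight data, so identical constants repacked to the same
// layout map to one cache entry regardless of where the buffer was allocated.
std::string make_weights_cache_key(const std::string& format,
                                   std::size_t size_in_bytes,
                                   const void* data) {
    std::uint64_t first_word = 0;                     // assumes the blob holds at least 8 bytes
    std::memcpy(&first_word, data, sizeof(first_word));
    return format + "_" + std::to_string(size_in_bytes) + "_" + std::to_string(first_word);
}
```

The reverted variant hashed the buffer address (`reinterpret_cast<uint64_t>(...getData())`) instead, which ties the key to a particular allocation rather than to the weight contents.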
26 changes: 12 additions & 14 deletions src/plugins/intel_cpu/src/nodes/input.cpp
@@ -373,20 +373,18 @@ void Input::cloneBlobIfRequired() {
+ "_" + ptr;
};

- const auto weightCache = context->getWeightsCache();
- const bool clone_is_not_needed =
- prec != element::string && !isWA() &&
- // IRs already have all subnormals flushed to zero, but in
- // read_model scenario with directly loaded original model still can have subnormals
- isBlobAligned() && (!needFlushDenormalsToZero || !hasSubnormals()) &&
- // Blob should be cloned in cache only if original weights are stored on other numa node.
- // This is possible only in multistream case on multisocket machine.
- // TODO: don't clone blob for multisocket + multistream case if current stream is run on the numa node where original weights are stored.
- (!weightCache || context->getNumNumaNodes() == 1 || context->getCPUStreamExecutor()->get_streams_num() == 1);
-
- memoryPtr = clone_is_not_needed ? std::make_shared<Memory>(getEngine(), memDesc, constOp->get_data_ptr())
- : std::const_pointer_cast<const IMemory>(
- weightCache ? *weightCache->findOrCreate(blobKey(), cloneBlob) : cloneBlob());
+ auto weightCache = context->getWeightsCache();
+
+ if (weightCache) {
+ MemoryPtr ptr = *weightCache->findOrCreate(blobKey(), cloneBlob);
+ memoryPtr = std::const_pointer_cast<const IMemory>(ptr);
+ // IRs already have all subnormals flushed to zero, but in
+ // read_model scenario with directly loaded original model still can have subnormals
+ } else if (prec != element::string && isBlobAligned() && (!needFlushDenormalsToZero || !hasSubnormals()) && !isWA()) {
+ memoryPtr = std::make_shared<Memory>(getEngine(), memDesc, constOp->get_data_ptr());
+ } else {
+ memoryPtr = std::const_pointer_cast<const IMemory>(cloneBlob());
+ }
}

static std::vector<Shape> createInputShapes(const Shape& shape,
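The restored `cloneBlobIfRequired` path funnels cloning through `weightCache->findOrCreate(blobKey(), cloneBlob)`, so streams asking for the same constant end up sharing a single clone. A simplified sketch of a findOrCreate-style cache under assumed names (`SimpleWeightsCache` is illustrative; the call sites above additionally dereference the returned handle with `*`, which this sketch omits):

```cpp
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <string>

struct IMemory {};
using MemoryCPtr = std::shared_ptr<const IMemory>;

// Illustrative findOrCreate-style cache: the creator runs only on a miss, so
// requests for the same key share one cloned weights blob.
class SimpleWeightsCache {
public:
    MemoryCPtr findOrCreate(const std::string& key, const std::function<MemoryCPtr()>& create) {
        std::lock_guard<std::mutex> guard(m_mutex);
        auto it = m_entries.find(key);
        if (it == m_entries.end()) {
            it = m_entries.emplace(key, create()).first;  // clone happens only on a miss
        }
        return it->second;
    }

private:
    std::mutex m_mutex;
    std::map<std::string, MemoryCPtr> m_entries;
};
```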
