Revert "[CPU] Avoid shared weights double repacking in latency mode" (#…
Browse files Browse the repository at this point in the history
…25597)

This reverts commit 446b2f0.

### Details:
PR #24386 introduced issues with CPU plugin weights caching, which sporadically affected accuracy in a negative way. This PR reverts the #24386 changes in order to restore stability.

### Tickets:
 - *CVS-146939*
v-Golubev authored Jul 17, 2024
1 parent 9bc7676 commit ddf8241
Showing 7 changed files with 18 additions and 25 deletions.
@@ -49,8 +49,6 @@ class OPENVINO_RUNTIME_API CPUStreamsExecutor : public IStreamsExecutor {

int get_stream_id() override;

- int get_streams_num();

int get_numa_node_id() override;

int get_socket_id() override;
4 changes: 0 additions & 4 deletions src/inference/src/dev/threading/cpu_streams_executor.cpp
@@ -502,10 +502,6 @@ int CPUStreamsExecutor::get_stream_id() {
return stream->_streamId;
}

- int CPUStreamsExecutor::get_streams_num() {
- return _impl->_config.get_streams();
- }

int CPUStreamsExecutor::get_numa_node_id() {
auto stream = _impl->_streams.local();
return stream->_numaNodeId;
4 changes: 3 additions & 1 deletion src/plugins/intel_cpu/src/compiled_model.cpp
@@ -130,11 +130,13 @@ CompiledModel::GraphGuard::Lock CompiledModel::get_graph() const {
GraphContext::Ptr ctx;
{
std::lock_guard<std::mutex> lock{*m_mutex.get()};
+ // disable weights caching if graph was created only once
+ auto weightsCache = m_cfg.streamExecutorConfig.get_streams() != 1 ? m_socketWeights[socketId] : nullptr;
auto isQuantizedFlag =
(m_cfg.lpTransformsMode == Config::On) &&
ov::pass::low_precision::LowPrecision::isFunctionQuantized(m_model);

- ctx = std::make_shared<GraphContext>(m_cfg, m_socketWeights[socketId], isQuantizedFlag, streamsExecutor);
+ ctx = std::make_shared<GraphContext>(m_cfg, weightsCache, isQuantizedFlag, streamsExecutor);
}
const std::shared_ptr<const ov::Model> model = m_model;
graphLock._graph.CreateGraph(model, ctx);
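For readers skimming the diff: the restored logic passes a null weights cache when the executor is configured with a single stream, matching the restored comment "disable weights caching if graph was created only once". A minimal sketch of that rule under illustrative names (`WeightsCachePtr` and `select_weights_cache` are not the plugin's actual API):

```cpp
#include <memory>

// Illustrative stand-ins for the plugin's weights-sharing types.
struct WeightsSharing {};
using WeightsCachePtr = std::shared_ptr<WeightsSharing>;

// Restored rule (sketch): with a single stream the graph is built only once,
// so a shared per-socket weights cache offers no reuse and is disabled.
WeightsCachePtr select_weights_cache(int num_streams, const WeightsCachePtr& socket_cache) {
    return num_streams != 1 ? socket_cache : nullptr;
}
```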
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/src/node.cpp
@@ -895,7 +895,7 @@ MemoryPtr Node::prepareWeightMemory(DnnlMemoryDescPtr dstWeightDesc, DnnlMemoryD
MemoryPtr ptr;
const auto& format = dstWeightDesc->serializeFormat();

- OPENVINO_ASSERT(privateWeightCache, "privateWeightCache is nullptr");
+ assert(privateWeightCache);

auto itr = privateWeightCache->find(format);
if (privateWeightCache->end() != itr) {
@@ -32,7 +32,6 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr srcWeightDesc,
const auto& format = dstWeightDesc->serializeFormat();

const auto privateWeightCache = context->getPrivateWeighCache();
- OPENVINO_ASSERT(privateWeightCache, "privateWeightCache is nullptr");
if (privateWeightCache) {
auto itr = privateWeightCache->find(format);
if (privateWeightCache->end() != itr) {
@@ -87,7 +86,7 @@ MemoryPtr prepareWeightsMemory(const DnnlMemoryDescPtr srcWeightDesc,
if (globalWeightCache &&
dnnl::memory::format_kind::blocked == dstWeightDesc->getDnnlDesc().get_format_kind()) {
const std::string string_hash = format + "_" + std::to_string(weightsMem->getSize()) + "_" +
- std::to_string(reinterpret_cast<uint64_t>(weightsMem->getData()));
+ std::to_string(*weightsMem->getDataAs<uint64_t>());
ptr = *globalWeightCache->findOrCreate(string_hash, create);
} else {
ptr = create();
@@ -51,7 +51,7 @@ static MemoryPtr prepareWeightMemory(const MemoryPtr weightsMemory,
if (weightCache != nullptr) {
std::string format = "gemm_mlas_" + std::to_string(N) + "_" + std::to_string(K);
const std::string string_hash = format + "_" + std::to_string(weightsMemory->getSize()) + "_" +
- std::to_string(reinterpret_cast<uint64_t>(weightsMemory->getData()));
+ std::to_string(*weightsMemory->getDataAs<uint64_t>());
DEBUG_LOG("MlasGemmExecutor: findOrCreate, string_hash: ", string_hash);
return *weightCache->findOrCreate(string_hash, create);
}
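Both weight-preparation hunks above (the DNNL `prepareWeightsMemory` helper and the MLAS GEMM executor) switch the global cache key back from the blob's buffer address to its first eight bytes of payload. A rough sketch of the restored key scheme, with an assumed helper name (`make_weights_cache_key` is not part of the plugin):

```cpp
#include <cstdint>
#include <cstring>
#include <string>

// Sketch: the key combines the serialized target format, the blob size and the
// first 8 bytes of the weight data, so identical constants repacked to the same
// layout map to one cache entry regardless of where the buffer was allocated.
std::string make_weights_cache_key(const std::string& format,
                                   std::size_t size_in_bytes,
                                   const void* data) {
    std::uint64_t first_word = 0;                     // assumes the blob holds at least 8 bytes
    std::memcpy(&first_word, data, sizeof(first_word));
    return format + "_" + std::to_string(size_in_bytes) + "_" + std::to_string(first_word);
}
```

The reverted variant hashed the buffer address (`reinterpret_cast<uint64_t>(...getData())`) instead, which ties the key to a particular allocation rather than to the weight contents.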
26 changes: 12 additions & 14 deletions src/plugins/intel_cpu/src/nodes/input.cpp
@@ -373,20 +373,18 @@ void Input::cloneBlobIfRequired() {
+ "_" + ptr;
};

- const auto weightCache = context->getWeightsCache();
- const bool clone_is_not_needed =
- prec != element::string && !isWA() &&
- // IRs already have all subnormals flushed to zero, but in
- // read_model scenario with directly loaded original model still can have subnormals
- isBlobAligned() && (!needFlushDenormalsToZero || !hasSubnormals()) &&
- // Blob should be cloned in cache only if original weights are stored on other numa node.
- // This is possible only in multistream case on multisocket machine.
- // TODO: don't clone blob for multisocket + multistream case if current stream is run on the numa node where original weights are stored.
- (!weightCache || context->getNumNumaNodes() == 1 || context->getCPUStreamExecutor()->get_streams_num() == 1);
-
- memoryPtr = clone_is_not_needed ? std::make_shared<Memory>(getEngine(), memDesc, constOp->get_data_ptr())
- : std::const_pointer_cast<const IMemory>(
- weightCache ? *weightCache->findOrCreate(blobKey(), cloneBlob) : cloneBlob());
+ auto weightCache = context->getWeightsCache();
+
+ if (weightCache) {
+ MemoryPtr ptr = *weightCache->findOrCreate(blobKey(), cloneBlob);
+ memoryPtr = std::const_pointer_cast<const IMemory>(ptr);
+ // IRs already have all subnormals flushed to zero, but in
+ // read_model scenario with directly loaded original model still can have subnormals
+ } else if (prec != element::string && isBlobAligned() && (!needFlushDenormalsToZero || !hasSubnormals()) && !isWA()) {
+ memoryPtr = std::make_shared<Memory>(getEngine(), memDesc, constOp->get_data_ptr());
+ } else {
+ memoryPtr = std::const_pointer_cast<const IMemory>(cloneBlob());
+ }
}

static std::vector<Shape> createInputShapes(const Shape& shape,
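The restored `cloneBlobIfRequired` path funnels cloning through `weightCache->findOrCreate(blobKey(), cloneBlob)`, so streams asking for the same constant end up sharing a single clone. A simplified sketch of a findOrCreate-style cache under assumed names (`SimpleWeightsCache` is illustrative; the call sites above additionally dereference the returned handle with `*`, which this sketch omits):

```cpp
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <string>

struct IMemory {};
using MemoryCPtr = std::shared_ptr<const IMemory>;

// Illustrative findOrCreate-style cache: the creator runs only on a miss, so
// requests for the same key share one cloned weights blob.
class SimpleWeightsCache {
public:
    MemoryCPtr findOrCreate(const std::string& key, const std::function<MemoryCPtr()>& create) {
        std::lock_guard<std::mutex> guard(m_mutex);
        auto it = m_entries.find(key);
        if (it == m_entries.end()) {
            it = m_entries.emplace(key, create()).first;  // clone happens only on a miss
        }
        return it->second;
    }

private:
    std::mutex m_mutex;
    std::map<std::string, MemoryCPtr> m_entries;
};
```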
