This repository has been archived by the owner on Nov 17, 2023. It is now read-only.
[MXNET-497] fix bugs in MKLDNN operators to handle the kAddTo request #11129
Merged · +140 −61 (changes shown from 54 of 69 commits)

Commits (all by azai91):
31fdc8b  fix lint
b644e02  requests added to opattr
612f64f  comment out addto
2c4b41e  can invalidate kAddTo request mkldarrays
2dc646a  revert adding kAddTo to invalidate
5278ef9  use copy of output instead of creating new array
3adcd8d  convert output to default if fallback
2489b86  do not make copy when init
c7e64f3  copyex fallback copies to old array with kAddTo
67001ce  change input mem desc to output mem desc if not equal
5a75f53  reorder memory in commitoutput
f5b63fc  allocate temp memory
4d52987  fix var names
6b62e97  create helper reorder function to handle diff format/shapes
9da3655  fix typos
c0c38ca  fix typos
2338046  remove unused code
f974c3c  fix param
918a864  fix header files
50fc6ca  force input memory to output
a9915be  reorder2default keeps pointer to mkldnn memory
630c091  pass reference
aa6c406  remove extra lines
75c5160  do not get raw mem from ptr
f65ea9c  remove isView check
3483f28  fallback writes back to output
0428e0f  remove redundant line
1cdd60c  remove commented out code
c9e8f85  use fallback in copy (refactor)
996d0ef  remove unused header
4532209  fix lint
410c491  reorder2default only if mkldnn flag
2efdc3b  only reorder if mkldnn
dc3cd8d  does not assume 1 output
ad66611  sum compares input and output shape
860fa21  compare address and pd in sum
a727eea  refactor mkldnnsum
c76aee3  fix const param
64422aa  fix header
ac2b3a1  Merge branch 'master' into test-kAddTo
bb10946  improve control flow when setting output blob
ad31578  fix merge
0e03c96  remove kaddto comment
6ef7b87  add reqests to operators
90c9acb  fix spacing
7d0f275  do sum in place
3edf492  fix conditionals
5c20e46  remove redundant reqs
cd70dac  use wait to read all
0972ffa  fix lint
637c76a  create multiple outputs
5718651  create multiple copies for kaddto
d91df93  retrigger
993c7aa  retriggrer
e7d18be  merge
e2a464d  retrigger
dc742c8  retrigger
92c50f0  another retrigger
eb97f3d  Merge branch 'master' into test-kAddTo
113903a  retrigger
ecbde64  retrigger
be84769  another another retrigger
5181420  Merge branch 'master' into test-kAddTo
0731a58  merge
ad3c70e  fix merge
2874d0a  retrigger
0e249f7  merge
581495f  add kAddto to relu op
9e7c22e  retrigger
@@ -98,10 +98,21 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
     is_default = nd.IsDefaultData();
 #endif
     if (!is_default) {
-      NDArray temp = bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(),
-                                                             true, nd.dtype());
+#if MXNET_USE_MKLDNN == 1
+      NDArray temp;
+      if (bufs != nullptr) {
+        temp = bufs->at(i);
+      } else if (kAddTo == req->at(i) && nd.IsMKLDNNData()) {
+        temp = nd.Reorder2Default();
+      } else if (kAddTo == req->at(i)) {
+        temp = nd;
+      } else {
+        temp = NDArray(nd.shape(), nd.ctx(), true, nd.dtype());
+      }
+      CHECK(temp.IsDefaultData());
+#else
+      NDArray temp = bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(),
+                                                             true, nd.dtype());
+#endif
       temp_src->emplace_back(nd);
       temp_dst->emplace_back(temp);

Review comment on the #else branch: indent.
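For context on why the kAddTo branches above reuse the output array itself (reordered to the default layout when it is MKLDNN-formatted) instead of freshly allocated scratch memory: kAddTo asks an operator to accumulate into the existing output, so a temporary that does not start from the output's current contents would silently drop the accumulated values. Below is a minimal standalone sketch of the request semantics; it is illustrative, not MXNet's actual dispatch code, and ApplyReq is a hypothetical name.

#include <cassert>

// Mirrors MXNet's OpReqType: how an operator must write its output.
enum OpReqType { kNullOp, kWriteTo, kWriteInplace, kAddTo };

// Apply one computed element to the output slot according to the request.
void ApplyReq(OpReqType req, float computed, float *out) {
  switch (req) {
    case kNullOp: break;                         // output ignored entirely
    case kWriteTo:                               // plain overwrite
    case kWriteInplace: *out = computed; break;  // overwrite; output aliases an input
    case kAddTo: *out += computed; break;        // accumulate into existing contents
  }
}

int main() {
  float out = 1.0f;
  ApplyReq(kAddTo, 2.5f, &out);
  assert(out == 3.5f);  // the old value survives: kAddTo must read the output
  return 0;
}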
@@ -22,6 +22,8 @@
 #include <atomic>
 #include "./mkldnn_base-inl.h"
 #include "./mkldnn_ops-inl.h"
+#include "../../../common/exec_utils.h"
+
 
 namespace mxnet {
@@ -77,6 +79,75 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) {
   }
 }
 
+void MKLDNNCopy(const mkldnn::memory &mem, const mkldnn::memory* this_mem) {
+  MKLDNNStream *stream = MKLDNNStream::Get();
+
+  mkldnn::memory::primitive_desc from_pd = mem.get_primitive_desc();
+  mkldnn::memory::desc from_desc = from_pd.desc();
+  mkldnn::memory::primitive_desc this_pd = this_mem->get_primitive_desc();
+  mkldnn::memory::desc this_desc = this_pd.desc();
+  mkldnn_memory_format_t from_def_format = GetDefaultFormat(from_desc);
+  mkldnn_memory_format_t this_def_format = GetDefaultFormat(this_desc);
+  // It's possible that the memory and the NDArray don't have the same shape.
+  if (!same_shape(this_desc, from_desc)
+      // If the source memory uses the default layout, we can reshape directly.
+      && from_def_format == from_desc.data.format) {
+    // In this case, we can simply create a new MKLDNN memory for the required
+    // shape.
+    mkldnn::memory::dims dims(this_desc.data.dims,
+                              this_desc.data.dims + this_desc.data.ndims);
+    auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
+    auto this_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc));
+    mkldnn::memory::desc data_md(dims, this_dtype, this_format);
+    mkldnn::memory::primitive_desc pd(data_md, from_pd.get_engine());
+    mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
+    stream->RegisterMem(tmp_mem);
+    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *this_mem));
+  } else if (!same_shape(this_desc, from_desc)) {
+    // In this case, the source memory stores data in a customized layout. We
+    // need to reorganize the data in memory before we can reshape.
+    mkldnn::memory::primitive_desc def_pd = GetPrimitiveDesc(from_pd, from_def_format);
+    mkldnn::memory *def_mem = TmpMemMgr::Get()->Alloc(def_pd);
+    stream->RegisterPrim(mkldnn::reorder(mem, *def_mem));
+    // Now we can reshape it
+    mkldnn::memory::dims dims(this_desc.data.dims,
+                              this_desc.data.dims + this_desc.data.ndims);
+    auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
+    auto this_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc));
+    mkldnn::memory::desc data_md(dims, this_dtype, this_format);
+    mkldnn::memory::primitive_desc pd(data_md, from_pd.get_engine());
+    mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, def_mem->get_data_handle()));
+    stream->RegisterMem(tmp_mem);
+    stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *this_mem));
+  } else if (from_pd == this_pd) {
+    // If the layout is the same, we can just copy data.
+    stream->RegisterPrim(mkldnn::reorder(mem, *this_mem));
+  } else {
+    // If both are not using the default layouts. There isn't much we can do,
+    // other than reorder data layout directly.
+    if (this_def_format != this_desc.data.format
+        && from_def_format != from_desc.data.format) {
+      stream->RegisterPrim(mkldnn::reorder(mem, *this_mem));
+    } else if (this_def_format == this_desc.data.format) {
+      // If the dest mem uses the default memory layout, we can simply use
+      // the default format of the source memory to improve perf of reorder.
+      mkldnn::memory::primitive_desc pd = GetPrimitiveDesc(from_pd,
+                                                           from_def_format);
+      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, this_mem->get_data_handle()));
+      stream->RegisterMem(tmp_mem);
+      stream->RegisterPrim(mkldnn::reorder(mem, *tmp_mem));
+    } else {
+      // If the src mem uses the default memory layout, we can use
+      // the default format of the source memory to improve perf.
+      mkldnn::memory::primitive_desc pd = GetPrimitiveDesc(this_pd,
+                                                           this_def_format);
+      mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle()));
+      stream->RegisterMem(tmp_mem);
+      stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *this_mem));
+    }
+  }
+}
+
 bool CanWriteTo(const NDArray &out_arr,
                 const NDArray &in_arr,
                 const mkldnn::memory::primitive_desc &desc) {
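The helper above dispatches on two questions: do the source and destination shapes match, and which side already uses the default (plain) layout? A dependency-free sketch of just that case analysis follows; CopyStrategy and ChooseStrategy are hypothetical names for illustration, not MXNet or MKL-DNN API.

#include <cstdio>

// Illustrative only: the four copy strategies MKLDNNCopy chooses between.
enum class CopyStrategy {
  ReshapeThenReorder,  // shapes differ, source already in the default layout
  ReorderThenReshape,  // shapes differ, source in a custom (blocked) layout
  DirectCopy,          // identical primitive descriptors: plain reorder-as-copy
  LayoutReorder        // same shape, different layouts: reorder via a view
};

// Mirrors the branch order of MKLDNNCopy above.
CopyStrategy ChooseStrategy(bool same_shape, bool src_is_default, bool same_pd) {
  if (!same_shape && src_is_default) return CopyStrategy::ReshapeThenReorder;
  if (!same_shape) return CopyStrategy::ReorderThenReshape;
  if (same_pd) return CopyStrategy::DirectCopy;
  return CopyStrategy::LayoutReorder;
}

int main() {
  // Same shape, same primitive descriptor: a direct copy suffices.
  bool same_shape = true, src_is_default = true, same_pd = true;
  std::printf("strategy = %d\n",
              static_cast<int>(ChooseStrategy(same_shape, src_is_default, same_pd)));
  return 0;
}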
@@ -141,13 +212,16 @@ void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) {
   if (res.first == CopyBack) {
     const_cast<NDArray &>(arr).CopyFrom(*res.second);
   } else if (res.first == AddBack) {
+    auto res_memory = res.second;
+    auto target_pd = arr.GetMKLDNNData()->get_primitive_desc();
     auto mem = arr.GetMKLDNNData(res.second->get_primitive_desc());
-    CHECK(mem != nullptr);
-    // We have to allocate new memory for the sum result.
-    auto sum_res = TmpMemMgr::Get()->Alloc(
-        res.second->get_primitive_desc());
-    op::MKLDNNSum(*res.second, *mem, *sum_res);
-    const_cast<NDArray &>(arr).CopyFrom(*sum_res);
+    if (mem == nullptr) {
+      auto tmp_memory = TmpMemMgr::Get()->Alloc(target_pd);
+      MKLDNNCopy(*res_memory, tmp_memory);
+      res_memory = tmp_memory;
+      mem = arr.GetMKLDNNData();
+    }
+    op::MKLDNNSum(*mem, *res_memory, *mem);
   }
 }

Review comment: As my understanding, in line 224 we use res_memory and add it with mem.
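The rewritten AddBack path drops the old copy-back: previously the code summed into a scratch buffer (sum_res) and copied that into the array, while now it reorders the partial result into the output's layout only when the primitive descriptors differ and then sums in place, with the output acting as both a summand and the destination of MKLDNNSum. A toy sketch of that accumulation pattern using plain vectors rather than mkldnn::memory:

#include <cassert>
#include <vector>

// Toy stand-in for op::MKLDNNSum(*mem, *res_memory, *mem): the output is both
// a summand and the destination, so no scratch result buffer is needed.
void AddBack(std::vector<float> *out, const std::vector<float> &res) {
  assert(out->size() == res.size());
  for (size_t i = 0; i < out->size(); ++i)
    (*out)[i] += res[i];  // accumulate in place
}

int main() {
  std::vector<float> out = {1.f, 2.f, 3.f};     // existing output contents
  std::vector<float> res = {10.f, 10.f, 10.f};  // operator's partial result
  AddBack(&out, res);
  assert(out[0] == 11.f && out[1] == 12.f && out[2] == 13.f);
  return 0;
}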
@@ -317,18 +391,28 @@ void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs,
   MKLDNNStream::Get()->Submit();
 
   std::vector<TBlob> out_blobs(outputs.size());
+  std::vector<NDArray> temp_src, temp_dst;
   for (size_t i = 0; i < out_blobs.size(); i++) {
     NDArray output = outputs[i];
     // ensure output does not use mkldnn mem.
     // for inplace, we already converted & copied input above.
-    if ((req[i] == kWriteTo) || (req[i] == kWriteInplace))
+    if ((req[i] == kWriteTo) || (req[i] == kWriteInplace)) {
       const_cast<NDArray &>(output).InvalidateMKLDNNData();
-    else if (req[i] == kAddTo)
-      output = outputs[i].Reorder2Default();
+    } else if (req[i] == kAddTo && output.IsMKLDNNData()) {
+      NDArray temp = outputs[i].Reorder2Default();
+      temp_src.emplace_back(temp);
+      temp_dst.emplace_back(outputs[i]);
+      output = temp;
+    }
     CHECK(output.IsDefaultData());
     out_blobs[i] = output.data();
   }
 
   fn(attrs, ctx, in_blobs, req, out_blobs);
+  for (size_t i = 0; i < out_blobs.size(); i++) {
+    if (req[i] == kAddTo && outputs[i].IsMKLDNNData())
+      mxnet::common::CastNonDefaultStorage(temp_src, temp_dst, ctx, false);
+  }
 }
 
 template<typename DType>
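For a kAddTo output held in an MKL-DNN layout, the fallback now round-trips: Reorder2Default produces a default-layout copy that still holds the output's current values, the generic FCompute accumulates into that copy, and CastNonDefaultStorage writes the copy back into the MKLDNN array. A simplified sketch of that round-trip, with ToDefaultLayout and WriteBack as hypothetical stand-ins for those two calls:

#include <cassert>
#include <vector>

// Hypothetical stand-ins for Reorder2Default and CastNonDefaultStorage: with
// plain vectors the "layout conversion" is just a copy.
std::vector<float> ToDefaultLayout(const std::vector<float> &opaque) { return opaque; }
void WriteBack(const std::vector<float> &plain, std::vector<float> *opaque) { *opaque = plain; }

// Fallback for a kAddTo output held in an opaque layout: convert to a plain
// copy, let the generic kernel accumulate into it, then write the copy back.
void FallbackAddTo(std::vector<float> *output, const std::vector<float> &result) {
  std::vector<float> temp = ToDefaultLayout(*output);  // the temp_src analogue
  for (size_t i = 0; i < temp.size(); ++i)
    temp[i] += result[i];                              // fn(...) honoring kAddTo
  WriteBack(temp, output);                             // the temp_dst write-back
}

int main() {
  std::vector<float> output = {1.f, 1.f};
  FallbackAddTo(&output, {4.f, 5.f});
  assert(output[0] == 5.f && output[1] == 6.f);
  return 0;
}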
Review comment: Sparse arrays don't have kAddTo? @eric-haibin-lin

Reply: Yes. The executor won't generate kAddTo for sparse outputs; sparse operators don't support it.