Merge pull request #95 from Tencent/master

20200325
Tencent · Mar 25, 2020 · 6e1547e · 6e1547e
2 parents 8c10ddb + 9ce0ad7
commit 6e1547e
Show file tree

Hide file tree

Showing 11 changed files with 1,294 additions and 70 deletions.
diff --git a/src/layer/arm/binaryop_arm.cpp b/src/layer/arm/binaryop_arm.cpp
diff --git a/src/layer/arm/binaryop_arm.h b/src/layer/arm/binaryop_arm.h
@@ -27,6 +27,11 @@ class BinaryOp_arm : virtual public BinaryOp
     virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
 
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+protected:
+    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
+
 };
 
 } // namespace ncnn

diff --git a/src/layer/arm/hardsigmoid_arm.cpp b/src/layer/arm/hardsigmoid_arm.cpp
@@ -27,20 +27,22 @@ HardSigmoid_arm::HardSigmoid_arm()
 #if __ARM_NEON
     support_packing = true;
 #endif // __ARM_NEON
+
+    support_bf16_storage = true;
 }
 
 int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
+    if (opt.use_bf16_storage)
+        return forward_inplace_bf16s(bottom_top_blob, opt);
+
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
     int elempack = bottom_top_blob.elempack;
 
 #if __ARM_NEON
-    if (opt.use_packing_layout)
-    {
-
     if (elempack == 4)
     {
         #pragma omp parallel for num_threads(opt.num_threads)
@@ -65,8 +67,6 @@ int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
 
         return 0;
     }
-
-    } // opt.use_packing_layout
 #endif // __ARM_NEON
 
     #pragma omp parallel for num_threads(opt.num_threads)
@@ -111,4 +111,83 @@ int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
     return 0;
 }
 
+int HardSigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h;
+    int elempack = bottom_top_blob.elempack;
+
+#if __ARM_NEON
+    if (elempack == 4)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q=0; q<channels; q++)
+        {
+            unsigned short* ptr = bottom_top_blob.channel(q);
+
+            float32x4_t _zero = vdupq_n_f32(0.f);
+            float32x4_t _one = vdupq_n_f32(1.f);
+            for (int i=0; i<size; i++)
+            {
+                float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
+                float32x4_t _ans = vdupq_n_f32(beta);
+                _ans = vmlaq_n_f32(_ans, _p, alpha);
+                _ans = vmaxq_f32(_ans, _zero);
+                _ans = vminq_f32(_ans, _one);
+                vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_ans), 16));
+
+                ptr += 4;
+            }
+        }
+
+        return 0;
+    }
+#endif // __ARM_NEON
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q=0; q<channels; q++)
+    {
+        unsigned short* ptr = bottom_top_blob.channel(q);
+
+#if __ARM_NEON
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+        float32x4_t _zero = vdupq_n_f32(0.f);
+        float32x4_t _one = vdupq_n_f32(1.f);
+        while (nn--)
+        {
+            float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
+            float32x4_t _ans = vdupq_n_f32(beta);
+            _ans = vmlaq_n_f32(_ans, _p, alpha);
+            _ans = vmaxq_f32(_ans, _zero);
+            _ans = vminq_f32(_ans, _one);
+            vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_ans), 16));
+
+            ptr += 4;
+        }
+#endif // __ARM_NEON
+        for (; remain>0; remain--)
+        {
+            float v = bfloat16_to_float32(*ptr);
+            if (v < lower)
+                v = 0.f;
+            else if (v > upper)
+                v = 1.f;
+            else
+                v = v * alpha + beta;
+            *ptr = float32_to_bfloat16(v);
+            ++ptr;
+        }
+    }
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/arm/hardsigmoid_arm.h b/src/layer/arm/hardsigmoid_arm.h
@@ -25,6 +25,9 @@ class HardSigmoid_arm : virtual public HardSigmoid
     HardSigmoid_arm();
 
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+protected:
+    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn

diff --git a/src/layer/arm/hardswish_arm.cpp b/src/layer/arm/hardswish_arm.cpp
@@ -27,20 +27,22 @@ HardSwish_arm::HardSwish_arm()
 #if __ARM_NEON
     support_packing = true;
 #endif // __ARM_NEON
+
+    support_bf16_storage = true;
 }
 
 int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
+    if (opt.use_bf16_storage)
+        return forward_inplace_bf16s(bottom_top_blob, opt);
+
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
     int size = w * h;
     int elempack = bottom_top_blob.elempack;
 
 #if __ARM_NEON
-    if (opt.use_packing_layout)
-    {
-
     if (elempack == 4)
     {
         #pragma omp parallel for num_threads(opt.num_threads)
@@ -66,8 +68,6 @@ int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons
 
         return 0;
     }
-
-    } // opt.use_packing_layout
 #endif // __ARM_NEON
 
     #pragma omp parallel for num_threads(opt.num_threads)
@@ -113,4 +113,85 @@ int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons
     return 0;
 }
 
+int HardSwish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h;
+    int elempack = bottom_top_blob.elempack;
+
+#if __ARM_NEON
+    if (elempack == 4)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q=0; q<channels; q++)
+        {
+            unsigned short* ptr = bottom_top_blob.channel(q);
+
+            float32x4_t _zero = vdupq_n_f32(0.f);
+            float32x4_t _one = vdupq_n_f32(1.f);
+            for (int i=0; i<size; i++)
+            {
+                float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
+                float32x4_t _ans = vdupq_n_f32(beta);
+                _ans = vmlaq_n_f32(_ans, _p, alpha);
+                _ans = vmaxq_f32(_ans, _zero);
+                _ans = vminq_f32(_ans, _one);
+                _ans = vmulq_f32(_ans, _p);
+                vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_ans), 16));
+
+                ptr += 4;
+            }
+        }
+
+        return 0;
+    }
+#endif // __ARM_NEON
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q=0; q<channels; q++)
+    {
+        unsigned short* ptr = bottom_top_blob.channel(q);
+
+#if __ARM_NEON
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+        float32x4_t _zero = vdupq_n_f32(0.f);
+        float32x4_t _one = vdupq_n_f32(1.f);
+        while (nn--)
+        {
+            float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
+            float32x4_t _ans = vdupq_n_f32(beta);
+            _ans = vmlaq_n_f32(_ans, _p, alpha);
+            _ans = vmaxq_f32(_ans, _zero);
+            _ans = vminq_f32(_ans, _one);
+            _ans = vmulq_f32(_ans, _p);
+            vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_ans), 16));
+
+            ptr += 4;
+        }
+#endif // __ARM_NEON
+        for (; remain>0; remain--)
+        {
+            float v = bfloat16_to_float32(*ptr);
+            if (v < lower)
+                v = 0.f;
+            else if (v > upper)
+                ;
+            else
+                v = v * (v * alpha + beta);
+            *ptr = float32_to_bfloat16(v);
+            ++ptr;
+        }
+    }
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/arm/hardswish_arm.h b/src/layer/arm/hardswish_arm.h
@@ -25,6 +25,9 @@ class HardSwish_arm : virtual public HardSwish
     HardSwish_arm();
 
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+protected:
+    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn

diff --git a/tests/test_binaryop.cpp b/tests/test_binaryop.cpp
@@ -39,6 +39,7 @@ static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b, int op_type)
     ncnn::Option opt;
     opt.num_threads = 1;
     opt.use_vulkan_compute = true;
+    opt.use_int8_inference = false;
     opt.use_fp16_packed = false;
     opt.use_fp16_storage = false;
     opt.use_fp16_arithmetic = false;
@@ -78,6 +79,7 @@ static int test_binaryop(const ncnn::Mat& _a, float b, int op_type)
     ncnn::Option opt;
     opt.num_threads = 1;
     opt.use_vulkan_compute = true;
+    opt.use_int8_inference = false;
     opt.use_fp16_packed = false;
     opt.use_fp16_storage = false;
     opt.use_fp16_arithmetic = false;

diff --git a/toolchains/iossimxc-x64.toolchain.cmake b/toolchains/iossimxc-x64.toolchain.cmake
@@ -26,6 +26,10 @@ set(IOS_ARCH x86_64)
 
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS Simulator")
 
+# enable bitcode
+set(CMAKE_C_FLAGS "-fembed-bitcode ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "-fembed-bitcode ${CMAKE_CXX_FLAGS}")
+
 # Set the find root to the iOS developer roots and to user defined paths
 set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS Simulator find search path root")
 

diff --git a/toolchains/iossimxc.toolchain.cmake b/toolchains/iossimxc.toolchain.cmake
@@ -28,6 +28,10 @@ set(IOS_ARCH i386;x86_64)
 
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS Simulator")
 
+# enable bitcode
+set(CMAKE_C_FLAGS "-fembed-bitcode ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "-fembed-bitcode ${CMAKE_CXX_FLAGS}")
+
 # Set the find root to the iOS developer roots and to user defined paths
 set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS Simulator find search path root")
 

diff --git a/toolchains/iosxc-arm64.toolchain.cmake b/toolchains/iosxc-arm64.toolchain.cmake
@@ -26,6 +26,10 @@ set(IOS_ARCH arm64)
 
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")
 
+# enable bitcode
+set(CMAKE_C_FLAGS "-fembed-bitcode ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "-fembed-bitcode ${CMAKE_CXX_FLAGS}")
+
 # Set the find root to the iOS developer roots and to user defined paths
 set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")
 

diff --git a/toolchains/iosxc.toolchain.cmake b/toolchains/iosxc.toolchain.cmake
@@ -27,6 +27,10 @@ set(IOS_ARCH armv7;arm64)
 
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")
 
+# enable bitcode
+set(CMAKE_C_FLAGS "-fembed-bitcode ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "-fembed-bitcode ${CMAKE_CXX_FLAGS}")
+
 # Set the find root to the iOS developer roots and to user defined paths
 set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")