Skip to content

Commit

Permalink
Merge pull request #96 from Tencent/master
Browse files Browse the repository at this point in the history
20200326
  • Loading branch information
qaz734913414 authored Mar 26, 2020
2 parents 6e1547e + f3b39a3 commit 4d5df91
Show file tree
Hide file tree
Showing 9 changed files with 265 additions and 17 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/linux-mips32-mti.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@ jobs:
steps:
- uses: actions/checkout@v1
- name: mips-mti-compiler
continue-on-error: true
run: |
wget https://codescape.mips.com/components/toolchain/2019.09-01/Codescape.GNU.Tools.Package.2019.09-01.for.MIPS.MTI.Linux.CentOS-6.x86_64.tar.gz -O mips-mti-linux-gnu.tar.gz
tar -zxf mips-mti-linux-gnu.tar.gz
- name: configure
continue-on-error: true
run: |
export PATH=`pwd`/mips-mti-linux-gnu/2019.09-01/bin:$PATH
mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mips-mti-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF ..
- name: build
continue-on-error: true
run: cmake --build build -j 2
4 changes: 2 additions & 2 deletions src/layer/arm/binaryop_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1571,10 +1571,10 @@ int BinaryOp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vecto

Mat& top_blob = top_blobs[0];

#if __ARM_NEON
int elempack = bottom_blob.elempack;
int elempack1 = bottom_blob1.elempack;

#if __ARM_NEON
if (elempack == 4 || elempack1 == 4)
{
if (op_type == Operation_ADD)
Expand Down Expand Up @@ -1641,9 +1641,9 @@ int BinaryOp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vecto

int BinaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
#if __ARM_NEON
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (elempack == 4)
{
if (op_type == Operation_ADD)
Expand Down
86 changes: 81 additions & 5 deletions src/layer/arm/clip_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,22 @@ Clip_arm::Clip_arm()
#if __ARM_NEON
support_packing = true;
#endif // __ARM_NEON

support_bf16_storage = true;
}

int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
if (opt.use_bf16_storage)
return forward_inplace_bf16s(bottom_top_blob, opt);

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (opt.use_packing_layout)
{

if (elempack == 4)
{
#pragma omp parallel for num_threads(opt.num_threads)
Expand All @@ -64,8 +66,6 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const

return 0;
}

} // opt.use_packing_layout
#endif // __ARM_NEON

#pragma omp parallel for num_threads(opt.num_threads)
Expand Down Expand Up @@ -135,4 +135,80 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
return 0;
}

// Clamps every element of a bf16 blob into [min, max], in place.
// bf16 values are widened to fp32 (16-bit left shift of the bit pattern),
// clamped, then truncated back to bf16 (drop the low 16 mantissa bits).
int Clip_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;
    const int size = w * h;
    const int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        // packed layout: each element holds 4 bf16 lanes
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* p = bottom_top_blob.channel(q);

            float32x4_t _lower = vdupq_n_f32(min);
            float32x4_t _upper = vdupq_n_f32(max);

            for (int i = 0; i < size; i++)
            {
                float32x4_t _v = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(p), 16));
                _v = vminq_f32(vmaxq_f32(_v, _lower), _upper);
                vst1_u16(p, vshrn_n_u32(vreinterpretq_u32_f32(_v), 16));

                p += 4;
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        unsigned short* p = bottom_top_blob.channel(q);

        int left = size;

#if __ARM_NEON
        // vectorize groups of 4, leave the tail for the scalar loop below
        float32x4_t _lower = vdupq_n_f32(min);
        float32x4_t _upper = vdupq_n_f32(max);
        for (int n4 = size >> 2; n4 > 0; n4--)
        {
            float32x4_t _v = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(p), 16));
            _v = vminq_f32(vmaxq_f32(_v, _lower), _upper);
            vst1_u16(p, vshrn_n_u32(vreinterpretq_u32_f32(_v), 16));
            p += 4;
        }
        left = size & 3;
#endif // __ARM_NEON

        for (; left > 0; left--)
        {
            float v = bfloat16_to_float32(*p);
            v = v < min ? min : v;
            v = v > max ? max : v;
            *p = float32_to_bfloat16(v);
            p++;
        }
    }

    return 0;
}

} // namespace ncnn
3 changes: 3 additions & 0 deletions src/layer/arm/clip_arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ class Clip_arm : virtual public Clip
Clip_arm();

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn
Expand Down
87 changes: 82 additions & 5 deletions src/layer/arm/sigmoid_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,22 @@ Sigmoid_arm::Sigmoid_arm()
#if __ARM_NEON
support_packing = true;
#endif // __ARM_NEON

support_bf16_storage = true;
}

int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
if (opt.use_bf16_storage)
return forward_inplace_bf16s(bottom_top_blob, opt);

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (opt.use_packing_layout)
{

if (elempack == 4)
{
#pragma omp parallel for num_threads(opt.num_threads)
Expand All @@ -69,8 +71,6 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const

return 0;
}

} // opt.use_packing_layout
#endif // __ARM_NEON

#pragma omp parallel for num_threads(opt.num_threads)
Expand Down Expand Up @@ -112,4 +112,81 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
return 0;
}

// Applies y = 1 / (1 + exp(-x)) element-wise on a bf16 blob, in place.
// bf16 values are widened to fp32 (16-bit left shift of the bit pattern),
// transformed, then truncated back to bf16 (drop the low 16 mantissa bits).
int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        // packed layout: each element holds 4 bf16 lanes
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            unsigned short* ptr = bottom_top_blob.channel(q);

            float32x4_t _one = vdupq_n_f32(1.f);
            for (int i=0; i<size; i++)
            {
                float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
                _p = vnegq_f32(_p);
                _p = exp_ps(_p);
                _p = vaddq_f32(_p, _one);
                // reciprocal estimate + one Newton-Raphson refinement step;
                // one step suffices at bf16 output precision
                float32x4_t _outp = vrecpeq_f32(_p);
                _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
                // _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
                vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_outp), 16));

                ptr += 4;
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        unsigned short* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        float32x4_t _one = vdupq_n_f32(1.f);
        for (; nn>0; nn--)
        {
            float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
            _p = vnegq_f32(_p);
            _p = exp_ps(_p);
            _p = vaddq_f32(_p, _one);
            float32x4_t _outp = vrecpeq_f32(_p);
            _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
            // _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
            vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_outp), 16));

            ptr += 4;
        }
#endif // __ARM_NEON
        for (; remain>0; remain--)
        {
            float v = bfloat16_to_float32(*ptr);
            // expf keeps the computation in single precision;
            // exp() would silently round-trip through double
            v = 1.f / (1.f + expf(-v));
            *ptr = float32_to_bfloat16(v);

            ptr++;
        }
    }

    return 0;
}

} // namespace ncnn
3 changes: 3 additions & 0 deletions src/layer/arm/sigmoid_arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ class Sigmoid_arm : virtual public Sigmoid
Sigmoid_arm();

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn
Expand Down
72 changes: 67 additions & 5 deletions src/layer/arm/tanh_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,22 @@ TanH_arm::TanH_arm()
#if __ARM_NEON
support_packing = true;
#endif // __ARM_NEON

support_bf16_storage = true;
}

int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
if (opt.use_bf16_storage)
return forward_inplace_bf16s(bottom_top_blob, opt);

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (opt.use_packing_layout)
{

if (elempack == 4)
{
#pragma omp parallel for num_threads(opt.num_threads)
Expand All @@ -62,8 +64,6 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const

return 0;
}

} // opt.use_packing_layout
#endif // __ARM_NEON

#pragma omp parallel for num_threads(opt.num_threads)
Expand Down Expand Up @@ -97,4 +97,66 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
return 0;
}

// Applies y = tanh(x) element-wise on a bf16 blob, in place.
// bf16 values are widened to fp32 (16-bit left shift of the bit pattern),
// transformed, then truncated back to bf16 (drop the low 16 mantissa bits).
int TanH_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        // packed layout: each element holds 4 bf16 lanes
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            unsigned short* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
                _p = tanh_ps(_p);
                vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_p), 16));
                ptr += 4;
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        unsigned short* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        for (; nn>0; nn--)
        {
            float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
            _p = tanh_ps(_p);
            vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_p), 16));
            ptr += 4;
        }
#endif // __ARM_NEON
        for (; remain>0; remain--)
        {
            float v = bfloat16_to_float32(*ptr);
            // tanhf keeps the computation in single precision;
            // tanh() would silently round-trip through double
            v = tanhf(v);
            *ptr = float32_to_bfloat16(v);
            ptr++;
        }
    }

    return 0;
}

} // namespace ncnn
Loading

0 comments on commit 4d5df91

Please sign in to comment.