diff --git a/.github/workflows/linux-mips32-mti.yml b/.github/workflows/linux-mips32-mti.yml
index 9c44f97de43..c85198f7454 100644
--- a/.github/workflows/linux-mips32-mti.yml
+++ b/.github/workflows/linux-mips32-mti.yml
@@ -6,12 +6,15 @@ jobs:
     steps:
     - uses: actions/checkout@v1
     - name: mips-mti-compiler
+      continue-on-error: true
       run: |
         wget https://codescape.mips.com/components/toolchain/2019.09-01/Codescape.GNU.Tools.Package.2019.09-01.for.MIPS.MTI.Linux.CentOS-6.x86_64.tar.gz -O mips-mti-linux-gnu.tar.gz
         tar -zxf mips-mti-linux-gnu.tar.gz
     - name: configure
+      continue-on-error: true
       run: |
         export PATH=`pwd`/mips-mti-linux-gnu/2019.09-01/bin:$PATH
         mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mips-mti-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF ..
     - name: build
+      continue-on-error: true
       run: cmake --build build -j 2
diff --git a/src/layer/arm/binaryop_arm.cpp b/src/layer/arm/binaryop_arm.cpp
index 5ca8582e8db..287fce8d037 100644
--- a/src/layer/arm/binaryop_arm.cpp
+++ b/src/layer/arm/binaryop_arm.cpp
@@ -1571,10 +1571,10 @@ int BinaryOp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vecto
 
     Mat& top_blob = top_blobs[0];
 
-#if __ARM_NEON
     int elempack = bottom_blob.elempack;
     int elempack1 = bottom_blob1.elempack;
 
+#if __ARM_NEON
     if (elempack == 4 || elempack1 == 4)
     {
         if (op_type == Operation_ADD)
@@ -1641,9 +1641,9 @@ int BinaryOp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vecto
 
 int BinaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
 {
-#if __ARM_NEON
     int elempack = bottom_top_blob.elempack;
 
+#if __ARM_NEON
     if (elempack == 4)
     {
         if (op_type == Operation_ADD)
diff --git a/src/layer/arm/clip_arm.cpp b/src/layer/arm/clip_arm.cpp
index beccfb81c4d..2dcf8014745 100644
--- a/src/layer/arm/clip_arm.cpp
+++ b/src/layer/arm/clip_arm.cpp
@@ -27,10 +27,15 @@ Clip_arm::Clip_arm()
 #if __ARM_NEON
     support_packing = true;
 #endif // __ARM_NEON
+
+    support_bf16_storage = true;
 }
 
 int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
+    if (opt.use_bf16_storage)
+        return forward_inplace_bf16s(bottom_top_blob, opt);
+
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
@@ -38,9 +43,6 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     int elempack = bottom_top_blob.elempack;
 
 #if __ARM_NEON
-    if (opt.use_packing_layout)
-    {
-
     if (elempack == 4)
     {
         #pragma omp parallel for num_threads(opt.num_threads)
@@ -64,8 +66,6 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 
         return 0;
     }
-
-    } // opt.use_packing_layout
 #endif // __ARM_NEON
 
     #pragma omp parallel for num_threads(opt.num_threads)
@@ -135,4 +135,80 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     return 0;
 }
 
+int Clip_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h;
+    int elempack = bottom_top_blob.elempack;
+
+#if __ARM_NEON
+    if (elempack == 4)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q=0; q<channels; q++)
+        {
+            unsigned short* ptr = bottom_top_blob.channel(q);
+
+            float32x4_t _max = vdupq_n_f32(max);
+            float32x4_t _min = vdupq_n_f32(min);
+
+            for (int i=0; i<size; i++)
+            {
+                float32x4_t _ptr = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
+                _ptr = vmaxq_f32(_ptr, _min);
+                _ptr = vminq_f32(_ptr, _max);
+                vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_ptr), 16));
+
+                ptr += 4;
+            }
+        }
+
+        return 0;
+    }
+#endif // __ARM_NEON
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q=0; q<channels; q++)
+    {
+        unsigned short* ptr = bottom_top_blob.channel(q);
+
+#if __ARM_NEON
+        int nn = size >> 2;
+        int remain = size & 3;
+#else
+        int remain = size;
+#endif
+
+#if __ARM_NEON
+        float32x4_t _max = vdupq_n_f32(max);
+        float32x4_t _min = vdupq_n_f32(min);
+        for (; nn>0; nn--)
+        {
+            float32x4_t _ptr = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
+            _ptr = vmaxq_f32(_ptr, _min);
+            _ptr = vminq_f32(_ptr, _max);
+            vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_ptr), 16));
+            ptr += 4;
+        }
+#endif // __ARM_NEON
+
+        for (; remain>0; remain--)
+        {
+            float v = bfloat16_to_float32(*ptr);
+            if (v < min)
+                v = min;
+
+            if (v > max)
+                v = max;
+
+            *ptr = float32_to_bfloat16(v);
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/arm/clip_arm.h b/src/layer/arm/clip_arm.h
index e066a748962..f556e73e5f6 100644
--- a/src/layer/arm/clip_arm.h
+++ b/src/layer/arm/clip_arm.h
@@ -25,6 +25,9 @@ class Clip_arm : virtual public Clip
     Clip_arm();
 
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+protected:
+    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/sigmoid_arm.cpp b/src/layer/arm/sigmoid_arm.cpp
index 1f8858f416d..62cc3f647ed 100644
--- a/src/layer/arm/sigmoid_arm.cpp
+++ b/src/layer/arm/sigmoid_arm.cpp
@@ -30,10 +30,15 @@ Sigmoid_arm::Sigmoid_arm()
 #if __ARM_NEON
     support_packing = true;
 #endif // __ARM_NEON
+
+    support_bf16_storage = true;
 }
 
 int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
+    if (opt.use_bf16_storage)
+        return forward_inplace_bf16s(bottom_top_blob, opt);
+
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
@@ -41,9 +46,6 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     int elempack = bottom_top_blob.elempack;
 
 #if __ARM_NEON
-    if (opt.use_packing_layout)
-    {
-
     if (elempack == 4)
     {
         #pragma omp parallel for num_threads(opt.num_threads)
@@ -69,8 +71,6 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 
         return 0;
     }
-
-    } // opt.use_packing_layout
 #endif // __ARM_NEON
 
     #pragma omp parallel for num_threads(opt.num_threads)
@@ -112,4 +112,81 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     return 0;
 }
 
+int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h;
+    int elempack = bottom_top_blob.elempack;
+
+#if __ARM_NEON
+    if (elempack == 4)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q=0; q<channels; q++)
+        {
+            unsigned short* ptr = bottom_top_blob.channel(q);
+
+            float32x4_t _one = vdupq_n_f32(1.f);
+            for (int i=0; i<size; i++)
+            {
+                float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
+                _p = vnegq_f32(_p);
+                _p = exp_ps(_p);
+                _p = vaddq_f32(_p, _one);
+                float32x4_t _outp = vrecpeq_f32(_p);
+                _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
+//                 _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
+                vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_outp), 16));
+
+                ptr += 4;
+            }
+        }
+
+        return 0;
+    }
+#endif // __ARM_NEON
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q=0; q<channels; q++)
+    {
+        unsigned short* ptr = bottom_top_blob.channel(q);
+
+#if __ARM_NEON
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+        float32x4_t _one = vdupq_n_f32(1.f);
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
+            _p = vnegq_f32(_p);
+            _p = exp_ps(_p);
+            _p = vaddq_f32(_p, _one);
+            float32x4_t _outp = vrecpeq_f32(_p);
+            _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
+//             _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
+            vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_outp), 16));
+
+            ptr += 4;
+        }
+#endif // __ARM_NEON
+        for (; remain>0; remain--)
+        {
+            float v = bfloat16_to_float32(*ptr);
+            v = 1.f / (1.f + exp(-v));
+            *ptr = float32_to_bfloat16(v);
+
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/arm/sigmoid_arm.h b/src/layer/arm/sigmoid_arm.h
index 8e8adfdca05..92bb3a3daab 100644
--- a/src/layer/arm/sigmoid_arm.h
+++ b/src/layer/arm/sigmoid_arm.h
@@ -25,6 +25,9 @@ class Sigmoid_arm : virtual public Sigmoid
     Sigmoid_arm();
 
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+protected:
+    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/tanh_arm.cpp b/src/layer/arm/tanh_arm.cpp
index 87a3bf66b34..f8f1310b800 100644
--- a/src/layer/arm/tanh_arm.cpp
+++ b/src/layer/arm/tanh_arm.cpp
@@ -30,10 +30,15 @@ TanH_arm::TanH_arm()
 #if __ARM_NEON
     support_packing = true;
 #endif // __ARM_NEON
+
+    support_bf16_storage = true;
 }
 
 int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
+    if (opt.use_bf16_storage)
+        return forward_inplace_bf16s(bottom_top_blob, opt);
+
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
@@ -41,9 +46,6 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     int elempack = bottom_top_blob.elempack;
 
 #if __ARM_NEON
-    if (opt.use_packing_layout)
-    {
-
     if (elempack == 4)
     {
         #pragma omp parallel for num_threads(opt.num_threads)
@@ -62,8 +64,6 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 
         return 0;
     }
-
-    } // opt.use_packing_layout
 #endif // __ARM_NEON
 
     #pragma omp parallel for num_threads(opt.num_threads)
@@ -97,4 +97,66 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     return 0;
 }
 
+int TanH_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h;
+    int elempack = bottom_top_blob.elempack;
+
+#if __ARM_NEON
+    if (elempack == 4)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q=0; q<channels; q++)
+        {
+            unsigned short* ptr = bottom_top_blob.channel(q);
+
+            for (int i=0; i<size; i++)
+            {
+                float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
+                _p = tanh_ps(_p);
+                vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_p), 16));
+                ptr += 4;
+            }
+        }
+
+        return 0;
+    }
+#endif // __ARM_NEON
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q=0; q<channels; q++)
+    {
+        unsigned short* ptr = bottom_top_blob.channel(q);
+
+#if __ARM_NEON
+        int nn = size >> 2;
+        int remain = size - (nn << 2);
+#else
+        int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+        for (; nn>0; nn--)
+        {
+            float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
+            _p = tanh_ps(_p);
+            vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_p), 16));
+            ptr += 4;
+        }
+#endif // __ARM_NEON
+        for (; remain>0; remain--)
+        {
+            float v = bfloat16_to_float32(*ptr);
+            v = tanh(v);
+            *ptr = float32_to_bfloat16(v);
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/arm/tanh_arm.h b/src/layer/arm/tanh_arm.h
index 1b8b291233f..5b0f8eae092 100644
--- a/src/layer/arm/tanh_arm.h
+++ b/src/layer/arm/tanh_arm.h
@@ -25,6 +25,9 @@ class TanH_arm : virtual public TanH
     TanH_arm();
 
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+
+protected:
+    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/tools/onnx/onnx2ncnn.cpp b/tools/onnx/onnx2ncnn.cpp
index 2b4fc493bae..535d3d3ee01 100644
--- a/tools/onnx/onnx2ncnn.cpp
+++ b/tools/onnx/onnx2ncnn.cpp
@@ -404,6 +404,7 @@ static void fuse_hardswish(onnx::GraphProto* mutable_graph, std::map<std::string
         onnx::NodeProto* node = mutable_graph->mutable_node(i);
 
         // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Div(/6)
+        // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Constant - Div(/6)
         //     out = x * F.relu6(x + 3, inplace=True) / 6
         if (node->op_type() == "Add")
         {
@@ -428,6 +429,14 @@ static void fuse_hardswish(onnx::GraphProto* mutable_graph, std::map<std::string
             onnx::NodeProto* node3 = mutable_graph->mutable_node(i+2);
             onnx::NodeProto* node4 = mutable_graph->mutable_node(i+3);
 
+            if (node4->op_type() == "Constant")
+            {
+                if (i+4 >= node_count)
+                    continue;
+
+                node4 = mutable_graph->mutable_node(i+4);
+            }
+
             if (node2->op_type() != "Clip" || node3->op_type() != "Mul" || node4->op_type() != "Div")
                 continue;
 
@@ -564,6 +573,7 @@ static void fuse_hardsigmoid(onnx::GraphProto* mutable_graph, std::map<std::stri
         onnx::NodeProto* node = mutable_graph->mutable_node(i);
 
         // HardSigmoid <= Add(+3) - Clip(0,6) - Div(/6)
+        // HardSigmoid <= Add(+3) - Clip(0,6) - Constant - Div(/6)
         //     out = F.relu6(x + 3, inplace=True) / 6
         if (node->op_type() == "Add")
         {
@@ -587,6 +597,14 @@ static void fuse_hardsigmoid(onnx::GraphProto* mutable_graph, std::map<std::stri
             onnx::NodeProto* node2 = mutable_graph->mutable_node(i+1);
             onnx::NodeProto* node3 = mutable_graph->mutable_node(i+2);
 
+            if (node3->op_type() == "Constant")
+            {
+                if (i+3 >= node_count)
+                    continue;
+
+                node3 = mutable_graph->mutable_node(i+3);
+            }
+
             if (node2->op_type() != "Clip" || node3->op_type() != "Div")
                 continue;
 
@@ -1491,6 +1509,9 @@ int main(int argc, char** argv)
             // check weight before BinaryOp
             if (binaryop_weights.find(node.output(0)) != binaryop_weights.end())
             {
+                if (std::find(reduced_binaryop_weights.begin(), reduced_binaryop_weights.end(), node.output(0)) != reduced_binaryop_weights.end())
+                    continue;
+
                 fprintf(pp, "%-16s", "MemoryData");
             }
             else