diff --git a/.github/workflows/linux-mips32-mti.yml b/.github/workflows/linux-mips32-mti.yml index 9c44f97de43..c85198f7454 100644 --- a/.github/workflows/linux-mips32-mti.yml +++ b/.github/workflows/linux-mips32-mti.yml @@ -6,12 +6,15 @@ jobs: steps: - uses: actions/checkout@v1 - name: mips-mti-compiler + continue-on-error: true run: | wget https://codescape.mips.com/components/toolchain/2019.09-01/Codescape.GNU.Tools.Package.2019.09-01.for.MIPS.MTI.Linux.CentOS-6.x86_64.tar.gz -O mips-mti-linux-gnu.tar.gz tar -zxf mips-mti-linux-gnu.tar.gz - name: configure + continue-on-error: true run: | export PATH=`pwd`/mips-mti-linux-gnu/2019.09-01/bin:$PATH mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mips-mti-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF .. - name: build + continue-on-error: true run: cmake --build build -j 2 diff --git a/src/layer/arm/binaryop_arm.cpp b/src/layer/arm/binaryop_arm.cpp index 5ca8582e8db..287fce8d037 100644 --- a/src/layer/arm/binaryop_arm.cpp +++ b/src/layer/arm/binaryop_arm.cpp @@ -1571,10 +1571,10 @@ int BinaryOp_arm::forward_bf16s(const std::vector& bottom_blobs, std::vecto Mat& top_blob = top_blobs[0]; -#if __ARM_NEON int elempack = bottom_blob.elempack; int elempack1 = bottom_blob1.elempack; +#if __ARM_NEON if (elempack == 4 || elempack1 == 4) { if (op_type == Operation_ADD) @@ -1641,9 +1641,9 @@ int BinaryOp_arm::forward_bf16s(const std::vector& bottom_blobs, std::vecto int BinaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const { -#if __ARM_NEON int elempack = bottom_top_blob.elempack; +#if __ARM_NEON if (elempack == 4) { if (op_type == Operation_ADD) diff --git a/src/layer/arm/clip_arm.cpp b/src/layer/arm/clip_arm.cpp index beccfb81c4d..2dcf8014745 100644 --- a/src/layer/arm/clip_arm.cpp +++ b/src/layer/arm/clip_arm.cpp @@ -27,10 +27,15 @@ Clip_arm::Clip_arm() #if __ARM_NEON support_packing = true; #endif // __ARM_NEON + + support_bf16_storage = true; } int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { + if (opt.use_bf16_storage) + return forward_inplace_bf16s(bottom_top_blob, opt); + int w = bottom_top_blob.w; int h = bottom_top_blob.h; int channels = bottom_top_blob.c; @@ -38,9 +43,6 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int elempack = bottom_top_blob.elempack; #if __ARM_NEON - if (opt.use_packing_layout) - { - if (elempack == 4) { #pragma omp parallel for num_threads(opt.num_threads) @@ -64,8 +66,6 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } - - } // opt.use_packing_layout #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) @@ -135,4 +135,80 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } +int Clip_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + int elempack = bottom_top_blob.elempack; + +#if __ARM_NEON + if (elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q=0; q> 2; + int remain = size & 3; +#else + int remain = size; +#endif + +#if __ARM_NEON + float32x4_t _max = vdupq_n_f32(max); + float32x4_t _min = vdupq_n_f32(min); + for (; nn>0; nn--) + { + float32x4_t _ptr = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16)); + _ptr = vmaxq_f32(_ptr, _min); + _ptr = vminq_f32(_ptr, _max); + vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_ptr), 16)); + ptr += 4; + } +#endif // __ARM_NEON + + for (; remain>0; remain--) + { + float v = bfloat16_to_float32(*ptr); + if (v < min) + v = min; + + if (v > max) + v = max; + + *ptr = float32_to_bfloat16(v); + ptr++; + } + } + + return 0; +} + } // namespace ncnn diff --git a/src/layer/arm/clip_arm.h b/src/layer/arm/clip_arm.h index e066a748962..f556e73e5f6 100644 --- a/src/layer/arm/clip_arm.h +++ b/src/layer/arm/clip_arm.h @@ -25,6 +25,9 @@ class Clip_arm : virtual public Clip Clip_arm(); virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: + int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; }; } // namespace ncnn diff --git a/src/layer/arm/sigmoid_arm.cpp b/src/layer/arm/sigmoid_arm.cpp index 1f8858f416d..62cc3f647ed 100644 --- a/src/layer/arm/sigmoid_arm.cpp +++ b/src/layer/arm/sigmoid_arm.cpp @@ -30,10 +30,15 @@ Sigmoid_arm::Sigmoid_arm() #if __ARM_NEON support_packing = true; #endif // __ARM_NEON + + support_bf16_storage = true; } int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { + if (opt.use_bf16_storage) + return forward_inplace_bf16s(bottom_top_blob, opt); + int w = bottom_top_blob.w; int h = bottom_top_blob.h; int channels = bottom_top_blob.c; @@ -41,9 +46,6 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int elempack = bottom_top_blob.elempack; #if __ARM_NEON - if (opt.use_packing_layout) - { - if (elempack == 4) { #pragma omp parallel for num_threads(opt.num_threads) @@ -69,8 +71,6 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } - - } // opt.use_packing_layout #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) @@ -112,4 +112,81 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } +int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + int elempack = bottom_top_blob.elempack; + +#if __ARM_NEON + if (elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q=0; q> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON + float32x4_t _one = vdupq_n_f32(1.f); + for (; nn>0; nn--) + { + float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16)); + _p = vnegq_f32(_p); + _p = exp_ps(_p); + _p = vaddq_f32(_p, _one); + float32x4_t _outp = vrecpeq_f32(_p); + _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp); +// _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp); + vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_outp), 16)); + + ptr += 4; + } +#endif // __ARM_NEON + for (; remain>0; remain--) + { + float v = bfloat16_to_float32(*ptr); + v = 1.f / (1.f + exp(-v)); + *ptr = float32_to_bfloat16(v); + + ptr++; + } + } + + return 0; +} + } // namespace ncnn diff --git a/src/layer/arm/sigmoid_arm.h b/src/layer/arm/sigmoid_arm.h index 8e8adfdca05..92bb3a3daab 100644 --- a/src/layer/arm/sigmoid_arm.h +++ b/src/layer/arm/sigmoid_arm.h @@ -25,6 +25,9 @@ class Sigmoid_arm : virtual public Sigmoid Sigmoid_arm(); virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: + int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; }; } // namespace ncnn diff --git a/src/layer/arm/tanh_arm.cpp b/src/layer/arm/tanh_arm.cpp index 87a3bf66b34..f8f1310b800 100644 --- a/src/layer/arm/tanh_arm.cpp +++ b/src/layer/arm/tanh_arm.cpp @@ -30,10 +30,15 @@ TanH_arm::TanH_arm() #if __ARM_NEON support_packing = true; #endif // __ARM_NEON + + support_bf16_storage = true; } int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { + if (opt.use_bf16_storage) + return forward_inplace_bf16s(bottom_top_blob, opt); + int w = bottom_top_blob.w; int h = bottom_top_blob.h; int channels = bottom_top_blob.c; @@ -41,9 +46,6 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int elempack = bottom_top_blob.elempack; #if __ARM_NEON - if (opt.use_packing_layout) - { - if (elempack == 4) { #pragma omp parallel for num_threads(opt.num_threads) @@ -62,8 +64,6 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } - - } // opt.use_packing_layout #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) @@ -97,4 +97,66 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } +int TanH_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + int elempack = bottom_top_blob.elempack; + +#if __ARM_NEON + if (elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q=0; q> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON + for (; nn>0; nn--) + { + float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16)); + _p = tanh_ps(_p); + vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_p), 16)); + ptr += 4; + } +#endif // __ARM_NEON + for (; remain>0; remain--) + { + float v = bfloat16_to_float32(*ptr); + v = tanh(v); + *ptr = float32_to_bfloat16(v); + ptr++; + } + } + + return 0; +} + } // namespace ncnn diff --git a/src/layer/arm/tanh_arm.h b/src/layer/arm/tanh_arm.h index 1b8b291233f..5b0f8eae092 100644 --- a/src/layer/arm/tanh_arm.h +++ b/src/layer/arm/tanh_arm.h @@ -25,6 +25,9 @@ class TanH_arm : virtual public TanH TanH_arm(); virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: + int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; }; } // namespace ncnn diff --git a/tools/onnx/onnx2ncnn.cpp b/tools/onnx/onnx2ncnn.cpp index 2b4fc493bae..535d3d3ee01 100644 --- a/tools/onnx/onnx2ncnn.cpp +++ b/tools/onnx/onnx2ncnn.cpp @@ -404,6 +404,7 @@ static void fuse_hardswish(onnx::GraphProto* mutable_graph, std::mapmutable_node(i); // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Div(/6) + // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Constant - Div(/6) // out = x * F.relu6(x + 3, inplace=True) / 6 if (node->op_type() == "Add") { @@ -428,6 +429,14 @@ static void fuse_hardswish(onnx::GraphProto* mutable_graph, std::mapmutable_node(i+2); onnx::NodeProto* node4 = mutable_graph->mutable_node(i+3); + if (node4->op_type() == "Constant") + { + if (i+4 >= node_count) + continue; + + node4 = mutable_graph->mutable_node(i+4); + } + if (node2->op_type() != "Clip" || node3->op_type() != "Mul" || node4->op_type() != "Div") continue; @@ -564,6 +573,7 @@ static void fuse_hardsigmoid(onnx::GraphProto* mutable_graph, std::mapmutable_node(i); // HardSigmoid <= Add(+3) - Clip(0,6) - Div(/6) + // HardSigmoid <= Add(+3) - Clip(0,6) - Constant - Div(/6) // out = F.relu6(x + 3, inplace=True) / 6 if (node->op_type() == "Add") { @@ -587,6 +597,14 @@ static void fuse_hardsigmoid(onnx::GraphProto* mutable_graph, std::mapmutable_node(i+1); onnx::NodeProto* node3 = mutable_graph->mutable_node(i+2); + if (node3->op_type() == "Constant") + { + if (i+3 >= node_count) + continue; + + node3 = mutable_graph->mutable_node(i+3); + } + if (node2->op_type() != "Clip" || node3->op_type() != "Div") continue; @@ -1491,6 +1509,9 @@ int main(int argc, char** argv) // check weight before BinaryOp if (binaryop_weights.find(node.output(0)) != binaryop_weights.end()) { + if (std::find(reduced_binaryop_weights.begin(), reduced_binaryop_weights.end(), node.output(0)) != reduced_binaryop_weights.end()) + continue; + fprintf(pp, "%-16s", "MemoryData"); } else