Skip to content

Commit

Permalink
Merge pull request #96 from Tencent/master
Browse files Browse the repository at this point in the history
20200326
  • Loading branch information
qaz734913414 authored Mar 26, 2020
2 parents 6e1547e + f3b39a3 commit 4d5df91
Show file tree
Hide file tree
Showing 9 changed files with 265 additions and 17 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/linux-mips32-mti.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@ jobs:
steps:
- uses: actions/checkout@v1
- name: mips-mti-compiler
continue-on-error: true
run: |
wget https://codescape.mips.com/components/toolchain/2019.09-01/Codescape.GNU.Tools.Package.2019.09-01.for.MIPS.MTI.Linux.CentOS-6.x86_64.tar.gz -O mips-mti-linux-gnu.tar.gz
tar -zxf mips-mti-linux-gnu.tar.gz
- name: configure
continue-on-error: true
run: |
export PATH=`pwd`/mips-mti-linux-gnu/2019.09-01/bin:$PATH
mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mips-mti-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF ..
- name: build
continue-on-error: true
run: cmake --build build -j 2
4 changes: 2 additions & 2 deletions src/layer/arm/binaryop_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1571,10 +1571,10 @@ int BinaryOp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vecto

Mat& top_blob = top_blobs[0];

#if __ARM_NEON
int elempack = bottom_blob.elempack;
int elempack1 = bottom_blob1.elempack;

#if __ARM_NEON
if (elempack == 4 || elempack1 == 4)
{
if (op_type == Operation_ADD)
Expand Down Expand Up @@ -1641,9 +1641,9 @@ int BinaryOp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vecto

int BinaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
#if __ARM_NEON
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (elempack == 4)
{
if (op_type == Operation_ADD)
Expand Down
86 changes: 81 additions & 5 deletions src/layer/arm/clip_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,22 @@ Clip_arm::Clip_arm()
#if __ARM_NEON
support_packing = true;
#endif // __ARM_NEON

support_bf16_storage = true;
}

int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
if (opt.use_bf16_storage)
return forward_inplace_bf16s(bottom_top_blob, opt);

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (opt.use_packing_layout)
{

if (elempack == 4)
{
#pragma omp parallel for num_threads(opt.num_threads)
Expand All @@ -64,8 +66,6 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const

return 0;
}

} // opt.use_packing_layout
#endif // __ARM_NEON

#pragma omp parallel for num_threads(opt.num_threads)
Expand Down Expand Up @@ -135,4 +135,80 @@ int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
return 0;
}

// Clamps every element of a bf16 blob into [min, max], in place.
// bf16 values are widened to fp32 (16-bit left shift of the bit pattern),
// clamped, then truncated back to bf16 (drop the low 16 mantissa bits).
int Clip_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;
    const int size = w * h;
    const int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        // packed layout: each element holds 4 bf16 lanes
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* p = bottom_top_blob.channel(q);

            float32x4_t _lower = vdupq_n_f32(min);
            float32x4_t _upper = vdupq_n_f32(max);

            for (int i = 0; i < size; i++)
            {
                float32x4_t _v = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(p), 16));
                _v = vminq_f32(vmaxq_f32(_v, _lower), _upper);
                vst1_u16(p, vshrn_n_u32(vreinterpretq_u32_f32(_v), 16));

                p += 4;
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        unsigned short* p = bottom_top_blob.channel(q);

        int left = size;

#if __ARM_NEON
        // vectorize groups of 4, leave the tail for the scalar loop below
        float32x4_t _lower = vdupq_n_f32(min);
        float32x4_t _upper = vdupq_n_f32(max);
        for (int n4 = size >> 2; n4 > 0; n4--)
        {
            float32x4_t _v = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(p), 16));
            _v = vminq_f32(vmaxq_f32(_v, _lower), _upper);
            vst1_u16(p, vshrn_n_u32(vreinterpretq_u32_f32(_v), 16));
            p += 4;
        }
        left = size & 3;
#endif // __ARM_NEON

        for (; left > 0; left--)
        {
            float v = bfloat16_to_float32(*p);
            v = v < min ? min : v;
            v = v > max ? max : v;
            *p = float32_to_bfloat16(v);
            p++;
        }
    }

    return 0;
}

} // namespace ncnn
3 changes: 3 additions & 0 deletions src/layer/arm/clip_arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ class Clip_arm : virtual public Clip
Clip_arm();

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn
Expand Down
87 changes: 82 additions & 5 deletions src/layer/arm/sigmoid_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,22 @@ Sigmoid_arm::Sigmoid_arm()
#if __ARM_NEON
support_packing = true;
#endif // __ARM_NEON

support_bf16_storage = true;
}

int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
if (opt.use_bf16_storage)
return forward_inplace_bf16s(bottom_top_blob, opt);

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (opt.use_packing_layout)
{

if (elempack == 4)
{
#pragma omp parallel for num_threads(opt.num_threads)
Expand All @@ -69,8 +71,6 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const

return 0;
}

} // opt.use_packing_layout
#endif // __ARM_NEON

#pragma omp parallel for num_threads(opt.num_threads)
Expand Down Expand Up @@ -112,4 +112,81 @@ int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
return 0;
}

// Applies y = 1 / (1 + exp(-x)) element-wise on a bf16 blob, in place.
// bf16 values are widened to fp32 (16-bit left shift of the bit pattern),
// transformed, then truncated back to bf16 (drop the low 16 mantissa bits).
int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        // packed layout: each element holds 4 bf16 lanes
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            unsigned short* ptr = bottom_top_blob.channel(q);

            float32x4_t _one = vdupq_n_f32(1.f);
            for (int i=0; i<size; i++)
            {
                float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
                _p = vnegq_f32(_p);
                _p = exp_ps(_p);
                _p = vaddq_f32(_p, _one);
                // reciprocal estimate + one Newton-Raphson refinement step;
                // one step suffices at bf16 output precision
                float32x4_t _outp = vrecpeq_f32(_p);
                _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
                // _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
                vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_outp), 16));

                ptr += 4;
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        unsigned short* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        float32x4_t _one = vdupq_n_f32(1.f);
        for (; nn>0; nn--)
        {
            float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
            _p = vnegq_f32(_p);
            _p = exp_ps(_p);
            _p = vaddq_f32(_p, _one);
            float32x4_t _outp = vrecpeq_f32(_p);
            _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
            // _outp = vmulq_f32(vrecpsq_f32(_p, _outp), _outp);
            vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_outp), 16));

            ptr += 4;
        }
#endif // __ARM_NEON
        for (; remain>0; remain--)
        {
            float v = bfloat16_to_float32(*ptr);
            // expf keeps the computation in single precision;
            // exp() would silently round-trip through double
            v = 1.f / (1.f + expf(-v));
            *ptr = float32_to_bfloat16(v);

            ptr++;
        }
    }

    return 0;
}

} // namespace ncnn
3 changes: 3 additions & 0 deletions src/layer/arm/sigmoid_arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ class Sigmoid_arm : virtual public Sigmoid
Sigmoid_arm();

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn
Expand Down
72 changes: 67 additions & 5 deletions src/layer/arm/tanh_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,22 @@ TanH_arm::TanH_arm()
#if __ARM_NEON
support_packing = true;
#endif // __ARM_NEON

support_bf16_storage = true;
}

int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
if (opt.use_bf16_storage)
return forward_inplace_bf16s(bottom_top_blob, opt);

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (opt.use_packing_layout)
{

if (elempack == 4)
{
#pragma omp parallel for num_threads(opt.num_threads)
Expand All @@ -62,8 +64,6 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const

return 0;
}

} // opt.use_packing_layout
#endif // __ARM_NEON

#pragma omp parallel for num_threads(opt.num_threads)
Expand Down Expand Up @@ -97,4 +97,66 @@ int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
return 0;
}

// Applies y = tanh(x) element-wise on a bf16 blob, in place.
// bf16 values are widened to fp32 (16-bit left shift of the bit pattern),
// transformed, then truncated back to bf16 (drop the low 16 mantissa bits).
int TanH_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        // packed layout: each element holds 4 bf16 lanes
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            unsigned short* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
                _p = tanh_ps(_p);
                vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_p), 16));
                ptr += 4;
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        unsigned short* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        for (; nn>0; nn--)
        {
            float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
            _p = tanh_ps(_p);
            vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_p), 16));
            ptr += 4;
        }
#endif // __ARM_NEON
        for (; remain>0; remain--)
        {
            float v = bfloat16_to_float32(*ptr);
            // tanhf keeps the computation in single precision;
            // tanh() would silently round-trip through double
            v = tanhf(v);
            *ptr = float32_to_bfloat16(v);
            ptr++;
        }
    }

    return 0;
}

} // namespace ncnn
Loading

0 comments on commit 4d5df91

Please sign in to comment.