Skip to content

Commit

Permalink
Merge pull request #95 from Tencent/master
Browse files Browse the repository at this point in the history
20200325
  • Loading branch information
qaz734913414 authored Mar 25, 2020
2 parents 8c10ddb + 9ce0ad7 commit 6e1547e
Show file tree
Hide file tree
Showing 11 changed files with 1,294 additions and 70 deletions.
1,155 changes: 1,095 additions & 60 deletions src/layer/arm/binaryop_arm.cpp

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions src/layer/arm/binaryop_arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ class BinaryOp_arm : virtual public BinaryOp
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;

};

} // namespace ncnn
Expand Down
89 changes: 84 additions & 5 deletions src/layer/arm/hardsigmoid_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,22 @@ HardSigmoid_arm::HardSigmoid_arm()
#if __ARM_NEON
support_packing = true;
#endif // __ARM_NEON

support_bf16_storage = true;
}

int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
if (opt.use_bf16_storage)
return forward_inplace_bf16s(bottom_top_blob, opt);

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (opt.use_packing_layout)
{

if (elempack == 4)
{
#pragma omp parallel for num_threads(opt.num_threads)
Expand All @@ -65,8 +67,6 @@ int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) co

return 0;
}

} // opt.use_packing_layout
#endif // __ARM_NEON

#pragma omp parallel for num_threads(opt.num_threads)
Expand Down Expand Up @@ -111,4 +111,83 @@ int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
return 0;
}

int HardSigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (elempack == 4)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
unsigned short* ptr = bottom_top_blob.channel(q);

float32x4_t _zero = vdupq_n_f32(0.f);
float32x4_t _one = vdupq_n_f32(1.f);
for (int i=0; i<size; i++)
{
float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
float32x4_t _ans = vdupq_n_f32(beta);
_ans = vmlaq_n_f32(_ans, _p, alpha);
_ans = vmaxq_f32(_ans, _zero);
_ans = vminq_f32(_ans, _one);
vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_ans), 16));

ptr += 4;
}
}

return 0;
}
#endif // __ARM_NEON

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
unsigned short* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
float32x4_t _zero = vdupq_n_f32(0.f);
float32x4_t _one = vdupq_n_f32(1.f);
while (nn--)
{
float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
float32x4_t _ans = vdupq_n_f32(beta);
_ans = vmlaq_n_f32(_ans, _p, alpha);
_ans = vmaxq_f32(_ans, _zero);
_ans = vminq_f32(_ans, _one);
vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_ans), 16));

ptr += 4;
}
#endif // __ARM_NEON
for (; remain>0; remain--)
{
float v = bfloat16_to_float32(*ptr);
if (v < lower)
v = 0.f;
else if (v > upper)
v = 1.f;
else
v = v * alpha + beta;
*ptr = float32_to_bfloat16(v);
++ptr;
}
}

return 0;
}

} // namespace ncnn
3 changes: 3 additions & 0 deletions src/layer/arm/hardsigmoid_arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ class HardSigmoid_arm : virtual public HardSigmoid
HardSigmoid_arm();

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn
Expand Down
91 changes: 86 additions & 5 deletions src/layer/arm/hardswish_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,22 @@ HardSwish_arm::HardSwish_arm()
#if __ARM_NEON
support_packing = true;
#endif // __ARM_NEON

support_bf16_storage = true;
}

int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
if (opt.use_bf16_storage)
return forward_inplace_bf16s(bottom_top_blob, opt);

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (opt.use_packing_layout)
{

if (elempack == 4)
{
#pragma omp parallel for num_threads(opt.num_threads)
Expand All @@ -66,8 +68,6 @@ int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons

return 0;
}

} // opt.use_packing_layout
#endif // __ARM_NEON

#pragma omp parallel for num_threads(opt.num_threads)
Expand Down Expand Up @@ -113,4 +113,85 @@ int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons
return 0;
}

int HardSwish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
int size = w * h;
int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
if (elempack == 4)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
unsigned short* ptr = bottom_top_blob.channel(q);

float32x4_t _zero = vdupq_n_f32(0.f);
float32x4_t _one = vdupq_n_f32(1.f);
for (int i=0; i<size; i++)
{
float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
float32x4_t _ans = vdupq_n_f32(beta);
_ans = vmlaq_n_f32(_ans, _p, alpha);
_ans = vmaxq_f32(_ans, _zero);
_ans = vminq_f32(_ans, _one);
_ans = vmulq_f32(_ans, _p);
vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_ans), 16));

ptr += 4;
}
}

return 0;
}
#endif // __ARM_NEON

#pragma omp parallel for num_threads(opt.num_threads)
for (int q=0; q<channels; q++)
{
unsigned short* ptr = bottom_top_blob.channel(q);

#if __ARM_NEON
int nn = size >> 2;
int remain = size - (nn << 2);
#else
int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
float32x4_t _zero = vdupq_n_f32(0.f);
float32x4_t _one = vdupq_n_f32(1.f);
while (nn--)
{
float32x4_t _p = vreinterpretq_f32_u32(vshll_n_u16(vld1_u16(ptr), 16));
float32x4_t _ans = vdupq_n_f32(beta);
_ans = vmlaq_n_f32(_ans, _p, alpha);
_ans = vmaxq_f32(_ans, _zero);
_ans = vminq_f32(_ans, _one);
_ans = vmulq_f32(_ans, _p);
vst1_u16(ptr, vshrn_n_u32(vreinterpretq_u32_f32(_ans), 16));

ptr += 4;
}
#endif // __ARM_NEON
for (; remain>0; remain--)
{
float v = bfloat16_to_float32(*ptr);
if (v < lower)
v = 0.f;
else if (v > upper)
;
else
v = v * (v * alpha + beta);
*ptr = float32_to_bfloat16(v);
++ptr;
}
}

return 0;
}

} // namespace ncnn
3 changes: 3 additions & 0 deletions src/layer/arm/hardswish_arm.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ class HardSwish_arm : virtual public HardSwish
HardSwish_arm();

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn
Expand Down
2 changes: 2 additions & 0 deletions tests/test_binaryop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b, int op_type)
ncnn::Option opt;
opt.num_threads = 1;
opt.use_vulkan_compute = true;
opt.use_int8_inference = false;
opt.use_fp16_packed = false;
opt.use_fp16_storage = false;
opt.use_fp16_arithmetic = false;
Expand Down Expand Up @@ -78,6 +79,7 @@ static int test_binaryop(const ncnn::Mat& _a, float b, int op_type)
ncnn::Option opt;
opt.num_threads = 1;
opt.use_vulkan_compute = true;
opt.use_int8_inference = false;
opt.use_fp16_packed = false;
opt.use_fp16_storage = false;
opt.use_fp16_arithmetic = false;
Expand Down
4 changes: 4 additions & 0 deletions toolchains/iossimxc-x64.toolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ set(IOS_ARCH x86_64)

set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS Simulator")

# enable bitcode
set(CMAKE_C_FLAGS "-fembed-bitcode ${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "-fembed-bitcode ${CMAKE_CXX_FLAGS}")

# Set the find root to the iOS developer roots and to user defined paths
set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS Simulator find search path root")

Expand Down
4 changes: 4 additions & 0 deletions toolchains/iossimxc.toolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ set(IOS_ARCH i386;x86_64)

set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS Simulator")

# enable bitcode
set(CMAKE_C_FLAGS "-fembed-bitcode ${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "-fembed-bitcode ${CMAKE_CXX_FLAGS}")

# Set the find root to the iOS developer roots and to user defined paths
set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS Simulator find search path root")

Expand Down
4 changes: 4 additions & 0 deletions toolchains/iosxc-arm64.toolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ set(IOS_ARCH arm64)

set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")

# enable bitcode
set(CMAKE_C_FLAGS "-fembed-bitcode ${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "-fembed-bitcode ${CMAKE_CXX_FLAGS}")

# Set the find root to the iOS developer roots and to user defined paths
set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")

Expand Down
4 changes: 4 additions & 0 deletions toolchains/iosxc.toolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ set(IOS_ARCH armv7;arm64)

set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")

# enable bitcode
set(CMAKE_C_FLAGS "-fembed-bitcode ${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "-fembed-bitcode ${CMAKE_CXX_FLAGS}")

# Set the find root to the iOS developer roots and to user defined paths
set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")

Expand Down

0 comments on commit 6e1547e

Please sign in to comment.