Skip to content

Commit

Permalink
Make FloatToFloat16 conversion 75x faster using SVE2 instructions (#3626
Browse files Browse the repository at this point in the history
)

Summary:
Pull Request resolved: #3626

X-link: facebookresearch/FBGEMM#703

Rounding was previously (1) not vectorized and (2) [implemented in software](https://fburl.com/code/fa1jzpmo), so speeds were less than 1 byte per cycle. That's really slow.

With SVE2 instructions, it's 75x faster (see test plan for measurement). That's due to a combination of vectorization + hardware support for rounding.

Reviewed By: q10

Differential Revision: D68520774

fbshipit-source-id: 1e8113114a291acafc2446aa831bf51da6591c9c
  • Loading branch information
Elliot Gorokhovsky authored and facebook-github-bot committed Jan 29, 2025
1 parent 4859433 commit c5e1cde
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 3 deletions.
2 changes: 1 addition & 1 deletion bench/ConvertBenchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ void performance_test() {
normal_distribution<float> dist;
default_random_engine engine;

cout << setw(4) << "M" << " elements_per_sec_ref" << " elements_per_sec_simd"
cout << setw(4) << "M" << " elements_per_ns_ref" << " elements_per_ns_simd"
<< endl;

array<int, 8> dims{1, 10, 32, 40, 129, 256, 1024, 8000};
Expand Down
14 changes: 12 additions & 2 deletions defs.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -154,14 +154,24 @@ def get_fbgemm_inline_sve_srcs(msvc = False, buck = False):
"src/FbgemmFP16UKernelsSve128.cc",
"src/KleidiAIFP16UKernelsNeon.cc",
"src/UtilsSve.cc",
]
] + select({
"DEFAULT": [],
"ovr_config//cpu:arm64": [
"src/FbgemmFloat16ConvertSVE.cc",
],
})

#FP16 kernels contain inline assembly and inline assembly syntax for MSVC is different.
asm_srcs = [
"src/FbgemmFP16UKernelsSve128.cc",
"src/KleidiAIFP16UKernelsNeon.cc",
"src/UtilsSve.cc",
]
] + select({
"DEFAULT": [],
"ovr_config//cpu:arm64": [
"src/FbgemmFloat16ConvertSVE.cc",
],
})
if buck:
return select({
"DEFAULT": asm_srcs,
Expand Down
10 changes: 10 additions & 0 deletions include/fbgemm/FbgemmConvert.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,16 @@ FBGEMM_API void FloatToFloat16_avx512(
size_t size,
bool do_clip = false);

/**
* @brief SVE2 implementation to convert fp32 numbers to fp16 numbers.
*
*/
FBGEMM_API void FloatToFloat16_sve2(
const float* src,
float16* dst,
size_t size,
bool do_clip = false);

/**
* @brief AVX2 implementation to convert fp16 numbers to fp32 numbers.
*
Expand Down
4 changes: 4 additions & 0 deletions src/FbgemmFloat16Convert.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ void FloatToFloat16_simd(
FloatToFloat16_avx512(src, dst, size, do_clip);
} else if (fbgemmHasAvx2Support()) {
FloatToFloat16_avx2(src, dst, size, do_clip);
#ifdef __aarch64__
} else if (fbgemmHasArmSve2Support()) {
FloatToFloat16_sve2(src, dst, size, do_clip);
#endif
} else {
FloatToFloat16_ref(src, dst, size, do_clip);
return;
Expand Down
97 changes: 97 additions & 0 deletions src/FbgemmFloat16ConvertSVE.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#if defined(__ARM_FEATURE_SVE2)
#include <arm_sve.h>
#endif

#define FBGEMM_EXPORTS
#include "fbgemm/FbgemmConvert.h"

namespace fbgemm {

#if defined(__ARM_FEATURE_SVE2)

namespace {

// Load two vectors, convert them from fp32 to fp16, store one vector.
void FloatToFloat16KernelSve2_TwoVecs(const float* src, float16* dst) {
const svbool_t pt = svptrue_b16();
svfloat32x2_t srcVecs = svld2_f32(pt, src);
svfloat16_t even = svcvt_f16_f32_x(pt, svget2(srcVecs, 0));
svfloat16_t result = svcvtnt_f16_f32_x(even, pt, svget2(srcVecs, 1));
svst1_f16(pt, reinterpret_cast<float16_t*>(dst), result);
}

// Load and clip two vectors, convert them from fp32 to fp16, store one
// vector.
void FloatToFloat16KernelSve2_TwoVecs_WithClip(const float* src, float16* dst) {
const svbool_t pt = svptrue_b16();
constexpr float FP16_MAX = 65504.f;

// Load two vectors
const svfloat32x2_t srcVecs = svld2_f32(pt, src);
svfloat32_t src0 = svget2(srcVecs, 0);
svfloat32_t src1 = svget2(srcVecs, 1);

// Do the clipping
src0 = svmin_n_f32_x(pt, src0, FP16_MAX);
src0 = svmax_n_f32_x(pt, src0, -FP16_MAX);
src1 = svmin_n_f32_x(pt, src1, FP16_MAX);
src1 = svmax_n_f32_x(pt, src1, -FP16_MAX);

// Convert fp32 -> fp16
const svfloat16_t even = svcvt_f16_f32_x(pt, src0);
const svfloat16_t result = svcvtnt_f16_f32_x(even, pt, src1);

// Store one vector
svst1_f16(pt, reinterpret_cast<float16_t*>(dst), result);
}

} // namespace

void FloatToFloat16_sve2(
const float* src,
float16* dst,
size_t size,
bool do_clip) {
#pragma STDC FENV_ROUND FE_TONEAREST
const size_t chunkSize = svcntw() * 2;

// Note: we don't use predicates here, because then we can't use svld2. This
// is not optimal for small buffers, but we already have high overhead on
// small buffers because we have to set fp rounding mode, so I don't care.
if (do_clip) {
size_t i;
for (i = 0; i + chunkSize < size; i += chunkSize) {
FloatToFloat16KernelSve2_TwoVecs_WithClip(src + i, dst + i);
}
FloatToFloat16_ref(src + i, dst + i, size - i, do_clip);
} else {
size_t i;
for (i = 0; i + chunkSize < size; i += chunkSize) {
FloatToFloat16KernelSve2_TwoVecs(src + i, dst + i);
}
FloatToFloat16_ref(src + i, dst + i, size - i, do_clip);
}
}

#else

void FloatToFloat16_sve2(
const float* src,
float16* dst,
size_t size,
bool do_clip) {
throw std::runtime_error{
"CPU supports SVE2 instructions, but you didn't enable SVE2 in your build command. Fix your build!"};
}

#endif // defined(__ARM_FEATURE_SVE2)

} // namespace fbgemm

0 comments on commit c5e1cde

Please sign in to comment.