Make FloatToFloat16 conversion 75x faster using SVE2 instructions (#3626

) Summary: Pull Request resolved: #3626 X-link: facebookresearch/FBGEMM#703 Rounding was previously (1) not vectorized and (2) [implemented in software](https://fburl.com/code/fa1jzpmo), so speeds were less than 1 byte per cycle. That's really slow. With SVE2 instructions, it's 75x faster (see test plan for measurement). That's due to a combination of vectorization + hardware support for rounding. Reviewed By: q10 Differential Revision: D68520774 fbshipit-source-id: 1e8113114a291acafc2446aa831bf51da6591c9c
pytorch · Jan 29, 2025 · c5e1cde · c5e1cde
1 parent 4859433
commit c5e1cde
Show file tree

Hide file tree

Showing 5 changed files with 124 additions and 3 deletions.
diff --git a/bench/ConvertBenchmark.cc b/bench/ConvertBenchmark.cc
@@ -28,7 +28,7 @@ void performance_test() {
   normal_distribution<float> dist;
   default_random_engine engine;
 
-  cout << setw(4) << "M" << " elements_per_sec_ref" << " elements_per_sec_simd"
+  cout << setw(4) << "M" << " elements_per_ns_ref" << " elements_per_ns_simd"
        << endl;
 
   array<int, 8> dims{1, 10, 32, 40, 129, 256, 1024, 8000};

diff --git a/defs.bzl b/defs.bzl
@@ -154,14 +154,24 @@ def get_fbgemm_inline_sve_srcs(msvc = False, buck = False):
         "src/FbgemmFP16UKernelsSve128.cc",
         "src/KleidiAIFP16UKernelsNeon.cc",
         "src/UtilsSve.cc",
-    ]
+    ] + select({
+        "DEFAULT": [],
+        "ovr_config//cpu:arm64": [
+            "src/FbgemmFloat16ConvertSVE.cc",
+        ],
+    })
 
     #FP16 kernels contain inline assembly and inline assembly syntax for MSVC is different.
     asm_srcs = [
         "src/FbgemmFP16UKernelsSve128.cc",
         "src/KleidiAIFP16UKernelsNeon.cc",
         "src/UtilsSve.cc",
-    ]
+    ] + select({
+        "DEFAULT": [],
+        "ovr_config//cpu:arm64": [
+            "src/FbgemmFloat16ConvertSVE.cc",
+        ],
+    })
     if buck:
         return select({
             "DEFAULT": asm_srcs,

diff --git a/include/fbgemm/FbgemmConvert.h b/include/fbgemm/FbgemmConvert.h
@@ -135,6 +135,16 @@ FBGEMM_API void FloatToFloat16_avx512(
     size_t size,
     bool do_clip = false);
 
+/**
+ * @brief SVE2 implementation to convert fp32 numbers to fp16 numbers.
+ *
+ */
+FBGEMM_API void FloatToFloat16_sve2(
+    const float* src,
+    float16* dst,
+    size_t size,
+    bool do_clip = false);
+
 /**
  * @brief AVX2 implementation to convert fp16 numbers to fp32 numbers.
  *

diff --git a/src/FbgemmFloat16Convert.cc b/src/FbgemmFloat16Convert.cc
@@ -43,6 +43,10 @@ void FloatToFloat16_simd(
       FloatToFloat16_avx512(src, dst, size, do_clip);
     } else if (fbgemmHasAvx2Support()) {
       FloatToFloat16_avx2(src, dst, size, do_clip);
+#ifdef __aarch64__
+    } else if (fbgemmHasArmSve2Support()) {
+      FloatToFloat16_sve2(src, dst, size, do_clip);
+#endif
     } else {
       FloatToFloat16_ref(src, dst, size, do_clip);
       return;

diff --git a/src/FbgemmFloat16ConvertSVE.cc b/src/FbgemmFloat16ConvertSVE.cc
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#if defined(__ARM_FEATURE_SVE2)
+#include <arm_sve.h>
+#endif
+
+#define FBGEMM_EXPORTS
+#include "fbgemm/FbgemmConvert.h"
+
+namespace fbgemm {
+
+#if defined(__ARM_FEATURE_SVE2)
+
+namespace {
+
+// Load two vectors, convert them from fp32 to fp16, store one vector.
+void FloatToFloat16KernelSve2_TwoVecs(const float* src, float16* dst) {
+  const svbool_t pt = svptrue_b16();
+  svfloat32x2_t srcVecs = svld2_f32(pt, src);
+  svfloat16_t even = svcvt_f16_f32_x(pt, svget2(srcVecs, 0));
+  svfloat16_t result = svcvtnt_f16_f32_x(even, pt, svget2(srcVecs, 1));
+  svst1_f16(pt, reinterpret_cast<float16_t*>(dst), result);
+}
+
+// Load and clip two vectors, convert them from fp32 to fp16, store one
+// vector.
+void FloatToFloat16KernelSve2_TwoVecs_WithClip(const float* src, float16* dst) {
+  const svbool_t pt = svptrue_b16();
+  constexpr float FP16_MAX = 65504.f;
+
+  // Load two vectors
+  const svfloat32x2_t srcVecs = svld2_f32(pt, src);
+  svfloat32_t src0 = svget2(srcVecs, 0);
+  svfloat32_t src1 = svget2(srcVecs, 1);
+
+  // Do the clipping
+  src0 = svmin_n_f32_x(pt, src0, FP16_MAX);
+  src0 = svmax_n_f32_x(pt, src0, -FP16_MAX);
+  src1 = svmin_n_f32_x(pt, src1, FP16_MAX);
+  src1 = svmax_n_f32_x(pt, src1, -FP16_MAX);
+
+  // Convert fp32 -> fp16
+  const svfloat16_t even = svcvt_f16_f32_x(pt, src0);
+  const svfloat16_t result = svcvtnt_f16_f32_x(even, pt, src1);
+
+  // Store one vector
+  svst1_f16(pt, reinterpret_cast<float16_t*>(dst), result);
+}
+
+} // namespace
+
+void FloatToFloat16_sve2(
+    const float* src,
+    float16* dst,
+    size_t size,
+    bool do_clip) {
+#pragma STDC FENV_ROUND FE_TONEAREST
+  const size_t chunkSize = svcntw() * 2;
+
+  // Note: we don't use predicates here, because then we can't use svld2. This
+  // is not optimal for small buffers, but we already have high overhead on
+  // small buffers because we have to set fp rounding mode, so I don't care.
+  if (do_clip) {
+    size_t i;
+    for (i = 0; i + chunkSize < size; i += chunkSize) {
+      FloatToFloat16KernelSve2_TwoVecs_WithClip(src + i, dst + i);
+    }
+    FloatToFloat16_ref(src + i, dst + i, size - i, do_clip);
+  } else {
+    size_t i;
+    for (i = 0; i + chunkSize < size; i += chunkSize) {
+      FloatToFloat16KernelSve2_TwoVecs(src + i, dst + i);
+    }
+    FloatToFloat16_ref(src + i, dst + i, size - i, do_clip);
+  }
+}
+
+#else
+
+void FloatToFloat16_sve2(
+    const float* src,
+    float16* dst,
+    size_t size,
+    bool do_clip) {
+  throw std::runtime_error{
+      "CPU supports SVE2 instructions, but you didn't enable SVE2 in your build command. Fix your build!"};
+}
+
+#endif // defined(__ARM_FEATURE_SVE2)
+
+} // namespace fbgemm