Correct build issue with gcc and neon version

ARM-software · Jan 22, 2025 · f6ba3eb · f6ba3eb
1 parent 662f0b4
commit f6ba3eb
Show file tree

Hide file tree

Showing 33 changed files with 155 additions and 140 deletions.
diff --git a/Include/dsp/matrix_utils.h b/Include/dsp/matrix_utils.h
@@ -51,7 +51,7 @@ extern "C"
                                         \
   for(_w=0;_w < nb; _w++)                  \
   {                                     \
-     *data *= CAST v;                   \
+     *data = CAST *data * CAST v;                   \
      data += _numCols;                   \
   }                                     \
 }
@@ -178,54 +178,63 @@ extern "C"
   }                                    \
 }
 
-#define SCALE_ROW_F16(A,COL,v,i)       \
-{                                      \
+#define SCALE_ROW_F16(A,COL,v,i)        \
+{                                       \
   int32_t _w;                           \
-  float16_t *data = (A)->pData;        \
+  float16_t *data = (A)->pData;         \
   const int32_t _numCols = (A)->numCols;\
   const int32_t nb = _numCols-(COL);    \
-                                       \
+                                        \
   data += i*_numCols + (COL);           \
-                                       \
-  for(_w=0;_w < nb; _w++)                 \
-  {                                    \
-     *data++ *= (_Float16)v;           \
-  }                                    \
+                                        \
+  _Float16 sum;                         \
+  for(_w=0;_w < nb; _w++)               \
+  {                                     \
+     sum = *data;                       \
+     sum *= (_Float16)v;                \
+     *data++ = sum;                     \
+  }                                     \
 }
 
 
-#define MAC_ROW_F16(COL,A,i,v,B,j)                \
-{                                                 \
-  int32_t _w;                                      \
-  float16_t *dataA = (A)->pData;                  \
-  float16_t *dataB = (B)->pData;                  \
-  const int32_t _numCols = (A)->numCols;           \
-  const int32_t nb = _numCols-(COL);               \
-                                                  \
-  dataA += i*_numCols + (COL);                     \
-  dataB += j*_numCols + (COL);                     \
-                                                  \
-  for(_w=0;_w < nb; _w++)                            \
-  {                                               \
-     *dataA++ += (_Float16)v * (_Float16)*dataB++;\
-  }                                               \
+#define MAC_ROW_F16(COL,A,i,v,B,j)           \
+{                                            \
+  int32_t _w;                                \
+  float16_t *dataA = (A)->pData;             \
+  float16_t *dataB = (B)->pData;             \
+  const int32_t _numCols = (A)->numCols;     \
+  const int32_t nb = _numCols-(COL);         \
+                                             \
+  dataA += i*_numCols + (COL);               \
+  dataB += j*_numCols + (COL);               \
+                                             \
+  _Float16 sum ;                             \
+  for(_w=0;_w < nb; _w++)                    \
+  {                                          \
+     sum = *dataA;                           \
+     sum += (_Float16)v * (_Float16)*dataB++;\
+     *dataA++ = sum;                         \
+  }                                          \
 }
 
-#define MAS_ROW_F16(COL,A,i,v,B,j)                \
-{                                                 \
-  int32_t _w;                                      \
-  float16_t *dataA = (A)->pData;                  \
-  float16_t *dataB = (B)->pData;                  \
-  const int32_t _numCols = (A)->numCols;           \
-  const int32_t nb = _numCols-(COL);               \
-                                                  \
-  dataA += i*_numCols + (COL);                     \
-  dataB += j*_numCols + (COL);                     \
-                                                  \
-  for(_w=0;_w < nb; _w++)                            \
-  {                                               \
-     *dataA++ -= (_Float16)v * (_Float16)*dataB++;\
-  }                                               \
+#define MAS_ROW_F16(COL,A,i,v,B,j)           \
+{                                            \
+  int32_t _w;                                \
+  float16_t *dataA = (A)->pData;             \
+  float16_t *dataB = (B)->pData;             \
+  const int32_t _numCols = (A)->numCols;     \
+  const int32_t nb = _numCols-(COL);         \
+                                             \
+  dataA += i*_numCols + (COL);               \
+  dataB += j*_numCols + (COL);               \
+                                             \
+  _Float16 sum ;                             \
+  for(_w=0;_w < nb; _w++)                    \
+  {                                          \
+     sum = *dataA;                           \
+     sum -= (_Float16)v * (_Float16)*dataB++;\
+     *dataA++ = sum;                         \
+  }                                          \
 }
 
 #endif /*defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)*/

diff --git a/Ne10/CMSIS_NE10_fft.neonintrinsic.h b/Ne10/CMSIS_NE10_fft.neonintrinsic.h
@@ -110,17 +110,17 @@
 
 #define VDUPQ_N_F32(VAR) { VAR, VAR, VAR, VAR }
 
-#define CONST_TW_81   0.70710678
-#define CONST_TW_81N -0.70710678
+#define CONST_TW_81   0.70710678f
+#define CONST_TW_81N -0.70710678f
 
-const static float32x4_t Q_TW_81    = VDUPQ_N_F32(CONST_TW_81 );
-const static float32x4_t Q_TW_81N   = VDUPQ_N_F32(CONST_TW_81N);
+static const float32x4_t Q_TW_81    = VDUPQ_N_F32(CONST_TW_81 );
+static const float32x4_t Q_TW_81N   = VDUPQ_N_F32(CONST_TW_81N);
 
 #define DIV_TW81   1.4142136f
 #define DIV_TW81N -1.4142136f
 
-const static float32x4_t DIV_TW81_NEON  = VDUPQ_N_F32(DIV_TW81);
-const static float32x4_t DIV_TW81N_NEON = VDUPQ_N_F32(DIV_TW81N);
+static const float32x4_t DIV_TW81_NEON  = VDUPQ_N_F32(DIV_TW81);
+static const float32x4_t DIV_TW81N_NEON = VDUPQ_N_F32(DIV_TW81N);
 
 #define NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_OUT,Q_IN) do {   \
         Q_OUT ## 0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 4);      \

diff --git a/Ne10/CMSIS_NE10_fft.neonintrinsic_f16.h b/Ne10/CMSIS_NE10_fft.neonintrinsic_f16.h
@@ -119,14 +119,14 @@
 #define CONST_TW_81   0.70710678f16
 #define CONST_TW_81N -0.70710678f16
 
-const static float16x4_t Q_TW_81    = VDUPQ_N_F16(CONST_TW_81 );
-const static float16x4_t Q_TW_81N   = VDUPQ_N_F16(CONST_TW_81N);
+static const float16x4_t Q_TW_81    = VDUPQ_N_F16(CONST_TW_81 );
+static const float16x4_t Q_TW_81N   = VDUPQ_N_F16(CONST_TW_81N);
 
 #define DIV_TW81   1.4142136f16
 #define DIV_TW81N -1.4142136f16
 
-const static float16x4_t DIV_TW81_NEON  = VDUPQ_N_F16(DIV_TW81);
-const static float16x4_t DIV_TW81N_NEON = VDUPQ_N_F16(DIV_TW81N);
+static const float16x4_t DIV_TW81_NEON  = VDUPQ_N_F16(DIV_TW81);
+static const float16x4_t DIV_TW81N_NEON = VDUPQ_N_F16(DIV_TW81N);
 
 #define NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_OUT,Q_IN) do {   \
         Q_OUT ## 0 = vadd_f16 (Q_IN ## 0, Q_IN ## 4);      \

diff --git a/Ne10/CMSIS_NE10_fft_common_variables.h b/Ne10/CMSIS_NE10_fft_common_variables.h
@@ -40,60 +40,60 @@
 ///////////////////////////
 
 /* Twiddles used in Radix-8 FFT */
-const static ne10_float32_t TW_81_F32  =  0.70710678; // sqrt (2) / 2
-const static ne10_float32_t TW_81N_F32 = -0.70710678; // - TW_81_F32
+static const ne10_float32_t TW_81_F32  =  0.70710678f; // sqrt (2) / 2
+static const ne10_float32_t TW_81N_F32 = -0.70710678f; // - TW_81_F32
 
 /* Twiddles used in Radix-5 FFT */
-const static ne10_fft_cpx_float32_t TW_5A_F32 =
+static const ne10_fft_cpx_float32_t TW_5A_F32 =
         {
-             0.309016994374947, //   cos (2 * pi / 5)
-            -0.951056516295154  // - sin (2 * pi / 5)
+             0.309016994374947f, //   cos (2 * pi / 5)
+            -0.951056516295154f  // - sin (2 * pi / 5)
         };
-const static ne10_fft_cpx_int32_t TW_5A_S32 =
+static const ne10_fft_cpx_int32_t TW_5A_S32 =
         {
               663608942, // round (TW_5A_F32.r * 2^31)
             -2042378317  // round (TW_5A_F32.i * 2^31)
         };
 
-const static ne10_fft_cpx_float32_t TW_5B_F32 =
+static const ne10_fft_cpx_float32_t TW_5B_F32 =
         {
-            -0.809016994374947, //   cos (4 * pi / 5)
-            -0.587785252292473  // - sin (4 * pi / 5)
+            -0.809016994374947f, //   cos (4 * pi / 5)
+            -0.587785252292473f  // - sin (4 * pi / 5)
         };
-const static ne10_fft_cpx_int32_t TW_5B_S32 =
+static const ne10_fft_cpx_int32_t TW_5B_S32 =
         {
             -1737350766, // round (TW_5B_F32.r * 2^31)
             -1262259218  // round (TW_5B_F32.i * 2^31)
         };
 
 /* Twiddles used in Radix-3 FFT */
-const static ne10_float32_t TW_3I_F32  =   0.866025403784439; // sqrt (3) / 2
-const static ne10_float32_t TW_3IN_F32 = - 0.866025403784439; // - TW_3IN_F32
-const static ne10_int32_t TW_3I_S32 = 1859775393; // round (TW_3I_F32 * 2^31)
-const static ne10_int32_t TW_3IN_S32 = -1859775393; // round (TW_3IN_F32 * 2^31)
+static const ne10_float32_t TW_3I_F32  =   0.866025403784439f; // sqrt (3) / 2
+static const ne10_float32_t TW_3IN_F32 = - 0.866025403784439f; // - TW_3I_F32
+static const ne10_int32_t TW_3I_S32 = 1859775393; // round (TW_3I_F32 * 2^31)
+static const ne10_int32_t TW_3IN_S32 = -1859775393; // round (TW_3IN_F32 * 2^31)
 
 #if defined(ARM_MATH_NEON_FLOAT16) && defined(ARM_FLOAT16_SUPPORTED)
 
 /* Twiddles used in Radix-8 FFT */
-const static ne10_float16_t TW_81_F16  =  0.70710678f16; // sqrt (2) / 2
-const static ne10_float16_t TW_81N_F16 = -0.70710678f16; // - TW_81_F32
+static const ne10_float16_t TW_81_F16  =  0.70710678f16; // sqrt (2) / 2
+static const ne10_float16_t TW_81N_F16 = -0.70710678f16; // - TW_81_F16
 
 /* Twiddles used in Radix-5 FFT */
-const static ne10_fft_cpx_float16_t TW_5A_F16 =
+static const ne10_fft_cpx_float16_t TW_5A_F16 =
         {
              0.309016994374947f16, //   cos (2 * pi / 5)
             -0.951056516295154f16  // - sin (2 * pi / 5)
         };
 
-const static ne10_fft_cpx_float16_t TW_5B_F16 =
+static const ne10_fft_cpx_float16_t TW_5B_F16 =
         {
             -0.809016994374947f16, //   cos (4 * pi / 5)
             -0.587785252292473f16  // - sin (4 * pi / 5)
         };
 
 /* Twiddles used in Radix-3 FFT */
-const static ne10_float16_t TW_3I_F16  =   0.866025403784439f16; // sqrt (3) / 2
-const static ne10_float16_t TW_3IN_F16 = - 0.866025403784439f16; // - TW_3IN_F32
+static const ne10_float16_t TW_3I_F16  =   0.866025403784439f16; // sqrt (3) / 2
+static const ne10_float16_t TW_3IN_F16 = - 0.866025403784439f16; // - TW_3I_F16
 #endif 
 
 #endif // NE10_FFT_COMMON_VARIBLES_H
diff --git a/Ne10/CMSIS_NE10_fft_generic_float16.neonintrisic.c b/Ne10/CMSIS_NE10_fft_generic_float16.neonintrisic.c
@@ -345,12 +345,12 @@ static inline void NE10_FFT8_FUC_NEON_F16 (CPLX out[8],
         const CPLX in[8])
 {
     CPLX s[8];
-    const static ne10_fft_cpx_float16_t TW_8[4] =
+    static const ne10_fft_cpx_float16_t TW_8[4] =
     {
-        {  1.00000,  0.00000 },
-        {  0.70711, -0.70711 },
-        {  0.00000, -1.00000 },
-        { -0.70711, -0.70711 },
+        {  1.00000f16,  0.00000f16 },
+        {  0.70711f16, -0.70711f16 },
+        {  0.00000f16, -1.00000f16 },
+        { -0.70711f16, -0.70711f16 },
     };
 
     // STAGE - 1
@@ -430,7 +430,7 @@ static void ne10_radix_2_butterfly_float16_neon_##ISFIRSTSTAGE##_##ISINVERSE##_#
     ne10_int32_t f_count;                                                       \
     ne10_int32_t m_count;                                                       \
                                                                                 \
-    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F16 (0.25 / nfft);          \
+    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F16 (0.25f16 / nfft);          \
                                                                                 \
     for (f_count = fstride; f_count > 0; f_count--)                             \
     {                                                                           \
@@ -509,7 +509,7 @@ static void ne10_radix_4_butterfly_float16_neon_##ISFIRSTSTAGE##_##ISINVERSE##_#
     ne10_int32_t f_count;                                                       \
     ne10_int32_t m_count;                                                       \
                                                                                 \
-    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F16 (0.25 / nfft);          \
+    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F16 (0.25f16 / nfft);          \
                                                                                 \
     for (f_count = fstride; f_count > 0; f_count--)                             \
     {                                                                           \
@@ -593,7 +593,7 @@ static void ne10_radix_3_butterfly_float16_neon_##ISFIRSTSTAGE##_##ISINVERSE##_#
     ne10_int32_t f_count;                                                       \
     ne10_int32_t m_count;                                                       \
                                                                                 \
-    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F16 (0.25 / nfft);          \
+    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F16 (0.25f16 / nfft);          \
     const float16x4_t TW_3IN_NEON_F16 = vdup_n_f16 (TW_3IN_F16);               \
     const float16x4_t HALF_NEON_F16 = vdup_n_f16 (0.5f16);                       \
                                                                                 \
@@ -693,7 +693,7 @@ static void ne10_radix_5_butterfly_float16_neon_##ISFIRSTSTAGE##_##ISINVERSE##_#
     ne10_int32_t f_count;                                                             \
     ne10_int32_t m_count;                                                             \
                                                                                       \
-    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F16 (0.25 / nfft);                \
+    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F16 (0.25f16 / nfft);                \
                                                                                       \
     for (f_count = fstride; f_count > 0; f_count--)                                   \
     {                                                                                 \
@@ -815,7 +815,7 @@ static void ne10_radix_8_butterfly_float16_neon_##ISFIRSTSTAGE##_##ISINVERSE##_#
     ne10_int32_t f_count;                                                       \
     ne10_int32_t m_count;                                                       \
                                                                                 \
-    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F16 (0.25 / nfft);          \
+    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F16 (0.25f16 / nfft);          \
                                                                                 \
     for (f_count = fstride; f_count > 0; f_count--)                             \
     {                                                                           \

diff --git a/Ne10/CMSIS_NE10_fft_generic_float32.neonintrisic.c b/Ne10/CMSIS_NE10_fft_generic_float32.neonintrisic.c
@@ -429,12 +429,12 @@ static inline void NE10_FFT8_FUC_NEON_F32 (CPLX out[8],
         const CPLX in[8])
 {
     CPLX s[8];
-    const static ne10_fft_cpx_float32_t TW_8[4] =
+    static const ne10_fft_cpx_float32_t TW_8[4] =
     {
-        {  1.00000,  0.00000 },
-        {  0.70711, -0.70711 },
-        {  0.00000, -1.00000 },
-        { -0.70711, -0.70711 },
+        {  1.00000f,  0.00000f },
+        {  0.70711f, -0.70711f },
+        {  0.00000f, -1.00000f },
+        { -0.70711f, -0.70711f },
     };
 
     // STAGE - 1
@@ -514,7 +514,7 @@ static void ne10_radix_2_butterfly_float32_neon_##ISFIRSTSTAGE##_##ISINVERSE##_#
     ne10_int32_t f_count;                                                       \
     ne10_int32_t m_count;                                                       \
                                                                                 \
-    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);          \
+    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25f / nfft);          \
                                                                                 \
     for (f_count = fstride; f_count > 0; f_count--)                             \
     {                                                                           \
@@ -593,7 +593,7 @@ static void ne10_radix_4_butterfly_float32_neon_##ISFIRSTSTAGE##_##ISINVERSE##_#
     ne10_int32_t f_count;                                                       \
     ne10_int32_t m_count;                                                       \
                                                                                 \
-    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);          \
+    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25f / nfft);          \
                                                                                 \
     for (f_count = fstride; f_count > 0; f_count--)                             \
     {                                                                           \
@@ -677,7 +677,7 @@ static void ne10_radix_3_butterfly_float32_neon_##ISFIRSTSTAGE##_##ISINVERSE##_#
     ne10_int32_t f_count;                                                       \
     ne10_int32_t m_count;                                                       \
                                                                                 \
-    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);          \
+    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25f / nfft);          \
     const float32x4_t TW_3IN_NEON_F32 = vdupq_n_f32 (TW_3IN_F32);               \
     const float32x4_t HALF_NEON_F32 = vdupq_n_f32 (0.5f);                       \
                                                                                 \
@@ -777,7 +777,7 @@ static void ne10_radix_5_butterfly_float32_neon_##ISFIRSTSTAGE##_##ISINVERSE##_#
     ne10_int32_t f_count;                                                             \
     ne10_int32_t m_count;                                                             \
                                                                                       \
-    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);                \
+    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25f / nfft);                \
                                                                                       \
     for (f_count = fstride; f_count > 0; f_count--)                                   \
     {                                                                                 \
@@ -899,7 +899,7 @@ static void ne10_radix_8_butterfly_float32_neon_##ISFIRSTSTAGE##_##ISINVERSE##_#
     ne10_int32_t f_count;                                                       \
     ne10_int32_t m_count;                                                       \
                                                                                 \
-    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25 / nfft);          \
+    const REAL one_by_fft_neon = NE10_REAL_DUP_NEON_F32 (0.25f / nfft);          \
                                                                                 \
     for (f_count = fstride; f_count > 0; f_count--)                             \
     {                                                                           \

diff --git a/Ne10/NE10_fft_float32.neonintrinsic.c b/Ne10/NE10_fft_float32.neonintrinsic.c
@@ -422,8 +422,8 @@ __STATIC_INLINE void ne10_radix8x4_neon (ne10_fft_cpx_float32_t *out,
     ne10_int32_t src_step = stride << 1; // ne10_fft_cpx_float32_t -> float32_t offset
     const float32_t *p_src = (const float32_t *) in;
     float32_t *p_dst = (float32_t *) out;
-    const ne10_float32_t TW_81 = 0.70710678;
-    const ne10_float32_t TW_81N = -0.70710678;
+    const ne10_float32_t TW_81 = 0.70710678f;
+    const ne10_float32_t TW_81N = -0.70710678f;
 
     CMPLX_VEC_F32 q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7;
     float32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i;
@@ -759,8 +759,8 @@ __STATIC_INLINE void ne10_radix8x4_inverse_neon (ne10_fft_cpx_float32_t *out,
     ne10_int32_t src_step = stride << 1;
     const float32_t *p_src = (const float32_t *) in;
     float32_t *p_dst = (float32_t *) out;
-    const ne10_float32_t TW_81 = 0.70710678;
-    const ne10_float32_t TW_81N = -0.70710678;
+    const ne10_float32_t TW_81 = 0.70710678f;
+    const ne10_float32_t TW_81N = -0.70710678f;
 
     CMPLX_VEC_F32 q2_in0, q2_in1, q2_in2, q2_in3, q2_in4, q2_in5, q2_in6, q2_in7;
     float32x4_t q_sin0_r, q_sin0_i, q_sin1_r, q_sin1_i, q_sin2_r, q_sin2_i, q_sin3_r, q_sin3_i;