Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable some NEON optimizations on ARM32 that we only had on ARM64 before #18450

Merged
merged 2 commits into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions Core/MIPS/IR/IRInterpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_load_ps(&mips->f[inst->src1]));
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vld1q_f32(&mips->f[inst->src1]));
#else
memcpy(&mips->f[inst->dest], &mips->f[inst->src1], 4 * sizeof(float));
Expand All @@ -347,7 +347,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_add_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vaddq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
#else
for (int i = 0; i < 4; i++)
Expand All @@ -360,7 +360,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_sub_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vsubq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
#else
for (int i = 0; i < 4; i++)
Expand All @@ -373,7 +373,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vmulq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
#else
for (int i = 0; i < 4; i++)
Expand Down Expand Up @@ -408,7 +408,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_xor_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)signBits)));
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vnegq_f32(vld1q_f32(&mips->f[inst->src1])));
#else
for (int i = 0; i < 4; i++)
Expand All @@ -421,7 +421,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_and_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps((const float *)noSignMask)));
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vabsq_f32(vld1q_f32(&mips->f[inst->src1])));
#else
for (int i = 0; i < 4; i++)
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1944,7 +1944,7 @@ bool GPUCommon::DescribeCodePtr(const u8 *ptr, std::string &name) {
}

void GPUCommon::UpdateUVScaleOffset() {
#ifdef _M_SSE
#if defined(_M_SSE)
__m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *)&gstate.texscaleu), 8);
_mm_storeu_si128((__m128i *)&gstate_c.uv, values);
#elif PPSSPP_ARCH(ARM_NEON)
Expand Down
90 changes: 69 additions & 21 deletions GPU/Math3D.h
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ class Vec3
#if defined(_M_SSE)
__m128i ivec;
__m128 vec;
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
int32x4_t ivec;
float32x4_t vec;
#endif
Expand All @@ -238,7 +238,7 @@ class Vec3
Vec3(const Vec3Packed<T> &_xyz) {
vec = _mm_loadu_ps(_xyz.AsArray());
}
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
Vec3(const float32x4_t &_vec) : vec(_vec) {}
#if !defined(_MSC_VER)
Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
Expand Down Expand Up @@ -578,7 +578,7 @@ class Vec4
#if defined(_M_SSE)
__m128i ivec;
__m128 vec;
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
int32x4_t ivec;
float32x4_t vec;
#endif
Expand All @@ -595,7 +595,7 @@ class Vec4
#if defined(_M_SSE)
Vec4(const __m128 &_vec) : vec(_vec) {}
Vec4(const __m128i &_ivec) : ivec(_ivec) {}
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
Vec4(const float32x4_t &_vec) : vec(_vec) {}
#if !defined(_MSC_VER)
Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
Expand All @@ -607,14 +607,14 @@ class Vec4
if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
#if defined(_M_SSE)
return _mm_cvtps_epi32(SAFE_M128(vec));
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
return vcvtq_s32_f32(vec);
#endif
}
if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
#if defined(_M_SSE)
return _mm_cvtepi32_ps(SAFE_M128I(ivec));
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
return vcvtq_f32_s32(ivec);
#endif
}
Expand Down Expand Up @@ -922,7 +922,7 @@ inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, c
_mm_add_ps(_mm_mul_ps(col2, z), col3));
return sum;
}
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(ARM64_NEON)
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
float32x4_t col0 = vld1q_f32(m);
float32x4_t col1 = vld1q_f32(m + 3);
Expand All @@ -933,6 +933,17 @@ inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
return sum;
}
#elif PPSSPP_ARCH(ARM_NEON)
// ARM32 NEON path (the ARM64_NEON branch above takes priority on ARM64):
// transforms (x, y, z, 1) by a 4x3 matrix stored as 12 consecutive floats.
// Lanes 0..2 of the result hold the transformed vector; lane 3 is not
// meaningful and callers ignore it.
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	const float32x2_t xy = vget_low_f32(vec);
	const float32x2_t zw = vget_high_f32(vec);
	// Overlapping quad loads: only the first three lanes of each row matter.
	// NOTE(review): the m + 9 load touches m[12], one float past a 12-float
	// matrix — same as the existing ARM64 path; presumably callers' storage
	// makes this safe. Confirm if the matrix can sit at the end of a page.
	const float32x4_t xpart = vmulq_lane_f32(vld1q_f32(m), xy, 0);
	const float32x4_t ypart = vmulq_lane_f32(vld1q_f32(m + 3), xy, 1);
	const float32x4_t zpart = vmulq_lane_f32(vld1q_f32(m + 6), zw, 0);
	const float32x4_t trans = vld1q_f32(m + 9);
	// Same association order as the original add tree, so results stay
	// bit-identical: ((x*c0 + y*c1) + (z*c2 + c3)).
	return vaddq_f32(vaddq_f32(xpart, ypart), vaddq_f32(zpart, trans));
}
#endif

// v and vecOut must point to different memory.
Expand All @@ -947,7 +958,7 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
vecOut[0] = _mm_cvtss_f32(sum);
vecOut[1] = vectorGetByIndex<1>(sum);
vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
float vecIn[4] = {v[0], v[1], v[2], 1.0f};
float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(vecIn), m);
vecOut[0] = vgetq_lane_f32(sum, 0);
Expand All @@ -967,7 +978,7 @@ inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
return Vec3ByMatrix43Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
return Vec3ByMatrix43Internal(v.vec, m);
#else
Vec3f vecOut;
Expand Down Expand Up @@ -999,6 +1010,17 @@ inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
return sum;
}
#elif PPSSPP_ARCH(ARM_NEON)
// ARM32 NEON path: transforms (x, y, z, 1) by a full 4x4 matrix (rows of four
// floats at m, m+4, m+8, m+12). Lane 3 of the input is ignored — the m+12 row
// is added unconditionally, i.e. w is treated as 1. All four result lanes are
// valid.
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
	const float32x2_t xy = vget_low_f32(vec);
	const float32x2_t zw = vget_high_f32(vec);
	const float32x4_t xpart = vmulq_lane_f32(vld1q_f32(m), xy, 0);
	const float32x4_t ypart = vmulq_lane_f32(vld1q_f32(m + 4), xy, 1);
	const float32x4_t zpart = vmulq_lane_f32(vld1q_f32(m + 8), zw, 0);
	const float32x4_t trans = vld1q_f32(m + 12);
	// Identical add tree to the original formulation — bit-exact results:
	// ((x*c0 + y*c1) + (z*c2 + c3)).
	return vaddq_f32(vaddq_f32(xpart, ypart), vaddq_f32(zpart, trans));
}
#endif

inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {
Expand All @@ -1008,7 +1030,7 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
__m128 z = _mm_set1_ps(v[2]);
__m128 sum = Vec3ByMatrix44Internal(x, y, z, m);
_mm_storeu_ps(vecOut, sum);
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
float vecIn[4] = {v[0], v[1], v[2], 1.0f};
float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(vecIn), m);
vst1q_f32(vecOut, sum);
Expand All @@ -1027,7 +1049,7 @@ inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
return Vec3ByMatrix44Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
return Vec3ByMatrix44Internal(v.vec, m);
#else
Vec4f vecOut;
Expand Down Expand Up @@ -1057,6 +1079,16 @@ inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
vmulq_laneq_f32(col2, vec, 2));
return sum;
}
#elif PPSSPP_ARCH(ARM_NEON)
// ARM32 NEON path: rotates a normal by the 3x3 rotation part of a 4x3 matrix
// (no translation row added). Lanes 0..2 of the result hold the transformed
// normal; lane 3 is not meaningful.
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	float32x4_t col0 = vld1q_f32(m);
	float32x4_t col1 = vld1q_f32(m + 3);
	float32x4_t col2 = vld1q_f32(m + 6);
	// FIX: vget_high_f32() yields a 2-lane float32x2_t, so the z component of
	// vec is its lane 0 (overall lane 2) — matching the sibling
	// Vec3ByMatrix43Internal/Vec3ByMatrix44Internal above. The previous lane
	// index 2 is out of range for vmulq_lane_f32 on a 64-bit vector and does
	// not compile on ARM32.
	float32x4_t sum = vaddq_f32(
		vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1)),
		vmulq_lane_f32(col2, vget_high_f32(vec), 0));
	return sum;
}
#endif

inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
Expand All @@ -1068,7 +1100,7 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]
vecOut[0] = _mm_cvtss_f32(sum);
vecOut[1] = vectorGetByIndex<1>(sum);
vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(v), m);
vecOut[0] = vgetq_lane_f32(sum, 0);
vecOut[1] = vgetq_lane_f32(sum, 1);
Expand All @@ -1087,7 +1119,7 @@ inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
return Norm3ByMatrix43Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
return Norm3ByMatrix43Internal(v.vec, m);
#else
Vec3f vecOut;
Expand Down Expand Up @@ -1120,6 +1152,13 @@ inline void ConvertMatrix4x3To4x4(float *m4x4, const float *m4x3) {
}

inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
// vld3q is a perfect match here!
float32x4x3_t packed = vld3q_f32(m4x3);
vst1q_f32(m4x4, packed.val[0]);
vst1q_f32(m4x4 + 4, packed.val[1]);
vst1q_f32(m4x4 + 8, packed.val[2]);
#else
m4x4[0] = m4x3[0];
m4x4[1] = m4x3[3];
m4x4[2] = m4x3[6];
Expand All @@ -1132,6 +1171,7 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
m4x4[9] = m4x3[5];
m4x4[10] = m4x3[8];
m4x4[11] = m4x3[11];
#endif
m4x4[12] = 0.0f;
m4x4[13] = 0.0f;
m4x4[14] = 0.0f;
Expand All @@ -1147,6 +1187,13 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
// 89AB
// On NEON, vld3q_f32's de-interleaving load performs this transpose directly; the scalar fallback below should be pretty fast anyway.
inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
// vld3q is a perfect match here!
float32x4x3_t packed = vld3q_f32(m4x3);
vst1q_f32(m4x4, packed.val[0]);
vst1q_f32(m4x4 + 4, packed.val[1]);
vst1q_f32(m4x4 + 8, packed.val[2]);
#else
m4x4[0] = m4x3[0];
m4x4[1] = m4x3[3];
m4x4[2] = m4x3[6];
Expand All @@ -1159,6 +1206,7 @@ inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
m4x4[9] = m4x3[5];
m4x4[10] = m4x3[8];
m4x4[11] = m4x3[11];
#endif
}

inline void Transpose4x4(float out[16], const float in[16]) {
Expand Down Expand Up @@ -1209,7 +1257,7 @@ inline Vec3<float> Vec3<float>::FromRGB(unsigned int rgb)
__m128i c = _mm_cvtsi32_si128(rgb);
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
return Vec3<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
return Vec3<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
Expand All @@ -1228,7 +1276,7 @@ inline Vec3<int> Vec3<int>::FromRGB(unsigned int rgb)
__m128i c = _mm_cvtsi32_si128(rgb);
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
return Vec3<int>(c);
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
return Vec3<int>(vreinterpretq_s32_u32(u));
Expand All @@ -1244,7 +1292,7 @@ __forceinline unsigned int Vec3<float>::ToRGB() const
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
__m128i c16 = _mm_packs_epi32(c, c);
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f))));
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
Expand All @@ -1261,7 +1309,7 @@ __forceinline unsigned int Vec3<int>::ToRGB() const
#if defined(_M_SSE)
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
Expand All @@ -1278,7 +1326,7 @@ inline Vec4<float> Vec4<float>::FromRGBA(unsigned int rgba)
__m128i c = _mm_cvtsi32_si128(rgba);
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
return Vec4<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
return Vec4<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
Expand All @@ -1304,7 +1352,7 @@ inline Vec4<int> Vec4<int>::FromRGBA(unsigned int rgba)
__m128i c = _mm_cvtsi32_si128(rgba);
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
return Vec4<int>(c);
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
return Vec4<int>(vreinterpretq_s32_u32(u));
Expand All @@ -1320,7 +1368,7 @@ __forceinline unsigned int Vec4<float>::ToRGBA() const
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
__m128i c16 = _mm_packs_epi32(c, c);
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f))));
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
Expand All @@ -1338,7 +1386,7 @@ __forceinline unsigned int Vec4<int>::ToRGBA() const
#if defined(_M_SSE)
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
#elif PPSSPP_ARCH(ARM64_NEON)
#elif PPSSPP_ARCH(ARM_NEON)
uint16x4_t c16 = vqmovun_s32(ivec);
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
Expand Down
2 changes: 1 addition & 1 deletion ppsspp_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
#if defined(__aarch64__) || defined(_M_ARM64)
#define PPSSPP_ARCH_ARM64 1
#define PPSSPP_ARCH_64BIT 1
#define PPSSPP_ARCH_ARM_NEON 1
#define PPSSPP_ARCH_ARM_NEON 1 // Applies to both ARM32 and ARM64
#define PPSSPP_ARCH_ARM64_NEON 1
#endif

Expand Down