Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IR Interpreter: Some minor optimizations #19231

Merged
merged 2 commits into from
Jun 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion Common/BitSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,21 @@
#include <cstddef>
#include "CommonTypes.h"

// Helper functions:
// TODO: ARM has an intrinsic for the RBIT instruction in some compilers, __rbit.
inline u32 ReverseBits32(u32 v) {
// http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
// swap odd and even bits
v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
// swap consecutive pairs
v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
// swap nibbles ...
v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
// swap bytes
v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
// swap 2-byte long pairs
v = (v >> 16) | (v << 16);
return v;
}

#ifdef _WIN32
#include <intrin.h>
Expand All @@ -35,6 +49,7 @@ inline int LeastSignificantSetBit(u64 val)
_BitScanForward64(&index, val);
return (int)index;
}

#endif
#else
inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
Expand Down
111 changes: 52 additions & 59 deletions Core/MIPS/IR/IRInterpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <cmath>

#include "ppsspp_config.h"
#include "Common/BitSet.h"
#include "Common/BitScan.h"
#include "Common/Common.h"
#include "Common/Data/Convert/SmallDataConvert.h"
Expand Down Expand Up @@ -88,25 +89,6 @@ u32 IRRunMemCheck(u32 pc, u32 addr) {
return coreState != CORE_RUNNING ? 1 : 0;
}

template <uint32_t alignment>
u32 RunValidateAddress(u32 pc, u32 addr, u32 isWrite) {
const auto toss = [&](MemoryExceptionType t) {
Core_MemoryException(addr, alignment, pc, t);
return coreState != CORE_RUNNING ? 1 : 0;
};

if (!Memory::IsValidRange(addr, alignment)) {
MemoryExceptionType t = isWrite == 1 ? MemoryExceptionType::WRITE_WORD : MemoryExceptionType::READ_WORD;
if constexpr (alignment > 4)
t = isWrite ? MemoryExceptionType::WRITE_BLOCK : MemoryExceptionType::READ_BLOCK;
return toss(t);
}
if constexpr (alignment > 1)
if ((addr & (alignment - 1)) != 0)
return toss(MemoryExceptionType::ALIGNMENT);
return 0;
}

// We cannot use NEON on ARM32 here until we make it a hard dependency. We can, however, on ARM64.
u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
while (true) {
Expand Down Expand Up @@ -166,31 +148,6 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
mips->r[inst->dest] = ReverseBits32(mips->r[inst->src1]);
break;

case IROp::ValidateAddress8:
if (RunValidateAddress<1>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {
CoreTiming::ForceCheck();
return mips->pc;
}
break;
case IROp::ValidateAddress16:
if (RunValidateAddress<2>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {
CoreTiming::ForceCheck();
return mips->pc;
}
break;
case IROp::ValidateAddress32:
if (RunValidateAddress<4>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {
CoreTiming::ForceCheck();
return mips->pc;
}
break;
case IROp::ValidateAddress128:
if (RunValidateAddress<16>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {
CoreTiming::ForceCheck();
return mips->pc;
}
break;

case IROp::Load8:
mips->r[inst->dest] = Memory::ReadUnchecked_U8(mips->r[inst->src1] + inst->constant);
break;
Expand Down Expand Up @@ -369,6 +326,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_div_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_load_ps(&mips->f[inst->src2])));
#elif PPSSPP_ARCH(ARM64_NEON)
vst1q_f32(&mips->f[inst->dest], vdivq_f32(vld1q_f32(&mips->f[inst->src1]), vld1q_f32(&mips->f[inst->src2])));
#else
for (int i = 0; i < 4; i++)
mips->f[inst->dest + i] = mips->f[inst->src1 + i] / mips->f[inst->src2 + i];
Expand All @@ -380,6 +339,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
{
#if defined(_M_SSE)
_mm_store_ps(&mips->f[inst->dest], _mm_mul_ps(_mm_load_ps(&mips->f[inst->src1]), _mm_set1_ps(mips->f[inst->src2])));
#elif PPSSPP_ARCH(ARM_NEON)
vst1q_f32(&mips->f[inst->dest], vmulq_lane_f32(vld1q_f32(&mips->f[inst->src1]), vdup_n_f32(mips->f[inst->src2]), 0));
#else
const float factor = mips->f[inst->src2];
for (int i = 0; i < 4; i++)
Expand Down Expand Up @@ -435,6 +396,11 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
src = _mm_unpacklo_epi8(src, _mm_setzero_si128());
src = _mm_unpacklo_epi16(src, _mm_setzero_si128());
_mm_store_si128((__m128i *)&mips->fi[inst->dest], _mm_slli_epi32(src, 24));
#elif PPSSPP_ARCH(ARM_NEON) && 0 // Untested
const uint8x8_t value = (uint8x8_t)vdup_n_u32(mips->fi[inst->src1]);
const uint16x8_t value16 = vmovl_u8(value);
const uint32x4_t value32 = vshll_n_u16(vget_low_u16(value16), 24);
vst1q_u32(&mips->fi[inst->dest], value32);
#else
mips->fi[inst->dest] = (mips->fi[inst->src1] << 24);
mips->fi[inst->dest + 1] = (mips->fi[inst->src1] << 16) & 0xFF000000;
Expand Down Expand Up @@ -523,8 +489,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {

case IROp::FCmpVfpuBit:
{
int op = inst->dest & 0xF;
int bit = inst->dest >> 4;
const int op = inst->dest & 0xF;
const int bit = inst->dest >> 4;
int result = 0;
switch (op) {
case VC_EQ: result = mips->f[inst->src1] == mips->f[inst->src2]; break;
Expand Down Expand Up @@ -556,8 +522,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {

case IROp::FCmpVfpuAggregate:
{
u32 mask = inst->dest;
u32 cc = mips->vfpuCtrl[VFPU_CTRL_CC];
const u32 mask = inst->dest;
const u32 cc = mips->vfpuCtrl[VFPU_CTRL_CC];
int anyBit = (cc & mask) ? 0x10 : 0x00;
int allBit = (cc & mask) == mask ? 0x20 : 0x00;
mips->vfpuCtrl[VFPU_CTRL_CC] = (cc & ~0x30) | anyBit | allBit;
Expand Down Expand Up @@ -759,13 +725,14 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
case IROp::BSwap16:
{
u32 x = mips->r[inst->src1];
// Don't think we can beat this with intrinsics.
mips->r[inst->dest] = ((x & 0xFF00FF00) >> 8) | ((x & 0x00FF00FF) << 8);
break;
}
case IROp::BSwap32:
{
u32 x = mips->r[inst->src1];
mips->r[inst->dest] = ((x & 0xFF000000) >> 24) | ((x & 0x00FF0000) >> 8) | ((x & 0x0000FF00) << 8) | ((x & 0x000000FF) << 24);
mips->r[inst->dest] = swap32(x);
break;
}

Expand Down Expand Up @@ -1078,10 +1045,6 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
break;
}

case IROp::Break:
Core_Break(mips->pc);
return mips->pc + 4;

case IROp::SetCtrlVFPU:
mips->vfpuCtrl[inst->dest] = inst->constant;
break;
Expand All @@ -1094,6 +1057,20 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
memcpy(&mips->vfpuCtrl[inst->dest], &mips->f[inst->src1], 4);
break;

case IROp::ApplyRoundingMode:
// TODO: Implement
break;
case IROp::RestoreRoundingMode:
// TODO: Implement
break;
case IROp::UpdateRoundingMode:
// TODO: Implement
break;

case IROp::Break:
Core_Break(mips->pc);
return mips->pc + 4;

case IROp::Breakpoint:
if (IRRunBreakpoint(inst->constant)) {
CoreTiming::ForceCheck();
Expand All @@ -1108,15 +1085,31 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
}
break;

case IROp::ApplyRoundingMode:
// TODO: Implement
case IROp::ValidateAddress8:
if (RunValidateAddress<1>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {
CoreTiming::ForceCheck();
return mips->pc;
}
break;
case IROp::RestoreRoundingMode:
// TODO: Implement
case IROp::ValidateAddress16:
if (RunValidateAddress<2>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {
CoreTiming::ForceCheck();
return mips->pc;
}
break;
case IROp::UpdateRoundingMode:
// TODO: Implement
case IROp::ValidateAddress32:
if (RunValidateAddress<4>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {
CoreTiming::ForceCheck();
return mips->pc;
}
break;
case IROp::ValidateAddress128:
if (RunValidateAddress<16>(mips->pc, mips->r[inst->src1] + inst->constant, inst->src2)) {
CoreTiming::ForceCheck();
return mips->pc;
}
break;

case IROp::Nop:
_assert_(false);
break;
Expand Down
37 changes: 21 additions & 16 deletions Core/MIPS/IR/IRInterpreter.h
Original file line number Diff line number Diff line change
@@ -1,26 +1,31 @@
#pragma once

#include "Common/CommonTypes.h"
#include "Core/Core.h"
#include "Core/MemMap.h"

class MIPSState;
struct IRInst;

inline static u32 ReverseBits32(u32 v) {
// http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
// swap odd and even bits
v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
// swap consecutive pairs
v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
// swap nibbles ...
v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
// swap bytes
v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
// swap 2-byte long pairs
v = ( v >> 16 ) | ( v << 16);
return v;
}

u32 IRRunBreakpoint(u32 pc);
u32 IRRunMemCheck(u32 pc, u32 addr);

u32 IRInterpret(MIPSState *ms, const IRInst *inst);

template <uint32_t alignment>
u32 RunValidateAddress(u32 pc, u32 addr, u32 isWrite) {
const auto toss = [&](MemoryExceptionType t) {
Core_MemoryException(addr, alignment, pc, t);
return coreState != CORE_RUNNING ? 1 : 0;
};

if (!Memory::IsValidRange(addr, alignment)) {
MemoryExceptionType t = isWrite == 1 ? MemoryExceptionType::WRITE_WORD : MemoryExceptionType::READ_WORD;
if constexpr (alignment > 4)
t = isWrite ? MemoryExceptionType::WRITE_BLOCK : MemoryExceptionType::READ_BLOCK;
return toss(t);
}
if constexpr (alignment > 1)
if ((addr & (alignment - 1)) != 0)
return toss(MemoryExceptionType::ALIGNMENT);
return 0;
}
Loading