From 061131ec8aed90d11c14561aac0223d64c9b463e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 30 Jul 2023 14:42:22 +0200 Subject: [PATCH 1/3] Cache planes used for BBOX culling This isn't a huge performance boost for the games that use BBOX (like Tekken), but it'll be more valuable if we start using soft culling more widely automatically, see #17808 --- GPU/Common/DrawEngineCommon.cpp | 125 ++++++++++++++++---------------- GPU/Common/DrawEngineCommon.h | 14 ++++ GPU/Common/ShaderCommon.h | 3 +- GPU/GPU.h | 9 ++- GPU/GPUCommon.cpp | 1 + GPU/GPUCommonHW.cpp | 38 +++++----- 6 files changed, 106 insertions(+), 84 deletions(-) diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index bd04fa13f120..de00bf5d6af5 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -19,9 +19,9 @@ #include #include "Common/Data/Convert/ColorConv.h" -#include "Common/Math/lin/matrix4x4.h" #include "Common/Profiler/Profiler.h" #include "Common/LogReporting.h" +#include "Common/Math/lin/matrix4x4.h" #include "Core/Config.h" #include "GPU/Common/DrawEngineCommon.h" #include "GPU/Common/SplineCommon.h" @@ -136,21 +136,6 @@ std::string DrawEngineCommon::DebugGetVertexLoaderString(std::string id, DebugSh return dec ? 
dec->GetString(stringType) : "N/A"; } -struct Plane { - float x, y, z, w; - void Set(float _x, float _y, float _z, float _w) { x = _x; y = _y; z = _z; w = _w; } - float Test(const float f[3]) const { return x * f[0] + y * f[1] + z * f[2] + w; } -}; - -static void PlanesFromMatrix(const float mtx[16], Plane planes[6]) { - planes[0].Set(mtx[3]-mtx[0], mtx[7]-mtx[4], mtx[11]-mtx[8], mtx[15]-mtx[12]); // Right - planes[1].Set(mtx[3]+mtx[0], mtx[7]+mtx[4], mtx[11]+mtx[8], mtx[15]+mtx[12]); // Left - planes[2].Set(mtx[3]+mtx[1], mtx[7]+mtx[5], mtx[11]+mtx[9], mtx[15]+mtx[13]); // Bottom - planes[3].Set(mtx[3]-mtx[1], mtx[7]-mtx[5], mtx[11]-mtx[9], mtx[15]-mtx[13]); // Top - planes[4].Set(mtx[3]+mtx[2], mtx[7]+mtx[6], mtx[11]+mtx[10], mtx[15]+mtx[14]); // Near - planes[5].Set(mtx[3]-mtx[2], mtx[7]-mtx[6], mtx[11]-mtx[10], mtx[15]-mtx[14]); // Far -} - static Vec3f ClipToScreen(const Vec4f& coords) { float xScale = gstate.getViewportXScale(); float xCenter = gstate.getViewportXCenter(); @@ -250,6 +235,52 @@ void DrawEngineCommon::DispatchSubmitImm(GEPrimitiveType prim, TransformedVertex } } +// Gated by DIRTY_CULL_PLANES +void DrawEngineCommon::UpdatePlanes() { + float world[16]; + float view[16]; + float worldview[16]; + float worldviewproj[16]; + ConvertMatrix4x3To4x4(world, gstate.worldMatrix); + ConvertMatrix4x3To4x4(view, gstate.viewMatrix); + // TODO: Create a Matrix4x3ByMatrix4x3, and Matrix4x4ByMatrix4x3? + Matrix4ByMatrix4(worldview, world, view); + Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); + + // Next, we need to apply viewport, scissor, region, and even offset - but only for X/Y. + // Note that the PSP does not clip against the viewport. + const Vec2f baseOffset = Vec2f(gstate.getOffsetX(), gstate.getOffsetY()); + // Region1 (rate) is used as an X1/Y1 here, matching PSP behavior. 
+ minOffset_ = baseOffset + Vec2f(std::max(gstate.getRegionRateX() - 0x100, gstate.getScissorX1()), std::max(gstate.getRegionRateY() - 0x100, gstate.getScissorY1())) - Vec2f(1.0f, 1.0f); + maxOffset_ = baseOffset + Vec2f(std::min(gstate.getRegionX2(), gstate.getScissorX2()), std::min(gstate.getRegionY2(), gstate.getScissorY2())) + Vec2f(1.0f, 1.0f); + + // Now let's apply the viewport to our scissor/region + offset range. + Vec2f inverseViewportScale = Vec2f(1.0f / gstate.getViewportXScale(), 1.0f / gstate.getViewportYScale()); + Vec2f minViewport = (minOffset_ - Vec2f(gstate.getViewportXCenter(), gstate.getViewportYCenter())) * inverseViewportScale; + Vec2f maxViewport = (maxOffset_ - Vec2f(gstate.getViewportXCenter(), gstate.getViewportYCenter())) * inverseViewportScale; + + Lin::Matrix4x4 applyViewport; + applyViewport.empty(); + // Scale to the viewport's size. + applyViewport.xx = 2.0f / (maxViewport.x - minViewport.x); + applyViewport.yy = 2.0f / (maxViewport.y - minViewport.y); + applyViewport.zz = 1.0f; + applyViewport.ww = 1.0f; + // And offset to the viewport's centers. + applyViewport.wx = -(maxViewport.x + minViewport.x) / (maxViewport.x - minViewport.x); + applyViewport.wy = -(maxViewport.y + minViewport.y) / (maxViewport.y - minViewport.y); + + float mtx[16]; + Matrix4ByMatrix4(mtx, worldviewproj, applyViewport.m); + + planes_[0].Set(mtx[3] - mtx[0], mtx[7] - mtx[4], mtx[11] - mtx[8], mtx[15] - mtx[12]); // Right + planes_[1].Set(mtx[3] + mtx[0], mtx[7] + mtx[4], mtx[11] + mtx[8], mtx[15] + mtx[12]); // Left + planes_[2].Set(mtx[3] + mtx[1], mtx[7] + mtx[5], mtx[11] + mtx[9], mtx[15] + mtx[13]); // Bottom + planes_[3].Set(mtx[3] - mtx[1], mtx[7] - mtx[5], mtx[11] - mtx[9], mtx[15] - mtx[13]); // Top + planes_[4].Set(mtx[3] + mtx[2], mtx[7] + mtx[6], mtx[11] + mtx[10], mtx[15] + mtx[14]); // Near + planes_[5].Set(mtx[3] - mtx[2], mtx[7] - mtx[6], mtx[11] - mtx[10], mtx[15] - mtx[14]); // Far +} + // This code has plenty of potential for optimization. 
// // It does the simplest and safest test possible: If all points of a bbox is outside a single of @@ -273,7 +304,7 @@ bool DrawEngineCommon::TestBoundingBox(const void *control_points, const void *i verts[i] = vtx[i] * (1.0f / 128.0f); } } else if ((vertType & 0xFFFFFF) == GE_VTYPE_POS_16BIT && !inds) { - const s16 *vtx = (const s16*)control_points; + const s16 *vtx = (const s16 *)control_points; for (int i = 0; i < vertexCount * 3; i++) { verts[i] = vtx[i] * (1.0f / 32768.0f); } @@ -302,70 +333,42 @@ bool DrawEngineCommon::TestBoundingBox(const void *control_points, const void *i } } - Plane planes[6]; - - float world[16]; - float view[16]; - float worldview[16]; - float worldviewproj[16]; - ConvertMatrix4x3To4x4(world, gstate.worldMatrix); - ConvertMatrix4x3To4x4(view, gstate.viewMatrix); - // TODO: Create a Matrix4x3ByMatrix4x3, and Matrix4x4ByMatrix4x3? - Matrix4ByMatrix4(worldview, world, view); - Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); - - // Next, we need to apply viewport, scissor, region, and even offset - but only for X/Y. - // Note that the PSP does not clip against the viewport. - const Vec2f baseOffset = Vec2f(gstate.getOffsetX(), gstate.getOffsetY()); - // Region1 (rate) is used as an X1/Y1 here, matching PSP behavior. - Vec2f minOffset = baseOffset + Vec2f(std::max(gstate.getRegionRateX() - 0x100, gstate.getScissorX1()), std::max(gstate.getRegionRateY() - 0x100, gstate.getScissorY1())) - Vec2f(1.0f, 1.0f); - Vec2f maxOffset = baseOffset + Vec2f(std::min(gstate.getRegionX2(), gstate.getScissorX2()), std::min(gstate.getRegionY2(), gstate.getScissorY2())) + Vec2f(1.0f, 1.0f); - - // Now let's apply the viewport to our scissor/region + offset range. 
- Vec2f inverseViewportScale = Vec2f(1.0f / gstate.getViewportXScale(), 1.0f / gstate.getViewportYScale()); - Vec2f minViewport = (minOffset - Vec2f(gstate.getViewportXCenter(), gstate.getViewportYCenter())) * inverseViewportScale; - Vec2f maxViewport = (maxOffset - Vec2f(gstate.getViewportXCenter(), gstate.getViewportYCenter())) * inverseViewportScale; - - Lin::Matrix4x4 applyViewport; - applyViewport.empty(); - // Scale to the viewport's size. - applyViewport.xx = 2.0f / (maxViewport.x - minViewport.x); - applyViewport.yy = 2.0f / (maxViewport.y - minViewport.y); - applyViewport.zz = 1.0f; - applyViewport.ww = 1.0f; - // And offset to the viewport's centers. - applyViewport.wx = -(maxViewport.x + minViewport.x) / (maxViewport.x - minViewport.x); - applyViewport.wy = -(maxViewport.y + minViewport.y) / (maxViewport.y - minViewport.y); - - float screenBounds[16]; - Matrix4ByMatrix4(screenBounds, worldviewproj, applyViewport.m); + // Due to world matrix updates per "thing", this isn't quite as effective as it could be if we did world transform + // in here as well. Though, it still does cut down on a lot of updates in Tekken 6. + if (gstate_c.IsDirty(DIRTY_CULL_PLANES)) { + UpdatePlanes(); + gpuStats.numPlaneUpdates++; + gstate_c.Clean(DIRTY_CULL_PLANES); + } - PlanesFromMatrix(screenBounds, planes); // Note: near/far are not checked without clamp/clip enabled, so we skip those planes. int totalPlanes = gstate.isDepthClampEnabled() ? 6 : 4; for (int plane = 0; plane < totalPlanes; plane++) { int inside = 0; int out = 0; for (int i = 0; i < vertexCount; i++) { - // Here we can test against the frustum planes! - float value = planes[plane].Test(verts + i * 3); + // Test against the frustum planes, and count. + // TODO: We should test 4 vertices at a time using SIMD. + // I guess we could also test one vertex against 4 planes at a time, though there's a lot of waste in the common case of 6. 
+ float value = planes_[plane].Test(verts + i * 3); if (value <= -FLT_EPSILON) out++; else inside++; } + // No vertices inside this one plane? Don't need to draw. if (inside == 0) { // All out - but check for X and Y if the offset was near the cullbox edge. bool outsideEdge = false; if (plane == 1) - outsideEdge = minOffset.x < 1.0f; + outsideEdge = minOffset_.x < 1.0f; if (plane == 2) - outsideEdge = minOffset.y < 1.0f; + outsideEdge = minOffset_.y < 1.0f; else if (plane == 0) - outsideEdge = maxOffset.x >= 4096.0f; + outsideEdge = maxOffset_.x >= 4096.0f; else if (plane == 3) - outsideEdge = maxOffset.y >= 4096.0f; + outsideEdge = maxOffset_.y >= 4096.0f; // Only consider this outside if offset + scissor/region is fully inside the cullbox. if (!outsideEdge) diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h index b9be2ee2e4e4..beb3e2775733 100644 --- a/GPU/Common/DrawEngineCommon.h +++ b/GPU/Common/DrawEngineCommon.h @@ -22,6 +22,7 @@ #include "Common/CommonTypes.h" #include "Common/Data/Collections/Hashmaps.h" +#include "GPU/Math3D.h" #include "GPU/GPUState.h" #include "GPU/Common/GPUStateUtils.h" #include "GPU/Common/GPUDebugInterface.h" @@ -68,6 +69,13 @@ class TessellationDataTransfer { virtual void SendDataToShader(const SimpleVertex *const *points, int size_u, int size_v, u32 vertType, const Spline::Weight2D &weights) = 0; }; +// Culling plane. 
+struct Plane { + float x, y, z, w; + void Set(float _x, float _y, float _z, float _w) { x = _x; y = _y; z = _z; w = _w; } + float Test(const float f[3]) const { return x * f[0] + y * f[1] + z * f[2] + w; } +}; + class DrawEngineCommon { public: DrawEngineCommon(); @@ -131,6 +139,7 @@ class DrawEngineCommon { protected: virtual bool UpdateUseHWTessellation(bool enabled) const { return enabled; } + void UpdatePlanes(); int ComputeNumVertsToDecode() const; void DecodeVerts(u8 *dest); @@ -236,4 +245,9 @@ class DrawEngineCommon { // Hardware tessellation TessellationDataTransfer *tessDataTransfer; + + // Culling + Plane planes_[6]; + Vec2f minOffset_; + Vec2f maxOffset_; }; diff --git a/GPU/Common/ShaderCommon.h b/GPU/Common/ShaderCommon.h index 513024224098..95cf3908f221 100644 --- a/GPU/Common/ShaderCommon.h +++ b/GPU/Common/ShaderCommon.h @@ -93,7 +93,7 @@ enum : uint64_t { DIRTY_LIGHT_CONTROL = 1ULL << 38, DIRTY_TEX_ALPHA_MUL = 1ULL << 39, - // Bits 40-43 are free for new uniforms. Then we're really out and need to start merging. + // Bits 40-42 are free for new uniforms. Then we're really out and need to start merging. // Don't forget to update DIRTY_ALL_UNIFORMS when you start using them. DIRTY_BONE_UNIFORMS = 0xFF000000ULL, @@ -101,6 +101,7 @@ enum : uint64_t { DIRTY_ALL_UNIFORMS = 0x0FFFFFFFFFFULL, // Other dirty elements that aren't uniforms + DIRTY_CULL_PLANES = 1ULL << 43, DIRTY_FRAMEBUF = 1ULL << 44, DIRTY_TEXTURE_IMAGE = 1ULL << 45, // Means that the definition of the texture image has changed (address, stride etc), and we need to look up again. DIRTY_TEXTURE_PARAMS = 1ULL << 46, diff --git a/GPU/GPU.h b/GPU/GPU.h index ba5ada12e8d0..9e9dd6049a41 100644 --- a/GPU/GPU.h +++ b/GPU/GPU.h @@ -66,6 +66,7 @@ inline unsigned int toFloat24(float f) { return i >> 8; } +// The stats formatting function (GPUCommonHW::FormatGPUStatsCommon) lives in GPUCommonHW.cpp. 
struct GPUStatistics { void Reset() { ResetFrame(); @@ -84,10 +85,10 @@ struct GPUStatistics { numTextureInvalidations = 0; numTextureInvalidationsByFramebuffer = 0; numTexturesHashed = 0; - numTextureSwitches = 0; numTextureDataBytesHashed = 0; - numShaderSwitches = 0; numFlushes = 0; + numBBOXJumps = 0; + numPlaneUpdates = 0; numTexturesDecoded = 0; numFramebufferEvaluations = 0; numBlockingReadbacks = 0; @@ -114,6 +115,8 @@ struct GPUStatistics { int numListSyncs; int numCachedDrawCalls; int numFlushes; + int numBBOXJumps; + int numPlaneUpdates; int numVertsSubmitted; int numCachedVertsDrawn; int numUncachedVertsDrawn; @@ -122,8 +125,6 @@ struct GPUStatistics { int numTextureInvalidationsByFramebuffer; int numTexturesHashed; int numTextureDataBytesHashed; - int numTextureSwitches; - int numShaderSwitches; int numTexturesDecoded; int numFramebufferEvaluations; int numBlockingReadbacks; diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp index c5257f241399..c92a233a8b86 100644 --- a/GPU/GPUCommon.cpp +++ b/GPU/GPUCommon.cpp @@ -931,6 +931,7 @@ void GPUCommon::Execute_BJump(u32 op, u32 diff) { if (!currentList->bboxResult) { // bounding box jump. const u32 target = gstate_c.getRelativeAddress(op & 0x00FFFFFC); + gpuStats.numBBOXJumps++; if (Memory::IsValidAddress(target)) { UpdatePC(currentList->pc, target - 4); currentList->pc = target - 4; // pc will be increased after we return, counteract that diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp index ffdf75c0d09a..701cfb1651e1 100644 --- a/GPU/GPUCommonHW.cpp +++ b/GPU/GPUCommonHW.cpp @@ -191,12 +191,12 @@ const CommonCommandTableEntry commonCommandTable[] = { { GE_CMD_ANTIALIASENABLE, FLAG_FLUSHBEFOREONCHANGE }, // Viewport. 
- { GE_CMD_OFFSETX, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE }, - { GE_CMD_OFFSETY, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE }, - { GE_CMD_VIEWPORTXSCALE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE }, - { GE_CMD_VIEWPORTYSCALE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE }, - { GE_CMD_VIEWPORTXCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE }, - { GE_CMD_VIEWPORTYCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE }, + { GE_CMD_OFFSETX, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE | DIRTY_CULL_PLANES }, + { GE_CMD_OFFSETY, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE | DIRTY_CULL_PLANES }, + { GE_CMD_VIEWPORTXSCALE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULL_PLANES }, + { GE_CMD_VIEWPORTYSCALE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULL_PLANES }, + { GE_CMD_VIEWPORTXCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULL_PLANES }, + { GE_CMD_VIEWPORTYCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULL_PLANES }, { GE_CMD_VIEWPORTZSCALE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_DEPTHRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE }, { GE_CMD_VIEWPORTZCENTER, 
FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_DEPTHRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE }, { GE_CMD_DEPTHCLAMPENABLE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE | DIRTY_RASTER_STATE }, @@ -206,12 +206,12 @@ const CommonCommandTableEntry commonCommandTable[] = { { GE_CMD_MAXZ, FLAG_FLUSHBEFOREONCHANGE, DIRTY_DEPTHRANGE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE }, // Region - { GE_CMD_REGION1, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE }, - { GE_CMD_REGION2, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE }, + { GE_CMD_REGION1, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE | DIRTY_CULL_PLANES }, + { GE_CMD_REGION2, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE | DIRTY_CULL_PLANES }, // Scissor - { GE_CMD_SCISSOR1, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE }, - { GE_CMD_SCISSOR2, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE }, + { GE_CMD_SCISSOR1, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE | DIRTY_CULL_PLANES }, + { GE_CMD_SCISSOR2, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE | DIRTY_CULL_PLANES }, // Lighting base colors { GE_CMD_AMBIENTCOLOR, FLAG_FLUSHBEFOREONCHANGE, DIRTY_AMBIENT }, @@ -1368,7 +1368,7 @@ void GPUCommonHW::Execute_WorldMtxNum(u32 op, u32 diff) { if (dst[i] != newVal) { Flush(); dst[i] = newVal; - gstate_c.Dirty(DIRTY_WORLDMATRIX); + gstate_c.Dirty(DIRTY_WORLDMATRIX | DIRTY_CULL_PLANES); } if (++i >= end) 
{ break; @@ -1391,7 +1391,7 @@ void GPUCommonHW::Execute_WorldMtxData(u32 op, u32 diff) { if (num < 12 && newVal != ((const u32 *)gstate.worldMatrix)[num]) { Flush(); ((u32 *)gstate.worldMatrix)[num] = newVal; - gstate_c.Dirty(DIRTY_WORLDMATRIX); + gstate_c.Dirty(DIRTY_WORLDMATRIX | DIRTY_CULL_PLANES); } num++; gstate.worldmtxnum = (GE_CMD_WORLDMATRIXNUMBER << 24) | (num & 0x00FFFFFF); @@ -1421,7 +1421,7 @@ void GPUCommonHW::Execute_ViewMtxNum(u32 op, u32 diff) { if (dst[i] != newVal) { Flush(); dst[i] = newVal; - gstate_c.Dirty(DIRTY_VIEWMATRIX); + gstate_c.Dirty(DIRTY_VIEWMATRIX | DIRTY_CULL_PLANES); } if (++i >= end) { break; @@ -1444,7 +1444,7 @@ void GPUCommonHW::Execute_ViewMtxData(u32 op, u32 diff) { if (num < 12 && newVal != ((const u32 *)gstate.viewMatrix)[num]) { Flush(); ((u32 *)gstate.viewMatrix)[num] = newVal; - gstate_c.Dirty(DIRTY_VIEWMATRIX); + gstate_c.Dirty(DIRTY_VIEWMATRIX | DIRTY_CULL_PLANES); } num++; gstate.viewmtxnum = (GE_CMD_VIEWMATRIXNUMBER << 24) | (num & 0x00FFFFFF); @@ -1474,7 +1474,7 @@ void GPUCommonHW::Execute_ProjMtxNum(u32 op, u32 diff) { if (dst[i] != newVal) { Flush(); dst[i] = newVal; - gstate_c.Dirty(DIRTY_PROJMATRIX); + gstate_c.Dirty(DIRTY_PROJMATRIX | DIRTY_CULL_PLANES); } if (++i >= end) { break; @@ -1497,7 +1497,7 @@ void GPUCommonHW::Execute_ProjMtxData(u32 op, u32 diff) { if (num < 16 && newVal != ((const u32 *)gstate.projMatrix)[num]) { Flush(); ((u32 *)gstate.projMatrix)[num] = newVal; - gstate_c.Dirty(DIRTY_PROJMATRIX); + gstate_c.Dirty(DIRTY_PROJMATRIX | DIRTY_CULL_PLANES); } num++; if (num <= 16) @@ -1647,8 +1647,8 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) { float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? 
(float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f; return snprintf(buffer, size, "DL processing time: %0.2f ms, %d drawsync, %d listsync\n" - "Draw calls: %d, flushes %d, clears %d (cached: %d)\n" - "Num Tracked Vertex Arrays: %d\n" + "Draw calls: %d, flushes %d, clears %d, bbox jumps %d (%d updates)\n" + "Cached draws: %d (tracked: %d)\n" "Vertices: %d cached: %d uncached: %d\n" "FBOs active: %d (evaluations: %d)\n" "Textures: %d, dec: %d, invalidated: %d, hashed: %d kB\n" @@ -1662,6 +1662,8 @@ size_t GPUCommonHW::FormatGPUStatsCommon(char *buffer, size_t size) { gpuStats.numDrawCalls, gpuStats.numFlushes, gpuStats.numClears, + gpuStats.numBBOXJumps, + gpuStats.numPlaneUpdates, gpuStats.numCachedDrawCalls, gpuStats.numTrackedVertexArrays, gpuStats.numVertsSubmitted, From fd656c629daeb5095fb87000d7a5995250dc1e62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 30 Jul 2023 17:45:19 +0200 Subject: [PATCH 2/3] More dirtying --- GPU/GPUState.cpp | 7 +++++++ GPU/Software/SoftGpu.cpp | 3 +++ 2 files changed, 10 insertions(+) diff --git a/GPU/GPUState.cpp b/GPU/GPUState.cpp index 28c13b871a3b..c5f9882202d9 100644 --- a/GPU/GPUState.cpp +++ b/GPU/GPUState.cpp @@ -128,6 +128,8 @@ void GPUgstate::Reset() { memset(gstate.boneMatrix, 0, sizeof(gstate.boneMatrix)); savedContextVersion = 1; + + gstate_c.Dirty(DIRTY_CULL_PLANES); } void GPUgstate::Save(u32_le *ptr) { @@ -258,6 +260,8 @@ void GPUgstate::Restore(const u32_le *ptr) { if (gpu) gpu->ResetMatrices(); + + gstate_c.Dirty(DIRTY_CULL_PLANES); } bool vertTypeIsSkinningEnabled(u32 vertType) { @@ -366,6 +370,9 @@ void GPUStateCache::DoState(PointerWrap &p) { } else { Do(p, savedContextVersion); } + + if (p.GetMode() == PointerWrap::MODE_READ) + gstate_c.Dirty(DIRTY_CULL_PLANES); } static const char *const gpuUseFlagNames[32] = { diff --git a/GPU/Software/SoftGpu.cpp b/GPU/Software/SoftGpu.cpp index 5f708a2df0a6..c02b0ed9f9d1 100644 --- a/GPU/Software/SoftGpu.cpp +++ 
b/GPU/Software/SoftGpu.cpp @@ -1081,6 +1081,7 @@ void SoftGPU::Execute_WorldMtxData(u32 op, u32 diff) { if (newVal != *target) { *target = newVal; dirtyFlags_ |= SoftDirty::TRANSFORM_MATRIX; + gstate_c.Dirty(DIRTY_CULL_PLANES); } } @@ -1101,6 +1102,7 @@ void SoftGPU::Execute_ViewMtxData(u32 op, u32 diff) { if (newVal != *target) { *target = newVal; dirtyFlags_ |= SoftDirty::TRANSFORM_MATRIX; + gstate_c.Dirty(DIRTY_CULL_PLANES); } } @@ -1121,6 +1123,7 @@ void SoftGPU::Execute_ProjMtxData(u32 op, u32 diff) { if (newVal != *target) { *target = newVal; dirtyFlags_ |= SoftDirty::TRANSFORM_MATRIX; + gstate_c.Dirty(DIRTY_CULL_PLANES); } } From f0fd9e85aa9e0c593169fe7e4cdbdc1908fd187d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sun, 30 Jul 2023 18:35:18 +0200 Subject: [PATCH 3/3] Try dirtying CULL_PLANES in Execute_BoundingBox in SoftGPU --- Core/HLE/sceDisplay.cpp | 2 +- GPU/Software/SoftGpu.cpp | 7 ++++++- GPU/Software/SoftGpu.h | 3 +++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Core/HLE/sceDisplay.cpp b/Core/HLE/sceDisplay.cpp index 19a1224bc0d5..70fbc0796f93 100644 --- a/Core/HLE/sceDisplay.cpp +++ b/Core/HLE/sceDisplay.cpp @@ -594,7 +594,7 @@ void __DisplayFlip(int cyclesLate) { #ifndef _DEBUG auto err = GetI18NCategory(I18NCat::ERRORS); if (g_Config.bSoftwareRendering) { - g_OSD.Show(OSDType::MESSAGE_INFO, err->T("Running slow: Try turning off Software Rendering")); + g_OSD.Show(OSDType::MESSAGE_INFO, err->T("Running slow: Try turning off Software Rendering"), 5.0f); } else { g_OSD.Show(OSDType::MESSAGE_INFO, err->T("Running slow: try frameskip, sound is choppy when slow")); } diff --git a/GPU/Software/SoftGpu.cpp b/GPU/Software/SoftGpu.cpp index c02b0ed9f9d1..b385680ebe90 100644 --- a/GPU/Software/SoftGpu.cpp +++ b/GPU/Software/SoftGpu.cpp @@ -81,7 +81,7 @@ const SoftwareCommandTableEntry softgpuCommandTable[] = { { GE_CMD_VADDR, FLAG_EXECUTE, SoftDirty::NONE, &GPUCommon::Execute_Vaddr }, { GE_CMD_IADDR, FLAG_EXECUTE, 
SoftDirty::NONE, &GPUCommon::Execute_Iaddr }, { GE_CMD_BJUMP, FLAG_EXECUTE | FLAG_READS_PC | FLAG_WRITES_PC, SoftDirty::NONE, &GPUCommon::Execute_BJump }, - { GE_CMD_BOUNDINGBOX, FLAG_EXECUTE, SoftDirty::NONE, &GPUCommon::Execute_BoundingBox }, + { GE_CMD_BOUNDINGBOX, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_BoundingBox }, { GE_CMD_PRIM, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_Prim }, { GE_CMD_BEZIER, FLAG_EXECUTE, SoftDirty::NONE, &SoftGPU::Execute_Bezier }, @@ -1031,6 +1031,11 @@ void SoftGPU::Execute_FramebufFormat(u32 op, u32 diff) { drawEngine_->transformUnit.Flush("framebuf"); } +void SoftGPU::Execute_BoundingBox(u32 op, u32 diff) { + gstate_c.Dirty(DIRTY_CULL_PLANES); + GPUCommon::Execute_BoundingBox(op, diff); +} + void SoftGPU::Execute_ZbufPtr(u32 op, u32 diff) { // We assume depthbuf.data won't change while we're drawing. if (diff) { diff --git a/GPU/Software/SoftGpu.h b/GPU/Software/SoftGpu.h index 8171c0e6b3c2..a5c9a1730162 100644 --- a/GPU/Software/SoftGpu.h +++ b/GPU/Software/SoftGpu.h @@ -187,6 +187,9 @@ class SoftGPU : public GPUCommon { // Overridden to change flushing behavior. void Execute_Call(u32 op, u32 diff); + // Overridden for a dirty flag change. + void Execute_BoundingBox(u32 op, u32 diff); + void Execute_WorldMtxNum(u32 op, u32 diff); void Execute_ViewMtxNum(u32 op, u32 diff); void Execute_ProjMtxNum(u32 op, u32 diff);