Compare commits

...

6 Commits

Author SHA1 Message Date
Falco Girgis
430ab335de Fixed really stupid mistake.
- Whoops, accidentally took x to the 4th power in MATH_Fast_Invert().
2025-02-28 23:01:15 -06:00
Falco Girgis
f0052c79b9 TnL micro gainz: faster inverse + mat3 loading
This PR is just for a couple of quick micro-gainz which caught my eye:

1) The rw_mat_load_3x3() routine now has better instruction scheduling
   for (mostly) superscalar dual dispatch.
2) Every time we used MATH_Fast_Inverse() for perspective division, we
   were doing extra work to maintain the value's sign.
    - Added MATH_Very_Fast_Invert(x) which doesn't maintain sign bit
2025-02-28 17:02:00 -06:00
Stefanos Kornilios Mitsis Poiitidis
3a4aa88259 Merge branch 'skmp/oix-emulator-workarounds' into 'main'
Submit via OCR path if emulator is detected with no support for OIX

See merge request skmp/dca3-game!26
2025-02-25 17:07:58 +00:00
Stefanos Kornilios Mitsis Poiitidis
97a4e0d3a4 Merge branch 'skmp/fix-sniper' into 'main'
Fix sniper bug / bulletInfoInUse counting

See merge request skmp/dca3-game!25
2025-02-25 17:01:28 +00:00
Stefanos Kornilios Mitsis Poiitidis
7748c85d6b Submit via OCR path if emulator is detected with no support for OIX 2025-02-25 18:55:51 +02:00
Stefanos Kornilios Mitsis Poiitidis
e5308dea97 Fix sniper bug / bulletInfoInUse counting 2025-02-25 18:54:32 +02:00
2 changed files with 328 additions and 38 deletions

View File

@@ -126,6 +126,28 @@ void CBulletInfo::Update(void)
}
pPed->InflictDamage(pBullet->m_pSource, pBullet->m_eWeaponType, pBullet->m_nDamage, (ePedPieceTypes)point.pieceB, pPed->GetLocalDirection(pPed->GetPosition() - point.point));
CEventList::RegisterEvent(pPed->m_nPedType == PEDTYPE_COP ? EVENT_SHOOT_COP : EVENT_SHOOT_PED, EVENT_ENTITY_PED, pPed, (CPed*)pBullet->m_pSource, 1000);
if (CGame::nastyGame) {
CVector vecParticleDirection = (point.point - pPed->GetPosition()) * 0.01f;
vecParticleDirection.z = 0.01f;
if (pPed->GetIsOnScreen()) {
for (int j = 0; j < NUM_PED_BLOOD_PARTICLES; j++)
CParticle::AddParticle(PARTICLE_BLOOD_SMALL, point.point + BLOOD_PARTICLE_OFFSET, vecParticleDirection);
}
if (pPed->GetPedState() == PED_DEAD) {
CAnimBlendAssociation* pAnim;
if (RpAnimBlendClumpGetFirstAssociation(pPed->GetClump(), ASSOC_FRONTAL))
pAnim = CAnimManager::BlendAnimation(pPed->GetClump(), ASSOCGRP_STD, ANIM_STD_HIT_FLOOR_FRONT, 8.0f);
else
pAnim = CAnimManager::BlendAnimation(pPed->GetClump(), ASSOCGRP_STD, ANIM_STD_HIT_FLOOR, 8.0f);
if (pAnim) {
pAnim->SetCurrentTime(0.0f);
pAnim->flags |= ASSOC_RUNNING;
pAnim->flags &= ~ASSOC_FADEOUTWHENDONE;
}
}
}
pBullet->m_bInUse = false;
#ifdef SQUEEZE_PERFORMANCE
bulletInfoInUse--;
@@ -136,31 +158,6 @@ void CBulletInfo::Update(void)
bAddSound = false;
}
}
if (CGame::nastyGame) {
CVector vecParticleDirection = (point.point - pPed->GetPosition()) * 0.01f;
vecParticleDirection.z = 0.01f;
if (pPed->GetIsOnScreen()) {
for (int j = 0; j < NUM_PED_BLOOD_PARTICLES; j++)
CParticle::AddParticle(PARTICLE_BLOOD_SMALL, point.point + BLOOD_PARTICLE_OFFSET, vecParticleDirection);
}
if (pPed->GetPedState() == PED_DEAD) {
CAnimBlendAssociation* pAnim;
if (RpAnimBlendClumpGetFirstAssociation(pPed->GetClump(), ASSOC_FRONTAL))
pAnim = CAnimManager::BlendAnimation(pPed->GetClump(), ASSOCGRP_STD, ANIM_STD_HIT_FLOOR_FRONT, 8.0f);
else
pAnim = CAnimManager::BlendAnimation(pPed->GetClump(), ASSOCGRP_STD, ANIM_STD_HIT_FLOOR, 8.0f);
if (pAnim) {
pAnim->SetCurrentTime(0.0f);
pAnim->flags |= ASSOC_RUNNING;
pAnim->flags &= ~ASSOC_FADEOUTWHENDONE;
}
}
pBullet->m_bInUse = false;
#ifdef SQUEEZE_PERFORMANCE
bulletInfoInUse--;
#endif
vecNewPos = point.point;
}
}
else if (pHitEntity->IsVehicle()) {
CVehicle* pVehicle = (CVehicle*)pHitEntity;

View File

@@ -164,8 +164,18 @@ struct alignas(32) pvr_vertex32_ut {
static_assert(sizeof(pvr_vertex16_t) == 32, "pvr_vertex16_t size mismatch");
static_assert(alignof(pvr_vertex16_t) == 32, "pvr_vertex16_t alignof mismatch");
#define MATH_Very_Fast_Invert(x) ({ 1.0f / sqrtf((x) * (x)); })
#define MATH_Fast_Invert(x) ({ (((x) < 0.0f)? -1.0f : 1.0f) * frsqrt((x) * (x)); })
__always_inline float MATH_Fast_Invert(float x) {
bool neg = 0;
if(x < 0.0f)
neg = true;
x = MATH_Very_Fast_Invert(x); // 1.0f / sqrt(x^2)
return (neg)? -x : x;
}
#define logf(...) // printf(__VA_ARGS__)
@@ -221,19 +231,21 @@ static pvr_dr_state_t drState;
frchg
fmov @%[mtx]+, dr0
fldi0 fr12
fldi0 fr13
fmov @%[mtx]+, dr2
fmov @%[mtx]+, dr4
fmov @%[mtx]+, dr6
fmov @%[mtx]+, dr8
fmov @%[mtx]+, dr10
fldi0 fr13
fmov @%[mtx]+, dr4
fldi0 fr3
fmov @%[mtx]+, dr6
fldi0 fr7
fmov @%[mtx]+, dr8
fldi0 fr11
fmov @%[mtx]+, dr10
fmov dr12, dr14
fschg
@@ -418,6 +430,66 @@ void leave_oix_() {
#endif
}
void enter_ocr_() {
#if defined(DC_SH4)
auto mask = irq_disable();
dcache_purge_all();
volatile uint32_t * CCN_CCR = (uint32_t *)0xFF00001C;
*CCN_CCR |= (1 << 5); // enable OCR (ORA)
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
irq_restore(mask);
#endif
}
void leave_ocr_() {
#if defined(DC_SH4)
auto mask = irq_disable();
dcache_inval_range(0x92000000, 8192);
dcache_purge_all();
volatile uint32_t * CCN_CCR = (uint32_t *)0xFF00001C;
*CCN_CCR &= ~( 1 << 5); // disable OCR (ORA)
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
__asm__ __volatile__ ("nop");
irq_restore(mask);
#endif
}
#if defined(DC_SH4)
#define FLUSH_TA_DATA(src) do { __asm__ __volatile__("ocbwb @%0" : : "r" (src) : "memory"); } while(0)
#else
@@ -606,7 +678,7 @@ struct atomic_context_t {
__always_inline void DCE_RenderSubmitVertex(const pvr_vertex_t *v, uint32_t flags) {
auto *sq = reinterpret_cast<uint32_t *>(pvr_dr_target(drState));
auto *src = reinterpret_cast<const uint32_t *>(v);
float sz = MATH_Fast_Invert(v->z);
float sz = MATH_Very_Fast_Invert(v->z);
float sx = v->x * sz;
float sy = v->y * sz;
@@ -633,7 +705,7 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w,
{
auto *sq = reinterpret_cast<uint32_t *>(pvr_dr_target(drState));
auto *uv32 = reinterpret_cast<const uint32_t *>(uv);
float sz = MATH_Fast_Invert(w);
float sz = MATH_Very_Fast_Invert(w);
float sx = x * sz;
float sy = y * sz;
@@ -1429,7 +1501,7 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert
pvrVert->flags = flags;
pvrVert->x = gtaVert.x;
pvrVert->y = gtaVert.y;
pvrVert->z = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case...
pvrVert->z = MATH_Very_Fast_Invert(gtaVert.w); // this is perfect for almost every case...
pvrVert->u = gtaVert.u;
pvrVert->v = gtaVert.v;
pvrVert->argb = (gtaVert.a << 24) |
@@ -2362,6 +2434,36 @@ __attribute__ ((noinline)) void submitMeshlet(uint8_t* OCR, const int8_t* indexD
} while(--indexCount);
}
template<bool textured>
__attribute__ ((noinline)) void submitMeshletFallback(uint8_t* OCR, const int8_t* indexData, uint32_t indexCount) {
struct SQBUF {
union {
uint32_t flags;
uint64_t data[4];
uint8_t data8[32];
};
};
SQBUF* sq = (SQBUF*)pvr_dr_target(drState);
static_assert(sizeof(SQBUF) == 32);
do {
auto idx = *indexData++;
auto flags = idx & 0x80 ? PVR_CMD_VERTEX_EOL : PVR_CMD_VERTEX;
auto lookup_idx = idx & 0x7F;
auto src = (SQBUF*)(OCR + lookup_idx * 64);
src[0].flags = flags;
*sq = src[0];
pvr_dr_commit(sq);
if (textured) {
*sq = src[1];
pvr_dr_commit(sq);
}
} while(--indexCount);
}
#if defined(DC_SH4)
template<>
@@ -2441,7 +2543,7 @@ __attribute__ ((noinline)) void submitMeshlet<true>(uint8_t* OCR, const int8_t*
// 8 kb in total
#if defined(DC_SH4)
uint8_t* OCR_SPACE = (uint8_t*)0x92000000;
uint8_t* OCR_SPACE;
#else
uint8_t OCR_SPACE[32 * 256] __attribute__((aligned(32)));
#endif
@@ -2672,6 +2774,159 @@ __attribute__ ((noinline)) void clipAndsubmitMeshlet(uint8_t* vertexData, const
}
};
} while(indexCount != 0);
#undef FILLVERT
#undef SUBMIT_VTX
#undef SUBMIT_INTERPOLATE
}
template<bool textured>
__attribute__ ((noinline)) void clipAndsubmitMeshletFallback(uint8_t* vertexData, const int8_t* indexData, uint32_t indexCount) {
struct SQBUF {
union {
uint32_t flags;
uint64_t data[4];
uint8_t data8[32];
};
};
static_assert(sizeof(SQBUF) == 32);
SQBUF* sq = (SQBUF*)pvr_dr_target(drState);
constexpr int8_t VERTEX = 0;
constexpr int8_t VERTEX_EOL = 0x80;
#define FILLVERT(n) \
do { \
auto idx = *indexData++; \
auto local_idx = idx & 0x7f; \
eol_now = idx & 0x80; \
auto local_ptr = (vertexData + local_idx * 64); \
vpp[n] = local_ptr; \
auto v = (const pvr_vertex64_t*)local_ptr; \
vismask >>= 1; \
if((textured?v->tex_z:v->o_b) >= -v->o_g) vismask |= 0b100; \
indexCount--; \
currentCount++; \
} while(0)
#define SUBMIT_VTX(vid, eolf) \
do { \
auto src = (SQBUF*) vpp[vid]; \
src[0].flags = eolf ? PVR_CMD_VERTEX_EOL : PVR_CMD_VERTEX; \
*sq = src[0]; \
pvr_dr_commit(sq); \
if (textured) { \
*sq = src[1]; \
pvr_dr_commit(sq); \
} \
} while(0)
#define SUBMIT_INTERPOLATE(vid1, vid2, eolf) \
do { \
sq = (SQBUF*)interpolateAndSubmit<textured>(sq, vpp[vid1], vpp[vid2], eolf ? PVR_CMD_VERTEX_EOL : PVR_CMD_VERTEX); \
} while(0)
uint32_t vismask = 0;
uint8_t* vpp[3];
int8_t eol = 0;
int8_t eol_now = 0;
do {
uint32_t currentCount = -1;
FILLVERT(0);
FILLVERT(1);
FILLVERT(2);
if (vismask & 1) {
SUBMIT_VTX(0, VERTEX);
if (vismask & 2) {
// both first verts visible
SUBMIT_VTX(1, VERTEX);
} else {
// 0 visible, 1 hidden
SUBMIT_INTERPOLATE(0, 1, VERTEX);
}
} else if (vismask & 2) {
// 0 hidden, 1 visible
SUBMIT_INTERPOLATE(1, 0, VERTEX);
SUBMIT_VTX(1, VERTEX);
}
eol = 0;
// each remaining vertex of the strip
while(!eol) {
// "ring buffery" indices
uint8_t vertZeroIdx = (currentCount - 2) % 3;
uint8_t vertOneIdx = (currentCount - 1) % 3;
uint8_t vertTwoIdx = currentCount % 3;
//dcache_pref_block(&vph[vertZeroIdx]); not sure where to put this honestly -jaxyn
eol = eol_now;
if (!vismask) {
if (!eol) {
// "ring buffery" filling
FILLVERT(vertZeroIdx);
}
continue;
}
if (vismask == 7) {
// all visible
SUBMIT_VTX(vertTwoIdx, eol);
if (!eol) {
// "ring buffery" filling
FILLVERT(vertZeroIdx);
}
continue;
}
switch (vismask) {
case 1: // 0 visible, 1 and 2 hidden
// pause strip
SUBMIT_INTERPOLATE(vertZeroIdx, vertTwoIdx, VERTEX_EOL);
break;
case 3: // 0 and 1 visible, 2 hidden
SUBMIT_INTERPOLATE(vertZeroIdx, vertTwoIdx, VERTEX);
SUBMIT_VTX(vertOneIdx, VERTEX);
case 2: // 0 hidden, 1 visible, 2 hidden
SUBMIT_INTERPOLATE(vertOneIdx, vertTwoIdx, eol);
break;
case 4: // 0 and 1 hidden, 2 visible
SUBMIT_INTERPOLATE(vertTwoIdx, vertZeroIdx, VERTEX);
if (currentCount & 0x01) { // flip directionality
case 5: // 0 visible, 1 hidden, 2 visible
SUBMIT_VTX(vertTwoIdx, VERTEX);
}
SUBMIT_INTERPOLATE(vertTwoIdx, vertOneIdx, VERTEX);
SUBMIT_VTX(vertTwoIdx, eol);
break;
case 6: // 0 hidden, 1 and 2 visible
SUBMIT_INTERPOLATE(vertTwoIdx, vertZeroIdx, VERTEX);
SUBMIT_VTX(vertOneIdx, VERTEX);
SUBMIT_VTX(vertTwoIdx, eol);
break;
default:
break;
}
if (!eol) {
// "ring buffery" filling
FILLVERT(vertZeroIdx);
}
};
} while(indexCount != 0);
#undef FILLVERT
#undef SUBMIT_VTX
#undef SUBMIT_INTERPOLATE
}
@@ -2938,16 +3193,26 @@ static constexpr void (*tnlMeshletDiffuseColorSelector[8])(uint8_t* dstCol, cons
&tnlMeshletDiffuseColor<4, true>,
};
static constexpr void (*submitMeshletSelector[2])(uint8_t* OCR, const int8_t* indexData, uint32_t indexCount) = {
static void (*submitMeshletSelector[2])(uint8_t* OCR, const int8_t* indexData, uint32_t indexCount) = {
&submitMeshlet<false>,
&submitMeshlet<true>,
};
static constexpr void (*clipAndsubmitMeshletSelector[2])(uint8_t* OCR, const int8_t* indexData, uint32_t indexCount) = {
static void (*submitMeshletSelectorFallback[2])(uint8_t* OCR, const int8_t* indexData, uint32_t indexCount) = {
&submitMeshletFallback<false>,
&submitMeshletFallback<true>,
};
static void (*clipAndsubmitMeshletSelector[2])(uint8_t* OCR, const int8_t* indexData, uint32_t indexCount) = {
&clipAndsubmitMeshlet<false>,
&clipAndsubmitMeshlet<true>,
};
static void (*clipAndsubmitMeshletSelectorFallback[2])(uint8_t* OCR, const int8_t* indexData, uint32_t indexCount) = {
&clipAndsubmitMeshletFallback<false>,
&clipAndsubmitMeshletFallback<true>,
};
static constexpr void(*tnlMeshletSkinVerticesSelector[4])(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* vertex, const uint8_t* normals, const uint8_t* skinWeights, const uint8_t* skinIndexes, int vertexCount, int vertexSize, Matrix* skinMatrices) = {
&tnlMeshletSkinVertices<false, false>,
&tnlMeshletSkinVertices<true , false>,
@@ -4382,6 +4647,34 @@ ObjPipeline* makeDefaultPipeline(void)
static void*
driverOpen(void *o, int32, int32)
{
#if defined(DC_SH4)
OCR_SPACE = (uint8_t*)0x92000000;
bool has_oix = true;
enter_oix();
*(volatile uint8_t*)OCR_SPACE = 1;
if (*(volatile uint8_t*)OCR_SPACE != 1) {
has_oix = false;
}
leave_oix();
if (!has_oix) {
dbglog(DBG_CRITICAL, "You appear to be using an emulator that does not support OIX. Attempting fallback to OCR\n");
OCR_SPACE = (uint8_t*)0x7c001000;
enter_oix = (void(*)())(((uintptr_t)&enter_ocr_) - 0x8c000000 + 0xAc000000);
leave_oix = (void(*)())(((uintptr_t)&leave_ocr_) - 0x8c000000 + 0xAc000000);
for (size_t i = 0; i < ARRAY_SIZE(submitMeshletSelector); i++) {
submitMeshletSelector[i] = submitMeshletSelectorFallback[i];
}
for (size_t i = 0; i < ARRAY_SIZE(clipAndsubmitMeshletSelector); i++) {
clipAndsubmitMeshletSelector[i] = clipAndsubmitMeshletSelectorFallback[i];
}
}
#endif
pvr_init(&pvr_params);
fake_tex = pvr_mem_malloc(sizeof(fake_tex_data));