Merge branch 'falco/dreamsdk4_fixes' into 'main'

Fixed builds for DreamSDK4. See merge request skmp/dca3-game!102
Fixed builds for DreamSDK4.
2025-10-02 17:30:23 +00:00 · 2025-07-15 16:55:44 -05:00 · 2025-06-15 16:56:46 +00:00 · 2025-06-15 11:36:40 -05:00 · 2025-05-07 07:30:02 +00:00 · 2025-05-07 10:18:05 +03:00
47 changed files with 3487 additions and 1008 deletions
--- a/liberty/common.mk
+++ b/liberty/common.mk
@@ -381,6 +381,7 @@ INCLUDE = \
 -I../src/liberty/skel/win \
 \
 -I../vendor/librw \
+-I../vendor/librw/src/dc \
 \
 -I../vendor/miniLZO \
 \
--- a/miami/common.mk
+++ b/miami/common.mk
@@ -396,6 +396,7 @@ INCLUDE = \
 -I../src/miami/skel/win \
 \
 -I../vendor/librw \
+-I../vendor/librw/src/dc \
 \
 -I../vendor/miniLZO \
 \
--- a/src/common/vmu/vmu.cpp
+++ b/src/common/vmu/vmu.cpp
@@ -94,23 +94,26 @@ void VmuProfiler::run() {
            pvr_stats_t pvrStats;   pvr_get_stats(&pvrStats);
            uint32_t    sramStats = snd_mem_available();
            size_t      pvrAvail  = pvr_mem_available();
-            float       fps       = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f) 
-                                    / static_cast<float>(fpsSamples);

            float sh4Mem = heapUtilization();
            float pvrMem = (8_MB - pvrAvail ) / 8_MB * 100.0f;
            float armMem = (2_MB - sramStats) / 2_MB * 100.0f;
-            float vtxBuf = vertBuffUse_;
-            {
-                std::shared_lock lk(mtx_);

-                vmu_printf("FPS :%6.2f\n"
-                           "SH4 :%6.2f%%\n"
-                           "PVR :%6.2f%%\n"
-                           "ARM :%6.2f%%\n"
-                           "VTX :%6.2f%%",
-                           fps, sh4Mem, pvrMem, armMem, vtxBuf);
+            float vtxBuf;
+            float fps;
+            { /* Critical section with main thread. */
+                std::shared_lock lk(mtx_);
+                vtxBuf = vertBuffUse_;
+                fps    = std::accumulate(std::begin(fps_), std::end(fps_), 0.0f)
+                       / static_cast<float>(fpsSamples);
            }
+
+            vmu_printf(" FPS:%6.2f\n"
+                       " RAM:%6.2f%%\n"
+                       "VRAM:%6.2f%%\n"
+                       "SRAM:%6.2f%%\n"
+                       " VTX:%6.2f%%",
+                       fps, sh4Mem, pvrMem, armMem, vtxBuf);
        }
 #endif

@@ -119,15 +122,21 @@ void VmuProfiler::run() {
 }

 void VmuProfiler::updateVertexBufferUsage() {
+#ifndef DC_SH4
    std::unique_lock lk(mtx_);
    updated_ = true;

-#ifdef DC_SH4
-    vertBuffUse_ = vertexBufferUtilization();
+#else
+    pvr_stats_t pvrStats;
+    pvr_get_stats(&pvrStats);
+    float vtxUtil  = vertexBufferUtilization();

-    pvr_stats_t pvrStats; 
-    pvr_get_stats(&pvrStats);       
-    fps_[fpsFrame_++] = pvrStats.frame_rate;
+    { /* Critical section with VMU thread. */
+        std::unique_lock lk(mtx_);
+        vertBuffUse_ = vtxUtil;
+        updated_ = true;
+        fps_[fpsFrame_++] = pvrStats.frame_rate;
+    }

    if(fpsFrame_ >= fpsSamples)
        fpsFrame_ = 0;
--- a/src/liberty/collision/Collision.cpp
+++ b/src/liberty/collision/Collision.cpp
@@ -1619,7 +1619,7 @@ CCollision::ProcessLineOfSight(const CColLine &line,
 		point.point = matrix * point.point;
 		point.normal = Multiply3x3(matrix, point.normal);
 #else
-		mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix)));
+		dc::mat_load2(matrix);
 		mat_trans_single3_nodiv(point.point.x,
 		                        point.point.y,
 		                        point.point.z);
@@ -1798,7 +1798,7 @@ CCollision::ProcessVerticalLine(const CColLine &line,
 		point.point = matrix * point.point;
 		point.normal = Multiply3x3(matrix, point.normal);
 #else
-		mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix)));
+		dc::mat_load2(matrix);
 		mat_trans_single3_nodiv(point.point.x,
 		                        point.point.y,
 		                        point.point.z);
@@ -2173,8 +2173,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 #ifndef DC_SH4
 	matAB *= matrixA;
 #else
-	mat_load(reinterpret_cast<const matrix_t*>(&matAB));
-	mat_apply(reinterpret_cast<const matrix_t*>(&matrixA));
+	dc::mat_load_apply(matAB, matrixA);
 #endif

 	CColSphere bsphereAB;	// bounding sphere of A in B space
@@ -2246,8 +2245,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 #ifndef DC_SH4
 	matBA *= matrixB;
 #else
-	mat_load(reinterpret_cast<const matrix_t*>(&matBA));
-	mat_apply(reinterpret_cast<const matrix_t*>(&matrixB));
+	dc::mat_load_apply(matBA, matrixB);
 #endif
 	for(i = 0; i < modelB.numSpheres; i++){
 		s.radius = modelB.spheres[i].radius;
@@ -2309,7 +2307,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 	}

 #ifdef DC_SH4
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrixB)));
+	dc::mat_load2(matrixB);
 #endif
 	for(i = 0; i < numCollisions; i++){
 #ifndef DC_SH4
--- a/src/liberty/core/common.h
+++ b/src/liberty/core/common.h
@@ -82,8 +82,6 @@

 #define rwVENDORID_ROCKSTAR 0x0253F2

-__always_inline auto Max(auto a, auto b) { return ((a > b)? a : b); }
-__always_inline auto Min(auto a, auto b) { return ((a < b)? a : b); }

 // Use this to add const that wasn't there in the original code
 #define Const const
@@ -299,15 +297,6 @@ extern int strcasecmp(const char *str1, const char *str2);

 extern wchar *AllocUnicode(const char*src);

-template<typename T>
-__always_inline T Clamp(T v, auto low, auto high) {
-	return std::clamp(v, static_cast<T>(low), static_cast<T>(high));
-}
-
-__always_inline auto Clamp2(auto v, auto center, auto radius) {
-	return (v > center) ? Min(v, center + radius) : Max(v, center - radius);
-}
-
 #define SQR(x) ((x) * (x))
 __always_inline auto sq(auto x) { return SQR(x); }

@@ -418,15 +407,7 @@ template<int s, int t> struct check_size {
 #endif
 #define BIT(num)                         (1<<(num))

-#define ABS(a) std::abs(a)
+#define ABS(a) Abs(a)

-__always_inline auto norm(auto value, auto min, auto max) {
-	return (Clamp(value, min, max) - min) / (max - min);
-}
 // we use std::lerp now
 //#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )
-
-#define STRINGIFY(x)                    #x
-#define STR(x)                          STRINGIFY(x)
-#define CONCAT_(x,y) x##y
-#define CONCAT(x,y) CONCAT_(x,y)
--- a/src/liberty/fakerw/fake.cpp
+++ b/src/liberty/fakerw/fake.cpp
@@ -449,8 +449,8 @@ RwBool RwIm3DRenderPrimitive(RwPrimitiveType primType);

 RwBool RwRenderStateGet(RwRenderState state, void *value)
 {
-	uint32 *uival = (uint32*)value;
-	uint32 fog;
+	uint32_t *uival = (uint32_t*)value;
+	uint32_t fog;
 	switch(state){
 	case rwRENDERSTATETEXTURERASTER: *(void**)value = GetRenderStatePtr(TEXTURERASTER); return true;
 	case rwRENDERSTATETEXTUREADDRESS: *uival = GetRenderState(TEXTUREADDRESS); return true;
@@ -496,8 +496,8 @@ RwBool RwRenderStateGet(RwRenderState state, void *value)
 }
 RwBool RwRenderStateSet(RwRenderState state, void *value)
 {
-	uint32 uival = (uintptr)value;
-	uint32 fog;
+	uint32_t uival = (uintptr_t)value;
+	uint32_t fog;
 	switch(state){
 	case rwRENDERSTATETEXTURERASTER: SetRenderStatePtr(TEXTURERASTER, value); return true;
 	case rwRENDERSTATETEXTUREADDRESS: SetRenderState(TEXTUREADDRESS, uival); return true;
@@ -544,8 +544,8 @@ RwBool RwRenderStateSet(RwRenderState state, void *value)
 static rw::MemoryFunctions gMemfuncs;
 static void *(*real_malloc)(size_t size);
 static void *(*real_realloc)(void *mem, size_t newSize);
-static void *mallocWrap(size_t sz, uint32 hint) { if(sz == 0) return nil; return real_malloc(sz); }
-static void *reallocWrap(void *p, size_t sz, uint32 hint) { return real_realloc(p, sz); }
+static void *mallocWrap(size_t sz, uint32_t hint) { if(sz == 0) return nil; return real_malloc(sz); }
+static void *reallocWrap(void *p, size_t sz, uint32_t hint) { return real_realloc(p, sz); }


 // WARNING: unused parameters
@@ -830,7 +830,7 @@ RwCamera *RwCameraForAllClumpsNotInFrustum(RwCamera *camera, RwInt32 numClumps,
 RwBool RpMatFXPluginAttach( void ) { registerMatFXPlugin(); return true; }
 RpAtomic *RpMatFXAtomicEnableEffects( RpAtomic *atomic ) { MatFX::enableEffects(atomic); return atomic; }
 RpMatFXMaterialFlags RpMatFXMaterialGetEffects( const RpMaterial *material ){ return (RpMatFXMaterialFlags)MatFX::getEffects(material); }
-RpMaterial *RpMatFXMaterialSetEffects( RpMaterial *material, RpMatFXMaterialFlags flags ) { MatFX::setEffects(material, (uint32)flags); return material; }
+RpMaterial *RpMatFXMaterialSetEffects( RpMaterial *material, RpMatFXMaterialFlags flags ) { MatFX::setEffects(material, (uint32_t)flags); return material; }
 RpMaterial *RpMatFXMaterialSetupEnvMap( RpMaterial *material, RwTexture *texture, RwFrame *frame, RwBool useFrameBufferAlpha, RwReal coef ) {
 	MatFX *mfx = MatFX::get(material);
 	mfx->setEnvTexture(texture);
--- a/src/liberty/math/Matrix.cpp
+++ b/src/liberty/math/Matrix.cpp
@@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m)
 {
 	m_attachment = nil;
 	m_hasRwMatrix = false;
-	*this = m;
+	mat_copy(*this, m);
 }

 CMatrix::CMatrix(RwMatrix *matrix, bool owner)
@@ -75,7 +75,7 @@ CMatrix::UpdateRW(void)
 void
 CMatrix::operator=(CMatrix const &rhs)
 {
-	memcpy(this, &rhs, sizeof(f));
+	mat_copy(*this, rhs);
 	if (m_attachment)
 		UpdateRW();
 }
@@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs)
 void
 CMatrix::CopyOnlyMatrix(const CMatrix &other)
 {
-	memcpy(this, &other, sizeof(f));
+	mat_copy(*this, other);
 }

 CMatrix &
@@ -277,9 +277,9 @@ void
 CMatrix::RotateX(float x)
 {
 #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-	mat_load(reinterpret_cast<matrix_t *>(this));
-	mat_rotate_x(x);
-	mat_store(reinterpret_cast<matrix_t *>(this));
+       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
+       mat_rotate_x(x);
+       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
 #else
 	auto [s, c] = SinCos(x);

@@ -307,9 +307,9 @@ void
 CMatrix::RotateY(float y)
 {
 #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-	mat_load(reinterpret_cast<matrix_t *>(this));
-	mat_rotate_y(y);
-	mat_store(reinterpret_cast<matrix_t *>(this));
+       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
+       mat_rotate_y(y);
+       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
 #else
 	auto [s, c] = SinCos(y);

@@ -337,9 +337,9 @@ void
 CMatrix::RotateZ(float z)
 {
 #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-	mat_load(reinterpret_cast<matrix_t *>(this));
-	mat_rotate_z(z);
-	mat_store(reinterpret_cast<matrix_t *>(this));
+       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
+       mat_rotate_z(z);
+       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
 #else	
 	auto [s, c] = SinCos(z);

@@ -367,9 +367,9 @@ void
 CMatrix::Rotate(float x, float y, float z)
 {
 #if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-	mat_load(reinterpret_cast<matrix_t *>(this));
-	mat_rotate(x, y, z);
-	mat_store(reinterpret_cast<matrix_t *>(this));
+       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
+       mat_rotate(x, y, z);
+       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
 #else
 	auto [sX, cX] = SinCos(x);
 	auto [sY, cY] = SinCos(y);
@@ -449,65 +449,13 @@ CMatrix::Reorthogonalise(void)
 	f = CrossProduct(u, r);
 }

-#ifdef DC_SH4
-static __always_inline void MATH_Load_Matrix_Product(const matrix_t* matrix1, const matrix_t* matrix2)
-{
-    unsigned int prefetch_scratch;
-
-    asm volatile (
-        "mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
-        "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
-        "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
-        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
-        // back matrix
-        "fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
-        "fmov.d @%[bmtrx]+, XD2\n\t"
-        "fmov.d @%[bmtrx]+, XD4\n\t"
-        "fmov.d @%[bmtrx]+, XD6\n\t"
-        "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
-        "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
-        "fmov.d @%[bmtrx]+, XD10\n\t"
-        "fmov.d @%[bmtrx]+, XD12\n\t"
-        "mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
-        "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
-        "fmov.d @%[bmtrx], XD14\n\t"
-        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
-        // front matrix
-        // interleave loads and matrix multiply 4x4
-        "fmov.d @%[fmtrx]+, DR0\n\t"
-        "fmov.d @%[fmtrx]+, DR2\n\t"
-        "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
-        "ftrv XMTRX, FV0\n\t" // (FE)
-
-        "fmov.d @%[fmtrx]+, DR6\n\t"
-        "fmov.d @%[fmtrx]+, DR8\n\t"
-        "ftrv XMTRX, FV4\n\t"
-
-        "fmov.d @%[fmtrx]+, DR10\n\t"
-        "fmov.d @%[fmtrx]+, DR12\n\t"
-        "ftrv XMTRX, FV8\n\t"
-
-        "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
-        "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
-        "ftrv XMTRX, FV12\n\t" // (FE)
-        // Save output in XF regs
-        "frchg\n"
-        : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
-        : // no inputs
-        : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
-    );
-}
-#endif
-
 CMatrix
 operator*(const CMatrix &m1, const CMatrix &m2)
 {
 	// TODO: VU0 code
 	CMatrix out;
-#if defined(RW_DC)
-	mat_load(reinterpret_cast<const matrix_t *>(&m1));
-	mat_apply(reinterpret_cast<const matrix_t *>(&m2));
-	mat_store(reinterpret_cast<matrix_t *>(&out));
+#ifdef DC_SH4
+	mat_mult(out, m1, m2);
 #else
 	out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
 	out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;
--- a/src/liberty/math/Matrix.h
+++ b/src/liberty/math/Matrix.h
@@ -1,5 +1,7 @@
 #pragma once

+#include "rwdc_common.h"
+
 class alignas(8) CMatrix
 {
 public:
@@ -27,6 +29,8 @@ public:
 		SetScale(scale);
 	}
 	~CMatrix(void);
+	operator matrix_t *() { return reinterpret_cast<matrix_t *>(this); }
+	operator const matrix_t *() const { return reinterpret_cast<const matrix_t *>(this); }
 	void Attach(RwMatrix *matrix, bool owner = false);
 	void AttachRW(RwMatrix *matrix, bool owner = false);
 	void Detach(void);
@@ -102,15 +106,24 @@ CMatrix Invert(const CMatrix &matrix);
 CMatrix operator*(const CMatrix &m1, const CMatrix &m2);
 inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec)
 {
+#ifndef DC_SH4
 	CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz);
 	return CVector(
 		mat.rx * v.x + mat.ry * v.y + mat.rz * v.z,
 		mat.fx * v.x + mat.fy * v.y + mat.fz * v.z,
 		mat.ux * v.x + mat.uy * v.y + mat.uz * v.z);
+#else
+    register float x asm(KOS_FPARG(0)) = vec.x - mat.px;
+    register float y asm(KOS_FPARG(1)) = vec.y - mat.py;
+    register float z asm(KOS_FPARG(2)) = vec.z - mat.pz;
+	return CVector(
+		fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f),
+		fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f),
+		fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f)
+	);
+#endif
 }

-
-
 class CCompressedMatrixNotAligned
 {
 	CVector m_vecPos;
--- a/src/liberty/math/Quaternion.cpp
+++ b/src/liberty/math/Quaternion.cpp
@@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl
 void
 CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2)
 {
+#ifndef DC_SH4
 	x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
 	y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
 	z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
 	w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+#else
+	quat_mult(*this, q1, q2);
+#endif
 }

 void
@@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle)
 	*angle = Acos(w);
 	float s = Sin(*angle);

+#ifndef DC_SH4
 	axis->x = x * (1.0f / s);
 	axis->y = y * (1.0f / s);
 	axis->z = z * (1.0f / s);
+#else
+	float invS = dc::Invert<true, false>(s);
+	axis->x = x * invS;
+	axis->y = y * invS;
+	axis->z = z * invS;
+#endif
 }

 void
@@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		w = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		x = (matrix.up.z - matrix.at.y) * m;
 		y = (matrix.at.x - matrix.right.z) * m;
 		z = (matrix.right.y - matrix.up.x) * m;
@@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		x = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		y = (matrix.up.x + matrix.right.y) * m;
 		z = (matrix.at.x + matrix.right.z) * m;
 		w = (matrix.up.z - matrix.at.y) * m;
@@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		y = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		w = (matrix.at.x - matrix.right.z) * m;
 		x = (matrix.up.x - matrix.right.y) * m;
 		z = (matrix.at.y + matrix.up.z) * m;
--- a/src/liberty/math/Quaternion.h
+++ b/src/liberty/math/Quaternion.h
@@ -1,5 +1,8 @@
 #pragma once

+#include "src/common_defines.h"
+#include "rwdc_common.h"
+
 // TODO: actually implement this
 class CQuaternion
 {
@@ -8,13 +11,23 @@ public:
 	CQuaternion(void) {}
 	CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}

-	float Magnitude(void) const { return Sqrt(MagnitudeSqr()); }
+	operator quaternion_t *() { return reinterpret_cast<quaternion_t *>(this); }
+	operator const quaternion_t *() const { return reinterpret_cast<const quaternion_t *>(this); }
+	operator quaternion_t &() { return *reinterpret_cast<quaternion_t *>(this); }
+	operator const quaternion_t &() const { return *reinterpret_cast<const quaternion_t *>(this); }
+	float Magnitude(void) const {
+#ifndef DC_SH4
+		return Sqrt(x*x + y*y + z*z + w*w);
+#else
+		return Sqrt(fipr_magnitude_sqr(x, y, z, w));
+#endif
+	}
 	float MagnitudeSqr(void) const {
-	#ifdef DC_SH4
-		return fipr_magnitude_sqr(x, y, z, w);
-	#else
+#ifndef DC_SH4
 		return x*x + y*y + z*z + w*w;
-	#endif
+#else
+		return fipr_magnitude_sqr(x, y, z, w);
+#endif
 	}
 	void Normalise(void);
 	void Multiply(const CQuaternion &q1, const CQuaternion &q2);
@@ -49,10 +62,11 @@ public:
 	}

 	const CQuaternion &operator/=(float right) {
-		x /= right;
-		y /= right;
-		z /= right;
-		w /= right;
+		right = dc::Invert<false>(right);
+		x *= right;
+		y *= right;
+		z *= right;
+		w *= right;
 		return *this;
 	}

@@ -72,7 +86,11 @@ public:
 inline float
 DotProduct(const CQuaternion &q1, const CQuaternion &q2)
 {
+#ifndef DC_SH4
 	return q1.x*q2.x + q1.y*q2.y + q1.z*q2.z + q1.w*q2.w;
+#else
+	return fipr(q1.x, q1.y, q1.z, q1.w, q2.x, q2.y, q2.z, q2.w);
+#endif
 }

 inline CQuaternion operator+(const CQuaternion &left, const CQuaternion &right)
@@ -97,5 +115,6 @@ inline CQuaternion operator*(float left, const CQuaternion &right)

 inline CQuaternion operator/(const CQuaternion &left, float right)
 {
-	return CQuaternion(left.x / right, left.y / right, left.z / right, left.w / right);
+	right = Invert<false>(right);
+	return CQuaternion(left.x * right, left.y * right, left.z * right, left.w * right);
 }
--- a/src/liberty/math/Vector.cpp
+++ b/src/liberty/math/Vector.cpp
@@ -27,32 +27,35 @@ CrossProduct(const CVector &v1, const CVector &v2)
 CVector
 Multiply3x3(const CMatrix &mat, const CVector &vec)
 {
-#ifdef DC_SH4
-    register float __x __asm__("fr12") = vec.x;
-    register float __y __asm__("fr13") = vec.y;
-    register float __z __asm__("fr14") = vec.z;
-    register float __w __asm__("fr15") = 0.0f;
-	
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
-
-	asm volatile( "ftrv  xmtrx, fv12\n"
-                : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w)
-                : "0" (__x), "1" (__y), "2" (__z), "3" (__w) );
-    return { __x, __y, __z };
-#else
+#ifndef DC_SH4
 	// TODO: VU0 code
 	return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z,
 	               mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z,
 	               mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z);
+#else
+	CVector out;
+	dc::mat_load2(mat);
+	mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
+	                        out.x, out.y, out.z);
+    return out;
 #endif
 }

 CVector
 Multiply3x3(const CVector &vec, const CMatrix &mat)
 {
+#ifndef DC_SH4
 	return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z,
 	               mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z,
 	               mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
+#else
+	CVector out;
+	dc::mat_load2(mat);
+	mat_transpose();
+	mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
+	                        out.x, out.y, out.z);
+	return out;
+#endif
 }

 CVector
@@ -60,7 +63,7 @@ operator*(const CMatrix &mat, const CVector &vec)
 {
 #ifdef DC_SH4
 	CVector out;
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	dc::mat_load2(mat);
 	mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z);
 	return out;
 #else
--- a/src/liberty/math/Vector.h
+++ b/src/liberty/math/Vector.h
@@ -19,22 +19,22 @@ public:
  }
  // (0,1,0) means no rotation. So get right vector and its atan
  __always_inline float Heading(void) const { return Atan2(-x, y); }
-  __always_inline float Magnitude(void) const { 
+  __always_inline float Magnitude(void) const {
 #ifdef DC_SH4
    float w;
    vec3f_length(x, y, z, w);
    return w;
 #else
-    return Sqrt(x*x + y*y + z*z); 
+    return Sqrt(x*x + y*y + z*z);
 #endif
 }
-  __always_inline float MagnitudeSqr(void) const { 
+  __always_inline float MagnitudeSqr(void) const {
 #ifdef DC_SH4
-    return fipr_magnitude_sqr(x, y,z, 0.0f); 
+    return fipr_magnitude_sqr(x, y,z, 0.0f);
 #else
    return x*x + y*y + z*z;
 #endif
-} 
+}
  __always_inline float Magnitude2D(void) const { return Sqrt(x*x + y*y); }
  float MagnitudeSqr2D(void) const { return x*x + y*y; }
  void Normalise(void);
@@ -68,7 +68,7 @@ public:
  }

  const CVector &operator/=(float right) {
-    right = Invert(right);
+    right = Invert<false>(right);
    x *= right;
    y *= right;
    z *= right;
@@ -112,7 +112,8 @@ inline CVector operator*(float left, const CVector &right)

 inline CVector operator/(const CVector &left, float right)
 {
-  return CVector(left.x / right, left.y / right, left.z / right);
+  right = Invert<false>(right);
+  return CVector(left.x * right, left.y * right, left.z * right);
 }

 __always_inline float
--- a/src/liberty/math/VuVector.h
+++ b/src/liberty/math/VuVector.h
@@ -44,7 +44,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV
 		sqc2    vf06,0x0(%0)\n\
 		": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
 #elif defined(DC_SH4)
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	dc::mat_load2(mat);
 	mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
 #else
 	out = mat * in;
@@ -71,7 +71,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
 		sqc2    vf06,0x0(%0)\n\
 		": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
 #elif defined(DC_SH4)
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	dc::mat_load2(mat);
 	mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
 #else
 	out = mat * in;
@@ -108,7 +108,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
 		bnez	%1,1b\n\
 		": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
 #elif defined(DC_SH4)
-    mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+    dc::mat_load2(mat);
 	while(n--) {
 		mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
 		in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);
--- a/src/liberty/math/maths.h
+++ b/src/liberty/math/maths.h
@@ -1,73 +1,6 @@
 #pragma once

 #include "src/common_defines.h"
+#include "rwdc_common.h"

-#include <tuple>
-#include <dc/matrix.h>
-
-#ifdef DC_SH4
-
-#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 1.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
-    } while(false)
-
-#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 1.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        w = __w; \
-    } while(false)
-
-__always_inline float Fmac(float a, float b, float c) {
-    asm volatile ("fmac fr0, %[floatb], %[floatc]\n"
-        : [floatc] "+f" (c) : "w" (a), [floatb] "f" (b) : );
-    return c;
-}
-
-#else
-
-#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
-		vector_t tmp = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp, &tmp, 1, 0); \
-		x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \
-	} while(false)
-
-#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
-		vector_t tmp = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp, &tmp, 1, 0); \
-		w_ = tmp.w; \
-	} while(false)
-
-__always_inline float Fmac(float a, float b, float c) { return a * b + c; }
-
-#endif
-
-__always_inline float Sin(float x) { return __builtin_sinf(x); }
-__always_inline float Cos(float x) { return __builtin_cosf(x); }
-__always_inline auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
-__always_inline float Tan(float x) { return __builtin_tanf(x); }
-__always_inline float Abs(float x) { return __builtin_fabsf(x); }
-__always_inline float Sqrt(float x) { return __builtin_sqrtf(x); }
-__always_inline float RecipSqrt(float x) { return 1.0f / __builtin_sqrtf(x); }
-__always_inline float Asin(float x) { return __builtin_asinf(x); }
-__always_inline float Acos(float x) { return __builtin_acosf(x); }
-__always_inline float Atan(float x) { return __builtin_atanf(x); }
-__always_inline float Atan2(float y, float x) { return __builtin_atan2f(y, x); }
-__always_inline float RecipSqrt(float x, float y) { return x / __builtin_sqrtf(y); /*y = RecipSqrt(y); return x * y * y;*/ }
-__always_inline float Pow(float x, float y) { return __builtin_powf(x, y); }
-__always_inline float Floor(float x) { return __builtin_floorf(x); }
-__always_inline float Ceil(float x) { return __builtin_ceilf(x); }
-__always_inline float Invert(float x) { return (((x) < 0.0f)? -1.0f : 1.0f) * RecipSqrt((x) * (x)); } 
-__always_inline float Div(float x, float y) { return x * Invert(y); }
-__always_inline float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); }
+using namespace dc;
--- a/src/liberty/skel/crossplatform.h
+++ b/src/liberty/skel/crossplatform.h
@@ -13,7 +13,9 @@ enum eWinVersion
    OS_WINXP,
 };

-#ifdef _WIN32
+#ifdef _WIN64
+#include <windows.h>
+#elif defined(_WIN32)

 // As long as WITHWINDOWS isn't defined / <Windows.h> isn't included, we only need type definitions so let's include <IntSafe.h>.
 // NOTE: It's perfectly fine to include <Windows.h> here, but it can increase build size and time in *some* conditions, and maybe substantially in future if we'll use crossplatform.h more.
--- a/src/miami/collision/Collision.cpp
+++ b/src/miami/collision/Collision.cpp
@@ -1473,7 +1473,7 @@ CCollision::ProcessLineOfSight(const CColLine &line,
 		point.point = matrix * point.point;
 		point.normal = Multiply3x3(matrix, point.normal);
 #else
-		mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix)));
+		dc::mat_load2(matrix);
 		mat_trans_single3_nodiv(point.point.x,
 		                        point.point.y,
 		                        point.point.z);
@@ -1653,7 +1653,7 @@ CCollision::ProcessVerticalLine(const CColLine &line,
 		point.point = matrix * point.point;
 		point.normal = Multiply3x3(matrix, point.normal);
 #else
-		mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix)));
+		dc::mat_load2(matrix);
 		mat_trans_single3_nodiv(point.point.x,
 		                        point.point.y,
 		                        point.point.z);
@@ -2027,8 +2027,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 #ifndef DC_SH4
 	matAB *= matrixA;
 #else
-	mat_load(reinterpret_cast<const matrix_t*>(&matAB));
-	mat_apply(reinterpret_cast<const matrix_t*>(&matrixA));
+	dc::mat_load_apply(matAB, matrixA);
 #endif

 	CColSphere bsphereAB;	// bounding sphere of A in B space
@@ -2099,8 +2098,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 #ifndef DC_SH4
 	matBA *= matrixB;
 #else
-	mat_load(reinterpret_cast<const matrix_t*>(&matBA));
-	mat_apply(reinterpret_cast<const matrix_t*>(&matrixB));
+	dc::mat_load_apply(matBA, matrixB);
 #endif
 	for(i = 0; i < modelB.numSpheres; i++){
 		s.radius = modelB.spheres[i].radius;
@@ -2162,7 +2160,7 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 	}

 #ifdef DC_SH4
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrixB)));
+	dc::mat_load2(matrixB);
 #endif
 	for(i = 0; i < numCollisions; i++){
 #ifndef DC_SH4
--- a/src/miami/core/ControllerConfig.cpp
+++ b/src/miami/core/ControllerConfig.cpp
@@ -2042,7 +2042,7 @@ wchar *CControllerConfigManager::GetControllerSettingTextWithOrderNumber(e_Contr
 		// CONTROL_CLASSIC : more like GTA: Liberty City Stories on PlayStation Portable

 		//static wchar ActionText[50];
-		static wchar ActionText[24];
+		static wchar ActionText[32];
 		for (int32 i = 0; i < ARRAY_SIZE(ActionText); i++)
 			ActionText[i] = '\0';

@@ -2066,14 +2066,17 @@ wchar *CControllerConfigManager::GetControllerSettingTextWithOrderNumber(e_Contr
        static const char* Dreamcast_VehicleLookLeft = "A + Joystick Left";
        static const char* Dreamcast_VehicleLookRight = "A + Joystick Right";
        static const char* Dreamcast_CenterCamera = "Double click X";
-		static const char* Dreamcast_CameraUp = "X + Joystick Up";
-        static const char* Dreamcast_CameraDown = "X + Joystick Down";
-        static const char* Dreamcast_CameraLeft = "X + Joystick Left";
-        static const char* Dreamcast_CameraRight = "X + Joystick Right";
-        static const char* Dreamcast_LookBehind = "Right trigger + Left trigger";
+		//static const char* Dreamcast_CameraUp = "X + Joystick Up"; // Not used, commented to saving memory
+        //static const char* Dreamcast_CameraDown = "X + Joystick Down"; // Not used, commented to saving memory
+        //static const char* Dreamcast_CameraLeft = "X + Joystick Left"; // Not used, commented to saving memory
+        //static const char* Dreamcast_CameraRight = "X + Joystick Right"; // Not used, commented to saving memory
+
+        //static const char* Dreamcast_ReplaceMeleeWeapon = "A + D-Pad Down";
+        //static const char* Dreamcast_BothTriggers = "Right Trigger + Left Trigger";
+        static const char* Dreamcast_BothTriggers = "Two Triggers / A + D-Pad Down"; // Not a perfect fix but avoid problem with PED_ANSWER_PHONE
        static const char* Dreamcast_NotAssigned = "Not Assigned";

-        //Dual sticks
+        // Dual Sticks
        static const char* LefAnalog_Up = "Left Stick Up";
 		static const char* LefAnalog_Down = "Left Stick Down";
 		static const char* LefAnalog_Left = "Left Stick Left";
@@ -2084,14 +2087,13 @@ wchar *CControllerConfigManager::GetControllerSettingTextWithOrderNumber(e_Contr
 		static const char* RightAnalog_Left = "Right Stick Left";
 		static const char* RightAnalog_Right = "Right Stick Right";

-        //Xbox Specific
+        // Xbox Specific
        static const char* Xbox_VehicleLookLeft = "LB";
        static const char* Xbox_VehicleLookRight = "RB";
        static const char* Xbox_VehicleLookBehind = "RB + LB";
        static const char* Xbox_Back = "Back";
        
-
-        //PS2 Specific
+        // PS2 Specific
        static const char* Dreamcast_PS2_VehicleLookLeft = "B + Joystick Left";
        static const char* Dreamcast_PS2_VehicleLookRight = "B + Joystick Right";
        static const char* PS2_L1 = "L1";
@@ -2125,6 +2127,9 @@ wchar *CControllerConfigManager::GetControllerSettingTextWithOrderNumber(e_Contr
 				case VEHICLE_ACCELERATE:
 					for (int i = 0; (ActionText[i] = Dreamcast_RightTrigger[i]) != '\0' && i < iLimitCopy; i++);
 					break;
+				case PED_ANSWER_PHONE: // Both Triggers to answer the Phone and switch the Melee Weapon on Dreamcast
+					for (int i = 0; (ActionText[i] = Dreamcast_BothTriggers[i]) != '\0' && i < iLimitCopy; i++);
+					break;
 				case VEHICLE_CHANGE_RADIO_STATION: // D-Pad Right to switch RADIO on Dreamcast
 					for (int i = 0; (ActionText[i] = Dreamcast_DPad_Right[i]) != '\0' && i < iLimitCopy; i++);
 					break;
@@ -2213,7 +2218,7 @@ wchar *CControllerConfigManager::GetControllerSettingTextWithOrderNumber(e_Contr
 					for (int i = 0; (ActionText[i] = Dreamcast_DPad_Up[i]) != '\0' && i < iLimitCopy; i++);
 					break;
 				case VEHICLE_LOOKBEHIND:
-					for (int i = 0; (ActionText[i] = Dreamcast_LookBehind[i]) != '\0' && i < iLimitCopy; i++);
+					for (int i = 0; (ActionText[i] = Dreamcast_BothTriggers[i]) != '\0' && i < iLimitCopy; i++);
 					break;
 				case NETWORK_TALK: // Not Used on Dreamcast
 					for (int i = 0; (ActionText[i] = Dreamcast_NotAssigned[i]) != '\0' && i < iLimitCopy; i++);
@@ -2263,6 +2268,9 @@ wchar *CControllerConfigManager::GetControllerSettingTextWithOrderNumber(e_Contr
 				case VEHICLE_ACCELERATE:
 					for (int i = 0; (ActionText[i] = Dreamcast_RightTrigger[i]) != '\0' && i < iLimitCopy; i++);
 					break;
+				case PED_ANSWER_PHONE: // Both Triggers to answer the Phone on Dreamcast
+					for (int i = 0; (ActionText[i] = Dreamcast_BothTriggers[i]) != '\0' && i < iLimitCopy; i++);
+					break;
 				case VEHICLE_CHANGE_RADIO_STATION: // D-Pad Right to switch RADIO on Dreamcast
 					for (int i = 0; (ActionText[i] = Dreamcast_DPad_Right[i]) != '\0' && i < iLimitCopy; i++);
 					break;
@@ -2401,7 +2409,9 @@ wchar *CControllerConfigManager::GetControllerSettingTextWithOrderNumber(e_Contr
 				case VEHICLE_ACCELERATE:
 					for (int i = 0; (ActionText[i] = Dreamcast_A[i]) != '\0' && i < iLimitCopy; i++);
 					break;
-
+				case PED_ANSWER_PHONE: // Both Triggers to answer the Phone on Dreamcast
+					for (int i = 0; (ActionText[i] = Dreamcast_BothTriggers[i]) != '\0' && i < iLimitCopy; i++);
+					break;
 				case VEHICLE_CHANGE_RADIO_STATION: // D-Pad Right to switch RADIO on Dreamcast
 					for (int i = 0; (ActionText[i] = Dreamcast_DPad_Right[i]) != '\0' && i < iLimitCopy; i++);
 					break;
@@ -2490,7 +2500,7 @@ wchar *CControllerConfigManager::GetControllerSettingTextWithOrderNumber(e_Contr
 					for (int i = 0; (ActionText[i] = Dreamcast_DPad_Up[i]) != '\0' && i < iLimitCopy; i++);
 					break;
 				case VEHICLE_LOOKBEHIND:
-					for (int i = 0; (ActionText[i] = Dreamcast_LookBehind[i]) != '\0' && i < iLimitCopy; i++);
+					for (int i = 0; (ActionText[i] = Dreamcast_BothTriggers[i]) != '\0' && i < iLimitCopy; i++);
 					break;
 				case NETWORK_TALK: // Not Used on Dreamcast
 					for (int i = 0; (ActionText[i] = Dreamcast_NotAssigned[i]) != '\0' && i < iLimitCopy; i++);
@@ -2541,6 +2551,9 @@ wchar *CControllerConfigManager::GetControllerSettingTextWithOrderNumber(e_Contr
 				case VEHICLE_ACCELERATE:
 					for (int i = 0; (ActionText[i] = PS2_Cross[i]) != '\0' && i < iLimitCopy; i++);
 					break;
+				case PED_ANSWER_PHONE: // Both Triggers to answer the Phone on Dreamcast
+					for (int i = 0; (ActionText[i] = Dreamcast_BothTriggers[i]) != '\0' && i < iLimitCopy; i++);
+					break;
 				case VEHICLE_CHANGE_RADIO_STATION:
 					for (int i = 0; (ActionText[i] = PS2_L1[i]) != '\0' && i < iLimitCopy; i++);
 					break;
@@ -2629,7 +2642,7 @@ wchar *CControllerConfigManager::GetControllerSettingTextWithOrderNumber(e_Contr
 					for (int i = 0; (ActionText[i] = Dreamcast_DPad_Up[i]) != '\0' && i < iLimitCopy; i++);
 					break;
 				case VEHICLE_LOOKBEHIND:
-					for (int i = 0; (ActionText[i] = Dreamcast_LookBehind[i]) != '\0' && i < iLimitCopy; i++);
+					for (int i = 0; (ActionText[i] = Dreamcast_BothTriggers[i]) != '\0' && i < iLimitCopy; i++);
 					break;
 				case NETWORK_TALK: // Not Used on Dreamcast
 					for (int i = 0; (ActionText[i] = Dreamcast_NotAssigned[i]) != '\0' && i < iLimitCopy; i++);
--- a/src/miami/core/common.h
+++ b/src/miami/core/common.h
@@ -85,8 +85,6 @@

 #define rwVENDORID_ROCKSTAR 0x0253F2

-#define Max(a,b) ((a) > (b) ? (a) : (b))
-#define Min(a,b) ((a) < (b) ? (a) : (b))

 // Use this to add const that wasn't there in the original code
 #define Const const
@@ -303,12 +301,8 @@ extern int strncasecmp(const char *str1, const char *str2, size_t len);

 extern wchar *AllocUnicode(const char*src);

-#define Clamp(v, low, high) ((v)<(low) ? (low) : (v)>(high) ? (high) : (v))
-
-#define Clamp2(v, center, radius) ((v) > (center) ? Min(v, center + radius) : Max(v, center - radius))
-
-inline float sq(float x) { return x*x; }
 #define SQR(x) ((x) * (x))
+__always_inline auto sq(auto x) { return SQR(x); }

 #ifdef __MWERKS__
 #define M_E        2.71828182845904523536   // e
@@ -326,7 +320,10 @@ inline float sq(float x) { return x*x; }
 #define M_SQRT1_2  0.707106781186547524401  // 1/sqrt(2)
 #endif

-#define PI (float)M_PI
+#ifndef DC_SH4
+#define F_PI M_PI
+#endif
+#define PI (float)F_PI
 #define TWOPI (PI*2)
 #define HALFPI (PI/2)
 #define DEGTORAD(x) ((x) * PI / 180.0f)
@@ -380,7 +377,10 @@ __inline__ void TRACE(char *f, ...) { } // this is re3 only, and so the function
 #endif
 #endif

-#ifndef MASTER
+#ifdef assert
+#undef assert
+#endif
+#if !defined(MASTER)
 #define assert(_Expression) (void)( (!!(_Expression)) || (re3_assert(#_Expression, __FILE__, __LINE__, __FUNCTION__), 0) )
 #else
 #define assert(_Expression) (_Expression)
@@ -411,12 +411,7 @@ template<int s, int t> struct check_size {
 #endif
 #define BIT(num)                         (1<<(num))

-#define ABS(a)  (((a) < 0) ? (-(a)) : (a))
-#define norm(value, min, max) (((value) < (min)) ? 0 : (((value) > (max)) ? 1 : (((value) - (min)) / ((max) - (min)))))
-#define Lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )
-
-#define STRINGIFY(x)                    #x
-#define STR(x)                          STRINGIFY(x)
-#define CONCAT_(x,y) x##y
-#define CONCAT(x,y) CONCAT_(x,y)
+#define ABS(a) Abs(a)

+// we use std::lerp now
+//#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )
--- a/src/miami/fakerw/fake.cpp
+++ b/src/miami/fakerw/fake.cpp
@@ -450,8 +450,8 @@ RwBool RwIm3DRenderPrimitive(RwPrimitiveType primType);

 RwBool RwRenderStateGet(RwRenderState state, void *value)
 {
-	uint32 *uival = (uint32*)value;
-	uint32 fog;
+	uint32_t *uival = (uint32_t*)value;
+	uint32_t fog;
 	switch(state){
 	case rwRENDERSTATETEXTURERASTER: *(void**)value = GetRenderStatePtr(TEXTURERASTER); return true;
 	case rwRENDERSTATETEXTUREADDRESS: *uival = GetRenderState(TEXTUREADDRESS); return true;
@@ -497,8 +497,8 @@ RwBool RwRenderStateGet(RwRenderState state, void *value)
 }
 RwBool RwRenderStateSet(RwRenderState state, void *value)
 {
-	uint32 uival = (uintptr)value;
-	uint32 fog;
+	uint32_t uival = (uintptr_t)value;
+	uint32_t fog;
 	switch(state){
 	case rwRENDERSTATETEXTURERASTER: SetRenderStatePtr(TEXTURERASTER, value); return true;
 	case rwRENDERSTATETEXTUREADDRESS: SetRenderState(TEXTUREADDRESS, uival); return true;
@@ -545,8 +545,8 @@ RwBool RwRenderStateSet(RwRenderState state, void *value)
 static rw::MemoryFunctions gMemfuncs;
 static void *(*real_malloc)(size_t size);
 static void *(*real_realloc)(void *mem, size_t newSize);
-static void *mallocWrap(size_t sz, uint32 hint) { if(sz == 0) return nil; return real_malloc(sz); }
-static void *reallocWrap(void *p, size_t sz, uint32 hint) { return real_realloc(p, sz); }
+static void *mallocWrap(size_t sz, uint32_t hint) { if(sz == 0) return nil; return real_malloc(sz); }
+static void *reallocWrap(void *p, size_t sz, uint32_t hint) { return real_realloc(p, sz); }


 // WARNING: unused parameters
@@ -832,7 +832,7 @@ RwCamera *RwCameraForAllClumpsNotInFrustum(RwCamera *camera, RwInt32 numClumps,
 RwBool RpMatFXPluginAttach( void ) { registerMatFXPlugin(); return true; }
 RpAtomic *RpMatFXAtomicEnableEffects( RpAtomic *atomic ) { MatFX::enableEffects(atomic); return atomic; }
 RpMatFXMaterialFlags RpMatFXMaterialGetEffects( const RpMaterial *material ){ return (RpMatFXMaterialFlags)MatFX::getEffects(material); }
-RpMaterial *RpMatFXMaterialSetEffects( RpMaterial *material, RpMatFXMaterialFlags flags ) { MatFX::setEffects(material, (uint32)flags); return material; }
+RpMaterial *RpMatFXMaterialSetEffects( RpMaterial *material, RpMatFXMaterialFlags flags ) { MatFX::setEffects(material, (uint32_t)flags); return material; }
 RpMaterial *RpMatFXMaterialSetupEnvMap( RpMaterial *material, RwTexture *texture, RwFrame *frame, RwBool useFrameBufferAlpha, RwReal coef ) {
 	MatFX *mfx = MatFX::get(material);
 	mfx->setEnvTexture(texture);
--- a/src/miami/math/Matrix.cpp
+++ b/src/miami/math/Matrix.cpp
@@ -4,7 +4,7 @@ CMatrix::CMatrix(CMatrix const &m)
 {
 	m_attachment = nil;
 	m_hasRwMatrix = false;
-	*this = m;
+	mat_copy(*this, m);
 }

 CMatrix::CMatrix(RwMatrix *matrix, bool owner)
@@ -75,7 +75,7 @@ CMatrix::UpdateRW(void)
 void
 CMatrix::operator=(CMatrix const &rhs)
 {
-	memcpy(this, &rhs, sizeof(f));
+	mat_copy(*this, rhs);
 	if (m_attachment)
 		UpdateRW();
 }
@@ -83,7 +83,7 @@ CMatrix::operator=(CMatrix const &rhs)
 void
 CMatrix::CopyOnlyMatrix(const CMatrix &other)
 {
-	memcpy(this, &other, sizeof(f));
+	mat_copy(*this, other);
 }

 CMatrix &
@@ -358,12 +358,14 @@ CMatrix::RotateZ(float z)
 void
 CMatrix::Rotate(float x, float y, float z)
 {
-	float cX = Cos(x);
-	float sX = Sin(x);
-	float cY = Cos(y);
-	float sY = Sin(y);
-	float cZ = Cos(z);
-	float sZ = Sin(z);
+#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
+       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
+       mat_rotate(x, y, z);
+       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
+#else
+	auto [sX, cX] = SinCos(x);
+	auto [sY, cY] = SinCos(y);
+	auto [sZ, cZ] = SinCos(z);
 	
 	float rx = this->rx;
 	float ry = this->ry;
@@ -388,6 +390,20 @@ CMatrix::Rotate(float x, float y, float z)
 	float z2 = sZ * sY - (cZ * sX) * cY;
 	float z3 = cX * cY;

+	#if !defined(DC_TEXCONV) && !defined(DC_SIM)
+	this->rx = fipr(x1, y1, z1, 0,  rx, ry, rz, 0);
+	this->ry = fipr(x2, y2, z2, 0,  rx, ry, rz, 0);
+	this->rz = fipr(x3, y3, z3, 0,  rx, ry, rz, 0);
+	this->fx = fipr(x1, y1, z1, 0,  ux, uy, uz, 0);
+	this->fy = fipr(x2, y2, z2, 0,  ux, uy, uz, 0);
+	this->fz = fipr(x3, y3, z3, 0,  ux, uy, uz, 0);
+	this->ux = fipr(x1, y1, z1, 0,  ax, ay, az, 0);
+	this->uy = fipr(x2, y2, z2, 0,  ax, ay, az, 0);
+	this->uz = fipr(x3, y3, z3, 0,  ax, ay, az, 0);
+	this->px = fipr(x1, y1, z1, 0,  px, py, pz, 0);
+	this->py = fipr(x2, y2, z2, 0,  px, py, pz, 0);
+	this->pz = fipr(x3, y3, z3, 0,  px, py, pz, 0);
+	#else
 	this->rx = x1 * rx + y1 * ry + z1 * rz;
 	this->ry = x2 * rx + y2 * ry + z2 * rz;
 	this->rz = x3 * rx + y3 * ry + z3 * rz;
@@ -400,6 +416,8 @@ CMatrix::Rotate(float x, float y, float z)
 	this->px = x1 * px + y1 * py + z1 * pz;
 	this->py = x2 * px + y2 * py + z2 * pz;
 	this->pz = x3 * px + y3 * py + z3 * pz;
+	#endif
+#endif
 }

 CMatrix &
@@ -429,9 +447,7 @@ operator*(const CMatrix &m1, const CMatrix &m2)
 	// TODO: VU0 code
 	CMatrix out;
 #if defined(RW_DC)
-	mat_load(reinterpret_cast<const matrix_t *>(&m1));
-	mat_apply(reinterpret_cast<const matrix_t *>(&m2));
-	mat_store(reinterpret_cast<matrix_t *>(&out));
+	mat_mult(out, m1, m2);
 #else
 	out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
 	out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;
--- a/src/miami/math/Matrix.h
+++ b/src/miami/math/Matrix.h
@@ -43,6 +43,8 @@ public:
 		SetScale(scale);
 	}
 	~CMatrix(void);
+	operator matrix_t *() { return reinterpret_cast<matrix_t *>(this); }
+	operator const matrix_t *() const { return reinterpret_cast<const matrix_t *>(this); }
 	void Attach(RwMatrix *matrix, bool owner = false);
 	void AttachRW(RwMatrix *matrix, bool owner = false);
 	void Detach(void);
@@ -82,13 +84,12 @@ public:
 	void Scale(float sx, float sy, float sz)
 	{
 		for (int i = 0; i < 3; i++){
-			f[i][0] *= sx;
-			f[i][1] *= sy;
-			f[i][2] *= sz;
+				f[i][0] *= sx;
+				f[i][1] *= sy;
+				f[i][2] *= sz;
 		}
 	}

-
 	void SetRotateXOnly(float angle);
 	void SetRotateYOnly(float angle);
 	void SetRotateZOnly(float angle);
@@ -125,11 +126,22 @@ CMatrix Invert(const CMatrix &matrix);
 CMatrix operator*(const CMatrix &m1, const CMatrix &m2);
 inline CVector MultiplyInverse(const CMatrix &mat, const CVector &vec)
 {
+#ifndef DC_SH4
 	CVector v(vec.x - mat.px, vec.y - mat.py, vec.z - mat.pz);
 	return CVector(
 		mat.rx * v.x + mat.ry * v.y + mat.rz * v.z,
 		mat.fx * v.x + mat.fy * v.y + mat.fz * v.z,
 		mat.ux * v.x + mat.uy * v.y + mat.uz * v.z);
+#else
+    register float x asm(KOS_FPARG(0)) = vec.x - mat.px;
+    register float y asm(KOS_FPARG(1)) = vec.y - mat.py;
+    register float z asm(KOS_FPARG(2)) = vec.z - mat.pz;
+	return CVector(
+		fipr(x, y, z, 0.0f, mat.rx, mat.ry, mat.rz, 0.0f),
+		fipr(x, y, z, 0.0f, mat.fx, mat.fy, mat.fz, 0.0f),
+		fipr(x, y, z, 0.0f, mat.ux, mat.uy, mat.uz, 0.0f)
+	);
+#endif
 }


--- a/src/miami/math/Quaternion.cpp
+++ b/src/miami/math/Quaternion.cpp
@@ -39,10 +39,14 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl
 void
 CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2)
 {
+#ifndef DC_SH4
 	x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
 	y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
 	z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
 	w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+#else
+	quat_mult(*this, q1, q2);
+#endif
 }

 void
@@ -51,9 +55,16 @@ CQuaternion::Get(RwV3d *axis, float *angle)
 	*angle = Acos(w);
 	float s = Sin(*angle);

+#ifndef DC_SH4
 	axis->x = x * (1.0f / s);
 	axis->y = y * (1.0f / s);
 	axis->z = z * (1.0f / s);
+#else
+	float invS = dc::Invert<true, false>(s);
+	axis->x = x * invS;
+	axis->y = y * invS;
+	axis->z = z * invS;
+#endif
 }

 void
@@ -104,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		w = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		x = (matrix.up.z - matrix.at.y) * m;
 		y = (matrix.at.x - matrix.right.z) * m;
 		z = (matrix.right.y - matrix.up.x) * m;
@@ -115,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		x = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		y = (matrix.up.x + matrix.right.y) * m;
 		z = (matrix.at.x + matrix.right.z) * m;
 		w = (matrix.up.z - matrix.at.y) * m;
@@ -126,7 +137,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		y = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		w = (matrix.at.x - matrix.right.z) * m;
 		x = (matrix.up.x - matrix.right.y) * m;
 		z = (matrix.at.y + matrix.up.z) * m;
--- a/src/miami/math/Quaternion.h
+++ b/src/miami/math/Quaternion.h
@@ -1,5 +1,8 @@
 #pragma once

+#include "src/common_defines.h"
+#include "rwdc_common.h"
+
 // TODO: actually implement this
 class CQuaternion
 {
@@ -8,8 +11,24 @@ public:
 	CQuaternion(void) {}
 	CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}

-	float Magnitude(void) const { return Sqrt(x*x + y*y + z*z + w*w); }
-	float MagnitudeSqr(void) const { return x*x + y*y + z*z + w*w; }
+	operator quaternion_t *() { return reinterpret_cast<quaternion_t *>(this); }
+	operator const quaternion_t *() const { return reinterpret_cast<const quaternion_t *>(this); }
+	operator quaternion_t &() { return *reinterpret_cast<quaternion_t *>(this); }
+	operator const quaternion_t &() const { return *reinterpret_cast<const quaternion_t *>(this); }
+	float Magnitude(void) const {
+#ifndef DC_SH4
+		return Sqrt(x*x + y*y + z*z + w*w);
+#else
+		return Sqrt(fipr_magnitude_sqr(x, y, z, w));
+#endif
+	}
+	float MagnitudeSqr(void) const {
+#ifndef DC_SH4
+		return x*x + y*y + z*z + w*w;
+#else
+		return fipr_magnitude_sqr(x, y, z, w);
+#endif
+	}
 	void Normalise(void);
 	void Multiply(const CQuaternion &q1, const CQuaternion &q2);
 	void Invert(void){	// Conjugate would have been a better name
@@ -43,10 +62,11 @@ public:
 	}

 	const CQuaternion &operator/=(float right) {
-		x /= right;
-		y /= right;
-		z /= right;
-		w /= right;
+		right = dc::Invert<false>(right);
+		x *= right;
+		y *= right;
+		z *= right;
+		w *= right;
 		return *this;
 	}

@@ -66,7 +86,11 @@ public:
 inline float
 DotProduct(const CQuaternion &q1, const CQuaternion &q2)
 {
+#ifndef DC_SH4
 	return q1.x*q2.x + q1.y*q2.y + q1.z*q2.z + q1.w*q2.w;
+#else
+	return fipr(q1.x, q1.y, q1.z, q1.w, q2.x, q2.y, q2.z, q2.w);
+#endif
 }

 inline CQuaternion operator+(const CQuaternion &left, const CQuaternion &right)
@@ -91,5 +115,6 @@ inline CQuaternion operator*(float left, const CQuaternion &right)

 inline CQuaternion operator/(const CQuaternion &left, float right)
 {
-	return CQuaternion(left.x / right, left.y / right, left.z / right, left.w / right);
+	right = Invert<false>(right);
+	return CQuaternion(left.x * right, left.y * right, left.z * right, left.w * right);
 }
--- a/src/miami/math/Vector.cpp
+++ b/src/miami/math/Vector.cpp
@@ -3,6 +3,10 @@
 void
 CVector::Normalise(void)
 {
+#ifdef DC_SH4_BROKEN
+	// TODO: This needs to handle zero vectors here
+	vec3f_normalize(x, y, z);
+#else
 	float sq = MagnitudeSqr();
 	if (sq > 0.0f) {
 		float invsqrt = RecipSqrt(sq);
@@ -11,6 +15,7 @@ CVector::Normalise(void)
 		z *= invsqrt;
 	} else
 		x = 1.0f;
+#endif
 }

 CVector
@@ -22,25 +27,49 @@ CrossProduct(const CVector &v1, const CVector &v2)
 CVector
 Multiply3x3(const CMatrix &mat, const CVector &vec)
 {
+#ifndef DC_SH4
 	// TODO: VU0 code
 	return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z,
 	               mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z,
 	               mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z);
+#else
+	CVector out;
+	dc::mat_load2(mat);
+	mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
+	                        out.x, out.y, out.z);
+    return out;
+#endif
 }

 CVector
 Multiply3x3(const CVector &vec, const CMatrix &mat)
 {
+#ifndef DC_SH4
 	return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z,
 	               mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z,
 	               mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
+#else
+	CVector out;
+	dc::mat_load2(mat);
+	mat_transpose();
+	mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
+	                        out.x, out.y, out.z);
+	return out;
+#endif
 }

 CVector
 operator*(const CMatrix &mat, const CVector &vec)
 {
+#ifdef DC_SH4
+	CVector out;
+	dc::mat_load2(mat);
+	mat_trans_single3_nodiv_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z);
+	return out;
+#else
 	// TODO: VU0 code
 	return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z + mat.px,
 	               mat.ry * vec.x + mat.fy * vec.y + mat.uy * vec.z + mat.py,
 	               mat.rz * vec.x + mat.fz * vec.y + mat.uz * vec.z + mat.pz);
+#endif
 }
--- a/src/miami/math/Vector.h
+++ b/src/miami/math/Vector.h
@@ -18,11 +18,25 @@ public:
 		z = v.z;
 	}
 	// (0,1,0) means no rotation. So get right vector and its atan
-	float Heading(void) const { return Atan2(-x, y); }
-	float Magnitude(void) const { return Sqrt(x*x + y*y + z*z); }
-	float MagnitudeSqr(void) const { return x*x + y*y + z*z; }
-	float Magnitude2D(void) const { return Sqrt(x*x + y*y); }
-	float MagnitudeSqr2D(void) const { return x*x + y*y; }
+__always_inline float Heading(void) const { return Atan2(-x, y); }
+  __always_inline float Magnitude(void) const {
+#ifdef DC_SH4
+    float w;
+    vec3f_length(x, y, z, w);
+    return w;
+#else
+    return Sqrt(x*x + y*y + z*z);
+#endif
+}
+  __always_inline float MagnitudeSqr(void) const {
+#ifdef DC_SH4
+    return fipr_magnitude_sqr(x, y,z, 0.0f);
+#else
+    return x*x + y*y + z*z;
+#endif
+}
+  __always_inline float Magnitude2D(void) const { return Sqrt(x*x + y*y); }
+  float MagnitudeSqr2D(void) const { return x*x + y*y; }
 	void Normalise(void);
 	
 	void Normalise2D(void) {
@@ -103,7 +117,11 @@ inline CVector operator/(const CVector &left, float right)
 inline float
 DotProduct(const CVector &v1, const CVector &v2)
 {
-	return v1.x*v2.x + v1.y*v2.y + v1.z*v2.z;
+#ifdef DC_SH4
+  return fipr(v1.x, v1.y, v1.z, 0.0f, v2.x, v2.y, v2.z, 0.0f);
+#else
+  return v1.x*v2.x + v1.y*v2.y + v1.z*v2.z;
+#endif
 }

 CVector CrossProduct(const CVector &v1, const CVector &v2);
@@ -111,7 +129,13 @@ CVector CrossProduct(const CVector &v1, const CVector &v2);
 inline float
 Distance(const CVector &v1, const CVector &v2)
 {
-	return (v2 - v1).Magnitude();
+  float w;
+#ifdef DC_SH4
+  vec3f_distance(v1.x, v1.y, v1.z, v2.x, v2.y, v2.z, w);
+  return w;
+#else
+  return (v2 - v1).Magnitude();
+#endif
 }

 inline float
--- a/src/miami/math/VuVector.h
+++ b/src/miami/math/VuVector.h
@@ -50,7 +50,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CV
 		sqc2    vf06,0x0(%0)\n\
 		": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
 #elif defined(DC_SH4)
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	dc::mat_load2(mat);
 	mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
 #else
 	out = mat * in;
@@ -77,7 +77,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw
 		sqc2    vf06,0x0(%0)\n\
 		": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
 #elif defined(DC_SH4)
-	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	dc::mat_load2(mat);
 	mat_trans_single3_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z);
 #else
 	out = mat * in;
@@ -86,7 +86,7 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw

 __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const RwV3d *in, int stride)
 {
-#ifdef GTA_PS3
+#ifdef GTA_PS2
 	__asm__ __volatile__("\n\
 		paddub	$3,%4,$0\n\
 		lqc2    vf02,0x0(%2)\n\
@@ -114,7 +114,7 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
 		bnez	%1,1b\n\
 		": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
 #elif defined(DC_SH4)
-    mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+    dc::mat_load2(mat);
 	while(n--) {
 		mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
 		in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);
@@ -137,7 +137,6 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
 		lqc2    vf03,0x10(%2)\n\
 		lqc2    vf04,0x20(%2)\n\
 		lqc2    vf05,0x30(%2)\n\
-		lqc2    vf01,0x0(%3)\n\
 		nop\n\
 	1:	vmulax.xyz	ACC,   vf02,vf01\n\
 		vmadday.xyz	ACC,   vf03,vf01\n\
--- a/src/miami/math/maths.h
+++ b/src/miami/math/maths.h
@@ -1,46 +1,6 @@
 #pragma once

 #include "src/common_defines.h"
+#include "rwdc_common.h"

-#include <dc/matrix.h>
-
-#ifdef DC_SH4
-
-#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 1.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
-    } while(false)
-
-#else
-
-#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
-		vector_t tmp = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp, &tmp, 1, 0); \
-		x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \
-	} while(false)
-#endif
-
-
-// wrapper around float versions of functions
-// in gta they are in CMaths but that makes the code rather noisy
-
-inline float Sin(float x) { return sinf(x); }
-inline float Asin(float x) { return asinf(x); }
-inline float Cos(float x) { return cosf(x); }
-inline float Acos(float x) { return acosf(x); }
-inline float Tan(float x) { return tanf(x); }
-inline float Atan(float x) { return atanf(x); }
-inline float Atan2(float y, float x) { return atan2f(y, x); }
-inline float Abs(float x) { return fabsf(x); }
-inline float Sqrt(float x) { return sqrtf(x); }
-inline float RecipSqrt(float x, float y) { return x/Sqrt(y); }
-inline float RecipSqrt(float x) { return RecipSqrt(1.0f, x); }
-inline float Pow(float x, float y) { return powf(x, y); }
-inline float Floor(float x) { return floorf(x); }
-inline float Ceil(float x) { return ceilf(x); }
+using namespace dc;
--- a/src/miami/skel/crossplatform.h
+++ b/src/miami/skel/crossplatform.h
@@ -17,7 +17,7 @@ enum eWinVersion
 char *_strdate(char *buf);
 #endif

-#ifdef _WIN32
+#if defined(_WIN32) || defined(_WIN64)

 // As long as WITHWINDOWS isn't defined / <Windows.h> isn't included, we only need type definitions so let's include <IntSafe.h>.
 // NOTE: It's perfectly fine to include <Windows.h> here, but it can increase build size and time in *some* conditions, and maybe substantially in future if we'll use crossplatform.h more.
--- a/vendor/dca3-kos
+++ b/vendor/dca3-kos
--- a/vendor/emu/lxdream/tacore.cpp
+++ b/vendor/emu/lxdream/tacore.cpp
@@ -182,7 +182,8 @@ struct pvr2_ta_status {
    int32_t state;
    int32_t width, height; /* Tile resolution, ie 20x15 */
    int32_t tilelist_dir; /* Growth direction of the tilelist, 0 = up, 1 = down */
-    uint32_t tilelist_size; /* Size of the tilelist segments */
+    // uint32_t tilelist_size; /* Size of the tilelist segments */
+    // this needs to be current_tile_size
    uint32_t tilelist_start; /* Initial address of the tilelist */
    uint32_t polybuf_start; /* Initial bank address of the polygon buffer (ie &0x00F00000) */
    int32_t current_vertex_type;
@@ -208,6 +209,9 @@ struct pvr2_ta_status {
    struct tile_bounds last_triangle_bounds;
    struct pvr2_ta_vertex poly_vertex[8];
    uint32_t debug_output;
+
+    bool modifier_last_volume;
+    struct tile_bounds modifier_bounds;
 };

 static struct pvr2_ta_status ta_status;
@@ -253,13 +257,9 @@ void lxd_ta_init(u8* vram) {
    ta_status.clip.y2 = ta_status.height-1;
    uint32_t control = TA_ALLOC_CTRL; //MMIO_READ( PVR2, TA_TILECFG );
    ta_status.tilelist_dir = (control >> 20) & 0x01;
-    ta_status.tilelist_size = tilematrix_sizes[ (control & 0x03) ];
    TA_ISP_CURRENT = TA_ISP_BASE;
    //MMIO_WRITE( PVR2, TA_POLYPOS, MMIO_READ( PVR2, TA_POLYBASE ) );
    uint32_t plistpos = TA_NEXT_OPB_INIT >> 2; //MMIO_READ( PVR2, TA_LISTBASE )
-    if( ta_status.tilelist_dir == TA_GROW_DOWN ) {
-        plistpos -= ta_status.tilelist_size;
-    }
    TA_NEXT_OPB = plistpos;
    //MMIO_WRITE( PVR2, TA_LISTPOS, plistpos );
    ta_status.tilelist_start = plistpos;
@@ -433,7 +433,9 @@ static uint32_t ta_alloc_tilelist( uint32_t reference ) {
    uint32_t limit = TA_OL_LIMIT >> 2;//MMIO_READ( PVR2, TA_LISTEND ) >> 2;
    uint32_t newposn;
    if( ta_status.tilelist_dir == TA_GROW_DOWN ) {
-        newposn = posn - ta_status.tilelist_size;
+        // printf("**TA WARNING**: Allocating tilelist in GROW_DOWN mode\n");
+        posn -= ta_status.current_tile_size;
+        newposn = posn;
        if( posn == limit ) {
            PVRRAM(posn<<2) = 0xF0000000;
            PVRRAM(reference) = 0xE0000000 | (posn<<2);
@@ -442,7 +444,7 @@ static uint32_t ta_alloc_tilelist( uint32_t reference ) {
            PVRRAM(reference) = 0xE0000000 | (posn<<2);
            return TA_NO_ALLOC;
        } else if( newposn <= limit ) {
-        } else if( newposn <= (limit + ta_status.tilelist_size) ) {
+        } else if( newposn <= (limit + ta_status.current_tile_size) ) {
            // asic_RaiseInterrupt(holly_MATR_NOMEM);
            pvr_queue_interrupt(ASIC_EVT_PVR_OPB_OUTOFMEM);
            printf("TA error: holly_MATR_NOMEM. Interrupt raised\n");
@@ -456,7 +458,7 @@ static uint32_t ta_alloc_tilelist( uint32_t reference ) {
        PVRRAM(reference) = 0xE0000000 | (posn<<2);
        return posn << 2;
    } else {
-        newposn = posn + ta_status.tilelist_size;
+        newposn = posn + ta_status.current_tile_size;
        if( posn == limit ) {
            PVRRAM(posn<<2) = 0xF0000000;
            PVRRAM(reference) = 0xE0000000 | (posn<<2);
@@ -465,7 +467,7 @@ static uint32_t ta_alloc_tilelist( uint32_t reference ) {
            PVRRAM(reference) = 0xE0000000 | (posn<<2);
            return TA_NO_ALLOC;
        } else if( newposn >= limit ) {
-        } else if( newposn >= (limit - ta_status.tilelist_size) ) {
+        } else if( newposn >= (limit - ta_status.current_tile_size) ) {
            // asic_RaiseInterrupt(holly_MATR_NOMEM);
            pvr_queue_interrupt(ASIC_EVT_PVR_OPB_OUTOFMEM);
            printf("TA error: holly_MATR_NOMEM. Interrupt raised\n");
@@ -621,6 +623,17 @@ static void ta_commit_polygon( ) {
    if( polygon_bound.y1 < 0 ) polygon_bound.y1 = 0;
    if( polygon_bound.y2 >= ta_status.height ) polygon_bound.y2 = ta_status.height-1;

+    if (ta_status.current_vertex_type == TA_VERTEX_MOD_VOLUME) {
+        ta_status.modifier_bounds.x1 = MIN(ta_status.modifier_bounds.x1, polygon_bound.x1);
+        ta_status.modifier_bounds.x2 = MAX(ta_status.modifier_bounds.x2, polygon_bound.x2);
+        ta_status.modifier_bounds.y1 = MIN(ta_status.modifier_bounds.y1, polygon_bound.y1);
+        ta_status.modifier_bounds.y2 = MAX(ta_status.modifier_bounds.y2, polygon_bound.y2);
+    
+        if (ta_status.modifier_last_volume) {
+            polygon_bound = ta_status.modifier_bounds;
+        }
+    }
+
    /* Set the "single tile" flag if it's entirely contained in 1 tile */
    if( polygon_bound.x1 == polygon_bound.x2 &&
            polygon_bound.y1 == polygon_bound.y2 ) {
@@ -843,6 +856,14 @@ static void ta_parse_modifier_context( union ta_data *data ) {
    ta_status.vertex_count = 0;
    ta_status.max_vertex = 3;
    ta_status.poly_pointer = 0;
+
+    if (ta_status.modifier_last_volume) {
+        ta_status.modifier_bounds.x1 = INT_MAX/32;
+        ta_status.modifier_bounds.y1 = INT_MAX/32;
+        ta_status.modifier_bounds.x2 = -1;
+        ta_status.modifier_bounds.y2 = -1;
+    }
+    ta_status.modifier_last_volume = data[0].i & TA_POLYCMD_FULLMOD;
 }

 /**
@@ -1006,7 +1027,11 @@ static void ta_parse_vertex( union ta_data *data ) {
        vertex++;
        vertex->x = data[7].f;
        ta_status.vertex_count += 2;
-        ta_status.state = STATE_EXPECT_VERTEX_BLOCK2;
+        if (ta_status.current_vertex_type  == TA_VERTEX_SPRITE || ta_status.current_vertex_type == TA_VERTEX_TEX_SPRITE) {
+            ta_status.state = STATE_EXPECT_END_VERTEX_BLOCK2;
+        } else {
+            ta_status.state = STATE_EXPECT_VERTEX_BLOCK2;
+        }
        break;
    }
    ta_status.vertex_count++;
--- a/vendor/emu/refsw/TexUtils.cpp
+++ b/vendor/emu/refsw/TexUtils.cpp
@@ -14,9 +14,9 @@
 #endif

 u32 detwiddle[2][11][1024];
-u8 BM_SIN90[256];
-u8 BM_COS90[256];
-u8 BM_COS360[256];
+s8 BM_SIN90[256];
+s8 BM_COS90[256];
+s8 BM_COS360[256];

 //input : address in the yyyyyxxxxx format
 //output : address in the xyxyxyxy format
@@ -70,9 +70,9 @@ void BuildTables()
 	}

 	for (int i = 0; i < 256; i++) {
-		BM_SIN90[i]  = 255 * sinf((i / 256.0f) * (M_PI / 2));
-		BM_COS90[i]  = 255 * cosf((i / 256.0f) * (M_PI / 2));
-		BM_COS360[i] = 255 * cosf((i / 256.0f) * (2 * M_PI));
+		BM_SIN90[i]  = 127 * sinf((i / 256.0f) * (M_PI / 2));
+		BM_COS90[i]  = 127 * cosf((i / 256.0f) * (M_PI / 2));
+		BM_COS360[i] = 127 * cosf((i / 256.0f) * (2 * M_PI));
 	}
 	
 }
--- a/vendor/emu/refsw/TexUtils.h
+++ b/vendor/emu/refsw/TexUtils.h
@@ -8,9 +8,9 @@
 #include <algorithm>

 extern u32 detwiddle[2][11][1024];
-extern u8 BM_SIN90[256];
-extern u8 BM_COS90[256];
-extern u8 BM_COS360[256];
+extern s8 BM_SIN90[256];
+extern s8 BM_COS90[256];
+extern s8 BM_COS360[256];

 void BuildTables();

--- a/vendor/emu/refsw/gentable.h
+++ b/vendor/emu/refsw/gentable.h
--- a/vendor/emu/refsw/gentable.py
+++ b/vendor/emu/refsw/gentable.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+import itertools
+
+def generate_table(name, parameters):
+    """
+    Generates C++ code for PixelFlush_tsp_table given parameter ranges.
+    :param parameters: tuple of ints, the range for each template parameter
+    :return: string containing the C++ table declaration and initializer
+    """
+    num_params = len(parameters)
+    # Construct the table dimensions
+    dims = ''.join(f'[{p}]' for p in parameters)
+    # Start building the initializer as a list of strings
+    lines = []
+
+    def recurse(level, indices, indent):
+        if level == num_params:
+            # Leaf: generate function pointer
+            params_list = ', '.join(str(i) for i in indices)
+            lines.append(f"{indent}&{name}<{params_list}>,")
+        else:
+            # Open brace for this dimension
+            lines.append(f"{indent}{{")
+            for i in range(parameters[level]):
+                recurse(level + 1, indices + [i], indent + '    ')
+            # Close brace
+            if level != 0:
+                lines.append(f"{indent}}},")
+            else:
+                lines.append(f"{indent}}}")
+
+    # Table declaration
+    decl = f"{name}_fp {name}_table{dims} ="
+    lines.append(decl)
+    recurse(0, [], '    ')
+    lines.append(';')
+
+    return '\n'.join(lines)
+
+if __name__ == '__main__':
+    # Example usage: each bool has 2 possibilities, some enums have more
+    bitwidths = (
+        1, # [pp_AlphaTest]
+        1, # [entry->params.tsp[two_voume_index].UseAlpha]
+        1, # [entry->params.isp.Texture]
+        1, # [entry->params.isp.Offset]
+        1, # [entry->params.tsp[two_voume_index].ColorClamp]
+        2, # [entry->params.tsp[two_voume_index].FogCtrl]
+        1, # [FPU_SHAD_SCALE.intensity_shadow]
+    )
+    code = generate_table("PixelFlush_tsp", tuple(1 << bw for bw in bitwidths))
+    print(code)
+
+    bitwidths = (
+        1, # [entry->params.tsp[two_voume_index].IgnoreTexA]
+        1, # [entry->params.tsp[two_voume_index].ClampU]
+        1, # [entry->params.tsp[two_voume_index].ClampV]
+        1, # [entry->params.tsp[two_voume_index].FlipU]
+        1, # [entry->params.tsp[two_voume_index].FlipV]
+        2, # [entry->params.tsp[two_voume_index].FilterMode]
+    )
+    code = generate_table("TextureFilter", tuple(1 << bw for bw in bitwidths))
+    print(code)
+
+    bitwidths = (
+        1, # [entry->params.isp.Texture]
+        1, # [entry->params.isp.Offset]
+        2, # [entry->params.tsp[two_voume_index].ShadInstr ]
+    )
+    code = generate_table("ColorCombiner", tuple(1 << bw for bw in bitwidths))
+    print(code)
+
+    bitwidths = (
+        1, # [entry->params.tsp[two_voume_index].SrcSelect]
+        1, # [entry->params.tsp[two_voume_index].DstSelect]
+        3, # [entry->params.tsp[two_voume_index].SrcInstr]
+        3, # [entry->params.tsp[two_voume_index].DstInstr]
+    )
+    code = generate_table("BlendingUnit", tuple(1 << bw for bw in bitwidths))
+    print(code)
+
+    bitwidths = (
+        1, # [entry->params.tcw[two_voume_index].VQ_Comp]
+        1, # [entry->params.tcw[two_voume_index].MipMapped]
+        1, # [entry->params.tcw[two_voume_index].ScanOrder]
+        1, # [entry->params.tcw[two_voume_index].StrideSel]
+        3, # [entry->params.tcw[two_voume_index].PixelFmt]
+    )
+    code = generate_table("TextureFetch", tuple(1 << bw for bw in bitwidths))
+    print(code)
--- a/vendor/emu/refsw/refsw_lists.cpp
+++ b/vendor/emu/refsw/refsw_lists.cpp
@@ -81,34 +81,16 @@

 #include "refsw_tile.h"

-#define JLOG(...)
-#define JLOG2(...)
-#define V(x) x
-
-void log_vertex(const Vertex& v) {
-    JLOG(ll, 
-        V(v.x), V(v.y), V(v.z),
-        V(v.col[0]), V(v.col[1]), V(v.col[2]), V(v.col[3]), V(v.spc[0]), V(v.spc[1]), V(v.spc[2]), V(v.spc[3]), V(v.u), V(v.v),
-        V(v.col1[0]), V(v.col1[1]), V(v.col1[2]), V(v.col1[3]), V(v.spc1[0]), V(v.spc1[1]), V(v.spc1[2]), V(v.spc1[3]), V(v.u1), V(v.v1)
-    );
-}
-
 /*
    Main renderer class
 */
-
 void RenderTriangle(RenderMode render_mode, DrawParameters* params, parameter_tag_t tag, const Vertex& v1, const Vertex& v2, const Vertex& v3, const Vertex* v4, taRECT* area)
-{
-    JLOG(ll, V(render_mode), V(tag), "tsp0", params->tsp[0].full, "tcw0", params->tcw[0].full, "tsp1", params->tsp[1].full, "tcw1", params->tcw[1].full);
+{   
+    RasterizeTriangle_table[render_mode](params, tag, v1, v2, v3, v4, area);

-    log_vertex(v1);
-    log_vertex(v2);
-    log_vertex(v3);
-    if (v4) { 
-        log_vertex(*v4);
+    if (render_mode == RM_TRANSLUCENT_PRESORT) {
+        RenderParamTags<RM_TRANSLUCENT_PRESORT>(area->left, area->top);
    }
-    
-    RasterizeTriangle(render_mode, params, tag, v1, v2, v3, v4, area);

    if (render_mode == RM_MODIFIER)
    {
@@ -138,6 +120,7 @@ u32 ReadRegionArrayEntry(u32 base, RegionArrayEntry* entry)
    u32 rv;
    if (fmt_v1)
    {
+        entry->control.pre_sort = ISP_FEED_CFG.pre_sort;
        entry->puncht.full = 0x80000000;
        rv = 5 * 4;
    }
@@ -160,13 +143,13 @@ u32 ReadRegionArrayEntry(u32 base, RegionArrayEntry* entry)
 	}

 ISP_BACKGND_T_type CoreTagFromDesc(u32 cache_bypass, u32 shadow, u32 skip, u32 param_offs_in_words, u32 tag_offset) {
-    ISP_BACKGND_T_type rv {
-        .tag_offset = tag_offset,
-        .param_offs_in_words = param_offs_in_words,
-        .skip = skip,
-        .shadow = shadow,
-        .cache_bypass = cache_bypass 
-    };
+    ISP_BACKGND_T_type rv;
+    rv.full = 0;
+    rv.tag_offset = tag_offset;
+    rv.param_offs_in_words = param_offs_in_words;
+    rv.skip = skip;
+    rv.shadow = shadow;
+    rv.cache_bypass = cache_bypass;

    return rv;
 }
@@ -174,8 +157,6 @@ ISP_BACKGND_T_type CoreTagFromDesc(u32 cache_bypass, u32 shadow, u32 skip, u32 p
 // render a triangle strip object list entry
 void RenderTriangleStrip(RenderMode render_mode, ObjectListEntry obj, taRECT* rect)
 {
-    JLOG(ll, V(render_mode), "obj", obj.full);
-
    Vertex vtx[8];
    DrawParameters params;

@@ -204,7 +185,6 @@ void RenderTriangleStrip(RenderMode render_mode, ObjectListEntry obj, taRECT* re
 // render a triangle array object list entry
 void RenderTriangleArray(RenderMode render_mode, ObjectListEntry obj, taRECT* rect)
 {
-    JLOG(ll, V(render_mode), "obj", obj.full);
    auto triangles = obj.tarray.prims + 1;
    u32 param_base = PARAM_BASE & 0xF00000;

@@ -222,8 +202,6 @@ void RenderTriangleArray(RenderMode render_mode, ObjectListEntry obj, taRECT* re
            
        parameter_tag_t tag  = CoreTagFromDesc(params.isp.CacheBypass, obj.tstrip.shadow, obj.tstrip.skip, (tag_address - param_base)/4, 0).full;

-        assert(!(tag & TAG_INVALID));
-
        RenderTriangle(render_mode, &params, tag, vtx[0], vtx[1], vtx[2], nullptr, rect);
    }
 }
@@ -231,8 +209,6 @@ void RenderTriangleArray(RenderMode render_mode, ObjectListEntry obj, taRECT* re
 // render a quad array object list entry
 void RenderQuadArray(RenderMode render_mode, ObjectListEntry obj, taRECT* rect)
 {
-    JLOG(ll, V(render_mode), "obj", obj.full);
-
    auto quads = obj.qarray.prims + 1;
    u32 param_base = PARAM_BASE & 0xF00000;

@@ -301,22 +277,12 @@ void RenderCORE() {
    }
    u32 base = REGION_BASE;

-    JLOG(ll, V(REGION_BASE));
-
    RegionArrayEntry entry;
        
    // Parse region array
    do {
        auto step = ReadRegionArrayEntry(base, &entry);
        
-        JLOG2(llrrae, "ReadRegionArrayEntry", V(base), 
-            "control", entry.control.full,
-            "opaque", entry.opaque.full,
-            "opaque_mod", entry.opaque_mod.full,
-            "trans", entry.trans.full,
-            "trans_mod", entry.trans_mod.full,
-            "puncht", entry.puncht.full);
-
        base += step;

        taRECT rect;
@@ -328,6 +294,7 @@ void RenderCORE() {

        parameter_tag_t bgTag;

+        ClearFpuCache();
        // register BGPOLY to fpu
        {
            bgTag = ISP_BACKGND_T.full;
@@ -338,85 +305,102 @@ void RenderCORE() {
        {
            // Clear Param + Z + stencil buffers
            ClearBuffers(bgTag, ISP_BACKGND_D.f, 0);
+        } else {
+            ClearParamStatusBuffer();
        }

        // Render OPAQ to TAGS
        if (!entry.opaque.empty)
        {
-            JLOG2(llo, "opaque", V(entry.opaque.ptr_in_words));
            RenderObjectList(RM_OPAQUE, entry.opaque.ptr_in_words * 4, &rect);
+        
+            if (!entry.opaque_mod.empty)
+            {
+                RenderObjectList(RM_MODIFIER, entry.opaque_mod.ptr_in_words * 4, &rect);
+            }
        }
-
        // Render TAGS to ACCUM
-        RenderParamTags(RM_OPAQUE, rect.left, rect.top);
+        RenderParamTags<RM_OPAQUE>(rect.left, rect.top);

        // render PT to TAGS
        if (!entry.puncht.empty)
        {
            PeelBuffersPTInitial(FLT_MAX);
+            
+            ClearMoreToDraw();

-            do {
+            // Render to TAGS
+            RenderObjectList(RM_PUNCHTHROUGH_PASS0, entry.puncht.ptr_in_words * 4, &rect);
+
+            // keep reference Z buffer
+            PeelBuffersPT();
+
+            // Render TAGS to ACCUM, making Z holes as-needed
+            RenderParamTags<RM_PUNCHTHROUGH_PASS0>(rect.left, rect.top);
+
+            while (GetMoreToDraw()) {
                ClearMoreToDraw();

                // Render to TAGS
-                {
-                    JLOG2(llo, "puncht", V(entry.puncht.ptr_in_words));
-                    RenderObjectList(RM_PUNCHTHROUGH, entry.puncht.ptr_in_words * 4, &rect);
-                }
+                RenderObjectList(RM_PUNCHTHROUGH_PASSN, entry.puncht.ptr_in_words * 4, &rect);

+                if (!GetMoreToDraw())
+                    break;
+                
+                ClearMoreToDraw();
                // keep reference Z buffer
                PeelBuffersPT();

                // Render TAGS to ACCUM, making Z holes as-needed
-                RenderParamTags(RM_PUNCHTHROUGH, rect.left, rect.top);
-                
-                // Copy TAGB=TAGA buffers, clear TAGA
-                PeelBuffersPTAfterHoles();
-
-            } while (GetMoreToDraw() != 0);
-        }
-        
-
-        //TODO: Actually render OPAQ modvol affected pixels
-        if (!entry.opaque_mod.empty)
-        {
-            JLOG2(llo, "opaque_mod", V(entry.opaque_mod.ptr_in_words));
-            RenderObjectList(RM_MODIFIER, entry.opaque_mod.ptr_in_words * 4, &rect);
-            RenderParamTags(RM_OP_PT_MV, rect.left, rect.top);
+                RenderParamTags<RM_PUNCHTHROUGH_PASS0>(rect.left, rect.top);
+            }
+            if (!entry.opaque_mod.empty)
+            {
+                RenderObjectList(RM_MODIFIER, entry.opaque_mod.ptr_in_words * 4, &rect);
+                RenderParamTags<RM_PUNCHTHROUGH_MV>(rect.left, rect.top);
+            }
        }

        // layer peeling rendering
        if (!entry.trans.empty)
        {
-            // clear the param buffer
-            ClearParamBuffer(TAG_INVALID);
+            if (entry.control.pre_sort) {
+                 // clear the param buffer
+                 ClearParamStatusBuffer();

-            do
-            {
-                // prepare for a new pass
-                ClearMoreToDraw();
+                 // render to TAGS
+                 {
+                     RenderObjectList(RM_TRANSLUCENT_PRESORT, entry.trans.ptr_in_words * 4, &rect);
+                 }
+
+                // what happens with modvols here?
+                //  if (!entry.trans_mod.empty)
+                //  {
+                //      RenderObjectList(RM_MODIFIER, entry.trans_mod.ptr_in_words * 4, &rect);
+                //  }
+            } else {
+                do
+                {
+                    // prepare for a new pass
+                    ClearMoreToDraw();

-                if (!ISP_FEED_CFG.pre_sort) {
                    // copy depth test to depth reference buffer, clear depth test buffer, clear stencil
                    PeelBuffers(FLT_MAX, 0);
-                }

-                // render to TAGS
-                {
-                    JLOG2(llo, "trans", V(entry.trans.ptr_in_words));
-                    RenderObjectList(RM_TRANSLUCENT, entry.trans.ptr_in_words * 4, &rect);
-                }
+                    // render to TAGS
+                    {
+                        RenderObjectList(RM_TRANSLUCENT_AUTOSORT, entry.trans.ptr_in_words * 4, &rect);
+                    }

-                if (!entry.trans_mod.empty)
-                {
-                    JLOG2(llo, "trans_mod", V(entry.trans_mod.ptr_in_words));
-                    RenderObjectList(RM_MODIFIER, entry.trans_mod.ptr_in_words * 4, &rect);
-                }
+                    if (!entry.trans_mod.empty)
+                    {
+                        RenderObjectList(RM_MODIFIER, entry.trans_mod.ptr_in_words * 4, &rect);
+                    }

-                // render TAGS to ACCUM
-                // also marks TAGS as invalid, but keeps the index for coplanar sorting
-                RenderParamTags(RM_TRANSLUCENT, rect.left, rect.top);
-            } while (GetMoreToDraw() != 0);
+                    // render TAGS to ACCUM
+                    RenderParamTags<RM_TRANSLUCENT_AUTOSORT>(rect.left, rect.top);
+                } while (GetMoreToDraw() != 0);
+            }
        }

        // Copy to vram
@@ -469,9 +453,6 @@ void RenderCORE() {
                }
            }
        }
-
-        // clear the tsp cache
-        ClearFpuEntries();
    } while (!entry.control.last_region);
 }

--- a/vendor/emu/refsw/refsw_lists.h
+++ b/vendor/emu/refsw/refsw_lists.h
@@ -26,13 +26,23 @@ struct DrawParameters

 enum RenderMode {
    RM_OPAQUE,
-    RM_PUNCHTHROUGH,
-    RM_OP_PT_MV, // OP and PT with modvol
-    RM_TRANSLUCENT,
+    RM_PUNCHTHROUGH_PASS0,
+    RM_PUNCHTHROUGH_PASSN,
+    RM_PUNCHTHROUGH_MV, // PT MODVOL 2nd pass
+    RM_TRANSLUCENT_AUTOSORT,
+    RM_TRANSLUCENT_PRESORT,
    RM_MODIFIER,
 };

-#define TAG_INVALID (1 << 31)
+struct TagState {
+    union {
+        struct {
+            bool valid: 1;
+            bool rendered: 1;
+        };
+        uint8_t raw;
+    };
+};

 typedef u32 parameter_tag_t;

--- a/vendor/emu/refsw/refsw_tile.cpp
+++ b/vendor/emu/refsw/refsw_tile.cpp
--- a/vendor/emu/refsw/refsw_tile.h
+++ b/vendor/emu/refsw/refsw_tile.h
@@ -39,6 +39,10 @@ struct PlaneStepper3

        float C = ((v2.x - v1.x) * (v3.y - v1.y) - (v3.x - v1.x) * (v2.y - v1.y));
        
+        if (C == 0) {
+            C = 1; // avoid divide by zero
+        }
+
        ddx = -Aa / C;
        ddy = -Ba / C;

@@ -54,6 +58,16 @@ struct PlaneStepper3
    {
        return Ip(x, y) * W;
    }
+
+    float IpU8(float x, float y, float W) const
+    {
+        float rv = Ip(x, y, W);
+
+        if (rv < 0) rv = 0;
+        if (rv > 255) rv = 255;
+
+        return rv;
+    }
 };

 /*
@@ -87,8 +101,8 @@ struct IPs3
        }

        if (TwoVolumes) {
-            U[1].Setup(rect, v1, v2, v3, v1.u * v1.z, v2.u1 * v2.z, v3.u1 * v3.z);
-            V[1].Setup(rect, v1, v2, v3, v1.v * v1.z, v2.v1 * v2.z, v3.v1 * v3.z);
+            U[1].Setup(rect, v1, v2, v3, v1.u1 * v1.z, v2.u1 * v2.z, v3.u1 * v3.z);
+            V[1].Setup(rect, v1, v2, v3, v1.v1 * v1.z, v2.v1 * v2.z, v3.v1 * v3.z);
            if (params->isp.Gouraud) {
                for (int i = 0; i < 4; i++)
                    Col[1][i].Setup(rect, v1, v2, v3, v1.col1[i] * v1.z, v2.col1[i] * v2.z, v3.col1[i] * v3.z);
@@ -128,10 +142,9 @@ extern u32  colorBuffer1 [MAX_RENDER_PIXELS];
 extern const char* dump_textures;

 void ClearBuffers(u32 paramValue, float depthValue, u32 stencilValue);
-void ClearParamBuffer(parameter_tag_t paramValue);
+void ClearParamStatusBuffer();
 void PeelBuffers(float depthValue, u32 stencilValue);
 void PeelBuffersPT();
-void PeelBuffersPTAfterHoles();
 void PeelBuffersPTInitial(float depthValue);
 void SummarizeStencilOr();
 void SummarizeStencilAnd();
@@ -140,31 +153,29 @@ bool GetMoreToDraw();

 // Render to ACCUM from TAG buffer
 // TAG holds references to triangles, ACCUM is the tile framebuffer
-void RenderParamTags(RenderMode rm, int tileX, int tileY);
-void ClearFpuEntries();
+template<RenderMode rm>
+void RenderParamTags(int tileX, int tileY);

-f32 f16(u16 v);
+inline __attribute__((always_inline)) f32 f16(u16 v)
+{
+    u32 z=v<<16;
+    return *(f32*)&z;
+}

 //decode a vertex in the native pvr format
 void decode_pvr_vertex(DrawParameters* params, pvr32addr_t ptr,Vertex* cv, u32 shadow);
 // decode an object (params + vertexes)
 u32 decode_pvr_vertices(DrawParameters* params, pvr32addr_t base, u32 skip, u32 two_volumes, Vertex* vtx, int count, int offset);

-FpuEntry GetFpuEntry(taRECT *rect, RenderMode render_mode, ISP_BACKGND_T_type core_tag);
+const FpuEntry& GetFpuEntry(taRECT *rect, RenderMode render_mode, ISP_BACKGND_T_type core_tag);
 // Lookup/create cached TSP parameters, and call PixelFlush_tsp
-bool PixelFlush_tsp(bool pp_AlphaTest, FpuEntry* entry, float x, float y, u32 index, float invW, bool InVolume);
+bool PixelFlush_tsp(bool pp_AlphaTest, const FpuEntry* entry, float x, float y, u32 index, float invW, bool InVolume);
 // Rasterize a single triangle to ISP (or ISP+TSP for PT)
-void RasterizeTriangle(RenderMode render_mode, DrawParameters* params, parameter_tag_t tag, const Vertex& v1, const Vertex& v2, const Vertex& v3, const Vertex* v4, taRECT* area);
+
+extern void (*RasterizeTriangle_table[])(DrawParameters* params, parameter_tag_t tag, const Vertex& v1, const Vertex& v2, const Vertex& v3, const Vertex* v4, taRECT* area);
+
 u8* GetColorOutputBuffer();

-// Implement the full texture/shade pipeline for a pixel
-bool PixelFlush_tsp(
-    bool pp_UseAlpha, bool pp_Texture, bool pp_Offset, bool pp_ColorClamp, u32 pp_FogCtrl, bool pp_IgnoreAlpha, bool pp_ClampU, bool pp_ClampV, bool pp_FlipU, bool pp_FlipV, u32 pp_FilterMode, u32 pp_ShadInstr, bool pp_AlphaTest, u32 pp_SrcSel, u32 pp_DstSel, u32 pp_SrcInst, u32 pp_DstInst,
-    const FpuEntry *entry, float x, float y, float W, bool InVolume, u32 index);
-
-// Depth processing for a pixel -- render_mode 0: OPAQ, 1: PT, 2: TRANS
-void PixelFlush_isp(RenderMode render_mode, u32 depth_mode, u32 ZWriteDis, float x, float y, float invW, u32 index, parameter_tag_t tag);
-

 /*
    Main renderer class
@@ -180,4 +191,5 @@ void RenderTriangleArray(RenderMode render_mode, ObjectListEntry obj, taRECT* re
 void RenderQuadArray(RenderMode render_mode, ObjectListEntry obj, taRECT* rect);
 void RenderObjectList(RenderMode render_mode, pvr32addr_t base, taRECT* rect);
 void RenderCORE();
-void Hackpresent();
+void Hackpresent();
+void ClearFpuCache();
--- a/vendor/koshle/dc_hle_types.h
+++ b/vendor/koshle/dc_hle_types.h
@@ -1,6 +1,27 @@
 #pragma once

-#if ( (__LONG_MAX__ *2UL+1UL) == 18446744073709551615ULL) && ((__INT_MAX__  *2U +1U) == 4294967295ULL)
+#if defined(_WIN64) && _WIN64 == 1
+    typedef unsigned long long uint64;  /**< \brief 64-bit unsigned integer */
+    typedef unsigned int uint32;       /**< \brief 32-bit unsigned integer */
+    typedef unsigned short uint16;      /**< \brief 16-bit unsigned integer */
+    typedef unsigned char uint8;        /**< \brief 8-bit unsigned integer */
+    typedef long long int64;            /**< \brief 64-bit signed integer */
+    typedef int int32;                 /**< \brief 32-bit signed integer */
+    typedef short int16;                /**< \brief 16-bit signed integer */
+    typedef char int8;                  /**< \brief 8-bit signed integer */ 
+
+    typedef volatile unsigned long long vuint64;  /**< \brief 64-bit unsigned integer */
+    typedef volatile unsigned int vuint32;       /**< \brief 32-bit unsigned integer */
+    typedef volatile unsigned short vuint16;      /**< \brief 16-bit unsigned integer */
+    typedef volatile unsigned char vuint8;        /**< \brief 8-bit unsigned integer */
+    typedef volatile long long vint64;            /**< \brief 64-bit signed integer */
+    typedef volatile int vint32;                 /**< \brief 32-bit signed integer */
+    typedef volatile short vint16;                /**< \brief 16-bit signed integer */
+    typedef volatile char vint8;                  /**< \brief 8-bit signed integer */ 
+
+typedef uint64 ptr_t;
+#define INT32_IS_INT
+#elif (( (__LONG_MAX__ *2UL+1UL) == 18446744073709551615ULL) && ((__INT_MAX__  *2U +1U) == 4294967295ULL))
 typedef unsigned long uint64;  /**< \brief 64-bit unsigned integer */
 typedef unsigned int uint32;       /**< \brief 32-bit unsigned integer */
 typedef unsigned short uint16;      /**< \brief 16-bit unsigned integer */
--- a/vendor/koshle/hlekos.cpp
+++ b/vendor/koshle/hlekos.cpp
@@ -24,7 +24,7 @@ void * maple_dev_status(maple_device*) {
 maple_device_t * maple_enum_type(int n, uint32 func) {
    return &dev;
 }
-
+namespace kos {
 int sem_wait_timed(semaphore_t *sem, int timeout) {
    auto count = sem->count.load();

@@ -54,6 +54,7 @@ int sem_signal(semaphore_t *sem) {
    sem->count++;
    return 0;
 }
+}

 void sq_lock(void *dest) {

--- a/vendor/koshle/kos/sem.h
+++ b/vendor/koshle/kos/sem.h
@@ -32,6 +32,7 @@

    \headerfile kos/sem.h
 */
+namespace kos {
 typedef struct semaphore {
    std::atomic<int> initialized;    /**< \brief Are we initialized? */
    std::atomic<int> count;          /**< \brief The semaphore count */
@@ -177,5 +178,7 @@ int sem_signal(semaphore_t *sem);
 */
 int sem_count(semaphore_t *sem);

+}
+using namespace kos;

 #endif  /* __KOS_SEM_H */
--- a/vendor/koshle/pvr_internal.h
+++ b/vendor/koshle/pvr_internal.h
@@ -233,7 +233,7 @@ typedef struct {

    /* Wait-ready semaphore: this will be signaled whenever the pvr_wait_ready()
       call should be ready to return. */
-    semaphore_t ready_sem;
+    kos::semaphore_t ready_sem;

    // Handle for the vblank interrupt
    int     vbl_handle;
--- a/vendor/librw/src/base.cpp
+++ b/vendor/librw/src/base.cpp
@@ -89,10 +89,18 @@ strncmp_ci(const char *s1, const char *s2, int n)
 Quat
 mult(const Quat &q, const Quat &p)
 {
+#ifndef DC_SH4
 	return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
 	                q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
 	                q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
 	                q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
+#else
+	Quat o;
+	dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&o),
+	              reinterpret_cast<const dc::quaternion_t &>(q),
+				  reinterpret_cast<const dc::quaternion_t &>(p));
+	return o;
+#endif
 }


--- a/vendor/librw/src/camera.cpp
+++ b/vendor/librw/src/camera.cpp
@@ -429,6 +429,7 @@ Camera::frustumTestSphere(const Sphere *s) const
 {
 	int32 res = SPHEREINSIDE;
 	const FrustumPlane *p = this->frustumPlanes;
+#ifndef DC_SH4
 	for(int32 i = 0; i < 6; i++){
 		float32 dist = dot(p->plane.normal, s->center) - p->plane.distance;
 		if(s->radius < dist)
@@ -437,6 +438,114 @@ Camera::frustumTestSphere(const Sphere *s) const
 			res = SPHEREBOUNDARY;
 		p++;
 	}
+#else
+	__builtin_prefetch(p);
+
+	register float sx asm("fr0") = s->center.x;
+	register float sy asm("fr1") = s->center.y;
+	register float sz asm("fr2") = s->center.z;
+	register float sw asm("fr3") = -1.0f;
+
+	// far
+	register float px asm("fr4") = p->plane.normal.x;
+	register float py asm("fr5") = p->plane.normal.y;
+	register float pz asm("fr6") = p->plane.normal.z;
+	register float pw asm("fr7") = p->plane.distance;
+
+	asm volatile("fipr fv0, fv4"
+				: "+f" (pw)
+				: "f" (sx), "f" (sy), "f" (sz), "f" (sw),
+				"f" (px), "f" (py), "f" (pz));
+	if(s->radius < pw)
+		return SPHEREOUTSIDE;
+	else if(s->radius > -pw)
+		res = SPHEREBOUNDARY;
+	p++;
+
+	// near
+	px = p->plane.normal.x;
+	py = p->plane.normal.y;
+	pz = p->plane.normal.z;
+	pw = p->plane.distance;
+	asm volatile("fipr fv0, fv4"
+				: "+f" (pw)
+				: "f" (sx), "f" (sy), "f" (sz), "f" (sw),
+				"f" (px), "f" (py), "f" (pz));
+	if(s->radius < pw)
+		return SPHEREOUTSIDE;
+	if(s->radius > -pw)
+		res = SPHEREBOUNDARY_NEAR;
+	p++;
+
+	const float* base_ptr0 = &p[0].plane.normal.x;
+	const float* base_ptr1 = &p[1].plane.normal.x;
+	const float* base_ptr2 = &p[2].plane.normal.x;
+	const float* base_ptr3 = &p[3].plane.normal.x;
+
+	__builtin_prefetch(base_ptr0);
+
+	static_assert(offsetof (decltype (p[0].plane.normal), y)
+				 -offsetof (decltype (p[0].plane.normal), x) == sizeof (float));
+
+	static_assert(offsetof (decltype (p[0].plane.normal), z)
+				  -offsetof (decltype (p[0].plane.normal), y) == sizeof (float));
+
+	static_assert(offsetof (decltype (p[0].plane), distance)
+				 -offsetof (decltype (p[0].plane.normal), z) == sizeof (float));
+
+	asm volatile (R"(
+		frchg
+
+		fmov.s  @%0+,fr0
+		fmov.s  @%1+,fr1
+		fmov.s  @%2+,fr2
+		fmov.s  @%3+,fr3
+
+		fmov.s  @%0+,fr4
+		fmov.s  @%1+,fr5
+		fmov.s  @%2+,fr6
+		fmov.s  @%3+,fr7
+
+		fmov.s  @%0+,fr8
+		fmov.s  @%1+,fr9
+		fmov.s  @%2+,fr10
+		fmov.s  @%3+,fr11
+
+		fmov.s  @%0,fr12
+		fmov.s  @%1,fr13
+		fmov.s  @%2,fr14
+		fmov.s  @%3,fr15
+
+		frchg
+	)" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3)
+	:
+	: );
+
+	float dists[4];
+	mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw,
+	                           dists[0], dists[1], dists[2], dists[3]);
+
+	if(s->radius < dists[0])
+		return SPHEREOUTSIDE;
+	else if(s->radius > -dists[0])
+		res = SPHEREBOUNDARY;
+
+	if(s->radius < dists[1])
+		return SPHEREOUTSIDE;
+	else if(s->radius > -dists[1])
+		res = SPHEREBOUNDARY;
+
+	if(s->radius < dists[2])
+		return SPHEREOUTSIDE;
+	else if(s->radius > -dists[2])
+		res = SPHEREBOUNDARY;
+
+	if(s->radius < dists[3])
+		return SPHEREOUTSIDE;
+	else if(s->radius > -dists[3])
+		res = SPHEREBOUNDARY;
+
+#endif
 	return res;
 }

@@ -445,7 +554,7 @@ Camera::frustumTestSphereNear(const Sphere *s) const
 {
 	int32 res = SPHEREINSIDE;
 	const FrustumPlane *p = this->frustumPlanes;
-
+#ifndef DC_SH4
 	// far
 	float32 dist = dot(p->plane.normal, s->center) - p->plane.distance;
 	if(s->radius < dist)
@@ -481,6 +590,101 @@ Camera::frustumTestSphereNear(const Sphere *s) const
 		return SPHEREOUTSIDE;
 	p++;

+#else
+	__builtin_prefetch(p);
+
+	register float sx asm("fr0") = s->center.x;
+	register float sy asm("fr1") = s->center.y;
+	register float sz asm("fr2") = s->center.z;
+	register float sw asm("fr3") = -1.0f;
+
+	// far
+	register float px asm("fr4") = p->plane.normal.x;
+	register float py asm("fr5") = p->plane.normal.y;
+	register float pz asm("fr6") = p->plane.normal.z;
+	register float pw asm("fr7") = p->plane.distance;
+
+	asm volatile("fipr fv0, fv4"
+				 : "+f" (pw)
+				 : "f" (sx), "f" (sy), "f" (sz), "f" (sw),
+				   "f" (px), "f" (py), "f" (pz));
+	if(s->radius < pw)
+		return SPHEREOUTSIDE;
+	p++;
+
+	// near
+	px = p->plane.normal.x;
+	py = p->plane.normal.y;
+	pz = p->plane.normal.z;
+	pw = p->plane.distance;
+	asm volatile("fipr fv0, fv4"
+				 : "+f" (pw)
+				 : "f" (sx), "f" (sy), "f" (sz), "f" (sw),
+				   "f" (px), "f" (py), "f" (pz));
+	if(s->radius < pw)
+		return SPHEREOUTSIDE;
+	if(s->radius > -pw)
+		res = SPHEREBOUNDARY_NEAR;
+	p++;
+
+	const float* base_ptr0 = &p[0].plane.normal.x;
+	const float* base_ptr1 = &p[1].plane.normal.x;
+	const float* base_ptr2 = &p[2].plane.normal.x;
+	const float* base_ptr3 = &p[3].plane.normal.x;
+
+	__builtin_prefetch(base_ptr0);
+
+	static_assert(offsetof (decltype (p[0].plane.normal), y)
+				 -offsetof (decltype (p[0].plane.normal), x) == sizeof (float));
+
+	static_assert(offsetof (decltype (p[0].plane.normal), z)
+				  -offsetof (decltype (p[0].plane.normal), y) == sizeof (float));
+
+	static_assert(offsetof (decltype (p[0].plane), distance)
+				 -offsetof (decltype (p[0].plane.normal), z) == sizeof (float));
+
+	asm volatile (R"(
+		frchg
+
+		fmov.s  @%0+,fr0
+		fmov.s  @%1+,fr1
+		fmov.s  @%2+,fr2
+		fmov.s  @%3+,fr3
+
+		fmov.s  @%0+,fr4
+		fmov.s  @%1+,fr5
+		fmov.s  @%2+,fr6
+		fmov.s  @%3+,fr7
+
+		fmov.s  @%0+,fr8
+		fmov.s  @%1+,fr9
+		fmov.s  @%2+,fr10
+		fmov.s  @%3+,fr11
+
+		fmov.s  @%0,fr12
+		fmov.s  @%1,fr13
+		fmov.s  @%2,fr14
+		fmov.s  @%3,fr15
+
+		frchg
+	)" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3)
+	:
+	: );
+
+	float dists[4];
+	mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw,
+	                           dists[0], dists[1], dists[2], dists[3]);
+
+	if(s->radius < dists[0])
+		return SPHEREOUTSIDE;
+	else if(s->radius < dists[1])
+		return SPHEREOUTSIDE;
+	else if(s->radius < dists[2])
+		return SPHEREOUTSIDE;
+	else if(s->radius < dists[3])
+		return SPHEREOUTSIDE;
+
+#endif
 	return res;
 }

--- a/vendor/librw/src/dc/rwdc.cpp
+++ b/vendor/librw/src/dc/rwdc.cpp
@@ -37,8 +37,6 @@ extern const char* currentFile;
 #include <functional>
 #include <fstream>

-#define ARRAY_SIZE(array)                (sizeof(array) / sizeof(array[0]))
-
 #define errorf(...) dbglog(DBG_CRITICAL, __VA_ARGS__)
 #define logf(...) // printf(__VA_ARGS__)
 bool re3RemoveLeastUsedModel();
@@ -47,21 +45,13 @@ void* re3StreamingAlloc(size_t size);

 // #include "rwdcimpl.h"

-#include <dc/pvr.h>
-#include <dc/matrix.h>
 #include "alloc.h"

-#undef PVR_TXRFMT_STRIDE
-#define PVR_TXRFMT_STRIDE       (1 << 25)
-
-static_assert(PVR_TXRFMT_STRIDE == (1 << 25), "PVR_TXRFMT_STRIDE is bugged in your KOS version");
+using namespace dc;

 // TODO: probably needs a better place to be
 bool doEnvironmentMaps = true;

-#define fclamp0_1(n) ((n) > 1.0f ? 1.0f : n < 0.0f ? 0.0f : n)
-#define fclamp1(n) ((n) > 1.0f ? 1.0f : n)
-
 struct alignas(32) pvr_vertex16_t {
 	uint32_t flags;			/**< \brief TA command (vertex flags) */
 	float	 x;				/**< \brief X coordinate */
@@ -170,177 +160,10 @@ struct alignas(32) pvr_vertex32_ut {
 static_assert(sizeof(pvr_vertex16_t) == 32, "pvr_vertex16_t size mismatch");
 static_assert(alignof(pvr_vertex16_t) == 32, "pvr_vertex16_t alignof mismatch");

-
-#define MATH_Fast_Invert(x) ({ (((x) < 0.0f)? -1.0f : 1.0f) * frsqrt((x) * (x)); }) 
-
 static pvr_dr_state_t drState;

-#include <kos/dbglog.h>
-
 float VIDEO_MODE_SCALE_X;

-#if !defined(DC_TEXCONV) && !defined(DC_SIM)
-#include <kos.h>
-
-#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 1.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
-    } while(false)
-
-#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 0.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
-    } while(false)
-
-#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 1.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        w = __w; \
-    } while(false)
-
-	// no declspec naked, so can't do rts / fschg. instead compiler pads with nop?
-
-	inline void rw_mat_load_3x3(const rw::Matrix* mtx) {
-		__asm__ __volatile__ (
-			R"(
-				fschg
-				frchg
-
-				fmov        @%[mtx]+, dr0
-
-				fldi0 		fr12
-				fldi0 		fr13
-
-				fmov        @%[mtx]+, dr2
-				fmov        @%[mtx]+, dr4
-				fmov        @%[mtx]+, dr6
-				fmov        @%[mtx]+, dr8
-				fmov        @%[mtx]+, dr10
-
-				fldi0	    fr3
-				fldi0	    fr7
-				fldi0	    fr11
-				fmov        dr12, dr14
-
-				fschg
-				frchg
-			)"
-			: [mtx] "+r" (mtx)
-		);
-	}
-
-	// sets pos.w to 1
-	inline void rw_mat_load_4x4(const rw::Matrix* mtx) {
-		__asm__ __volatile__ (
-			R"(
-				fschg
-				frchg
-				fmov        @%[mtx]+, dr0
-
-				fmov        @%[mtx]+, dr2
-				fmov        @%[mtx]+, dr4
-				fmov        @%[mtx]+, dr6
-				fmov        @%[mtx]+, dr8
-				fmov        @%[mtx]+, dr10
-				fmov        @%[mtx]+, dr12
-				fmov        @%[mtx]+, dr14
-				fldi1 	 	fr15
-
-				fschg
-				frchg
-			)"
-			: [mtx] "+r" (mtx)
-		);
-	}
-
-#else
-extern matrix_t XMTRX;
-
-void rw_mat_load_3x3(rw::Matrix* mtx) {
-	memcpy(XMTRX, mtx, sizeof(matrix_t));
-	XMTRX[0][3] = 0.0f;
-	XMTRX[1][3] = 0.0f;
-	XMTRX[2][3] = 0.0f;
-
-	XMTRX[3][0] = 0.0f;
-	XMTRX[3][1] = 0.0f;
-	XMTRX[3][2] = 0.0f;
-	XMTRX[3][3] = 0.0f;
-}
-
-void rw_mat_load_4x4(rw::Matrix* mtx) {
-	memcpy(XMTRX, mtx, sizeof(matrix_t));
-	XMTRX[3][3] = 1.0f;
-}
-
-#include <dc/matrix.h>
-#define frsqrt(a) 				(1.0f/sqrt(a))
-#define dcache_pref_block(a)	__builtin_prefetch(a)
-
-#ifndef __always_inline
-#define __always_inline 		__attribute__((always_inline)) inline
-#endif
-
-#ifdef DC_TEXCONV
-#define mat_transform(a, b, c, d)
-#define mat_apply(a)
-#define mat_load(a)
-#define mat_store(a)
-#define mat_identity(a)
-#define pvr_fog_table_color(a,r,g,b)
-#define pvr_fog_table_linear(s,e)
-#define pvr_fog_table_exp(d)
-#define pvr_fog_table_custom(d)
-#endif
-
-#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \
-		vector_t tmp = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp, &tmp, 1, 0); \
-		z2 = 1.0f / tmp.w; \
-		x2 = tmp.x * z2; \
-		y2 = tmp.y * z2; \
-	} while(false)
-
-#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
-		vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
-		x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
-	} while(false)
-
-#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \
-		vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \
-		mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
-		x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
-	} while(false)
-
-#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
-		vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
-		w_ = tmp1233123.w; \
-	} while(false)
-
-#define memcpy4 memcpy
-
-// END STUBS
-#endif
-
 static pvr_ptr_t fake_tex;

 alignas(4) static const uint16_t fake_tex_data[] = {
@@ -551,9 +374,9 @@ void DCE_MatrixViewport(float x, float y, float width, float height) {

 void DCE_InitMatrices() {
 	// Setup the screenview matrix.  Only need to do once since this matrix does not need to change for single player viewpoint.
-	mat_identity();
+	mat_identity2();
 	
-	mat_store(&DCE_MAT_SCREENVIEW);
+	mat_store2(&DCE_MAT_SCREENVIEW);
 }

 }
@@ -684,7 +507,7 @@ struct atomic_context_t {
 __always_inline void DCE_RenderSubmitVertex(const pvr_vertex_t *v, uint32_t flags) {
    auto *sq  = reinterpret_cast<uint32_t *>(pvr_dr_target(drState));
    auto *src = reinterpret_cast<const uint32_t *>(v);
-    float sz  = MATH_Fast_Invert(v->z);
+    float sz  = Invert<true, false>(v->z);
    float sx  = v->x * sz;
    float sy  = v->y * sz;
    
@@ -711,7 +534,7 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w,
 {
    auto *sq   = reinterpret_cast<uint32_t *>(pvr_dr_target(drState));
    auto *uv32 = reinterpret_cast<const uint32_t *>(uv);
-    float sz   = MATH_Fast_Invert(w);
+    float sz   = Invert<true, false>(w);
    float sx   = x * sz;
    float sy   = y * sz;

@@ -728,13 +551,6 @@ __always_inline void DCE_RenderSubmitVertexIM3D(float x, float y, float w,

 /* END TA Submission Functions*/

-
-
-
-#if defined(DC_TEXCONV)
-void malloc_stats() { }
-#endif
-
 #if 0
 #define UNIMPL_LOG() printf("TODO: Implement %s @ %s:%d\n", __func__, __FILE__, __LINE__);
 #define UNIMPL_LOGV(fmt, ...) printf("TODO: Implement %s @ %s:%d " fmt "\n", __func__, __FILE__, __LINE__, __VA_ARGS__);
@@ -808,9 +624,8 @@ void beginUpdate(Camera* cam)  {
 	
 	DCE_MatrixViewport(0, 0, cam->frameBuffer->width * VIDEO_MODE_SCALE_X, cam->frameBuffer->height);
 	
-	mat_load((matrix_t*)&DCE_MAT_SCREENVIEW);
-	mat_apply((matrix_t*)&cam->devProj);
-	mat_store((matrix_t*)&cam->devProjScreen);
+	mat_load_apply((matrix_t*)&DCE_MAT_SCREENVIEW, (matrix_t*)&cam->devProj);
+	mat_store2((matrix_t*)&cam->devProjScreen);

 }

@@ -1838,7 +1653,7 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert
 			pvrVert->flags = flags;
 			pvrVert->x 	   = gtaVert.x * VIDEO_MODE_SCALE_X;
 			pvrVert->y	   = gtaVert.y;
-			pvrVert->z 	   = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case...
+			pvrVert->z 	   = Invert<true, false>(gtaVert.w); // this is perfect for almost every case...
 			pvrVert->u 	   = gtaVert.u;
 			pvrVert->v 	   = gtaVert.v;
 			pvrVert->argb  = (gtaVert.a << 24) |
@@ -1959,7 +1774,7 @@ void im2DRenderIndexedPrimitive(PrimitiveType primType, void *vertices, int32 nu
 			pvrVert->flags = flags;
 			pvrVert->x 	   = gtaVert.x * VIDEO_MODE_SCALE_X;
 			pvrVert->y	   = gtaVert.y;
-			pvrVert->z 	   = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case...
+			pvrVert->z 	   = Invert<true, false>(gtaVert.w); // this is perfect for almost every case...
 			pvrVert->u 	   = gtaVert.u;
 			pvrVert->v 	   = gtaVert.v;
 			pvrVert->argb  = (gtaVert.a << 24) |
@@ -2022,8 +1837,8 @@ void im3DTransform(void *vertices, int32 numVertices, Matrix *worldMat, uint32 f
 	rw::RawMatrix::mult(&worldview, &world, &cam->devView);
 	rw::RawMatrix::mult(&proj, &worldview, &cam->devProj);
 	rw::RawMatrix::mult(&mtx, &proj, (RawMatrix*)&DCE_MAT_SCREENVIEW);
-	// mat_load(&DCE_MAT_SCREENVIEW);     // ~11 cycles.
-	mat_load(( matrix_t*)&mtx.right);  // Number of cycles: ~32.
+	// mat_load2(&DCE_MAT_SCREENVIEW);     // ~11 cycles.
+	mat_load2(( matrix_t*)&mtx.right);  // Number of cycles: ~32.
    if (im3dVertices) {
 		free(im3dVertices);
 	}
@@ -2117,7 +1932,7 @@ void im3DRenderIndexedPrimitive(PrimitiveType primType,

 			// assuming near plane is 0.0f
 			// gv1 is visible (posi), and gv2 is behind the plane (negative)
-			float t = (1.0f - gv1.position.z) * MATH_Fast_Invert(gv2.position.z - gv1.position.z);
+			float t = (1.0f - gv1.position.z) * Invert<true, true>(gv2.position.z - gv1.position.z);

 			pvr_vertex_t pvrVert; 

@@ -3395,13 +3210,13 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 		auto skinningWeightData = (uint8_t*)skinWeights;

 		if (!matrix0Identity) {
-			rw_mat_load_4x4(&skinMatrices[0]);
+			mat_load_4x4(&skinMatrices[0]);
 			if (small_xyz) {
 				mat_apply(&DCE_MESHLET_MAT_DECODE);
 			}
 		} else {
 			if (small_xyz) {
-				mat_load(&DCE_MESHLET_MAT_DECODE);
+				mat_load2(&DCE_MESHLET_MAT_DECODE);
 			}
 		}

@@ -3465,7 +3280,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 				break;
 			}

-			rw_mat_load_4x4(currentMatrix);
+			mat_load_4x4(currentMatrix);
 			if (small_xyz){
 				mat_apply(&DCE_MESHLET_MAT_DECODE);
 			}
@@ -3498,7 +3313,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 		auto skinningWeightData = (uint8_t*)skinWeights;

 		if (!matrix0Identity) {
-			rw_mat_load_3x3(&skinMatrices[0]);
+			mat_load_3x3(&skinMatrices[0]);
 		}

 		for(;;) {
@@ -3556,7 +3371,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 				break;
 			}

-			rw_mat_load_3x3(currentMatrix);
+			mat_load_3x3(currentMatrix);

 			do {
 				auto srcOffset = *skinningIndexData++;
@@ -3581,7 +3396,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 __attribute__((noinline))
 void tnlMeshletEnvMap(uint8_t* OCR, uint8_t* normal, int vertexCount, int vertexSize, matrix_t* matfxMatrix, float matfxCoefficient) {

-	mat_load(matfxMatrix);
+	mat_load2(matfxMatrix);

 	do {
 		pvr_vertex64_t* v = (pvr_vertex64_t*)OCR;
@@ -3998,15 +3813,12 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {

 	lightingCB(atomic, ac->uniform);

-
 	rw::RawMatrix world;
 	rw::convMatrix(&world, atomic->getFrame()->getLTM());
 	
-
-	mat_load((matrix_t*)&cam->devProjScreen);
-	mat_apply((matrix_t*)&cam->devView);
+	mat_load_apply((matrix_t*)&cam->devProjScreen, (matrix_t*)&cam->devView);
 	mat_apply((matrix_t*)&world);
-	mat_store((matrix_t*)&atomicContexts.back().mtx);
+	mat_store2((matrix_t*)&atomicContexts.back().mtx);

 	auto meshes = geo->meshHeader->getMeshes();

@@ -4245,15 +4057,14 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 						unsigned skinSelector = small_xyz + acp->skinMatrix0Identity*2;
 						tnlMeshletSkinVerticesSelector[skinSelector](OCR_SPACE, normalDst, &dcModel->data[meshlet->vertexOffset],  normalSrc, &dcModel->data[meshlet->skinWeightOffset], &dcModel->data[meshlet->skinIndexOffset], meshlet->vertexCount, meshlet->vertexSize, &acp->skinContextPointer->mtx);
 						
-						mat_load(&mtx);
+						mat_load2(&mtx);
 						tnlMeshletTransformSelector[clippingRequired * 2](OCR_SPACE, OCR_SPACE + 4, meshlet->vertexCount, 64);
 					} else {
 						
 						if (selector & 8) {
-							mat_load(&mtx);
-							mat_apply(&DCE_MESHLET_MAT_DECODE);
+							mat_load_apply(&mtx, &DCE_MESHLET_MAT_DECODE);
 						} else {
-							mat_load(&mtx);
+							mat_load2(&mtx);
 						}
 						tnlMeshletTransformSelector[smallSelector](OCR_SPACE, &dcModel->data[meshlet->vertexOffset], meshlet->vertexCount, meshlet->vertexSize);
 					}
@@ -4280,7 +4091,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {

 						unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a);
 						dce_set_mat_vertex_color(&residual, &material);
-						mat_load(&DCE_MESHLET_MAT_VERTEX_COLOR);
+						mat_load2(&DCE_MESHLET_MAT_VERTEX_COLOR);
 						tnlMeshletVertexColorSelector[0](OCR_SPACE + dstColOffset, (int8_t*)&dcModel->data[meshlet->vertexOffset] + colOffset, meshlet->vertexCount, meshlet->vertexSize);
 					} else {
 						unsigned dstColOffset = textured ? offsetof(pvr_vertex64_t, a) : offsetof(pvr_vertex32_ut, a);
@@ -4302,7 +4113,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {

 						
 						unsigned normalSelector = (pass1 - 1) + (skin != 0) * 4;
-						mat_load((matrix_t*)&uniformObject.dir[0][0][0]);
+						mat_load2((matrix_t*)&uniformObject.dir[0][0][0]);
 						auto normalPointer = &dcModel->data[meshlet->vertexOffset] + normalOffset;
 						auto vtxSize = meshlet->vertexSize;
 						if (skin) {
@@ -4317,7 +4128,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 					
 						if (pass2) {
 							unsigned normalSelector = (pass2 - 1) + (skin != 0) * 4;
-							mat_load((matrix_t*)&uniformObject.dir[1][0][0]);
+							mat_load2((matrix_t*)&uniformObject.dir[1][0][0]);
 							tnlMeshletDiffuseColorSelector[normalSelector](OCR_SPACE + dstColOffset, normalPointer, meshlet->vertexCount, vtxSize, &lightDiffuseColors[4]);
 						}
 					}
@@ -4380,7 +4191,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 					indices.back() |= 0x80;

 					pvr_vertex64_t *vd = (pvr_vertex64_t *)OCR_SPACE;
-					mat_load(&mtx);  // Number of cycles: ~11
+					mat_load2(&mtx);  // Number of cycles: ~11

 					for (int idx = 0; idx < geo->numVertices; idx++) {
 						auto& vert = vertices[idx];
@@ -4716,6 +4527,10 @@ imageFindRasterFormat(Image *img, int32 type,

 #if defined(_WIN32) || defined(_WIN64)
 #include <windows.h> // For QueryPerformanceCounter and GetCurrentThreadId
+#ifdef _WIN64
+#include <profileapi.h>
+#include <processthreadsapi.h>
+#endif
 #elif defined(__APPLE__)
 #include <mach/mach_time.h> // For mach_absolute_time
 #include <pthread.h>
--- a/vendor/librw/src/dc/rwdc_common.h
+++ b/vendor/librw/src/dc/rwdc_common.h
@@ -0,0 +1,739 @@
+#ifndef __RWDC_COMMON_H
+#define __RWDC_COMMON_H
+
+#include <tuple>
+#include <limits>
+#include <cmath>
+#include <type_traits>
+#include <algorithm>
+#include <cstring>
+
+#ifndef RW_DC
+#   pragma warning(disable: 4244)  // int to float
+#   pragma warning(disable: 4800)  // int to bool
+#   pragma warning(disable: 4838)  // narrowing conversion
+#   pragma warning(disable: 4996)  // POSIX names
+#else
+#   if !defined(DC_TEXCONV) && !defined(DC_SIM)
+#       include <kos.h>
+#       define DC_SH4
+#       define VIDEO_MODE_WIDTH     static_cast<float>(vid_mode->width)
+#       define VIDEO_MODE_HEIGHT    static_cast<float>(vid_mode->height)
+#       define memcpy4              memcpy
+#   else
+#       ifdef DC_TEXCONV
+#           define malloc_stats()
+#       endif
+#       include <dc/matrix.h>
+#       include <dc/pvr.h>
+#       include <kos/dbglog.h>
+#       define VIDEO_MODE_WIDTH     640.0f
+#       define VIDEO_MODE_HEIGHT    480.0f
+#       define memcpy4              memcpy
+#       define frsqrt(a)            (1.0f/sqrtf(a))
+#       define dcache_pref_block(a)	__builtin_prefetch(a)
+#       define F_PI                 M_PI
+#       ifndef __always_inline 
+#           define __always_inline  inline
+#       endif
+#   endif
+#   ifdef PVR_TXRFMT_STRIDE
+#       undef PVR_TXRFMT_STRIDE
+#       define PVR_TXRFMT_STRIDE (1 << 25)
+#   endif
+    static_assert(PVR_TXRFMT_STRIDE == (1 << 25), 
+                  "PVR_TXRFMT_STRIDE is bugged in your KOS version");
+#endif
+
+#define F_PI_2              (F_PI * 0.5f)
+#define __hot               __attribute__((hot))
+#define __cold              __attribute__((cold))
+#define __icache_aligned    __attribute__((aligned(32)))
+
+#define STRINGIFY(x)        #x
+#define STR(x)              STRINGIFY(x)
+#define CONCAT_(x,y)        x##y
+#define CONCAT(x,y)         CONCAT_(x,y)
+#define ARRAY_SIZE(array)   (sizeof(array) / sizeof(array[0]))
+
+namespace rw {
+    class Matrix;
+}
+
+namespace dc {
+
+struct quaternion_t {
+    float x, y, z, w;
+};
+
+__always_inline __hot constexpr float Sin(float x) { return sinf(x); }
+__always_inline __hot constexpr float Cos(float x) { return cosf(x); }
+__always_inline __hot constexpr auto  SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
+__always_inline __hot constexpr float Tan(float x) { return tanf(x); }
+__always_inline __hot constexpr float Atan(float x) { return atanf(x); }
+__always_inline __hot constexpr float Atan2(float y, float x) { return atan2f(y, x); }
+__always_inline __hot constexpr float Asin(float x) { return asinf(x);  }
+__always_inline __hot constexpr float Acos(float x) { return acosf(x); }
+__always_inline __hot constexpr float Abs(float x) { return fabsf(x); }
+__always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); }
+__always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); }
+__always_inline __hot constexpr float Pow(float x, float y) { return powf(x, y); }
+__always_inline __hot constexpr float Floor(float x) { return floorf(x); }
+__always_inline __hot constexpr float Ceil(float x) { return ceilf(x); }
+__always_inline __hot constexpr float Fmac(auto a, auto b, auto c) { return a * b + c; }
+__always_inline __hot constexpr float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); }
+__always_inline __hot constexpr auto Max(auto a, auto b) { return ((a > b)? a : b); }
+__always_inline __hot constexpr auto Min(auto a, auto b) { return ((a < b)? a : b); }
+
+template<bool CHECK_ZERO=false>
+__always_inline __hot constexpr float RecipSqrt(float x) { 
+    if constexpr(CHECK_ZERO)
+        if(x == 0.0f) 
+            return 0.0f;
+    
+    return 1.0f / Sqrt(x); 
+}
+
+template<typename T>
+__always_inline __hot constexpr T Clamp(T v, auto low, auto high) {
+    return std::clamp(v, static_cast<T>(low), static_cast<T>(high));
+}
+
+__always_inline constexpr auto Clamp2(auto v, auto center, auto radius) {
+    return (v > center) ? Min(v, center + radius) : Max(v, center - radius);
+}
+
+template<bool FAST_APPROX=true, bool COPY_SIGN=true>
+__always_inline __hot constexpr float Invert(float x) { 
+    float value;
+    
+    if(!std::is_constant_evaluated() && FAST_APPROX) {
+        value = RecipSqrt(x * x);
+
+        if constexpr(COPY_SIGN)
+            if(x < 0.0f)
+                value = -value;
+
+    } else value = 1.0f / x;
+
+    return value;
+} 
+
+template<bool FAST_APPROX=true, bool COPY_SIGN=true>
+__always_inline __hot constexpr float Div(float x, float y) { 
+    if(FAST_APPROX && !std::is_constant_evaluated())
+        return x * Invert<true, COPY_SIGN>(y);
+    else
+        return x / y;
+}
+
+template<bool FAST_APPROX=false, bool COPY_SIGN=true>
+__always_inline __hot constexpr auto Norm(auto value, auto min, auto max) {
+    auto numerator = Clamp(value, min, max) - min;
+    auto denominator = (max - min);
+
+    if(FAST_APPROX && !std::is_constant_evaluated())
+        return Div<true, COPY_SIGN>(numerator, denominator);
+    else
+        return numerator / denominator;
+}
+
+#ifdef RW_DC
+#   ifdef DC_SH4
+
+#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
+        register float __x __asm__("fr12") = (x); \
+        register float __y __asm__("fr13") = (y); \
+        register float __z __asm__("fr14") = (z); \
+        register float __w __asm__("fr15") = 1.0f; \
+        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
+                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
+        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
+    } while(false)
+
+#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \
+        register float __x __asm__("fr12") = (x); \
+        register float __y __asm__("fr13") = (y); \
+        register float __z __asm__("fr14") = (z); \
+        register float __w __asm__("fr15") = 0.0f; \
+        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
+                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
+        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
+    } while(false)
+
+#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
+        register float __x __asm__("fr12") = (x); \
+        register float __y __asm__("fr13") = (y); \
+        register float __z __asm__("fr14") = (z); \
+        register float __w __asm__("fr15") = 1.0f; \
+        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
+                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
+        w = __w; \
+    } while(false)
+
+#define mat_trans_vec4_nodiv_nomod(x, y, z, w, x2, y2, z2, w2) { \
+        register float __x __asm__("fr0") = (x); \
+        register float __y __asm__("fr1") = (y); \
+        register float __z __asm__("fr2") = (z); \
+        register float __w __asm__("fr3") = (w); \
+        __asm__ __volatile__( "ftrv  xmtrx, fv0\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
+                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
+        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
+    } while(false)
+
+inline __hot __icache_aligned void mat_load2(const matrix_t* mtx) {
+    asm volatile(
+        R"(
+            fschg
+            fmov.d	@%[mtx],xd0
+            add	    #32,%[mtx]
+            pref	@%[mtx]
+            add	    #-(32-8),%[mtx]
+            fmov.d	@%[mtx]+,xd2
+            fmov.d	@%[mtx]+,xd4
+            fmov.d	@%[mtx]+,xd6
+            fmov.d	@%[mtx]+,xd8
+            fmov.d	@%[mtx]+,xd10
+            fmov.d	@%[mtx]+,xd12
+            fmov.d	@%[mtx]+,xd14
+            fschg
+        )"
+        : [mtx] "+r" (mtx)
+        :
+        :
+    );
+}
+
+inline __hot __icache_aligned void mat_store2(matrix_t *mtx) {
+    asm volatile(
+        R"(
+            fschg
+            add	    #64-8,%[mtx]
+            fmov.d	xd14,@%[mtx]
+            add	    #-32,%[mtx]
+            pref	@%[mtx]
+            add	    #32,%[mtx]
+            fmov.d	xd12,@-%[mtx]
+            fmov.d	xd10,@-%[mtx]
+            fmov.d	xd8,@-%[mtx]
+            fmov.d	xd6,@-%[mtx]
+            fmov.d	xd4,@-%[mtx]
+            fmov.d	xd2,@-%[mtx]
+            fmov.d	xd0,@-%[mtx]
+            fschg
+        )"
+        : [mtx] "+&r" (mtx), "=m" (*mtx)
+        :
+        :
+    );
+}
+
+inline __hot __icache_aligned void mat_identity2(void) {
+    asm volatile(
+        R"(
+            frchg
+            fldi1	fr0
+            fschg
+            fldi0	fr1
+            fldi0	fr2
+            fldi0	fr3
+            fldi0	fr4
+            fldi1	fr5
+            fmov	dr2,dr6
+            fmov	dr2,dr8
+            fmov	dr0,dr10
+            fmov	dr2,dr12
+            fmov	dr4,dr14
+            fschg
+            frchg
+        )"
+    );
+}
+
+inline __hot __icache_aligned void mat_set_scale(float x, float y, float z) {
+    asm volatile(
+        R"(
+            frchg
+            fldi0	fr1
+            fschg
+            fldi0	fr2
+            fldi0	fr3
+            fldi0	fr4
+            fmov	dr2, dr6
+            fmov	dr2, dr8
+            fldi0	fr11
+            fmov	dr2, dr12
+            fldi0	fr14
+            fschg
+            frchg
+            fmov    %[x], xf0
+            fmov    %[y], xf5
+            fmov    %[z], xf10
+        )"
+        :
+        : [x] "r" (x), [y] "r" (y), [z] "r" (z)
+        :
+    );
+}
+
+// no declspec naked, so can't do rts / fschg. instead compiler pads with nop?
+inline __hot void mat_load_3x3(const rw::Matrix* mtx) {
+    __asm__ __volatile__ (
+        R"(
+            fschg
+            frchg
+
+            fmov        @%[mtx]+, dr0
+
+            fldi0 		fr12
+            fldi0 		fr13
+
+            fmov        @%[mtx]+, dr2
+            fmov        @%[mtx]+, dr4
+            fmov        @%[mtx]+, dr6
+            fmov        @%[mtx]+, dr8
+            fmov        @%[mtx]+, dr10
+
+            fldi0	    fr3
+            fldi0	    fr7
+            fldi0	    fr11
+            fmov        dr12, dr14
+
+            fschg
+            frchg
+        )"
+        : [mtx] "+r" (mtx)
+    );
+}
+
+// sets pos.w to 1
+inline __hot void mat_load_4x4(const rw::Matrix* mtx) {
+		__asm__ __volatile__ (
+			R"(
+				fschg
+				frchg
+				fmov        @%[mtx]+, dr0
+
+				fmov        @%[mtx]+, dr2
+				fmov        @%[mtx]+, dr4
+				fmov        @%[mtx]+, dr6
+				fmov        @%[mtx]+, dr8
+				fmov        @%[mtx]+, dr10
+				fmov        @%[mtx]+, dr12
+				fmov        @%[mtx]+, dr14
+				fldi1 	 	fr15
+
+				fschg
+				frchg
+			)"
+			: [mtx] "+r" (mtx)
+		);
+	}
+
+__hot inline void mat_transpose(void) {
+    asm volatile (
+        "frchg\n\t" // fmov for singles only works on front bank
+        // FR0, FR5, FR10, and FR15 are already in place
+        // swap FR1 and FR4
+        "flds FR1, FPUL\n\t"
+        "fmov FR4, FR1\n\t"
+        "fsts FPUL, FR4\n\t"
+        // swap FR2 and FR8
+        "flds FR2, FPUL\n\t"
+        "fmov FR8, FR2\n\t"
+        "fsts FPUL, FR8\n\t"
+        // swap FR3 and FR12
+        "flds FR3, FPUL\n\t"
+        "fmov FR12, FR3\n\t"
+        "fsts FPUL, FR12\n\t"
+        // swap FR6 and FR9
+        "flds FR6, FPUL\n\t"
+        "fmov FR9, FR6\n\t"
+        "fsts FPUL, FR9\n\t"
+        // swap FR7 and FR13
+        "flds FR7, FPUL\n\t"
+        "fmov FR13, FR7\n\t"
+        "fsts FPUL, FR13\n\t"
+        // swap FR11 and FR14
+        "flds FR11, FPUL\n\t"
+        "fmov FR14, FR11\n\t"
+        "fsts FPUL, FR14\n\t"
+        // restore XMTRX to back bank
+        "frchg\n"
+        : // no outputs
+        : // no inputs
+        : "fpul" // clobbers
+    );
+}
+
+__hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src) {
+    asm volatile(R"(
+        fschg
+
+        pref    @%[dst]
+
+        fmov.d  @%[src]+, xd0
+        fmov.d  @%[src]+, xd2
+        fmov.d  @%[src]+, xd4
+        fmov.d  @%[src]+, xd6
+    
+        pref    @%[src]
+
+        add     #32, %[dst]
+
+        fmov.d  xd6, @-%[dst]
+        fmov.d  xd4, @-%[dst]
+        fmov.d  xd2, @-%[dst]
+        fmov.d  xd0, @-%[dst]
+
+        add     #32, %[dst]
+        pref    @%[dst]
+
+        fmov.d  @%[src]+, xd0
+        fmov.d  @%[src]+, xd2
+        fmov.d  @%[src]+, xd4
+        fmov.d  @%[src]+, xd6
+
+        add     #32, %[dst]
+
+        fmov.d  xd6, @-%[dst]
+        fmov.d  xd4, @-%[dst]
+        fmov.d  xd2, @-%[dst]
+        fmov.d  xd0, @-%[dst]
+
+        fschg
+    )": [dst] "+&r" (dst), [src] "+&r" (src), "=m" (*dst)
+      :
+      :);
+}
+
+//TODO: FIXME FOR VC (AND USE FTRV)
+template<bool FAST_APPROX=false>
+__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
+    if(FAST_APPROX && !std::is_constant_evaluated()) {
+    /*
+        // reorder the coefficients so that q1 stays in constant order {x,y,z,w}
+        // q2 then needs to be rotated after each inner product
+        x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
+        y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
+        z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
+        w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
+    */
+        // keep q1 in fv4
+        register float q1x __asm__ ("fr4") = (q1.x);
+        register float q1y __asm__ ("fr5") = (q1.y);
+        register float q1z __asm__ ("fr6") = (q1.z);
+        register float q1w __asm__ ("fr7") = (q1.w);
+
+        // load q2 into fv8, use it to get the shuffled reorder into fv0
+        register float q2x __asm__ ("fr8")  = (q2.x);
+        register float q2y __asm__ ("fr9")  = (q2.y);
+        register float q2z __asm__ ("fr10") = (q2.z);
+        register float q2w __asm__ ("fr11") = (q2.w);
+
+        // temporary operand / result in fv0
+        register float t1x __asm__ ("fr0");
+        register float t1y __asm__ ("fr1");
+        register float t1z __asm__ ("fr2");
+        register float t1w __asm__ ("fr3");
+
+        // x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
+        t1x = q2w;
+        t1y = q2z;
+        t1z = -q2y;
+        t1w = q2w;
+        __asm__ ("\n"
+            " fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        // x = t1w;  try to avoid the stall by not reading the fipr result immediately
+
+        // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
+        t1x = -q2z;
+        t1y = q2w;
+        t1z = q2x;
+        __atomic_thread_fence(1);
+        r->x = t1w;   // get previous result
+        t1w = q2y;
+        __asm__ ("\n"
+            "	fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        //y = t1w;
+
+        // z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
+        t1x = q2y;
+        t1y = -q2x;
+        t1z = q2w;
+        __atomic_thread_fence(1);
+        r->y = t1w;   // get previous result
+        t1w = q2z;
+        __asm__ ("\n"
+            "	fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        //z = t1w;
+        __atomic_thread_fence(1);
+
+        // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
+        q2x = -q2x;
+        q2y = -q2y;
+        q2z = -q2z;
+        __asm__ ("\n"
+            "	fipr	fv4,fv8\n"
+            : "+f" (q2w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (q2x), "f" (q2y), "f" (q2z)
+        );
+
+        __atomic_thread_fence(1);
+        r->z = t1w;
+        __atomic_thread_fence(1);
+        r->w = q2w;
+    } else {
+        r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
+        r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
+        r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
+        r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+    }
+}
+
+__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
+    unsigned int prefetch_scratch;
+
+    asm volatile (
+        "mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
+        "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
+        "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
+        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
+        // back matrix
+        "fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
+        "fmov.d @%[bmtrx]+, XD2\n\t"
+        "fmov.d @%[bmtrx]+, XD4\n\t"
+        "fmov.d @%[bmtrx]+, XD6\n\t"
+        "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
+        "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
+        "fmov.d @%[bmtrx]+, XD10\n\t"
+        "fmov.d @%[bmtrx]+, XD12\n\t"
+        "mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
+        "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
+        "fmov.d @%[bmtrx], XD14\n\t"
+        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
+        // front matrix
+        // interleave loads and matrix multiply 4x4
+        "fmov.d @%[fmtrx]+, DR0\n\t"
+        "fmov.d @%[fmtrx]+, DR2\n\t"
+        "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
+        "ftrv XMTRX, FV0\n\t" // (FE)
+
+        "fmov.d @%[fmtrx]+, DR6\n\t"
+        "fmov.d @%[fmtrx]+, DR8\n\t"
+        "ftrv XMTRX, FV4\n\t"
+
+        "fmov.d @%[fmtrx]+, DR10\n\t"
+        "fmov.d @%[fmtrx]+, DR12\n\t"
+        "ftrv XMTRX, FV8\n\t"
+
+        "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
+        "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
+        "ftrv XMTRX, FV12\n\t" // (FE)
+        // Save output in XF regs
+        "frchg\n"
+        : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
+        : // no inputs
+        : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
+    );
+}
+
+__hot inline void mat_apply_rotate_x(float x) {
+    x *= 10430.37835f;
+    asm volatile(
+        "ftrc	%[a], fpul\n\t"
+        "fsca 	fpul, dr4\n\t"
+        "fldi0	fr8\n\t"
+        "fldi0	fr11\n\t"
+        "fmov	fr5, fr10\n\t"
+        "fmov	fr4, fr9\n\t"
+        "fneg	fr9\n\t"
+        "ftrv	xmtrx, fv8\n\t"
+        "fmov	fr4, fr6\n\t"
+        "fldi0	fr7\n\t"
+        "fldi0	fr4\n\t"
+        "ftrv	xmtrx, fv4\n\t"
+        "fschg\n\t"
+        "fmov	dr8, xd8\n\t"
+        "fmov	dr10, xd10\n\t"
+        "fmov	dr4, xd4\n\t"
+        "fmov	dr6, xd6\n\t"
+        "fschg\n"
+        :
+        : [a] "f"(x)
+        : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+}
+
+__hot inline void mat_apply_rotate_y(float y) {
+    y *= 10430.37835f;
+    asm volatile(
+        "ftrc	%[a], fpul\n\t"
+        "fsca   fpul, dr6\n\t"
+        "fldi0	fr9\n\t"
+        "fldi0	fr11\n\t"
+        "fmov	fr6, fr8\n\t"
+        "fmov	fr7, fr10\n\t"
+        "ftrv	xmtrx, fv8\n\t"
+        "fmov	fr7, fr4\n\t"
+        "fldi0	fr5\n\t"
+        "fneg	fr6\n\t"
+        "fldi0	fr7\n\t"
+        "ftrv	xmtrx, fv4\n\t"
+        "fschg\n\t"
+        "fmov	dr8, xd8\n\t"
+        "fmov	dr10, xd10\n\t"
+        "fmov	dr4, xd0\n\t"
+        "fmov	dr6, xd2\n\t"
+        "fschg\n"
+        :
+        : [a] "f"(y)
+        : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+}
+
+__hot inline void mat_apply_rotate_z(float z) {
+    z *= 10430.37835f;
+    asm volatile(
+        "ftrc	%[a], fpul\n\t"
+        "fsca   fpul, dr8\n\t"
+        "fldi0	fr10\n\t"
+        "fldi0	fr11\n\t"
+        "fmov	fr8, fr5\n\t"
+        "fneg	fr8\n\t"
+        "ftrv	xmtrx, fv8\n\t"
+        "fmov	fr9, fr4\n\t"
+        "fschg\n\t"
+        "fmov	dr10, dr6\n\t"
+        "ftrv	xmtrx, fv4\n\t"
+        "fmov	dr8, xd4\n\t"
+        "fmov	dr10, xd6\n\t"
+        "fmov	dr4, xd0\n\t"
+        "fmov	dr6, xd2\n\t"
+        "fschg\n"
+        :
+        : [a] "f"(z)
+        : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+}
+
+#   else
+#       ifdef DC_TEXCONV
+#           define mat_apply(a)
+#           define mat_load2(a)
+#           define mat_store2(a)
+#           define mat_identity2()
+#           define mat_transform(a, b, c, d)
+#           define pvr_fog_table_color(a,r,g,b)
+#           define pvr_fog_table_linear(s,e)
+#       else
+#           define mat_load2(a)    mat_load(a)
+#           define mat_store2(a)   mat_store(a)
+#           define mat_identity2() mat_identity()
+#       endif
+
+#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \
+        vector_t tmp = { x_, y_, z_, 1.0f }; \
+        mat_transform(&tmp, &tmp, 1, 0); \
+        z2 = 1.0f / tmp.w; \
+        x2 = tmp.x * z2; \
+        y2 = tmp.y * z2; \
+    } while(false)
+
+#define mat_trans_single3_nodiv_nomod(x_, y_, z_, x2, y2, z2) do { \
+        vector_t tmp = { x_, y_, z_, 1.0f }; \
+        mat_transform(&tmp, &tmp, 1, 0); \
+        z2 = tmp.z; \
+        x2 = tmp.x; \
+        y2 = tmp.y; \
+    } while(false)
+
+#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
+        vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
+        mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
+        x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
+    } while(false)
+
+#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \
+        vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \
+        mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
+        x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
+    } while(false)
+
+#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
+        vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
+        mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
+        w_ = tmp1233123.w; \
+    } while(false)
+
+inline void mat_load_3x3(const rw::Matrix* mtx) {
+    memcpy(XMTRX, mtx, sizeof(matrix_t));
+    XMTRX[0][3] = 0.0f;
+    XMTRX[1][3] = 0.0f;
+    XMTRX[2][3] = 0.0f;
+
+    XMTRX[3][0] = 0.0f;
+    XMTRX[3][1] = 0.0f;
+    XMTRX[3][2] = 0.0f;
+    XMTRX[3][3] = 0.0f;
+}
+
+inline void mat_load_4x4(const rw::Matrix* mtx) {
+    memcpy(XMTRX, mtx, sizeof(matrix_t));
+    XMTRX[3][3] = 1.0f;
+}
+
+inline void mat_transpose(void) {
+    matrix_t tmp;
+
+    for(unsigned i = 0; i < 4; ++i)
+        for(unsigned j = 0; j < 4; ++j)
+            tmp[j][i] = XMTRX[i][j];
+    
+    mat_load(&tmp);
+}
+
+template<bool FAST_APPROX=true>
+__hot inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
+    r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
+    r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
+    r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
+    r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+}
+
+__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
+    mat_load(matrix1);
+    mat_apply(matrix2);
+}
+
+__always_inline __hot void mat_copy(matrix_t *dst, const matrix_t *src) {
+    mat_load(src);
+    mat_store(dst);
+}
+
+#endif
+
+__hot inline void mat_mult(matrix_t *out, const matrix_t* matrix1, const matrix_t* matrix2) {
+    mat_load_apply(matrix1, matrix2);
+    mat_store2(out);
+}
+
+#endif
+
+}
+
+#endif /* RWDC_COMMON_H */
--- a/vendor/librw/src/rwbase.h
+++ b/vendor/librw/src/rwbase.h
@@ -1,6 +1,9 @@
 #pragma once

 #include "common_defines.h"
+#ifdef RW_DC
+#include "rwdc_common.h"
+#endif

 #ifndef RW_PS2
 #include <stdint.h>
@@ -253,11 +256,25 @@ inline V3d neg(const V3d &a) { return makeV3d(-a.x, -a.y, -a.z); }
 inline V3d add(const V3d &a, const V3d &b) { return makeV3d(a.x+b.x, a.y+b.y, a.z+b.z); }
 inline V3d sub(const V3d &a, const V3d &b) { return makeV3d(a.x-b.x, a.y-b.y, a.z-b.z); }
 inline V3d scale(const V3d &a, float32 r) { return makeV3d(a.x*r, a.y*r, a.z*r); }
-inline float32 length(const V3d &v) { return sqrtf(v.x*v.x + v.y*v.y + v.z*v.z); }
+inline float32 length(const V3d &v) {
+#ifndef DC_SH4
+	return sqrtf(v.x*v.x + v.y*v.y + v.z*v.z);
+#else
+	float len;
+	vec3f_length(v.x, v.y, v.z, len);
+	return len;
+#endif
+}
 inline V3d normalize(const V3d &v) { return scale(v, 1.0f/length(v)); }
 inline V3d setlength(const V3d &v, float32 l) { return scale(v, l/length(v)); }
 V3d cross(const V3d &a, const V3d &b);
-inline __attribute__((always_inline)) float32 dot(const V3d &a, const V3d &b) { return a.x*b.x + a.y*b.y + a.z*b.z; }
+inline __attribute__((always_inline)) float32 dot(const V3d &a, const V3d &b) {
+#ifdef DC_SH4
+	return fipr(a.x, a.y, a.z, 0.0f, b.x, b.y, b.z, 0.0f);
+#else
+	return a.x*b.x + a.y*b.y + a.z*b.z;
+#endif
+}
 inline V3d lerp(const V3d &a, const V3d &b, float32 r){
 	return makeV3d(a.x + r*(b.x - a.x),
 	               a.y + r*(b.y - a.y),
@@ -300,9 +317,21 @@ inline Quat makeQuat(float32 w, const V3d &vec) { Quat q = { vec.x, vec.y, vec.z
 inline Quat add(const Quat &q, const Quat &p) { return makeQuat(q.w+p.w, q.x+p.x, q.y+p.y, q.z+p.z); }
 inline Quat sub(const Quat &q, const Quat &p) { return makeQuat(q.w-p.w, q.x-p.x, q.y-p.y, q.z-p.z); }
 inline Quat negate(const Quat &q) { return makeQuat(-q.w, -q.x, -q.y, -q.z); }
-inline float32 dot(const Quat &q, const Quat &p) { return q.w*p.w + q.x*p.x + q.y*p.y + q.z*p.z; }
+inline float32 dot(const Quat &q, const Quat &p) {
+#ifdef DC_SH4
+	return fipr(q.x, q.y, q.z, q.w, p.x, p.y, p.z, p.w);
+#else
+	return q.w*p.w + q.x*p.x + q.y*p.y + q.z*p.z;
+#endif
+}
 inline Quat scale(const Quat &q, float32 r) { return makeQuat(q.w*r, q.x*r, q.y*r, q.z*r); }
-inline float32 length(const Quat &q) { return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z); }
+inline float32 length(const Quat &q) {
+#ifndef DC_SH4
+	return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z);
+#else
+	return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w));
+#endif
+}
 inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); }
 inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); }
 Quat mult(const Quat &q, const Quat &p);
Author	SHA1	Message	Date
Stefanos Kornilios Mitsis Poiitidis	fa84fefa06	Merge branch 'falco/dreamsdk4_fixes' into 'main' Some checks failed re3 conan+cmake / build-cmake (openal, glfw, macos-latest, gl3) (push) Has been cancelled Details re3 conan+cmake / build-cmake (openal, glfw, ubuntu-18.04, gl3) (push) Has been cancelled Details re3 conan+cmake / build-cmake (openal, glfw, windows-latest, gl3) (push) Has been cancelled Details re3 conan+cmake / build-cmake (openal, windows-latest, d3d9) (push) Has been cancelled Details re3 cmake devkitA64 (Nintendo Switch) / build-nintendo-switch (push) Has been cancelled Details re3 premake amd64 / build (Debug, win-amd64-librw_d3d9-oal) (push) Has been cancelled Details re3 premake amd64 / build (Debug, win-amd64-librw_gl3_glfw-oal) (push) Has been cancelled Details re3 premake amd64 / build (Release, win-amd64-librw_d3d9-oal) (push) Has been cancelled Details re3 premake amd64 / build (Release, win-amd64-librw_gl3_glfw-oal) (push) Has been cancelled Details re3 premake x86 / build (Debug, win-x86-librw_d3d9-mss) (push) Has been cancelled Details re3 premake x86 / build (Debug, win-x86-librw_d3d9-oal) (push) Has been cancelled Details re3 premake x86 / build (Debug, win-x86-librw_gl3_glfw-mss) (push) Has been cancelled Details re3 premake x86 / build (Debug, win-x86-librw_gl3_glfw-oal) (push) Has been cancelled Details re3 premake x86 / build (Release, win-x86-librw_d3d9-mss) (push) Has been cancelled Details re3 premake x86 / build (Release, win-x86-librw_d3d9-oal) (push) Has been cancelled Details re3 premake x86 / build (Release, win-x86-librw_gl3_glfw-mss) (push) Has been cancelled Details re3 premake x86 / build (Release, win-x86-librw_gl3_glfw-oal) (push) Has been cancelled Details re3 premake x86 / build (Vanilla, win-x86-librw_d3d9-mss) (push) Has been cancelled Details re3 premake x86 / build (Vanilla, win-x86-librw_d3d9-oal) (push) Has been cancelled Details re3 premake x86 / build (Vanilla, win-x86-librw_gl3_glfw-mss) (push) Has been cancelled Details re3 premake x86 / build (Vanilla, win-x86-librw_gl3_glfw-oal) (push) Has been cancelled Details Fixed builds for DreamSDK4. See merge request skmp/dca3-game!102	2025-10-02 17:30:23 +00:00
Falco	0b7892665e	Fixed builds for DreamSDK4. - Both Miami and Liberty fixed. - FUNDAMENTAL issues with data types not being configured correctly for 64-bit Windows - More issues with <windows.h> and other files not being included properly for _WIN64 - Successfully baked and tested both liberty and miami CDIs in Flycast	2025-07-15 16:55:44 -05:00
Stefanos Kornilios Mitsis Poiitidis	fa563ee805	Merge branch 'falco/dreamsdk_build_fixes' into 'main' Fixed DreamSDK Builds (liberty + miami). See merge request skmp/dca3-game!101	2025-06-15 16:56:46 +00:00
Falco Girgis	81fa320f1e	Fixed DreamSDK Builds (liberty + miami). Looks like DreamSDK was shitting the bed on both games' "fake.cpp." Turns out there were two headers that typedef'd "uint32," so the compiiler would freak out and claim it was an ambigious typedef if it saw it used within that translation unit. - Changed references to "uint32" to "uint32_t" to no longer be ambiguous.	2025-06-15 11:36:40 -05:00
Stefanos Kornilios Mitsis Poiitidis	449be62e98	Merge branch 'skmp/refsw-and-kos-fixes' into 'main' refsw and kos fixes See merge request skmp/dca3-game!100	2025-05-07 07:30:02 +00:00
Stefanos Kornilios Mitsis Poiitidis	e4814f3e51	hlekos: Fix for semaphore_t collision	2025-05-07 10:18:05 +03:00
Stefanos Kornilios Mitsis Poiitidis	edeee9fd4f	refsw2: Fix for presort mode	2025-05-07 10:17:35 +03:00
Stefanos Kornilios Mitsis Poiitidis	15fdaecfc7	Merge branch 'skmp/update-refsw2-tacore' into 'main' koshle: Update refsw2 and tacore See merge request skmp/dca3-game!99	2025-05-06 20:20:18 +00:00
Stefanos Kornilios Mitsis Poiitidis	5316a3572e	koshle: Update refsw2 and tacore	2025-05-06 22:20:42 +03:00
Stefanos Kornilios Mitsis Poiitidis	1f2f270da9	Merge branch 'falco/gainz_phase_3' into 'main' Liberty/Miami Perf Gainz Phase 3 See merge request skmp/dca3-game!97	2025-04-26 17:00:40 +00:00
Falco Girgis	1f4dace511	Fixing simulator builds. Ugh. Stupid typo. Accidentally defined mat_store2() to mat_store2(). :(	2025-04-26 11:36:49 -05:00
Falco Girgis	fc4a7e3791	Fixing simulator builds. - Had forgotten to define mat_load2(), mat_store2(), mat_identity2() for simulator builds in rwdc_common.h. - Just defining them to use the regular versions.	2025-04-26 11:28:08 -05:00
Falco Girgis	04b11dfb9d	Synchronized dca3-kos with main branch. Which is where KOS's Gainz Phase 3 work now resides.	2025-04-26 11:15:07 -05:00
Falco Girgis	2716147db4	Everything tested/working: liberty/miami + gainz!	2025-04-26 10:52:36 -05:00
Falco Girgis	dc96ffc551	FIXED LIBERTY BOAT ROTATION ISSUES.	2025-04-15 13:53:19 -05:00
Falco Girgis	4d13e821b5	Liberty/Miami Perf Gainz Phase 3 1) synced dca3-kos repo which has some gainzy commits 2) rwdc_common.h - all low-level and matrix/vector routines for SH4 are now shared in this common file, included in both RW and Liberty/Miami engines 3) CMatrix a. assignment operator: now uses asm-optimized mat_copy() b. multiplication operator: now use mat_mult() SH4 routine c. Scale(): applies a scale matrix via mat_scale d. MultiplyInverse: fipr-optimizations 4) CQuaternion a. multiplication: SH4 ASM FIPR optimized b. Get(V3d& axis, float &angle): fast inversion/division c. Set(RWMatrix&): fast division 5) CVector a. Multiply3x3() now accelerated with mat_transpose 5) RwQuat a. mult(): FIPR accelerated b. length(): FIPR/FSRRA accelerated	2025-04-15 12:06:12 -05:00
Stefanos Kornilios Mitsis Poiitidis	cca7b3c6fa	Merge branch 'falco/gainz_phase_2' into 'main' Phase 2 of Liberty/Mmiami engine+RW SH4 math See merge request skmp/dca3-game!96	2025-04-04 17:26:28 +00:00
Stefanos Kornilios Mitsis Poiitidis	dbcc46b774	Merge branch 'frogbull/fix-not-assigned-hints' into 'main' Fix for "Not assigned" in two Hints (Answer cell phone and replace melee weapon) See merge request skmp/dca3-game!95	2025-04-04 17:25:15 +00:00
Falco Girgis	92be9cfbdd	Phase 2 of liberty/miami engine+RW SH4 math. This PR is the second batch of SH4-specific optimizations for key linear algebra routines within both the RW layer and liberty + miami engines. It also includes a bunch of routines that were accelerated within liberty but were never added to miami, which has helped contribute to the lower perf in Miami. 1) CQuaternion - magnitude, squared magnitude, and dot product use SH4 instructions now 2) CVector (Miami port from Liberty) - Multiply3x3, magnitude, squared magnitude, dot product, distance use SH4 instructions 3) rwbase.h - vector3 length, vector3 dot product, and quaternion dot product use SH4 instructions	2025-04-02 22:39:53 -05:00
Frogbull	7bb5e1640e	Fix for "Not assigned" in two Hints (Answer cell phone and replace melee weapon)	2025-04-01 23:18:19 +02:00