Saving state.

Merge remote-tracking branch 'origin/main' into falco/sh4_math_collision_physics
Removec uneccessary 64-byte align for RwMatrix.
2025-03-27 15:17:11 -05:00 · 2025-03-17 18:52:34 -05:00 · 2025-03-15 13:20:58 -05:00 · 2025-03-15 00:27:15 -05:00 · 2025-03-14 23:33:34 -05:00 · 2025-03-14 11:56:28 -05:00
36 changed files with 1663 additions and 1192 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,151 +0,0 @@
-{
-  "C_Cpp.default.cStandard": "gnu11",
-  "C_Cpp.default.cppStandard": "gnu++14",
-  "C_Cpp.default.includePath": [
-    "src",
-    "src/animation",
-    "src/audio",
-    "src/audio/eax",
-    "src/audio/oal",
-    "src/buildings",
-    "src/collision",
-    "src/control",
-    "src/core",
-    "src/entities",
-    "src/extras",
-    "src/fakerw",
-    "src/math",
-    "src/modelinfo",
-    "src/objects",
-    "src/peds",
-    "src/renderer",
-    "src/rw",
-    "src/save/",
-    "src/skel/",
-    "src/skel/glfw",
-    "src/text",
-    "src/vehicles",
-    "src/weapons",
-    "vendor/librw"
-  ],
-  "C_Cpp.vcFormat.indent.gotoLabels": "leftmostColumn",
-  "C_Cpp.vcFormat.space.pointerReferenceAlignment": "right",
-  "cSpell.enabled": false,
-  "files.trimFinalNewlines": false,
-  "files.trimTrailingWhitespace": false,
-  "files.associations": {
-    "system_error": "cpp",
-    "xlocale": "cpp",
-    "xiosbase": "cpp",
-    "memory": "cpp",
-    "mutex": "cpp",
-    "xmemory": "cpp",
-    "deque": "cpp",
-    "initializer_list": "cpp",
-    "list": "cpp",
-    "queue": "cpp",
-    "type_traits": "cpp",
-    "vector": "cpp",
-    "xhash": "cpp",
-    "xstring": "cpp",
-    "xutility": "cpp",
-    "iosfwd": "cpp",
-    "algorithm": "cpp",
-    "atomic": "cpp",
-    "iterator": "cpp",
-    "xtree": "cpp",
-    "ios": "cpp",
-    "*.inc": "cpp",
-    "cstdlib": "cpp",
-    "ratio": "cpp",
-    "array": "cpp",
-    "functional": "cpp",
-    "tuple": "cpp",
-    "utility": "cpp",
-    "*.tcc": "cpp",
-    "unordered_map": "cpp",
-    "chrono": "cpp",
-    "bit": "cpp",
-    "cctype": "cpp",
-    "cinttypes": "cpp",
-    "clocale": "cpp",
-    "cmath": "cpp",
-    "compare": "cpp",
-    "concepts": "cpp",
-    "condition_variable": "cpp",
-    "csignal": "cpp",
-    "cstdarg": "cpp",
-    "cstddef": "cpp",
-    "cstdint": "cpp",
-    "cstdio": "cpp",
-    "cstring": "cpp",
-    "ctime": "cpp",
-    "cwchar": "cpp",
-    "cwctype": "cpp",
-    "map": "cpp",
-    "set": "cpp",
-    "string": "cpp",
-    "exception": "cpp",
-    "memory_resource": "cpp",
-    "numeric": "cpp",
-    "random": "cpp",
-    "string_view": "cpp",
-    "fstream": "cpp",
-    "iostream": "cpp",
-    "istream": "cpp",
-    "limits": "cpp",
-    "new": "cpp",
-    "numbers": "cpp",
-    "ostream": "cpp",
-    "semaphore": "cpp",
-    "sstream": "cpp",
-    "stdexcept": "cpp",
-    "stop_token": "cpp",
-    "streambuf": "cpp",
-    "thread": "cpp",
-    "typeinfo": "cpp",
-    "codecvt": "cpp",
-    "iomanip": "cpp",
-    "format": "cpp",
-    "__locale": "cpp",
-    "optional": "cpp",
-    "span": "cpp",
-    "__bit_reference": "cpp",
-    "__config": "cpp",
-    "__hash_table": "cpp",
-    "__node_handle": "cpp",
-    "__split_buffer": "cpp",
-    "__threading_support": "cpp",
-    "__tree": "cpp",
-    "__verbose_abort": "cpp",
-    "bitset": "cpp",
-    "charconv": "cpp",
-    "complex": "cpp",
-    "execution": "cpp",
-    "locale": "cpp",
-    "variant": "cpp",
-    "text_encoding": "cpp",
-    "stack": "cpp",
-    "unordered_set": "cpp",
-    "shared_mutex": "cpp",
-    "forward_list": "cpp",
-    "cholesky": "cpp",
-    "core": "cpp",
-    "eigenvalues": "cpp",
-    "geometry": "cpp",
-    "householder": "cpp",
-    "jacobi": "cpp",
-    "lu": "cpp",
-    "qr": "cpp",
-    "svd": "cpp",
-    "print": "cpp",
-    "strstream": "cpp",
-    "regex": "cpp",
-    "cassert": "cpp",
-    "barrier": "cpp",
-    "coroutine": "cpp",
-    "future": "cpp",
-    "latch": "cpp",
-    "syncstream": "cpp"
-  }
-}
--- a/liberty/Makefile
+++ b/liberty/Makefile
@@ -96,20 +96,24 @@ OBJS_TEXCONV += \
 	../vendor/librw/src/d3d-x/d3d8render.texconv.o \
 	../vendor/librw/src/bmp.texconv.o \
 	../vendor/librw/src/png.texconv.o \
-	../vendor/librw/src/lodepng/lodepng.texconv.o	
+	../vendor/librw/src/lodepng/lodepng.texconv.o

 # Add compilation units to this list to explicity compile them with
 # -O3 optimizations, while the rest get the default (-Os) treatment
 # to conserve RAM.
 OBJS_O3 = \
 	../vendor/librw/src/dc/rwdc.o \
-	../src/liberty/core/World.o \
-	../src/liberty/collision/Collision.o \
-	../src/liberty/math/math.o \
-	../src/liberty/math/Matrix.o \
-	../src/liberty/math/Quaternion.o \
-	../src/liberty/math/Rect.o \
-	../src/liberty/math/Vector.o \
+	../src/core/World.o \
+	../src/core/ZoneCull.o \
+	../src/entities/Physical.o \
+	../src/collision/Collision.o \
+	../src/peds/PedAI.o \
+	../src/render/Renderer.o \
+	../src/math/math.o \
+	../src/math/Matrix.o \
+	../src/math/Quaternion.o \
+	../src/math/Rect.o \
+	../src/math/Vector.o \
 	../vendor/librw/src/base.o \
 	../src/liberty/renderer/Shadows.o

@@ -138,7 +142,7 @@ CXXFLAGS += $(if $(WITH_32MB),-O3,-Os) \
 	-Wno-multichar -Wno-unused-value -Wno-char-subscripts -Wno-reorder \
 	-Wno-unused-function -Wno-class-memaccess -fno-permissive \
 	-fno-asynchronous-unwind-tables -fno-enforce-eh-specs -fno-non-call-exceptions \
-	-fno-strict-aliasing -fwrapv
+	-fno-strict-aliasing -fwrapv -fno-PIC

 clean-texconv:
 	-rm -f $(OBJS_TEXCONV)
--- a/liberty/common.mk
+++ b/liberty/common.mk
@@ -381,6 +381,7 @@ INCLUDE = \
 -I../src/liberty/skel/win \
 \
 -I../vendor/librw \
+-I../vendor/librw/src/dc \
 \
 -I../vendor/miniLZO \
 \
--- a/src/liberty/collision/ColLine.h
+++ b/src/liberty/collision/ColLine.h
@@ -1,14 +1,14 @@
 #pragma once

-struct CColLine
+struct alignas(8) CColLine
 {
 	// NB: this has to be compatible with two CVuVectors
-	CVector p0;
-	int pad0;
-	CVector p1;
-	int pad1;
+	alignas(8) CVector p0;
+	float pad0 = 0.0f;
+	alignas(8) CVector p1;
+	float pad1 = 0.0f;

-	CColLine(void) { };
+	CColLine(void) = default;
 	CColLine(const CVector &p0, const CVector &p1) { this->p0 = p0; this->p1 = p1; };
 	void Set(const CVector &p0, const CVector &p1);
 };
--- a/src/liberty/collision/ColPoint.h
+++ b/src/liberty/collision/ColPoint.h
@@ -1,12 +1,12 @@
 #pragma once

-struct CColPoint
+struct alignas(8) CColPoint
 {
-	CVector point;
-	int pad1;
+	alignas(8) CVector point;
+	float pad1 = 0.0f;
 	// the surface normal on the surface of point
-	CVector normal;
-	int pad2;
+	alignas(8) CVector normal;
+	float pad2 = 0.0f;
 	uint8 surfaceA;
 	uint8 pieceA;
 	uint8 surfaceB;
--- a/src/liberty/collision/ColSphere.cpp
+++ b/src/liberty/collision/ColSphere.cpp
@@ -4,8 +4,6 @@
 void
 CColSphere::Set(float radius, const CVector &center, uint8 surf, uint8 piece)
 {
-	this->radius = radius;
+	Set(radius, surf, piece);
 	this->center = center;
-	this->surface = surf;
-	this->piece = piece;
 }
--- a/src/liberty/collision/ColSphere.h
+++ b/src/liberty/collision/ColSphere.h
@@ -9,5 +9,10 @@ struct CColSphere
 	float radius;
 	uint8 surface;
 	uint8 piece;
+	void Set(float radius, uint8 surf = SURFACE_DEFAULT, uint8 piece = 0) {
+		this->radius = radius;
+		this->surface = surf;
+		this->piece = piece;
+	}
 	void Set(float radius, const CVector &center, uint8 surf = SURFACE_DEFAULT, uint8 piece = 0);
 };
--- a/src/liberty/collision/Collision.cpp
+++ b/src/liberty/collision/Collision.cpp
@@ -23,6 +23,10 @@
 #include "Collision.h"
 #include "Frontend.h"

+#ifdef DC_SH4
+#include "VuCollision.h"
+#endif
+
 #ifdef VU_COLLISION
 #include "VuCollision.h"

@@ -579,7 +583,7 @@ CCollision::TestLineSphere(const CColLine &line, const CColSphere &sph)
 	// Sqrt(diffsq) somehow works out to be the distance from that
 	// midpoint to the intersection points.
 	// So subtract that and get rid of the awkward scaling:
-	float f = (-projline - Sqrt(diffsq)) / (2.0f*linesq);
+	float f = Div((-projline - Sqrt(diffsq)), (2.0f*linesq));
 	// f should now be in range [0, 1] for [p0, p1]
 	return f >= 0.0f && f <= 1.0f;
 }
@@ -614,7 +618,7 @@ CCollision::TestSphereTriangle(const CColSphere &sphere,
 	// calculate two orthogonal basis vectors for the triangle
 	CVector vec2 = vb - va;
 	float len = vec2.Magnitude();
-	vec2 = vec2 * (1.0f/len);
+	vec2 = vec2 * Invert<true, false>(len);
 	CVector normal;
 	plane.GetNormal(normal);
 	CVector vec1 = CrossProduct(vec2, normal);
@@ -732,12 +736,17 @@ CCollision::TestLineOfSight(const CColLine &line, const CMatrix &matrix, CColMod

 	return false;
 #else
-	static CMatrix matTransform;
+	CMatrix matTransform;
 	int i;

 	// transform line to model space
 	Invert(matrix, matTransform);
-	CColLine newline(matTransform * line.p0, matTransform * line.p1);
+	CColLine newline;
+#ifndef DC_SH4	
+	newline.Set(matTransform * line.p0, matTransform * line.p1);
+#else
+	TransformPoints(reinterpret_cast<CVuVector*>(&newline), 2, matTransform, &line.p0, sizeof(CColLine)/2);
+#endif

 	// If we don't intersect with the bounding box, no chance on the rest
 	if(!TestLineBox(newline, model.boundingBox))
@@ -778,12 +787,13 @@ bool
 CCollision::ProcessSphereSphere(const CColSphere &s1, const CColSphere &s2, CColPoint &point, float &mindistsq)
 {
 	CVector dist = s1.center - s2.center;
-	float d = dist.Magnitude() - s2.radius;	// distance from s1's center to s2
+	float mag = dist.Magnitude();
+	float d = mag - s2.radius;	// distance from s1's center to s2
 	float depth = s1.radius - d;	// sphere overlap
 	if(d < 0.0f) d = 0.0f;		// clamp to zero, i.e. if s1's center is inside s2
 	// no collision if sphere is not close enough
 	if(d*d < mindistsq && d < s1.radius){
-		dist.Normalise();
+		dist *= Invert<true, false>(mag);
 		point.point = s1.center - dist*d;
 		point.normal = dist;
 #ifndef VU_COLLISION
@@ -834,7 +844,7 @@ CCollision::ProcessSphereBox(const CColSphere &sph, const CColBox &box, CColPoin
 		dist = sph.center - p;
 		float lensq = dist.MagnitudeSqr();
 		if(lensq < mindistsq){
-			point.normal = dist * (1.0f/Sqrt(lensq));
+			point.normal = dist * (RecipSqrt(lensq));
 			point.point = sph.center - point.normal;
 #ifndef VU_COLLISION
 			point.surfaceA = sph.surface;
@@ -879,8 +889,9 @@ CCollision::ProcessSphereBox(const CColSphere &sph, const CColBox &box, CColPoin
 		float lensq = dist.MagnitudeSqr();
 		if(lensq < mindistsq){
 			float len = Sqrt(lensq);
+			float invlen = Invert<true, false>(len);
 			point.point = p;
-			point.normal = dist * (1.0f/len);
+			point.normal = dist * invlen;
 #ifndef VU_COLLISION
 			point.surfaceA = sph.surface;
 			point.pieceA = sph.piece;
@@ -906,7 +917,7 @@ CCollision::ProcessLineBox(const CColLine &line, const CColBox &box, CColPoint &
 	// check if points are on opposite sides of min x plane
 	if((box.min.x - line.p1.x) * (box.min.x - line.p0.x) < 0.0f){
 		// parameter along line where we intersect
-		t = (box.min.x - line.p0.x) / (line.p1.x - line.p0.x);
+		t = Div((box.min.x - line.p0.x), (line.p1.x - line.p0.x));
 		// y of intersection
 		y = line.p0.y + (line.p1.y - line.p0.y)*t;
 		if(y > box.min.y && y < box.max.y){
@@ -923,7 +934,7 @@ CCollision::ProcessLineBox(const CColLine &line, const CColBox &box, CColPoint &

 	// max x plane
 	if((line.p1.x - box.max.x) * (line.p0.x - box.max.x) < 0.0f){
-		t = (line.p0.x - box.max.x) / (line.p0.x - line.p1.x);
+		t = Div((line.p0.x - box.max.x), (line.p0.x - line.p1.x));
 		y = line.p0.y + (line.p1.y - line.p0.y)*t;
 		if(y > box.min.y && y < box.max.y){
 			z = line.p0.z + (line.p1.z - line.p0.z)*t;
@@ -938,7 +949,7 @@ CCollision::ProcessLineBox(const CColLine &line, const CColBox &box, CColPoint &

 	// min y plne
 	if((box.min.y - line.p0.y) * (box.min.y - line.p1.y) < 0.0f){
-		t = (box.min.y - line.p0.y) / (line.p1.y - line.p0.y);
+		t = Div((box.min.y - line.p0.y), (line.p1.y - line.p0.y));
 		x = line.p0.x + (line.p1.x - line.p0.x)*t;
 		if(x > box.min.x && x < box.max.x){
 			z = line.p0.z + (line.p1.z - line.p0.z)*t;
@@ -953,7 +964,7 @@ CCollision::ProcessLineBox(const CColLine &line, const CColBox &box, CColPoint &

 	// max y plane
 	if((line.p0.y - box.max.y) * (line.p1.y - box.max.y) < 0.0f){
-		t = (line.p0.y - box.max.y) / (line.p0.y - line.p1.y);
+		t = Div((line.p0.y - box.max.y), (line.p0.y - line.p1.y));
 		x = line.p0.x + (line.p1.x - line.p0.x)*t;
 		if(x > box.min.x && x < box.max.x){
 			z = line.p0.z + (line.p1.z - line.p0.z)*t;
@@ -968,7 +979,7 @@ CCollision::ProcessLineBox(const CColLine &line, const CColBox &box, CColPoint &

 	// min z plne
 	if((box.min.z - line.p0.z) * (box.min.z - line.p1.z) < 0.0f){
-		t = (box.min.z - line.p0.z) / (line.p1.z - line.p0.z);
+		t = Div((box.min.z - line.p0.z), (line.p1.z - line.p0.z));
 		x = line.p0.x + (line.p1.x - line.p0.x)*t;
 		if(x > box.min.x && x < box.max.x){
 			y = line.p0.y + (line.p1.y - line.p0.y)*t;
@@ -983,7 +994,7 @@ CCollision::ProcessLineBox(const CColLine &line, const CColBox &box, CColPoint &

 	// max z plane
 	if((line.p0.z - box.max.z) * (line.p1.z - box.max.z) < 0.0f){
-		t = (line.p0.z - box.max.z) / (line.p0.z - line.p1.z);
+		t = Div((line.p0.z - box.max.z), (line.p0.z - line.p1.z));
 		x = line.p0.x + (line.p1.x - line.p0.x)*t;
 		if(x > box.min.x && x < box.max.x){
 			y = line.p0.y + (line.p1.y - line.p0.y)*t;
@@ -1029,7 +1040,7 @@ CCollision::ProcessLineSphere(const CColLine &line, const CColSphere &sphere, CC
 	if(diffsq < 0.0f)
 		return false;
 	// point of first intersection, in range [0,1] between p0 and p1
-	float t = (projline - Sqrt(diffsq)) / linesq;
+	float t = Div<true, false>((projline - Sqrt(diffsq)), linesq);
 	// if not on line or beyond mindist, no intersection
 	if(t < 0.0f || t > 1.0f || t >= mindist)
 		return false;
@@ -1083,7 +1094,7 @@ CCollision::ProcessVerticalLineTriangle(const CColLine &line,

 	// intersection parameter on line
 	float h = (line.p1 - p0).z;
-	t = -plane.CalcPoint(p0) / (h * normal.z);
+	t = Div(-plane.CalcPoint(p0), (h * normal.z));
 	// early out if we're beyond the mindist
 	if(t >= mindist)
 		return false;
@@ -1206,7 +1217,7 @@ CCollision::IsStoredPolyStillValidVerticalLine(const CVector &pos, float z, CCol
 	// intersection parameter on line
 	CVector normal;
 	plane.GetNormal(normal);
-	t = -plane.CalcPoint(p0) / DotProduct(p1 - p0, normal);
+	t = Div(-plane.CalcPoint(p0), DotProduct(p1 - p0, normal));
 	// find point of intersection
 	CVector p = p0 + (p1-p0)*t;

@@ -1303,7 +1314,7 @@ CCollision::ProcessLineTriangle(const CColLine &line,
 #endif

 	// intersection parameter on line
-	t = -plane.CalcPoint(line.p0) / p0dist;
+	t = Div(-plane.CalcPoint(line.p0), p0dist);

 	// early out if we're beyond the mindist
 	if(t >= mindist)
@@ -1415,7 +1426,7 @@ CCollision::ProcessSphereTriangle(const CColSphere &sphere,
 	plane.GetNormal(normal);
 	CVector vec2 = vb - va;
 	float len = vec2.Magnitude();
-	vec2 = vec2 * (1.0f/len);
+	vec2 = vec2 * Invert<true, false>(len);
 	CVector vec1 = CrossProduct(vec2, normal);

 	// We know A has local coordinate [0,0] and B has [0,len].
@@ -1572,12 +1583,17 @@ CCollision::ProcessLineOfSight(const CColLine &line,
 	}
 	return false;
 #else
-	static CMatrix matTransform;
+	CMatrix matTransform;
 	int i;

 	// transform line to model space
 	Invert(matrix, matTransform);
-	CColLine newline(matTransform * line.p0, matTransform * line.p1);
+	CColLine newline;
+#ifdef DC_SH4
+	TransformPoints(reinterpret_cast<CVuVector*>(&newline), 2, matTransform, &line.p0, sizeof(CColLine)/2);
+#else
+	newline.Set(matTransform * line.p0, matTransform * line.p1);
+#endif

 	// If we don't intersect with the bounding box, no chance on the rest
 	if(!TestLineBox(newline, model.boundingBox))
@@ -1601,8 +1617,20 @@ CCollision::ProcessLineOfSight(const CColLine &line,
 	}

 	if(coldist < mindist){
+	#ifndef DC_SH4
 		point.point = matrix * point.point;
 		point.normal = Multiply3x3(matrix, point.normal);
+	#else
+		mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix)));
+		mat_trans_single3_nodiv(point.point.x,
+								point.point.y,
+								point.point.z);
+		float w = 0;
+		mat_trans_nodiv(point.normal.x, 
+						point.normal.y, 
+						point.normal.z, 
+						w);
+	#endif	
 		mindist = coldist;
 		return true;
 	}
@@ -1739,9 +1767,14 @@ CCollision::ProcessVerticalLine(const CColLine &line,

 	// transform line to model space
 	// Why does the game seem to do this differently than above?
-	CColLine newline(MultiplyInverse(matrix, line.p0), MultiplyInverse(matrix, line.p1));
-	newline.p1.x = newline.p0.x;
-	newline.p1.y = newline.p0.y;
+	CMatrix matTransform;
+	Invert(matrix, matTransform);
+	CColLine newline;
+#ifndef DC_SH4
+	newline.Set(matTransform * line.p0, matTransform * line.p1);
+#else
+	TransformPoints(reinterpret_cast<CVuVector*>(&newline), 2, matTransform, &line.p0, sizeof(CColLine)/2);
+#endif

 	if(!TestVerticalLineBox(newline, model.boundingBox))
 		return false;
@@ -1765,13 +1798,31 @@ CCollision::ProcessVerticalLine(const CColLine &line,
 	}

 	if(coldist < mindist){
+#ifndef DC_SH4
 		point.point = matrix * point.point;
 		point.normal = Multiply3x3(matrix, point.normal);
+#else
+		mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix)));
+		mat_trans_single3_nodiv(point.point.x,
+								point.point.y,
+								point.point.z);
+		float w = 0;
+		mat_trans_nodiv(point.normal.x, 
+						point.normal.y, 
+						point.normal.z, 
+						w);
+#endif
 		if(TempStoredPoly.valid && poly){
 			*poly = TempStoredPoly;
+#ifndef DC_SH4
 			poly->verts[0] = matrix * poly->verts[0];
 			poly->verts[1] = matrix * poly->verts[1];
 			poly->verts[2] = matrix * poly->verts[2];
+#else
+			mat_trans_single3_nodiv(poly->verts[0].x, poly->verts[0].y, poly->verts[0].z);
+			mat_trans_single3_nodiv(poly->verts[1].x, poly->verts[1].y, poly->verts[1].z);
+			mat_trans_single3_nodiv(poly->verts[2].x, poly->verts[2].y, poly->verts[2].z);
+#endif
 		}
 		mindist = coldist;
 		return true;
@@ -2107,15 +2158,27 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,

 	return numCollisions;	// sphere collisions
 #else
-	static int aSphereIndicesA[MAXNUMSPHERES];
-	static int aLineIndicesA[MAXNUMLINES];
-	static int aSphereIndicesB[MAXNUMSPHERES];
-	static int aBoxIndicesB[MAXNUMBOXES];
-	static int aTriangleIndicesB[MAXNUMTRIS];
-	static bool aCollided[MAXNUMLINES];
-	static CColSphere aSpheresA[MAXNUMSPHERES];
-	static CColLine aLinesA[MAXNUMLINES];
-	static CMatrix matAB, matBA;
+#if 0
+	static auto *aSpheresA 		   = reinterpret_cast<CColSphere*>((uintptr_t)OCRAM);
+	static auto *aLinesA 		   = reinterpret_cast<CColLine  *>((uintptr_t)aSpheresA         + sizeof(CColSphere) * MAXNUMSPHERES);
+	static auto *aSphereIndicesA   = reinterpret_cast<uint16_t  *>((uintptr_t)aLinesA   		+ sizeof(CColLine)   * MAXNUMLINES);
+	static auto *aLineIndicesA	   = reinterpret_cast<uint16_t  *>((uintptr_t)aSphereIndicesA   + sizeof(uint16_t)   * MAXNUMSPHERES);
+	static auto *aSphereIndicesB   = reinterpret_cast<uint16_t  *>((uintptr_t)aLineIndicesA     + sizeof(uint16_t)   * MAXNUMLINES);
+	static auto *aBoxIndicesB	   = reinterpret_cast<uint16_t  *>((uintptr_t)aSphereIndicesB   + sizeof(uint16_t)   * MAXNUMSPHERES);
+	static auto *aTriangleIndicesB = reinterpret_cast<uint16_t  *>((uintptr_t)aBoxIndicesB      + sizeof(uint16_t)   * MAXNUMBOXES);
+	static auto *aCollided		   = reinterpret_cast<bool      *>((uintptr_t)aTriangleIndicesB + sizeof(uint16_t)   * MAXNUMTRIS);
+	CMatrix matAB, matBA;
+#else
+       static CColSphere aSpheresA[MAXNUMSPHERES];
+       static CColLine aLinesA[MAXNUMLINES];
+       static int aSphereIndicesA[MAXNUMSPHERES];
+       static int aLineIndicesA[MAXNUMLINES];
+       static int aSphereIndicesB[MAXNUMSPHERES];
+       static int aBoxIndicesB[MAXNUMBOXES];
+       static int aTriangleIndicesB[MAXNUMTRIS];
+       static bool aCollided[MAXNUMLINES];
+       static CMatrix matAB, matBA;
+#endif
 	CColSphere s;
 	int i, j;

@@ -2128,20 +2191,54 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,

 	CColSphere bsphereAB;	// bounding sphere of A in B space
 	bsphereAB.radius = modelA.boundingSphere.radius;
+#ifndef DC_SH4
 	bsphereAB.center = matAB * modelA.boundingSphere.center;
+#else	
+	/* No need to reload the matrix, since it's already banked.
+	mat_load(reinterpret_cast<const matrix_t *>(&matAB)); */
+	mat_trans_single3_nodiv_nomod(modelA.boundingSphere.center.x,
+								  modelA.boundingSphere.center.y,
+								  modelA.boundingSphere.center.z,
+								  bsphereAB.center.x,
+								  bsphereAB.center.y,
+								  bsphereAB.center.z);
+#endif
 	if(!TestSphereBox(bsphereAB, modelB.boundingBox))
 		return 0;
-	// B to A space
-	matBA = Invert(matrixA, matBA);
-	matBA *= matrixB;

 	// transform modelA's spheres and lines to B space
 	for(i = 0; i < modelA.numSpheres; i++){
 		CColSphere &s = modelA.spheres[i];
+#ifndef DC_SH4
 		aSpheresA[i].Set(s.radius, matAB * s.center, s.surface, s.piece);
+#else
+		auto &d = aSpheresA[i];
+
+		mat_trans_single3_nodiv_nomod(s.center.x, s.center.y, s.center.z, 
+									  d.center.x, d.center.y, d.center.z);
+		
+		d.Set(s.radius, s.surface, s.piece);
+#endif
 	}
-	for(i = 0; i < modelA.numLines; i++)
+	for(i = 0; i < modelA.numLines; i++) {
+#ifndef DC_SH4
 		aLinesA[i].Set(matAB * modelA.lines[i].p0, matAB * modelA.lines[i].p1);
+#else
+		mat_trans_single3_nodiv_nomod(modelA.lines[i].p0.x,
+									  modelA.lines[i].p0.y, 
+									  modelA.lines[i].p0.z, 
+									  aLinesA[i].p0.x, 
+									  aLinesA[i].p0.y,
+									  aLinesA[i].p0.z);
+		
+		mat_trans_single3_nodiv_nomod(modelA.lines[i].p1.x, 
+									  modelA.lines[i].p1.y, 
+									  modelA.lines[i].p1.z, 
+									  aLinesA[i].p1.x, 
+									  aLinesA[i].p1.y,
+									  aLinesA[i].p1.z);
+#endif
+	}

 	// Test them against model B's bounding volumes
 	int numSpheresA = 0;
@@ -2160,9 +2257,25 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 	int numSpheresB = 0;
 	int numBoxesB = 0;
 	int numTrianglesB = 0;
+	// B to A space
+	matBA = Invert(matrixA, matBA);
+	matBA *= matrixB;
+#ifdef DC_SH4
+	/* No need to reload the matrix, since it's already banked.
+	mat_load(reinterpret_cast<const matrix_t *>(&matBA)); */
+#endif
 	for(i = 0; i < modelB.numSpheres; i++){
 		s.radius = modelB.spheres[i].radius;
+#ifndef DC_SH4
 		s.center = matBA * modelB.spheres[i].center;
+#else
+		mat_trans_single3_nodiv_nomod(modelB.spheres[i].center.x,
+									  modelB.spheres[i].center.y,
+									  modelB.spheres[i].center.z,
+									  s.center.x,
+									  s.center.y,
+									  s.center.z);
+#endif
 		if(TestSphereBox(s, modelA.boundingBox))
 			aSphereIndicesB[numSpheresB++] = i;
 	}
@@ -2209,9 +2322,23 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 		if(hasCollided)
 			numCollisions++;
 	}
+
+#ifdef DC_SH4
+	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrixB)));
+#endif
 	for(i = 0; i < numCollisions; i++){
+#ifndef DC_SH4
 		spherepoints[i].point = matrixB * spherepoints[i].point;
 		spherepoints[i].normal = Multiply3x3(matrixB, spherepoints[i].normal);
+#else
+
+		mat_trans_single3_nodiv(spherepoints[i].point.x, 
+				    		 	spherepoints[i].point.y,
+								spherepoints[i].point.z);
+		mat_trans_vec3(spherepoints[i].normal.x, 
+					   spherepoints[i].normal.y,
+	    			   spherepoints[i].normal.z);
+#endif
 	}

 	// And the same thing for the lines in A
@@ -2242,8 +2369,17 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 	for(i = 0; i < numLinesA; i++)
 		if(aCollided[i]){
 			j = aLineIndicesA[i];
+#ifndef DC_SH4
 			linepoints[j].point = matrixB * linepoints[j].point;
 			linepoints[j].normal = Multiply3x3(matrixB, linepoints[j].normal);
+#else
+			mat_trans_single3_nodiv(linepoints[j].point.x, 
+									linepoints[j].point.y,
+									linepoints[j].point.z);
+			mat_trans_vec3(linepoints[j].normal.x, 
+			    		   linepoints[j].normal.y,
+						   linepoints[j].normal.z);	
+#endif
 		}

 	return numCollisions;	// sphere collisions
@@ -2265,7 +2401,7 @@ CCollision::DistToLine(const CVector *l0, const CVector *l1, const CVector *poin
 	if(dot <= 0.0f) return (*point - *l0).Magnitude();
 	if(dot >= lensq) return (*point - *l1).Magnitude();
 	// distance to line
-	float distSqr = (*point - *l0).MagnitudeSqr() - dot * dot / lensq;
+	float distSqr = (*point - *l0).MagnitudeSqr() - dot * Div<true, false>(dot, lensq);
 	if(distSqr <= 0.f) return 0.f;
 	return Sqrt(distSqr);
 }
@@ -2282,7 +2418,7 @@ CCollision::DistToLine(const CVector *l0, const CVector *l1, const CVector *poin
 	else if(dot >= lensq)
 		closest = *l1;
 	else
-		closest = *l0 + (*l1 - *l0)*(dot/lensq);
+		closest = *l0 + (*l1 - *l0)*(Div<true, false>(dot, lensq));
 	// this is the distance
 	return (*point - closest).Magnitude();
 }
--- a/src/liberty/control/CarCtrl.cpp
+++ b/src/liberty/control/CarCtrl.cpp
@@ -805,6 +805,7 @@ CCarCtrl::CountCarsOfType(int32 mi)
 void
 CCarCtrl::UpdateCarOnRails(CVehicle* pVehicle)
 {
+	// Bullshit inner lambda to prevent ICE due to interaction between register allocator and fast-math.
 	if (pVehicle->AutoPilot.m_nTempAction == TEMPACT_WAIT){
 		pVehicle->SetMoveSpeed(0.0f, 0.0f, 0.0f);
 		pVehicle->AutoPilot.ModifySpeed(0.0f);
@@ -833,24 +834,27 @@ CCarCtrl::UpdateCarOnRails(CVehicle* pVehicle)
 	CVector positionOnNextLinkIncludingLane(
 		pNextLink->GetX() + ((pVehicle->AutoPilot.m_nNextLane + pNextLink->OneWayLaneOffset()) * LANE_WIDTH) * nextPathLinkForwardY,
 		pNextLink->GetY() - ((pVehicle->AutoPilot.m_nNextLane + pNextLink->OneWayLaneOffset()) * LANE_WIDTH) * nextPathLinkForwardX,
-		0.0f);
+		0.0f);	
 	CVector directionCurrentLink(currentPathLinkForwardX, currentPathLinkForwardY, 0.0f);
 	CVector directionNextLink(nextPathLinkForwardX, nextPathLinkForwardY, 0.0f);
 	CVector positionIncludingCurve;
 	CVector directionIncludingCurve;
-	CCurves::CalcCurvePoint(
-		&positionOnCurrentLinkIncludingLane,
-		&positionOnNextLinkIncludingLane,
-		&directionCurrentLink,
-		&directionNextLink,
-		GetPositionAlongCurrentCurve(pVehicle),
-		pVehicle->AutoPilot.m_nTimeToSpendOnCurrentCurve,
-		&positionIncludingCurve,
-		&directionIncludingCurve
-	);
-	positionIncludingCurve.z = 15.0f;
-	DragCarToPoint(pVehicle, &positionIncludingCurve);
-	pVehicle->SetMoveSpeed(directionIncludingCurve / GAME_SPEED_TO_CARAI_SPEED);
+
+	[&](void) __attribute__((noinline)) { // Prevents an ICE on fast-math by breaking function into mutiple subfunctions for register allocator.
+		CCurves::CalcCurvePoint(
+			&positionOnCurrentLinkIncludingLane,
+			&positionOnNextLinkIncludingLane,
+			&directionCurrentLink,
+			&directionNextLink,
+			GetPositionAlongCurrentCurve(pVehicle),
+			pVehicle->AutoPilot.m_nTimeToSpendOnCurrentCurve,
+			&positionIncludingCurve,
+			&directionIncludingCurve
+		);
+		positionIncludingCurve.z = 15.0f;
+		DragCarToPoint(pVehicle, &positionIncludingCurve);
+		pVehicle->SetMoveSpeed(directionIncludingCurve * dc::Invert<false>(GAME_SPEED_TO_CARAI_SPEED));
+	}();
 }

 float
@@ -2036,10 +2040,10 @@ void CCarCtrl::DragCarToPoint(CVehicle* pVehicle, CVector* pPoint)
 	float angleZ = Atan2((actualAheadZ - actualBehindZ) / 3, 1.0f);
 	float cosZ = Cos(angleZ);
 	float sinZ = Sin(angleZ);
-	pVehicle->GetRight() = CVector(posTarget.y - midPos.y, -(posTarget.x - midPos.x), 0.0f) / 3;
+	pVehicle->GetRight() = CVector(posTarget.y - midPos.y, -(posTarget.x - midPos.x), 0.0f) * (1.0f / 3.0f);
 	pVehicle->GetForward() = CVector(-cosZ * pVehicle->GetRight().y, cosZ * pVehicle->GetRight().x, sinZ);
 	pVehicle->GetUp() = CrossProduct(pVehicle->GetRight(), pVehicle->GetForward());
-	pVehicle->SetPosition((CVector(midPos.x, midPos.y, actualBehindZ) + CVector(posTarget.x, posTarget.y, actualAheadZ)) / 2);
+	pVehicle->SetPosition((CVector(midPos.x, midPos.y, actualBehindZ) + CVector(posTarget.x, posTarget.y, actualAheadZ)) * 0.5f);
 	pVehicle->GetMatrix().GetPosition().z += pVehicle->GetHeightAboveRoad();
 }

--- a/src/liberty/control/PathFind.cpp
+++ b/src/liberty/control/PathFind.cpp
@@ -526,7 +526,7 @@ void
 CPathFind::PreparePathDataForType(uint8 type, CTempNode *tempnodes, CPathInfoForObject *objectpathinfo,
 	float maxdist, CTempDetachedNode *detachednodes, int32 numDetached)
 {
-	static CVector CoorsXFormed;
+	CVector CoorsXFormed;
 	int i, j, k, l;
 	int l1, l2;
 	int start;
--- a/src/liberty/core/Cam.cpp
+++ b/src/liberty/core/Cam.cpp
@@ -4009,7 +4009,7 @@ void
 CCam::Process_ModelView(const CVector &CameraTarget, float, float, float)
 {
 	CVector TargetCoors = CameraTarget;
-	float Angle = Atan2(Front.x, Front.y);
+	float Angle = Atan2<false>(Front.x, Front.y);
 	FOV = DefaultFOV;

 	Angle += CPad::GetPad(0)->GetLeftStickX()/1280.0f;
--- a/src/liberty/core/World.cpp
+++ b/src/liberty/core/World.cpp
@@ -1268,6 +1268,8 @@ CWorld::FindObjectsIntersectingAngledCollisionBoxSectorList(CPtrList &list, cons
                                                            int16 *nEntitiesFound, int16 maxEntitiesToFind,
                                                            CEntity **aEntities)
 {
+	mat_load(reinterpret_cast<const matrix_t*>(&matrix));
+	mat_transpose();
 	for(CPtrNode *pNode = list.first; pNode; pNode = pNode->next) {
 		CEntity *pEntity = (CEntity *)pNode->item;
 		if(pEntity->m_scanCode != GetCurrentScanCode()) {
@@ -1275,7 +1277,8 @@ CWorld::FindObjectsIntersectingAngledCollisionBoxSectorList(CPtrList &list, cons
 			CColSphere sphere;
 			CVector vecDistance = pEntity->GetPosition() - position;
 			sphere.radius = pEntity->GetBoundRadius();
-			sphere.center = Multiply3x3(vecDistance, matrix);
+			mat_trans_normal3_nomod(vecDistance.x, vecDistance.y, vecDistance.z,
+									sphere.center.x, sphere.center.y, sphere.center.z);
 			if(CCollision::TestSphereBox(sphere, boundingBox) && *nEntitiesFound < maxEntitiesToFind) {
 				if(aEntities) aEntities[*nEntitiesFound] = pEntity;
 				++*nEntitiesFound;
@@ -1906,6 +1909,9 @@ CWorld::Process(void)
 				movingEnt->UpdateRwFrame();
 			}
 		} else {
+
+			//ocram_enter();
+
 			bNoMoreCollisionTorque = false;
 			for(CPtrNode *node = ms_listMovingEntityPtrs.first; node; node = node->next) {
 				CEntity *movingEnt = (CEntity *)node->item;
@@ -1936,6 +1942,9 @@ CWorld::Process(void)
 					if(!movingEnt->bIsInSafePosition) { movingEnt->bIsStuck = true; }
 				}
 			}
+
+			//ocram_leave();
+
 			bSecondShift = false;
 			for(CPtrNode *node = ms_listMovingEntityPtrs.first; node; node = node->next) {
 				CEntity *movingEnt = (CEntity *)node->item;
--- a/src/liberty/core/ZoneCull.cpp
+++ b/src/liberty/core/ZoneCull.cpp
@@ -15,6 +15,10 @@
 #include "Debug.h"
 #include "Renderer.h"

+#include "VuVector.h"
+
+#include <array>
+
 int32     CCullZones::NumCullZones;
 CCullZone CCullZones::aZones[NUMCULLZONES];
 int32     CCullZones::NumAttributeZones;
@@ -1496,10 +1500,22 @@ CCullZone::TestEntityVisibilityFromCullZone(CEntity *entity, float extraDist, CE
 	else
 		boundMaxZ += extraDist;

-	CVector vecMin = entity->GetMatrix() * CVector(boundMinX, boundMinY, boundMinZ);
-	CVector vecMaxX = entity->GetMatrix() * CVector(boundMaxX, boundMinY, boundMinZ);
-	CVector vecMaxY = entity->GetMatrix() * CVector(boundMinX, boundMaxY, boundMinZ);
-	CVector vecMaxZ = entity->GetMatrix() * CVector(boundMinX, boundMinY, boundMaxZ);
+	std::array<CVector, 4> inVecs = {
+		CVector(boundMinX, boundMinY, boundMinZ),
+		CVector(boundMaxX, boundMinY, boundMinZ),
+		CVector(boundMinX, boundMaxY, boundMinZ),
+		CVector(boundMinX, boundMinY, boundMaxZ)
+	};
+
+	std::array<CVector, 4> outVecs;
+	auto &[vecMin, vecMaxX, vecMaxY, vecMaxZ] = outVecs;
+
+	TransformPoints(reinterpret_cast<CVuVector*>(&outVecs),
+	                4,
+					entity->GetMatrix(),
+					inVecs.data(),
+					sizeof(CVector));
+
 	CVector dirx = vecMaxX - vecMin;
 	CVector diry = vecMaxY - vecMin;
 	CVector dirz = vecMaxZ - vecMin;
@@ -1520,22 +1536,23 @@ CCullZone::TestEntityVisibilityFromCullZone(CEntity *entity, float extraDist, CE

 	float distToZone = CalcDistToCullZone(entity->GetPosition().x, entity->GetPosition().y)/15.0f;
 	distToZone = Max(distToZone, 7.0f);
-	int numX = (boundMaxX - boundMinX)/distToZone + 2.0f;
-	int numY = (boundMaxY - boundMinY)/distToZone + 2.0f;
-	int numZ = (boundMaxZ - boundMinZ)/distToZone + 2.0f;
+	float invDistToZone = Invert<true, false>(distToZone);
+	int numX = (boundMaxX - boundMinX)*invDistToZone + 2.0f;
+	int numY = (boundMaxY - boundMinY)*invDistToZone + 2.0f;
+	int numZ = (boundMaxZ - boundMinZ)*invDistToZone + 2.0f;

-	float stepX = 1.0f/(numX-1);
-	float stepY = 1.0f/(numY-1);
-	float stepZ = 1.0f/(numZ-1);
+	float stepX = Invert<true, false>(numX-1);
+	float stepY = Invert<true, false>(numY-1);
+	float stepZ = Invert<true, false>(numZ-1);
 	float midX = (boundMaxX + boundMinX)/2.0f;
 	float midY = (boundMaxY + boundMinY)/2.0f;
 	float midZ = (boundMaxZ + boundMinZ)/2.0f;

 	// check both xy planes
+	CVector mid = entity->GetMatrix() * CVector(midX, midY, midZ);
+	mid.z += 0.1f;
 	for(int i = 0; i < NumTestPoints; i++){
 		CVector testPoint = aTestPoints[i];
-		CVector mid = entity->GetMatrix() * CVector(midX, midY, midZ);
-		mid.z += 0.1f;
 		if(DoThoroughLineTest(testPoint, mid, entity))
 			return true;

--- a/src/liberty/core/common.h
+++ b/src/liberty/core/common.h
@@ -82,9 +82,6 @@

 #define rwVENDORID_ROCKSTAR 0x0253F2

-__always_inline auto Max(auto a, auto b) { return ((a > b)? a : b); }
-__always_inline auto Min(auto a, auto b) { return ((a < b)? a : b); }
-
 // Use this to add const that wasn't there in the original code
 #define Const const

@@ -299,15 +296,6 @@ extern int strcasecmp(const char *str1, const char *str2);

 extern wchar *AllocUnicode(const char*src);

-template<typename T>
-__always_inline T Clamp(T v, auto low, auto high) {
-	return std::clamp(v, static_cast<T>(low), static_cast<T>(high));
-}
-
-__always_inline auto Clamp2(auto v, auto center, auto radius) {
-	return (v > center) ? Min(v, center + radius) : Max(v, center - radius);
-}
-
 #define SQR(x) ((x) * (x))
 __always_inline auto sq(auto x) { return SQR(x); }

@@ -418,15 +406,8 @@ template<int s, int t> struct check_size {
 #endif
 #define BIT(num)                         (1<<(num))

-#define ABS(a) std::abs(a)
+#define ABS(a) Abs(a)

-__always_inline auto norm(auto value, auto min, auto max) {
-	return (Clamp(value, min, max) - min) / (max - min);
-}
 // we use std::lerp now
 //#define lerp(norm, min, max) ( (norm) * ((max) - (min)) + (min) )

-#define STRINGIFY(x)                    #x
-#define STR(x)                          STRINGIFY(x)
-#define CONCAT_(x,y) x##y
-#define CONCAT(x,y) CONCAT_(x,y)
--- a/src/liberty/entities/Physical.cpp
+++ b/src/liberty/entities/Physical.cpp
@@ -436,7 +436,7 @@ CPhysical::ApplyTurnSpeed(void)
 void
 CPhysical::ApplyMoveForce(float jx, float jy, float jz)
 {
-	m_vecMoveSpeed += CVector(jx, jy, jz)*(1.0f/m_fMass);
+	m_vecMoveSpeed += CVector(jx, jy, jz)*Invert<true, false>(m_fMass);
 }

 void
@@ -444,13 +444,13 @@ CPhysical::ApplyTurnForce(float jx, float jy, float jz, float px, float py, floa
 {
 	CVector com = Multiply3x3(GetMatrix(), m_vecCentreOfMass);
 	CVector turnimpulse = CrossProduct(CVector(px, py, pz)-com, CVector(jx, jy, jz));
-	m_vecTurnSpeed += turnimpulse*(1.0f/m_fTurnMass);
+	m_vecTurnSpeed += turnimpulse*Invert<true, false>(m_fTurnMass);
 }

 void
 CPhysical::ApplyFrictionMoveForce(float jx, float jy, float jz)
 {
-	m_vecMoveFriction += CVector(jx, jy, jz)*(1.0f/m_fMass);
+	m_vecMoveFriction += CVector(jx, jy, jz)*Invert<true, false>(m_fMass);
 }

 void
@@ -458,7 +458,7 @@ CPhysical::ApplyFrictionTurnForce(float jx, float jy, float jz, float px, float
 {
 	CVector com = Multiply3x3(GetMatrix(), m_vecCentreOfMass);
 	CVector turnimpulse = CrossProduct(CVector(px, py, pz)-com, CVector(jx, jy, jz));
-	m_vecTurnFriction += turnimpulse*(1.0f/m_fTurnMass);
+	m_vecTurnFriction += turnimpulse*Invert<true, false>(m_fTurnMass);
 }

 bool
@@ -488,11 +488,11 @@ CPhysical::ApplySpringDampening(float damping, CVector &springDir, CVector &poin
 	float impulse = -damping * (speedA + speedB)/2.0f * m_fMass * step * 0.53f;

 	// what is this?
-	float a = m_fTurnMass / ((point.MagnitudeSqr() + 1.0f) * 2.0f * m_fMass);
+	float a = Div<true, false>(m_fTurnMass, ((point.MagnitudeSqr() + 1.0f) * 2.0f * m_fMass));
 	a = Min(a, 1.0f);
-	float b = Abs(impulse / (speedB * m_fMass));
+	float b = Abs(Div<true, false>(impulse, (speedB * m_fMass)));
 	if(a < b)
-		impulse *= a/b;
+		impulse *= Div<true, false>(a, b);

 	ApplyMoveForce(springDir*impulse);
 	ApplyTurnForce(springDir*impulse, point);
@@ -523,7 +523,7 @@ CPhysical::ApplyAirResistance(void)
 		m_vecMoveSpeed *= f;
 		m_vecTurnSpeed *= f;
 	}else{
-		float f = Pow(1.0f/Abs(m_fAirResistance*0.5f*m_vecMoveSpeed.MagnitudeSqr() + 1.0f), CTimer::GetTimeStep());
+		float f = Pow(Invert<false, true>(Abs(m_fAirResistance*0.5f*m_vecMoveSpeed.MagnitudeSqr() + 1.0f)), CTimer::GetTimeStep());
 		m_vecMoveSpeed *= f;
 		m_vecTurnSpeed *= 0.99f;
 	}
@@ -551,7 +551,7 @@ CPhysical::ApplyCollision(CPhysical *B, CColPoint &colpoint, float &impulseA, fl
 	if(A->bPedPhysics){
 		if(A->IsPed() && ((CPed*)A)->IsPlayer() && B->IsVehicle() &&
 		   (B->GetStatus() == STATUS_ABANDONED || B->GetStatus() == STATUS_WRECKED || A->bHasHitWall))
-			massFactorB = 2200.0f / B->m_fMass;
+			massFactorB = Div<true, false>(2200.0f, B->m_fMass);
 		else
 			massFactorB = 10.0f;

@@ -678,7 +678,7 @@ CPhysical::ApplyCollision(CPhysical *B, CColPoint &colpoint, float &impulseA, fl

 		float mA = A->m_fMass*massFactorA;
 		float mB = B->GetMassTweak(pointposB, colpoint.normal, massFactorB);
-		float speedSum = (mB*speedB + mA*speedA)/(mA + mB);
+		float speedSum = Div<true, false>((mB*speedB + mA*speedA), (mA + mB));
 		if(speedA < speedSum){
 			if(A->bHasHitWall)
 				eA = speedSum;
@@ -690,8 +690,8 @@ CPhysical::ApplyCollision(CPhysical *B, CColPoint &colpoint, float &impulseA, fl
 				eB = speedSum - (speedB - speedSum) * (A->m_fElasticity+B->m_fElasticity)/2.0f;
 			impulseA = (eA - speedA) * mA;
 			impulseB = -(eB - speedB) * mB;
-			CVector fA = colpoint.normal*(impulseA/massFactorA);
-			CVector fB = colpoint.normal*(-impulseB/massFactorB);
+			CVector fA = colpoint.normal*(Div<true, false>(impulseA, massFactorA));
+			CVector fB = colpoint.normal* -Div<true, false>(impulseB, massFactorB);
 			if(!A->bInfiniteMass){
 				if(fA.z < 0.0f) fA.z = 0.0f;
 				if(ispedcontactB){
@@ -713,7 +713,7 @@ CPhysical::ApplyCollision(CPhysical *B, CColPoint &colpoint, float &impulseA, fl

 		float mA = A->GetMassTweak(pointposA, colpoint.normal, massFactorA);
 		float mB = B->m_fMass*massFactorB;
-		float speedSum = (mB*speedB + mA*speedA)/(mA + mB);
+		float speedSum = Div<true, false>((mB*speedB + mA*speedA), (mA + mB));
 		if(speedA < speedSum){
 			if(A->bHasHitWall)
 				eA = speedSum;
@@ -725,8 +725,8 @@ CPhysical::ApplyCollision(CPhysical *B, CColPoint &colpoint, float &impulseA, fl
 				eB = speedSum - (speedB - speedSum) * (A->m_fElasticity+B->m_fElasticity)/2.0f;
 			impulseA = (eA - speedA) * mA;
 			impulseB = -(eB - speedB) * mB;
-			CVector fA = colpoint.normal*(impulseA/massFactorA);
-			CVector fB = colpoint.normal*(-impulseB/massFactorB);
+			CVector fA = colpoint.normal*(Div<true, false>(impulseA, massFactorA));
+			CVector fB = colpoint.normal* -Div<true, false>(impulseB, massFactorB);
 			if(!A->bInfiniteMass && !ispedcontactA){
 				if(fA.z < 0.0f) fA.z = 0.0f;
 				A->ApplyMoveForce(fA);
@@ -753,7 +753,7 @@ CPhysical::ApplyCollision(CPhysical *B, CColPoint &colpoint, float &impulseA, fl
 		speedB = DotProduct(B->GetSpeed(pointposB), colpoint.normal);
 		float mA = A->GetMassTweak(pointposA, colpoint.normal, massFactorA);
 		float mB = B->GetMassTweak(pointposB, colpoint.normal, massFactorB);
-		float speedSum = (mB*speedB + mA*speedA)/(mA + mB);
+		float speedSum = Div<true, false>((mB*speedB + mA*speedA), (mA + mB));
 		if(speedA < speedSum){
 			if(A->bHasHitWall)
 				eA = speedSum;
@@ -765,8 +765,8 @@ CPhysical::ApplyCollision(CPhysical *B, CColPoint &colpoint, float &impulseA, fl
 				eB = speedSum - (speedB - speedSum) * (A->m_fElasticity+B->m_fElasticity)/2.0f;
 			impulseA = (eA - speedA) * mA;
 			impulseB = -(eB - speedB) * mB;
-			CVector fA = colpoint.normal*(impulseA/massFactorA);
-			CVector fB = colpoint.normal*(-impulseB/massFactorB);
+			CVector fA = colpoint.normal*(Div<true, false>(impulseA, massFactorA));
+			CVector fB = colpoint.normal* -Div<true, false>(impulseB, massFactorB);
 			if(A->IsVehicle() && !A->bHasHitWall){
 				fA.x *= 1.4f;
 				fA.y *= 1.4f;
@@ -852,14 +852,14 @@ CPhysical::ApplyCollisionAlt(CEntity *B, CColPoint &colpoint, float &impulse, CV
 			if(IsVehicle() &&
 			   (!bHasHitWall ||
 			    !(m_vecMoveSpeed.MagnitudeSqr() > 0.1 || !(B->IsBuilding() || ((CPhysical*)B)->bInfiniteMass))))
-				moveSpeed += vImpulse * 1.2f * (1.0f/m_fMass);
+				moveSpeed += vImpulse * 1.2f * Invert<true, false>(m_fMass);
 			else
-				moveSpeed += vImpulse * (1.0f/m_fMass);
+				moveSpeed += vImpulse * Invert<true, false>(m_fMass);

 			// ApplyTurnForce
 			CVector com = Multiply3x3(GetMatrix(), m_vecCentreOfMass);
 			CVector turnimpulse = CrossProduct(pointpos-com, vImpulse);
-			turnSpeed += turnimpulse*(1.0f/m_fTurnMass);
+			turnSpeed += turnimpulse*Invert<true, false>(m_fTurnMass);

 			return true;
 		}
@@ -893,10 +893,10 @@ CPhysical::ApplyFriction(CPhysical *B, float adhesiveLimit, CColPoint &colpoint)
 		frictionDir = vOtherSpeedA;
 		frictionDir.Normalise();
 #else
-		frictionDir = vOtherSpeedA * (1.0f/fOtherSpeedA);
+		frictionDir = vOtherSpeedA * Invert<true, false>(fOtherSpeedA);
 #endif

-		speedSum = (B->m_fMass*fOtherSpeedB + A->m_fMass*fOtherSpeedA)/(B->m_fMass + A->m_fMass);
+		speedSum = Div<true, false>(B->m_fMass*fOtherSpeedB + A->m_fMass*fOtherSpeedA, B->m_fMass + A->m_fMass);
 		if(fOtherSpeedA > speedSum){
 			impulseA = (speedSum - fOtherSpeedA) * A->m_fMass;
 			impulseB = (speedSum - fOtherSpeedB) * B->m_fMass;
@@ -929,10 +929,10 @@ CPhysical::ApplyFriction(CPhysical *B, float adhesiveLimit, CColPoint &colpoint)
 		frictionDir = vOtherSpeedA;
 		frictionDir.Normalise();
 #else
-		frictionDir = vOtherSpeedA * (1.0f/fOtherSpeedA);
+		frictionDir = vOtherSpeedA * Invert<true, false>(fOtherSpeedA);
 #endif
 		float massB = B->GetMass(pointposB, frictionDir);
-		speedSum = (massB*fOtherSpeedB + A->m_fMass*fOtherSpeedA)/(massB + A->m_fMass);
+		speedSum = Div<true, false>(massB*fOtherSpeedB + A->m_fMass*fOtherSpeedA, massB + A->m_fMass);
 		if(fOtherSpeedA > speedSum){
 			impulseA = (speedSum - fOtherSpeedA) * A->m_fMass;
 			impulseB = (speedSum - fOtherSpeedB) * massB;
@@ -962,10 +962,10 @@ CPhysical::ApplyFriction(CPhysical *B, float adhesiveLimit, CColPoint &colpoint)
 		frictionDir = vOtherSpeedA;
 		frictionDir.Normalise();
 #else
-		frictionDir = vOtherSpeedA * (1.0f/fOtherSpeedA);
+		frictionDir = vOtherSpeedA * Invert<true, false>(fOtherSpeedA);
 #endif
 		float massA = A->GetMass(pointposA, frictionDir);
-		speedSum = (B->m_fMass*fOtherSpeedB + massA*fOtherSpeedA)/(B->m_fMass + massA);
+		speedSum = Div<true, false>(B->m_fMass*fOtherSpeedB + massA*fOtherSpeedA, B->m_fMass + massA);
 		if(fOtherSpeedA > speedSum){
 			impulseA = (speedSum - fOtherSpeedA) * massA;
 			impulseB = (speedSum - fOtherSpeedB) * B->m_fMass;
@@ -995,11 +995,11 @@ CPhysical::ApplyFriction(CPhysical *B, float adhesiveLimit, CColPoint &colpoint)
 		frictionDir = vOtherSpeedA;
 		frictionDir.Normalise();
 #else
-		frictionDir = vOtherSpeedA * (1.0f/fOtherSpeedA);
+		frictionDir = vOtherSpeedA * Invert<true, false>(fOtherSpeedA);
 #endif
 		float massA = A->GetMass(pointposA, frictionDir);
 		float massB = B->GetMass(pointposB, frictionDir);
-		speedSum = (massB*fOtherSpeedB + massA*fOtherSpeedA)/(massB + massA);
+		speedSum = Div<true, false>(massB*fOtherSpeedB + massA*fOtherSpeedA, massB + massA);
 		if(fOtherSpeedA > speedSum){
 			impulseA = (speedSum - fOtherSpeedA) * massA;
 			impulseB = (speedSum - fOtherSpeedB) * massB;
@@ -1037,12 +1037,12 @@ CPhysical::ApplyFriction(float adhesiveLimit, CColPoint &colpoint)
 			frictionDir = vOtherSpeed;
 			frictionDir.Normalise();
 #else
-			frictionDir = vOtherSpeed * (1.0f/fOtherSpeed);
+			frictionDir = vOtherSpeed * Div<true, false>(fOtherSpeed);
 #endif
 			// not really impulse but speed
 			// maybe use ApplyFrictionMoveForce instead?
 			fImpulse = -fOtherSpeed;
-			impulseLimit = adhesiveLimit*CTimer::GetTimeStep() / m_fMass;
+			impulseLimit = Div<true, false>(adhesiveLimit*CTimer::GetTimeStep(), m_fMass);
 			if(fImpulse < -impulseLimit) fImpulse = -impulseLimit;
 			CVector vImpulse = frictionDir*fImpulse;
 			m_vecMoveFriction += CVector(vImpulse.x, vImpulse.y, 0.0f);
@@ -1060,7 +1060,7 @@ CPhysical::ApplyFriction(float adhesiveLimit, CColPoint &colpoint)
 			frictionDir = vOtherSpeed;
 			frictionDir.Normalise();
 #else
-			frictionDir = vOtherSpeed * (1.0f/fOtherSpeed);
+			frictionDir = vOtherSpeed * Invert<true, false>(fOtherSpeed);
 #endif
 			fImpulse = -fOtherSpeed * m_fMass;
 			impulseLimit = adhesiveLimit*CTimer::GetTimeStep() * 1.5;
@@ -1199,10 +1199,10 @@ CPhysical::ProcessShiftSectorList(CPtrList *lists)

 			if(CWorld::bSecondShift)
 				for(j = 0; j < numCollisions; j++)
-					shift += colpoints[j].GetNormal() * colpoints[j].GetDepth() * 1.5f / numCollisions;
+					shift += colpoints[j].GetNormal() * colpoints[j].GetDepth() * 1.5f * Invert<true, false>(numCollisions);
 			else
 				for(j = 0; j < numCollisions; j++)
-					shift += colpoints[j].GetNormal() * colpoints[j].GetDepth() * 1.2f / numCollisions;
+					shift += colpoints[j].GetNormal() * colpoints[j].GetDepth() * 1.2f * Invert<true, false>(numCollisions);

 			if(A->IsVehicle() && B->IsVehicle()){
 				CVector dir = A->GetPosition() - B->GetPosition();
@@ -1215,14 +1215,14 @@ CPhysical::ProcessShiftSectorList(CPtrList *lists)
 				float f = Min(Abs(dir.z), 0.9f);
 				dir.z = 0.0f;
 				dir.Normalise();
-				shift += dir * colpoints[mostColliding].GetDepth() / (1.0f - f);
+				shift += dir * colpoints[mostColliding].GetDepth() * Invert<true, false>(1.0f - f);
 				boat = B;
 			}else if(B->IsPed() && A->IsVehicle() && ((CVehicle*)A)->IsBoat()){
 				CVector dir = colpoints[mostColliding].GetNormal() * -1.0f;
 				float f = Min(Abs(dir.z), 0.9f);
 				dir.z = 0.0f;
 				dir.Normalise();
-				B->GetMatrix().Translate(dir * colpoints[mostColliding].GetDepth() / (1.0f - f));
+				B->GetMatrix().Translate(dir * colpoints[mostColliding].GetDepth() * Invert<true, false>(1.0f - f));
 				// BUG? how can that ever happen? A is a Ped
 				if(B->IsVehicle())
 					B->ProcessEntityCollision(A, colpoints);
@@ -1364,7 +1364,7 @@ collision:

 			DMAudio.ReportCollision(A, B, aColPoints[i].surfaceA, aColPoints[i].surfaceB, impulseA, Max(turnSpeedDiff, moveSpeedDiff));

-			if(A->ApplyFriction(B, CSurfaceTable::GetAdhesiveLimit(aColPoints[i])/numCollisions, aColPoints[i])){
+			if(A->ApplyFriction(B, Div<true, false>(CSurfaceTable::GetAdhesiveLimit(aColPoints[i]), numCollisions), aColPoints[i])){
 				A->bHasContacted = true;
 				B->bHasContacted = true;
 			}
@@ -1593,7 +1593,7 @@ CPhysical::ProcessCollisionSectorList(CPtrList *lists)

 						DMAudio.ReportCollision(A, B, aColPoints[i].surfaceA, aColPoints[i].surfaceB, imp, Max(turnSpeedDiff, moveSpeedDiff));

-						float adhesion = CSurfaceTable::GetAdhesiveLimit(aColPoints[i]) / numCollisions;
+						float adhesion = Div<true, false>(CSurfaceTable::GetAdhesiveLimit(aColPoints[i]), numCollisions);

 						if(A->GetModelIndex() == MI_RCBANDIT)
 							adhesion *= 0.2f;
@@ -1618,15 +1618,16 @@ CPhysical::ProcessCollisionSectorList(CPtrList *lists)
 				}

 				if(numResponses){
-					m_vecMoveSpeed += moveSpeed / numResponses;
-					m_vecTurnSpeed += turnSpeed / numResponses;
+					m_vecMoveSpeed += moveSpeed * Invert<true, false>(numResponses);
+					m_vecTurnSpeed += turnSpeed * Invert<true, false>(numResponses);
 					if(!CWorld::bNoMoreCollisionTorque &&
 					   A->GetStatus() == STATUS_PLAYER && A->IsVehicle() &&
 					   Abs(A->m_vecMoveSpeed.x) > 0.2f &&
 					   Abs(A->m_vecMoveSpeed.y) > 0.2f){
-						A->m_vecMoveFriction.x += moveSpeed.x * -0.3f / numCollisions;
-						A->m_vecMoveFriction.y += moveSpeed.y * -0.3f / numCollisions;
-						A->m_vecTurnFriction += turnSpeed * -0.3f / numCollisions;
+						float invNumCollisions = Invert<true, false>(numCollisions);
+						A->m_vecMoveFriction.x += moveSpeed.x * -0.3f * invNumCollisions;
+						A->m_vecMoveFriction.y += moveSpeed.y * -0.3f * invNumCollisions;
+						A->m_vecTurnFriction += turnSpeed * -0.3f * invNumCollisions;
 					}
 					return true;
 				}
@@ -1685,7 +1686,7 @@ CPhysical::ProcessCollisionSectorList(CPtrList *lists)

 						DMAudio.ReportCollision(A, B, aColPoints[i].surfaceA, aColPoints[i].surfaceB, impulseA, Max(turnSpeedDiff, moveSpeedDiff));

-						if(A->ApplyFriction(B, CSurfaceTable::GetAdhesiveLimit(aColPoints[i])/numCollisions, aColPoints[i])){
+						if(A->ApplyFriction(B, Div<true, false>(CSurfaceTable::GetAdhesiveLimit(aColPoints[i]), numCollisions), aColPoints[i])){
 							A->bHasContacted = true;
 							B->bHasContacted = true;
 						}
@@ -1854,7 +1855,7 @@ CPhysical::ProcessShift(void)
 }

 // x is the number of units (m) we would like to step
-#define NUMSTEPS(x) Ceil(Sqrt(distSq) * (1.0f/(x)))
+#define NUMSTEPS(x) Ceil(Sqrt(distSq) * (Invert<true, false>(x)))

 void
 CPhysical::ProcessCollision(void)
@@ -1898,13 +1899,13 @@ CPhysical::ProcessCollision(void)
 			n = Max(NUMSTEPS(0.2f), 2.0f);
 		else
 			n = NUMSTEPS(0.3f);
-		step = savedTimeStep / n;
+		step = Div<true, false>(savedTimeStep, n);
 	}else if(IsVehicle() && distSq >= sq(0.4f)){
 		if(GetStatus() == STATUS_PLAYER)
 			n = NUMSTEPS(0.2f);
 		else
 			n = distSq > 0.32f ? NUMSTEPS(0.3f) : NUMSTEPS(0.4f);
-		step = savedTimeStep / n;
+		step = Div<true, false>(savedTimeStep, n);
 	}else if(IsObject()){
 		int responsecase = ((CObject*)this)->m_nSpecialCollisionResponseCases;
 		if(responsecase == COLLRESPONSE_LAMPOST){
@@ -1912,29 +1913,30 @@ CPhysical::ProcessCollision(void)
 			CVector speedDown = CVector(0.0f, 0.0f, 0.0f);
 			speedUp.z = GetBoundRadius();
 			speedDown.z = -speedUp.z;
-			speedUp = Multiply3x3(GetMatrix(), speedUp);
-			speedDown = Multiply3x3(GetMatrix(), speedDown);
+			mat_load(reinterpret_cast<matrix_t *>(&GetMatrix()));
+			mat_trans_normal3(speedUp.x, speedUp.y, speedUp.z);
+			mat_trans_normal3(speedDown.x, speedDown.y, speedDown.z);	;
 			speedUp = GetSpeed(speedUp);
 			speedDown = GetSpeed(speedDown);
 			distSq = Max(speedUp.MagnitudeSqr(), speedDown.MagnitudeSqr()) * sq(CTimer::GetTimeStep());
 			if(distSq >= sq(0.3f)){
 				n = NUMSTEPS(0.3f);
-				step = savedTimeStep / n;
+				step = Div<true, false>(savedTimeStep, n);
 			}
 		}else if(responsecase == COLLRESPONSE_UNKNOWN5){
 			if(distSq >= 0.009f){
 				n = NUMSTEPS(0.09f);
-				step = savedTimeStep / n;
+				step = Div<true, false>(savedTimeStep, n);
 			}
 		}else if(responsecase == COLLRESPONSE_SMALLBOX){
 			if(distSq >= sq(0.15f)){
 				n = NUMSTEPS(0.15f);
-				step = savedTimeStep / n;
+				step = Div<true, false>(savedTimeStep, n);
 			}
 		}else if(responsecase != COLLRESPONSE_FENCEPART){
 			if(distSq >= sq(0.3f)){
 				n = NUMSTEPS(0.3f);
-				step = savedTimeStep / n;
+				step = Div<true, false>(savedTimeStep, n);
 			}
 		}
 	}
--- a/src/liberty/math/Matrix.cpp
+++ b/src/liberty/math/Matrix.cpp
@@ -1,11 +1,5 @@
 #include "common.h"

-CMatrix::CMatrix(void)
-{
-	m_attachment = nil;
-	m_hasRwMatrix = false;
-}
-
 CMatrix::CMatrix(CMatrix const &m)
 {
 	m_attachment = nil;
@@ -60,20 +54,30 @@ CMatrix::Detach(void)
 void
 CMatrix::Update(void)
 {
+#ifndef DC_SH4
 	GetRight() = m_attachment->right;
 	GetForward() = m_attachment->up;
 	GetUp() = m_attachment->at;
 	GetPosition() = m_attachment->pos;
+#else
+	mat_copy(reinterpret_cast<matrix_t*>(this),
+	         reinterpret_cast<const matrix_t*>(m_attachment));
+#endif
 }

 void
 CMatrix::UpdateRW(void)
 {
 	if (m_attachment) {
+#ifndef DC_SH4
 		m_attachment->right = GetRight();
 		m_attachment->up = GetForward();
 		m_attachment->at = GetUp();
 		m_attachment->pos = GetPosition();
+#else
+		mat_copy(reinterpret_cast<matrix_t*>(m_attachment),
+		         reinterpret_cast<const matrix_t*>(this));
+#endif
 		RwMatrixUpdate(m_attachment);
 	}
 }
@@ -81,7 +85,8 @@ CMatrix::UpdateRW(void)
 void
 CMatrix::operator=(CMatrix const &rhs)
 {
-	memcpy(this, &rhs, sizeof(f));
+	mat_copy(reinterpret_cast<matrix_t*>(this),
+	         reinterpret_cast<const matrix_t*>(&rhs));
 	if (m_attachment)
 		UpdateRW();
 }
@@ -89,7 +94,8 @@ CMatrix::operator=(CMatrix const &rhs)
 void
 CMatrix::CopyOnlyMatrix(const CMatrix &other)
 {
-	memcpy(this, &other, sizeof(f));
+	mat_copy(reinterpret_cast<matrix_t*>(this),
+	         reinterpret_cast<const matrix_t*>(&other));
 }

 CMatrix &
@@ -282,10 +288,33 @@ CMatrix::SetRotate(float xAngle, float yAngle, float zAngle)
 void
 CMatrix::RotateX(float x)
 {
-#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-	mat_load(reinterpret_cast<matrix_t *>(this));
-	mat_rotate_x(x);
-	mat_store(reinterpret_cast<matrix_t *>(this));
+#if defined(DC_SH4)
+	x *= 10430.37835f;
+
+	mat_load(reinterpret_cast<const matrix_t*>(this));
+	asm volatile(
+		"ftrc	%[a], fpul\n\t"
+		".short	0xf4fd\n\t" /* fsca dr4 */
+		"fldi0	fr8\n\t"
+		"fldi0	fr11\n\t"
+		"fmov	fr5, fr10\n\t"
+		"fmov	fr4, fr9\n\t"
+		"fneg	fr9\n\t"
+		"ftrv	xmtrx, fv8\n\t"
+		"fmov	fr4, fr6\n\t"
+		"fldi0	fr7\n\t"
+		"fldi0	fr4\n\t"
+		"ftrv	xmtrx, fv4\n\t"
+		"fschg\n\t"
+		"fmov	dr8, xd8\n\t"
+		"fmov	dr10, xd10\n\t"
+		"fmov	dr4, xd4\n\t"
+		"fmov	dr6, xd6\n\t"
+		"fschg\n"
+		:
+		: [a] "f"(x)
+		: "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+	mat_store(reinterpret_cast<matrix_t*>(this));
 #else
 	auto [s, c] = SinCos(x);

@@ -312,10 +341,33 @@ CMatrix::RotateX(float x)
 void
 CMatrix::RotateY(float y)
 {
-#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-	mat_load(reinterpret_cast<matrix_t *>(this));
-	mat_rotate_y(y);
-	mat_store(reinterpret_cast<matrix_t *>(this));
+#if defined(DC_SH4)
+	y *= 10430.37835f;
+
+	mat_load(reinterpret_cast<const matrix_t*>(this));
+	asm volatile(
+		"ftrc	%[a], fpul\n\t"
+		".short	0xf6fd\n\t" /* fsca dr6 */
+		"fldi0	fr9\n\t"
+		"fldi0	fr11\n\t"
+		"fmov	fr6, fr8\n\t"
+		"fmov	fr7, fr10\n\t"
+		"ftrv	xmtrx, fv8\n\t"
+		"fmov	fr7, fr4\n\t"
+		"fldi0	fr5\n\t"
+		"fneg	fr6\n\t"
+		"fldi0	fr7\n\t"
+		"ftrv	xmtrx, fv4\n\t"
+		"fschg\n\t"
+		"fmov	dr8, xd8\n\t"
+		"fmov	dr10, xd10\n\t"
+		"fmov	dr4, xd0\n\t"
+		"fmov	dr6, xd2\n\t"
+		"fschg\n"
+		:
+		: [a] "f"(y)
+		: "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+	mat_store(reinterpret_cast<matrix_t*>(this));
 #else
 	auto [s, c] = SinCos(y);

@@ -342,10 +394,32 @@ CMatrix::RotateY(float y)
 void
 CMatrix::RotateZ(float z)
 {
-#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-	mat_load(reinterpret_cast<matrix_t *>(this));
-	mat_rotate_z(z);
-	mat_store(reinterpret_cast<matrix_t *>(this));
+#if defined(DC_SH4)
+
+	z *= 10430.37835f;
+
+	mat_load(reinterpret_cast<const matrix_t*>(this));
+	asm volatile(
+		"ftrc	%[a], fpul\n\t"
+		".short	0xf8fd\n\t" /* fsca dr8 */
+		"fldi0	fr10\n\t"
+		"fldi0	fr11\n\t"
+		"fmov	fr8, fr5\n\t"
+		"fneg	fr8\n\t"
+		"ftrv	xmtrx, fv8\n\t"
+		"fmov	fr9, fr4\n\t"
+		"fschg\n\t"
+		"fmov	dr10, dr6\n\t"
+		"ftrv	xmtrx, fv4\n\t"
+		"fmov	dr8, xd4\n\t"
+		"fmov	dr10, xd6\n\t"
+		"fmov	dr4, xd0\n\t"
+		"fmov	dr6, xd2\n\t"
+		"fschg\n"
+		:
+		: [a] "f"(z)
+		: "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+	mat_store(reinterpret_cast<matrix_t*>(this));
 #else	
 	auto [s, c] = SinCos(z);

@@ -372,15 +446,22 @@ CMatrix::RotateZ(float z)
 void
 CMatrix::Rotate(float x, float y, float z)
 {
-#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-	mat_load(reinterpret_cast<matrix_t *>(this));
-	mat_rotate(x, y, z);
+#if defined(DC_SH4)
+	CMatrix rot;
+	rot.SetRotate(x, y, z);
+	mat_load_apply(reinterpret_cast<matrix_t *>(this),
+				   reinterpret_cast<matrix_t *>(&rot));
 	mat_store(reinterpret_cast<matrix_t *>(this));
 #else
 	auto [sX, cX] = SinCos(x);
 	auto [sY, cY] = SinCos(y);
+#ifndef DC_SH4
 	auto [sZ, cZ] = SinCos(z);
-	
+#else /* Protecting against another goddamn fast-math ICE here! */
+	float sZ, cZ;
+	fsincosr(z, &sZ, &cZ);
+#endif
+
 	float rx = this->rx;
 	float ry = this->ry;
 	float rz = this->rz;
@@ -455,70 +536,15 @@ CMatrix::Reorthogonalise(void)
 	f = CrossProduct(u, r);
 }

-#ifdef DC_SH4
-static __always_inline void MATH_Load_Matrix_Product(const matrix_t* matrix1, const matrix_t* matrix2)
-{
-    unsigned int prefetch_scratch;
-
-    asm volatile (
-        "mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
-        "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
-        "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
-        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
-        // back matrix
-        "fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
-        "fmov.d @%[bmtrx]+, XD2\n\t"
-        "fmov.d @%[bmtrx]+, XD4\n\t"
-        "fmov.d @%[bmtrx]+, XD6\n\t"
-        "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
-        "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
-        "fmov.d @%[bmtrx]+, XD10\n\t"
-        "fmov.d @%[bmtrx]+, XD12\n\t"
-        "mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
-        "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
-        "fmov.d @%[bmtrx], XD14\n\t"
-        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
-        // front matrix
-        // interleave loads and matrix multiply 4x4
-        "fmov.d @%[fmtrx]+, DR0\n\t"
-        "fmov.d @%[fmtrx]+, DR2\n\t"
-        "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
-        "ftrv XMTRX, FV0\n\t" // (FE)
-
-        "fmov.d @%[fmtrx]+, DR6\n\t"
-        "fmov.d @%[fmtrx]+, DR8\n\t"
-        "ftrv XMTRX, FV4\n\t"
-
-        "fmov.d @%[fmtrx]+, DR10\n\t"
-        "fmov.d @%[fmtrx]+, DR12\n\t"
-        "ftrv XMTRX, FV8\n\t"
-
-        "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
-        "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
-        "ftrv XMTRX, FV12\n\t" // (FE)
-        // Save output in XF regs
-        "frchg\n"
-        : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
-        : // no inputs
-        : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
-    );
-}
-#endif
-
 CMatrix
 operator*(const CMatrix &m1, const CMatrix &m2)
 {
 	// TODO: VU0 code
 	CMatrix out;
-#if defined(RW_DC) && 0 // THIS IS BROKEN, 4th element shouldn't be processed
-#	ifdef DC_SH4
-	MATH_Load_Matrix_Product(reinterpret_cast<const matrix_t *>(&m1), reinterpret_cast<const matrix_t *>(&m2));
-
-#	elif defined(RW_DC)
-	mat_load(reinterpret_cast<const matrix_t *>(&m2));
-	mat_apply(reinterpret_cast<const matrix_t *>(&m1));
-#	endif
-	mat_store(reinterpret_cast<matrix_t *>(&out));
+#ifdef RW_DC
+    mat_mult(reinterpret_cast<matrix_t *>(&out),
+             reinterpret_cast<const matrix_t *>(&m1),
+             reinterpret_cast<const matrix_t *>(&m2));
 #else
 	out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
 	out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;
--- a/src/liberty/math/Matrix.h
+++ b/src/liberty/math/Matrix.h
@@ -1,24 +1,25 @@
 #pragma once

-class CMatrix
+class alignas(8) CMatrix
 {
 public:
 	union
 	{
 		alignas(8) float f[4][4];
+		alignas(8) float m[16];
 		struct alignas(8)
 		{
-			float rx, ry, rz, rw;
-			float fx, fy, fz, fw;
-			float ux, uy, uz, uw;
-			float px, py, pz, pw;
+			float rx, ry, rz, rw=0.0f;
+			float fx, fy, fz, fw=0.0f;
+			float ux, uy, uz, uw=0.0f;
+			float px, py, pz, pw=1.0f;
 		};
 	};

-	RwMatrix *m_attachment;
-	bool m_hasRwMatrix;	// are we the owner?
+	RwMatrix *m_attachment = nil;
+	bool m_hasRwMatrix = false;	// are we the owner?

-	CMatrix(void);
+	CMatrix(void) = default;
 	CMatrix(CMatrix const &m);
 	CMatrix(RwMatrix *matrix, bool owner = false);
 	CMatrix(float scale){
--- a/src/liberty/math/Quaternion.cpp
+++ b/src/liberty/math/Quaternion.cpp
@@ -39,10 +39,9 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl
 void
 CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2)
 {
-	x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
-	y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
-	z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
-	w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+	quat_mult(reinterpret_cast<quaternion_t *>(this), 
+			  reinterpret_cast<const quaternion_t &>(q1),
+			  reinterpret_cast<const quaternion_t &>(q2));
 }

 void
@@ -50,10 +49,11 @@ CQuaternion::Get(RwV3d *axis, float *angle)
 {
 	*angle = Acos(w);
 	float s = Sin(*angle);
+	float invS = dc::Invert<true, false>(s);

-	axis->x = x * (1.0f / s);
-	axis->y = y * (1.0f / s);
-	axis->z = z * (1.0f / s);
+	axis->x = x * invS;
+	axis->y = y * invS;
+	axis->z = z * invS;
 }

 void
@@ -104,7 +104,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		w = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		x = (matrix.up.z - matrix.at.y) * m;
 		y = (matrix.at.x - matrix.right.z) * m;
 		z = (matrix.right.y - matrix.up.x) * m;
@@ -115,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		x = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		y = (matrix.up.x + matrix.right.y) * m;
 		z = (matrix.at.x + matrix.right.z) * m;
 		w = (matrix.up.z - matrix.at.y) * m;
@@ -126,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		y = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		w = (matrix.at.x - matrix.right.z) * m;
 		x = (matrix.up.x - matrix.right.y) * m;
 		z = (matrix.at.y + matrix.up.z) * m;
@@ -136,7 +136,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	f = matrix.at.z - (matrix.up.y + matrix.right.x);
 	s = Sqrt(f + 1.0f);
 	z = 0.5f * s;
-	m = 0.5f / s;
+	m = Div<true, false>(0.5f, s);
 	w = (matrix.right.y - matrix.up.x) * m;
 	x = (matrix.at.x + matrix.right.z) * m;
 	y = (matrix.at.y + matrix.up.z) * m;
@@ -151,8 +151,7 @@ CQuaternion::Get(float *f1, float *f2, float *f3)
 	*f3 = Atan2(matrix.right.y, matrix.up.y);
 	if (*f3 < 0.0f)
 		*f3 += TWOPI;
-	float s = Sin(*f3);
-	float c = Cos(*f3);
+	auto [s, c] = SinCos(*f3);
 	*f1 = Atan2(-matrix.at.y, s * matrix.right.y + c * matrix.up.y);
 	if (*f1 < 0.0f)
 		*f1 += TWOPI;
@@ -164,12 +163,10 @@ CQuaternion::Get(float *f1, float *f2, float *f3)
 void
 CQuaternion::Set(float f1, float f2, float f3)
 {
-	float c1 = Cos(f1 * 0.5f);
-	float c2 = Cos(f2 * 0.5f);
-	float c3 = Cos(f3 * 0.5f);
-	float s1 = Sin(f1 * 0.5f);
-	float s2 = Sin(f2 * 0.5f);
-	float s3 = Sin(f3 * 0.5f);
+	auto [s1, c1] = SinCos(f1 * 0.5f);
+	auto [s2, c2] = SinCos(f2 * 0.5f);
+	auto [s3, c3] = SinCos(f3 * 0.5f);
+
 	x = ((c2 * c1) * s3) - ((s2 * s1) * c3);
 	y = ((s1 * c2) * c3) + ((s2 * c1) * s3);
 	z = ((s2 * c1) * c3) - ((s1 * c2) * s3);
--- a/src/liberty/math/Quaternion.h
+++ b/src/liberty/math/Quaternion.h
@@ -1,11 +1,11 @@
 #pragma once

 // TODO: actually implement this
-class CQuaternion
+class alignas(8) CQuaternion
 {
 public:
 	float x, y, z, w;
-	CQuaternion(void) {}
+	CQuaternion(void) = default;
 	CQuaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}

 	float Magnitude(void) const { return Sqrt(MagnitudeSqr()); }
@@ -49,10 +49,11 @@ public:
 	}

 	const CQuaternion &operator/=(float right) {
-		x /= right;
-		y /= right;
-		z /= right;
-		w /= right;
+		float inv = dc::Invert(right);
+		x *= inv;
+		y *= inv;
+		z *= inv;
+		w *= inv;
 		return *this;
 	}

@@ -72,7 +73,11 @@ public:
 inline float
 DotProduct(const CQuaternion &q1, const CQuaternion &q2)
 {
+#ifndef DC_SH4
 	return q1.x*q2.x + q1.y*q2.y + q1.z*q2.z + q1.w*q2.w;
+#else
+	return fipr(q1.x, q1.y, q1.z, q1.w, q2.x, q2.y, q2.z, q2.w);
+#endif
 }

 inline CQuaternion operator+(const CQuaternion &left, const CQuaternion &right)
@@ -97,5 +102,6 @@ inline CQuaternion operator*(float left, const CQuaternion &right)

 inline CQuaternion operator/(const CQuaternion &left, float right)
 {
-	return CQuaternion(left.x / right, left.y / right, left.z / right, left.w / right);
+	right = dc::Invert(right);
+	return CQuaternion(left.x * right, left.y * right, left.z * right, left.w * right);
 }
--- a/src/liberty/math/Vector.cpp
+++ b/src/liberty/math/Vector.cpp
@@ -28,17 +28,11 @@ CVector
 Multiply3x3(const CMatrix &mat, const CVector &vec)
 {
 #ifdef DC_SH4
-    register float __x __asm__("fr12") = vec.x;
-    register float __y __asm__("fr13") = vec.y;
-    register float __z __asm__("fr14") = vec.z;
-    register float __w __asm__("fr15") = 0.0f;
-	
+	CVector out;
 	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
-
-	asm volatile( "ftrv  xmtrx, fv12\n"
-                : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w)
-                : "0" (__x), "1" (__y), "2" (__z), "3" (__w) );
-    return { __x, __y, __z };
+	mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
+							out.x, out.y, out.z);
+	return out;
 #else
 	// TODO: VU0 code
 	return CVector(mat.rx * vec.x + mat.fx * vec.y + mat.ux * vec.z,
@@ -50,9 +44,12 @@ Multiply3x3(const CMatrix &mat, const CVector &vec)
 CVector
 Multiply3x3(const CVector &vec, const CMatrix &mat)
 {
-	return CVector(mat.rx * vec.x + mat.ry * vec.y + mat.rz * vec.z,
-	               mat.fx * vec.x + mat.fy * vec.y + mat.fz * vec.z,
-	               mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
+	CVector out;
+	mat_load(reinterpret_cast<const matrix_t*>(&mat));
+	mat_transpose();
+	mat_trans_vec3_nomod(vec.x, vec.y, vec.z,
+				         out.x, out.y, out.z);
+	return out;
 }

 CVector
--- a/src/liberty/math/Vector.h
+++ b/src/liberty/math/Vector.h
@@ -3,7 +3,7 @@
 class CVector : public RwV3d
 {
 public:
-	CVector(void){}
+	CVector(void) = default;
 	CVector(float x, float y, float z)
 	{
 		this->x = x;
@@ -20,7 +20,7 @@ public:
  // (0,1,0) means no rotation. So get right vector and its atan
  __always_inline float Heading(void) const { return Atan2(-x, y); }
  __always_inline float Magnitude(void) const { 
-#ifdef DC_SH4
+#if defined(DC_SH4) && 0  /* Probably not a gain! */
    float w;
    vec3f_length(x, y, z, w);
    return w;
@@ -29,7 +29,7 @@ public:
 #endif
 }
  __always_inline float MagnitudeSqr(void) const { 
-#ifdef DC_SH4
+#if defined(DC_SH4) && 0 /* Not a gain, shitty FIPR codegen! */
    return fipr_magnitude_sqr(x, y,z, 0.0f); 
 #else
    return x*x + y*y + z*z;
@@ -112,13 +112,14 @@ inline CVector operator*(float left, const CVector &right)

 inline CVector operator/(const CVector &left, float right)
 {
-  return CVector(left.x / right, left.y / right, left.z / right);
+  right = Invert(right);
+  return CVector(left.x * right, left.y * right, left.z * right);
 }

 __always_inline float
 DotProduct(const CVector &v1, const CVector &v2)
 {
-#ifdef DC_SH4
+#if defined(DC_SH4) && 0 /* Not a gain, FIPR sucks ass with codegen! */
  return fipr(v1.x, v1.y, v1.z, 0.0f, v2.x, v2.y, v2.z, 0.0f);
 #else
  return v1.x*v2.x + v1.y*v2.y + v1.z*v2.z;
@@ -131,7 +132,7 @@ inline float
 Distance(const CVector &v1, const CVector &v2)
 {
  float w;
-#ifdef DC_SH4
+#if defined(DC_SH4) && 0 /* Probably not a gain! */
  vec3f_distance(v1.x, v1.y, v1.z, v2.x, v2.y, v2.z, w);
  return w;
 #else
--- a/src/liberty/math/VuVector.h
+++ b/src/liberty/math/VuVector.h
@@ -80,7 +80,6 @@ __always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const Rw

 __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const RwV3d *in, int stride)
 {
-	assert(false);
 #ifdef GTA_PS2
 	__asm__ __volatile__("\n\
 		paddub	$3,%4,$0\n\
@@ -110,9 +109,11 @@ __always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat,
 		": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
 #elif defined(DC_SH4)
    mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
-	mat_transform(reinterpret_cast<vector_t *>(const_cast<RwV3d *>(in)),
-	              reinterpret_cast<vector_t *>(out), 
-				  n, stride - sizeof(vector_t));
+	while(n--) {
+		mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
+		in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);
+		++out;
+	}
 #else
 	while(n--){
 		*out = mat * *in;
--- a/src/liberty/math/maths.h
+++ b/src/liberty/math/maths.h
@@ -1,73 +1,6 @@
 #pragma once

 #include "src/common_defines.h"
+#include "rwdc_common.h"

-#include <tuple>
-#include <dc/matrix.h>
-
-#ifdef DC_SH4
-
-#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 1.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
-    } while(false)
-
-#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
-        register float __x __asm__("fr12") = (x); \
-        register float __y __asm__("fr13") = (y); \
-        register float __z __asm__("fr14") = (z); \
-        register float __w __asm__("fr15") = 1.0f; \
-        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
-                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
-                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
-        w = __w; \
-    } while(false)
-
-__always_inline float Fmac(float a, float b, float c) {
-    asm volatile ("fmac fr0, %[floatb], %[floatc]\n"
-        : [floatc] "+f" (c) : "w" (a), [floatb] "f" (b) : );
-    return c;
-}
-
-#else
-
-#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
-		vector_t tmp = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp, &tmp, 1, 0); \
-		x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; w2 = tmp.w; \
-	} while(false)
-
-#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
-		vector_t tmp = { x_, y_, z_, 1.0f }; \
-		mat_transform(&tmp, &tmp, 1, 0); \
-		w_ = tmp.w; \
-	} while(false)
-
-__always_inline float Fmac(float a, float b, float c) { return a * b + c; }
-
-#endif
-
-__always_inline float Sin(float x) { return __builtin_sinf(x); }
-__always_inline float Cos(float x) { return __builtin_cosf(x); }
-__always_inline auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
-__always_inline float Tan(float x) { return __builtin_tanf(x); }
-__always_inline float Abs(float x) { return __builtin_fabsf(x); }
-__always_inline float Sqrt(float x) { return __builtin_sqrtf(x); }
-__always_inline float RecipSqrt(float x) { return 1.0f / __builtin_sqrtf(x); }
-__always_inline float Asin(float x) { return __builtin_asinf(x); }
-__always_inline float Acos(float x) { return __builtin_acosf(x); }
-__always_inline float Atan(float x) { return __builtin_atanf(x); }
-__always_inline float Atan2(float y, float x) { return __builtin_atan2f(y, x); }
-__always_inline float RecipSqrt(float x, float y) { return x / __builtin_sqrtf(y); /*y = RecipSqrt(y); return x * y * y;*/ }
-__always_inline float Pow(float x, float y) { return __builtin_powf(x, y); }
-__always_inline float Floor(float x) { return __builtin_floorf(x); }
-__always_inline float Ceil(float x) { return __builtin_ceilf(x); }
-__always_inline float Invert(float x) { return (((x) < 0.0f)? -1.0f : 1.0f) * RecipSqrt((x) * (x)); } 
-__always_inline float Div(float x, float y) { return x * Invert(y); }
-__always_inline float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); }
+using namespace dc;
--- a/src/liberty/renderer/Coronas.cpp
+++ b/src/liberty/renderer/Coronas.cpp
@@ -482,12 +482,12 @@ CCoronas::RenderReflections(void)
 						float fadeDistance = drawDist / 2.0f;
 						float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - (spriteCoors.z - fadeDistance)/fadeDistance;
 						distanceFade = Clamp(distanceFade, 0.0f, 1.0f);
-						float recipz = 1.0f/RwCameraGetNearClipPlane(Scene.camera);
+						float recipz = Invert<true, false>(RwCameraGetNearClipPlane(Scene.camera));
 						float heightFade = (20.0f - aCoronas[i].heightAboveRoad)/20.0f;
 						int intensity = distanceFade*heightFade * 230.0 * CWeather::WetRoads;

 						CSprite::RenderBufferedOneXLUSprite(
-#ifdef FIX_BUGS
+#if defined(FIX_BUGS) && 1
 							spriteCoors.x, spriteCoors.y, spriteCoors.z,
 #else
 							spriteCoors.x, spriteCoors.y, RwIm2DGetNearScreenZ(),
@@ -608,13 +608,16 @@ CEntity::ProcessLightsForEntity(void)
 	flashTimer3 = 0;

 	n = CModelInfo::GetModelInfo(GetModelIndex())->GetNum2dEffects();
+	mat_load(reinterpret_cast<const matrix_t*>(&GetMatrix()));
 	for(i = 0; i < n; i++, flashTimer1 += 0x80, flashTimer2 += 0x100, flashTimer3 += 0x200){
 		effect = CModelInfo::GetModelInfo(GetModelIndex())->Get2dEffect(i);

 		if(effect->type != EFFECT_LIGHT)
 			continue;

-		pos = GetMatrix() * effect->pos;
+		//pos = GetMatrix() * effect->pos;
+		mat_trans_single3_nodiv_nomod(effect->pos.x, effect->pos.y, effect->pos.z,
+									  pos.x, pos.y, pos.z);

 		lightOn = false;
 		lightFlickering = false;
--- a/src/miami/collision/ColLine.h
+++ b/src/miami/collision/ColLine.h
@@ -1,14 +1,14 @@
 #pragma once

-struct CColLine
+struct alignas(8) CColLine
 {
 	// NB: this has to be compatible with two CVuVectors
-	CVector p0;
-	int pad0;
-	CVector p1;
-	int pad1;
+	alignas(8) CVector p0;
+	float pad0 = 0.0f;
+	alignas(8) CVector p1;
+	float pad1 = 0.0f;

-	CColLine(void) { };
+	CColLine(void) = default;
 	CColLine(const CVector &p0, const CVector &p1) { this->p0 = p0; this->p1 = p1; };
 	void Set(const CVector &p0, const CVector &p1);
 };
--- a/src/miami/collision/ColPoint.h
+++ b/src/miami/collision/ColPoint.h
@@ -1,12 +1,12 @@
 #pragma once

-struct CColPoint
+struct alignas(8) CColPoint
 {
-	CVector point;
-	int pad1;
+	alignas(8) CVector point;
+	float pad1 = 0.0f;
 	// the surface normal on the surface of point
-	CVector normal;
-	int pad2;
+	alignas(8) CVector normal;
+	float pad2 = 0.0f;
 	uint8 surfaceA;
 	uint8 pieceA;
 	uint8 surfaceB;
--- a/src/miami/collision/Collision.cpp
+++ b/src/miami/collision/Collision.cpp
@@ -617,12 +617,13 @@ bool
 CCollision::ProcessSphereSphere(const CColSphere &s1, const CColSphere &s2, CColPoint &point, float &mindistsq)
 {
 	CVector dist = s1.center - s2.center;
-	float d = dist.Magnitude() - s2.radius;	// distance from s1's center to s2
+	float mag = dist.Magnitude();
+	float d = mag - s2.radius;	// distance from s1's center to s2
 	float depth = s1.radius - d;	// sphere overlap
 	if(d < 0.0f) d = 0.0f;		// clamp to zero, i.e. if s1's center is inside s2
 	// no collision if sphere is not close enough
 	if(d*d < mindistsq && d < s1.radius){
-		dist.Normalise();
+		dist *= Invert<true, false>(mag);
 		point.point = s1.center - dist*d;
 		point.normal = dist;
 #ifndef VU_COLLISION
--- a/src/miami/math/Quaternion.cpp
+++ b/src/miami/math/Quaternion.cpp
@@ -39,10 +39,9 @@ CQuaternion::Slerp(const CQuaternion &q1, const CQuaternion &q2, float theta, fl
 void
 CQuaternion::Multiply(const CQuaternion &q1, const CQuaternion &q2)
 {
-	x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
-	y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
-	z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
-	w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+	quat_mult(reinterpret_cast<quaternion_t *>(this), 
+			  reinterpret_cast<const quaternion_t &>(q1),
+			  reinterpret_cast<const quaternion_t &>(q2));
 }

 void
@@ -50,10 +49,11 @@ CQuaternion::Get(RwV3d *axis, float *angle)
 {
 	*angle = Acos(w);
 	float s = Sin(*angle);
+	float invS = dc::Invert<true, false>(s);

-	axis->x = x * (1.0f / s);
-	axis->y = y * (1.0f / s);
-	axis->z = z * (1.0f / s);
+	axis->x = x * invS;
+	axis->y = y * invS;
+	axis->z = z * invS;
 }

 void
@@ -104,7 +104,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		w = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		x = (matrix.up.z - matrix.at.y) * m;
 		y = (matrix.at.x - matrix.right.z) * m;
 		z = (matrix.right.y - matrix.up.x) * m;
@@ -115,7 +115,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		x = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		y = (matrix.up.x + matrix.right.y) * m;
 		z = (matrix.at.x + matrix.right.z) * m;
 		w = (matrix.up.z - matrix.at.y) * m;
@@ -126,7 +126,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	if (f >= 0.0f) {
 		s = Sqrt(f + 1.0f);
 		y = 0.5f * s;
-		m = 0.5f / s;
+		m = Div<true, false>(0.5f, s);
 		w = (matrix.at.x - matrix.right.z) * m;
 		x = (matrix.up.x - matrix.right.y) * m;
 		z = (matrix.at.y + matrix.up.z) * m;
@@ -136,7 +136,7 @@ CQuaternion::Set(const RwMatrix &matrix)
 	f = matrix.at.z - (matrix.up.y + matrix.right.x);
 	s = Sqrt(f + 1.0f);
 	z = 0.5f * s;
-	m = 0.5f / s;
+	m = Div<true, false>(0.5f, s);
 	w = (matrix.right.y - matrix.up.x) * m;
 	x = (matrix.at.x + matrix.right.z) * m;
 	y = (matrix.at.y + matrix.up.z) * m;
@@ -151,8 +151,7 @@ CQuaternion::Get(float *f1, float *f2, float *f3)
 	*f3 = Atan2(matrix.right.y, matrix.up.y);
 	if (*f3 < 0.0f)
 		*f3 += TWOPI;
-	float s = Sin(*f3);
-	float c = Cos(*f3);
+	auto [s, c] = SinCos(*f3);
 	*f1 = Atan2(-matrix.at.y, s * matrix.right.y + c * matrix.up.y);
 	if (*f1 < 0.0f)
 		*f1 += TWOPI;
@@ -164,12 +163,10 @@ CQuaternion::Get(float *f1, float *f2, float *f3)
 void
 CQuaternion::Set(float f1, float f2, float f3)
 {
-	float c1 = Cos(f1 * 0.5f);
-	float c2 = Cos(f2 * 0.5f);
-	float c3 = Cos(f3 * 0.5f);
-	float s1 = Sin(f1 * 0.5f);
-	float s2 = Sin(f2 * 0.5f);
-	float s3 = Sin(f3 * 0.5f);
+	auto [s1, c1] = SinCos(f1 * 0.5f);
+	auto [s2, c2] = SinCos(f2 * 0.5f);
+	auto [s3, c3] = SinCos(f3 * 0.5f);
+
 	x = ((c2 * c1) * s3) - ((s2 * s1) * c3);
 	y = ((s1 * c2) * c3) + ((s2 * c1) * s3);
 	z = ((s2 * c1) * c3) - ((s1 * c2) * s3);
--- a/vendor/dca3-kos
+++ b/vendor/dca3-kos
--- a/vendor/koshle/hlematrix3d.cpp
+++ b/vendor/koshle/hlematrix3d.cpp
@@ -21,18 +21,18 @@ matrix_t XMTRX = {
    { 0.0f, 0.0f, 0.0f, 0.0f }
 };

-void mat_identity() {
+void (mat_identity)() {
    XMTRX[0][0] = 1.0f; XMTRX[1][0] = 0.0f; XMTRX[2][0] = 0.0f; XMTRX[3][0] = 0.0f;
    XMTRX[0][1] = 0.0f; XMTRX[1][1] = 1.0f; XMTRX[2][1] = 0.0f; XMTRX[3][1] = 0.0f;
    XMTRX[0][2] = 0.0f; XMTRX[1][2] = 0.0f; XMTRX[2][2] = 1.0f; XMTRX[3][2] = 0.0f;
    XMTRX[0][3] = 0.0f; XMTRX[1][3] = 0.0f; XMTRX[2][3] = 0.0f; XMTRX[3][3] = 1.0f;    
 }

-void mat_load(const matrix_t *mat) {
+void (mat_load)(const matrix_t *mat) {
    memcpy(XMTRX, mat, sizeof(matrix_t));
 }

-void mat_store(matrix_t *mat) {
+void (mat_store)(matrix_t *mat) {
    memcpy(mat, XMTRX, sizeof(matrix_t));
 }

@@ -47,13 +47,13 @@ static void mat_mult(matrix_t dst, const matrix_t &src1, const matrix_t &src2) {
    }
 }

-void mat_apply(const matrix_t* mat) {
+void (mat_apply)(const matrix_t* mat) {
    matrix_t result;
    mat_mult(result, *mat, XMTRX);
    mat_load(&result);
 }

-void mat_transform(vector_t *invecs, vector_t *outvecs, int veccnt, int vecskip) {
+void (mat_transform)(vector_t *invecs, vector_t *outvecs, int veccnt, int vecskip) {
    for(unsigned v = 0; v < veccnt; ++v) {
        auto offset = v * (sizeof(vector_t) + vecskip);
        auto *in    = reinterpret_cast<vector_t *>(reinterpret_cast<uint8_t *>(invecs) + offset);
--- a/vendor/librw/src/base.cpp
+++ b/vendor/librw/src/base.cpp
@@ -23,6 +23,8 @@
 #include "rwengine.h"
 #include "fcaseopen.h"

+using namespace dc;
+
 namespace rw {

 #define PLUGIN_ID 0
@@ -45,12 +47,17 @@ int32 build = 0xFFFF;
 bool32 streamAppendFrames = 0;
 char *debugFile = nil;

-static Matrix identMat = {
-	{ 1.0f, 0.0f, 0.0f }, Matrix::IDENTITY|Matrix::TYPEORTHONORMAL,
-	{ 0.0f, 1.0f, 0.0f }, 0,
-	{ 0.0f, 0.0f, 1.0f }, 0,
-	{ 0.0f, 0.0f, 0.0f }, 0
-};
+static Matrix identMat = {{
+    .right 	= { 1.0f, 0.0f, 0.0f }, 
+    .flags 	= Matrix::IDENTITY|Matrix::TYPEORTHONORMAL,
+	.pad0   = 0,
+    .up 	= { 0.0f, 1.0f, 0.0f }, 
+    .upw 	= 0.0f,
+    .at 	= { 0.0f, 0.0f, 1.0f }, 
+    .atw 	= 0.0f,
+    .pos 	= { 0.0f, 0.0f, 0.0f }, 
+    .posw 	= 1.0f
+}};

 // lazy implementation
 int
@@ -86,16 +93,6 @@ strncmp_ci(const char *s1, const char *s2, int n)
 	return 0;
 }

-Quat
-mult(const Quat &q, const Quat &p)
-{
-	return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
-	                q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
-	                q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
-	                q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
-}
-
-
 Quat*
 Quat::rotate(const V3d *axis, float32 angle, CombineOp op)
 {
@@ -133,18 +130,21 @@ lerp(const Quat &q, const Quat &p, float32 r)
 Quat
 slerp(const Quat &q, const Quat &p, float32 a)
 {
-	float32 c;
 	Quat q1 = q;
-	c = dot(q1, p);
+	float c = dot(q1, p);
 	if(c < 0.0f){
 		c = -c;
 		q1 = negate(q1);
 	}
 	float32 phi = acosf(c);
 	if(phi > 0.00001f){
-		float32 s = sinf(phi);
-		return add(scale(q1, sinf((1.0f-a)*phi)/s),
-		           scale(p,  sinf(a*phi)/s));
+#ifndef DC_SH4 
+		float32 s = dc::Invert(sinf(phi));
+#else /* Can't use builtin here, ICEs! */
+		float32 s = dc::Invert(fsin(phi));
+#endif
+		return add(scale(q1, sinf((1.0f-a)*phi)*s),
+		           scale(p,  sinf(a*phi)*s));
 	}
 	return q1;
 }
@@ -165,6 +165,7 @@ void
 V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m)
 {
 	int32 i;
+#ifndef DC_SH4
 	V3d tmp;
 	for(i = 0; i < n; i++){
 		tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x + m->pos.x;
@@ -172,12 +173,21 @@ V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m)
 		tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z + m->pos.z;
 		out[i] = tmp;
 	}
+#else
+    mat_load(reinterpret_cast<const matrix_t*>(m));
+	//rw_mat_load_4x4(m);
+    for(i = 0; i < n; i++) {
+        mat_trans_single3_nodiv_nomod(in[i].x, in[i].y, in[i].z,
+                                      out[i].x, out[i].y, out[i].z);
+	}
+#endif
 }

 void
 V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m)
 {
 	int32 i;
+#ifndef DC_SH4
 	V3d tmp;
 	for(i = 0; i < n; i++){
 		tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x;
@@ -185,6 +195,14 @@ V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m)
 		tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z;
 		out[i] = tmp;
 	}
+#else
+    //mat_load_3x3(reinterpret_cast<const matrix_t *>(m));
+    mat_load(reinterpret_cast<const matrix_t*>(m));
+	for(i = 0; i < n; i++) {
+        mat_trans_vec3_nomod(in[i].x, in[i].y, in[i].z,
+                             out[i].x, out[i].y, out[i].z);
+	}
+#endif
 }

 //
@@ -194,27 +212,15 @@ V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m)
 void
 RawMatrix::mult(RawMatrix *dst, RawMatrix *src1, RawMatrix *src2)
 {
-	dst->right.x = src1->right.x*src2->right.x + src1->right.y*src2->up.x + src1->right.z*src2->at.x + src1->rightw*src2->pos.x;
-	dst->right.y = src1->right.x*src2->right.y + src1->right.y*src2->up.y + src1->right.z*src2->at.y + src1->rightw*src2->pos.y;
-	dst->right.z = src1->right.x*src2->right.z + src1->right.y*src2->up.z + src1->right.z*src2->at.z + src1->rightw*src2->pos.z;
-	dst->rightw  = src1->right.x*src2->rightw  + src1->right.y*src2->upw  + src1->right.z*src2->atw  + src1->rightw*src2->posw;
-	dst->up.x    = src1->up.x*src2->right.x    + src1->up.y*src2->up.x    + src1->up.z*src2->at.x + src1->upw*src2->pos.x;
-	dst->up.y    = src1->up.x*src2->right.y    + src1->up.y*src2->up.y    + src1->up.z*src2->at.y + src1->upw*src2->pos.y;
-	dst->up.z    = src1->up.x*src2->right.z    + src1->up.y*src2->up.z    + src1->up.z*src2->at.z + src1->upw*src2->pos.z;
-	dst->upw     = src1->up.x*src2->rightw     + src1->up.y*src2->upw     + src1->up.z*src2->atw  + src1->upw*src2->posw;
-	dst->at.x    = src1->at.x*src2->right.x    + src1->at.y*src2->up.x    + src1->at.z*src2->at.x + src1->atw*src2->pos.x;
-	dst->at.y    = src1->at.x*src2->right.y    + src1->at.y*src2->up.y    + src1->at.z*src2->at.y + src1->atw*src2->pos.y;
-	dst->at.z    = src1->at.x*src2->right.z    + src1->at.y*src2->up.z    + src1->at.z*src2->at.z + src1->atw*src2->pos.z;
-	dst->atw     = src1->at.x*src2->rightw     + src1->at.y*src2->upw     + src1->at.z*src2->atw  + src1->atw*src2->posw;
-	dst->pos.x   = src1->pos.x*src2->right.x   + src1->pos.y*src2->up.x   + src1->pos.z*src2->at.x + src1->posw*src2->pos.x;
-	dst->pos.y   = src1->pos.x*src2->right.y   + src1->pos.y*src2->up.y   + src1->pos.z*src2->at.y + src1->posw*src2->pos.y;
-	dst->pos.z   = src1->pos.x*src2->right.z   + src1->pos.y*src2->up.z   + src1->pos.z*src2->at.z + src1->posw*src2->pos.z;
-	dst->posw    = src1->pos.x*src2->rightw    + src1->pos.y*src2->upw    + src1->pos.z*src2->atw  + src1->posw*src2->posw;
+    mat_mult(reinterpret_cast<matrix_t *>(dst), 
+             reinterpret_cast<const matrix_t *>(src2), 
+             reinterpret_cast<const matrix_t *>(src1));
 }

 void
 RawMatrix::transpose(RawMatrix *dst, RawMatrix *src)
 {
+#ifndef DC_SH4
 	dst->right.x = src->right.x;
 	dst->up.x = src->right.y;
 	dst->at.x = src->right.z;
@@ -231,17 +237,22 @@ RawMatrix::transpose(RawMatrix *dst, RawMatrix *src)
 	dst->upw = src->pos.y;
 	dst->atw = src->pos.z;
 	dst->posw = src->posw;
+#else
+	mat_load(reinterpret_cast<const matrix_t*>(src));
+	mat_transpose();
+	mat_store(reinterpret_cast<matrix_t*>(dst));
+#endif
 }

 void
 RawMatrix::setIdentity(RawMatrix *dst)
 {
-	static RawMatrix identity = {
+	static RawMatrix identity = {{
 		{ 1.0f, 0.0f, 0.0f }, 0.0f,
 		{ 0.0f, 1.0f, 0.0f }, 0.0f,
 		{ 0.0f, 0.0f, 1.0f }, 0.0f,
 		{ 0.0f, 0.0f, 0.0f }, 1.0f
-	};
+	}};
 	*dst = identity;
 }

@@ -302,8 +313,9 @@ Matrix::mult(Matrix *dst, const Matrix *src1, const Matrix *src2)
 	else if(src2->flags & IDENTITY)
 		*dst = *src1;
 	else{
+		uint8_t flags = src1->flags & src2->flags;
 		mult_(dst, src1, src2);
-		dst->flags = src1->flags & src2->flags;
+		dst->flags = flags;
 	}
 	return dst;
 }
@@ -324,6 +336,7 @@ Matrix::invert(Matrix *dst, const Matrix *src)
 Matrix*
 Matrix::transpose(Matrix *dst, const Matrix *src)
 {
+#ifndef DC_SH4
 	if(src->flags & IDENTITY)
 		*dst = *src;
 	dst->right.x = src->right.x;
@@ -338,25 +351,29 @@ Matrix::transpose(Matrix *dst, const Matrix *src)
 	dst->pos.x = 0.0;
 	dst->pos.y = 0.0;
 	dst->pos.z = 0.0;
+#else
+	mat_load(reinterpret_cast<const matrix_t*>(src));
+	mat_transpose();
+	mat_store(reinterpret_cast<matrix_t*>(dst));
+#endif
 	return dst;
 }

 Matrix*
 Matrix::rotate(const V3d *axis, float32 angle, CombineOp op)
 {
-	Matrix tmp, rot;
-	makeRotation(&rot, axis, angle);
+	Matrix rot;
 	switch(op){
 	case COMBINEREPLACE:
-		*this = rot;
+		makeRotation(this, axis, angle);
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &rot, this);
-		*this = tmp;
+		makeRotation(&rot, axis, angle);
+		mult(this, &rot, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &rot);
-		*this = tmp;
+		makeRotation(&rot, axis, angle);
+		mult(this, this, &rot);
 		break;
 	}
 	return this;
@@ -365,19 +382,18 @@ Matrix::rotate(const V3d *axis, float32 angle, CombineOp op)
 Matrix*
 Matrix::rotate(const Quat &q, CombineOp op)
 {
-	Matrix tmp, rot;
-	makeRotation(&rot, q);
+	Matrix rot;
 	switch(op){
 	case COMBINEREPLACE:
-		*this = rot;
+		makeRotation(this, q);
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &rot, this);
-		*this = tmp;
+		makeRotation(&rot, q);
+		mult(this, &rot, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &rot);
-		*this = tmp;
+		makeRotation(&rot, q);
+		mult(this, this, &rot);
 		break;
 	}
 	return this;
@@ -385,21 +401,24 @@ Matrix::rotate(const Quat &q, CombineOp op)
 Matrix*
 Matrix::translate(const V3d *translation, CombineOp op)
 {
-	Matrix tmp;
-	Matrix trans = identMat;
-	trans.pos = *translation;
-	trans.flags &= ~IDENTITY;
+	Matrix trans;
 	switch(op){
 	case COMBINEREPLACE:
-		*this = trans;
+		*this = identMat;
+		this->pos = *translation;
+		this->flags &= ~IDENTITY;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &trans, this);
-		*this = tmp;
+		trans = identMat;
+		trans.pos = *translation;
+		trans.flags &= ~IDENTITY;
+		mult(this, &trans, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &trans);
-		*this = tmp;
+		trans = identMat;
+		trans.pos = *translation;
+		trans.flags &= ~IDENTITY;
+		mult(this, this, &trans);
 		break;
 	}
 	return this;
@@ -408,23 +427,30 @@ Matrix::translate(const V3d *translation, CombineOp op)
 Matrix*
 Matrix::scale(const V3d *scale, CombineOp op)
 {
-	Matrix tmp;
-	Matrix scl = identMat;
-	scl.right.x = scale->x;
-	scl.up.y = scale->y;
-	scl.at.z = scale->z;
-	scl.flags &= ~IDENTITY;
+	Matrix scl;
 	switch(op){
 	case COMBINEREPLACE:
-		*this = scl;
+		*this = identMat;
+		this->right.x = scale->x;
+		this->up.y = scale->y;
+		this->at.z = scale->z;
+		this->flags &= ~IDENTITY;	
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &scl, this);
-		*this = tmp;
+		scl = identMat;
+		scl.right.x = scale->x;
+		scl.up.y = scale->y;
+		scl.at.z = scale->z;
+		scl.flags &= ~IDENTITY;
+		mult(this, &scl, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &scl);
-		*this = tmp;
+		scl = identMat;
+		scl.right.x = scale->x;
+		scl.up.y = scale->y;
+		scl.at.z = scale->z;
+		scl.flags &= ~IDENTITY;
+		mult(this, this, &scl);
 		break;
 	}
 	return this;
@@ -433,18 +459,15 @@ Matrix::scale(const V3d *scale, CombineOp op)
 Matrix*
 Matrix::transform(const Matrix *mat, CombineOp op)
 {
-	Matrix tmp;
 	switch(op){
 	case COMBINEREPLACE:
 		*this = *mat;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, mat, this);
-		*this = tmp;
+		mult(this, mat, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, mat);
-		*this = tmp;
+		mult(this, this, mat);
 		break;
 	}
 	return this;
@@ -456,29 +479,34 @@ Matrix::getRotation(void)
 	Quat q = { 0.0f, 0.0f, 0.0f, 1.0f };
 	float32 tr = right.x + up.y + at.z;
 	float s;
+	float invs;
 	if(tr > 0.0f){
 		s = sqrtf(1.0f + tr) * 2.0f;
+		invs = Invert<true, false>(s);
 		q.w = s / 4.0f;
-		q.x = (up.z - at.y) / s;
-		q.y = (at.x - right.z) / s;
-		q.z = (right.y - up.x) / s;
+		q.x = (up.z - at.y) * invs;
+		q.y = (at.x - right.z) * invs;
+		q.z = (right.y - up.x) * invs;
 	}else if(right.x > up.y && right.x > at.z){
 		s = sqrtf(1.0f + right.x - up.y - at.z) * 2.0f;
-		q.w = (up.z - at.y) / s;
+		invs = Invert<true, false>(s);
+		q.w = (up.z - at.y) * invs;
 		q.x = s / 4.0f;
-		q.y = (up.x + right.y) / s;
-		q.z = (at.x + right.z) / s;
+		q.y = (up.x + right.y) * invs;
+		q.z = (at.x + right.z) * invs;
 	}else if(up.y > at.z){
 		s = sqrtf(1.0f + up.y - right.x - at.z) * 2.0f;
-		q.w = (at.x - right.z) / s;
-		q.x = (up.x + right.y) / s;
+		invs = Invert<true, false>(s);
+		q.w = (at.x - right.z) * invs;
+		q.x = (up.x + right.y) * invs;
 		q.y = s / 4.0f;
-		q.z = (at.y + up.z) / s;
+		q.z = (at.y + up.z) * invs;
 	}else{
 		s = sqrtf(1.0f + at.z - right.x - up.y) * 2.0f;
-		q.w = (right.y - up.x) / s;
-		q.x = (at.x + right.z) / s;
-		q.y = (at.y + up.z) / s;
+		invs = Invert<true, false>(s);
+		q.w = (right.y - up.x) * invs;
+		q.x = (at.x + right.z) * invs;
+		q.y = (at.y + up.z) * invs;
 		q.z = s / 4.0f;
 	}
 	return q;
@@ -499,35 +527,27 @@ Matrix::lookAt(const V3d &dir, const V3d &up)
 * i.e. a vector is first xformed by src1, then by src2
 */
 void
-Matrix::mult_(Matrix *__restrict__ dst, const Matrix *__restrict__ src1, const Matrix *__restrict__ src2)
+Matrix::mult_(Matrix *dst, const Matrix *src1, const Matrix *src2)
 {
-	#if !defined(DC_TEXCONV) && !defined(DC_SIM)
-	dst->right.x = fipr(src1->right.x, src1->right.y,  src1->right.z, 0, 		  src2->right.x, src2->up.x, src2->at.x, 0);
-	dst->right.y = fipr(src1->right.x, src1->right.y,  src1->right.z, 0, 		  src2->right.y, src2->up.y, src2->at.y, 0);
-	dst->right.z = fipr(src1->right.x, src1->right.y,  src1->right.z, 0, 		  src2->right.z, src2->up.z, src2->at.z, 0);
-	dst->up.x    = fipr(src1->up.x,    src1->up.y,  src1->up.z, 0, 				  src2->right.x, src2->up.x, src2->at.x, 0);
-	dst->up.y    = fipr(src1->up.x,    src1->up.y,  src1->up.z, 0, 				  src2->right.y, src2->up.y, src2->at.y, 0);
-	dst->up.z    = fipr(src1->up.x,    src1->up.y,  src1->up.z, 0, 				  src2->right.z, src2->up.z, src2->at.z, 0);
-	dst->at.x    = fipr(src1->at.x,    src1->at.y,  src1->at.z, 0, 				  src2->right.x, src2->up.x, src2->at.x, 0);
-	dst->at.y    = fipr(src1->at.x,    src1->at.y,  src1->at.z, 0, 				  src2->right.y, src2->up.y, src2->at.y, 0);
-	dst->at.z    = fipr(src1->at.x,    src1->at.y,  src1->at.z, 0, 				  src2->right.z, src2->up.z, src2->at.z, 0);
-	dst->pos.x   = fipr(src1->pos.x,   src1->pos.y,  src1->pos.z, 1, 	  		  src2->right.x, src2->up.x, src2->at.x, src2->pos.x);
-	dst->pos.y   = fipr(src1->pos.x,   src1->pos.y,  src1->pos.z, 1, 	  	 	  src2->right.y, src2->up.y, src2->at.y, src2->pos.y);
-	dst->pos.z   = fipr(src1->pos.x,   src1->pos.y,  src1->pos.z, 1, 	  		  src2->right.z, src2->up.z, src2->at.z, src2->pos.z);
-	#else
-	dst->right.x = src1->right.x*src2->right.x + src1->right.y*src2->up.x + src1->right.z*src2->at.x;
-	dst->right.y = src1->right.x*src2->right.y + src1->right.y*src2->up.y + src1->right.z*src2->at.y;
-	dst->right.z = src1->right.x*src2->right.z + src1->right.y*src2->up.z + src1->right.z*src2->at.z;
-	dst->up.x    = src1->up.x*src2->right.x    + src1->up.y*src2->up.x    + src1->up.z*src2->at.x;
-	dst->up.y    = src1->up.x*src2->right.y    + src1->up.y*src2->up.y    + src1->up.z*src2->at.y;
-	dst->up.z    = src1->up.x*src2->right.z    + src1->up.y*src2->up.z    + src1->up.z*src2->at.z;
-	dst->at.x    = src1->at.x*src2->right.x    + src1->at.y*src2->up.x    + src1->at.z*src2->at.x;
-	dst->at.y    = src1->at.x*src2->right.y    + src1->at.y*src2->up.y    + src1->at.z*src2->at.y;
-	dst->at.z    = src1->at.x*src2->right.z    + src1->at.y*src2->up.z    + src1->at.z*src2->at.z;
-	dst->pos.x   = src1->pos.x*src2->right.x   + src1->pos.y*src2->up.x   + src1->pos.z*src2->at.x + src2->pos.x;
-	dst->pos.y   = src1->pos.x*src2->right.y   + src1->pos.y*src2->up.y   + src1->pos.z*src2->at.y + src2->pos.y;
-	dst->pos.z   = src1->pos.x*src2->right.z   + src1->pos.y*src2->up.z   + src1->pos.z*src2->at.z + src2->pos.z;
-	#endif
+
+#ifdef RW_DC
+	mat_mult(reinterpret_cast<matrix_t *>(dst),
+			 reinterpret_cast<const matrix_t *>(src2),
+			 reinterpret_cast<const matrix_t *>(src1));
+#else
+    dst->right.x = src1->right.x*src2->right.x + src1->right.y*src2->up.x + src1->right.z*src2->at.x;
+    dst->right.y = src1->right.x*src2->right.y + src1->right.y*src2->up.y + src1->right.z*src2->at.y;
+    dst->right.z = src1->right.x*src2->right.z + src1->right.y*src2->up.z + src1->right.z*src2->at.z;
+    dst->up.x    = src1->up.x*src2->right.x    + src1->up.y*src2->up.x    + src1->up.z*src2->at.x;
+    dst->up.y    = src1->up.x*src2->right.y    + src1->up.y*src2->up.y    + src1->up.z*src2->at.y;
+    dst->up.z    = src1->up.x*src2->right.z    + src1->up.y*src2->up.z    + src1->up.z*src2->at.z;
+    dst->at.x    = src1->at.x*src2->right.x    + src1->at.y*src2->up.x    + src1->at.z*src2->at.x;
+    dst->at.y    = src1->at.x*src2->right.y    + src1->at.y*src2->up.y    + src1->at.z*src2->at.y;
+    dst->at.z    = src1->at.x*src2->right.z    + src1->at.y*src2->up.z    + src1->at.z*src2->at.z;
+    dst->pos.x   = src1->pos.x*src2->right.x   + src1->pos.y*src2->up.x   + src1->pos.z*src2->at.x + src2->pos.x;
+    dst->pos.y   = src1->pos.x*src2->right.y   + src1->pos.y*src2->up.y   + src1->pos.z*src2->at.y + src2->pos.y;
+    dst->pos.z   = src1->pos.x*src2->right.z   + src1->pos.y*src2->up.z   + src1->pos.z*src2->at.z + src2->pos.z;
+#endif
 }

 void
@@ -536,12 +556,16 @@ Matrix::invertOrthonormal(Matrix *dst, const Matrix *src)
 	dst->right.x = src->right.x;
 	dst->right.y = src->up.x;
 	dst->right.z = src->at.x;
+	dst->flags = TYPEORTHONORMAL;
+	dst->pad0 = 0;
 	dst->up.x = src->right.y;
 	dst->up.y = src->up.y;
 	dst->up.z = src->at.y;
+	dst->upw = 0.0f;
 	dst->at.x = src->right.z;
 	dst->at.y = src->up.z;
 	dst->at.z = src->at.z;
+	dst->atw = 0.0f;
 	dst->pos.x = -(src->pos.x*src->right.x +
 	               src->pos.y*src->right.y +
 	               src->pos.z*src->right.z);
@@ -551,7 +575,7 @@ Matrix::invertOrthonormal(Matrix *dst, const Matrix *src)
 	dst->pos.z = -(src->pos.x*src->at.x +
 	               src->pos.y*src->at.y +
 	               src->pos.z*src->at.z);
-	dst->flags = TYPEORTHONORMAL;
+	dst->posw = 1.0f;
 }

 Matrix*
@@ -566,7 +590,7 @@ Matrix::invertGeneral(Matrix *dst, const Matrix *src)
 	det = src->up.x * dst->right.y + src->at.x * dst->right.z + dst->right.x * src->right.x;
 	invdet = 1.0;
 	if(det != 0.0f)
-		invdet = 1.0f/det;
+		invdet = Invert<false>(det);
 	dst->right.x *= invdet;
 	dst->right.y *= invdet;
 	dst->right.z *= invdet;
@@ -580,6 +604,10 @@ Matrix::invertGeneral(Matrix *dst, const Matrix *src)
 	dst->pos.y = -(src->pos.x*dst->right.y + src->pos.y*dst->up.y + src->pos.z*dst->at.y);
 	dst->pos.z = -(src->pos.x*dst->right.z + src->pos.y*dst->up.z + src->pos.z*dst->at.z);
 	dst->flags &= ~IDENTITY;
+	dst->pad0 = 0;
+	dst->upw = 0.0f;
+	dst->atw = 0.0f;
+	dst->posw = 1.0f;
 	return dst;
 }

@@ -1035,66 +1063,6 @@ StreamMemory::getLength(void)
 	return this->length;
 }

-#if defined(DC_SH4) && 0 // Disabled for now because it is broken
-
-#include <kos/fs.h>
-
-StreamFile*
-StreamFile::open(const char *path, const char *mode)
-{
-	assert(this->fd < 0);
-	this->fd = fs_open(path, mode[0] == 'r' ? O_RDONLY : O_WRONLY);
-	if(this->fd < 0){
-		RWERROR((ERR_FILE, path));
-		return nil;
-	}
-	return this;
-}
-
-void
-StreamFile::close(void)
-{
-	assert(this->fd < 0);
-	fs_close(this->fd);
-	this->fd = -1;
-}
-
-uint32
-StreamFile::write8(const void *data, uint32 length)
-{
-	auto rv = fs_write(this->fd, data, length);
-	assert(rv == length);
-	return (uint32)rv;
-}
-
-uint32
-StreamFile::read8(void *data, uint32 length)
-{
-	auto rv = fs_read(this->fd, data, length);
-	assert(rv == length);
-	return (uint32)rv;
-}
-
-void
-StreamFile::seek(int32 offset, int32 whence)
-{
-	fs_seek(this->fd, offset, whence);
-}
-
-uint32
-StreamFile::tell(void)
-{
-	return fs_tell(this->fd);
-}
-
-bool
-StreamFile::eof(void)
-{
-	return fs_total(this->fd) == fs_tell(this->fd);
-}
-
-#else
-
 StreamFile*
 StreamFile::open(const char *path, const char *mode)
 {
@@ -1148,7 +1116,6 @@ StreamFile::eof(void)
 {
 	return ( feof(this->file) != 0 );
 }
-#endif

 uint8 *
 StreamFile::mmap(uint32 len)
--- a/vendor/librw/src/dc/rwdc.cpp
+++ b/vendor/librw/src/dc/rwdc.cpp
--- a/vendor/librw/src/dc/rwdc_common.h
+++ b/vendor/librw/src/dc/rwdc_common.h
@@ -0,0 +1,613 @@
+#ifndef __RWDC_COMMON_H
+#define __RWDC_COMMON_H
+
+#include <tuple>
+#include <limits>
+#include <cmath>
+#include <type_traits>
+#include <algorithm>
+#include <cstring>
+
+#ifndef RW_DC
+#   pragma warning(disable: 4244)  // int to float
+#   pragma warning(disable: 4800)  // int to bool
+#   pragma warning(disable: 4838)  // narrowing conversion
+#   pragma warning(disable: 4996)  // POSIX names
+#else
+#   if !defined(DC_TEXCONV) && !defined(DC_SIM)
+#       include <kos.h>
+#       define DC_SH4
+#       define VIDEO_MODE_WIDTH     static_cast<float>(vid_mode->width)
+#       define VIDEO_MODE_HEIGHT    static_cast<float>(vid_mode->height)
+#       define memcpy4              memcpy
+#   else
+#       ifdef DC_TEXCONV
+#           define malloc_stats()
+#       endif
+#       include <dc/matrix.h>
+#       include <dc/pvr.h>
+#       include <kos/dbglog.h>
+#       define VIDEO_MODE_WIDTH     640.0f
+#       define VIDEO_MODE_HEIGHT    480.0f
+#       define memcpy4              memcpy
+#       define dcache_pref_block(a)	__builtin_prefetch(a)
+#       define F_PI                 M_PI
+#       ifndef __always_inline 
+#           define __always_inline  inline
+#       endif
+#   endif
+#   ifdef PVR_TXRFMT_STRIDE
+#       undef PVR_TXRFMT_STRIDE
+#       define PVR_TXRFMT_STRIDE (1 << 25)
+#   endif
+    static_assert(PVR_TXRFMT_STRIDE == (1 << 25), 
+                  "PVR_TXRFMT_STRIDE is bugged in your KOS version");
+#endif
+
+#define F_PI_2          (F_PI * 0.5f)
+#define __hot           __attribute__((hot))
+#define __cold          __attribute__((cold))
+
+#define STRINGIFY(x)    #x
+#define STR(x)          STRINGIFY(x)
+#define CONCAT_(x,y)    x##y
+#define CONCAT(x,y)     CONCAT_(x,y)
+
+namespace rw {
+    class Matrix;
+}
+
+namespace dc {
+
+struct quaternion_t {
+    float x, y, z, w;
+};
+
+__always_inline __hot constexpr float Sin(float x) { return sinf(x); }
+__always_inline __hot constexpr float Cos(float x) { return cosf(x); }
+__always_inline __hot constexpr auto  SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
+__always_inline __hot constexpr float Abs(float x) { return fabsf(x); }
+__always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); }
+__always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); }
+__always_inline __hot constexpr float Pow(float x, float y) { return powf(x, y); }
+__always_inline __hot constexpr float Floor(float x) { return floorf(x); }
+__always_inline __hot constexpr float Ceil(float x) { return ceilf(x); }
+__always_inline __hot constexpr float Fmac(auto a, auto b, auto c) { return a * b + c; }
+__always_inline __hot constexpr float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); }
+__always_inline __hot constexpr auto Max(auto a, auto b) { return ((a > b)? a : b); }
+__always_inline __hot constexpr auto Min(auto a, auto b) { return ((a < b)? a : b); }
+
+template<bool CHECK_ZERO=false>
+__always_inline __hot constexpr float RecipSqrt(float x) { 
+    if constexpr(CHECK_ZERO)
+        if(x == 0.0f) 
+            return 0.0f;
+    
+    return 1.0f / Sqrt(x); 
+}
+
+template<typename T>
+__always_inline __hot constexpr T Clamp(T v, auto low, auto high) {
+    return std::clamp(v, static_cast<T>(low), static_cast<T>(high));
+}
+
+__always_inline constexpr auto Clamp2(auto v, auto center, auto radius) {
+    return (v > center) ? Min(v, center + radius) : Max(v, center - radius);
+}
+
+template<bool FAST_APPROX=true, bool COPY_SIGN=true>
+__always_inline __hot constexpr float Invert(float x) { 
+    float value;
+    
+    if(!std::is_constant_evaluated() && FAST_APPROX) {
+        value = RecipSqrt(x * x);
+
+        if constexpr(COPY_SIGN)
+            if(x < 0.0f)
+                value = -value;
+
+    } else value = 1.0f / x;
+
+    return value;
+} 
+
+template<bool FAST_APPROX=true, bool COPY_SIGN=true>
+__always_inline __hot constexpr float Div(float x, float y) { 
+    if(FAST_APPROX && !std::is_constant_evaluated())
+        return x * Invert<true, COPY_SIGN>(y);
+    else
+        return x / y;
+}
+
+template<bool FAST_APPROX=false, bool COPY_SIGN=true>
+__always_inline __hot constexpr auto Norm(auto value, auto min, auto max) {
+    auto numerator = Clamp(value, min, max) - min;
+    auto denominator = (max - min);
+
+    if(FAST_APPROX && !std::is_constant_evaluated())
+        return Div<true, COPY_SIGN>(numerator, denominator);
+    else
+        return numerator / denominator;
+}
+
+template<bool FAST_APPROX=false, bool FAST_DIV=false, bool DIV_COPY_SIGN=false>
+__always_inline __hot constexpr float Tan(float x) { 
+    if(!std::is_constant_evaluated() && FAST_APPROX) {
+        constexpr float pisqby4 = 2.4674011002723397f;
+        constexpr float adjpisqby4 = 2.471688400562703f;
+        constexpr float adj1minus8bypisq = 0.189759681063053f;
+        float xsq = x * x;
+        
+        return x * Div<FAST_DIV, DIV_COPY_SIGN>(adjpisqby4 - adj1minus8bypisq * xsq, 
+                                                pisqby4 - xsq);
+    } else
+        return tanf(x); 
+}
+
+template<bool FAST_APPROX=false>
+__always_inline __hot constexpr float Atan(float x) { 
+    if(FAST_APPROX && !std::is_constant_evaluated()) {
+        constexpr float a[3] = { // 
+            0.998418889819911f, -2.9993501171084700E-01f, 0.0869142852883849f};
+        float xx = x * x;
+        return ((a[2] * xx + a[1]) * xx + a[0]) * x;
+    } else return atanf(x); 
+}
+
+template<bool FAST_APPROX=false>
+__hot constexpr float Atan2(float y, float x) {
+    if(FAST_APPROX && !std::is_constant_evaluated()) {
+#if 0
+        constexpr float halfpi_i754 = M_PI * 0.5f;
+        constexpr float quarterpi_i754 = M_PI * 0.25f;
+        // kludge to prevent 0/0 condition
+        float abs_y = Abs(y) + std::numeric_limits<float>::epsilon();
+        float absy_plus_absx = abs_y + Abs(x);
+        float inv_absy_plus_absx = Invert<true, true>(absy_plus_absx);
+        float angle = halfpi_i754 - copysignf(quarterpi_i754, x);
+        float r = (x - copysignf(abs_y, x)) * inv_absy_plus_absx;
+        angle += (0.1963f * r * r - 0.9817f) * r;
+        return copysignf(angle, y);
+#else
+    // Ensure input is in [-1, +1]
+    bool swap = fabs(x) < fabs(y);
+    float atan_input = (swap ? x : y) / (swap ? y : x);
+
+    // Approximate atan
+    float res = Atan<true>(atan_input);
+
+    // If swapped, adjust atan output
+    res = swap ? (atan_input >= 0.0f ? F_PI_2 : -F_PI_2) - res : res;
+    // Adjust quadrants
+    if      (x >= 0.0f && y >= 0.0f) {}                     // 1st quadrant
+    else if (x <  0.0f && y >= 0.0f) { res =  F_PI + res; } // 2nd quadrant
+    else if (x <  0.0f && y <  0.0f) { res = -F_PI + res; } // 3rd quadrant
+    else if (x >= 0.0f && y <  0.0f) {}                     // 4th quadrant
+
+    // Store result
+    return res;
+#endif
+    } else return atan2f(y, x);
+}
+
+template<bool FAST_APPROX=false>
+__always_inline __hot constexpr float Asin(float x) { 
+    if(FAST_APPROX && !std::is_constant_evaluated()) {
+        Atan(Div<true, false>(x, Sqrt(1.0f-(x*x))));
+    } else return asinf(x); 
+}
+
+template<bool FAST_APPROX=false>
+__always_inline __hot constexpr float Acos(float x) {
+    if(FAST_APPROX && !std::is_constant_evaluated()) {
+         return (-0.69813170079773212f * x * x - 0.87266462599716477f) * x + 1.5707963267948966f;
+    } else return acosf(x); 
+}
+
+#ifdef RW_DC
+#   ifdef DC_SH4
+
+#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
+        register float __x __asm__("fr12") = (x); \
+        register float __y __asm__("fr13") = (y); \
+        register float __z __asm__("fr14") = (z); \
+        register float __w __asm__("fr15") = 1.0f; \
+        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
+                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
+        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
+    } while(false)
+
+#define mat_trans_nodiv_nomod_zerow(x, y, z, x2, y2, z2, w2) do { \
+        register float __x __asm__("fr12") = (x); \
+        register float __y __asm__("fr13") = (y); \
+        register float __z __asm__("fr14") = (z); \
+        register float __w __asm__("fr15") = 0.0f; \
+        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
+                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
+        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
+    } while(false)
+
+#define mat_trans_w_nodiv_nomod(x, y, z, w) do { \
+        register float __x __asm__("fr12") = (x); \
+        register float __y __asm__("fr13") = (y); \
+        register float __z __asm__("fr14") = (z); \
+        register float __w __asm__("fr15") = 1.0f; \
+        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
+                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
+        w = __w; \
+    } while(false)
+
+#define mat_trans_vec3(x, y, z) do { \
+        register float __x __asm__("fr12") = (x); \
+        register float __y __asm__("fr13") = (y); \
+        register float __z __asm__("fr14") = (z); \
+        __asm__ __volatile__( \
+                              "fldi0 fr15\n" \
+                              "ftrv  xmtrx, fv12\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z) \
+                              : "0" (__x), "1" (__y), "2" (__z) \
+                              : "fr15" ); \
+        x = __x; y = __y; z = __z; \
+    } while(false)
+
+
+#define mat_trans_vec3_nomod(x, y, z, x2, y2, z2) { \
+        register float __x __asm__("fr12") = (x); \
+        register float __y __asm__("fr13") = (y); \
+        register float __z __asm__("fr14") = (z); \
+        __asm__ __volatile__( \
+                              "fldi0 fr15\n" \
+                              "ftrv  xmtrx, fv12\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z) \
+                              : "0" (__x), "1" (__y), "2" (__z) \
+                              : "fr15" ); \
+        x2 = __x; y2 = __y; z2 = __z; \
+    }
+
+__always_inline __hot void mat_transpose(void) {
+    asm volatile (
+        "frchg\n\t" // fmov for singles only works on front bank
+        // FR0, FR5, FR10, and FR15 are already in place
+        // swap FR1 and FR4
+        "flds FR1, FPUL\n\t"
+        "fmov FR4, FR1\n\t"
+        "fsts FPUL, FR4\n\t"
+        // swap FR2 and FR8
+        "flds FR2, FPUL\n\t"
+        "fmov FR8, FR2\n\t"
+        "fsts FPUL, FR8\n\t"
+        // swap FR3 and FR12
+        "flds FR3, FPUL\n\t"
+        "fmov FR12, FR3\n\t"
+        "fsts FPUL, FR12\n\t"
+        // swap FR6 and FR9
+        "flds FR6, FPUL\n\t"
+        "fmov FR9, FR6\n\t"
+        "fsts FPUL, FR9\n\t"
+        // swap FR7 and FR13
+        "flds FR7, FPUL\n\t"
+        "fmov FR13, FR7\n\t"
+        "fsts FPUL, FR13\n\t"
+        // swap FR11 and FR14
+        "flds FR11, FPUL\n\t"
+        "fmov FR14, FR11\n\t"
+        "fsts FPUL, FR14\n\t"
+        // restore XMTRX to back bank
+        "frchg\n"
+        : // no outputs
+        : // no inputs
+        : "fpul" // clobbers
+    );
+}
+
+
+template<bool FAST_APPROX=true>
+__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
+    if(FAST_APPROX && !std::is_constant_evaluated()) {
+    /*
+        // reorder the coefficients so that q1 stays in constant order {x,y,z,w}
+        // q2 then needs to be rotated after each inner product
+        x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
+        y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
+        z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
+        w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
+    */
+        // keep q1 in fv4
+        register float q1x __asm__ ("fr4") = (q1.x);
+        register float q1y __asm__ ("fr5") = (q1.y);
+        register float q1z __asm__ ("fr6") = (q1.z);
+        register float q1w __asm__ ("fr7") = (q1.w);
+
+        // load q2 into fv8, use it to get the shuffled reorder into fv0
+        register float q2x __asm__ ("fr8")  = (q2.x);
+        register float q2y __asm__ ("fr9")  = (q2.y);
+        register float q2z __asm__ ("fr10") = (q2.z);
+        register float q2w __asm__ ("fr11") = (q2.w);
+
+        // temporary operand / result in fv0
+        register float t1x __asm__ ("fr0");
+        register float t1y __asm__ ("fr1");
+        register float t1z __asm__ ("fr2");
+        register float t1w __asm__ ("fr3");
+
+        // x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
+        t1x = q2w;
+        t1y = q2z;
+        t1z = -q2y;
+        t1w = q2w;
+        __asm__ ("\n"
+            " fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        // x = t1w;  try to avoid the stall by not reading the fipr result immediately
+
+        // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
+        t1x = -q2z;
+        t1y = q2w;
+        t1z = q2x;
+        __atomic_thread_fence(1);
+        r->x = t1w;   // get previous result
+        t1w = q2y;
+        __asm__ ("\n"
+            "	fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        //y = t1w;
+
+        // z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
+        t1x = q2y;
+        t1y = -q2x;
+        t1z = q2w;
+        __atomic_thread_fence(1);
+        r->y = t1w;   // get previous result
+        t1w = q2z;
+        __asm__ ("\n"
+            "	fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        //z = t1w;
+        __atomic_thread_fence(1);
+
+        // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
+        q2x = -q2x;
+        q2y = -q2y;
+        q2z = -q2z;
+        __asm__ ("\n"
+            "	fipr	fv4,fv8\n"
+            : "+f" (q2w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (q2x), "f" (q2y), "f" (q2z)
+        );
+
+        __atomic_thread_fence(1);
+        r->z = t1w;
+        __atomic_thread_fence(1);
+        r->w = q2w;
+    } else {
+        r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
+	    r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
+	    r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
+	    r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+    }
+}
+
+__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
+    unsigned int prefetch_scratch;
+
+    asm volatile (
+        "mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
+        "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
+        "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
+        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
+        // back matrix
+        "fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
+        "fmov.d @%[bmtrx]+, XD2\n\t"
+        "fmov.d @%[bmtrx]+, XD4\n\t"
+        "fmov.d @%[bmtrx]+, XD6\n\t"
+        "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
+        "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
+        "fmov.d @%[bmtrx]+, XD10\n\t"
+        "fmov.d @%[bmtrx]+, XD12\n\t"
+        "mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
+        "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
+        "fmov.d @%[bmtrx], XD14\n\t"
+        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
+        // front matrix
+        // interleave loads and matrix multiply 4x4
+        "fmov.d @%[fmtrx]+, DR0\n\t"
+        "fmov.d @%[fmtrx]+, DR2\n\t"
+        "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
+        "ftrv XMTRX, FV0\n\t" // (FE)
+
+        "fmov.d @%[fmtrx]+, DR6\n\t"
+        "fmov.d @%[fmtrx]+, DR8\n\t"
+        "ftrv XMTRX, FV4\n\t"
+
+        "fmov.d @%[fmtrx]+, DR10\n\t"
+        "fmov.d @%[fmtrx]+, DR12\n\t"
+        "ftrv XMTRX, FV8\n\t"
+
+        "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
+        "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
+        "ftrv XMTRX, FV12\n\t" // (FE)
+        // Save output in XF regs
+        "frchg\n"
+        : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
+        : // no inputs
+        : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
+    );
+}
+
+#   else
+#       ifdef DC_TEXCONV
+#           define mat_transform(a, b, c, d)
+#           define mat_apply(a)
+#           define mat_load(a)
+#           define mat_store(a)
+#           define mat_identity(a)
+#           define pvr_fog_table_color(a,r,g,b)
+#           define pvr_fog_table_linear(s,e)
+#       endif
+
+#define mat_trans_vec3(x_, y_, z_) do { \
+        vector_t tmp = { x_, y_, z_, 0.0f }; \
+        mat_transform(&tmp, &tmp, 1, 0); \
+        x_ = tmp.x; y_ = tmp.y; z_ = tmp.z; \
+    } while(false)
+
+#define mat_trans_vec3_nomod(x_, y_, z_, x2, y2, z2) do { \
+        vector_t tmp = { x_, y_, z_, 0.0f }; \
+        mat_transform(&tmp, &tmp, 1, 0); \
+        x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; \
+    } while(false)
+
+#define mat_trans_single3_nomod(x_, y_, z_, x2, y2, z2) do { \
+		vector_t tmp = { x_, y_, z_, 1.0f }; \
+		mat_transform(&tmp, &tmp, 1, 0); \
+		z2 = 1.0f / tmp.w; \
+		x2 = tmp.x * z2; \
+		y2 = tmp.y * z2; \
+	} while(false)
+
+#define mat_trans_single3_nodiv_nomod(x_, y_, z_, x2, y2, z2) do { \
+		vector_t tmp = { x_, y_, z_, 1.0f }; \
+		mat_transform(&tmp, &tmp, 1, 0); \
+		z2 = tmp.z; \
+		x2 = tmp.x; \
+		y2 = tmp.y; \
+	} while(false)
+
+#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
+		vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
+		mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
+		x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
+	} while(false)
+
+#define mat_trans_nodiv_nomod_zerow(x_, y_, z_, x2, y2, z2, w2) do { \
+		vector_t tmp1233123 = { x_, y_, z_, 0.0f }; \
+		mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
+		x2 = tmp1233123.x; y2 = tmp1233123.y; z2 = tmp1233123.z; w2 = tmp1233123.w; \
+	} while(false)
+
+#define mat_trans_w_nodiv_nomod(x_, y_, z_, w_) do { \
+		vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
+		mat_transform(&tmp1233123, &tmp1233123, 1, 0); \
+		w_ = tmp1233123.w; \
+	} while(false)
+
+inline void mat_transpose(void) {
+    matrix_t tmp;
+
+    for(unsigned i = 0; i < 4; ++i)
+        for(unsigned j = 0; j < 4; ++j)
+            tmp[j][i] = XMTRX[i][j];
+    
+    mat_load(&tmp);
+}
+
+template<bool FAST_APPROX=true>
+__hot inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
+    r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
+    r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
+    r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
+    r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+}
+
+__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
+    mat_load(matrix1);
+    mat_apply(matrix2);
+}
+
+#endif
+
+__hot inline void mat_mult(matrix_t *out, const matrix_t* matrix1, const matrix_t* matrix2) {
+    mat_load_apply(matrix1, matrix2);
+    mat_store(out);
+}
+
+__always_inline __hot void mat_copy(matrix_t *dst, const matrix_t *src) {
+    mat_load(src);
+    mat_store(dst);
+}
+
+#if defined(DC_SH4)
+inline uint8_t* OCRAM = (uint8_t*)0x7c001000;
+#else
+alignas(32) inline uint8_t OCRAM[32 * 256];
+#endif
+
+inline void ocram_enter() {
+	#if defined(DC_SH4)
+	auto mask = irq_disable();
+	dcache_purge_all();
+	volatile uint32_t * CCN_CCR = (uint32_t *)0xFF00001C;
+	*CCN_CCR |= (1 << 5); // enable OCR (ORA)
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+
+	irq_restore(mask);
+	#endif
+}
+
+inline void ocram_leave() {
+	#if defined(DC_SH4)
+	auto mask = irq_disable();
+	dcache_inval_range(0x92000000, 8192);
+	dcache_purge_all();
+	volatile uint32_t * CCN_CCR = (uint32_t *)0xFF00001C;
+	*CCN_CCR &= ~( 1 << 5); // disable OCR (ORA)
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	__asm__ __volatile__ ("nop");
+	irq_restore(mask);
+	#endif
+}
+
+
+#endif
+
+}
+
+#endif /* RWDC_COMMON_H */
--- a/vendor/librw/src/engine.cpp
+++ b/vendor/librw/src/engine.cpp
@@ -64,13 +64,14 @@ malloc_managed(size_t sz, uint32 hint)
 	MemoryBlock *mem;

 	if(sz == 0) return nil;
+	bool align64 = !!(hint & ID_MATRIX);
 	origPtr = malloc(sz + sizeof(MemoryBlock) + 15);
 	if(origPtr == nil)
 		return nil;
 	totalMemoryAllocated += sz;
 	data = (uint8*)origPtr;
 	data += sizeof(MemoryBlock);
-	data = (uint8*)ALIGN16((uintptr)data);
+	data = (uint8*) ALIGN16((uintptr)data);
 	mem = (MemoryBlock*)(data-sizeof(MemoryBlock));

 	mem->sz = sz;
--- a/vendor/librw/src/rwbase.h
+++ b/vendor/librw/src/rwbase.h
@@ -10,6 +10,8 @@
 #define M_PI 3.14159265358979323846
 #endif

+#include "rwdc_common.h"
+
 // TODO: clean up the opengl defines
 //       and figure out what we even want here...
 #ifdef RW_GL3
@@ -305,21 +307,45 @@ inline Quat scale(const Quat &q, float32 r) { return makeQuat(q.w*r, q.x*r, q.y*
 inline float32 length(const Quat &q) { return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z); }
 inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); }
 inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); }
-Quat mult(const Quat &q, const Quat &p);
+inline Quat mult(const Quat &q, const Quat &p) {
+	Quat r;
+	dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&r),
+	              reinterpret_cast<const dc::quaternion_t &>(q),
+		     	  reinterpret_cast<const dc::quaternion_t &>(p));
+	return r;
+}
 inline V3d rotate(const V3d &v, const Quat &q) { return mult(mult(q, makeQuat(0.0f, v)), conj(q)).vec(); }
 Quat lerp(const Quat &q, const Quat &p, float32 r);
 Quat slerp(const Quat &q, const Quat &p, float32 a);

-struct __attribute__((aligned(8))) RawMatrix 
+struct alignas(8) RawMatrixBase 
 {
 	V3d right;
-	float32 rightw;
+	float32 rightw = 0.0f;
 	V3d up;
-	float32 upw;
+	float32 upw = 0.0f;
 	V3d at;
-	float32 atw;
+	float32 atw = 0.0f;
 	V3d pos;
-	float32 posw;
+	float32 posw = 1.0f;
+};
+
+struct RawMatrix : public RawMatrixBase
+{
+	RawMatrix() = default;
+
+	RawMatrix(RawMatrixBase &&aggregate):
+		RawMatrixBase{aggregate}
+	{}
+
+	RawMatrix(const RawMatrix& rhs) {
+		*this = rhs;
+	}
+
+	RawMatrix &operator=(const RawMatrix& rhs) {
+		dc::mat_copy(reinterpret_cast<matrix_t*>(this), reinterpret_cast<const matrix_t*>(&rhs));
+		return *this;
+	}

 	// NB: this is dst = src2*src1, i.e. src1 is applied first, then src2
 	static void mult(RawMatrix *dst, RawMatrix *src1, RawMatrix *src2);
@@ -327,7 +353,7 @@ struct __attribute__((aligned(8))) RawMatrix
 	static void setIdentity(RawMatrix *dst);
 };

-struct Matrix
+struct alignas(8) MatrixBase
 {
 	enum Type {
 		TYPENORMAL	= 1,
@@ -336,7 +362,8 @@ struct Matrix
 		TYPEMASK = 3
 	};
 	enum Flags {
-		IDENTITY = 0x20000
+		IDENTITY = 0x4,
+		IDENTITY_OLD = 0x20000
 	};
 	struct Tolerance {
 		float32 normal;
@@ -344,14 +371,47 @@ struct Matrix
 		float32 identity;
 	};

-	V3d right;
-	uint32 flags;
-	V3d up;
-	uint32 pad1;
-	V3d at;
-	uint32 pad2;
-	V3d pos;
-	uint32 pad3;
+	V3d right = { 1.0f, 0.0f, 0.0f };
+	union {
+		struct {
+			uint32 flags: 3 = TYPEORTHONORMAL|IDENTITY;
+			uint32_t pad0: 29 = 0;
+		};
+		float  rightw;
+	};
+	V3d up = { 0.0f, 1.0f, 0.0f };
+	union {
+		uint32 pad1;
+		float  upw = 0.0f;
+	};
+	V3d at = { 0.0f, 0.0f, 1.0f };
+	union {
+		uint32 pad2;
+		float  atw = 0.0f;
+	};
+	V3d pos = { 0.0f, 0.0f, 0.0f };
+	union {
+		uint32 pad3;
+		float  posw = 1.0f;
+	};
+};
+
+struct Matrix: public MatrixBase
+{
+	Matrix() = default;
+	
+	Matrix(MatrixBase &&aggregate):
+		MatrixBase{aggregate}
+	{}
+
+	Matrix(const Matrix& rhs) {
+		*this = rhs;
+	}
+
+	Matrix &operator=(const Matrix& rhs) {
+		dc::mat_copy(reinterpret_cast<matrix_t*>(this), reinterpret_cast<const matrix_t*>(&rhs));
+		return *this;
+	}

 	static Matrix *create(void);
 	void destroy(void);
--- a/vendor/librw/src/skin.cpp
+++ b/vendor/librw/src/skin.cpp
@@ -224,6 +224,7 @@ writeSkin(Stream *stream, int32 len, void *object, int32 offset, int32)
 	for(int32 i = 0; i < skin->numBones; i++){
 		if(oldFormat)
 			stream->writeU32(0xdeaddead);
+
 		stream->write32(&skin->inverseMatrices[i*16], 64);
 	}
Author	SHA1	Message	Date
Falco Girgis	2ade727c46	Saving state.	2025-03-27 15:17:11 -05:00
Falco Girgis	c02781fe81	Merge remote-tracking branch 'origin/main' into falco/sh4_math_collision_physics	2025-03-17 18:52:34 -05:00
Falco Girgis	446d899c31	Removec uneccessary 64-byte align for RwMatrix.	2025-03-15 13:20:58 -05:00
Falco Girgis	e0b6ac6ab7	Fixed rw::Matrix inversion routines for 4x4s.	2025-03-15 00:27:15 -05:00
Falco Girgis	6ce11f0006	SKINNING IS FIXED!!!	2025-03-14 23:33:34 -05:00
Falco Girgis	011912c4e3	Everything working except skinning!	2025-03-14 11:56:28 -05:00
Falco Girgis	03106c18c2	Added rwdc_common.h!	2025-03-11 18:36:46 -05:00
Falco Girgis	fea70f5275	FIXED MATFX ENV/SPECULAR MAPS!	2025-03-03 07:39:49 -06:00
Falco Girgis	801756d444	Everything working except matfx env mapping.	2025-03-03 05:17:42 -06:00