Skip to content

Commit 0ea7278

Browse files
authored
gu: add a function to multiply 4x4 matrices (#172)
Add guMtx44Concat() in both C and PS variants. The PS variant is more than 3 times faster than the C one; it's written in a separate file (and not in gu_psasm.S) because it uses a different naming of the matrix registers. These functions are especially useful when porting programs written for OpenGL, which uses 4x4 matrices.
1 parent f1c3747 commit 0ea7278

File tree

4 files changed

+164
-1
lines changed

4 files changed

+164
-1
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ OGCOBJ := \
143143
exception_handler.o exception.o irq.o irq_handler.o semaphore.o \
144144
video_asm.o video.o pad.o dvd.o exi.o mutex.o arqueue.o arqmgr.o \
145145
cache_asm.o system.o system_asm.o cond.o \
146-
gx.o gu.o gu_psasm.o audio.o cache.o decrementer.o \
146+
gx.o gu.o gu_psasm.o gu_ps_concat44.o audio.o cache.o decrementer.o \
147147
message.o card.o aram.o depackrnc.o decrementer_handler.o \
148148
depackrnc1.o dsp.o si.o tpl.o ipc.o ogc_crt0.o \
149149
console_font_8x16.o timesupp.o lock_supp.o usbgecko.o usbmouse.o \

gc/ogc/gu.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,7 @@ void c_guMtxRotTrig(Mtx mt,const char axis,f32 sinA,f32 cosA);
397397
void c_guMtxRotAxisRad(Mtx mt,guVector *axis,f32 rad);
398398
void c_guMtxReflect(Mtx m,const guVector *p,const guVector *n);
399399
void c_guMtxQuat(Mtx m,const guQuaternion *a);
400+
void c_guMtx44Concat(const Mtx44 a,const Mtx44 b,Mtx44 ab);
400401

401402
#ifdef GEKKO
402403
void ps_guMtxIdentity( Mtx mt);
@@ -415,6 +416,7 @@ void ps_guMtxRotRad( Mtx mt, const char axis, f32 rad);
415416
void ps_guMtxRotTrig( Mtx mt, const char axis, f32 sinA, f32 cosA);
416417
void ps_guMtxRotAxisRad( Mtx mt, guVector *axis, f32 tmp0);
417418
void ps_guMtxReflect( Mtx m, const guVector *p, const guVector *n);
419+
void ps_guMtx44Concat(const Mtx44 a, const Mtx44 b, Mtx44 ab);
418420
#endif //GEKKO
419421

420422
void guMtx44Identity(Mtx44 mt);
@@ -457,6 +459,8 @@ u32 guMtx44Inverse(const Mtx44 src,Mtx44 inv);
457459
#define guMtxReflect c_guMtxReflect
458460
#define guMtxQuat c_guMtxQuat
459461

462+
#define guMtx44Concat c_guMtx44Concat
463+
460464
#else //MTX_USE_C
461465

462466
#define guVecAdd ps_guVecAdd
@@ -491,6 +495,8 @@ u32 guMtx44Inverse(const Mtx44 src,Mtx44 inv);
491495
#define guMtxRotAxisRad ps_guMtxRotAxisRad
492496
#define guMtxReflect ps_guMtxReflect
493497

498+
#define guMtx44Concat ps_guMtx44Concat
499+
494500
#endif //MTX_USE_PS
495501

496502
#define guMtxRotDeg(mt,axis,deg) guMtxRotRad(mt,axis,DegToRad(deg))

libogc/gu.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,40 @@ void guMtx44Copy(const Mtx44 src,Mtx44 dst)
112112
dst[3][0] = src[3][0]; dst[3][1] = src[3][1]; dst[3][2] = src[3][2]; dst[3][3] = src[3][3];
113113
}
114114

115+
/*
 * c_guMtx44Concat: multiply two 4x4 matrices, ab = a * b.
 *
 * The destination may alias either source operand; in that case the
 * product is accumulated in a scratch matrix and copied out at the end.
 */
void c_guMtx44Concat(const Mtx44 a,const Mtx44 b,Mtx44 ab)
{
	Mtx44 scratch;
	Mtx44P dst;
	int row,col;

	// Redirect the writes to a scratch buffer when ab overlaps an input.
	if(ab==a || ab==b)
		dst = scratch;
	else
		dst = ab;

	// Row-major dot products; the summation order matches the unrolled
	// form (column 0 term first), so float results are bit-identical.
	for(row=0;row<4;row++) {
		for(col=0;col<4;col++) {
			dst[row][col] = a[row][0]*b[0][col] + a[row][1]*b[1][col]
			              + a[row][2]*b[2][col] + a[row][3]*b[3][col];
		}
	}

	if(dst==scratch)
		guMtx44Copy(scratch,ab);
}
148+
115149
u32 guMtx44Inverse(const Mtx44 src,Mtx44 inv)
116150
{
117151
f32 det;

libogc/gu_ps_concat44.S

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#include <asm.h>

/* ps_guMtx44Concat(const Mtx44 a, const Mtx44 b, Mtx44 ab): 4x4 matrix
 * multiply (ab = a * b) using Gekko paired-single instructions. Each FPR
 * holds one pair of adjacent row elements (e.g. B00_B01), so a 4x4 matrix
 * occupies eight registers.
 *
 * We can use up to 32 registers, but registers starting from fr14 need to be
 * saved and restored at the end of the function call, so we try to avoid them.
 * We try to optimize the registers usage in the following way:
 *
 * The Dxx_Dxx registers are used to accumulate the value of the resulting
 * matrix. We compute them in row-major order, after which the register is
 * stored to the destination variable and can be reused. That's why we just
 * need to keep two D registers per row.
 *
 * The Axx_Axx values can also be disposed after a row has been computed, so
 * two registers per row could also be enough. However, to prevent data hazard
 * while processing row N we do preload the two A registers used in row N+1.
 * So, during the processing of a row, we can have up to four active A
 * registers.
 *
 * The only values that are needed throughout the computation are the
 * elements of the B matrix, so to avoid unnecessary re-loading we keep
 * them in registers all the time. We therefore don't reuse B registers. */

/* A-matrix pairs: fr10-fr13 are recycled — rows 0/1 and rows 2/3 share the
 * same four registers, which is safe because rows 0/1 are fully consumed
 * before rows 2/3 are loaded. */
#define A00_A01 fr10
#define A02_A03 fr11
#define A10_A11 fr12
#define A12_A13 fr13
#define A20_A21 fr10
#define A22_A23 fr11
#define A30_A31 fr12
#define A32_A33 fr13

/* B-matrix pairs: pinned in fr0-fr7 for the whole routine. */
#define B00_B01 fr0
#define B02_B03 fr1
#define B10_B11 fr2
#define B12_B13 fr3
#define B20_B21 fr4
#define B22_B23 fr5
#define B30_B31 fr6
#define B32_B33 fr7

/* Destination pairs: only two live accumulators exist at a time (fr8/fr9),
 * reused for every row once the previous row has been stored. */
#define D00_D01 fr8
#define D02_D03 fr9
#define D10_D11 fr8
#define D12_D13 fr9
#define D20_D21 fr8
#define D22_D23 fr9
#define D30_D31 fr8
#define D32_D33 fr9

	.globl ps_guMtx44Concat
	//r3 = mtxA, r4 = mtxB, r5 = mtxAB
ps_guMtx44Concat:
	/* First row. This block is longer than the others below due to the fact
	 * that we must also load all the B matrix into registers.
	 * The code is less readable than what it could be because we intertwine
	 * the instructions in order to avoid data hazards.
	 */
	psq_l		A00_A01,0(r3),0,0
	psq_l		B00_B01,0(r4),0,0
	psq_l		B02_B03,8(r4),0,0
	psq_l		B10_B11,16(r4),0,0
	ps_muls0	D00_D01,B00_B01,A00_A01
	psq_l		A02_A03,8(r3),0,0
	ps_muls0	D02_D03,B02_B03,A00_A01
	psq_l		B12_B13,24(r4),0,0
	ps_madds1	D00_D01,B10_B11,A00_A01,D00_D01
	psq_l		B20_B21,32(r4),0,0
	ps_madds1	D02_D03,B12_B13,A00_A01,D02_D03
	psq_l		B22_B23,40(r4),0,0
	ps_madds0	D00_D01,B20_B21,A02_A03,D00_D01
	psq_l		B30_B31,48(r4),0,0
	ps_madds0	D02_D03,B22_B23,A02_A03,D02_D03
	psq_l		B32_B33,56(r4),0,0
	ps_madds1	D00_D01,B30_B31,A02_A03,D00_D01
	psq_l		A10_A11,16(r3),0,0		// preload A row 1 for the next block
	ps_madds1	D02_D03,B32_B33,A02_A03,D02_D03
	psq_st		D00_D01,0(r5),0,0
	psq_l		A12_A13,24(r3),0,0
	psq_st		D02_D03,8(r5),0,0

	// Second row (A10..A13 were preloaded while row 0 was being computed)
	ps_muls0	D10_D11,B00_B01,A10_A11
	ps_muls0	D12_D13,B02_B03,A10_A11
	ps_madds0	D10_D11,B20_B21,A12_A13,D10_D11
	ps_madds0	D12_D13,B22_B23,A12_A13,D12_D13
	ps_madds1	D10_D11,B10_B11,A10_A11,D10_D11
	ps_madds1	D12_D13,B12_B13,A10_A11,D12_D13
	psq_l		A20_A21,32(r3),0,0		// preload A row 2
	ps_madds1	D10_D11,B30_B31,A12_A13,D10_D11
	psq_l		A22_A23,40(r3),0,0
	ps_madds1	D12_D13,B32_B33,A12_A13,D12_D13
	psq_st		D10_D11,16(r5),0,0
	psq_st		D12_D13,24(r5),0,0

	// Third row
	ps_muls0	D20_D21,B00_B01,A20_A21
	ps_muls0	D22_D23,B02_B03,A20_A21
	ps_madds0	D20_D21,B20_B21,A22_A23,D20_D21
	ps_madds0	D22_D23,B22_B23,A22_A23,D22_D23
	ps_madds1	D20_D21,B10_B11,A20_A21,D20_D21
	ps_madds1	D22_D23,B12_B13,A20_A21,D22_D23
	psq_l		A30_A31,48(r3),0,0		// preload A row 3
	ps_madds1	D20_D21,B30_B31,A22_A23,D20_D21
	psq_l		A32_A33,56(r3),0,0
	ps_madds1	D22_D23,B32_B33,A22_A23,D22_D23
	psq_st		D20_D21,32(r5),0,0
	psq_st		D22_D23,40(r5),0,0

	// Fourth row (nothing left to preload)
	ps_muls0	D30_D31,B00_B01,A30_A31
	ps_muls0	D32_D33,B02_B03,A30_A31
	ps_madds0	D30_D31,B20_B21,A32_A33,D30_D31
	ps_madds0	D32_D33,B22_B23,A32_A33,D32_D33
	ps_madds1	D30_D31,B10_B11,A30_A31,D30_D31
	ps_madds1	D32_D33,B12_B13,A30_A31,D32_D33
	ps_madds1	D30_D31,B30_B31,A32_A33,D30_D31
	ps_madds1	D32_D33,B32_B33,A32_A33,D32_D33
	psq_st		D30_D31,48(r5),0,0
	psq_st		D32_D33,56(r5),0,0

	blr

	.section .sdata
	.balign 16

0 commit comments

Comments
 (0)