; ***************************************************************************
;
; Module        : batransf.asm
;
; Purpose       : Transformation 'go-faster' things for x86
;
; ***************************************************************************

	ifdef	TASM
	warn                            ; Turbo Assembler builds only: enable assembler warnings
	endif

; ***************************************************************************
; Includes

	include macros.i
        include baequate.i

; NOTE(review): RWASM386, the RWASM*SEGMENT* markers and RWASMFUNC are not
; defined in this file; presumably they are macros from macros.i, and the
; mat_m?? offsets used by the code below presumably come from baequate.i.
; Confirm against those includes.
RWASM386

; ***************************************************************************
; Data segment

; No data is declared between the begin/end markers: this module has no
; data of its own.
RWASMDATASEGMENTBEGIN
RWASMDATASEGMENTEND

; ***************************************************************************
; Code segment

RWASMCODESEGMENTBEGIN

; ***************************************************************************
; _rwX86AsmMatMul
;
; Purpose       : Multiplies matrix A by matrix B on the x87 FPU and stores
;                 the result in the OUT matrix, one row per pass.
;                 For each of the first three rows r of A:
;                   OUT[r].x = A[r].x*B.xx + A[r].y*B.yx + A[r].z*B.zx
;                   OUT[r].y = A[r].x*B.xy + A[r].y*B.yy + A[r].z*B.zy
;                   OUT[r].z = A[r].x*B.xz + A[r].y*B.yz + A[r].z*B.zz
;                 The fourth row is computed the same way and additionally
;                 has B.wx / B.wy / B.wz added to its three results.
;                 Only the x, y and z elements of each row are read/written;
;                 all elements are accessed as DWORD (32-bit) floats.

; On entry      : Matrix (OUT)
;               : Matrix A
;               : Matrix B
;                 With STACK defined the three pointers are read from the
;                 stack (OUT, A, B at [esp+4], [esp+8], [esp+12] on entry).
;                 Without STACK they must already be in eax (OUT), edx (A)
;                 and ebx (B).
; On exit       : With STACK defined eax, edx, ebx and ecx are restored.
;                 Without STACK only ecx is restored; eax and edx are left
;                 advanced by three rows and ebx is unchanged.
;                 EFLAGS are modified (dec ecx). The x87 stack is back at its
;                 entry depth; up to five x87 registers are used meanwhile.
;
; NOTE(review): B is re-read for every row after earlier OUT rows have been
; stored, so OUT overlapping B would corrupt the result - confirm callers
; never pass the same matrix for both.

RWASMFUNC _rwX86AsmMatMul
ifdef STACK
	push eax                        ; Preserve the registers used for the three pointers
	push edx
	push ebx

    ; The three pushes above lowered esp by 12, so +16/+20/+24 here are the
    ; slots that were at +4/+8/+12 on entry.
    mov     eax,[esp + 16]          ; eax = dst (OUT)
    mov     edx,[esp + 20]          ; edx = src1 (A)
    mov     ebx,[esp + 24]          ; ebx = src2 (B)
endif ; STACK
    ; edx src1
    ; ebx src2
    ; eax dst
	; ecx counter

	push ecx                        ; ecx is the loop counter; restored before ret

	mov ecx,3                       ; Three rows go through the loop; the fourth is done after it

    ; Stack-state comments list the x87 stack from st(0) leftwards.
    ; "1ab" is the src1 element at [edx+mat_mab] (edx = current row),
    ; "2ab" is the src2 element at [ebx+mat_mab].
    ; R1, R2, R3 are the finished x, y, z results for the current row.
matmul_rep:
    fld DWORD PTR [edx+mat_mxx]     ; 1xx
    fmul DWORD PTR [ebx+mat_mxx]    ; 1xx*2xx
    fld DWORD PTR [edx+mat_mxy]     ; 1xy,1xx*2xx
    fmul DWORD PTR [ebx+mat_myx]    ; 1xy*2yx,1xx*2xx
    fld DWORD PTR [edx+mat_mxz]     ; 1xz,1xy*2yx,1xx*2xx
    fmul DWORD PTR [ebx+mat_mzx]    ; 1xz*2zx,1xy*2yx,1xx*2xx

    ; The adds that finish R1 are interleaved with the start of R2.
    fld DWORD PTR [edx+mat_mxx]     ; 1xx,1xz*2zx,1xy*2yx,1xx*2xx
	fxch st(1)                      ; 1xz*2zx,1xx,1xy*2yx,1xx*2xx
	faddp st(2),st                  ; 1xx,1xz*2zx+1xy*2yx,1xx*2xx
    fmul DWORD PTR [ebx+mat_mxy]    ; 1xx*2xy,1xz*2zx+1xy*2yx,1xx*2xx
	fxch st(1)                      ; 1xz*2zx+1xy*2yx,1xx*2xy,1xx*2xx
	faddp st(2),st                  ; 1xx*2xy,R1
    fld DWORD PTR [edx+mat_mxy]     ; 1xy,1xx*2xy,R1
    fmul DWORD PTR [ebx+mat_myy]    ; 1xy*2yy,1xx*2xy,R1
    fld DWORD PTR [edx+mat_mxz]     ; 1xz,1xy*2yy,1xx*2xy,R1
    fmul DWORD PTR [ebx+mat_mzy]    ; 1xz*2zy,1xy*2yy,1xx*2xy,R1

    ; Likewise the adds that finish R2 are interleaved with the start of R3.
    fld DWORD PTR [edx+mat_mxx]     ; 1xx,1xz*2zy,1xy*2yy,1xx*2xy,R1
	fxch st(1)                      ; 1xz*2zy,1xx,1xy*2yy,1xx*2xy,R1
	faddp st(2),st                  ; 1xx,1xz*2zy+1xy*2yy,1xx*2xy,R1
    fmul DWORD PTR [ebx+mat_mxz]    ; 1xx*2xz,1xz*2zy+1xy*2yy,1xx*2xy,R1
	fxch st(1)                      ; 1xz*2zy+1xy*2yy,1xx*2xz,1xx*2xy,R1
	faddp st(2),st                  ; 1xx*2xz,R2,R1
    fld DWORD PTR [edx+mat_mxy]     ; 1xy,1xx*2xz,R2,R1
    fmul DWORD PTR [ebx+mat_myz]    ; 1xy*2yz,1xx*2xz,R2,R1
    fld DWORD PTR [edx+mat_mxz]     ; 1xz,1xy*2yz,1xx*2xz,R2,R1
    fmul DWORD PTR [ebx+mat_mzz]    ; 1xz*2zz,1xy*2yz,1xx*2xz,R2,R1

    ; Store R1 and R2 while the R3 terms are still being summed.
	fxch st(4)                      ; R1,1xy*2yz,1xx*2xz,R2,1xz*2zz
    fstp DWORD PTR [eax+mat_mxx]    ; 1xy*2yz,1xx*2xz,R2,1xz*2zz        (R1 stored)
	faddp st(3),st                  ; 1xx*2xz,R2,1xz*2zz+1xy*2yz
	fxch st(1)                      ; R2,1xx*2xz,1xz*2zz+1xy*2yz
    fstp DWORD PTR [eax+mat_mxy]    ; 1xx*2xz,1xz*2zz+1xy*2yz           (R2 stored)
	faddp st(1),st                  ; R3

    lea edx,[edx+mat_myx-mat_mxx]   ; Next row
    dec ecx				; Decrease counter; sets ZF for the jnz below

    fstp DWORD PTR [eax+mat_mxz]    ; (empty)                           (R3 stored; EFLAGS untouched)

    lea eax,[eax+mat_myx-mat_mxx]   ; Next line down! (lea leaves EFLAGS untouched)
    jnz matmul_rep                  ; Branches on ZF from the dec ecx above

	; Handle the last line differently (Do the adds)
	;
	; edx and eax have been advanced three rows, so "1ab" now refers to
	; the fourth row of src1 and the stores go to the fourth row of dst.
	; Same schedule as the loop body, except that each "(Add)" step adds
	; the src2 element at mat_mwx / mat_mwy / mat_mwz to the first product
	; of the corresponding column.

    fld DWORD PTR [edx+mat_mxx]     ; 1xx
    fmul DWORD PTR [ebx+mat_mxx]    ; 1xx*2xx
    fld DWORD PTR [edx+mat_mxy]     ; 1xy,1xx*2xx
    fxch st(1)                      ; 1xx*2xx,1xy
    fadd DWORD PTR [ebx+mat_mwx]    ; (Add) 1xx*2xx+2wx,1xy
	fxch st(1)                      ; 1xy,1xx*2xx+2wx
    fmul DWORD PTR [ebx+mat_myx]    ; 1xy*2yx,1xx*2xx+2wx
    fld DWORD PTR [edx+mat_mxz]     ; 1xz,1xy*2yx,1xx*2xx+2wx
    fmul DWORD PTR [ebx+mat_mzx]    ; 1xz*2zx,1xy*2yx,1xx*2xx+2wx

    fld DWORD PTR [edx+mat_mxx]     ; 1xx,1xz*2zx,1xy*2yx,1xx*2xx+2wx
	fxch st(1)                      ; 1xz*2zx,1xx,1xy*2yx,1xx*2xx+2wx
	faddp st(2),st                  ; 1xx,1xz*2zx+1xy*2yx,1xx*2xx+2wx
    fmul DWORD PTR [ebx+mat_mxy]    ; 1xx*2xy,1xz*2zx+1xy*2yx,1xx*2xx+2wx
	fxch st(1)                      ; 1xz*2zx+1xy*2yx,1xx*2xy,1xx*2xx+2wx
	faddp st(2),st                  ; 1xx*2xy,R1
    fld DWORD PTR [edx+mat_mxy]     ; 1xy,1xx*2xy,R1
	fxch st(1)                      ; 1xx*2xy,1xy,R1
    fadd DWORD PTR [ebx+mat_mwy]    ; (Add) 1xx*2xy+2wy,1xy,R1
	fxch st(1)                      ; 1xy,1xx*2xy+2wy,R1
    fmul DWORD PTR [ebx+mat_myy]    ; 1xy*2yy,1xx*2xy+2wy,R1
    fld DWORD PTR [edx+mat_mxz]     ; 1xz,1xy*2yy,1xx*2xy+2wy,R1
    fmul DWORD PTR [ebx+mat_mzy]    ; 1xz*2zy,1xy*2yy,1xx*2xy+2wy,R1

    fld DWORD PTR [edx+mat_mxx]     ; 1xx,1xz*2zy,1xy*2yy,1xx*2xy+2wy,R1
	fxch st(1)                      ; 1xz*2zy,1xx,1xy*2yy,1xx*2xy+2wy,R1
	faddp st(2),st                  ; 1xx,1xz*2zy+1xy*2yy,1xx*2xy+2wy,R1
    fmul DWORD PTR [ebx+mat_mxz]    ; 1xx*2xz,1xz*2zy+1xy*2yy,1xx*2xy+2wy,R1
	fxch st(1)                      ; 1xz*2zy+1xy*2yy,1xx*2xz,1xx*2xy+2wy,R1
	faddp st(2),st                  ; 1xx*2xz,R2,R1
    fld DWORD PTR [edx+mat_mxy]     ; 1xy,1xx*2xz,R2,R1
	fxch st(1)                      ; 1xx*2xz,1xy,R2,R1
    fadd DWORD PTR [ebx+mat_mwz]    ; (Add) 1xx*2xz+2wz,1xy,R2,R1
	fxch st(1)                      ; 1xy,1xx*2xz+2wz,R2,R1
    fmul DWORD PTR [ebx+mat_myz]    ; 1xy*2yz,1xx*2xz+2wz,R2,R1
    fld DWORD PTR [edx+mat_mxz]     ; 1xz,1xy*2yz,1xx*2xz+2wz,R2,R1
    fmul DWORD PTR [ebx+mat_mzz]    ; 1xz*2zz,1xy*2yz,1xx*2xz+2wz,R2,R1

	fxch st(4)                      ; R1,1xy*2yz,1xx*2xz+2wz,R2,1xz*2zz
    fstp DWORD PTR [eax+mat_mxx]    ; 1xy*2yz,1xx*2xz+2wz,R2,1xz*2zz    (R1 stored)
	faddp st(3),st                  ; 1xx*2xz+2wz,R2,1xz*2zz+1xy*2yz
	fxch st(1)                      ; R2,1xx*2xz+2wz,1xz*2zz+1xy*2yz
    fstp DWORD PTR [eax+mat_mxy]    ; 1xx*2xz+2wz,1xz*2zz+1xy*2yz       (R2 stored)
	faddp st(1),st                  ; R3

    ; 1 clock
    fstp DWORD PTR [eax+mat_mxz]    ; (empty)                           (R3 stored)

	pop ecx                         ; Restore the caller's ecx

ifdef STACK
	pop ebx                         ; Restore in reverse order of the pushes at entry
	pop edx
	pop eax
endif ; STACK
	ret

RWASMCODESEGMENTEND
	end
