AMD 250 Computer Hardware User Manual


 
232 Optimizing with SIMD Instructions Chapter 9
25112 Rev. 3.06 September 2005
Software Optimization Guide for AMD64 Processors
;==============================================================================
; NOTE(review): this fragment begins mid-routine. The (unseen) setup code
; presumably leaves ESI -> transposed 4x4 rotation matrix "R" (16-byte
; aligned), EDI -> source vertex array "v", EAX -> destination array "rotv",
; EDX = byte length of the vertex array, and ECX = number of vertices
; -- confirm against the full routine.
;==============================================================================
imul ecx, 2 ; ECX = # quadwords of vertices to rotate (1 vertex = 4 floats = 2 quadwords)
add edi, edx ; EDI = -> end of "v"
add eax, edx ; EAX = -> end of "rotv"
neg ecx ; ECX = -# quadwords of vertices to rotate (counts up toward zero)
;==============================================================================
; THE 4 ASM LINES BELOW LOAD THE TRANSPOSED ROTATION MATRIX "R" INTO XMM0-XMM3
; IN THE FOLLOWING MANNER:
; xmm0 = column 0 of "R" or row 0 of "R" transpose
; xmm1 = column 1 of "R" or row 1 of "R" transpose
; xmm2 = column 2 of "R" or row 2 of "R" transpose
; xmm3 = column 3 of "R" or row 3 of "R" transpose
; (movaps requires 16-byte alignment of the matrix, else it faults)
;==============================================================================
movaps xmm0, [esi] ; XMM0 = [R30,R20,R10,R00]
movaps xmm1, [esi+16] ; XMM1 = [R31,R21,R11,R01]
movaps xmm2, [esi+32] ; XMM2 = [R32,R22,R12,R02]
movaps xmm3, [esi+48] ; XMM3 = [R33,R23,R13,R03]
;==============================================================================
; THIS LOOP ROTATES "num_vertices_to_rotate" VERTICES BY THE TRANSPOSED
; ROTATION MATRIX "R" PASSED INTO THE ROUTINE AND STORES THE ROTATED
; VERTICES TO "rotv". Each iteration processes ONE 4-float vertex:
;   rotv = v0*col0 + v1*col1 + v2*col2 + v3*col3
; ECX is negative, so [edi+8*ecx] indexes backward from the array end and
; the loop terminates when the ADD drives ECX to zero.
;==============================================================================
ALIGN 16 ; Align address of loop to a 16-byte boundary.
rotate_vertices_loop:
movlps xmm4, [edi+8*ecx] ; XMM4=[,,v1,v0] (low 8 bytes of the vertex)
movlps xmm6, [edi+8*ecx+8] ; XMM6=[,,v3,v2] (high 8 bytes of the vertex)
unpcklps xmm4, xmm4 ; XMM4=[v1,v1,v0,v0]
unpcklps xmm6, xmm6 ; XMM6=[v3,v3,v2,v2]
movhlps xmm5, xmm4 ; XMM5=[,,v1,v1]
movhlps xmm7, xmm6 ; XMM7=[,,v3,v3]
movlhps xmm4, xmm4 ; XMM4=[v0,v0,v0,v0] (broadcast v0 to all 4 lanes)
mulps xmm4, xmm0 ; XMM4=[R30*v0,R20*v0,R10*v0,R00*v0]
movlhps xmm5, xmm5 ; XMM5=[v1,v1,v1,v1]
mulps xmm5, xmm1 ; XMM5=[R31*v1,R21*v1,R11*v1,R01*v1]
movlhps xmm6, xmm6 ; XMM6=[v2,v2,v2,v2]
mulps xmm6, xmm2 ; XMM6=[R32*v2,R22*v2,R12*v2,R02*v2]
addps xmm4, xmm5 ; XMM4=[R30*v0+R31*v1,R20*v0+R21*v1,
; R10*v0+R11*v1,R00*v0+R01*v1]
movlhps xmm7, xmm7 ; XMM7=[v3,v3,v3,v3]
mulps xmm7, xmm3 ; XMM7=[R33*v3,R23*v3,R13*v3,R03*v3] (comment fixed: was mislabeled XMM6)
addps xmm6, xmm7 ; XMM6=[R32*v2+R33*v3,R22*v2+R23*v3,
; R12*v2+R13*v3,R02*v2+R03*v3]
addps xmm4, xmm6 ; XMM4=New rotated vertex
movntps [eax+8*ecx], xmm4 ; Non-temporal store of rotated vertex to rotv (bypasses cache).
add ecx, 2 ; Decrement the # of QWORDs to rotate by 2 (one vertex consumed).
jnz rotate_vertices_loop ; Loop until ECX reaches zero.
sfence ; Finish all memory writes (orders the movntps non-temporal stores).
;==============================================================================
; INSTRUCTIONS BELOW RESTORE THE REGISTER STATE WITH WHICH THIS ROUTINE
; WAS ENTERED