212 Optimizing with SIMD Instructions Chapter 9
25112 Rev. 3.06 September 2005
Software Optimization Guide for AMD64 Processors
; [ebp+8] = ->r
; [ebp+12] = ->rcp_sqrt_r
; [ebp+16] = num_points
;==============================================================================
; Save EBX, ESI and EDI on the stack. ESI and EDI are overwritten just below;
; EBX is not touched in the lines shown here, and the matching restores are
; presumably further down -- they are outside this excerpt.
push ebx
push esi
push edi
;==============================================================================
; THE 3 MOV INSTRUCTIONS BELOW LOAD THE FUNCTION'S ARGUMENTS INTO
; GENERAL-PURPOSE REGISTERS (GPRS)
; esi = address of "r"'s to calculate the reciprocal square root of
; edi = address of "rcp_sqrt_r"'s to store reciprocal square root to
; ecx = num_points
;
; THE REMAINING INSTRUCTIONS TURN BOTH POINTERS INTO END-OF-ARRAY POINTERS AND
; ECX INTO A NEGATIVE ELEMENT INDEX, SO THAT [esi+4*ecx] AND [edi+4*ecx]
; ADDRESS ELEMENT 0 OF EACH ARRAY WHEN ECX = -num_points.
;==============================================================================
mov esi,[ebp+8] ; ESI = ->r
mov edi,[ebp+12] ; EDI = ->rcp_sqrt_r
mov ecx,[ebp+16] ; ECX = num_points
mov edx,ecx ; EDX = num_points
mov eax,ecx ; EAX = num_points
shl edx,2 ; EDX = 4*num_points = byte length of num_points 4-byte floats
shr eax,4 ; EAX = num_points/16 (unsigned, rounded down) = # of full 16-float groups
add edi,edx ; EDI = -> end of "rcp_sqrt_r"
add esi,edx ; ESI = -> end of "r"
neg ecx ; ECX = -num_points (negative element index from the array ends)
or eax,eax ; Set ZF if num_points/16 = 0 (EAX itself is unchanged).
; ZF set means there is no full group of 16 floats.
jz skip_recprcl_sqrt_4xloop ; No full group: bypass the 4x-unrolled loop,
; which works on 16 floats at a time.
;==============================================================================
; THIS LOOP RECIPROCATES AND SQUARE ROOTS 16 FLOATING-POINT NUMBERS EACH
; LOOP ITERATION AND WORKS WITH THOSE ELEMENTS OF "r" THAT OCCUPY A
; FULL CACHELINE
;==============================================================================
ALIGN 16 ; Align address of loop to a 16-byte boundary.
; Addressing used below: ESI and EDI hold the END addresses of "r" and
; "rcp_sqrt_r" (each was advanced by 4*num_points before the loop) and ECX is a
; negative element index (-num_points on first entry), so [esi+4*ecx] and
; [edi+4*ecx] address the current group of floats. The instructions that
; advance ECX and close the loop lie beyond this excerpt.
; MOVAPS and MOVNTPS require 16-byte-aligned memory operands, so both arrays
; must be 16-byte aligned.
reciprocal_sqrt_4xloop:
prefetchnta [esi+4*ecx+256] ; Prefetch the elements of "r" 256 bytes (64 floats)
; ahead, i.e. the data to reciprocate and square root 4 loop
; iterations from now (4 cache lines at 64 bytes per line).
movaps xmm0, [esi+4*ecx] ; XMM0=[r3,r2,r1,r0]
sqrtps xmm0, xmm0 ; XMM0=[sqrtr3,sqrtr2,sqrtr1,sqrtr0]
rcpps xmm0, xmm0 ; XMM0=[1/sqrtr3,1/sqrtr2,1/sqrtr1,1/sqrtr0] (RCPPS is an approximation)
movaps xmm1, [esi+4*ecx+16] ; XMM1=[r7,r6,r5,r4]
sqrtps xmm1, xmm1 ; XMM1=[sqrtr7,sqrtr6,sqrtr5,sqrtr4]
rcpps xmm1, xmm1 ; XMM1=[1/sqrtr7,1/sqrtr6,1/sqrtr5,1/sqrtr4]
movaps xmm2, [esi+4*ecx+32] ; XMM2=[r11,r10,r9,r8]
sqrtps xmm2, xmm2 ; XMM2=[sqrtr11,sqrtr10,sqrtr9,sqrtr8]
rcpps xmm2, xmm2 ; XMM2=[1/sqrtr11,1/sqrtr10,1/sqrtr9,1/sqrtr8]
movaps xmm3, [esi+4*ecx+48] ; XMM3=[r15,r14,r13,r12]
sqrtps xmm3, xmm3 ; XMM3=[sqrtr15,sqrtr14,sqrtr13,sqrtr12]
rcpps xmm3, xmm3 ; XMM3=[1/sqrtr15,1/sqrtr14,1/sqrtr13,1/sqrtr12]
movntps [edi+4*ecx], xmm0 ; Non-temporal store of results 0-3 to rcp_sqrt_r.