Support User Manuals

AMD 250 Computer Hardware User Manual

Open as PDF

of 384

212 Optimizing with SIMD Instructions Chapter 9

25112 Rev. 3.06 September 2005

Software Optimization Guide for AMD64 Processors

; [ebp+8] = ->r

; [ebp+12] = ->rcp_sqrt_r

; [ebp+16] = num_points

;==============================================================================

push ebx

push esi

push edi

;==============================================================================

; THE FIRST 3 ASM LINES BELOW LOAD THE FUNCTION'S ARGUMENTS INTO GENERAL-PURPOSE

; REGISTERS (GPRS)

; esi = address of "r"'s to calculate the reciprocal square root of

; edi = address of "rcp_sqrt_r"'s to store reciprocal square root to

; ecx = num_points

;==============================================================================

mov esi,[ebp+8] ; ESI = ->r

mov edi,[ebp+12] ; EDI = ->rcp_sqrt_r

mov ecx,[ebp+16] ; ECX = num_points

mov edx,ecx ; EDX = num_points

mov eax,ecx ; EAX = num_points

shl edx,2 ; EDX = 4*num_points

shr eax,4 ; EAX = num_points/16

add edi,edx ; EDI = -> end of "rcp_sqrt_r"

add esi,edx ; ESI = -> end of "r"

neg ecx ; ECX = -num_points (negative element index from the ends of "r" and "rcp_sqrt_r")

or eax,eax ; If num_points/16 = 0, then skip

; reciprocal square root.

jz skip_recprcl_sqrt_4xloop ; Unroll loop by 4 to work

; on 16 floats at a time.

;==============================================================================

; THIS LOOP RECIPROCATES AND SQUARE ROOTS 16 FLOATING-POINT NUMBERS EACH

; LOOP ITERATION AND WORKS WITH THOSE ELEMENTS OF "r" THAT OCCUPY A

; FULL CACHELINE

;==============================================================================

ALIGN 16 ; Align address of loop to a 16-byte boundary.

reciprocal_sqrt_4xloop:

prefetchnta [esi+4*ecx+256] ; Prefetch the elements "r" 4 cache lines

; ahead to reciprocate and squareroot 4 loops

; from now.

movaps xmm0, [esi+4*ecx] ; XMM0=[r3,r2,r1,r0]

sqrtps xmm0, xmm0 ; XMM0=[sqrtr3,sqrtr2,sqrtr1,sqrtr0]

rcpps xmm0, xmm0 ; XMM0=[1/sqrtr3,1/sqrtr2,1/sqrtr1,1/sqrtr0]

movaps xmm1, [esi+4*ecx+16] ; XMM1=[r7,r6,r5,r4]

sqrtps xmm1, xmm1 ; XMM1=[sqrtr7,sqrtr6,sqrtr5,sqrtr4]

rcpps xmm1, xmm1 ; XMM1=[1/sqrtr7,1/sqrtr6,1/sqrtr5,1/sqrtr4]

movaps xmm2, [esi+4*ecx+32] ; XMM2=[r11,r10,r9,r8]

sqrtps xmm2, xmm2 ; XMM2=[sqrtr11,sqrtr10,sqrtr9,sqrtr8]

rcpps xmm2, xmm2 ; XMM2=[1/sqrtr11,1/sqrtr10,1/sqrtr9,1/sqrtr8]

movaps xmm3, [esi+4*ecx+48] ; XMM3=[r15,r14,r13,r12]

sqrtps xmm3, xmm3 ; XMM3=[sqrtr15,sqrtr14,sqrtr13,sqrtr12]

rcpps xmm3, xmm3 ; XMM3=[1/sqrtr15,1/sqrtr14,1/sqrtr13,1/sqrtr12]

movntps [edi+4*ecx], xmm0 ; Store reciprocal square root to rcp_sqrt_r.

previous next