Chapter 9 Optimizing with SIMD Instructions 213
Software Optimization Guide for AMD64 Processors
25112 Rev. 3.06 September 2005
movntps [edi+4*ecx+16], xmm1 ; Store reciprocal square root to rcp_sqrt_r.
movntps [edi+4*ecx+32], xmm2 ; Store reciprocal square root to rcp_sqrt_r.
movntps [edi+4*ecx+48], xmm3 ; Store reciprocal square root to rcp_sqrt_r.
add ecx, 16 ; Advance the index past the 16 floats
; just processed.
dec eax ; Decrement # of 16 float reciprocal square
; root loops to perform by 1.
jnz reciprocal_sqrt_4xloop
jmp skip_recprcl_sqrt_4xloop ; Jump into loop to calculate reciprocal
; square root of floats that don't
; occupy a full cache line.
;==============================================================================
; THIS LOOP RECIPROCATES AND SQUARE ROOTS 1 FLOATING POINT NUMBER EACH
; LOOP ITERATION
;==============================================================================
ALIGN 16 ; Align address of loop to a 16-byte boundary.
reciprocal_sqrt_1xloop:
movss xmm0, [esi+4*ecx] ; XMM0=[,,,r0]
sqrtss xmm0, xmm0 ; XMM0=[,,,sqrt(r0)]
rcpss xmm0, xmm0 ; XMM0=[,,,1/sqrt(r0)]
movss [edi+4*ecx], xmm0 ; Store reciprocal square root to rcp_sqrt_r.
inc ecx ; Advance the index to the next float; the
; loop exits once ECX reaches zero.
skip_recprcl_sqrt_4xloop:
or ecx, ecx ; If ECX != 0, then calculate the reciprocal
; square root of another float.
jnz reciprocal_sqrt_1xloop
sfence ; Finish all memory writes.
;==============================================================================
; INSTRUCTIONS BELOW RESTORE THE REGISTER STATE WITH WHICH THIS ROUTINE
; WAS ENTERED.
; REGISTERS EAX, ECX, AND EDX ARE CONSIDERED VOLATILE AND ASSUMED TO BE CHANGED,
; WHILE THE REGISTERS BELOW MUST BE PRESERVED IF THE USER IS CHANGING THEM
pop edi
pop esi
pop ebx
mov esp,ebp
pop ebp
;===============================================================================
ret
_reciprocal_sqrt_sse ENDP
_TEXT ENDS
END
The preceding code illustrates the use of separate loops for optimal performance. The loop titled
reciprocal_sqrt_4xloop works with 16 floating-point numbers in each iteration and is unrolled to
keep the processor busy by masking the latencies of the reciprocal and square-root instructions. In