Chapter 9 Optimizing with SIMD Instructions 223
Software Optimization Guide for AMD64 Processors
25112 Rev. 3.06 September 2005
; Build the 16-byte single-precision sign mask at [esp+4*ebx]; the loop's
; xorps instructions read it from that same address.
; Dword layout, lowest address first: [edi, esi, edi, esi], so the esi dwords
; line up with elements 1 and 3 (the imaginary parts that xorps negates).
; NOTE(review): esi, edi and ebx are set up earlier in this function;
; presumably esi = 80000000h, edi = 0, and ebx is chosen so that esp+4*ebx is
; 16-byte aligned (xorps with a memory operand requires that) -- confirm.
mov [esp+4*ebx+12], esi ; Mask dword 3 (highest) = esi.
mov [esp+4*ebx+8], edi ; Mask dword 2 = edi.
mov [esp+4*ebx+4], esi ; Mask dword 1 = esi.
mov [esp+4*ebx], edi ; Mask dword 0 (lowest) = edi.
;==============================================================================
; THE 4 ASM LINES BELOW LOAD THE FUNCTION'S ARGUMENTS INTO GENERAL-PURPOSE
; REGISTERS (GPRs)
; esi = address of array "x"
; edi = address of array "y"
; ecx = # of cmplx products to compute
; eax = address of array "prod" to which the results are stored
; esi and edi are overwritten here; their previous values have already been
; written to the stack by the four stores above.
; NOTE(review): the arguments are read at 4-byte steps starting at ebp+8,
; which presumably means ebp was set up as a frame pointer earlier in this
; function -- confirm.
;==============================================================================
mov esi, [ebp+8] ; esi = ->x
mov edi, [ebp+12] ; edi = ->y
mov ecx, [ebp+16] ; ecx = num_cmplx_elem
mov eax, [ebp+20] ; eax = ->prod
;==============================================================================
; THE 6 ASM LINES BELOW OFFSET THE ADDRESSES OF THE ARRAYS x[], y[], AND
; prod[] SUCH THAT THEY CAN BE ACCESSED IN THE MOST EFFICIENT MANNER, AS
; ILLUSTRATED BELOW IN THE LOOP eight_cmplx_prod_loop, WITH THE MINIMUM
; NUMBER OF ADDRESS INCREMENTS
; Each pointer is advanced by 8 * num_cmplx_elem bytes and ecx is negated, so
; [esi+ecx*8], [edi+ecx*8], and [eax+ecx*8] initially address element 0 of
; x[], y[], and prod[]; one index register then serves all three arrays.
;==============================================================================
mov edx, ecx ; edx = num_cmplx_elem
neg ecx ; ecx = -num_cmplx_elem
shl edx, 3 ; edx = 8 * num_cmplx_elem = # bytes in x[] and y[] to multiply
add esi, edx ; esi = -> one past the last element of x[] to multiply
add edi, edx ; edi = -> one past the last element of y[] to multiply
add eax, edx ; eax = -> one past the last element of prod[] to calculate
;==============================================================================
; THIS LOOP MULTIPLIES 4 COMPLEX #s FROM "x[]" BY 4 COMPLEX #s FROM "y[]"
; AND RETURNS THE PRODUCTS IN "prod[]".
; Register comments list the four single-precision elements from the highest
; to the lowest; "r" = real part, "i" = imaginary part.
; movaps and xorps fault on memory operands that are not 16-byte aligned, so
; the addresses formed from esi, edi, and esp+4*ebx must be 16-byte aligned.
; NOTE(review): the label says "eight", but the loads at the top of the loop
; fetch four complex numbers from each array (xmm0/xmm1 and xmm4/xmm5); the
; label is left unchanged so that existing jumps to it still resolve.
;==============================================================================
ALIGN 16 ; Align address of loop to a 16-byte boundary.
eight_cmplx_prod_loop:
movaps xmm0, [esi+ecx*8] ; xmm0=[x1i,x1r,x0i,x0r]
movaps xmm1, [esi+ecx*8+16] ; xmm1=[x3i,x3r,x2i,x2r]
movaps xmm4, [edi+ecx*8] ; xmm4=[y1i,y1r,y0i,y0r]
movaps xmm5, [edi+ecx*8+16] ; xmm5=[y3i,y3r,y2i,y2r]
movaps xmm2, xmm0 ; xmm2=[x1i,x1r,x0i,x0r] (copy of x, for the imaginary parts)
movaps xmm3, xmm1 ; xmm3=[x3i,x3r,x2i,x2r] (copy of x, for the imaginary parts)
movaps xmm6, xmm4 ; xmm6=[y1i,y1r,y0i,y0r] (copy of y, to be sign-flipped)
movaps xmm7, xmm5 ; xmm7=[y3i,y3r,y2i,y2r] (copy of y, to be sign-flipped)
shufps xmm0, xmm0, 10100000b ; xmm0=[x1r,x1r,x0r,x0r] (duplicate the real parts)
shufps xmm1, xmm1, 10100000b ; xmm1=[x3r,x3r,x2r,x2r] (duplicate the real parts)
shufps xmm2, xmm2, 11110101b ; xmm2=[x1i,x1i,x0i,x0i] (duplicate the imaginary parts)
shufps xmm3, xmm3, 11110101b ; xmm3=[x3i,x3i,x2i,x2i] (duplicate the imaginary parts)
xorps xmm6, [esp+4*ebx] ; xmm6=[-y1i,y1r,-y0i,y0r] (sign mask negates elements 3 and 1)
xorps xmm7, [esp+4*ebx] ; xmm7=[-y3i,y3r,-y2i,y2r] (sign mask negates elements 3 and 1)
mulps xmm0, xmm4 ; xmm0=[x1r*y1i,x1r*y1r,x0r*y0i,x0r*y0r]
mulps xmm1, xmm5 ; xmm1=[x3r*y3i,x3r*y3r,x2r*y2i,x2r*y2r]
shufps xmm7, xmm7, 10110001b ; xmm7=[y3r,-y3i,y2r,-y2i] (swap the two elements of each pair)