Chapter 9 Optimizing with SIMD Instructions 225
Software Optimization Guide for AMD64 Processors
25112 Rev. 3.06 September 2005
_cmplx_multiply_3dnow PROC NEAR
;==============================================================================
; INSTRUCTIONS BELOW SAVE THE REGISTER STATE WITH WHICH THIS ROUTINE WAS ENTERED
; REGISTERS EAX, ECX, EDX ARE CONSIDERED VOLATILE AND ASSUMED TO BE CHANGED
; WHILE THE REGISTERS BELOW MUST BE PRESERVED IF THE USER IS CHANGING THEM
push ebp
mov ebp, esp
;==============================================================================
; Parameters passed into routine:
; [ebp+8] = ->x
; [ebp+12] = ->y
; [ebp+16] = num_cmplx_elem
; [ebp+20] = ->prod
;==============================================================================
push ebx
push esi
push edi
;==============================================================================
; THE 4 ASM LINES BELOW LOAD THE FUNCTION's ARGUMENTS INTO GENERAL-PURPOSE
; REGISTERS (GPRS)
; esi = address of array "x"
; edi = address of array "y"
; ecx = # of cmplx products to compute
; eax = address of product to which results are stored
;==============================================================================
mov esi, [ebp+8] ; esi = ->x
mov edi, [ebp+12] ; edi = ->y
mov ecx, [ebp+16] ; ecx = num_cmplx_elem
mov eax, [ebp+20] ; eax = ->prod
;==============================================================================
; THE 6 ASM LINES BELOW OFFSET THE ADDRESS TO THE ARRAYS x[] AND y[] SUCH
; THAT THEY CAN BE ACCESSED IN THE MOST EFFICIENT MANNER AS ILLUSTRATED
; BELOW IN THE LOOP four_cmplx_prod_loop WITH THE MINIMUM NUMBER OF
; ADDRESS INCREMENTS
;==============================================================================
mov edx, ecx ; edx = num_cmplx_elem
neg ecx ; ecx = -num_cmplx_elem
imul edx, 8 ; edx = 8 * num_cmplx_elem = # bytes in x[] and y[] to multiply
add esi, edx ; esi = -> end of x[] to multiply (just past its last element)
add edi, edx ; edi = -> end of y[] to multiply (just past its last element)
add eax, edx ; eax = -> end of prod[] to calculate
;==============================================================================
; THIS LOOP MULTIPLIES 4 COMPLEX #s FROM "x[]" BY 4 COMPLEX #s FROM "y[]"
; AND RETURNS THE PRODUCT IN "prod[]".
;==============================================================================
ALIGN 16 ; Align address of loop to a 16-byte boundary.
four_cmplx_prod_loop: ;
movq mm0, QWORD PTR [esi+ecx*8] ; mm0=[x0i,x0r]
movq mm1, QWORD PTR [esi+ecx*8+8] ; mm1=[x1i,x1r]
movq mm2, QWORD PTR [esi+ecx*8+16] ; mm2=[x2i,x2r]
movq mm3, QWORD PTR [esi+ecx*8+24] ; mm3=[x3i,x3r]