222 Optimizing with SIMD Instructions Chapter 9
25112 Rev. 3.06 September 2005
Software Optimization Guide for AMD64 Processors
The following functions use SSE and 3DNow! instructions to illustrate complex multiplication of
two streams of complex numbers, x[] and y[], with the products stored in a product stream prod[].
For these examples, assume that the sizes of x[] and y[] are even multiples of four.
Examples
Listing 25. Complex Multiplication of Streams of Complex Numbers (SSE)
; cmplx_multiply_sse(float *x, float *y, int num_cmplx_elem, float *prod);
;
; TO ASSEMBLE INTO *.obj DO THE FOLLOWING:
; ml.exe -coff -c cmplx_multiply_sse.asm
;
.586
.K3D
.XMM
_TEXT SEGMENT
PUBLIC _cmplx_multiply_sse
_cmplx_multiply_sse PROC NEAR
;==============================================================================
; INSTRUCTIONS BELOW SAVE THE REGISTER STATE WITH WHICH THIS ROUTINE WAS ENTERED.
; REGISTERS EAX, ECX, AND EDX ARE CONSIDERED VOLATILE AND ASSUMED TO BE CHANGED,
; WHILE THE REGISTERS BELOW MUST BE PRESERVED IF THE USER IS CHANGING THEM.
push ebp
mov ebp, esp
;==============================================================================
; parameters passed into routine:
; [ebp+8] = ->x
; [ebp+12] = ->y
; [ebp+16] = num_cmplx_elem
; [ebp+20] = ->prod
;==============================================================================
push ebx ; preserve contents in ebx,esi, and edi on stack
push esi ;
push edi ;
;===============================================================================
; THE CODE BELOW PUTS THE FLOATING POINT SIGN MASK
; [80000000000000008000000000000000h]
; TO FLIP THE SIGN OF PACKED SINGLE PRECISION NUMBERS BY USING XORPS
;==============================================================================
mov eax, esp ; Copy stack pointer into EAX.
mov ebx, 16
sub esp, 32 ; Subtract 32 bytes from stack pointer.
and eax, 15 ; AND old stack pointer address with 15 to
; determine # of bytes the address is past a
; 16-byte-aligned address.
sub ebx, eax ; EBX = # of bytes above ESP to next
; 16-byte-aligned address
mov edi, 0h ; EDI = 00000000h
mov esi, 80000000h ; ESI = 80000000h
shr ebx, 2 ; EBX = # of DWORDs past 16-byte-aligned address