AMD 250 Computer Hardware User Manual


 
Chapter 9 Optimizing with SIMD Instructions 231
Software Optimization Guide for AMD64 Processors
25112 Rev. 3.06 September 2005
XMM register, but it does require the duplication of the elements of the 4 × 1 column vector V in all
four floating-point values of the XMM register in each step above. Listing 27 is an SSE function that
performs 4 × 4 matrix multiplication upon a stream of num_vertices_to_rotate vertices.
Examples
Listing 27. 4 × 4 Matrix Multiplication (SSE)
; matrix_x_vector_sse(float *trR, float *v, int num_vertices_to_rotate,
;                     float *rotv);
;
; TO ASSEMBLE INTO *.obj DO THE FOLLOWING:
; ml.exe -coff -c matrix_x_vector_sse.asm
;
; Assembler instruction-set enables (MASM directives):
.586                                    ; accept Pentium-class (586) instructions
.K3D                                    ; accept AMD 3DNow! instructions
.XMM                                    ; accept SSE instructions / XMM registers
_TEXT SEGMENT
PUBLIC _matrix_x_vector_sse
_matrix_x_vector_sse PROC NEAR
;==============================================================================
; INSTRUCTIONS BELOW SAVE THE REGISTER STATE WITH WHICH THIS ROUTINE WAS
; ENTERED.
; REGISTERS EAX, ECX, AND EDX ARE CONSIDERED VOLATILE AND ASSUMED TO BE CHANGED,
; WHILE THE REGISTERS BELOW MUST BE PRESERVED IF THE USER IS CHANGING THEM
push ebp
mov ebp, esp
;==============================================================================
; Parameters passed into routine:
; [ebp+8] = ->trR
; [ebp+12] = ->v
; [ebp+16] = num_vertices_to_rotate
; [ebp+20] = ->rotv
;==============================================================================
push ebx
push esi
push edi
;==============================================================================
; THE ASM LINES BELOW LOAD THE FUNCTION'S ARGUMENTS INTO GENERAL-PURPOSE
; REGISTERS (GPRs)
; esi = address of Transposed Rotation Matrix
; edi = address of vertices to rotate
; ecx = # of vertices to rotate
; edx = 16 * (# of vertices to rotate)
; eax = address of rotated vertices
;==============================================================================
mov esi, [ebp+8] ; ESI = ->trR
mov edi, [ebp+12] ; EDI = ->v
mov ecx, [ebp+16] ; ECX = num_vertices_to_rotate
mov edx, ecx ; EDX = num_vertices_to_rotate
shl edx, 4 ; EDX = 16*num_vertices_to_rotate
mov eax, [ebp+20] ; EAX = ->rotv