Support User Manuals

AMD 250 Computer Hardware User Manual

Open as PDF

of 384

Chapter 5 Cache and Memory Optimizations 121

Software Optimization Guide for AMD64 Processors

25112 Rev. 3.06 September 2005

before starting a copy, especially for large blocks. To write data directly to main memory, bypassing the cache, use the MOVNTI instruction instead of MOV for the four store instructions.

; Copy memory in 32-byte blocks using 64-bit loads and stores.
; Each loop iteration moves one 32-byte block as four 8-byte reads and
; four 8-byte writes, staged through r8 and r9.
;
; Inputs:

; rsi = source

; rdi = destination

; ecx = byte count

; On reaching done_32:
;   rsi, rdi = advanced by 32 for every block copied
;   ecx      = not written by this code, so the leftover byte count
;              (ecx AND 31) can still be derived from it
; Clobbers: eax (and therefore rax), r8, r9, flags

mov eax, ecx ; work on a copy so ecx keeps the original byte count

shr eax, 5 ; eax = number of whole 32-byte blocks (byte count / 32)

jz done_32 ; fewer than 32 bytes: no whole block, skip the loop

align 16 ; align the loop to a 16-byte fetch boundary

copy_32_bytes:

mov r8, [rsi] ; read 8 bytes (block bytes 0..7)

mov r9, [rsi+8] ; it's a bit faster to pair two reads (block bytes 8..15)

add rsi, 32 ; update source pointer early; later reads use negative offsets

mov [rdi], r8 ; store 8 bytes (block bytes 0..7)

mov [rdi+8], r9 ; again, pair 2 stores for slight perf gain (block bytes 8..15)

add rdi, 32 ; update destination pointer early; later stores use negative offsets

mov r8, [rsi-16] ; loop is unrolled 4 reads, 4 writes (this read: block bytes 16..23)

mov r9, [rsi-8] ; 4-way unroll hides latency of adds and dec (block bytes 24..31)

dec eax ; decrement data counter (32 bytes); sets ZF consumed by the jnz below

mov [rdi-16], r8 ; store more bytes (block bytes 16..23); MOV leaves flags intact

mov [rdi-8], r9 ; store last 8 bytes (block bytes 24..31)

jnz copy_32_bytes ; repeat while the block counter from dec eax is non-zero

done_32:

(copy any remaining bytes)

previous next