Chapter 5 Cache and Memory Optimizations 121
Software Optimization Guide for AMD64 Processors
25112 Rev. 3.06 September 2005
before starting a copy, especially for large blocks. To write data directly to main memory, bypassing
the cache, use the MOVNTI instruction instead of MOV for the four store instructions.
; Copy a block of memory in 32-byte chunks using 64-bit integer loads/stores.
; Only whole 32-byte chunks are copied here; the leftover bytes are handled
; after done_32.
;
; In:
; rsi = source
; rdi = destination
; ecx = byte count
; Out (on reaching done_32):
; rsi, rdi = advanced by 32 * (ecx >> 5) bytes, i.e. past the copied chunks
; eax = 0
; ecx = unchanged (never written here); ecx AND 31 = bytes still to copy
; Clobbers: rax (the 32-bit write to eax zeroes the upper half), r8, r9, flags
;
; To bypass the cache, the four MOV stores can be replaced with MOVNTI.
mov eax, ecx ; work on a copy so the original byte count survives in ecx
shr eax, 5 ; eax = number of whole 32-byte chunks (count / 32)
jz done_32 ; fewer than 32 bytes: skip the unrolled loop entirely
align 16 ; align the loop to a 16-byte fetch boundary
copy_32_bytes:
mov r8, [rsi] ; read 8 bytes (chunk offset 0)
mov r9, [rsi+8] ; chunk offset 8; it's a bit faster to pair two reads
add rsi, 32 ; update source pointer (now points at the next chunk)
mov [rdi], r8 ; store 8 bytes (chunk offset 0)
mov [rdi+8], r9 ; chunk offset 8; again, pair 2 stores for slight perf gain
add rdi, 32 ; update destination pointer (now points at the next chunk)
mov r8, [rsi-16] ; rsi was already advanced, so -16 is chunk offset 16
mov r9, [rsi-8] ; chunk offset 24; 4-way unroll (4 reads, 4 writes) hides latency of adds and dec
dec eax ; one fewer 32-byte chunk to go; sets ZF when the count reaches 0
mov [rdi-16], r8 ; store chunk offset 16 (MOV leaves the flags from dec intact)
mov [rdi-8], r9 ; store last 8 bytes (chunk offset 24)
jnz copy_32_bytes ; tests ZF from the dec above: loop while chunks remain
done_32:
; (copy any remaining bytes: ecx AND 31 of them, i.e. the byte count modulo 32)