Appendix D: AGP Considerations
...
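The listing below works in two phases: a block prefetch that pulls 16 Kbytes of the image into the L1 nontemporal region, followed by 128-byte bursts of stores to the frame buffer mapped as write-combining (WC) memory. For orientation, the same structure can be sketched in C with SSE2 intrinsics. This is an illustrative sketch only: copy_block_to_framebuffer, src, and wc_dst are placeholder names, and the intrinsics _mm_prefetch, _mm_load_si128, and _mm_store_si128 stand in for the PREFETCHNTA and MOVDQA instructions of the listing.

#include <emmintrin.h>   /* SSE2: _mm_load_si128, _mm_store_si128 */
#include <xmmintrin.h>   /* SSE:  _mm_prefetch, _MM_HINT_NTA */

#define HALF_L1_PREFETCHNTA_CACHE_SIZE 16384

/* Illustrative sketch only: src is the cacheable system-memory image,
   wc_dst is frame-buffer memory mapped with the WC memory type.
   Both pointers are assumed to be 16-byte aligned. */
static void copy_block_to_framebuffer(const char *src, char *wc_dst)
{
    /* Phase 1: block-prefetch 16 Kbytes into the L1 nontemporal region. */
    for (int i = 0; i < HALF_L1_PREFETCHNTA_CACHE_SIZE; i += 64)
        _mm_prefetch(src + i, _MM_HINT_NTA);

    /* Phase 2: stream the block out to WC memory, 128 bytes per iteration. */
    for (int i = 0; i < HALF_L1_PREFETCHNTA_CACHE_SIZE; i += 128)
        for (int j = 0; j < 128; j += 16) {
            __m128i v = _mm_load_si128((const __m128i *)(src + i + j));
            _mm_store_si128((__m128i *)(wc_dst + i + j), v);
        }
}

Note that the assembly version first reads all 128 bytes of each burst into XMM0 through XMM7 and only then issues the stores, so the writes that fill each pair of WC buffers are issued back to back.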
// Use half of the 32-Kbyte nontemporal cache for a block load.
#define HALF_L1_PREFETCHNTA_CACHE_SIZE 16384
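// PREFETCHNTA data is kept in way 0 of the L1 data cache, so 32 Kbytes are
// available for nontemporal data; each block load fills half of that region.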
mov rdi, QWORD PTR [image_source]
mov rcx, HALF_L1_PREFETCHNTA_CACHE_SIZE / 64 ; 16,384 / 64 = 256 cache lines to prefetch.
Block_PrefetchIntoL1:
prefetchnta [rdi] ; Grab 64 bytes.
add rdi, 64 ; Bump up to next cache line.
dec rcx
jnz Block_PrefetchIntoL1
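; The 16-Kbyte block is now resident in L1. RDI points 16 Kbytes past the
; start of the block, so it is reset below before the copy to WC memory begins.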
LoadPtr_ToFrameBuffer:
mov rdi, QWORD PTR [image_source] ; Point RDI back at the start of the image block.
mov rcx, HALF_L1_PREFETCHNTA_CACHE_SIZE / 128 ; 16,384 / 128 = 128 iterations.
/* Get linear pointer to local memory mapped in WC address space. */
mov rax, QWORD PTR [FBimage_Ptr]
/* Send out 128 bytes (yielding ~1.7 Gbytes/s of fast-write bandwidth) */
/* per loop iteration. RDI now has the pointer back to the image source; */
/* 16 Kbytes of the image are in the L1 nontemporal cache (way 0 of the cache). */
Block_WriteToFrameBuffer:
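/* Read 128 bytes of the source block (already resident in L1) into XMM0-XMM7. */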
movdqa xmm0, [rdi]
movdqa xmm1, [rdi+16]
movdqa xmm2, [rdi+32]
movdqa xmm3, [rdi+48]
movdqa xmm4, [rdi+64]
movdqa xmm5, [rdi+80]
movdqa xmm6, [rdi+96]
movdqa xmm7, [rdi+112]
/* Copy register data to WC buffer. */
movdqa [rax], xmm0
movdqa [rax+16], xmm1
movdqa [rax+32], xmm2
/* The first WC buffer is sent after the next write, since we are */
/* crossing a cache-line boundary. */
movdqa [rax+48], xmm3
/* Allocate and fill another WC buffer. */
movdqa [rax+64], xmm4
movdqa [rax+80], xmm5