Chapter 5 Cache and Memory Optimizations 93
Software Optimization Guide for AMD64 Processors
25112 Rev. 3.06 September 2005
Preferred If Stores Are Close to the Load
; Build the 64-bit value EDX:EAX in MM0 when the stores are close to the load.
; The low half never goes through memory, and the DWORD at foo+4 is written and
; read back with the same access size.
movd mm0, eax ; MM0[31:0] = EAX, taken directly from the register.
mov foo+4, edx ; Store EDX as a DWORD to foo+4.
punpckldq mm0, foo+4 ; MM0[63:32] = DWORD at foo+4; MM0[31:0] is kept.
Examples—Large-to-small Mismatches
Avoid large-to-small mismatches such as those in the following code, where a large store is followed by smaller loads from the same location:
64-bit (Avoid)
; Large-to-small mismatch: one QWORD store to foo is followed by two DWORD loads
; that each read half of the data just stored.
foo DQ ? ; Assume foo is 8-byte aligned.
...
mov QWORD PTR foo, rax ; Store a QWORD to foo.
mov eax, DWORD PTR foo ; Load a DWORD from foo.
mov edx, DWORD PTR foo+4 ; Load a DWORD from foo+4.
32-bit (Avoid)
; Large-to-small mismatch: one QWORD x87 store to foo is followed by two DWORD
; integer loads that each read half of the data just stored.
foo DQ ? ; Assume foo is 4-byte aligned.
...
fst QWORD PTR foo ; Store a QWORD in foo.
mov eax, DWORD PTR foo ; Load a DWORD from foo.
mov edx, DWORD PTR foo+4 ; Load a DWORD from foo+4.
Avoid
; Large-to-small mismatch: a single 64-bit MMX store is read back as two DWORDs.
movq foo, mm0 ; Store all 64 bits of MM0 to foo.
...
mov eax, foo ; Load a DWORD from foo (low half of the stored QWORD).
mov edx, foo+4 ; Load a DWORD from foo+4 (high half of the stored QWORD).
Preferred
; Store MM0 as two separate DWORDs so that each later DWORD load matches the
; size of the store that wrote it. MM0 holds its original value afterwards.
movd foo, mm0 ; Store MM0[31:0] as a DWORD to foo.
pswapd mm0, mm0 ; Swap the upper and lower DWORDs of MM0.
movd foo+4, mm0 ; Store the original MM0[63:32] as a DWORD to foo+4.
pswapd mm0, mm0 ; Swap again to restore the original contents of MM0.
...
mov eax, foo ; Load a DWORD from foo.
mov edx, foo+4 ; Load a DWORD from foo+4.
Preferred If the Contents of MM0 Are No Longer Needed
; Store MM0 as two separate DWORDs without restoring it: the low DWORD of MM0 is
; overwritten, so use this form only when MM0's value is not needed afterwards.
movd foo, mm0 ; Store MM0[31:0] as a DWORD to foo.
punpckhdq mm0, mm0 ; Copy MM0[63:32] into MM0[31:0]; the original low DWORD is lost.
movd foo+4, mm0 ; Store the original MM0[63:32] as a DWORD to foo+4.
...
mov eax, foo ; Load a DWORD from foo.
mov edx, foo+4 ; Load a DWORD from foo+4.