220 Optimizing with SIMD Instructions Chapter 9
25112 Rev. 3.06 September 2005
Software Optimization Guide for AMD64 Processors
; xmm4 = [d,c,b,a]
; xmm5 = [D,C,B,A]
; xmm6 = [h,g,f,e]
; xmm7 = [H,G,F,E]
;
; and arranges them to look like:
; xmm4 = [E,e,A,a]
; xmm1 = [F,f,B,b]
; xmm3 = [G,g,C,c]
; xmm2 = [H,h,D,d]
; Transpose the four input registers with two rounds of interleaves.
; Registers are written high element first: [elem3,elem2,elem1,elem0].
;   unpcklps dst, src -> dst = [src1,dst1,src0,dst0]
;   unpckhps dst, src -> dst = [src3,dst3,src2,dst2]
; xmm0-xmm5 are written by this sequence; xmm6 and xmm7 are only read.
; Round 1: interleave xmm4 with xmm6, and xmm5 with xmm7.
movaps xmm3, xmm4 ; xmm3 | [d,c,b,a] (copy: xmm4 is overwritten by the next unpcklps)
movaps xmm0, xmm5 ; xmm0 | [D,C,B,A] (copy: xmm5 is overwritten by its unpcklps below)
unpcklps xmm4, xmm6 ; xmm4 | [f,b,e,a]
unpckhps xmm3, xmm6 ; xmm3 | [h,d,g,c]
movaps xmm1, xmm4 ; xmm1 | [f,b,e,a] (copy: round 2 needs both halves of this value)
movaps xmm2, xmm3 ; xmm2 | [h,d,g,c] (copy: round 2 needs both halves of this value)
unpcklps xmm5, xmm7 ; xmm5 | [F,B,E,A]
unpckhps xmm0, xmm7 ; xmm0 | [H,D,G,C]
; Round 2: interleave the lowercase intermediates with the uppercase ones.
unpcklps xmm4, xmm5 ; xmm4 | [E,e,A,a]
unpckhps xmm1, xmm5 ; xmm1 | [F,f,B,b]
unpcklps xmm3, xmm0 ; xmm3 | [G,g,C,c]
unpckhps xmm2, xmm0 ; xmm2 | [H,h,D,d]
; Now if we compute the sum of these registers, we get the dot-product
; of the first row of A with vector X:
;
; a+b+c+d
;
; in the lower DWORD of the resultant XMM register. The dot-product of the
; second row is stored in the second DWORD and so on, such that:
;
; xmm1 = [E+F+G+H,e+f+g+h,A+B+C+D,a+b+c+d]
; Add the four transposed registers element-wise. The total accumulates in
; xmm1; xmm3 is overwritten with a partial sum; xmm2 and xmm4 are only read.
addps xmm1, xmm4 ; xmm1 | [E+F,e+f,A+B,a+b]
addps xmm3, xmm2 ; xmm3 | [G+H,g+h,C+D,c+d]
addps xmm1, xmm3 ; xmm1 | [E+F+G+H,e+f+g+h,A+B+C+D,a+b+c+d]