140 Branch Optimizations Chapter 6
25112 Rev. 3.06 September 2005
Software Optimization Guide for AMD64 Processors
Example 5: C Code
#define PI 3.14159265358979323

/*
 * Selects res from r according to two conditions:
 *   xs - x is negative
 *   df - |x| is smaller than |y|
 *
 *   xs  df   res
 *   1   1    PI/2 + r
 *   1   0    PI   - r
 *   0   1    PI/2 - r
 *   0   0    r
 */
float x, y, xa, ya, r, res;
int   xs, df;

xs = (x < 0);           /* relational result is already 0 or 1 */
xa = fabs(x);
ya = fabs(y);
df = (xa < ya);

if (xs) {
    /* x negative */
    if (df) {
        res = PI / 2 + r;
    } else {
        res = PI - r;
    }
} else {
    /* x non-negative */
    if (df) {
        res = PI / 2 - r;
    } else {
        res = r;
    }
}
Example 5: 3DNow!™ Code
;-----------------------------------------------------------------------
; Branch-free 3DNow! equivalent of the Example 5 C code: picks
;   res = pi/2 + r,  pi - r,  pi/2 - r,  or  r
; from the sign of x (xs) and from whether |x| < |y| (df), using only
; bit masks plus one subtract and one add.
;
; Decomposition used below:  res = ar + pr, where
;   ar = (xs ^ df) ? -r : r
;   pr = pi/2 + (df ? 0 : (xs ? pi/2 : -pi/2))
;
; In:    MM0 = r
;        MM1 = y
;        MM2 = x
; Out:   MM0 = res
; Clobb: MM1, MM2, MM3, MM5, MM6, MM7
; Uses:  sgn, mabs, npio2 -- memory constants defined elsewhere; per
;        their use here: sign-bit mask, sign-clearing mask, and -pi/2.
;        NOTE(review): exact bit patterns are not visible in this
;        excerpt -- confirm against their definitions.
;-----------------------------------------------------------------------
        movq    mm7, sgn        ; Mask to extract sign bit
        movq    mm5, mabs       ; Mask to clear sign bit
        pand    mm7, mm2        ; xs = sign(x)
        pand    mm1, mm5        ; ya = abs(y)
        pand    mm2, mm5        ; xa = abs(x)
        movq    mm6, mm1        ; ya (copy: pcmpgtd overwrites its destination)
        pcmpgtd mm6, mm2        ; df = (xa < ya) ? 0xffffffff : 0
                                ; Integer compare is usable because both
                                ; operands had their sign bits cleared above
                                ; (non-NaN inputs assumed).
        pslld   mm6, 31         ; df = bit 31
        movq    mm5, mm7        ; xs (mm5 is free again: mabs no longer needed)
        pxor    mm7, mm6        ; xs ^ df ? 0x80000000 : 0
        movq    mm3, npio2      ; -pi / 2
        pxor    mm5, mm3        ; xs ? pi / 2 : -pi / 2
        psrad   mm6, 31         ; df ? 0xffffffff : 0
        pandn   mm6, mm5        ; xs ? (df ? 0 : pi / 2) : (df ? 0 : -pi / 2)
        pfsub   mm6, mm3        ; pr = pi / 2 + (xs ? (df ? 0 : pi / 2) :
                                ;                     (df ? 0 : -pi / 2))
        pxor    mm0, mm7        ; ar = xs ^ df ? -r : r
                                ; pxor flips the sign bit (true negation), so
                                ; the result equals PI - r and PI/2 - r for
                                ; either sign of r; por would only force the
                                ; sign bit on, giving -|r|.
        pfadd   mm0, mm6        ; res = ar + pr