180 Integer Optimizations Chapter 8
25112 Rev. 3.06 September 2005
Software Optimization Guide for AMD64 Processors
4. The four 4-bit sums can now be rapidly accumulated by multiplying with a so-called magic
multiplier. This can be derived from looking at the following chart of partial products:
0p0q0r0s * 01010101 =
:0p0q0r0s
0p:0q0r0s
0p0q:0r0s
0p0q0r:0s
000pxxww:vvuutt0s
Here p, q, r, and s are the 4-bit sums from the previous step, and vv is the final interesting result.
The final result is as follows:
z = (y * 0x01010101) >> 24
Integer Version
unsigned int popcount(unsigned int v)
{
unsigned int retVal;
__asm {
mov eax, [v] ; v
mov edx, eax ; v
shr eax, 1 ; v >> 1
and eax, 055555555h ; (v >> 1) & 0x55555555
sub edx, eax ; w = v - ((v >> 1) & 0x55555555)
mov eax, edx ; w
shr edx, 2 ; w >> 2
and eax, 033333333h ; w & 0x33333333
and edx, 033333333h ; (w >> 2) & 0x33333333
add eax, edx ; x = (w & 0x33333333) + ((w >> 2) &
; 0x33333333)
mov edx, eax ; x
shr eax, 4 ; x >> 4
add eax, edx ; x + (x >> 4)
and eax, 00F0F0F0Fh ; y = (x + (x >> 4) & 0x0F0F0F0F)
imul eax, 001010101h ; y * 0x01010101
shr eax, 24 ; population count = (y *
; 0x01010101) >> 24
mov retVal, eax ; Store result.
}
return(retVal);
}