General Optimization Guidelines 2
2-85
Memory routines in the runtime library generated by Intel Compilers are
optimized across a wide range of address alignments, counter values, and
microarchitectures. In most cases, applications should take advantage of
the default memory routines provided by Intel Compilers.
Table 2-5 Using REP STOSD with Arbitrary Count Size and 4-Byte-Aligned
Destination
A ‘C’ example of Memset() | Equivalent Implementation Using REP STOSD
/*
 * Reference byte-at-a-time fill: writes the low byte of `c` into each of
 * the first `size` bytes starting at `dst`. Nothing is returned, and no
 * memory is touched when `size` is zero.
 */
void memset(void *dst, int c, size_t size)
{
    char *cursor = (char *)dst;
    const char fill = (char)c;      /* truncate once, outside the loop */
    size_t remaining = size;

    while (remaining != 0) {
        *cursor = fill;
        ++cursor;
        --remaining;
    }
}
;-----------------------------------------------------------------------
; REP STOSD equivalent of: void memset(void *dst, int c, size_t size)
; 32-bit code, Intel/MASM-style syntax; arguments are read from the stack.
; In (offsets valid after the push below):
;       [esp+8]  = dst   (must be 4-byte aligned)
;       [esp+12] = c     (only the low byte is used)
;       [esp+16] = size  (byte count)
; Uses: eax, ecx, flags; edi is saved and restored.
; NOTE(review): the string stores assume DF = 0 (ascending addresses);
; nothing here clears it -- confirm the calling convention guarantees it.
;-----------------------------------------------------------------------
        push    edi
        ; Replicate the fill byte into all four bytes of eax.
        movzx   eax, byte ptr [esp+12]  ; eax = 000000cc
        mov     ecx, eax
        shl     ecx, 8                  ; ecx = 0000cc00
        or      eax, ecx                ; eax = 0000cccc
        mov     ecx, eax
        shl     ecx, 16                 ; ecx = cccc0000
        or      eax, ecx                ; eax = cccccccc
        mov     edi, [esp+8]            ; 4-byte aligned
        mov     ecx, [esp+16]           ; byte count
        shr     ecx, 2                  ; do dword
        cmp     ecx, 127
        jbe     _main                   ; 127 dwords or fewer: skip the alignment peel
        test    edi, 4
        jz      _main                   ; bit 2 clear: already 8-byte aligned
        stosd                           ; peel off one dword
        dec     ecx                     ; cannot underflow: ecx > 127 here
_main:                                  ; 8-byte aligned when more than 127 dwords
        rep     stosd
        mov     ecx, [esp + 16]
        and     ecx, 3                  ; do count <= 3
        rep     stosb                   ; optimal with <= 3
        pop     edi
        ret