Chapter 9 Optimizing with SIMD Instructions 203
Software Optimization Guide for AMD64 Processors
25112 Rev. 3.06 September 2005
Btr_prefptr = Btr_ptr + 32; Ctr_prefptr = Ctr_ptr + 8;
// This loop cycles through the rows of the TRANSPOSED C matrix. A row
// of C-transpose is calculated by the code in this loop and then the
// next row is determined in the following loop iteration. There are
// 32 rows in C-transpose.
for (Ctr_row_num = 0; Ctr_row_num < 32; Ctr_row_num++) {
// Assign pointers to 4 consecutive rows of A by using the
// address of matrix A passed into the function:
Aptr0 = A;
Aptr1 = Aptr0 + 32;
Aptr2 = Aptr0 + 64;
Aptr3 = Aptr0 + 96;
// This loop contains code that "dots" 8 rows of A with the present row
// of B-transpose. By looping 4 times, all 32 rows of A are multiplied
// with that same row of B-transpose (i.e., the present column of B).
for (Ctr_8col_blck = 0; Ctr_8col_blck < 4; Ctr_8col_blck++) {
// This instruction prefetches 1/4 of the next row of B-transpose,
// the row by which matrix A will be multiplied next. The enclosing
// loop executes 4 times, and Btr_prefptr (the pointer to the address
// of B-transpose to be prefetched) is incremented by 8 doubles
// (64 bytes, or 1 cache line) on each pass. The entire next row of
// B-transpose (32 doubles, or 4 cache lines) is therefore brought to
// the processor before Ctr_row_num is incremented in the outer loop.
_mm_prefetch(&Btr_prefptr[0], 2);
// The loop below "dots" 4 consecutive rows of A with a row of
// B-transpose by looping 8 times through code that multiplies
// 4 elements of each row of A by 4 elements of B-transpose's row
// and accumulates the products.
for (n = 0; n < 8; n++) {
Ctr_ptr[0] += Aptr0[0]*Btr_ptr[0] + Aptr0[1]*Btr_ptr[1] +
Aptr0[2]*Btr_ptr[2] + Aptr0[3]*Btr_ptr[3];
Ctr_ptr[1] += Aptr1[0]*Btr_ptr[0] + Aptr1[1]*Btr_ptr[1] +
Aptr1[2]*Btr_ptr[2] + Aptr1[3]*Btr_ptr[3];
Ctr_ptr[2] += Aptr2[0]*Btr_ptr[0] + Aptr2[1]*Btr_ptr[1] +
Aptr2[2]*Btr_ptr[2] + Aptr2[3]*Btr_ptr[3];
Ctr_ptr[3] += Aptr3[0]*Btr_ptr[0] + Aptr3[1]*Btr_ptr[1] +
Aptr3[2]*Btr_ptr[2] + Aptr3[3]*Btr_ptr[3];
// Advance the pointers into B-transpose's row and A's rows to
// the next 4 elements to be multiplied and accumulated.
Btr_ptr += 4;
Aptr0 += 4;
Aptr1 += 4;
Aptr2 += 4;
Aptr3 += 4;
}
// The pointer to C-transpose is incremented by 4 doubles to
// address the next 4 elements of C-transpose's row to be determined.
Ctr_ptr += 4;
// The pointer to B transpose points to the end of the present
// row. We need to subtract 32 doubles so Btr_ptr points