Support User Manuals

AMD 250 Computer Hardware User Manual

Open as PDF

of 384

Chapter 9 Optimizing with SIMD Instructions 203

Software Optimization Guide for AMD64 Processors

25112 Rev. 3.06 September 2005

Btr_prefptr = Btr_ptr + 32; Ctr_prefptr = Ctr_ptr + 8;

// This loop cycles through the rows of the TRANSPOSED C matrix. A row

// of C-transpose is calculated by the code in this loop and then the

// next row is determined in the following loop iteration. There are

// 32 rows in C-transpose.

for (Ctr_row_num = 0; Ctr_row_num < 32; Ctr_row_num++) {

// Assign pointers to 4 consecutive rows of A by using the

// address of matrix A passed into the function:

Aptr0 = A;

Aptr1 = Aptr0 + 32;

Aptr2 = Aptr0 + 64;

Aptr3 = Aptr0 + 96;

// This loop contains code that "dots" 8 rows of A with the present row

// of B-transpose. By looping 4 times, all 32 rows of A are multiplied

// by the present row of B-transpose.

for (Ctr_8col_blck = 0; Ctr_8col_blck < 4; Ctr_8col_blck++) {

// This instruction prefetches 1/4 of the next row of B-transpose

// by which matrix A needs to be multiplied. The loop within which

// this code resides is executed 4 times, and by incrementing

// Btr_prefptr (the pointer to the address in B-transpose to be

// prefetched) by 8 doubles (or 64 bytes, or 1 cache line), the entire

// contents of the next row of B-transpose are brought to the

// processor in advance, before Ctr_row_num in the outer loop is

// incremented.

_mm_prefetch(&Btr_prefptr[0], 2);

// The loop below "dots" 4 consecutive rows of A with a row of

// B-transpose by looping 8 times through code that multiplies and

// accumulates the products of 4 elements of each of A's rows with 4

// elements of B-transpose's row.

for (n = 0; n < 8; n++) {

Ctr_ptr[0] += Aptr0[0]*Btr_ptr[0] + Aptr0[1]*Btr_ptr[1] +

Aptr0[2]*Btr_ptr[2] + Aptr0[3]*Btr_ptr[3];

Ctr_ptr[1] += Aptr1[0]*Btr_ptr[0] + Aptr1[1]*Btr_ptr[1] +

Aptr1[2]*Btr_ptr[2] + Aptr1[3]*Btr_ptr[3];

Ctr_ptr[2] += Aptr2[0]*Btr_ptr[0] + Aptr2[1]*Btr_ptr[1] +

Aptr2[2]*Btr_ptr[2] + Aptr2[3]*Btr_ptr[3];

Ctr_ptr[3] += Aptr3[0]*Btr_ptr[0] + Aptr3[1]*Btr_ptr[1] +

Aptr3[2]*Btr_ptr[2] + Aptr3[3]*Btr_ptr[3];

// Increment the pointers to B-transpose's row and A's rows to

// the next 4 elements to be multiplied and accumulated.

Btr_ptr += 4;

Aptr0 += 4;

Aptr1 += 4;

Aptr2 += 4;

Aptr3 += 4;

}

// The pointer to C-transpose is incremented by 4 doubles to

// address the next 4 elements of C-transpose's row to be determined.

Ctr_ptr += 4;

// The pointer to B transpose points to the end of the present

// row. We need to subtract 32 doubles so Btr_ptr points

previous next