Skip to content

Commit

Permalink
Fix k = 0 edge case in power10 microkernels (#706)
Browse files Browse the repository at this point in the history
Details:
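- When the power10 sgemm and dgemm microkernels are called with k = 0, they
  become caught in infinite loops and segfault. This is fixed by
  zero-initializing the accumulators and no longer pulling the first
  iteration out of the k loop, so k_iter and k_left are computed from k
  (not k-1) and are both 0 when k = 0.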
  • Loading branch information
nisanthmp authored Jan 11, 2023
1 parent 2e1ba9d commit d220f9c
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 33 deletions.
29 changes: 12 additions & 17 deletions kernels/power10/3/bli_dgemm_power10_mma.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,10 @@ void bli_dgemm_power10_mma_8x8
cntx_t* cntx
)
{

// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
// (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
uint64_t k_iter = (k-1) / 4;
uint64_t k_left = (k-1) % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;

uint64_t rs_c = rs_c0;

Expand Down Expand Up @@ -110,6 +108,16 @@ void bli_dgemm_power10_mma_8x8
__vector_quad acc0, acc1, acc2, acc3,
acc4, acc5, acc6, acc7;

// initialize the accumulators to zeros
__builtin_mma_xxsetaccz(&acc0);
__builtin_mma_xxsetaccz(&acc1);
__builtin_mma_xxsetaccz(&acc2);
__builtin_mma_xxsetaccz(&acc3);
__builtin_mma_xxsetaccz(&acc4);
__builtin_mma_xxsetaccz(&acc5);
__builtin_mma_xxsetaccz(&acc6);
__builtin_mma_xxsetaccz(&acc7);

/* 2 vector pairs are necessary for a double precision outer product
instruction. */
__vector_pair colA_1,
Expand Down Expand Up @@ -141,19 +149,6 @@ void bli_dgemm_power10_mma_8x8
*/
D_ASSEMBLE_VEC_PAIR

/* Compute accumulate outer products and override accumulators with result */
__builtin_mma_xvf64ger (&acc0, colA_1, rb[0]);
__builtin_mma_xvf64ger (&acc1, colA_1, rb[1]);
__builtin_mma_xvf64ger (&acc2, colA_1, rb[2]);
__builtin_mma_xvf64ger (&acc3, colA_1, rb[3]);
__builtin_mma_xvf64ger (&acc4, colA_2, rb[0]);
__builtin_mma_xvf64ger (&acc5, colA_2, rb[1]);
__builtin_mma_xvf64ger (&acc6, colA_2, rb[2]);
__builtin_mma_xvf64ger (&acc7, colA_2, rb[3]);

/* Move A and B pointers */
D_INCREMENT

// k loop (unrolled by 4)
for (int k = 0; k<k_iter; k++)
{
Expand Down
29 changes: 13 additions & 16 deletions kernels/power10/3/bli_sgemm_power10_mma.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,8 @@ void bli_sgemm_power10_mma_8x16
{
// Typecast local copies of integers in case dim_t and inc_t are a
// different size than is expected by load instructions.
// (1 is subtracted from k0 because 1 iteration of the k loop is pulled out)
uint64_t k_iter = (k-1) / 4;
uint64_t k_left = (k-1) % 4;
uint64_t k_iter = k / 4;
uint64_t k_left = k % 4;

uint64_t rs_c = rs_c0;

Expand All @@ -84,6 +83,16 @@ void bli_sgemm_power10_mma_8x16
__vector_quad acc0, acc1, acc2, acc3,
acc4, acc5, acc6, acc7;

// initialize the accumulators to zeros
__builtin_mma_xxsetaccz(&acc0);
__builtin_mma_xxsetaccz(&acc1);
__builtin_mma_xxsetaccz(&acc2);
__builtin_mma_xxsetaccz(&acc3);
__builtin_mma_xxsetaccz(&acc4);
__builtin_mma_xxsetaccz(&acc5);
__builtin_mma_xxsetaccz(&acc6);
__builtin_mma_xxsetaccz(&acc7);

float* restrict A0 = a;
float* restrict B0 = b;
float* restrict C0 = c;
Expand All @@ -95,18 +104,6 @@ void bli_sgemm_power10_mma_8x16
vec_t *ca = (vec_t *) A0;
vec_t *rb = (vec_t *) B0;

/* Compute accumulate outer products and override accumulators with result */
__builtin_mma_xvf32ger (&acc0, ca[0], rb[0]);
__builtin_mma_xvf32ger (&acc1, ca[0], rb[1]);
__builtin_mma_xvf32ger (&acc2, ca[0], rb[2]);
__builtin_mma_xvf32ger (&acc3, ca[0], rb[3]);
__builtin_mma_xvf32ger (&acc4, ca[1], rb[0]);
__builtin_mma_xvf32ger (&acc5, ca[1], rb[1]);
__builtin_mma_xvf32ger (&acc6, ca[1], rb[2]);
__builtin_mma_xvf32ger (&acc7, ca[1], rb[3]);

S_INCREMENT

// k loop (unrolled by 4)
for (int k = 0; k<k_iter; k++)
{
Expand Down Expand Up @@ -147,4 +144,4 @@ void bli_sgemm_power10_mma_8x16
}

GEMM_UKR_FLUSH_CT( s );
}
}

0 comments on commit d220f9c

Please sign in to comment.