forked from EA31337/EA31337-classes
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refs EA31337#738. WIP. Partially working Matrix multiplication via OpenCL. Now we need some improvements and Matrix::Deflatten().
- Loading branch information
Showing 6 changed files with 210 additions and 50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
/* Compile-time tuning parameters for the tiled matmul kernel. */
#define WIDTH 1                              /* Width divisor applied to the tile-load index math */

/* Work-group tile extents. */
#define TSM 128                              /* Tile extent along M */
#define TSN 128                              /* Tile extent along N */
#define TSK 16                               /* Tile extent along K */

/* Per-work-item output footprint. */
#define WPTM 8                               /* Outputs per work-item along M */
#define WPTN 8                               /* Outputs per work-item along N */

/* Derived quantities. */
#define RTSM (TSM / WPTM)                    /* Reduced tile extent along M */
#define RTSN (TSN / WPTN)                    /* Reduced tile extent along N */
#define LPTA ((TSK * TSM) / (RTSM * RTSN))   /* Loads per work-item for a tile of A */
#define LPTB ((TSK * TSN) / (RTSM * RTSN))   /* Loads per work-item for a tile of B */
// The inputs are double precision: OpenCL 1.0/1.1 devices need the fp64
// extension enabled explicitly (the directive is harmless on newer versions).
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/**
 * Tiled matrix multiplication with per-work-item register blocking.
 *
 * Element addressing used by this kernel:
 *   A[k*M + m],  B[k*N + n],  C[n*M + m]
 * with m in [0, M), n in [0, N) and k in [0, K).
 *
 * Each work-group produces a TSM x TSN tile of C and each work-item
 * accumulates a WPTM x WPTN patch of that tile. The code is written for a
 * work-group of RTSM x RTSN work-items (local ids 0..RTSM-1 and 0..RTSN-1).
 *
 * Inputs are read as double but narrowed to float when staged in local
 * memory; accumulation and the output C are single precision.
 *
 * Elements outside M/N/K are treated as zero on load and skipped on store, so
 * dimensions that are not multiples of the tile sizes are handled as long as
 * the launch grid covers the padded range.
 *
 * Loads are scalar, so the WIDTH macro does not apply to this kernel.
 *
 * @param M  Extent of the m index.
 * @param N  Extent of the n index.
 * @param K  Extent of the reduction index.
 * @param A  Left operand, addressed as A[k*M + m].
 * @param B  Right operand, addressed as B[k*N + n].
 * @param C  Output, addressed as C[n*M + m].
 */
__kernel void matmul(const int M, const int N, const int K,
                     const __global double* A,
                     const __global double* B,
                     __global float* C) {

    // Thread identifiers
    const int tidm = get_local_id(0);          // Local row ID (max: TSM/WPTM)
    const int tidn = get_local_id(1);          // Local col ID (max: TSN/WPTN)
    const int offsetM = TSM*get_group_id(0);   // Work-group offset along M
    const int offsetN = TSN*get_group_id(1);   // Work-group offset along N

    // Flattened local ID, used to spread the tile loads over the work-group
    const int tid = tidn*RTSM + tidm;

    // Local memory to fit a tile of A and B
    __local float Asub[TSK][TSM];
    __local float Bsub[TSK][TSN];

    // Per-work-item working set
    float Areg;
    float Breg[WPTN];
    float acc[WPTM][WPTN];

    // Initialise the accumulators
    for (int wm = 0; wm < WPTM; wm++) {
        for (int wn = 0; wn < WPTN; wn++) {
            acc[wm][wn] = 0.0f;
        }
    }

    // Round up so a trailing partial tile along K is not dropped
    const int numTiles = (K + TSK - 1) / TSK;
    for (int t = 0; t < numTiles; t++) {

        // Load one tile of A into local memory (zero-padded outside M/K)
        for (int la = 0; la < LPTA; la++) {
            const int id = la*RTSN*RTSM + tid;
            const int row = id % TSM;
            const int col = id / TSM;
            const int tiledIndex = TSK*t + col;
            const int inRange = (tiledIndex < K) && (offsetM + row < M);
            Asub[col][row] = inRange ? (float)A[tiledIndex*M + offsetM + row] : 0.0f;
        }

        // Load one tile of B into local memory (zero-padded outside N/K)
        for (int lb = 0; lb < LPTB; lb++) {
            const int id = lb*RTSN*RTSM + tid;
            const int row = id % TSN;
            const int col = id / TSN;
            const int tiledIndex = TSK*t + col;
            const int inRange = (tiledIndex < K) && (offsetN + row < N);
            Bsub[col][row] = inRange ? (float)B[tiledIndex*N + offsetN + row] : 0.0f;
        }

        // Synchronise to make sure the tile is loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Loop over the values of a single tile
        for (int k = 0; k < TSK; k++) {

            // Cache the values of Bsub in registers
            for (int wn = 0; wn < WPTN; wn++) {
                const int col = tidn + wn*RTSN;
                Breg[wn] = Bsub[k][col];
            }

            // Perform the computation
            for (int wm = 0; wm < WPTM; wm++) {
                const int row = tidm + wm*RTSM;
                Areg = Asub[k][row];
                for (int wn = 0; wn < WPTN; wn++) {
                    acc[wm][wn] += Areg * Breg[wn];
                }
            }
        }

        // Synchronise before the next tile overwrites local memory
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Store the final results in C, skipping elements outside M x N
    for (int wm = 0; wm < WPTM; wm++) {
        const int globalRow = offsetM + tidm + wm*RTSM;
        if (globalRow >= M) {
            continue;
        }
        for (int wn = 0; wn < WPTN; wn++) {
            const int globalCol = offsetN + tidn + wn*RTSN;
            if (globalCol < N) {
                C[globalCol*M + globalRow] = acc[wm][wn];
            }
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/**
 * Naive double-precision matrix multiplication: one work-item per element of C.
 *
 * Element addressing used by this kernel:
 *   A[row*colsA + k],  B[k*colsB + col],  C[row*colsB + col]
 *
 * Global id 0 selects the row and global id 1 the column of the output.
 * Work-items that fall outside rowsA x colsB (e.g. when the global size is
 * rounded up) return without touching memory.
 *
 * @param A      Left operand (read-only).
 * @param B      Right operand (read-only).
 * @param C      Output.
 * @param rowsA  Number of rows of A and C.
 * @param colsA  Number of columns of A; also the length of the reduction.
 * @param colsB  Number of columns of B and C.
 */
__kernel void matmul(
    const __global double* A,
    const __global double* B,
    __global double* C,
    int rowsA,
    int colsA,
    int colsB
)
{
    const int row = get_global_id(0);
    const int col = get_global_id(1);

    // Guard against a launch grid larger than the output matrix.
    if (row >= rowsA || col >= colsB) {
        return;
    }

    double sum = 0.0;

    for (int k = 0; k < colsA; ++k) {
        sum += A[row * colsA + k] * B[k * colsB + col];
    }

    C[row * colsB + col] = sum;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/**
 * Placeholder matmul kernel.
 *
 * The whole body is currently disabled, so the kernel performs no reads and
 * never writes to C. The disabled reference computation is kept below.
 */
__kernel void matmul(const int M,
                     const int N,
                     const int K,
                     const __global double* A,
                     const __global double* B,
                     __global double* C)
{
    /*
     * Disabled reference implementation (one output element per work-item):
     *
     *   const int globalRow = get_global_id(0);   // Row ID of C (0..M)
     *   const int globalCol = get_global_id(1);   // Col ID of C (0..N)
     *
     *   float acc = 0.0f;
     *   for (int k=0; k<K; k++) {
     *       acc += A[k*M + globalRow] * B[globalCol*K + k];
     *   }
     *
     *   C[globalCol*M + globalRow] = acc;
     */
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters