Skip to content

Commit

Permalink
Refs EA31337#738. WIP. Partially working Matrix multiplication via Op…
Browse files Browse the repository at this point in the history
…enCL. Now we need some improvements and Matrix::Deflatten().
  • Loading branch information
nseam committed Apr 24, 2024
1 parent 2f2fce7 commit b612dac
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 50 deletions.
115 changes: 115 additions & 0 deletions Matrix.matmul.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#define WIDTH 1
#define TSM 128 // The tile-size in dimension M
#define TSN 128 // The tile-size in dimension N
#define TSK 16 // The tile-size in dimension K
#define WPTM 8 // The work-per-thread in dimension M
#define WPTN 8 // The work-per-thread in dimension N
#define RTSM (TSM/WPTM) // The reduced tile-size in dimension M
#define RTSN (TSN/WPTN) // The reduced tile-size in dimension N
#define LPTA ((TSK*TSM)/(RTSM*RTSN)) // Loads-per-thread for A
#define LPTB ((TSK*TSN)/(RTSM*RTSN)) // Loads-per-thread for B
__kernel void matmul(const int M, const int N, const int K,
const __global double* A,
const __global double* B,
__global float* C) {

// Thread identifiers
const int tidm = get_local_id(0); // Local row ID (max: TSM/WPTM)
const int tidn = get_local_id(1); // Local col ID (max: TSN/WPTN)
const int offsetM = TSM*get_group_id(0); // Work-group offset
const int offsetN = TSN*get_group_id(1); // Work-group offset

// Local memory to fit a tile of A and B
__local float Asub[TSK][TSM];
__local float Bsub[TSK][TSN];

// Allocate register space
float Areg;
float Breg[WPTN];
float acc[WPTM][WPTN];

// Initialise the accumulation registers
for (int wm=0; wm<WPTM; wm++) {
for (int wn=0; wn<WPTN; wn++) {
acc[wm][wn] = 0.0f;
}
}

// Loop over all tiles
int numTiles = K/TSK;
for (int t=0; t<numTiles; t++) {

// Load one tile of A and B into local memory
for (int la=0; la<LPTA/WIDTH; la++) {
int tid = tidn*RTSM + tidm;
int id = la*RTSN*RTSM + tid;
int row = id % (TSM/WIDTH);
int col = id / (TSM/WIDTH);

// Load the values (wide vector load)
int tiledIndex = TSK*t + col;
double vecA = A[tiledIndex*(M/WIDTH) + offsetM/WIDTH + row];
double vecB = B[tiledIndex*(N/WIDTH) + offsetN/WIDTH + row];

// Store the loaded vectors into local memory
#if WIDTH == 1
Asub[col][row] = vecA;
Asub[col][row] = vecA;
#elif WIDTH == 2
Asub[col][WIDTH*row + 0] = vecA.x;
Asub[col][WIDTH*row + 1] = vecA.y;
#elif WIDTH == 4
Asub[col][WIDTH*row + 0] = vecA.x;
Asub[col][WIDTH*row + 1] = vecA.y;
Asub[col][WIDTH*row + 2] = vecA.z;
Asub[col][WIDTH*row + 3] = vecA.w;
#endif
#if WIDTH == 1
Bsub[col][row] = vecB;
Bsub[col][row] = vecB;
#elif WIDTH == 2
Bsub[col][WIDTH*row + 0] = vecB.x;
Bsub[col][WIDTH*row + 1] = vecB.y;
#elif WIDTH == 4
Bsub[col][WIDTH*row + 0] = vecB.x;
Bsub[col][WIDTH*row + 1] = vecB.y;
Bsub[col][WIDTH*row + 2] = vecB.z;
Bsub[col][WIDTH*row + 3] = vecB.w;
#endif
}

// Synchronise to make sure the tile is loaded
barrier(CLK_LOCAL_MEM_FENCE);

// Loop over the values of a single tile
for (int k=0; k<TSK; k++) {

// Cache the values of Bsub in registers
for (int wn=0; wn<WPTN; wn++) {
int col = tidn + wn*RTSN;
Breg[wn] = Bsub[k][col];
}

// Perform the computation
for (int wm=0; wm<WPTM; wm++) {
int row = tidm + wm*RTSM;
Areg = Asub[k][row];
for (int wn=0; wn<WPTN; wn++) {
acc[wm][wn] += Areg * Breg[wn];
}
}
}

// Synchronise before loading the next tile
barrier(CLK_LOCAL_MEM_FENCE);
}

// Store the final results in C
for (int wm=0; wm<WPTM; wm++) {
int globalRow = offsetM + tidm + wm*RTSM;
for (int wn=0; wn<WPTN; wn++) {
int globalCol = offsetN + tidn + wn*RTSN;
C[globalCol*M + globalRow] = acc[wm][wn];
}
}
}
23 changes: 23 additions & 0 deletions Matrix.matmul.naive.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

__kernel void matmul(
__global double* A,
__global double* B,
__global double* C,
int rowsA,
int colsA,
int colsB
)
{
int row = get_global_id(0);
int col = get_global_id(1);

double sum = 0.0;

for(int k = 0; k < colsA; ++k) {
sum += A[row * colsA + k] * B[k * colsB + col];
//sum += col;
}

C[row * colsB + col] = sum;
}
19 changes: 19 additions & 0 deletions Matrix.matmul.test.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void matmul(const int M, const int N, const int K,
const __global double* A,
const __global double* B,
__global double* C) {

// Thread identifiers
//const int globalRow = get_global_id(0); // Row ID of C (0..M)
//const int globalCol = get_global_id(1); // Col ID of C (0..N)

// Compute a single element (loop over K)
//float acc = 0.0f;
//for (int k=0; k<K; k++) {
//acc += A[k*M + globalRow] * B[globalCol*K + k];
//}

// Store the result
//C[globalCol*M + globalRow] = acc;
}
97 changes: 49 additions & 48 deletions Matrix.mqh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@
#include "Math.h"
#include "OpenCL.h"

#resource "Matrix.matmul.cl" as string CLSource_Matrix_MatMul
#resource "Matrix.matmul.naive.cl" as string CLSource_Matrix_MatMul_Naive
#resource "Matrix.matmul.test.cl" as string CLSource_Matrix_MatMul_Test

#define MATRIX_DIMENSIONS 6
#define MATRIX_VALUES_ARRAY_INCREMENT 500

Expand Down Expand Up @@ -746,9 +750,9 @@ class MatrixOpenCLBuffer : public Dynamic {
/**
* Constructor.
*/
MatrixOpenCLBuffer(int _size) {
MatrixOpenCLBuffer(int _size, unsigned int _flags) {
version = 0;
buffer = OpenCL::Alloc(_size);
buffer = OpenCL::Alloc(_size, _flags);
ArrayResize(data, _size);
}

Expand Down Expand Up @@ -871,18 +875,7 @@ class Matrix {
return;
}

cl_program_matmul = OpenCL::Compile(
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable" NL "__kernel void matmul(" NL " const int Mdim," NL
" const int Ndim," NL " const int Pdim," NL " __global double* A," NL " __global double* B," NL
" __global double* C," NL " __local double* Bwrk" NL ")" NL "{" NL " int k, j;" NL
" int i = get_global_id(0);" NL " int iloc = get_local_id(0);" NL
" int nloc = get_local_size(0);" NL " double Awrk[1000];" NL " double tmp;" NL "" NL
" for (k = 0; k < Pdim; k++)" NL " Awrk[k] = A[i * Ndim + k];" NL "" NL
" for (j = 0; j < Mdim; j++)" NL " {" NL " for (k = iloc; k < Pdim; k = k + nloc)" NL
" Bwrk[k] = B[k * Pdim + j];" NL "" NL " barrier(CLK_LOCAL_MEM_FENCE);" NL "" NL
" tmp = 0.0f;" NL "" NL " for (k = 0; k < Pdim; k++)" NL
" tmp += Awrk[k] * Bwrk[k];" NL " C[i * Ndim + j] += tmp;" NL " }" NL "}" NL,
"matmul");
cl_program_matmul = OpenCL::Compile(CLSource_Matrix_MatMul_Naive, "matmul");

cl_program_matmul_single = OpenCL::Compile(
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable" NL "__kernel void matmul(" NL "const int Mdim," NL
Expand All @@ -904,7 +897,7 @@ class Matrix {
_buffer = cl_buffers_in_0.GetByKey(_size, _buffer);

if (!_buffer.IsSet()) {
_buffer = new MatrixOpenCLBuffer<X>(_size);
_buffer = new MatrixOpenCLBuffer<X>(_size, CL_MEM_READ_ONLY);
cl_buffers_in_0.Set(_size, _buffer);
}

Expand All @@ -920,7 +913,7 @@ class Matrix {
_buffer = cl_buffers_in_1.GetByKey(_size, _buffer);

if (!_buffer.IsSet()) {
_buffer = new MatrixOpenCLBuffer<X>(_size);
_buffer = new MatrixOpenCLBuffer<X>(_size, CL_MEM_READ_ONLY);
cl_buffers_in_1.Set(_size, _buffer);
}

Expand All @@ -936,7 +929,7 @@ class Matrix {
_buffer = cl_buffers_out.GetByKey(_size, _buffer);

if (!_buffer.IsSet()) {
_buffer = new MatrixOpenCLBuffer<X>(_size);
_buffer = new MatrixOpenCLBuffer<X>(_size, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
cl_buffers_out.Set(_size, _buffer);
}

Expand Down Expand Up @@ -1419,7 +1412,7 @@ class Matrix {
static void MatMul(Matrix<X>& source, Matrix<X>& target, Matrix<X>& output) {
#ifdef MATRIX_USE_OPENCL
// MatMulCL(source, target, output);
MatMulCLSingle(source, target, output);
MatMulCL(source, target, output);
return;
#endif

Expand All @@ -1445,48 +1438,56 @@ class Matrix {
* method.
*/
static void MatMulCL(Matrix<X>& _source, Matrix<X>& _target, Matrix<X>& _output) {
if (_source.GetSize() != _target.GetRange(1)) {
if (_source.GetRange(1) != _target.GetRange(0)) {
Alert("Inconsistent size of matrices!");
}

unsigned int _m_dim = _source.GetRange(0);
unsigned int _n_dim = _source.GetRange(1);
unsigned int _p_dim = _target.GetRange(1);
unsigned int _rows_a = _source.GetRange(0);
unsigned int _cols_a = _source.GetRange(1);
unsigned int _cols_b = _target.GetRange(1);

OpenCLBuffer* _in_1 = GetCLBufferInArg0(_source.GetSize()) PTR_DEREF GetBuffer();
OpenCLBuffer* _in_2 = GetCLBufferInArg1(_target.GetSize()) PTR_DEREF GetBuffer();
OpenCLBuffer* _out = GetCLBufferOutArg(_source.GetRange(0)) PTR_DEREF GetBuffer();
OpenCLBuffer* _in_1 = GetCLBufferInArg0(_rows_a * _cols_a) PTR_DEREF GetBuffer();
OpenCLBuffer* _in_2 = GetCLBufferInArg1(_cols_a * _cols_b) PTR_DEREF GetBuffer();
OpenCLBuffer* _out = GetCLBufferOutArg(_rows_a * _cols_b) PTR_DEREF GetBuffer();

double _in_1_data[];
double _in_2_data[];
double _out_data[];

// ArrayResize(_out_data, _out PTR_DEREF GetSizeItems());

_source.GetRawArray(_in_1_data);
_target.GetRawArray(_in_2_data);

cl_program_matmul REF_DEREF SetArg(0, _m_dim);
cl_program_matmul REF_DEREF SetArg(1, _n_dim);
cl_program_matmul REF_DEREF SetArg(2, _p_dim);
cl_program_matmul REF_DEREF SetArg(3, _in_1);
cl_program_matmul REF_DEREF SetArg(4, _in_2);
cl_program_matmul REF_DEREF SetArg(5, _out);
cl_program_matmul REF_DEREF SetArgLocalMem(6, _p_dim * sizeof(X));
_in_1 PTR_DEREF Write(_in_1_data);
_in_2 PTR_DEREF Write(_in_2_data);

cl_program_matmul REF_DEREF SetArg(0, _in_1);
cl_program_matmul REF_DEREF SetArg(1, _in_2);
cl_program_matmul REF_DEREF SetArg(2, _out);
cl_program_matmul REF_DEREF SetArg(3, (int)_rows_a);
cl_program_matmul REF_DEREF SetArg(4, (int)_cols_a);
cl_program_matmul REF_DEREF SetArg(5, (int)_cols_b);

ARRAY(unsigned int, _global_work_offset);
ARRAY(unsigned int, _global_work_size);
ARRAY(unsigned int, _local_work_size);

ArrayPush(_global_work_offset, 0U);
ArrayPush(_global_work_size, _n_dim);
ArrayPush(_global_work_size, _m_dim);
ArrayPush(_local_work_size, _m_dim);
ArrayPush(_global_work_offset, 0U);
ArrayPush(_global_work_size, (unsigned int)_rows_a);
ArrayPush(_global_work_size, (unsigned int)_cols_b);
ArrayPush(_local_work_size, 1U);
ArrayPush(_local_work_size, 1U);

if (!cl_program_matmul REF_DEREF RunMany(2, _global_work_offset, _global_work_size, _local_work_size)) {
Alert("Errpr: Could not run Matrix::MatMulCL() over OpenCL!");
Alert("Error: Could not run Matrix::MatMulCL() over OpenCL!");
DebugBreak();
}

_out PTR_DEREF Read(_out_data);

// _output.SetShape(num_outputs);
_output.SetShape(_rows_a, _cols_b);
}

/**
Expand All @@ -1498,13 +1499,13 @@ class Matrix {
Alert("Inconsistent size of matrices!");
}

unsigned int _m_dim = _source.GetRange(0);
unsigned int _n_dim = _source.GetRange(1);
unsigned int _p_dim = _target.GetRange(1);
unsigned int _cols_a = _source.GetRange(0);
unsigned int _rows_a = _source.GetRange(1);
unsigned int _cols_b = _target.GetRange(1);

OpenCLBuffer* _in_1 = GetCLBufferInArg0(_n_dim * _p_dim) PTR_DEREF GetBuffer();
OpenCLBuffer* _in_2 = GetCLBufferInArg1(_p_dim * _m_dim) PTR_DEREF GetBuffer();
OpenCLBuffer* _out = GetCLBufferOutArg(_m_dim * _p_dim) PTR_DEREF GetBuffer();
OpenCLBuffer* _in_1 = GetCLBufferInArg0(_rows_a * _cols_b) PTR_DEREF GetBuffer();
OpenCLBuffer* _in_2 = GetCLBufferInArg1(_cols_b * _cols_a) PTR_DEREF GetBuffer();
OpenCLBuffer* _out = GetCLBufferOutArg(_cols_a * _cols_b) PTR_DEREF GetBuffer();

double _in_1_data[];
double _in_2_data[];
Expand All @@ -1515,16 +1516,16 @@ class Matrix {
_source.GetRawArray(_in_1_data);
_target.GetRawArray(_in_2_data);

MatMulCL_CPU(_m_dim, _n_dim, _p_dim, _in_1_data, _in_2_data, _out_data);
MatMulCL_CPU(_cols_a, _rows_a, _cols_b, _in_1_data, _in_2_data, _out_data);

cl_program_matmul REF_DEREF SetArg(0, _n_dim);
cl_program_matmul REF_DEREF SetArg(1, _m_dim);
cl_program_matmul REF_DEREF SetArg(2, _p_dim);
cl_program_matmul REF_DEREF SetArg(0, (int)_rows_a);
cl_program_matmul REF_DEREF SetArg(1, (int)_cols_a);
cl_program_matmul REF_DEREF SetArg(2, (int)_cols_b);
cl_program_matmul REF_DEREF SetArg(3, _in_1);
cl_program_matmul REF_DEREF SetArg(4, _in_2);
cl_program_matmul REF_DEREF SetArg(5, _out);

if (!cl_program_matmul_single REF_DEREF Run()) {
if (!cl_program_matmul REF_DEREF Run()) {
Alert("Errpr: Could not run Matrix::MatMulCL() over OpenCL!");
DebugBreak();
}
Expand Down
4 changes: 3 additions & 1 deletion OpenCL.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class OpenCLBuffer : public Dynamic {
DebugBreak();
return;
}
ArrayResize(_arr, buffer_size);
CLBufferRead(buffer_handle, _arr);
}

Expand Down Expand Up @@ -147,6 +148,7 @@ class OpenCLProgram : public Dynamic {
* your version isn't greater that the one already set in the buffer.
*/
void SetArg(int _index, double value) { CLSetKernelArg(kernel_handle, _index, value); }
void SetArg(int _index, int value) { CLSetKernelArg(kernel_handle, _index, value); }

/**
* Passes argument to the kernel. Will not set kernel argument if not needed.
Expand Down Expand Up @@ -479,7 +481,7 @@ class OpenCL {
/**
* Allocates memory to be later passed to OpenCLProgram.
*/
static OpenCLBuffer* Alloc(int _size) { return new OpenCLBuffer(_size); }
static OpenCLBuffer* Alloc(int _size, unsigned int _flags) { return new OpenCLBuffer(_size, _flags); }

/**
* Compiles given program and returns its id or -1 in case of error.
Expand Down
2 changes: 1 addition & 1 deletion tests/OpenCLTest.mq5
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ int OnInit() {

double result[] = {0};

Ref<OpenCLBuffer> buffer = OpenCL::Alloc(1 /* 1 double */);
Ref<OpenCLBuffer> buffer = OpenCL::Alloc(1 /* 1 double */, CL_MEM_READ_WRITE);

if (!program REF_DEREF Run(buffer.Ptr())) {
Alert("Error running program!");
Expand Down

0 comments on commit b612dac

Please sign in to comment.