Refs EA31337#738. WIP. Partially working Matrix multiplication via Op…

…enCL. Now we need some improvements and Matrix::Deflatten().
nseam · Apr 24, 2024 · b612dac · b612dac
1 parent 2f2fce7
commit b612dac
Show file tree

Hide file tree

Showing 6 changed files with 210 additions and 50 deletions.
diff --git a/Matrix.matmul.cl b/Matrix.matmul.cl
@@ -0,0 +1,115 @@
+#define WIDTH 1
+#define TSM 128                // The tile-size in dimension M
+#define TSN 128                // The tile-size in dimension N
+#define TSK 16                 // The tile-size in dimension K
+#define WPTM 8                 // The work-per-thread in dimension M
+#define WPTN 8                 // The work-per-thread in dimension N
+#define RTSM (TSM/WPTM)        // The reduced tile-size in dimension M
+#define RTSN (TSN/WPTN)        // The reduced tile-size in dimension N
+#define LPTA ((TSK*TSM)/(RTSM*RTSN)) // Loads-per-thread for A
+#define LPTB ((TSK*TSN)/(RTSM*RTSN)) // Loads-per-thread for B
+__kernel void matmul(const int M, const int N, const int K,
+                      const __global double* A,
+                      const __global double* B,
+                      __global float* C) {
+
+    // Thread identifiers
+    const int tidm = get_local_id(0); // Local row ID (max: TSM/WPTM)
+    const int tidn = get_local_id(1); // Local col ID (max: TSN/WPTN)
+    const int offsetM = TSM*get_group_id(0); // Work-group offset
+    const int offsetN = TSN*get_group_id(1); // Work-group offset
+
+    // Local memory to fit a tile of A and B
+    __local float Asub[TSK][TSM];
+    __local float Bsub[TSK][TSN];
+
+    // Allocate register space
+    float Areg;
+    float Breg[WPTN];
+    float acc[WPTM][WPTN];
+
+    // Initialise the accumulation registers
+    for (int wm=0; wm<WPTM; wm++) {
+        for (int wn=0; wn<WPTN; wn++) {
+            acc[wm][wn] = 0.0f;
+        }
+    }
+
+    // Loop over all tiles
+    int numTiles = K/TSK;
+    for (int t=0; t<numTiles; t++) {
+
+        // Load one tile of A and B into local memory
+        for (int la=0; la<LPTA/WIDTH; la++) {
+            int tid = tidn*RTSM + tidm;
+            int id = la*RTSN*RTSM + tid;
+            int row = id % (TSM/WIDTH);
+            int col = id / (TSM/WIDTH);
+
+            // Load the values (wide vector load)
+            int tiledIndex = TSK*t + col;
+            double vecA = A[tiledIndex*(M/WIDTH) + offsetM/WIDTH + row];
+            double vecB = B[tiledIndex*(N/WIDTH) + offsetN/WIDTH + row];
+
+            // Store the loaded vectors into local memory
+            #if WIDTH == 1
+                Asub[col][row] = vecA;
+                Asub[col][row] = vecA;
+            #elif WIDTH == 2
+                Asub[col][WIDTH*row + 0] = vecA.x;
+                Asub[col][WIDTH*row + 1] = vecA.y;
+            #elif WIDTH == 4
+                Asub[col][WIDTH*row + 0] = vecA.x;
+                Asub[col][WIDTH*row + 1] = vecA.y;
+                Asub[col][WIDTH*row + 2] = vecA.z;
+                Asub[col][WIDTH*row + 3] = vecA.w;
+            #endif
+            #if WIDTH == 1
+                Bsub[col][row] = vecB;
+                Bsub[col][row] = vecB;
+            #elif WIDTH == 2
+                Bsub[col][WIDTH*row + 0] = vecB.x;
+                Bsub[col][WIDTH*row + 1] = vecB.y;
+            #elif WIDTH == 4
+                Bsub[col][WIDTH*row + 0] = vecB.x;
+                Bsub[col][WIDTH*row + 1] = vecB.y;
+                Bsub[col][WIDTH*row + 2] = vecB.z;
+                Bsub[col][WIDTH*row + 3] = vecB.w;
+            #endif
+        }
+
+        // Synchronise to make sure the tile is loaded
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Loop over the values of a single tile
+        for (int k=0; k<TSK; k++) {
+
+            // Cache the values of Bsub in registers
+            for (int wn=0; wn<WPTN; wn++) {
+                int col = tidn + wn*RTSN;
+                Breg[wn] = Bsub[k][col];
+            }
+
+            // Perform the computation
+            for (int wm=0; wm<WPTM; wm++) {
+                int row = tidm + wm*RTSM;
+                Areg = Asub[k][row];
+                for (int wn=0; wn<WPTN; wn++) {
+                    acc[wm][wn] += Areg * Breg[wn];
+                }
+            }
+        }
+
+        // Synchronise before loading the next tile
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Store the final results in C
+    for (int wm=0; wm<WPTM; wm++) {
+        int globalRow = offsetM + tidm + wm*RTSM;
+        for (int wn=0; wn<WPTN; wn++) {
+            int globalCol = offsetN + tidn + wn*RTSN;
+            C[globalCol*M + globalRow] = acc[wm][wn];
+        }
+    }
+}
diff --git a/Matrix.matmul.naive.cl b/Matrix.matmul.naive.cl
@@ -0,0 +1,23 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+__kernel void matmul(
+  __global double* A,
+  __global double* B,
+  __global double* C,
+  int rowsA,
+  int colsA,
+  int colsB
+  )
+{
+  int row = get_global_id(0);
+  int col = get_global_id(1);
+
+  double sum = 0.0;
+
+  for(int k = 0; k < colsA; ++k) {
+    sum += A[row * colsA + k] * B[k * colsB + col];
+    //sum += col;
+  }
+
+  C[row * colsB + col] = sum;
+}
diff --git a/Matrix.matmul.test.cl b/Matrix.matmul.test.cl
@@ -0,0 +1,19 @@
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+__kernel void matmul(const int M, const int N, const int K,
+                      const __global double* A,
+                      const __global double* B,
+                      __global double* C) {
+
+    // Thread identifiers
+    //const int globalRow = get_global_id(0); // Row ID of C (0..M)
+    //const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+    // Compute a single element (loop over K)
+    //float acc = 0.0f;
+    //for (int k=0; k<K; k++) {
+        //acc += A[k*M + globalRow] * B[globalCol*K + k];
+    //}
+
+    // Store the result
+    //C[globalCol*M + globalRow] = acc;
+}
diff --git a/Matrix.mqh b/Matrix.mqh
@@ -35,6 +35,10 @@
 #include "Math.h"
 #include "OpenCL.h"
 
+#resource "Matrix.matmul.cl" as string CLSource_Matrix_MatMul
+#resource "Matrix.matmul.naive.cl" as string CLSource_Matrix_MatMul_Naive
+#resource "Matrix.matmul.test.cl" as string CLSource_Matrix_MatMul_Test
+
 #define MATRIX_DIMENSIONS 6
 #define MATRIX_VALUES_ARRAY_INCREMENT 500
 
@@ -746,9 +750,9 @@ class MatrixOpenCLBuffer : public Dynamic {
   /**
    * Constructor.
    */
-  MatrixOpenCLBuffer(int _size) {
+  MatrixOpenCLBuffer(int _size, unsigned int _flags) {
     version = 0;
-    buffer = OpenCL::Alloc(_size);
+    buffer = OpenCL::Alloc(_size, _flags);
     ArrayResize(data, _size);
   }
 
@@ -871,18 +875,7 @@ class Matrix {
       return;
     }
 
-    cl_program_matmul = OpenCL::Compile(
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" NL "__kernel void matmul(" NL "	const int Mdim," NL
-        "	const int Ndim," NL "	const int Pdim," NL "	__global double* A," NL "	__global double* B," NL
-        "	__global double* C," NL "	__local double* Bwrk" NL ")" NL "{" NL "	int k, j;" NL
-        "	int i = get_global_id(0);" NL "	int iloc = get_local_id(0);" NL
-        "	int nloc = get_local_size(0);" NL "	double Awrk[1000];" NL "	double tmp;" NL "" NL
-        "	for (k = 0; k < Pdim; k++)" NL "		Awrk[k] = A[i * Ndim + k];" NL "" NL
-        "	for (j = 0; j < Mdim; j++)" NL "	{" NL "		for (k = iloc; k < Pdim; k = k + nloc)" NL
-        "			Bwrk[k] = B[k * Pdim + j];" NL "" NL "		barrier(CLK_LOCAL_MEM_FENCE);" NL "" NL
-        "		tmp = 0.0f;" NL "" NL "		for (k = 0; k < Pdim; k++)" NL
-        "			tmp += Awrk[k] * Bwrk[k];" NL "		C[i * Ndim + j] += tmp;" NL "	}" NL "}" NL,
-        "matmul");
+    cl_program_matmul = OpenCL::Compile(CLSource_Matrix_MatMul_Naive, "matmul");
 
     cl_program_matmul_single = OpenCL::Compile(
         "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" NL "__kernel void matmul(" NL "const int Mdim," NL
@@ -904,7 +897,7 @@ class Matrix {
     _buffer = cl_buffers_in_0.GetByKey(_size, _buffer);
 
     if (!_buffer.IsSet()) {
-      _buffer = new MatrixOpenCLBuffer<X>(_size);
+      _buffer = new MatrixOpenCLBuffer<X>(_size, CL_MEM_READ_ONLY);
       cl_buffers_in_0.Set(_size, _buffer);
     }
 
@@ -920,7 +913,7 @@ class Matrix {
     _buffer = cl_buffers_in_1.GetByKey(_size, _buffer);
 
     if (!_buffer.IsSet()) {
-      _buffer = new MatrixOpenCLBuffer<X>(_size);
+      _buffer = new MatrixOpenCLBuffer<X>(_size, CL_MEM_READ_ONLY);
       cl_buffers_in_1.Set(_size, _buffer);
     }
 
@@ -936,7 +929,7 @@ class Matrix {
     _buffer = cl_buffers_out.GetByKey(_size, _buffer);
 
     if (!_buffer.IsSet()) {
-      _buffer = new MatrixOpenCLBuffer<X>(_size);
+      _buffer = new MatrixOpenCLBuffer<X>(_size, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
       cl_buffers_out.Set(_size, _buffer);
     }
 
@@ -1419,7 +1412,7 @@ class Matrix {
   static void MatMul(Matrix<X>& source, Matrix<X>& target, Matrix<X>& output) {
 #ifdef MATRIX_USE_OPENCL
     // MatMulCL(source, target, output);
-    MatMulCLSingle(source, target, output);
+    MatMulCL(source, target, output);
     return;
 #endif
 
@@ -1445,48 +1438,56 @@ class Matrix {
    * method.
    */
   static void MatMulCL(Matrix<X>& _source, Matrix<X>& _target, Matrix<X>& _output) {
-    if (_source.GetSize() != _target.GetRange(1)) {
+    if (_source.GetRange(1) != _target.GetRange(0)) {
       Alert("Inconsistent size of matrices!");
     }
 
-    unsigned int _m_dim = _source.GetRange(0);
-    unsigned int _n_dim = _source.GetRange(1);
-    unsigned int _p_dim = _target.GetRange(1);
+    unsigned int _rows_a = _source.GetRange(0);
+    unsigned int _cols_a = _source.GetRange(1);
+    unsigned int _cols_b = _target.GetRange(1);
 
-    OpenCLBuffer* _in_1 = GetCLBufferInArg0(_source.GetSize()) PTR_DEREF GetBuffer();
-    OpenCLBuffer* _in_2 = GetCLBufferInArg1(_target.GetSize()) PTR_DEREF GetBuffer();
-    OpenCLBuffer* _out = GetCLBufferOutArg(_source.GetRange(0)) PTR_DEREF GetBuffer();
+    OpenCLBuffer* _in_1 = GetCLBufferInArg0(_rows_a * _cols_a) PTR_DEREF GetBuffer();
+    OpenCLBuffer* _in_2 = GetCLBufferInArg1(_cols_a * _cols_b) PTR_DEREF GetBuffer();
+    OpenCLBuffer* _out = GetCLBufferOutArg(_rows_a * _cols_b) PTR_DEREF GetBuffer();
 
     double _in_1_data[];
     double _in_2_data[];
     double _out_data[];
 
+    // ArrayResize(_out_data, _out PTR_DEREF GetSizeItems());
+
     _source.GetRawArray(_in_1_data);
     _target.GetRawArray(_in_2_data);
 
-    cl_program_matmul REF_DEREF SetArg(0, _m_dim);
-    cl_program_matmul REF_DEREF SetArg(1, _n_dim);
-    cl_program_matmul REF_DEREF SetArg(2, _p_dim);
-    cl_program_matmul REF_DEREF SetArg(3, _in_1);
-    cl_program_matmul REF_DEREF SetArg(4, _in_2);
-    cl_program_matmul REF_DEREF SetArg(5, _out);
-    cl_program_matmul REF_DEREF SetArgLocalMem(6, _p_dim * sizeof(X));
+    _in_1 PTR_DEREF Write(_in_1_data);
+    _in_2 PTR_DEREF Write(_in_2_data);
+
+    cl_program_matmul REF_DEREF SetArg(0, _in_1);
+    cl_program_matmul REF_DEREF SetArg(1, _in_2);
+    cl_program_matmul REF_DEREF SetArg(2, _out);
+    cl_program_matmul REF_DEREF SetArg(3, (int)_rows_a);
+    cl_program_matmul REF_DEREF SetArg(4, (int)_cols_a);
+    cl_program_matmul REF_DEREF SetArg(5, (int)_cols_b);
+
     ARRAY(unsigned int, _global_work_offset);
     ARRAY(unsigned int, _global_work_size);
     ARRAY(unsigned int, _local_work_size);
+
     ArrayPush(_global_work_offset, 0U);
-    ArrayPush(_global_work_size, _n_dim);
-    ArrayPush(_global_work_size, _m_dim);
-    ArrayPush(_local_work_size, _m_dim);
+    ArrayPush(_global_work_offset, 0U);
+    ArrayPush(_global_work_size, (unsigned int)_rows_a);
+    ArrayPush(_global_work_size, (unsigned int)_cols_b);
+    ArrayPush(_local_work_size, 1U);
+    ArrayPush(_local_work_size, 1U);
 
     if (!cl_program_matmul REF_DEREF RunMany(2, _global_work_offset, _global_work_size, _local_work_size)) {
-      Alert("Errpr: Could not run Matrix::MatMulCL() over OpenCL!");
+      Alert("Error: Could not run Matrix::MatMulCL() over OpenCL!");
       DebugBreak();
     }
 
     _out PTR_DEREF Read(_out_data);
 
-    // _output.SetShape(num_outputs);
+    _output.SetShape(_rows_a, _cols_b);
   }
 
   /**
@@ -1498,13 +1499,13 @@ class Matrix {
       Alert("Inconsistent size of matrices!");
     }
 
-    unsigned int _m_dim = _source.GetRange(0);
-    unsigned int _n_dim = _source.GetRange(1);
-    unsigned int _p_dim = _target.GetRange(1);
+    unsigned int _cols_a = _source.GetRange(0);
+    unsigned int _rows_a = _source.GetRange(1);
+    unsigned int _cols_b = _target.GetRange(1);
 
-    OpenCLBuffer* _in_1 = GetCLBufferInArg0(_n_dim * _p_dim) PTR_DEREF GetBuffer();
-    OpenCLBuffer* _in_2 = GetCLBufferInArg1(_p_dim * _m_dim) PTR_DEREF GetBuffer();
-    OpenCLBuffer* _out = GetCLBufferOutArg(_m_dim * _p_dim) PTR_DEREF GetBuffer();
+    OpenCLBuffer* _in_1 = GetCLBufferInArg0(_rows_a * _cols_b) PTR_DEREF GetBuffer();
+    OpenCLBuffer* _in_2 = GetCLBufferInArg1(_cols_b * _cols_a) PTR_DEREF GetBuffer();
+    OpenCLBuffer* _out = GetCLBufferOutArg(_cols_a * _cols_b) PTR_DEREF GetBuffer();
 
     double _in_1_data[];
     double _in_2_data[];
@@ -1515,16 +1516,16 @@ class Matrix {
     _source.GetRawArray(_in_1_data);
     _target.GetRawArray(_in_2_data);
 
-    MatMulCL_CPU(_m_dim, _n_dim, _p_dim, _in_1_data, _in_2_data, _out_data);
+    MatMulCL_CPU(_cols_a, _rows_a, _cols_b, _in_1_data, _in_2_data, _out_data);
 
-    cl_program_matmul REF_DEREF SetArg(0, _n_dim);
-    cl_program_matmul REF_DEREF SetArg(1, _m_dim);
-    cl_program_matmul REF_DEREF SetArg(2, _p_dim);
+    cl_program_matmul REF_DEREF SetArg(0, (int)_rows_a);
+    cl_program_matmul REF_DEREF SetArg(1, (int)_cols_a);
+    cl_program_matmul REF_DEREF SetArg(2, (int)_cols_b);
     cl_program_matmul REF_DEREF SetArg(3, _in_1);
     cl_program_matmul REF_DEREF SetArg(4, _in_2);
     cl_program_matmul REF_DEREF SetArg(5, _out);
 
-    if (!cl_program_matmul_single REF_DEREF Run()) {
+    if (!cl_program_matmul REF_DEREF Run()) {
       Alert("Errpr: Could not run Matrix::MatMulCL() over OpenCL!");
       DebugBreak();
     }

diff --git a/OpenCL.h b/OpenCL.h
@@ -61,6 +61,7 @@ class OpenCLBuffer : public Dynamic {
       DebugBreak();
       return;
     }
+    ArrayResize(_arr, buffer_size);
     CLBufferRead(buffer_handle, _arr);
   }
 
@@ -147,6 +148,7 @@ class OpenCLProgram : public Dynamic {
    * your version isn't greater that the one already set in the buffer.
    */
   void SetArg(int _index, double value) { CLSetKernelArg(kernel_handle, _index, value); }
+  void SetArg(int _index, int value) { CLSetKernelArg(kernel_handle, _index, value); }
 
   /**
    * Passes argument to the kernel. Will not set kernel argument if not needed.
@@ -479,7 +481,7 @@ class OpenCL {
   /**
    * Allocates memory to be later passed to OpenCLProgram.
    */
-  static OpenCLBuffer* Alloc(int _size) { return new OpenCLBuffer(_size); }
+  static OpenCLBuffer* Alloc(int _size, unsigned int _flags) { return new OpenCLBuffer(_size, _flags); }
 
   /**
    * Compiles given program and returns its id or -1 in case of error.

diff --git a/tests/OpenCLTest.mq5 b/tests/OpenCLTest.mq5
@@ -40,7 +40,7 @@ int OnInit() {
 
   double result[] = {0};
 
-  Ref<OpenCLBuffer> buffer = OpenCL::Alloc(1 /* 1 double */);
+  Ref<OpenCLBuffer> buffer = OpenCL::Alloc(1 /* 1 double */, CL_MEM_READ_WRITE);
 
   if (!program REF_DEREF Run(buffer.Ptr())) {
     Alert("Error running program!");