/*
 * Single-precision dense matrix product: C = A * B.
 *
 * Element (r, c) of each matrix is addressed as M[rows * c + r], i.e. the
 * buffers are indexed column by column:
 *   A is Arows x Acols, B is Acols x Bcols, C is Arows x Bcols.
 *
 * Work-item mapping: global id 0 selects the row i of C, global id 1 selects
 * the column j of C. Each work-item computes exactly one element C(i, j);
 * work-items that fall outside the matrix do nothing, so the global work size
 * may be larger than the matrix dimensions.
 *
 * NOTE(review): the dot product is accumulated in a private variable and
 * written to C once at the end. This assumes C does not overlap A or B --
 * confirm against the host code.
 */
__kernel void matmatmult(const __global float* A,
                         const __global float* B,
                         __global float* C,
                         uint Arows,  // = Crows
                         uint Acols,  // = Brows
                         uint Bcols)  // = Ccols
{
  const uint Brows = Acols;
  const uint Crows = Arows;
  const uint Ccols = Bcols;

  const uint i = get_global_id(0);  // row of C handled by this work-item
  const uint j = get_global_id(1);  // column of C handled by this work-item

  // Guard against a global work size that overshoots the matrix.
  if (i >= Crows || j >= Ccols) {
    return;
  }

  // Start of column j in B; invariant across the k loop.
  const uint b_col = Brows * j;

  // Accumulate in a private variable instead of read-modify-writing global
  // memory on every iteration; 0.0f keeps the initialiser single-precision.
  float acc = 0.0f;

  for (uint k = 0; k < Brows; ++k) {
    acc += A[Arows * k + i] * B[b_col + k];
  }

  // Single store of the finished element (also covers Brows == 0 -> 0.0f).
  C[Crows * j + i] = acc;
}


/* 

Argument list when KERNEL_NEEDS_LOCAL_WORKSPACE is true

__kernel void matmatmult(const __global float* A,
                         const __global float* B,
                         __global float* C,
                         uint Arows,  // = Crows
                         uint Acols,  // = Brows
                         uint Bcols,  // = Ccols
                         __local float* R)  // length = get_num_groups(0)

*/
