__kernel void vecmatmult(const __global float* a,
                         const __global float* B,
                         __global float* c,
                         uint Brows,  // = acols
                         uint Bcols,  // = ccols
                         __local float* R)  // length = get_local_size(0)
{
  for (uint y = get_group_id(0); y < Bcols; y += get_num_groups(0)) {

    const __global float* col = B + y * Brows;
      
    float sum = 0.0f;

    for (uint x = get_local_id(0); x < Brows; x += get_local_size(0)) {
      sum += col[x] * a[x];
    }
      
    R[get_local_id(0)] = sum;
      
    for (uint stride = get_local_size(0)/2; stride > 0; stride /= 2) {

      barrier(CLK_LOCAL_MEM_FENCE);
          
      if (get_local_id(0) < stride) { 
        R[get_local_id(0)] += R[get_local_id(0) + stride];
      }
    }
      
    if (get_local_id(0) == 0) c[y] = R[0];
      
    barrier(CLK_LOCAL_MEM_FENCE);
  }
}
