/*
Source:https://www.marcusbannerman.co.uk/index.php/research/teaching-resources
Modified: 12/22/11 by arg
Applies to matrixvecmult.cl as well
*/

#include <sys/time.h>
#include <time.h>

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

//The OpenCL C++ bindings, with exceptions
#define __CL_ENABLE_EXCEPTIONS
#ifdef __APPLE__
  #include "cl_ver_1_1.hpp"
#else
  #include <CL/cl.hpp>
#endif

//const cl_uint WORK_GROUP_SIZE = 512;
const cl_uint WORK_GROUP_SIZE = 64;

//This is adjusted before kernel execution
cl_uint GLOBAL_SIZE = 0;

#ifdef __APPLE__
  const cl_uint M_WIDTH = 110;
  const cl_uint M_HEIGHT = 100000;
#else
  const cl_uint M_WIDTH = 1100;
  const cl_uint M_HEIGHT = 100000;
#endif

const cl_uint REPEAT_TESTS = 10;

double gpuExecutionTime(cl::Event &event)
{
  cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
  cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
  return (double)1.0e-9 * (end - start);
}

void MatrixVectorMul(const cl_float* M,
		     const cl_float* V,
		     cl_float* W,
		     cl_uint width, cl_uint height)
{
  for (cl_uint y = 0; y < height; ++y)
    {
      const cl_float* row = M + y * width;

      cl_float dotProduct = 0;
      for (cl_uint x = 0; x < width; ++x)
	dotProduct += row[x] * V[x];

      W[y] = dotProduct;
    }
}

void testKernel(cl::Kernel& kernel, cl::Buffer& MBuffer, cl::Buffer& VBuffer, 
  cl::Buffer& WBuffer, const std::vector<cl_float>& WCPU, 
  cl::CommandQueue& cmdQ, bool needsLocalMem = false)	       
{
  /***************  Running on the graphics card  ***************/

  kernel.setArg(0, MBuffer);
  kernel.setArg(1, VBuffer);
  kernel.setArg(2, WBuffer);
  kernel.setArg(3, M_WIDTH);
  kernel.setArg(4, M_HEIGHT);
  if (needsLocalMem) kernel.setArg(5, sizeof(cl_float) * WORK_GROUP_SIZE, NULL);
  
  cl::KernelFunctor func = kernel.bind(cmdQ, cl::NDRange(GLOBAL_SIZE), 
    cl::NDRange(WORK_GROUP_SIZE));
  
  //Run the kernel multiple times
  cl::Event events[REPEAT_TESTS];

  for (cl_uint i(0); i < REPEAT_TESTS; ++i)
    events[i] = func();
    
  cmdQ.finish();

  /****************  Checking the output data   *****************/

  //Request a blocking copy of the data from the graphics card
  std::vector<cl_float> WGPU(M_HEIGHT);
  cmdQ.enqueueReadBuffer(WBuffer, true, 0, sizeof(cl_float) * M_HEIGHT, 
    &WGPU[0]);
  
  //Figure out an average time for the execution
  double sumTimings = 0;
  for (cl_uint i(0); i < REPEAT_TESTS; ++i)
    sumTimings += gpuExecutionTime(events[i]);
  sumTimings /= REPEAT_TESTS;

  size_t different(0);
  size_t veryDifferent(0);
  for (size_t i(0); i < M_HEIGHT; ++i)
    if (WCPU[i] != WGPU[i]) 
      {
	++different;
	if (std::fabs(WCPU[i] - WGPU[i]) > 0.00001 * WCPU[i])
	  ++veryDifferent;	  
      }

  std::cout << "\nFinished kernel execution\n  Average kernel execution time " 
	    << sumTimings
	    << "\n  Found " << different << " different entries";
  if (different)
    std::cout << "\n  " << veryDifferent << " entries more than 0.001%";
}

int main(int argc, char** argp, char** envp)
{
  try {
    /***************     OpenCL Initialisation      ***************/
    //Open a context to run the openCL kernel in
    #ifdef __APPLE__
      cl::Context context(CL_DEVICE_TYPE_GPU);
    #else
      std::vector<cl::Platform> platformList;
      cl::Platform::get(&platformList);
      std::vector<cl::Device> deviceList;
      cl_device_type deviceGPU = CL_DEVICE_TYPE_GPU;
      platformList[0].getDevices(deviceGPU, &deviceList);
      std::string deviceName;
      deviceList[0].getInfo((cl_device_info)CL_DEVICE_NAME, &deviceName);
      std::cout << deviceName << '\n';
      std::vector<cl::Device> deviceSelected;
      deviceSelected.push_back(deviceList[0]);
      cl::Context context(deviceSelected);
    #endif

    //Gather all the kernel sources for the OpenCL program
    cl::Program::Sources source;

    //Load the file which has the OpenCL kernel source code
    std::ifstream inputFile("matrixvecmult.cl");    
    if (!inputFile.is_open())
      {
	std::cerr << "\nCould not open the kernel source file\n";
	return -1;
      }
    
    source.push_back(std::pair<char *, size_t>(NULL,0));
    inputFile.seekg(0,std::ios::end);
    source.back().second = inputFile.tellg();
    inputFile.seekg(0, std::ios::beg);
    {
      char* tmp = new char[source.back().second];
      inputFile.read(tmp, source.back().second);
      source.back().first = tmp;
    }
  
    //Make an OpenCL program
    cl::Program program(context, source);
  
    //Get all the available devices in the context
    std::vector<cl::Device> devices 
      = context.getInfo<CL_CONTEXT_DEVICES>();

    //Build the kernel sources for all devices in the context
    try {
      program.build(devices);
    }
    catch (cl::Error& err)
      {
	//Get the build log for the first device
	std::cerr << "Building failed, " << err.what() << "("<< err.err() <<")" 
		  << "\nRetrieving build log\n"	
		  << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0])
		  << "\n";
	return -1;
      }

    std::cout << '\n';
    std::cout << *argp 
              << " executes kernels MatrixVectorMul1n which compute c = A * b\n"
              << "where A is stored in row major form and c and b are vectors";
    std::cout << '\n';

    std::cout << "\nUsing device " << devices[0].getInfo<CL_DEVICE_NAME>();

    //Make a queue to put jobs on the first compute device
    cl::CommandQueue cmdQ(context, devices[0], CL_QUEUE_PROFILING_ENABLE);

    /***************   Preparing the data buffers   ***************/

    //Create a random matrix and vector
    std::vector<cl_float> M;
    std::generate_n(std::back_inserter(M), M_WIDTH * M_HEIGHT, rand);

    std::vector<cl_float> V;
    std::generate_n(std::back_inserter(V), M_WIDTH, rand);

    //Start copying this data to the graphics card
    cl::Buffer MBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
      sizeof(cl_float) * M.size(), &M[0]);
    //Start copying this data to the graphics card
    cl::Buffer VBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
      sizeof(cl_float) * V.size(), &V[0]);
  
    //Make a buffer to hold the output of the kernel
    cl::Buffer WBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * M_HEIGHT);
  
    cmdQ.finish();

    /***************   Perform the CPU calculation  ***************/
    std::vector<cl_float> WCPU(M_HEIGHT);
    
    //Run and time the run
    #if defined __APPLE__
      timeval startTime, endTime;
      gettimeofday(&startTime,NULL);
    #else
      timespec startTime, endTime;
      clock_gettime(CLOCK_MONOTONIC, &startTime);
    #endif
    MatrixVectorMul(&M[0], &V[0], &WCPU[0], M_WIDTH, M_HEIGHT);
    #if defined __APPLE__
      gettimeofday(&endTime,NULL);
      double CPUTime = double(endTime.tv_sec) - double(startTime.tv_sec) 
        + 1e-6 * (double(endTime.tv_usec) - double(startTime.tv_usec));
    #else
      clock_gettime(CLOCK_MONOTONIC, &endTime);
      double CPUTime = double(endTime.tv_sec) - double(startTime.tv_sec) 
        + 1e-9 * (double(endTime.tv_nsec) - double(startTime.tv_nsec));
    #endif
    std::cout << "\nCPU took " << CPUTime << " s";
    
    cl::Kernel kernel;

    GLOBAL_SIZE 
     = WORK_GROUP_SIZE * ((M_HEIGHT + WORK_GROUP_SIZE - 1) / WORK_GROUP_SIZE);

    std::cout << "\n\nTesting MatrixVectorMul1"
	      << "\n WorkGroupSize = " << WORK_GROUP_SIZE 
	      << " GlobalSize " << GLOBAL_SIZE;
    kernel = cl::Kernel(program, "MatrixVectorMul1");
    testKernel(kernel, MBuffer, VBuffer, WBuffer, WCPU, cmdQ);
    
    GLOBAL_SIZE = WORK_GROUP_SIZE * 60;

    std::cout << "\n\nTesting MatrixVectorMul2"
	      << "\n WorkGroupSize = " << WORK_GROUP_SIZE 
	      << " GlobalSize " << GLOBAL_SIZE;
    kernel = cl::Kernel(program, "MatrixVectorMul2");
    testKernel(kernel, MBuffer, VBuffer, WBuffer, WCPU, cmdQ);

    std::cout << "\n\nTesting MatrixVectorMul3"
	      << "\n WorkGroupSize = " << WORK_GROUP_SIZE 
	      << " GlobalSize " << GLOBAL_SIZE;
    kernel = cl::Kernel(program, "MatrixVectorMul3");
    testKernel(kernel, MBuffer, VBuffer, WBuffer, WCPU, cmdQ, true);

    std::cout << "\n\nTesting MatrixVectorMul4"
	      << "\n WorkGroupSize = " << WORK_GROUP_SIZE 
	      << " GlobalSize " << GLOBAL_SIZE;
    kernel = cl::Kernel(program, "MatrixVectorMul4");
    testKernel(kernel, MBuffer, VBuffer, WBuffer, WCPU, cmdQ, true);

    std::cout << "\n\nTesting MatrixVectorMul5"
	      << "\n WorkGroupSize = " << WORK_GROUP_SIZE 
	      << " GlobalSize " << GLOBAL_SIZE;
    kernel = cl::Kernel(program, "MatrixVectorMul5");
    testKernel(kernel, MBuffer, VBuffer, WBuffer, WCPU, cmdQ, true);

    std::cout << "\n";
    return 0;
  }
  catch (cl::Error& err)
    {
      std::cerr << "An OpenCL error occured, " << err.what()
		<< "\nError num of " << err.err() << "\n";
      return -1;
    }
}
