#include <sys/time.h>
#include <time.h>

#include <algorithm>
#include <cmath>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <utility>
#include <vector>

//The OpenCL C++ bindings, with exceptions
#define __CL_ENABLE_EXCEPTIONS
#ifdef __APPLE__
  #include "cl_ver_1_1.hpp"
#else
  #include <CL/cl.hpp>
#endif

//Launch geometry for dimension 0 of the kernel's NDRange.
//Both start at 0 and are assigned in main() from B_COLS and
//WORK_GROUP_SIZE_0 before testKernel() is called.
//GLOBAL_SIZE_0 is the global size handed to cl::NDRange in testKernel();
//NUM_GROUPS_0 is the work-group count it is computed from.
cl_uint GLOBAL_SIZE_0 = 0;
cl_uint NUM_GROUPS_0 = 0;

//Problem dimensions for c = a * B (identical on every platform)
const cl_uint A_ROWS = 1;      // Don't change this: the program treats a as a vector
const cl_uint A_COLS = 1000;
const cl_uint B_COLS = 100;

//The work-group size is the only platform-dependent setting
#ifdef __APPLE__
  const cl_uint WORK_GROUP_SIZE_0 = 16;
#else
  const cl_uint WORK_GROUP_SIZE_0 = 128;
#endif

//Shapes implied by the product
const cl_uint B_ROWS = A_COLS;
const cl_uint C_ROWS = A_ROWS;
const cl_uint C_COLS = B_COLS;

//Number of kernel launches whose execution times are averaged
const cl_uint REPEAT_TESTS = 10;

//Generator that always yields 1.0f. In the visible code it is referenced
//only by the commented-out fill alternatives in main().
float unity()
{
  return 1.0f;
}

//Generator functor producing the repeating sequence 1, 2, ..., 999, 0, 1, ...
//Every instance starts from its own zeroed counter, so two freshly
//constructed generators yield the same values.
class cycle {
private:
  int i;   // last value returned; always kept within [0, 999]
public:
  cycle() : i(0) {}

  //Returns the next value of the sequence. The counter is wrapped as it is
  //advanced, so it stays bounded and can never overflow, however many
  //values are drawn.
  float operator()()
  {
    i = (i + 1) % 1000;
    return float(i);
  }
};

//Returns the device execution time of the command behind `event`, in
//seconds: the END and START profiling counters are subtracted and scaled
//by 1e-9 (OpenCL reports these counters in nanoseconds).
//The queue the command ran on must have profiling enabled (main() creates
//it with CL_QUEUE_PROFILING_ENABLE).
double gpuExecutionTime(cl::Event &event)
{
  const cl_ulong finishedNs =
    event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
  const cl_ulong startedNs =
    event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
  return 1.0e-9 * static_cast<double>(finishedNs - startedNs);
}

void cpu_vecmatmult(const std::vector<cl_float>& A,
                 const std::vector<cl_float>& B,
                 std::vector<cl_float>& C,
                 cl_uint Arows, 
                 cl_uint Acols, 
                 cl_uint Bcols)
{
   const cl_uint Brows = Acols;
   for(cl_uint j = 0; j < Bcols; ++j) {
   for(cl_uint i = 0; i < Arows; ++i) {
     C[Arows*j + i] = 0.0;
     for(cl_uint k = 0; k < Brows; ++k) {
       C[Arows*j + i] += A[Arows*k + i] * B[Brows*j + k];
     }  
   }
   }
}

//Runs `kernel` REPEAT_TESTS times on `cmdQ`, reads the result back from
//CBuffer, compares it entry by entry with the CPU reference and prints the
//average kernel execution time together with the mismatch counts.
//  kernel    kernel to launch; its six arguments are set here
//  ABuffer   device buffer bound to argument 0
//  BBuffer   device buffer bound to argument 1
//  CBuffer   device buffer bound to argument 2 and read back afterwards
//  CCPU      reference result; must hold at least A_ROWS*B_COLS entries
//  cmdQ      queue used for the launches and the read-back; the timing
//            relies on the profiling info of the launch events
//The launch geometry comes from the globals GLOBAL_SIZE_0 and
//WORK_GROUP_SIZE_0. OpenCL failures surface as cl::Error exceptions.
void testKernel(cl::Kernel& kernel,
                cl::Buffer& ABuffer,
                cl::Buffer& BBuffer,
                cl::Buffer& CBuffer,
                const std::vector<cl_float>& CCPU,
                cl::CommandQueue& cmdQ)
{
  /***************  Running on the graphics card  ***************/

  kernel.setArg(0, ABuffer);
  kernel.setArg(1, BBuffer);
  kernel.setArg(2, CBuffer);
  kernel.setArg(3, A_COLS);
  kernel.setArg(4, B_COLS);
  //Size-only argument (no host data): one cl_float per work item of a
  //work-group -- presumably the kernel's local scratch buffer; confirm
  //against vecmatmult.cl
  kernel.setArg(5, sizeof(cl_float)*WORK_GROUP_SIZE_0, NULL);

  cl::KernelFunctor func = kernel.bind(cmdQ,
    cl::NDRange(GLOBAL_SIZE_0),
    cl::NDRange(WORK_GROUP_SIZE_0));

  //Run the kernel multiple times, keeping one event per launch for timing
  cl::Event events[REPEAT_TESTS];

  for (cl_uint i(0); i < REPEAT_TESTS; ++i)
    events[i] = func();

  cmdQ.finish();

  /****************  Checking the output data   *****************/

  const size_t numEntries = A_ROWS*B_COLS;

  //Request a blocking copy of the data from the graphics card
  std::vector<cl_float> CGPU(numEntries);
  cmdQ.enqueueReadBuffer(CBuffer, true, 0, sizeof(cl_float) * numEntries,
    &CGPU[0]);

  //Figure out an average time for the execution
  double sumTimings = 0;
  for (cl_uint i(0); i < REPEAT_TESTS; ++i)
    sumTimings += gpuExecutionTime(events[i]);
  sumTimings /= REPEAT_TESTS;

  size_t different(0);      // entries that are not bit-for-bit equal
  size_t veryDifferent(0);  // entries whose relative error exceeds 0.001%
  for (size_t i(0); i < numEntries; ++i) {
    if (CCPU[i] != CGPU[i])
      {
        ++different;

        //The tolerance is relative to the magnitude of the reference value,
        //so negative references get the same (positive) allowance as
        //positive ones.
        const cl_float absError = std::fabs(CCPU[i] - CGPU[i]);
        const double tolerance = 0.00001 * std::fabs(CCPU[i]);

        //Written as !(<=) rather than (>) so that a NaN error, e.g. from a
        //NaN produced on the GPU, is reported instead of silently passing.
        if (!(absError <= tolerance)) {
          if (veryDifferent == 0)  std::cerr << "row col CCPU CGPU" << '\n';
          std::cerr << i%A_ROWS << ' ' << i/A_ROWS
                    << ' ' << CCPU[i] << ' ' << CGPU[i] << '\n';
          ++veryDifferent;
        }
      }
  }

  std::cout << "\nFinished kernel execution\n  Average kernel execution time "
            << sumTimings << " sec"
            << "\n  Found " << different << " different entries";
  if (different)
    std::cout << "\n  " << veryDifferent << " entries more than 0.001%";
}

int main(int argc, char** argp, char** envp)
{
  try {
    /***************     OpenCL Initialisation      ***************/
    //Open a context to run the openCL kernel in
    #ifdef __APPLE__
      cl::Context context(CL_DEVICE_TYPE_GPU);
    #else
      std::cout << '\n';
      std::vector<cl::Platform> platformList;
      cl::Platform::get(&platformList);
      std::string platformName;
      platformList[0].getInfo((cl_platform_info)CL_PLATFORM_NAME,&platformName);
      std::cout << "Platform is " << platformName << '\n';
      std::vector<cl::Device> deviceList;
      cl_device_type deviceGPU = CL_DEVICE_TYPE_GPU;
      platformList[0].getDevices(deviceGPU, &deviceList);
      std::vector<cl::Device>::const_iterator dlitr;
      std::cout << "Devices available are:";
      for (dlitr=deviceList.begin(); dlitr!=deviceList.end(); ++dlitr) {
        std::string deviceName;
        dlitr->getInfo((cl_device_info)CL_DEVICE_NAME, &deviceName);
        std::cout << "\n  " << deviceName;
      }
      std::cout << '\n';
      std::string deviceName;
      deviceList[0].getInfo((cl_device_info)CL_DEVICE_NAME,&deviceName);
      std::vector<cl::Device> deviceSelected;
      deviceSelected.push_back(deviceList[0]);
      cl::Context context(deviceSelected);

    #endif

    //Gather all the kernel sources for the OpenCL program
    cl::Program::Sources source;

    //Load the file which has the OpenCL kernel source code
    std::ifstream inputFile("vecmatmult.cl");    
    if (!inputFile.is_open())
      {
        std::cerr << "\nCould not open the kernel source file\n";
        return -1;
      }
    
    source.push_back(std::pair<char *, size_t>(NULL,0));
    inputFile.seekg(0,std::ios::end);
    source.back().second = inputFile.tellg();
    inputFile.seekg(0, std::ios::beg);
    {
      char* tmp = new char[source.back().second];
      inputFile.read(tmp, source.back().second);
      source.back().first = tmp;
    }

    //Make an OpenCL program
    cl::Program program(context, source);
  
    //Get all the available devices in the context
    std::vector<cl::Device> devices 
      = context.getInfo<CL_CONTEXT_DEVICES>();

    //Build the kernel sources for all devices in the context
    try {
      program.build(devices);
    }
    catch (cl::Error& err)
      {
        //Get the build log for the first device
        std::cerr << "Building failed, " << err.what() << "("<< err.err() <<")" 
                  << "\nRetrieving build log\n" 
                  << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0])
                  << "\n";
        return -1;
      }

    std::cout << '\n';
    std::cout << *argp 
              << " executes kernel vecmatmult which computes c = a * B\n"
              << "where B is stored in column major form and a and c "
              << "are vectors";
    std::cout << '\n';



    std::cout << "\nUsing device " << devices[0].getInfo<CL_DEVICE_NAME>();
    std::cout << '\n';

    //Make a queue to put jobs on the first compute device
    cl::CommandQueue cmdQ(context, devices[0], CL_QUEUE_PROFILING_ENABLE);

    /***************   Preparing the data buffers   ***************/

    //Create a random matrix and vector
    std::vector<cl_float> A;
    //std::generate_n(std::back_inserter(A), A_ROWS * A_COLS, unity);
    //std::generate_n(std::back_inserter(A), A_ROWS * A_COLS, rand);
    std::generate_n(std::back_inserter(A), A_ROWS * A_COLS, cycle());

    std::vector<cl_float> B;
    //std::generate_n(std::back_inserter(B), B_ROWS * B_COLS, unity);
    //std::generate_n(std::back_inserter(B), B_ROWS * B_COLS, rand);
    std::generate_n(std::back_inserter(B), B_ROWS * B_COLS, cycle());

    //Start copying this data to the graphics card
    cl::Buffer ABuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
      sizeof(cl_float) * A.size(), &A[0]);
    //Start copying this data to the graphics card
    cl::Buffer BBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
      sizeof(cl_float) * B.size(), &B[0]);
  
    //Make a buffer to hold the output of the kernel
    cl::Buffer CBuffer(context, CL_MEM_WRITE_ONLY, 
       sizeof(cl_float) * C_ROWS * C_COLS);
  
    cmdQ.finish();

    /***************   Perform the CPU calculation  ***************/
    std::vector<cl_float> CCPU(C_ROWS*C_COLS);
    
    //Run and time the run
    #if defined __APPLE__
      timeval startTime, endTime;
      gettimeofday(&startTime,NULL);
    #else
      timespec startTime, endTime;
      clock_gettime(CLOCK_MONOTONIC, &startTime);
    #endif
    cpu_vecmatmult(A, B, CCPU, A_ROWS, A_COLS, B_COLS);
    #if defined __APPLE__
      gettimeofday(&endTime,NULL);
      double CPUTime = double(endTime.tv_sec) - double(startTime.tv_sec) 
        + 1e-6 * (double(endTime.tv_usec) - double(startTime.tv_usec));
    #else
      clock_gettime(CLOCK_MONOTONIC, &endTime);
      double CPUTime = double(endTime.tv_sec) - double(startTime.tv_sec) 
        + 1e-9 * (double(endTime.tv_nsec) - double(startTime.tv_nsec));
    #endif
    std::cout << "\nCPU took " << CPUTime << " sec";
    
    cl::Kernel kernel;

    NUM_GROUPS_0 = (B_COLS + WORK_GROUP_SIZE_0 - 1) / WORK_GROUP_SIZE_0;
    if (NUM_GROUPS_0 > 60)  NUM_GROUPS_0 = 60;

    GLOBAL_SIZE_0 = WORK_GROUP_SIZE_0*NUM_GROUPS_0;

    std::cout << "\n\nTesting vecmatmult"
              << "\n A_ROWS = " << A_ROWS
              << "\n A_COLS = " << A_COLS
              << "\n B_ROWS = " << A_COLS
              << "\n B_COLS = " << B_COLS
	      << "\n C_ROWS = " << C_ROWS
	      << "\n C_COLS = " << C_COLS
	      /*
              << "\n WorkGroupSize_0 = " << WORK_GROUP_SIZE_0 
              << "\n NumGroups_0     = " << NUM_GROUPS_0
              << "\n GlobalSize_0    = " << GLOBAL_SIZE_0 << std::endl;
	      */
              << "\n get_local_size(0)  = " << WORK_GROUP_SIZE_0 
              << "\n get_num_groups(0)  = " << NUM_GROUPS_0
              << "\n get_global_size(0) = " << GLOBAL_SIZE_0 << std::endl;

    kernel = cl::Kernel(program, "vecmatmult");

    testKernel(kernel, ABuffer, BBuffer, CBuffer, CCPU, cmdQ);
    
    std::cout << "\n";
    std::cout << "\n";
    return 0;
  }
  catch (cl::Error& err)
    {
      std::cerr << "An OpenCL error occured, " << err.what()
                << "\nError num of " << err.err() << "\n";
      return -1;
    }
}
