#include #include #include #include #include #include #include //The OpenCL C++ bindings, with exceptions #define __CL_ENABLE_EXCEPTIONS #ifdef __APPLE__ #include "cl_ver_1_1.hpp" #else #include #endif //This is adjusted before kernel execution cl_uint GLOBAL_SIZE_0 = 0; cl_uint NUM_GROUPS_0 = 0; #ifdef __APPLE__ const cl_uint A_ROWS = 1; // Don't change this const cl_uint A_COLS = 1000; const cl_uint B_COLS = 100; const cl_uint WORK_GROUP_SIZE_0 = 16; #else const cl_uint A_ROWS = 1; // Don't change this const cl_uint A_COLS = 1000; const cl_uint B_COLS = 100; const cl_uint WORK_GROUP_SIZE_0 = 128; #endif const cl_uint B_ROWS = A_COLS; const cl_uint C_ROWS = A_ROWS; const cl_uint C_COLS = B_COLS; const cl_uint REPEAT_TESTS = 10; float unity() { return 1.0; }; class cycle { private: int i; public: cycle() : i(0) {} float operator()() { return float(++i % 1000); } }; double gpuExecutionTime(cl::Event &event) { cl_ulong end = event.getProfilingInfo(); cl_ulong start = event.getProfilingInfo(); return (double)1.0e-9 * (end - start); } void cpu_vecmatmult(const std::vector& A, const std::vector& B, std::vector& C, cl_uint Arows, cl_uint Acols, cl_uint Bcols) { const cl_uint Brows = Acols; for(cl_uint j = 0; j < Bcols; ++j) { for(cl_uint i = 0; i < Arows; ++i) { C[Arows*j + i] = 0.0; for(cl_uint k = 0; k < Brows; ++k) { C[Arows*j + i] += A[Arows*k + i] * B[Brows*j + k]; } } } } void testKernel(cl::Kernel& kernel, cl::Buffer& ABuffer, cl::Buffer& BBuffer, cl::Buffer& CBuffer, const std::vector& CCPU, cl::CommandQueue& cmdQ) { /*************** Running on the graphics card ***************/ kernel.setArg(0, ABuffer); kernel.setArg(1, BBuffer); kernel.setArg(2, CBuffer); kernel.setArg(3, A_COLS); kernel.setArg(4, B_COLS); kernel.setArg(5, sizeof(cl_float)*WORK_GROUP_SIZE_0, NULL); cl::KernelFunctor func = kernel.bind(cmdQ, cl::NDRange(GLOBAL_SIZE_0), cl::NDRange(WORK_GROUP_SIZE_0)); //Run the kernel multiple times cl::Event events[REPEAT_TESTS]; for (cl_uint i(0); i < REPEAT_TESTS; ++i) events[i] = func(); cmdQ.finish(); /**************** Checking the output data *****************/ //Request a blocking copy of the data from the graphics card std::vector CGPU(A_ROWS*B_COLS); cmdQ.enqueueReadBuffer(CBuffer, true, 0, sizeof(cl_float) * A_ROWS*B_COLS, &CGPU[0]); //Figure out an average time for the execution double sumTimings = 0; for (cl_uint i(0); i < REPEAT_TESTS; ++i) sumTimings += gpuExecutionTime(events[i]); sumTimings /= REPEAT_TESTS; size_t different(0); size_t veryDifferent(0); for (size_t i(0); i < A_ROWS*B_COLS; ++i) { //std::cerr << CCPU[i] << ' ' << CGPU[i] << '\n'; if (CCPU[i] != CGPU[i]) { ++different; if (std::fabs(CCPU[i] - CGPU[i]) > 0.00001 * CCPU[i]) { if (veryDifferent == 0) std::cerr << "row col CCPU CGPU" << '\n'; std::cerr << i%A_ROWS << ' ' << i/A_ROWS << ' ' << CCPU[i] << ' ' << CGPU[i] << '\n'; ++veryDifferent; } } } std::cout << "\nFinished kernel execution\n Average kernel execution time " << sumTimings << " sec" << "\n Found " << different << " different entries"; if (different) std::cout << "\n " << veryDifferent << " entries more than 0.001%"; } int main(int argc, char** argp, char** envp) { try { /*************** OpenCL Initialisation ***************/ //Open a context to run the openCL kernel in #ifdef __APPLE__ cl::Context context(CL_DEVICE_TYPE_GPU); #else std::cout << '\n'; std::vector platformList; cl::Platform::get(&platformList); std::string platformName; platformList[0].getInfo((cl_platform_info)CL_PLATFORM_NAME,&platformName); std::cout << "Platform is " << platformName << '\n'; std::vector deviceList; cl_device_type deviceGPU = CL_DEVICE_TYPE_GPU; platformList[0].getDevices(deviceGPU, &deviceList); std::vector::const_iterator dlitr; std::cout << "Devices available are:"; for (dlitr=deviceList.begin(); dlitr!=deviceList.end(); ++dlitr) { std::string deviceName; dlitr->getInfo((cl_device_info)CL_DEVICE_NAME, &deviceName); std::cout << "\n " << deviceName; } std::cout << '\n'; std::string deviceName; deviceList[0].getInfo((cl_device_info)CL_DEVICE_NAME,&deviceName); std::vector deviceSelected; deviceSelected.push_back(deviceList[0]); cl::Context context(deviceSelected); #endif //Gather all the kernel sources for the OpenCL program cl::Program::Sources source; //Load the file which has the OpenCL kernel source code std::ifstream inputFile("vecmatmult.cl"); if (!inputFile.is_open()) { std::cerr << "\nCould not open the kernel source file\n"; return -1; } source.push_back(std::pair(NULL,0)); inputFile.seekg(0,std::ios::end); source.back().second = inputFile.tellg(); inputFile.seekg(0, std::ios::beg); { char* tmp = new char[source.back().second]; inputFile.read(tmp, source.back().second); source.back().first = tmp; } //Make an OpenCL program cl::Program program(context, source); //Get all the available devices in the context std::vector devices = context.getInfo(); //Build the kernel sources for all devices in the context try { program.build(devices); } catch (cl::Error& err) { //Get the build log for the first device std::cerr << "Building failed, " << err.what() << "("<< err.err() <<")" << "\nRetrieving build log\n" << program.getBuildInfo(devices[0]) << "\n"; return -1; } std::cout << '\n'; std::cout << *argp << " executes kernel vecmatmult which computes c = a * B\n" << "where B is stored in column major form and a and c " << "are vectors"; std::cout << '\n'; std::cout << "\nUsing device " << devices[0].getInfo(); std::cout << '\n'; //Make a queue to put jobs on the first compute device cl::CommandQueue cmdQ(context, devices[0], CL_QUEUE_PROFILING_ENABLE); /*************** Preparing the data buffers ***************/ //Create a random matrix and vector std::vector A; //std::generate_n(std::back_inserter(A), A_ROWS * A_COLS, unity); //std::generate_n(std::back_inserter(A), A_ROWS * A_COLS, rand); std::generate_n(std::back_inserter(A), A_ROWS * A_COLS, cycle()); std::vector B; //std::generate_n(std::back_inserter(B), B_ROWS * B_COLS, unity); //std::generate_n(std::back_inserter(B), B_ROWS * B_COLS, rand); std::generate_n(std::back_inserter(B), B_ROWS * B_COLS, cycle()); //Start copying this data to the graphics card cl::Buffer ABuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * A.size(), &A[0]); //Start copying this data to the graphics card cl::Buffer BBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * B.size(), &B[0]); //Make a buffer to hold the output of the kernel cl::Buffer CBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * C_ROWS * C_COLS); cmdQ.finish(); /*************** Perform the CPU calculation ***************/ std::vector CCPU(C_ROWS*C_COLS); //Run and time the run #if defined __APPLE__ timeval startTime, endTime; gettimeofday(&startTime,NULL); #else timespec startTime, endTime; clock_gettime(CLOCK_MONOTONIC, &startTime); #endif cpu_vecmatmult(A, B, CCPU, A_ROWS, A_COLS, B_COLS); #if defined __APPLE__ gettimeofday(&endTime,NULL); double CPUTime = double(endTime.tv_sec) - double(startTime.tv_sec) + 1e-6 * (double(endTime.tv_usec) - double(startTime.tv_usec)); #else clock_gettime(CLOCK_MONOTONIC, &endTime); double CPUTime = double(endTime.tv_sec) - double(startTime.tv_sec) + 1e-9 * (double(endTime.tv_nsec) - double(startTime.tv_nsec)); #endif std::cout << "\nCPU took " << CPUTime << " sec"; cl::Kernel kernel; NUM_GROUPS_0 = (B_COLS + WORK_GROUP_SIZE_0 - 1) / WORK_GROUP_SIZE_0; if (NUM_GROUPS_0 > 60) NUM_GROUPS_0 = 60; GLOBAL_SIZE_0 = WORK_GROUP_SIZE_0*NUM_GROUPS_0; std::cout << "\n\nTesting vecmatmult" << "\n A_ROWS = " << A_ROWS << "\n A_COLS = " << A_COLS << "\n B_ROWS = " << A_COLS << "\n B_COLS = " << B_COLS << "\n C_ROWS = " << C_ROWS << "\n C_COLS = " << C_COLS /* << "\n WorkGroupSize_0 = " << WORK_GROUP_SIZE_0 << "\n NumGroups_0 = " << NUM_GROUPS_0 << "\n GlobalSize_0 = " << GLOBAL_SIZE_0 << std::endl; */ << "\n get_local_size(0) = " << WORK_GROUP_SIZE_0 << "\n get_num_groups(0) = " << NUM_GROUPS_0 << "\n get_global_size(0) = " << GLOBAL_SIZE_0 << std::endl; kernel = cl::Kernel(program, "vecmatmult"); testKernel(kernel, ABuffer, BBuffer, CBuffer, CCPU, cmdQ); std::cout << "\n"; std::cout << "\n"; return 0; } catch (cl::Error& err) { std::cerr << "An OpenCL error occured, " << err.what() << "\nError num of " << err.err() << "\n"; return -1; } }