/* Source:https://www.marcusbannerman.co.uk/index.php/research/teaching-resources Modified: 12/22/11 by arg Applies to matrixvecmult.cl as well */ #include #include #include #include #include #include #include //The OpenCL C++ bindings, with exceptions #define __CL_ENABLE_EXCEPTIONS #ifdef __APPLE__ #include "cl_ver_1_1.hpp" #else #include #endif //const cl_uint WORK_GROUP_SIZE = 512; const cl_uint WORK_GROUP_SIZE = 64; //This is adjusted before kernel execution cl_uint GLOBAL_SIZE = 0; #ifdef __APPLE__ const cl_uint M_WIDTH = 110; const cl_uint M_HEIGHT = 100000; #else const cl_uint M_WIDTH = 1100; const cl_uint M_HEIGHT = 100000; #endif const cl_uint REPEAT_TESTS = 10; double gpuExecutionTime(cl::Event &event) { cl_ulong end = event.getProfilingInfo(); cl_ulong start = event.getProfilingInfo(); return (double)1.0e-9 * (end - start); } void MatrixVectorMul(const cl_float* M, const cl_float* V, cl_float* W, cl_uint width, cl_uint height) { for (cl_uint y = 0; y < height; ++y) { const cl_float* row = M + y * width; cl_float dotProduct = 0; for (cl_uint x = 0; x < width; ++x) dotProduct += row[x] * V[x]; W[y] = dotProduct; } } void testKernel(cl::Kernel& kernel, cl::Buffer& MBuffer, cl::Buffer& VBuffer, cl::Buffer& WBuffer, const std::vector& WCPU, cl::CommandQueue& cmdQ, bool needsLocalMem = false) { /*************** Running on the graphics card ***************/ kernel.setArg(0, MBuffer); kernel.setArg(1, VBuffer); kernel.setArg(2, WBuffer); kernel.setArg(3, M_WIDTH); kernel.setArg(4, M_HEIGHT); if (needsLocalMem) kernel.setArg(5, sizeof(cl_float) * WORK_GROUP_SIZE, NULL); cl::KernelFunctor func = kernel.bind(cmdQ, cl::NDRange(GLOBAL_SIZE), cl::NDRange(WORK_GROUP_SIZE)); //Run the kernel multiple times cl::Event events[REPEAT_TESTS]; for (cl_uint i(0); i < REPEAT_TESTS; ++i) events[i] = func(); cmdQ.finish(); /**************** Checking the output data *****************/ //Request a blocking copy of the data from the graphics card std::vector WGPU(M_HEIGHT); cmdQ.enqueueReadBuffer(WBuffer, true, 0, sizeof(cl_float) * M_HEIGHT, &WGPU[0]); //Figure out an average time for the execution double sumTimings = 0; for (cl_uint i(0); i < REPEAT_TESTS; ++i) sumTimings += gpuExecutionTime(events[i]); sumTimings /= REPEAT_TESTS; size_t different(0); size_t veryDifferent(0); for (size_t i(0); i < M_HEIGHT; ++i) if (WCPU[i] != WGPU[i]) { ++different; if (std::fabs(WCPU[i] - WGPU[i]) > 0.00001 * WCPU[i]) ++veryDifferent; } std::cout << "\nFinished kernel execution\n Average kernel execution time " << sumTimings << "\n Found " << different << " different entries"; if (different) std::cout << "\n " << veryDifferent << " entries more than 0.001%"; } int main(int argc, char** argp, char** envp) { try { /*************** OpenCL Initialisation ***************/ //Open a context to run the openCL kernel in #ifdef __APPLE__ cl::Context context(CL_DEVICE_TYPE_GPU); #else std::vector platformList; cl::Platform::get(&platformList); std::vector deviceList; cl_device_type deviceGPU = CL_DEVICE_TYPE_GPU; platformList[0].getDevices(deviceGPU, &deviceList); std::string deviceName; deviceList[0].getInfo((cl_device_info)CL_DEVICE_NAME, &deviceName); std::cout << deviceName << '\n'; std::vector deviceSelected; deviceSelected.push_back(deviceList[0]); cl::Context context(deviceSelected); #endif //Gather all the kernel sources for the OpenCL program cl::Program::Sources source; //Load the file which has the OpenCL kernel source code std::ifstream inputFile("matrixvecmult.cl"); if (!inputFile.is_open()) { std::cerr << "\nCould not open the kernel source file\n"; return -1; } source.push_back(std::pair(NULL,0)); inputFile.seekg(0,std::ios::end); source.back().second = inputFile.tellg(); inputFile.seekg(0, std::ios::beg); { char* tmp = new char[source.back().second]; inputFile.read(tmp, source.back().second); source.back().first = tmp; } //Make an OpenCL program cl::Program program(context, source); //Get all the available devices in the context std::vector devices = context.getInfo(); //Build the kernel sources for all devices in the context try { program.build(devices); } catch (cl::Error& err) { //Get the build log for the first device std::cerr << "Building failed, " << err.what() << "("<< err.err() <<")" << "\nRetrieving build log\n" << program.getBuildInfo(devices[0]) << "\n"; return -1; } std::cout << '\n'; std::cout << *argp << " executes kernels MatrixVectorMul1n which compute c = A * b\n" << "where A is stored in row major form and c and b are vectors"; std::cout << '\n'; std::cout << "\nUsing device " << devices[0].getInfo(); //Make a queue to put jobs on the first compute device cl::CommandQueue cmdQ(context, devices[0], CL_QUEUE_PROFILING_ENABLE); /*************** Preparing the data buffers ***************/ //Create a random matrix and vector std::vector M; std::generate_n(std::back_inserter(M), M_WIDTH * M_HEIGHT, rand); std::vector V; std::generate_n(std::back_inserter(V), M_WIDTH, rand); //Start copying this data to the graphics card cl::Buffer MBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * M.size(), &M[0]); //Start copying this data to the graphics card cl::Buffer VBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * V.size(), &V[0]); //Make a buffer to hold the output of the kernel cl::Buffer WBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * M_HEIGHT); cmdQ.finish(); /*************** Perform the CPU calculation ***************/ std::vector WCPU(M_HEIGHT); //Run and time the run #if defined __APPLE__ timeval startTime, endTime; gettimeofday(&startTime,NULL); #else timespec startTime, endTime; clock_gettime(CLOCK_MONOTONIC, &startTime); #endif MatrixVectorMul(&M[0], &V[0], &WCPU[0], M_WIDTH, M_HEIGHT); #if defined __APPLE__ gettimeofday(&endTime,NULL); double CPUTime = double(endTime.tv_sec) - double(startTime.tv_sec) + 1e-6 * (double(endTime.tv_usec) - double(startTime.tv_usec)); #else clock_gettime(CLOCK_MONOTONIC, &endTime); double CPUTime = double(endTime.tv_sec) - double(startTime.tv_sec) + 1e-9 * (double(endTime.tv_nsec) - double(startTime.tv_nsec)); #endif std::cout << "\nCPU took " << CPUTime << " s"; cl::Kernel kernel; GLOBAL_SIZE = WORK_GROUP_SIZE * ((M_HEIGHT + WORK_GROUP_SIZE - 1) / WORK_GROUP_SIZE); std::cout << "\n\nTesting MatrixVectorMul1" << "\n WorkGroupSize = " << WORK_GROUP_SIZE << " GlobalSize " << GLOBAL_SIZE; kernel = cl::Kernel(program, "MatrixVectorMul1"); testKernel(kernel, MBuffer, VBuffer, WBuffer, WCPU, cmdQ); GLOBAL_SIZE = WORK_GROUP_SIZE * 60; std::cout << "\n\nTesting MatrixVectorMul2" << "\n WorkGroupSize = " << WORK_GROUP_SIZE << " GlobalSize " << GLOBAL_SIZE; kernel = cl::Kernel(program, "MatrixVectorMul2"); testKernel(kernel, MBuffer, VBuffer, WBuffer, WCPU, cmdQ); std::cout << "\n\nTesting MatrixVectorMul3" << "\n WorkGroupSize = " << WORK_GROUP_SIZE << " GlobalSize " << GLOBAL_SIZE; kernel = cl::Kernel(program, "MatrixVectorMul3"); testKernel(kernel, MBuffer, VBuffer, WBuffer, WCPU, cmdQ, true); std::cout << "\n\nTesting MatrixVectorMul4" << "\n WorkGroupSize = " << WORK_GROUP_SIZE << " GlobalSize " << GLOBAL_SIZE; kernel = cl::Kernel(program, "MatrixVectorMul4"); testKernel(kernel, MBuffer, VBuffer, WBuffer, WCPU, cmdQ, true); std::cout << "\n\nTesting MatrixVectorMul5" << "\n WorkGroupSize = " << WORK_GROUP_SIZE << " GlobalSize " << GLOBAL_SIZE; kernel = cl::Kernel(program, "MatrixVectorMul5"); testKernel(kernel, MBuffer, VBuffer, WBuffer, WCPU, cmdQ, true); std::cout << "\n"; return 0; } catch (cl::Error& err) { std::cerr << "An OpenCL error occured, " << err.what() << "\nError num of " << err.err() << "\n"; return -1; } }