#ifndef VIENNACL_LINALG_KERNELS_COORDINATE_MATRIX_SOURCE_HPP_
#define VIENNACL_LINALG_KERNELS_COORDINATE_MATRIX_SOURCE_HPP_
//Automatically generated file from auxiliary-directory, do not edit manually!
namespace viennacl
{
 namespace linalg
 {
  namespace kernels
  {
const char * const coordinate_matrix_align1_vec_mul = 
"//segmented parallel reduction. At present restricted to up to 256 threads\n"
"void segmented_parallel_reduction(unsigned int row, \n"
"                                  float val, \n"
"                                  __local unsigned int * shared_rows, \n"
"                                  __local float * inter_results) \n"
"{ \n"
"  //barrier(CLK_LOCAL_MEM_FENCE); \n"
"  shared_rows[get_local_id(0)] = row; \n"
"  inter_results[get_local_id(0)] = val; \n"
"  float left = 0;\n"
" \n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  if( get_local_id(0) >=  1 && row == shared_rows[get_local_id(0) -  1] ) { left = inter_results[get_local_id(0) -  1]; }  \n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  inter_results[get_local_id(0)] += left; left = 0;\n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  if( get_local_id(0) >=  2 && row == shared_rows[get_local_id(0) -  2] ) { left = inter_results[get_local_id(0) -  2]; } \n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  inter_results[get_local_id(0)] += left; left = 0;\n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  if( get_local_id(0) >=  4 && row == shared_rows[get_local_id(0) -  4] ) { left = inter_results[get_local_id(0) -  4]; } \n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  inter_results[get_local_id(0)] += left; left = 0;\n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  if( get_local_id(0) >=  8 && row == shared_rows[get_local_id(0) -  8] ) { left = inter_results[get_local_id(0) -  8]; } \n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  inter_results[get_local_id(0)] += left; left = 0;\n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  if( get_local_id(0) >= 16 && row == shared_rows[get_local_id(0) - 16] ) { left = inter_results[get_local_id(0) - 16]; } \n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  inter_results[get_local_id(0)] += left; left = 0;\n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  if( get_local_id(0) >= 32 && row == shared_rows[get_local_id(0) - 32] ) { left = inter_results[get_local_id(0) - 32]; } \n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  inter_results[get_local_id(0)] += left; left = 0;\n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  if( get_local_id(0) >= 64 && row == shared_rows[get_local_id(0) - 64] ) { left = inter_results[get_local_id(0) - 64]; } \n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  inter_results[get_local_id(0)] += left; left = 0;\n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  if( get_local_id(0) >= 128 && row == shared_rows[get_local_id(0) - 128] ) { left = inter_results[get_local_id(0) - 128]; } \n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  inter_results[get_local_id(0)] += left; left = 0;\n"
"  barrier(CLK_LOCAL_MEM_FENCE); \n"
"  //if( get_local_id(0) >= 256 && row == shared_rows[get_local_id(0) - 256] ) { left = inter_results[get_local_id(0) - 256]; } \n"
"  //barrier(CLK_LOCAL_MEM_FENCE);  \n"
"  //inter_results[get_local_id(0)] += left; left = 0;\n"
"  //barrier(CLK_LOCAL_MEM_FENCE); \n"
"}\n"
"__kernel void vec_mul( \n"
"          __global const uint2 * coords, //(row_index, column_index) \n"
"          __global const float * elements, \n"
"          __global const uint  * group_boundaries,\n"
"          __global const float * vector,  \n"
"          __global float * result, \n"
"          __local unsigned int * shared_rows, \n"
"          __local float * inter_results) \n"
"{ \n"
"  uint2 tmp; \n"
"  float val;\n"
"  uint last_index = get_local_size(0) - 1;\n"
"  uint group_start = group_boundaries[get_group_id(0)];\n"
"  uint group_end = group_boundaries[get_group_id(0) + 1];\n"
"  uint k_end = 1 + (group_end - group_start - 1) / get_local_size(0);   // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)\n"
"  uint local_index = 0;\n"
"  for (uint k = 0; k < k_end; ++k)\n"
"  { \n"
"    barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
"    \n"
"    local_index = group_start + k * get_local_size(0) + get_local_id(0); \n"
"  \n"
"    if (local_index < group_end)\n"
"    {\n"
"      tmp = coords[local_index]; \n"
"      val = elements[local_index] * vector[tmp.y]; \n"
"    }\n"
"    else\n"
"    {\n"
"      tmp.x = 0;\n"
"      tmp.y = 0;\n"
"      val = 0;\n"
"    }\n"
"    barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
"    //check for carry from previous loop run: \n"
"    if (get_local_id(0) == 0 && k > 0)\n"
"    { \n"
"      if (tmp.x == shared_rows[last_index]) \n"
"        val += inter_results[last_index]; \n"
"      else \n"
"        result[shared_rows[last_index]] += inter_results[last_index]; \n"
"    } \n"
"    barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
"    segmented_parallel_reduction(tmp.x, val, shared_rows, inter_results); //all threads have to enter this function\n"
"    barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
"    if (get_local_id(0) != last_index &&\n"
"        shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1] &&\n"
"        inter_results[get_local_id(0)] != 0) \n"
"    { \n"
"      result[tmp.x] += inter_results[get_local_id(0)]; \n"
"    }\n"
"   \n"
"    barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n"
"  } //for k\n"
"   \n"
"  if (get_local_id(0) == last_index && inter_results[last_index] != 0) \n"
"    result[tmp.x] += inter_results[last_index]; \n"
"}\n"
; //coordinate_matrix_align1_vec_mul

  }  //namespace kernels
 }  //namespace linalg
}  //namespace viennacl
#endif
