#define USE_VIENNACL_FAST_COPY
//#undef USE_VIENNACL_FAST_COPY

#include "libscl.h"

#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/linalg/prod.hpp"

using namespace scl;
using namespace std;

const INTEGER Arows = 1000;
const INTEGER Acols = 10000;
const INTEGER Bcols = 1000;

const INTEGER Brows = Acols;
const INTEGER Crows = Arows;
const INTEGER Ccols = Bcols;

class cycle {
private:
  int i;
public:
  cycle() : i(0) {}
  REAL operator()()  { return REAL(++i % 1000); }
};

int main(int argc, char** argp, char** envp)
{

  cout << '\n';
  #if defined USE_VIENNACL_FAST_COPY
    cout << *argp << " is using fast_copy" << '\n';
  #else
    cout << *argp << " is not using fast_copy" << '\n';
  #endif

  realmat A(Arows,Acols);
  realmat B(Brows,Bcols);
  realmat C(Crows,Ccols);

  cycle fill;  
  for (INTEGER i=1; i<=A.size(); ++i) A[i] = fill();
  for (INTEGER i=1; i<=B.size(); ++i) B[i] = fill();

  stopwatch timer;

  C = A*B;

  REAL cpu_time = timer.time();

  cout << '\n';
  cout << "libscl_float mult time = " << cpu_time << '\n';
  
  realmat Answer = C;

  timer.reset(); 

  #if defined USE_VIENNACL_FAST_COPY

    viennacl::matrix<REAL,viennacl::column_major> gpu_A(Arows,Acols);
    viennacl::matrix<REAL,viennacl::column_major> gpu_B(Brows,Bcols);
    viennacl::matrix<REAL,viennacl::column_major> gpu_C(Crows,Ccols);

    /*
    vclmat cpu_A(A);
    vclmat cpu_B(B);
    vclmat cpu_C(C);

    viennacl::fast_copy(cpu_A.begin(),cpu_A.end(),gpu_A);
    viennacl::fast_copy(cpu_B.begin(),cpu_B.end(),gpu_B);
    */

    viennacl::fast_copy(A.begin(),A.end(),gpu_A);
    viennacl::fast_copy(B.begin(),B.end(),gpu_B);

  #else

    viennacl::matrix<REAL> gpu_A(Arows,Acols);
    viennacl::matrix<REAL> gpu_B(Brows,Bcols);
    viennacl::matrix<REAL> gpu_C(Crows,Ccols);

    vclmat cpu_A(A);
    vclmat cpu_B(B);

    viennacl::copy(cpu_A,gpu_A);
    viennacl::copy(cpu_B,gpu_B);

  #endif

  REAL vcl_time_1 = timer.time();

  cout << "viennacl A & B copy time = " << vcl_time_1 << '\n';

  timer.reset();

  gpu_C = viennacl::linalg::prod(gpu_A, gpu_B);

  REAL vcl_time_2 = timer.time();
  
  cout << "viennacl mult time = " << vcl_time_2 << '\n';

  timer.reset();

  #if defined USE_VIENNACL_FAST_COPY

    /*  
    vclmat cpu_C(C);
    viennacl::fast_copy(gpu_C,cpu_C.begin());
    if (!cpu_C.is(C)) C = cpu_C;
    */

    viennacl::fast_copy(gpu_C,C.begin());

  #else

    vclmat cpu_C(C);
    viennacl::copy(gpu_C,cpu_C);
    if (!cpu_C.is(C)) C = cpu_C;

  #endif

  REAL vcl_time_3 = timer.time();

  cout << "viennacl C copy time = " << vcl_time_3 << '\n';

  unsigned int count = 0;

  for (INTEGER i = 1; i <= Answer.size(); ++i) {
    REAL err = C[i] - Answer[i];
    REAL tol =  Answer[i]*1.0e-4;
    err = err > 0 ? err : -err;
    tol = tol > 0 ? tol : -tol;
    if (err > tol) ++count;
  }

  REAL vcl_time = vcl_time_1 + vcl_time_2 + vcl_time_3;

  cout << "viennacl total time = " << vcl_time << '\n';

  cout << "GPU/CPU total time = " 
       << fmt('f',8,4,100.0*vcl_time/cpu_time) << " per cent" << '\n';
  cout << "GPU/CPU mult time  = " 
       << fmt('f',8,4,100.0*vcl_time_2/cpu_time) << " per cent" << '\n';

  cout << "viennacl errors = " << count << '\n';
  cout << '\n';

  return 0;
}
