#include "libscl.h" 
#include <ctime>
extern "C" {
  #include "cblas.h"
}

using namespace scl;
using namespace std;

using std::cout;

namespace {
  ostream& maxerr(ostream& os, const realmat& C_save, const realmat& C)
  {
    REAL rv = 0.0;
    for (INTEGER i=1; i<=C.size(); ++i) {
      REAL err = abs(C[i]-C_save[i])/(abs(C_save[i])+1.0e-5);
      rv = rv < err ? err : rv;
    }
    os << "maxerr = " << rv;
    return os;
  }
}

int main(int argc, char** argp, char** envp)
{

  if (argc != 4) {
    cerr << "Usage: " << argp[0] << " M N K" << '\n';
    return 1;
  }

  INTEGER M = atoi(argp[1]);
  INTEGER N = atoi(argp[2]);
  INTEGER K = atoi(argp[3]);

  cout << "M, N, K = " << M << ' ' << N << ' ' << K << '\n';

  INTEGER MN=M*N;

  realmat A(M,K),B(K,N),C(M,N);

  INT_32BIT seed = 5874;
  for (INTEGER k=1; k<=K; ++k) {
    for (INTEGER m=1; m<=M; ++m) A(m,k) = unsk(seed);
    for (INTEGER n=1; n<=N; ++n) B(k,n) = unsk(seed);
  }

  cout << "Using libscl's C = A*B" << '\n'; 

  clock_t clock_start, clock_stop;

  clock_start = clock();
  
  C = A*B;

  clock_stop = clock();
  cout << "clock = "
       << REAL(clock_stop - clock_start)/REAL(CLOCKS_PER_SEC) << '\n';
  realmat C_save = C;

  cout << "Using loop unrolling" << '\n'; 

  clock_start = clock();
  
  for (INTEGER i=1; i<=MN; ++i) C[i] = 0.0;
  INTEGER M0=8*(M/8); INTEGER M1=M-M%8+1;
  for (INTEGER k=1; k<=K; ++k) {
    for (INTEGER j=1; j<=N; ++j) {
      for (INTEGER i=0; i<M0; i+=8) {
        C(i+1,j) += A(i+1,k)*B(k,j);
        C(i+2,j) += A(i+2,k)*B(k,j);
        C(i+3,j) += A(i+3,k)*B(k,j);
        C(i+4,j) += A(i+4,k)*B(k,j);
        C(i+5,j) += A(i+5,k)*B(k,j);
        C(i+6,j) += A(i+6,k)*B(k,j);
        C(i+7,j) += A(i+7,k)*B(k,j);
        C(i+8,j) += A(i+8,k)*B(k,j);
      }
      for (INTEGER i=M1; i<=M; ++i) C(i,j) += A(i,k)*B(k,j);
    }
  }

  clock_stop = clock();
  cout << "clock = "
       << REAL(clock_stop - clock_start)/REAL(CLOCKS_PER_SEC) << '\n';
  maxerr(cout,C_save,C) << '\n';


  cout << "Using Duff's device" << '\n'; 

  clock_start = clock();
  
  for (INTEGER i=1; i<=MN; ++i) C[i] = 0.0;
  for (INTEGER k=1; k<=K; ++k) {
    for (INTEGER j=1; j<=N; ++j) {
      INTEGER i = 0; 
      INTEGER M0 = (M + 7) / 8;
      switch (M % 8) {
        case 0: do {
                C(++i,j) += A(i,k)*B(k,j);
        case 7: C(++i,j) += A(i,k)*B(k,j);
        case 6: C(++i,j) += A(i,k)*B(k,j);
        case 5: C(++i,j) += A(i,k)*B(k,j);
        case 4: C(++i,j) += A(i,k)*B(k,j);
        case 3: C(++i,j) += A(i,k)*B(k,j);
        case 2: C(++i,j) += A(i,k)*B(k,j);
        case 1: C(++i,j) += A(i,k)*B(k,j);
                } while (--M0 > 0) ;
      }
    }
  }

  clock_stop = clock();
  cout << "clock = "
       << REAL(clock_stop - clock_start)/REAL(CLOCKS_PER_SEC) << '\n';
  maxerr(cout,C_save,C) << '\n';


  cout << "Using no acceleration techniques" << '\n'; 

  clock_start = clock();

  for (INTEGER i=1; i<=MN; ++i) C[i] = 0.0;
  for (INTEGER k=1; k<=K; ++k) {
    for (INTEGER j=1; j<=N; ++j) {
      for (INTEGER i=1; i<=M; ++i) {
        C(i,j) += A(i,k)*B(k,j);
      }
    }
  }

  clock_stop = clock();
  cout << "clock = "
       << REAL(clock_stop - clock_start)/REAL(CLOCKS_PER_SEC) << '\n';
  maxerr(cout,C_save,C) << '\n';

  if (K == 1) {
    cout << "Using cblas_dgemv" << '\n'; 
  }
  else {
    cout << "Using cblas_dgemm" << '\n'; 
  }

  clock_start = clock();

  if (K == 1) {
    cblas_dgemv(CblasColMajor, CblasNoTrans,
      M, K, 1.0, A.begin(), M, B.begin(), 1, 0.0, C.begin(), 1);
  }
  else {
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
      M, N, K, 1.0, A.begin(), M, B.begin(), K, 0.0, C.begin(), M);
  }

  clock_stop = clock();
  cout << "clock = "
       << REAL(clock_stop - clock_start)/REAL(CLOCKS_PER_SEC) << '\n';
  maxerr(cout,C_save,C) << '\n';

  cout << "Using dgmprd" << '\n'; 

  clock_start = clock();

  dgmprd(A,B,C);

  clock_stop = clock();
  cout << "clock = "
       << REAL(clock_stop - clock_start)/REAL(CLOCKS_PER_SEC) << '\n';
  maxerr(cout,C_save,C) << '\n';


  return 0;
}
