#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#include <iostream>
#include <vector>

#include "sopnamsp.h"
#include "tmatrix.h"
#include "tvector.h"
#include "matharr.h"
#include "tarrinit.h"
#include "randr48.h"

#include "parlex.h"
#include "resusage.h"
#include "timing.h"
#include "ctimer.h"


/* -------------------------------------------------------------
  Test program for the SOPHYA parallel execution classes
  SOPHYA::ParallelExecutor ... 
  Execution examples: 
  Usage: tparlex SEL [Size=500] [NThreads=2] [NbExecuteCall=1]
  csh> time tparlex A 2000 2 
  csh> time tparlex B 500 2 
  csh> time tparlex A 2000 2 4
  csh> time tparlex B 500 2 3
*/

// Forward declarations of the test functions (both are defined further down in this file)
//   parex_testA : element-wise sin+sqrt+cos test
//   parex_testB : matrix product test
int parex_testA();
int parex_testB();
 

// Run parameters shared by main() and the test functions.
// main() overrides them from the command line and enforces the lower bounds.
// SIZE : matrix size parameter (Size argument), at least 100
static sa_size_t SIZE = 500;
// NTHR : number of threads (NThreads argument), at least 1
static unsigned int NTHR = 2;
// NBPEXC : number of calls to the parallel execute function (NbExecuteCall argument), at least 1
static unsigned int NBPEXC = 1;

//--------------------------------------------------------------
//----------------------   MAIN PROGRAM ------------------------
// Entry point of the tparlex test program.
//   Usage: tparlex SEL [Size=500] [NThreads=2] [NbExecuteCall=1]
// Reads the command line into the file-level SIZE / NTHR / NBPEXC settings,
// runs parex_testA() when SEL starts with 'A' and parex_testB() for any other
// selector, then prints resource usage and timing information.
// Return value:
//   1   after printing the usage text (no selector, or -h)
//   rc  the value returned by the selected test function
//   77  if a std::exception was caught
//   78  if any other exception was caught
int main(int narg, char *arg[])
{

  // No selector given, or help requested: print the usage text and stop.
  // (arg[1] is only examined when narg >= 2, thanks to the short-circuit ||)
  if ((narg<2) || (strcmp(arg[1],"-h")==0)) {
    cout << " tparlex Test of SOPHYA parallel execution classes \n" 
         << " Usage: tparlex SEL [Size=500] [NThreads=2] [NbExecuteCall=1] \n" 
         << "  - SEL : A or B \n "
         << "  - Size : Matrix size (see below) \n "
         << "  - NThreads : number of threads \n "
         << "  - NbExecuteCall : number of call to parallel execution function \n "
         << "   A -> Sin(mx)+Sqrt(mx)+Cos(mx) , mx(NThr,1000*Size) \n" 
         << "   B -> mxa(NThr*Size, Size) * mxb(Size,Size)  " << endl;
    return(1);
  }
  InitTim();

  // Only the first character of the selector is significant
  char sel = *arg[1];

  if (narg > 2) SIZE = atol(arg[2]);
  if (SIZE<100)  SIZE=100;

  // NTHR and NBPEXC are unsigned: parse into a signed value first, so that a
  // negative (or zero) argument is clamped to 1 instead of wrapping around to
  // a huge thread / call count.
  if (narg > 3) {
    int nthr = atoi(arg[3]);
    NTHR = (nthr<1) ? 1 : static_cast<unsigned int>(nthr);
  }
  if (narg > 4) {
    int nbexc = atoi(arg[4]);
    NBPEXC = (nbexc<1) ? 1 : static_cast<unsigned int>(nbexc);
  }

  cout << " tparlex/starting, SEL=" << sel << " Size=" << SIZE << "  NTHR=NRows=" << NTHR 
       << " NbParExCall=" << NBPEXC << endl;
  BaseArray::SetDefaultMemoryMapping(BaseArray::CMemoryMapping);

  int rc = 0;
  try {
    // Resource usage object created just before the test and printed right after it
    ResourceUsage res(ResourceUsage::RU_All); 
    if (sel=='A')   rc = parex_testA();
    else  rc = parex_testB();
    cout << res;
  }
  // Catch by const reference: catching by value would slice the exception
  // object and lose the what() message of the derived class.
  catch (const std::exception& exc) {
    cerr << "tparlex: catched std::exception " << exc.what() << endl;
    rc = 77;
  }  
  catch (...) {
    cerr << "tparlex: catched unknown (...) exception " << endl; 
    rc = 78; 
  } 
  
  PrtTim(">>> tparlex: END <<< ");
  cout << " ------------ End execution tparlex -------------- " << endl;
  return(rc);
}


//--------------------------------------------------------------------
//   Classe implementant la fonction d'execution parallele 
//   ParallelTaskInterface::execute()  mxb=sin(mxa)+sqrt(mxa)+cos(mxa)
class TParTaskA : public ParallelTaskInterface {
public:
  TParTaskA(Matrix& a, Matrix& b) 
    : mxa(a), mxb(b), nbex(0)
  {
  }
  virtual int    execute(int tid) 
  {
    nbex++;
    cout << " ---- TParTaskA::execute(tid=" << tid << ") Start computing - NbExec= " << nbex << endl;
    Vector vx = mxa.Row(tid);
    r_8* x = vx.Data();
    r_8* y = mxb.Row(tid).Data();
    for(sa_size_t j=0; j<vx.Size(); j++) 
      y[j] = sin(x[j])+sqrt(x[j])+cos(x[j]);
    // mxb.Row(tid) = Sin(x)+Sqrt(x)+Cos(x);
    cout << " ---- TParTaskA::execute( " << tid << "," << nbex << ")  DONE " << endl; 
    return 0;
  }

  Matrix& mxa;
  Matrix& mxb;
  int nbex;
};

/* --Function-- */
// Test A: element-wise computation of sin(a)+sqrt(a)+cos(a) on a matrix with
// NTHR rows and SIZE*1000 columns.
//  1. computes the reference result b with a plain sequential loop
//  2. computes c through a ParallelExecutor running a TParTaskA task,
//     calling pex.execute() NBPEXC times
//  3. prints the min and max of the difference b-c
// Timing information is printed with PrtTim() after each step.
// Always returns 0: the return code of execute() and the difference are only
// printed, not checked.
int parex_testA()
{
  sa_size_t NCOLS = SIZE*1000;
  Matrix a(NTHR, NCOLS);
  Matrix b(NTHR, NCOLS);
  Matrix c(NTHR, NCOLS);
  
  cout << " parex_testA/Info: " << a.InfoString() << endl;
  a = RegularSequence(0.25,0.003);
  PrtTim("tparlexA[1] Done init ");

  // Sequential reference computation on the raw data of a and b
  cout << "tparlexA[1] Start b=Sin(a)+Sqrt(a)+Cos[a]" << endl;
  r_8* x = a.Data();
  r_8* y = b.Data();
  for(sa_size_t j=0; j<a.Size(); j++) 
    y[j] = sin(x[j])+sqrt(x[j])+cos(x[j]);
  // Array-expression form, kept for reference:
  //    b = Sin(a)+Sqrt(a)+Cos(a);
  PrtTim(">>tparlexA[1.b] Done ");
  
  // Same computation through the parallel executor, result goes into c
  TParTaskA ptask(a,c);
  ParallelExecutor pex(ptask, NTHR);
  pex.start();
  int rce=0;
  // Unsigned index: NBPEXC is unsigned, this avoids a signed/unsigned comparison
  for(unsigned int i=0; i<NBPEXC; i++) {
    cout << " tparlexA[II=" << i+1 << "  Start ParallelExecution c=Sin(a)+Sqrt(a)+Cos[a]" << endl;
    rce = pex.execute();
    PrtTim(">>>>tparlexA:  End ParallelExecution ");
  }
  // Only the return code of the last execute() call is kept and printed
  cout << " Rc=pex.execute() = " << rce << endl;

  // Compare the sequential and the parallel results
  Matrix d = b-c;
  double dmin, dmax;
  d.MinMax(dmin, dmax);
  cout << ">>tparlexA[3] Diff d=b-c, dmin=" << dmin << " dmax=" << dmax << endl;
  return 0; 
}


//--------------------------------------------------------------------
//   Classe implementant la fonction d'execution parallele 
//   ParallelTaskInterface::execute()  mxc= mxa * mxb 
class TParTaskB : public ParallelTaskInterface {
public:
  TParTaskB(Matrix& a, Matrix& b, Matrix& c, int nth) 
    : mxa(a), mxb(b), mxc(c), nbex(0), nthread(nth)
  {
  }
  virtual int    execute(int tid) 
  {
    nbex++;
    cout << " ---- TParTaskB::execute(tid=" << tid << ") Start computing - NbExec= " << nbex << endl;
    sa_size_t sz = mxb.NRows();
    // On s'arrange pour que chaque thread calcule une partie de la matrice resultat
    // Il faut etre un peu malin et eviter que differents threads accedent les memes zones memoire
    mxc.SubMatrix(Range(sz*tid, sz*(tid+1)-1), Range::all() ) = 
	mxa.SubMatrix(Range(sz*tid, sz*(tid+1)-1), Range::all()) * mxb;
    /*  Une maniere plus compliquee pour MxA(NTH*SZ , SZ) * MxB(SZ, NTH*SZ) 
        mais cela n'apporte rien ...
    for(sa_size_t j=0; j<nthread; j++) {
      sa_size_t jj = (j+tid)%nthread;
      mxc.SubMatrix(Range(sz*tid, sz*(tid+1)-1), Range(sz*jj, sz*(jj+1)-1)) = 
	mxa.SubMatrix(Range(sz*tid, sz*(tid+1)-1), Range::all()) * 
	mxb.SubMatrix(Range::all(), Range(sz*jj, sz*(jj+1)-1));
    }
    */
    cout << " ---- TParTaskB::execute( " << tid << "," << nbex << ")  DONE " << endl; 
    return 0;
  }

  Matrix& mxa;
  Matrix& mxb;
  Matrix& mxc;
  int nthread;
  int nbex;
};

/* --Function-- */
// Test B: matrix product a(NTHR*SIZE, SIZE) * b(SIZE, SIZE).
//  1. computes the reference result cc = a*b directly
//  2. computes c through a ParallelExecutor running a TParTaskB task,
//     calling pex.execute() NBPEXC times
//  3. prints the min and max of the difference cc-c
// Timing information is printed with PrtTim() after each step.
// Always returns 0: the return code of execute() and the difference are only
// printed, not checked.
int parex_testB()
{
  // Memory mappings chosen by the original author as the optimal conditions
  // for the matrix multiplication
  Matrix a(NTHR*SIZE, SIZE, BaseArray::CMemoryMapping);
  Matrix b(SIZE, SIZE, BaseArray::FortranMemoryMapping);
  Matrix c(NTHR*SIZE, SIZE);

  cout << " parex_testB/Info: a.InfoString(): " << a.InfoString() << endl;
  cout << " parex_testB/Info: b.InfoString(): " << b.InfoString() << endl;
    
  a = RegularSequence(0.25,0.003);
  b = RegularSequence(1.2,0.0423);

  PrtTim("tparlexB[1] Done init ");

  // Reference product, computed without the parallel executor
  cout << "tparlexB[1] Start cc=a*b" << endl;
  Matrix cc = a*b;
  PrtTim(">>tparlexB[1.b] Done ");
  
  // Same product through the parallel executor, result goes into c
  TParTaskB ptask(a,b,c,NTHR);
  ParallelExecutor pex(ptask, NTHR);
  pex.start();
  int rce=0;
  // Unsigned index: NBPEXC is unsigned, this avoids a signed/unsigned comparison
  for(unsigned int i=0; i<NBPEXC; i++) {
    cout << " tparlexB[II=" << i+1 << "  Start ParallelExecution c=a*b" << endl;
    rce = pex.execute();
    PrtTim(">>tparlexB:  End ParallelExecution ");
  }
  // Only the return code of the last execute() call is kept and printed
  cout << " Rc=pex.execute() = " << rce << endl;

  // Compare the direct and the parallel results
  Matrix d = cc-c;
  double dmin, dmax;
  d.MinMax(dmin, dmax);
  cout << ">>tparlexB[3] Diff d=cc-c, dmin=" << dmin << " dmax=" << dmax << endl;
  return 0; 
}
