/* ------ programme de test des capacites OpenMP ------ */
/*      Calcul parallele sur plusieurs processeurs      */
/* (C) R. Ansari     LAL/IN2P3-CNRS           2000      */

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#include <iostream.h>

#ifdef _OPENMP
#include <omp.h>
#endif

/* Declarations of the CPU-time measurement functions - timg.h .c  */
/* They are given C linkage here; their definitions are not part of this file. */
extern "C" {
/* Called once in main() before the arrays are allocated. */
void InitTim();
/* Called in main() with a short label string at each timing checkpoint. */
void PrtTim(char *);
}

/* Declarations of the functions defined in this file */
/* Element-wise product v3 = v1*v2 over n elements, single loop. */
void Mult(long n, double *v1, double *v2, double *v3);
/* Same product, computed block by block over b blocks (see getBlock). */
void MultB(long b, double *v1, double *v2, double *v3);
/* Compares v3 with the reference v3ck and prints a summary. */
void Check(long n, double *v3, double *v3ck);
/* OpenMP variants of Mult / MultB. */
void Mult_OMP(long n, double *v1, double *v2, double *v3);
void MultB_OMP(long b, double *v1, double *v2, double *v3);
void MultB_OMP2(long b, double *v1, double *v2, double *v3);
void MultB_OMP3(long b, double *v1, double *v2, double *v3);
/* Pointer-based (C-style) signature of getBlock, kept for reference: */
// void getBlock(long b, long* off1, long* off2, long* off3, long* sz);  C
/* Returns, through its reference arguments, the offsets and size of block b. */
void getBlock(long b, long& off1, long& off2, long& off3, long& sz);
/* Fills the two input arrays v1 and v2 (n elements each). */
void fillRandom(long n, double *v1, double *v2);


/* File-scope global variables, all assigned in main() */
/* M   : total array length, computed as B*BSz                     */
/* N   : number of times the selected multiplication is repeated   */
/* B   : number of blocks                                          */
/* BSz : number of elements per block                              */
static long M,N,B,BSz;
/* Thread count handed to omp_set_num_threads() by the OpenMP variants */
static int omp_nthr = 4;

/* --- main() --- */
int main (long narg, char *arg[])
{
long i;

double *v1, *v2, *v3, *v3ck;
if (narg < 2) { 
  cout << "\n Usage tompCXX P/p/S/s [N BSz B OMP_NThr]  (Test OpenMP) \n \n"
       << " P /P2 /P3 -> Calling MultB_OMP /2/3  p -> Calling Mult_OMP \n" 
       << " S -> Calling MultB , s -> Calling Mult \n"
       << " N (=10): External loop number  BSz : BlockSize Size (50000) \n"
       << " B : Nb of Blocks 100 \n"
       << " OMP_NThr : Number of OpenMP threads (def = 4) \n " << endl;
  return(0);
}

N = 10;
BSz = 50000;
B = 100;
if (narg > 2) N = atol(arg[2]);
if (narg > 3) BSz = atol(arg[3]);
if (narg > 4) B = atol(arg[4]);

M = B*BSz;

omp_nthr = 4;
if (narg > 5) omp_nthr = atol(arg[5]);

cout << " tompCXX :  M (ArrSize)= " << M << "  NLoop=" << N << "  NBlock=" << B 
     << " BlockSize=" << BSz << endl;
 
InitTim();

v1 = new double[M];
v2 = new double[M];
v3 = new double[M];
v3ck = new double[M];

/*  Remplissage initiale   */
fillRandom(M, v1, v2);
Mult(M, v1, v2, v3ck);

PrtTim("End of Init ");

#ifdef _OPENMP
if ((*arg[1] == 'P') || (*arg[1] == 'p'))
  {
    omp_set_num_threads(omp_nthr);
    printf(" tompC NumThreads= %d (Max=%d)  NumProcs= %d \n", omp_get_num_threads(), 
	   omp_get_max_threads(), omp_get_num_procs());
  }
#endif

/*  Fonctions avec OpenMP  */
if (*arg[1] == 'P') { 
  if (*(arg[2]+1) == '2') {   /* Double boucle - OpenMP parallel boucle externe */
    printf("Calling N=%d times MultB_OMP2(Size= %d) \n", N, M);
    for(i=0; i<N; i++) {
      MultB_OMP2(B, v1, v2, v3);
      printf("%d ", i);  fflush(stdout);
    }
    printf("  ... Done \n");  
  }
  else if (*(arg[2]+1) == '3') { /* Double boucle - Nested parallel */
    printf("Calling N=%d times MultB_OMP3(Size= %d) \n", N, M);
    for(i=0; i<N; i++) {
      MultB_OMP3(B, v1, v2, v3);  
      printf("%d ", i);  fflush(stdout);
    }
    printf("  ... Done \n");  
  }
  else {  /* boucle double - OpenMP parallel boucle externe  */
    printf("Calling N=%d times MultB_OMP(Size= %d) \n", N, M);
    for(i=0; i<N; i++) {
      MultB_OMP(B, v1, v2, v3);
      printf("%d ", i);  fflush(stdout);
    }
    printf("  ... Done \n");  
  }
}
 else if (*arg[1] == 'p') { /* boucle simple - OpenMP */
  printf("Calling N=%d times Mult_OMP(Size= %d) \n", N, M);
  for(i=0; i<N; i++) {
    Mult_OMP(M, v1, v2, v3);
    printf("%d ", i);  fflush(stdout);
    }
  printf("  ... Done \n");  
}

/* Fonctions SANS OpenMP (scalaire)  */
else if (*arg[1] == 'S') { /* Double boucle  */
  printf("Calling N=%d times MultB(Size= %d) \n", N, M);
  for(i=0; i<N; i++) {
    MultB(B, v1, v2, v3);
    printf("%d ", i);  fflush(stdout);
    }
  printf("  ... Done \n");  
}
 else if (*arg[1] == 's') { /* Boucle simple */
  printf("Calling N=%d times Mult(Size= %d) \n", N, M);
  for(i=0; i<N; i++) {
    Mult(M, v1, v2, v3);
    printf("%d ", i);  fflush(stdout);
  }
  printf("  ... Done \n");  
}


PrtTim("End of Mult-Operation ");

Check(M, v3, v3ck);

PrtTim("End of programme ");

delete[] v1;
delete[] v2;
delete[] v3;
delete[] v3ck;

return(0);
}


/* --Function-- */
/*
 * Fill the arrays v1 and v2 (n elements each) with pseudo-random values.
 *
 * The arrays are viewed as nbk chunks of nn = n/nbk elements, where nbk is
 * the global block count B but never less than 20. Only the first chunk is
 * drawn from random(); the other chunks are copies of it. Any elements left
 * over after the last full chunk are set, in both arrays, to one extra
 * random value.
 */
void fillRandom(long n, double *v1, double *v2)
{
  long off,nn,k,i;
  double x1;
  long nbk = (B < 20) ? 20 : B;
  nn = n/nbk;

  /* First chunk: v1 in [0,10000), v2 in [0,14000) */
  for(k=0; k<nn; k++) {
    v1[k] = random()%10000;
    v2[k] = random()%14000;
  }

  /* Replicate the whole first chunk into each of the remaining chunks;
     the highest index written is nbk*nn - 1, which is < n */
  for(i=1; i<nbk; i++) {
    off = i*nn;
    for(k=0; k<nn; k++) {
      v1[k+off] = v1[k];
      v2[k+off] = v2[k];
    }
  }

  /* Tail left over when n is not a multiple of nbk */
  x1 = random()%18000;
  for(k=nn*nbk; k<n; k++) v1[k] = v2[k] = x1;

  return;
}

/* --Function-- */
/*
 * Block number -> block offsets and size.
 *
 * For block index b, sets sz to the global block size BSz and sets off1,
 * off2 and off3 all to b*BSz. If b is outside [0, B), an error line is
 * written to cerr and the integer 999 is thrown.
 */
void  getBlock(long b, long& off1, long& off2, long& off3, long& sz)
{
  const bool inRange = (b >= 0) && (b < B);
  if (!inRange) {
    cerr << " ERROR getBlock( b= " << b << " ??? " << endl;
    throw 999;
  }

  sz = BSz;

  /* The three arrays share the same layout, hence one common offset */
  const long start = b*BSz;
  off3 = start;
  off2 = start;
  off1 = start;
}

/* --Function-- */
/*
 * Check that v3 equals the reference array v3ck.
 *
 * Counts the indices k in [0, n) where |v3[k] - v3ck[k]| exceeds 1e-39 and
 * prints a one-line summary on cout: an OK line when no element differs,
 * otherwise a failure line giving the number of differing elements.
 */
void Check(long n, double *v3, double *v3ck)
{
  long npb = 0;
  long k;

  for(k=0; k<n; k++) {
    if (fabs(v3[k]-v3ck[k]) > 1.e-39)  npb++;
  }

  if (npb == 0) {
    cout << "  Check() - OK   NPB=0 / N= " << n << endl;
  }
  else {
    /* The failure line must not contain "OK" */
    cout << "  PB Check() !!! - FAILED   NPB= " << npb << " / N= " << n << endl;
  }
}


/* --Function-- */
/*
 * Element-wise multiplication, single loop: v3[k] = v1[k] * v2[k]
 * for k = 0 .. n-1. Nothing is written when n <= 0.
 */
void Mult(long n, double *v1, double *v2, double *v3)
{
  const double *lhs = v1;
  const double *rhs = v2;
  double *out = v3;
  long remaining;

  /* Walk the three arrays in step, from the first element to the last */
  for (remaining = n; remaining > 0; --remaining) {
    *out++ = (*lhs++) * (*rhs++);
  }
}


/* --Function-- */
/*
 * Element-wise multiplication, single loop, OpenMP version:
 * v3[k] = v1[k] * v2[k] for k = 0 .. n-1.
 *
 * When compiled with OpenMP, the thread count is set to omp_nthr and the
 * loop is run as a statically scheduled parallel for. Without OpenMP this
 * is a plain serial loop.
 */
void Mult_OMP(long n, double *v1, double *v2, double *v3)
{
#ifdef _OPENMP
  omp_set_num_threads(omp_nthr);
#pragma omp parallel for schedule(static)
#endif
  for (long idx = 0; idx < n; ++idx) {
    v3[idx] = v1[idx] * v2[idx];
  }
}


/* --Function-- */
/*
 * Element-wise multiplication, double loop (block by block), serial:
 * for each block 0 .. b-1, getBlock() supplies the offsets into the three
 * arrays and the block length, and the block of vv3 is set to the product
 * of the matching blocks of vv1 and vv2.
 */
void MultB(long b, double *vv1, double *vv2, double *vv3)
{
  for (long blk = 0; blk < b; ++blk) {
    long len, o1, o2, o3;
    getBlock(blk, o1, o2, o3, len);

    const double *in1 = vv1 + o1;
    const double *in2 = vv2 + o2;
    double *out = vv3 + o3;

    for (long j = 0; j < len; ++j) {
      out[j] = in1[j] * in2[j];
    }
  }
}

/* --Function-- */
/*
 * Element-wise multiplication, double loop (block by block), OpenMP:
 * the outer loop over the b blocks is a statically scheduled parallel for,
 * active only when b > 1. The thread count is first set to min(b, omp_nthr)
 * when b > 1. Without OpenMP this is the same serial double loop as MultB.
 */
void MultB_OMP(long b, double *vv1, double *vv2, double *vv3)
{
#ifdef _OPENMP
  if (b > 1) {
    omp_set_num_threads((b<omp_nthr)?b:omp_nthr);
  }
#pragma omp parallel for if(b>1) schedule(static)
#endif
  for (long blk = 0; blk < b; ++blk) {
    /* Declared inside the loop body, so each thread works on its own copies */
    long len, o1, o2, o3;
    getBlock(blk, o1, o2, o3, len);

    double *in1 = vv1 + o1;
    double *in2 = vv2 + o2;
    double *out = vv3 + o3;

    for (long j = 0; j < len; ++j) {
      out[j] = in1[j] * in2[j];
    }
  }
}

/* --Function-- */
/*
 * Element-wise multiplication, double loop (block by block), OpenMP with an
 * explicit parallel region: inside the region (active only when b > 1) one
 * thread prints the team size, then the outer loop over the b blocks is
 * shared out by a statically scheduled "omp for". The thread count is first
 * set to min(b, omp_nthr) when b > 1. Without OpenMP this is the same serial
 * double loop as MultB and nothing is printed.
 */
void MultB_OMP2(long b, double *vv1, double *vv2, double *vv3)
{
#ifdef _OPENMP
  if (b > 1) {
    omp_set_num_threads((b<omp_nthr)?b:omp_nthr);
  }
#pragma omp parallel if(b>1)
  {
#pragma omp single
    printf("MultB_OMP2()  NumThr= %d \n", omp_get_num_threads());
#pragma omp for schedule(static)
#endif
    for (long blk = 0; blk < b; ++blk) {
      /* Declared inside the loop body, so each thread works on its own copies */
      long len, o1, o2, o3;
      getBlock(blk, o1, o2, o3, len);

      double *in1 = vv1 + o1;
      double *in2 = vv2 + o2;
      double *out = vv3 + o3;

      for (long j = 0; j < len; ++j) {
        out[j] = in1[j] * in2[j];
      }
    }
#ifdef _OPENMP
  }
#endif
}

/* --Function-- */
/*
 * Element-wise multiplication, double loop (block by block), OpenMP with
 * nested parallel regions: vv3 = vv1 * vv2 over b blocks.
 *
 * The outer loop over blocks is shared out by an "omp for" inside a parallel
 * region that is active only when b > 1. For every block, a second parallel
 * region is opened around the inner element loop, which is itself shared out
 * by an "omp for". Without OpenMP the function reduces to the same serial
 * double loop as MultB and nothing is printed.
 *
 * NOTE(review): nothing in this function enables nested parallelism, so the
 * size of the inner team depends on the OpenMP runtime settings - confirm.
 */
void MultB_OMP3(long b, double *vv1, double *vv2, double *vv3)
{
  long k,i;
  long sz, off1,off2,off3;
  double *v1, *v2, *v3;
#ifdef _OPENMP
  /* Request min(b, omp_nthr) threads for the outer region */
  if (b>1)
    omp_set_num_threads((b<omp_nthr)?b:omp_nthr);    
#pragma omp  parallel  if(b>1)
{
  /* One thread of the outer team reports the team size */
#pragma omp  single
  printf("MultB_OMP3()  NumThr= %d \n", omp_get_num_threads());
  /* Outer work-sharing loop: every per-block variable is private */
#pragma omp for private(v1,v2,v3,k,i,off1,off2,off3,sz) schedule(static)
#endif
  for(k=0; k<b; k++) {
    /* Pointer-based (C-style) call, kept for reference: */
    //    getBlock(k, &off1, &off2, &off3, &sz);
    getBlock(k, off1, off2, off3, sz);
    v1 = vv1+off1;
    v2 = vv2+off2;
    v3 = vv3+off3;
#ifdef _OPENMP
    /* Inner parallel region, opened once per block */
#pragma omp  parallel  
{
  /* One thread of the inner team reports its size, for block 0 only */
#pragma omp  single 
  if (k==0) printf("MultB_OMP3() -pragma2- NumThr= %d \n", omp_get_num_threads());
#pragma omp for private(i) schedule(static) 
#endif
    for(i=0; i<sz; i++) 
      v3[i] = v1[i] * v2[i];
#ifdef _OPENMP
}
#endif
  }
#ifdef _OPENMP
}
#endif

}

