/* ///////////////////////////////////////////////////////////////////// */
/*!
 \file
 \brief Solve Laplace eq. on 2D domain in serial/parallel.

 Solve the two-dimensional Laplace equation with Dirichlet boundary
 conditions on a 2D Cartesian domain.
 The function DomainDecomposition() is used to decompose the computational
 domain into equally-sized sub-domains.
 The function BoundaryConditions() is used to assign both physical and
 inter-processor boundary conditions.
 The function WriteSolution() writes binary data to disk (fully parallel
 I/O is used in the parallel version).

 For the serial version, compile with

   > gcc laplace2D.c

 For the parallel version, compile with

   > mpicc -DPARALLEL laplace2D.c

 (or enable the #define PARALLEL line in the source)

 \author A. Mignone (mignone@to.infn.it)
 \date   March 19, 2020
*/
/* ///////////////////////////////////////////////////////////////////// */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "tools.c"

#define PARALLEL
#define NX_GLOB  128    /* Global number of interior points */
#define NY_GLOB  128    /* Global number of interior points */
#define NGHOST   1
#define NDIM     2

#ifdef PARALLEL
#include <mpi.h>
MPI_Comm MPI_COMM_CART;
#endif

typedef struct MPI_Decomp_{
  int nprocs[NDIM];    /* Number of processors in each dimension  */
  int periods[NDIM];   /* Periodicity flag in each dimension      */
  int coords[NDIM];    /* Cartesian coordinate in the MPI topology */
  int gsize[NDIM];     /* Global domain size (no ghosts)          */
  int lsize[NDIM];     /* Local domain size (no ghosts)           */
  int start[NDIM];     /* Local start index in each dimension     */
  int procL[NDIM];     /* Rank of left-lying process in each direction  */
  int procR[NDIM];     /* Rank of right-lying process in each direction */
  int rank;            /* Local process rank  */
  int size;            /* Communicator size   */
} MPI_Decomp;

void BoundaryConditions(double **, double *, double *, int, int, MPI_Decomp *);
void DomainDecomposition(MPI_Decomp *);
void WriteSolution (double **, int, int, MPI_Decomp *);

int main(int argc, char ** argv)
{
  int nx_tot, nx, i, ibeg, iend;
  int ny_tot, ny, j, jbeg, jend;
  int k, rank=0, size=1;
  double xbeg = 0.0, xend = 1.0;
  double ybeg = 0.0, yend = 1.0;
  double dx = (xend - xbeg)/(NX_GLOB + 1);
  double dy = (yend - ybeg)/(NY_GLOB + 1);
  double *xg, *yg, *x, *y, **phi, **phi0;
  double err, tol;
  MPI_Decomp mpi_decomp;
  int procL[NDIM] = {-1,-1};
  int procR[NDIM] = {-1,-1};

/* --------------------------------------------------------
   0. Initialize the MPI execution environment
   -------------------------------------------------------- */

#ifdef PARALLEL
  MPI_Datatype row_type, col_type;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  DomainDecomposition(&mpi_decomp);
  nx = mpi_decomp.lsize[0];
  ny = mpi_decomp.lsize[1];
#else
  mpi_decomp.gsize[0] = mpi_decomp.lsize[0] = nx = NX_GLOB;
  mpi_decomp.gsize[1] = mpi_decomp.lsize[1] = ny = NY_GLOB;
  mpi_decomp.procL[0] = mpi_decomp.procL[1] = -1;
  mpi_decomp.procR[0] = mpi_decomp.procR[1] = -1;
#endif

/* --------------------------------------------------------
   1. Set local grid indices
   -------------------------------------------------------- */

  ibeg   = NGHOST;
  iend   = ibeg + nx - 1;
  nx     = iend - ibeg + 1;
  nx_tot = nx + 2*NGHOST;

  jbeg   = NGHOST;
  jend   = jbeg + ny - 1;
  ny     = jend - jbeg + 1;
  ny_tot = ny + 2*NGHOST;
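/* Example of the resulting indices: in the serial case nx = NX_GLOB = 128,
   so that ibeg = 1, iend = 128 and nx_tot = 130 (interior points plus one
   ghost zone on each side); with a 2 x 2 parallel decomposition each rank
   has nx = ny = 64 and nx_tot = ny_tot = 66.                               */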
/* --------------------------------------------------------
   2. Generate global and local grids
   -------------------------------------------------------- */

  xg = (double *) malloc ( (NX_GLOB+2*NGHOST)*sizeof(double));
  yg = (double *) malloc ( (NY_GLOB+2*NGHOST)*sizeof(double));
  for (i = 0; i < (NX_GLOB+2*NGHOST); i++) xg[i] = xbeg + (i-ibeg+1)*dx;
  for (j = 0; j < (NY_GLOB+2*NGHOST); j++) yg[j] = ybeg + (j-jbeg+1)*dy;

#ifdef PARALLEL
  x = xg + mpi_decomp.start[0];
  y = yg + mpi_decomp.start[1];
#else
  x = xg;
  y = yg;
#endif

/* --------------------------------------------------------
   3. Allocate memory on local processor and
      assign initial conditions.
   -------------------------------------------------------- */

  phi  = Allocate_2DdblArray(ny_tot, nx_tot);
  phi0 = Allocate_2DdblArray(ny_tot, nx_tot);

  for (j = jbeg; j <= jend; j++){
  for (i = ibeg; i <= iend; i++){
    phi0[j][i] = 0.0;
  }}

#ifdef PARALLEL
  MPI_Type_contiguous (nx_tot, MPI_DOUBLE, &row_type);
  MPI_Type_vector     (ny_tot, 1, nx_tot, MPI_DOUBLE, &col_type);
  MPI_Type_commit (&row_type);
  MPI_Type_commit (&col_type);
#endif

/* --------------------------------------------------------
   4. Main iteration cycle
   -------------------------------------------------------- */

  tol = 1.e-5;
  err = 1.0;
  k   = 0;
  while (err > tol){

  /* -- 4a. Set boundary conditions first -- */

    BoundaryConditions(phi0, x, y, nx, ny, &mpi_decomp);

  /* -- 4b. Jacobi's method and residual (interior points) -- */

    err = 0.0;
    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi[j][i] = 0.25*(  phi0[j][i-1] + phi0[j][i+1]
                        + phi0[j-1][i] + phi0[j+1][i] );

      err += dx*dy*fabs(phi[j][i] - phi0[j][i]);
    }}

    for (j = jbeg; j <= jend; j++){
    for (i = ibeg; i <= iend; i++){
      phi0[j][i] = phi[j][i];
    }}

#ifdef PARALLEL
    double err_glob;
    MPI_Allreduce (&err, &err_glob, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    err = err_glob;
#endif

    if (rank == 0){
      printf ("k = %d; err = %8.3e\n",k, err);
    }
    k++;
  }

  WriteSolution (phi, nx, ny, &mpi_decomp);

#ifdef PARALLEL
  MPI_Finalize();
#endif
  return 0;
}

#ifdef PARALLEL
/* ********************************************************************* */
void DomainDecomposition(MPI_Decomp *mpi_decomp)
/*
 *
 *********************************************************************** */
{
  int dim, i;
  int rank, size;
  int *coords  = mpi_decomp->coords;
  int *gsize   = mpi_decomp->gsize;
  int *lsize   = mpi_decomp->lsize;
  int *nprocs  = mpi_decomp->nprocs;
  int *periods = mpi_decomp->periods;
  int *procL   = mpi_decomp->procL;
  int *procR   = mpi_decomp->procR;
  int *start   = mpi_decomp->start;
  int new_coords[NDIM];

/* --------------------------------------------------------
   1. Get rank & size
   -------------------------------------------------------- */

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  mpi_decomp->rank = rank;
  mpi_decomp->size = size;

/* --------------------------------------------------------
   2. Obtain the number of processors along each dimension.
      Use a maximally squared decomposition.
   -------------------------------------------------------- */

  nprocs[0] = (int)sqrt(size);
  nprocs[1] = size/nprocs[0];
  if (nprocs[0]*nprocs[1] != size){
    if (rank == 0) printf ("! Cannot decompose\n");
    MPI_Finalize();
    exit(1);
  }

  if (rank == 0){
    printf ("Decomposition achieved with %d X %d procs\n",nprocs[0],nprocs[1]);
  }

  periods[0] = 0;
  periods[1] = 0;
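  /* Example: size = 4 gives nprocs[] = {2, 2} and size = 8 gives
     nprocs[] = {2, 4}; a prime size such as 7 yields 2*3 != 7, and
     the code aborts with the "! Cannot decompose" message above.   */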
/* --------------------------------------------------------
   3. Create Cartesian topology
   -------------------------------------------------------- */

  MPI_Cart_create(MPI_COMM_WORLD, NDIM, nprocs, periods, 0, &MPI_COMM_CART);
  MPI_Cart_get(MPI_COMM_CART, NDIM, nprocs, periods, coords);

/* --------------------------------------------------------
   4. Fill structure members
   -------------------------------------------------------- */

  gsize[0] = NX_GLOB;
  gsize[1] = NY_GLOB;
  lsize[0] = NX_GLOB/nprocs[0];
  lsize[1] = NY_GLOB/nprocs[1];
  start[0] = coords[0]*lsize[0];
  start[1] = coords[1]*lsize[1];

/* --------------------------------------------------------
   5. Determine ranks of neighbour processors
   -------------------------------------------------------- */

  for (dim = 0; dim < NDIM; dim++) {
    for (i = 0; i < NDIM; i++) new_coords[i] = coords[i];

    new_coords[dim] = coords[dim] + 1;
    if (new_coords[dim] < nprocs[dim]) {
      MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procR[dim]) );
    } else {
      procR[dim] = MPI_PROC_NULL;
    }

    new_coords[dim] = coords[dim] - 1;
    if (new_coords[dim] >= 0) {
      MPI_Cart_rank ( MPI_COMM_CART, new_coords, &(procL[dim]) );
    } else {
      procL[dim] = MPI_PROC_NULL;
    }
  }

/* --------------------------------------------------------
   6. Print processor information.
      (Use MPI_Bcast() to print in sequence)
   -------------------------------------------------------- */

  int proc, go;
  for (proc = 0; proc < size; proc++){
    go = proc;
    MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == go) {
      printf ("[Rank %d]\n",rank);
      printf ("  coords = [%d, %d], lsize = [%d, %d]\n",
                 coords[0], coords[1], lsize[0], lsize[1]);
      for (dim = 0; dim < NDIM; dim++){
        printf ("  (procL, procR)[%d] = %d, %d\n", dim, procL[dim], procR[dim]);
      }
    }
    MPI_Barrier(MPI_COMM_WORLD);
  }

  return;
}
#endif

/* ********************************************************************* */
void BoundaryConditions(double **phi, double *x, double *y,
                        int nx, int ny, MPI_Decomp *mpi_decomp)
/*
 *********************************************************************** */
{
  int i,j;
  int ibeg = NGHOST;
  int iend = ibeg + nx - 1;
  int jbeg = NGHOST;
  int jend = jbeg + ny - 1;

  int *procL = mpi_decomp->procL;
  int *procR = mpi_decomp->procR;
#ifdef PARALLEL
  int rank = mpi_decomp->rank;
  int size = mpi_decomp->size;
  double send_buf[NX_GLOB + 2*NGHOST];
  double recv_buf[NX_GLOB + 2*NGHOST];

/* Used for testing
  for (j = 0; j <= jend+1; j++){
  for (i = 0; i <= iend+1; i++){
    phi[j][i] = -1;
  }}

  for (j = jbeg; j <= jend; j++){
  for (i = ibeg; i <= iend; i++){
    phi[j][i] = rank;
  }}
*/

  // Left buffer
  i = ibeg;
  for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
  MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procL[0], 0,
                recv_buf, jend+1, MPI_DOUBLE, procL[0], 0,
                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  for (j = jbeg; j <= jend; j++) phi[j][i-1] = recv_buf[j];

  // Right buffer
  i = iend;
  for (j = jbeg; j <= jend; j++) send_buf[j] = phi[j][i];
  MPI_Sendrecv (send_buf, jend+1, MPI_DOUBLE, procR[0], 0,
                recv_buf, jend+1, MPI_DOUBLE, procR[0], 0,
                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  for (j = jbeg; j <= jend; j++) phi[j][i+1] = recv_buf[j];

  // Bottom buffer
  j = jbeg;
  for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
  MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procL[1], 0,
                recv_buf, iend+1, MPI_DOUBLE, procL[1], 0,
                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  for (i = ibeg; i <= iend; i++) phi[j-1][i] = recv_buf[i];

  // Top buffer
  j = jend;
  for (i = ibeg; i <= iend; i++) send_buf[i] = phi[j][i];
  MPI_Sendrecv (send_buf, iend+1, MPI_DOUBLE, procR[1], 0,
                recv_buf, iend+1, MPI_DOUBLE, procR[1], 0,
                MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  for (i = ibeg; i <= iend; i++) phi[j+1][i] = recv_buf[i];
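  /* Note: on ranks that touch a physical boundary the corresponding
     procL[]/procR[] entry is MPI_PROC_NULL, so the MPI_Sendrecv() calls
     above degenerate into no-ops there; those ghost zones are filled
     by the Dirichlet conditions assigned below.                        */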
#endif

/* -- Left -- */

  if (procL[0] < 0){
    i = ibeg-1;
    for (j = jbeg; j <= jend; j++) phi[j][i] = 1.0-y[j];
  }

/* -- Right -- */

  if (procR[0] < 0){
    i = iend+1;
    for (j = jbeg; j <= jend; j++) phi[j][i] = y[j]*y[j];
  }

/* -- Bottom -- */

  if (procL[1] < 0){
    j = jbeg-1;
    for (i = ibeg; i <= iend; i++) phi[j][i] = 1.0-x[i];
  }

/* -- Top -- */

  if (procR[1] < 0){
    j = jend+1;
    for (i = ibeg; i <= iend; i++) phi[j][i] = x[i];
  }

  return;

#ifdef PARALLEL
// Print
  MPI_Barrier(MPI_COMM_WORLD);
  int go, proc;
  for (proc = 0; proc < size; proc++){
    go = proc;
    MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
    if (rank == go) {
      printf ("Boundary [Rank %d]\n",rank);
      for (j = jend+1; j >= 0; j--){
        for (i = 0; i <= iend+1; i++){
          printf ("%6.2f ", phi[j][i]);
        }
        printf ("\n");
      }
    }
  }
  MPI_Finalize();
  exit(0);
#endif
}

/* ********************************************************************* */
void WriteSolution (double **phi, int nx, int ny, MPI_Decomp *md)
/*
 *********************************************************************** */
{
  int i,j;
  int ibeg = NGHOST;
  int iend = ibeg + nx - 1;
  int jbeg = NGHOST;
  int jend = jbeg + ny - 1;
  static int nfile = 0;
  char fname[32];

  sprintf (fname,"laplace2D_%02d.bin",nfile);

/*
  for (j = jbeg-1; j <= jend+1; j++) for (i = ibeg-1; i <= iend+1; i++) {
    phi[j][i] = -1;
  }
  for (j = jbeg; j <= jend; j++) for (i = ibeg; i <= iend; i++) {
    phi[j][i] = md->rank;
  }
*/
#ifdef PARALLEL
  MPI_File fh;
  MPI_Datatype type_local, type_domain;
  int amode = MPI_MODE_CREATE | MPI_MODE_WRONLY;
  int gsize[2], lsize[2], start[2];

/* --------------------------------------------------------
   1. Create a local array type without the ghost zones.
      This datatype will be passed to MPI_File_write()
   -------------------------------------------------------- */

  gsize[0] = md->lsize[0] + 2*NGHOST;
  gsize[1] = md->lsize[1] + 2*NGHOST;
  lsize[0] = md->lsize[0];
  lsize[1] = md->lsize[1];
  start[0] = NGHOST;
  start[1] = NGHOST;

  MPI_Type_create_subarray (NDIM, gsize, lsize, start,
                            MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_local);
  MPI_Type_commit (&type_local);

/* --------------------------------------------------------
   2. Create the subarray in the global domain.
      This datatype is used to set the file view.
   -------------------------------------------------------- */

  gsize[0] = NX_GLOB;
  gsize[1] = NY_GLOB;
  lsize[0] = md->lsize[0];
  lsize[1] = md->lsize[1];
  start[0] = lsize[0]*md->coords[0];   // equal to md->start[0]
  start[1] = lsize[1]*md->coords[1];   // equal to md->start[1]

  MPI_Type_create_subarray (NDIM, gsize, lsize, start,
                            MPI_ORDER_FORTRAN, MPI_DOUBLE, &type_domain);
  MPI_Type_commit (&type_domain);

/* --------------------------------------------------------
   3. Write to disk
   -------------------------------------------------------- */

  MPI_File_delete(fname, MPI_INFO_NULL);
  MPI_File_open(MPI_COMM_CART, fname, amode, MPI_INFO_NULL, &fh);
  MPI_File_set_view(fh, 0, MPI_DOUBLE, type_domain, "native", MPI_INFO_NULL);
  MPI_File_write_all(fh, phi[0], 1, type_local, MPI_STATUS_IGNORE);
  MPI_File_close(&fh);

  MPI_Type_free (&type_local);
  MPI_Type_free (&type_domain);
#else
  FILE *fp;
  printf ("> Writing %s\n",fname);
  fp = fopen(fname, "wb");
  for (j = jbeg; j <= jend; j++){
    fwrite (phi[j] + ibeg, sizeof(double), nx, fp);
  }
  fclose(fp);
#endif

  nfile++;
}
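/* ------------------------------------------------------------------------
   Reading the output back: both the serial and the parallel version produce
   a single file of NX_GLOB x NY_GLOB doubles in native byte order, with the
   x-index varying fastest. A minimal standalone reader (separate program,
   illustrative sketch only; file name assumes the first output file and the
   default grid size above) could look like:

     #include <stdio.h>
     #include <stdlib.h>

     #define NX_GLOB  128
     #define NY_GLOB  128

     int main(void)
     {
       static double buf[NY_GLOB][NX_GLOB];   // buf[j][i], i = x varies fastest
       FILE *fp = fopen("laplace2D_00.bin", "rb");
       if (fp == NULL) { perror("laplace2D_00.bin"); return 1; }

       size_t nread = fread (buf, sizeof(double), (size_t)NX_GLOB*NY_GLOB, fp);
       fclose(fp);
       if (nread != (size_t)NX_GLOB*NY_GLOB) {
         printf ("! Incomplete read\n");
         return 1;
       }

       double phi_min = buf[0][0], phi_max = buf[0][0];
       for (int j = 0; j < NY_GLOB; j++){
       for (int i = 0; i < NX_GLOB; i++){
         if (buf[j][i] < phi_min) phi_min = buf[j][i];
         if (buf[j][i] > phi_max) phi_max = buf[j][i];
       }}
       printf ("phi in [%8.3e, %8.3e]\n", phi_min, phi_max);
       return 0;
     }
   ------------------------------------------------------------------------ */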