// Matrix-Matrix Multiply, AB = C using PESSL (ScaLAPACK)
// filename:  ex5.C
 
// to use IBM PESSL library
// compile:   mpCC -o ex5 -lblacs -lpessl -DIBM ex5.C
// run:       poe+ ./ex5 -nodes 1 -procs 4
 
// to use ScaLAPACK library
// compile:   module load scalapack
//            mpCC -o ex5 $PBLAS $BLACS $SCALAPACK -lessl -lxlf90 ex5.C
 
// to use ScaLAPACK and VAMPIR (which does not work with PESSL)
// compile:   module load scalapack
//            module load vampir vampirtrace
//            mpCC -o ex5 $PBLAS $BLACS $SCALAPACK -lessl -lxlf90 $VTINC $VTLIB ex5.C
 
//              prow    number of rows in proc grid set to 2
//              pcol    number of columns in proc grid set to 2
//              n       number of rows/columns in matrix A set to 1000
//              nb      matrix distribution block size set to 16
 
//  output:   stdout  
 
 
#include <stdio.h>
#include <stdlib.h>
#include <iostream.h>
#ifdef IBM
 #include "pessl.h"
#else
 #include <blacs.h>
 extern "FORTRAN" {
  int numroc(int *, int *, int *, int *, int *);
  void pdgemm( char *, char *, int *, int *, int *, double *, double *,
         int *, int *, int *, double *, int *, int *, int *, double *,
         double *, int *, int *, int * );
 }
#endif
 
extern "FORTRAN" {
 void blacs_pinfo(const int *, const int *);
 void blacs_get(const int *, const int *, const int *);
 void blacs_gridinit(const int *, char *, const int *, const int *);
 void blacs_gridinfo(const int *, const int *, const int *, const int *, const int *);
 void blacs_gridmap(const int *, int *, const int *, const int *, const int *);
 void blacs_gridexit(const int *);
 void blacs_exit(const int *);
}
 
int g2lp (int,int,int,int);
int g2li (int,int,int,int);
 
int main () {
 
      int n=1000, nb=16;              // problem size and block size 
      int myzero=0, myone=1;
      int myArows, myAcols;   // size of local subset of global array 
      int i,j, igrid,jgrid, iproc,jproc, myi,myj, p;
      double mydzero=0., mydone=1.;
      double *myA,*myB,*myC;  // local data arrays 
 
      int me, procs, icontxt, prow=2, pcol=2, myrow, mycol; // blacs data 
      int info;   // pessl return value 
      int ides_a[9], ides_b[9], ides_c[9]; // pessl array desc 
      char ascii_t='t', ascii_n='n', ascii_r='r';
 
// Initialize blacs processor grid 
 
      blacs_pinfo   (&me, &procs);
      blacs_get     (&myzero, &myzero, &icontxt);
      blacs_gridinit(&icontxt, &ascii_r, &prow, &pcol);
      blacs_gridinfo(&icontxt, &prow, &pcol, &myrow, &mycol);
 
      cout << "Proc: " << me << " is (" << myrow << "," << mycol << ") ";
      cout << " in p-array" << endl ;
 
// Construct local arrays
// Global structure:  matrix A of n rows and n columns
//                    matrix B of n rows and n column
//                    matrix C of n rows and n column 
 
//    if(me==0)printf("Size of global array is %d x %d\n", n, n);
//    if(me==0)printf("Size of block is        %d x %d\n", nb, nb);
 
 
 
#ifdef IBM      
      myArows = numroc(n, nb, myrow, 0, prow);
      myAcols = numroc(n, nb, mycol, 0, pcol);
#else
      myArows = numroc(&n, &nb, &myrow, &myzero, &prow);
      myAcols = numroc(&n, &nb, &mycol, &myzero, &pcol);
#endif
 
// Initialize local arrays  
 
      myA = (double *) malloc(sizeof(double)*myArows*myAcols);
      myB = (double *) malloc(sizeof(double)*myArows*myAcols);
      myC = (double *) malloc(sizeof(double)*myArows*myAcols);
 
      for (i=0;i<n;i++)
      {
         iproc = g2lp(i,n,prow,nb);
         myi   = g2li(i,n,prow,nb);
         if (myrow==iproc)
         {
            for (j=0;j<n;j++)
            {
            jproc = g2lp(j,n,pcol,nb);
            myj   = g2li(j,n,pcol,nb);
               if (mycol==jproc)
               {
                  myA[(myj*myArows)+myi] = (double)(i+j+2);
                  myB[(myj*myArows)+myi] = (double)(j-i);
                  myC[(myj*myArows)+myi] = 0.;
               }
            }
         }
      }
 
// Prepare array descriptors for PESSL (ScaLAPACK style) 
 
      ides_a[0] = 1;        // descriptor type 
      ides_a[1] = icontxt;  // blacs context 
      ides_a[2] = n;        // global number of rows 
      ides_a[3] = n;        // global number of columns 
      ides_a[4] = nb;       // row block size 
      ides_a[5] = nb;       // column block size 
      ides_a[6] = 0;        // initial process row 
      ides_a[7] = 0;        // initial process column 
      ides_a[8] = myArows;  // leading dimension of local array 
 
      for (i=0;i<9;i++)
      {
         ides_b[i] = ides_a[i];
         ides_c[i] = ides_a[i];
      }
 
// Call PESSL library routine 
 
#ifdef IBM      
     pdgemm("t","n",n,n,n,1.0,myA,1,1,ides_a,myB,1,1,ides_b,0.,myC,1,1,ides_c);
#else
     pdgemm(&ascii_t,&ascii_n,&n,&n,&n,&mydone,myA,&myone,&myone,ides_a,myB,&myone,&myone,ides_b,&mydzero,myC,&myone,&myone,ides_c);
#endif
 
// Print results 
 
      iproc = g2lp(n-1,n,prow,nb);
      myi   = g2li(n-1,n,prow,nb);
      jproc = g2lp(n-1,n,pcol,nb);
      myj   = g2li(n-1,n,pcol,nb);
      if ((myrow==iproc) && (mycol==jproc))
         printf("c(%d,%d)=%f\n", n, n, myC[(myj*myArows)+myi]);
 
// Deallocate the local arrays 
 
      free(myA);
      free(myB);
      free(myC);
 
// End blacs for processors that are used 
 
      blacs_gridexit(&icontxt);
      blacs_exit(&myzero);
 
      return 0; 
}
 
// convert global index to local index in block-cyclic distribution 
 
int g2lp(int i,int n,int np,int nb)
{
 
//  i      global array index, input
//  n      global array dimension, input
//  np     processor array dimension, input
//  nb     block size, input
//  p      processor array index, return
 
   return (i/nb)%np;
}
 
int g2li(int i,int n,int np,int nb)
{
 
//  i      global array index, input
//  n      global array dimension, input
//  np     processor array dimension, input
//  nb     block size, input
//  il     local array index, return
 
   return (i/(np*nb))*nb + (i%nb);
}


Close This Window