// Matrix-Matrix Multiply, AB = C using PESSL (ScaLAPACK)
// filename: ex5.C
// to use IBM PESSL library
// compile: mpCC -o ex5 -lblacs -lpessl -DIBM ex5.C
// run: poe+ ./ex5 -nodes 1 -procs 4
// to use ScaLAPACK library
// compile: module load scalapack
// mpCC -o ex5 $PBLAS $BLACS $SCALAPACK -lessl -lxlf90 ex5.C
// to use ScaLAPACK and VAMPIR (which does not work with PESSL)
// compile: module load scalapack
// module load vampir vampirtrace
// mpCC -o ex5 $PBLAS $BLACS $SCALAPACK -lessl -lxlf90 $VTINC $VTLIB ex5.C
// prow number of rows in proc grid set to 2
// pcol number of columns in proc grid set to 2
// n number of rows/columns in matrix A set to 1000
// nb matrix distribution block size set to 16
// output: stdout
#include <stdio.h>
#include <stdlib.h>
#include <iostream.h>
#ifdef IBM
#include "pessl.h"
#else
#include <blacs.h>
extern "FORTRAN" {
int numroc(int *, int *, int *, int *, int *);
void pdgemm( char *, char *, int *, int *, int *, double *, double *,
int *, int *, int *, double *, int *, int *, int *, double *,
double *, int *, int *, int * );
}
#endif
extern "FORTRAN" {
void blacs_pinfo(const int *, const int *);
void blacs_get(const int *, const int *, const int *);
void blacs_gridinit(const int *, char *, const int *, const int *);
void blacs_gridinfo(const int *, const int *, const int *, const int *, const int *);
void blacs_gridmap(const int *, int *, const int *, const int *, const int *);
void blacs_gridexit(const int *);
void blacs_exit(const int *);
}
int g2lp (int,int,int,int);
int g2li (int,int,int,int);
int main () {
int n=1000, nb=16; // problem size and block size
int myzero=0, myone=1;
int myArows, myAcols; // size of local subset of global array
int i,j, igrid,jgrid, iproc,jproc, myi,myj, p;
double mydzero=0., mydone=1.;
double *myA,*myB,*myC; // local data arrays
int me, procs, icontxt, prow=2, pcol=2, myrow, mycol; // blacs data
int info; // pessl return value
int ides_a[9], ides_b[9], ides_c[9]; // pessl array desc
char ascii_t='t', ascii_n='n', ascii_r='r';
// Initialize blacs processor grid
blacs_pinfo (&me, &procs);
blacs_get (&myzero, &myzero, &icontxt);
blacs_gridinit(&icontxt, &ascii_r, &prow, &pcol);
blacs_gridinfo(&icontxt, &prow, &pcol, &myrow, &mycol);
cout << "Proc: " << me << " is (" << myrow << "," << mycol << ") ";
cout << " in p-array" << endl ;
// Construct local arrays
// Global structure: matrix A of n rows and n columns
// matrix B of n rows and n column
// matrix C of n rows and n column
// if(me==0)printf("Size of global array is %d x %d\n", n, n);
// if(me==0)printf("Size of block is %d x %d\n", nb, nb);
#ifdef IBM
myArows = numroc(n, nb, myrow, 0, prow);
myAcols = numroc(n, nb, mycol, 0, pcol);
#else
myArows = numroc(&n, &nb, &myrow, &myzero, &prow);
myAcols = numroc(&n, &nb, &mycol, &myzero, &pcol);
#endif
// Initialize local arrays
myA = (double *) malloc(sizeof(double)*myArows*myAcols);
myB = (double *) malloc(sizeof(double)*myArows*myAcols);
myC = (double *) malloc(sizeof(double)*myArows*myAcols);
for (i=0;i<n;i++)
{
iproc = g2lp(i,n,prow,nb);
myi = g2li(i,n,prow,nb);
if (myrow==iproc)
{
for (j=0;j<n;j++)
{
jproc = g2lp(j,n,pcol,nb);
myj = g2li(j,n,pcol,nb);
if (mycol==jproc)
{
myA[(myj*myArows)+myi] = (double)(i+j+2);
myB[(myj*myArows)+myi] = (double)(j-i);
myC[(myj*myArows)+myi] = 0.;
}
}
}
}
// Prepare array descriptors for PESSL (ScaLAPACK style)
ides_a[0] = 1; // descriptor type
ides_a[1] = icontxt; // blacs context
ides_a[2] = n; // global number of rows
ides_a[3] = n; // global number of columns
ides_a[4] = nb; // row block size
ides_a[5] = nb; // column block size
ides_a[6] = 0; // initial process row
ides_a[7] = 0; // initial process column
ides_a[8] = myArows; // leading dimension of local array
for (i=0;i<9;i++)
{
ides_b[i] = ides_a[i];
ides_c[i] = ides_a[i];
}
// Call PESSL library routine
#ifdef IBM
pdgemm("t","n",n,n,n,1.0,myA,1,1,ides_a,myB,1,1,ides_b,0.,myC,1,1,ides_c);
#else
pdgemm(&ascii_t,&ascii_n,&n,&n,&n,&mydone,myA,&myone,&myone,ides_a,myB,&myone,&myone,ides_b,&mydzero,myC,&myone,&myone,ides_c);
#endif
// Print results
iproc = g2lp(n-1,n,prow,nb);
myi = g2li(n-1,n,prow,nb);
jproc = g2lp(n-1,n,pcol,nb);
myj = g2li(n-1,n,pcol,nb);
if ((myrow==iproc) && (mycol==jproc))
printf("c(%d,%d)=%f\n", n, n, myC[(myj*myArows)+myi]);
// Deallocate the local arrays
free(myA);
free(myB);
free(myC);
// End blacs for processors that are used
blacs_gridexit(&icontxt);
blacs_exit(&myzero);
return 0;
}
// convert global index to local index in block-cyclic distribution
int g2lp(int i,int n,int np,int nb)
{
// i global array index, input
// n global array dimension, input
// np processor array dimension, input
// nb block size, input
// p processor array index, return
return (i/nb)%np;
}
int g2li(int i,int n,int np,int nb)
{
// i global array index, input
// n global array dimension, input
// np processor array dimension, input
// nb block size, input
// il local array index, return
return (i/(np*nb))*nb + (i%nb);
}
|