
PxDistribution.h

00001 /*
00002  *  Copyright (c) 2003-2004, University of Amsterdam, The Netherlands.
00003  *  All rights reserved.
00004  *
00005  *  Author(s):
00006  *  Frank Seinstra <fjseins@wins.uva.nl>
00007  */
00008 
00009 #ifndef __PxDistribution_h_
00010 #define __PxDistribution_h_
00011 
00012 
00013 #include "Core/Array/Pattern/ArrayFunc.h"
00014 #include "Core/Array/Pattern/PxArrayFunc.h"
00015 #include "Core/Array/Pattern/PxPartition.h"
00016 #include "Core/Array/Pattern/PxSystem.h"
00017 #include "mpi.h"
00018 
00019 #include "PxxStream.h"
00020 
00021 namespace Impala
00022 {
00023 namespace Core
00024 {
00025 namespace Array
00026 {
00027 namespace Pattern
00028 {
00029 
00030 
00031 /*** Enumerations *****************************************************/
00032 
00033 enum tags   { INIT_TAG, BCAST_TAG, RDUCE_TAG,   // message tags
00034               SCAT_TAG, GATH_TAG, RDIST_TAG, BOR_TAG };
00035 enum dists  { PX_OFT, PX_SBT, PX_MPI };                 // distribution types
00036 enum dirs   { TO, FROM };                       // direction types
00037 enum planes { YZ_PLANE, XZ_PLANE, XY_PLANE };   // plane types
00038 
00039 
00040 /*** Global variables *************************************************/
00041 
00042 static int  _myCPU;     /* Local CPU number */
00043 static int  _nrCPUs;    /* Total nr of CPUs */
00044 static int  _logCPUs;   /* Shortest CPU->CPU path in hypercube */
00045 static int  _maxCPUs;   /* Max nr of CPUs in complete hypercube */
00046 
00047 static int  dp = 0;     /* Debug print flag */
00048 
00049 /**********************************************************************
00050  *** Distribution initialization & helper functions                 ***
00051  **********************************************************************/
00052 
00053 inline void
00054 PxInitDistribution(int aw, int ah, int ad,
00055                    int xcpus, int ycpus, int zcpus)
00056 {
00057     _myCPU   = PxMyCPU();
00058     _nrCPUs  = PxNrCPUs();
00059     _logCPUs = log((double)_nrCPUs)/log(2.0);
00060     _maxCPUs = pow((double)2, _logCPUs);
00061     if (_maxCPUs < _nrCPUs) {
00062         _logCPUs++;
00063         _maxCPUs *= 2;
00064     }
00065     PxInitPartition(aw, ah, ad, xcpus, ycpus, zcpus);
00066 }
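
For example, with _nrCPUs = 6 the truncated log gives _logCPUs = 2 and _maxCPUs = 4; since 4 < 6, the correction step raises them to _logCPUs = 3 and _maxCPUs = 8, the smallest enclosing hypercube. With _nrCPUs = 8 (an exact hypercube) the values are 3 and 8 directly.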
00067 
00068 
00069 template<class ArrayT>
00070 static inline void
00071 PxLclArrayCopy(ArrayT* loc, int dir, ArrayT* blk, int root, bool bdata)
00072 {
00073     /*** Copy local array (loc) TO/FROM 'head' of block array (blk) ***/
00074 
00075     typedef typename ArrayT::StorType storT;
00076     storT* lPtr = (bdata) ? ArrayPB(loc) : ArrayCPB(loc);
00077     storT* bPtr = (bdata) ? ArrayPB(blk) : ArrayCPB(blk);
00078     int eSize = ArrayT::ElemSize();
00079 
00080     if (_myCPU == root) {
00081         bPtr = ArrayPB(blk) + eSize*(PxLclStart(_myCPU,
00082                ArrayBW(blk), ArrayBH(blk), ArrayBD(blk)));
00083         if (bdata) {
00084             bPtr -= (ArrayW(blk)*ArrayH(blk)*ArrayBD(blk) +
00085                 ArrayW(blk)*ArrayBH(blk) + ArrayBW(blk)) * eSize;
00086         }
00087     }
00088 
00089     int width  = (bdata) ? ArrayW(loc) : ArrayCW(loc);
00090     int height = (bdata) ? ArrayH(loc) : ArrayCH(loc);
00091     int depth  = (bdata) ? ArrayD(loc) : ArrayCD(loc);
00092 
00093     for (int z=0; z<depth; z++) {
00094         for (int y=0; y<height; y++) {
00095             storT* locPtr = lPtr + (ArrayW(loc)*ArrayH(loc)*z +
00096                                     ArrayW(loc)*y) * eSize;
00097             storT* blkPtr = bPtr + (ArrayW(blk)*ArrayH(blk)*z +
00098                                     ArrayW(blk)*y) * eSize;
00099             for (int x=0; x<width*eSize; x++) {
00100                 (dir == TO) ? (*blkPtr++ = *locPtr++) :
00101                               (*locPtr++ = *blkPtr++);
00102             }
00103         }
00104     }
00105 }
00106 
00107 
00108 static inline int
00109 PxResponsibilityRange(int cpuNr)
00110 {
00111     /*** Calculate the number of CPUs that 'cpuNr' is responsible ***/
00112     /*** for in hypercube (SBT) collective communication.         ***/
00113 
00114     int range = 1;
00115 
00116     for (int i=0; i<_logCPUs; i++) {
00117         if ((cpuNr & range) == range) {
00118             return range;
00119         }
00120         range <<= 1;
00121     }
00122     return range;
00123 }
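
For example, with _nrCPUs = _maxCPUs = 8 the returned range equals the lowest set bit of 'cpuNr': CPUs 1, 3, 5 and 7 get range 1, CPUs 2 and 6 get range 2, CPU 4 gets range 4, and CPU 0 falls through the loop and is responsible for the full range of 8.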
00124 
00125 
00126 static inline int
00127 PxLineRange(int dimension, int firstIndex, int *order)
00128 {
00129     /*** Calculate the number of 'lines' to be sent/received in    ***/
00130     /*** SBT collective communication. The CPU at 'firstIndex' in  ***/
00131     /*** the SBTarray 'order' is responsible for a range of CPUs,  ***/
00132     /*** and will send and receive data for that range.            ***/
00133 
00134     int minNrLines = 0;
00135     int overflow = 0;
00136 
00137     switch (dimension) {
00138         case 1: if (PxLogPartXcpus() == 1) {    // get nr of 'columns'
00139                     return PxFullWidth();
00140                 }
00141                 minNrLines = PxMinLclWidth();
00142                 overflow = PxOverflowX();
00143                 break;
00144         case 2: if (PxLogPartYcpus() == 1) {    // get nr of 'rows'
00145                     return PxFullHeight();
00146                 }
00147                 minNrLines = PxMinLclHeight();
00148                 overflow = PxOverflowY();
00149                 break;
00150         case 3: if (PxLogPartZcpus() == 1) {    // get nr of 'pages'
00151                     return PxFullDepth();
00152                 }
00153                 minNrLines = PxMinLclDepth();
00154                 overflow = PxOverflowZ();
00155                 break;
00156         default:
00157                 return 0;
00158     }
00159 
00160     int lastIndex = firstIndex + PxResponsibilityRange(firstIndex) - 1;
00161     int nrIndexes = lastIndex - firstIndex + 1;
00162     int firstCPU = order[firstIndex];
00163     if (firstIndex == 0) {
00164         firstCPU = (firstCPU / 4) * 4;
00165     }
00166     int lastCPU = firstCPU + nrIndexes - 1;
00167     if (lastCPU >= _nrCPUs) {
00168         lastCPU = _nrCPUs - 1;
00169     }
00170 
00171     /*** Normalize CPU numbers to grid indices in correct dimension ***/
00172 
00173     switch (dimension) {
00174         case 1: firstCPU = PxGridIndexX(firstCPU);
00175                 lastCPU  = PxGridIndexX(lastCPU);
00176                 break;
00177         case 2: firstCPU = PxGridIndexY(firstCPU);
00178                 lastCPU  = PxGridIndexY(lastCPU);
00179                 break;
00180         case 3: firstCPU = PxGridIndexZ(firstCPU);
00181                 lastCPU  = PxGridIndexZ(lastCPU);
00182                 break;
00183     }
00184     int nrBufCPUs = lastCPU - firstCPU + 1;
00185     if (firstCPU >= overflow) {
00186         return (minNrLines * nrBufCPUs);
00187     } else if (lastCPU < overflow) {
00188         return ((minNrLines + 1) * nrBufCPUs);
00189     } else {
00190         return (minNrLines * nrBufCPUs + overflow - firstCPU);
00191     }
00192 }
00193 
00194 
00195 static inline void
00196 PxGetSBTorder(int root, int orderLength, int *order, int *myIndex)
00197 {
00198     /*** Determine ordering of CPUs in Spanning Binomial Tree ***/
00199 
00200     if (root >= _nrCPUs) {
00201         return;
00202     }
00203     *myIndex    = 0;
00204     order[0]    = root;
00205     int rootTmp = root;
00206     int index   = orderLength / 2;
00207 
00208     for (int i=0; i<_logCPUs; i++) {
00209         if (rootTmp < index) {
00210             for (int j=0; j<index; j++) {
00211                 int cpu = index+j + root-rootTmp;
00212                 if (cpu == _myCPU) {
00213                     *myIndex = index+j;
00214                 }
00215                 order[index+j] = cpu;
00216             }
00217         } else {
00218             for (int j=0; j<index; j++) {
00219                 int cpu = j + root-rootTmp;
00220                 if (cpu == _myCPU) {
00221                     *myIndex = index+j;
00222                 }
00223                 order[index+j] = cpu;
00224             }
00225             rootTmp -= index;
00226         }
00227         index /= 2;
00228     }
00229 }
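
For example, with root = 0 and _maxCPUs = 8 the resulting 'order' is simply 0,1,...,7 and each CPU's 'myIndex' equals its rank; for a non-zero root, the root is placed at index 0 and the remaining CPUs are permuted around it, so that the XOR-based partner selection in the routines below forms a binomial tree rooted at 'root'.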
00230 
00231 
00232 /**********************************************************************
00233  *** Communication initialization                                   ***
00234  **********************************************************************/
00235 
00236 inline void
00237 PxInitCommunication()
00238 {
00239     char        dummy;
00240     MPI_Status  stat;
00241 
00242     /*** Send up and receive down ***/
00243 
00244     if (PxMyUpCPU() != -1) {
00245         MPI_Send(&dummy, 1, MPI_BYTE,
00246                         PxMyUpCPU(), INIT_TAG, MPI_COMM_WORLD);
00247     }
00248     if (PxMyDownCPU() != -1) {
00249         MPI_Recv(&dummy, 1, MPI_BYTE,
00250                         PxMyDownCPU(), INIT_TAG, MPI_COMM_WORLD, &stat);
00251 
00252     /*** Send down and receive up ***/
00253 
00254         MPI_Send(&dummy, 1, MPI_BYTE,
00255                         PxMyDownCPU(), INIT_TAG, MPI_COMM_WORLD);
00256     }
00257     if (PxMyUpCPU() != -1) {
00258         MPI_Recv(&dummy, 1, MPI_BYTE,
00259                         PxMyUpCPU(), INIT_TAG, MPI_COMM_WORLD, &stat);
00260     }
00261 }
00262 
00263 
00264 
00265 /**********************************************************************
00266  *** Broadcast Value                                                ***
00267  **********************************************************************/
00268 
00269 template<class ArithT>
00270 static inline ArithT
00271 PxBcastValueOFT(ArithT val, int root)
00272 {
00273     /*** Broadcast value using One-level Flat Tree ***/
00274 
00275     MPI_Datatype elem;
00276     MPI_Status stat;
00277     MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
00278     MPI_Type_commit(&elem);
00279 
00280     if (_myCPU == root) {               // send value to all other CPUs
00281         for (int partner=0; partner<_nrCPUs; partner++) {
00282             if (partner != _myCPU) {
00283                 MPI_Send(&val,
00284                          1, elem, partner, BCAST_TAG, MPI_COMM_WORLD);
00285             }
00286         }
00287     } else {                            // receive value from root
00288         MPI_Recv(&val, 1, elem, root, BCAST_TAG, MPI_COMM_WORLD, &stat);
00289     }
00290     MPI_Type_free(&elem);
00291     return val;
00292 }
00293 
00294 
00295 template<class ArithT>
00296 static inline ArithT
00297 PxBcastValueSBT(ArithT val, int root)
00298 {
00299     /*** Determine ordering of CPUs in Spanning Binomial Tree ***/
00300 
00301     int *order = new int[_maxCPUs], myIndex;
00302     PxGetSBTorder(root, _maxCPUs, order, &myIndex);
00303 
00304     /*** Broadcast value using Spanning Binomial Tree ***/
00305 
00306     MPI_Datatype elem;
00307     MPI_Status stat;
00308     MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
00309     MPI_Type_commit(&elem);
00310 
00311     int mask = 1 << (_logCPUs-1);
00312     for (int i=0; i<_logCPUs; i++) {
00313         int partnerIndex = myIndex ^ mask;
00314         int partner = order[partnerIndex];
00315         if ((myIndex % mask == 0) && (partner < _nrCPUs)) {
00316             if (myIndex < partnerIndex) {
00317                 MPI_Send(&val, 1, elem,
00318                          partner, BCAST_TAG, MPI_COMM_WORLD);
00319             } else {
00320                 MPI_Recv(&val, 1, elem,
00321                          partner, BCAST_TAG, MPI_COMM_WORLD, &stat);
00322             }
00323         }
00324         mask >>= 1;
00325     }
00326     MPI_Type_free(&elem);
00327     delete [] order;
00328     return val;
00329 }
00330 
00331 
00332 template<class ArithT>
00333 static inline ArithT
00334 PxBcastValueMPI(ArithT val, int root)
00335 {
00336     /*** MPI-Bcast. May be faster than OFT/SBT broadcast provided   ***/
00337     /*** above as specific communication subsystem capabilities may ***/
00338     /*** be incorporated in the implementation.                     ***/
00339 
00340     MPI_Datatype elem;
00341     MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
00342     MPI_Type_commit(&elem);
00343     MPI_Bcast(&val, 1, elem, root, MPI_COMM_WORLD);
00344     MPI_Type_free(&elem);
00345     return val;
00346 }
00347 
00348 
00349 template<class ArithT>
00350 inline ArithT
00351 PxBcastValue(ArithT val, int root=0, int dtype=PX_MPI)
00352 {
00353     /*** root  : rank of broadcast root node                        ***/
00354     /*** dtype : distribution type (PX_OFT, PX_SBT, or PX_MPI)      ***/
00355     
00356     switch (dtype) {
00357         case PX_OFT : return PxBcastValueOFT(val, root); break;
00358         case PX_SBT : return PxBcastValueSBT(val, root); break;
00359         case PX_MPI :
00360         default         : return PxBcastValueMPI(val, root);
00361     }
00362 }
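
A minimal usage sketch (assuming PxInitDistribution() has been called and MPI is up; the variable name and value are placeholders):

    double threshold = 0.0;
    if (PxMyCPU() == 0) {
        threshold = 0.75;                   // value known only on the root
    }
    threshold = PxBcastValue(threshold, 0, PX_SBT);
    // every CPU now holds the root's value; PX_OFT or PX_MPI would select
    // the flat-tree or native MPI_Bcast variant instead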
00363 
00364 
00365 /**********************************************************************
00366  *** Reduce Value to Root                                           ***
00367  **********************************************************************/
00368 
00369 template<class ArithT, class RedOpT>
00370 static inline void
00371 PxReduce(ArithT *in, ArithT *inout, int *len, MPI_Datatype *dptr)
00372 {
00373     /*** This is the user defined function as MPI likes to have it ***/
00374 
00375     RedOpT op;
00376     op.DoIt(*inout, *in);
00377 }
00378 
00379 
00380 template<class ArithT, class RedOpT>
00381 static inline ArithT
00382 PxReduceValueToRootOFT(ArithT val, RedOpT redOp, int root)
00383 {
00384     /*** Reduce value to root using One-level Flat Tree ***/
00385 
00386     ArithT result = ArithT(val);
00387     MPI_Datatype elem;
00388     MPI_Status stat;
00389     MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
00390     MPI_Type_commit(&elem);
00391 
00392     if (_myCPU == root) {           // receive value from all other CPUs
00393         for (int partner=0; partner<_nrCPUs; partner++) {
00394             if (partner != _myCPU) {
00395                 ArithT recvVal;
00396                 MPI_Recv(&recvVal, 1, elem,
00397                          partner, RDUCE_TAG, MPI_COMM_WORLD, &stat);
00398                 redOp.DoIt(result, recvVal);
00399             }
00400         }
00401     } else {                        // send value to root
00402         MPI_Send(&val, 1, elem, root, RDUCE_TAG, MPI_COMM_WORLD);
00403     }
00404     MPI_Type_free(&elem);
00405     return result;
00406 }
00407 
00408 
00409 template<class ArithT, class RedOpT>
00410 static inline ArithT
00411 PxReduceValueToRootSBT(ArithT val, RedOpT redOp, int root)
00412 {
00413     /*** Determine ordering of CPUs in Spanning Binomial Tree ***/
00414 
00415     int *order = new int[_maxCPUs], myIndex;
00416     PxGetSBTorder(root, _maxCPUs, order, &myIndex);
00417 
00418     /*** Reduce value to root using Spanning Binomial Tree ***/
00419 
00420     ArithT result = ArithT(val);
00421     MPI_Datatype elem;
00422     MPI_Status stat;
00423     MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
00424     MPI_Type_commit(&elem);
00425 
00426     int mask = 1;
00427     for (int i=0; i<_logCPUs; i++) {
00428         int partnerIndex = myIndex ^ mask;
00429         int partner = order[partnerIndex];
00430         if ((myIndex % mask == 0) && (partner < _nrCPUs)) {
00431             if (myIndex > partnerIndex) {
00432                 MPI_Send(&result, 1, elem,
00433                          partner, RDUCE_TAG, MPI_COMM_WORLD);
00434             } else {
00435                 ArithT recvVal;
00436                 MPI_Recv(&recvVal, 1, elem,
00437                          partner, RDUCE_TAG, MPI_COMM_WORLD, &stat);
00438                 redOp.DoIt(result, recvVal);
00439             }
00440         }
00441         mask <<= 1;
00442     }
00443     MPI_Type_free(&elem);
00444     delete [] order;
00445     return result;
00446 }
00447 
00448 
00449 template<class ArithT, class RedOpT>
00450 static inline ArithT
00451 PxReduceValueToRootMPI(ArithT val, RedOpT redOp, int root)
00452 {
00453     /*** MPI-based global reduction NOT using any of the predefined ***/
00454     /*** reduce operations supported by MPI. At all times, a handle ***/
00455     /*** to a correctly instantiated (Horus) reduction operation is ***/
00456     /*** passed as a parameter to MPI_Reduce(...).                  ***/
00457 
00458     int commute = 1;
00459     ArithT result = RedOpT::NeutralElement();
00460     MPI_Op operation;
00461     MPI_Datatype elem;
00462     MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
00463     MPI_Type_commit(&elem);
00464 
00465     /*** Make instantiation of PxReduce, bind to MPIop handle & run ***/
00466 
00467     void (*reduceOp)(ArithT*, ArithT*, int*, MPI_Datatype*) =
00468                                             PxReduce<ArithT, RedOpT>;
00469     MPI_Op_create((void (*)(void*, void*, int*, MPI_Datatype*))reduceOp,
00470                                             commute, &operation);
00471     MPI_Reduce(&val, &result, 1, elem, operation, root, MPI_COMM_WORLD);
00472     MPI_Op_free(&operation);
00473     MPI_Type_free(&elem);
00474     return result;
00475 }
00476 
00477 
00478 template<class ArithT, class RedOpT>
00479 inline ArithT
00480 PxReduceValueToRoot(ArithT val, RedOpT redOp, int root=0, int dtype=PX_MPI)
00481 {
00482     /*** redOp : Horus reduction operation                          ***/
00483     /*** root  : rank of reduction root                             ***/
00484     /*** dtype : distribution type (PX_OFT, PX_SBT, or PX_MPI)      ***/
00485     
00486     switch (dtype) {
00487         case PX_OFT : return PxReduceValueToRootOFT(val, redOp, root);
00488                                           break;
00489         case PX_SBT : return PxReduceValueToRootSBT(val, redOp, root);
00490                                           break;
00491         case PX_MPI :
00492         default         : return PxReduceValueToRootMPI(val, redOp, root);
00493     }
00494 }
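
A sketch of the reduction-operation interface these templates expect: a DoIt(inout, in) member and, for the MPI variant, a static NeutralElement(). The AddDouble functor below is purely illustrative; the real Horus reduction operations are defined elsewhere in the library.

    struct AddDouble                                 // hypothetical summation op
    {
        static double NeutralElement() { return 0.0; }          // identity of '+'
        void DoIt(double& inout, double& in) { inout += in; }
    };

    double partial = 1.0;                            // some locally computed value
    double total = PxReduceValueToRoot(partial, AddDouble(), 0, PX_SBT);
    // only the root (CPU 0) is guaranteed to hold the complete sum in 'total'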
00495 
00496 
00497 /**********************************************************************
00498  *** Reduce Value to All                                            ***
00499  **********************************************************************/
00500 
00501 template<class ArithT, class RedOpT>
00502 static inline ArithT
00503 PxReduceValueToAllOFT(ArithT val, RedOpT redOp)
00504 {
00505     /*** All-to-one (root) and one-to-all (broadcast) using OFT ***/
00506 
00507     return PxBcastValueOFT(PxReduceValueToRootOFT(val, redOp, 0), 0);
00508 }
00509 
00510 
00511 template<class ArithT, class RedOpT>
00512 static inline ArithT
00513 PxReduceValueToAllSBT(ArithT val, RedOpT redOp)
00514 {
00515     /*** All-to-one (root) and one-to-all (broadcast) using SBT ***/
00516 
00517     return PxBcastValueSBT(PxReduceValueToRootSBT(val, redOp, 0), 0);
00518 }
00519 
00520 
00521 template<class ArithT, class RedOpT>
00522 static inline ArithT
00523 PxReduceValueToAllMPI(ArithT val, RedOpT redOp)
00524 {
00525     /*** MPI-based global Allreduce NOT using any of the predefined ***/
00526     /*** reduce operations supported by MPI. At all times, a handle ***/
00527     /*** to a correctly instantiated (Horus) reduction operation is ***/
00528     /*** passed as a parameter to MPI_Allreduce(...).               ***/
00529 
00530     int commute = 1;
00531     ArithT result = RedOpT::NeutralElement();
00532     MPI_Op operation;
00533     MPI_Datatype elem;
00534     MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
00535     MPI_Type_commit(&elem);
00536 
00537     /*** Make instantiation of PxReduce, bind to MPIop handle & run ***/
00538 
00539     void (*reduceOp)(ArithT*, ArithT*, int*, MPI_Datatype*) =
00540                                             PxReduce<ArithT, RedOpT>;
00541     MPI_Op_create((void (*)(void*, void*, int*, MPI_Datatype*))reduceOp,
00542                                             commute, &operation);
00543     MPI_Allreduce(&val, &result, 1, elem, operation, MPI_COMM_WORLD);
00544     MPI_Op_free(&operation);
00545     MPI_Type_free(&elem);
00546     return result;
00547 }
00548 
00549 
00550 template<class ArithT, class RedOpT>
00551 inline ArithT
00552 PxReduceValueToAll(ArithT val, RedOpT redOp, int dtype=PX_MPI)
00553 {
00554     /*** redOp : Horus reduction operation                          ***/
00555     /*** dtype : distribution type (PX_OFT, PX_SBT, or PX_MPI)      ***/
00556     
00557     switch (dtype) {
00558         case PX_OFT : return PxReduceValueToAllOFT(val, redOp); break;
00559         case PX_SBT : return PxReduceValueToAllSBT(val, redOp); break;
00560         case PX_MPI :
00561         default         : return PxReduceValueToAllMPI(val, redOp);
00562     }
00563 }
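
Continuing the sketch above, the all-variant leaves the reduced value on every CPU:

    double everywhere = PxReduceValueToAll(partial, AddDouble(), PX_MPI);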
00564 
00565 
00566 /**********************************************************************
00567  *** Scatter Array Data                                             ***
00568  **********************************************************************/
00569 
00570 template<class ArrayT>
00571 static inline void
00572 PxScatterArrayOFT(ArrayT* glob, ArrayT* loc, int root, bool bdata)
00573 {
00574     /*** Scatter array data using One-level Flat Tree ***/
00575 
00576     int eSize = ArrayT::ElemSize();
00577     int sSize = eSize*sizeof(typename ArrayT::StorType);
00578     MPI_Datatype elem, blk2d, blk3d;
00579     MPI_Status stat;
00580     MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
00581 
00582     if (_myCPU == root) {           // send array data to all other CPUs
00583         for (int partner=0; partner<_nrCPUs; partner++) {
00584             if (partner != _myCPU) {
00585                 int xSize  = PxLclWidth(partner);
00586                 int ySize  = PxLclHeight(partner);
00587                 int zSize  = PxLclDepth(partner);
00588                 int offset = PxLclStart(partner, ArrayBW(glob),
00589                              ArrayBH(glob), ArrayBD(glob));
00590                 if (bdata) {
00591                     xSize += 2*ArrayBW(glob);
00592                     ySize += 2*ArrayBH(glob);
00593                     zSize += 2*ArrayBD(glob);
00594                     offset -=
00595                     (ArrayW(glob)*ArrayH(glob)*ArrayBD(glob) +
00596                      ArrayW(glob)*ArrayBH(glob) + ArrayBW(glob));
00597                 }
00598                 if (xSize != 0 && ySize != 0 && zSize != 0) {
00599                     MPI_Type_vector(ySize, xSize,
00600                                     ArrayW(glob), elem, &blk2d);
00601                     MPI_Type_hvector(zSize, 1,
00602                                     ArrayW(glob)*ArrayH(glob)*sSize,
00603                                     blk2d, &blk3d);
00604                     MPI_Type_commit(&blk3d);
00605                     MPI_Send(ArrayPB(glob) + offset*eSize, 1, blk3d,
00606                                     partner, SCAT_TAG, MPI_COMM_WORLD);
00607                     MPI_Type_free(&blk3d);
00608                     MPI_Type_free(&blk2d);
00609                 }
00610             }
00611         }
00612         PxLclArrayCopy(loc, FROM, glob, root, bdata);
00613 
00614     } else {                        // receive array data from root
00615         int xSize = PxLclWidth(_myCPU);
00616         int ySize = PxLclHeight(_myCPU);
00617         int zSize = PxLclDepth(_myCPU);
00618         if (bdata) {
00619             xSize += 2*ArrayBW(glob);
00620             ySize += 2*ArrayBH(glob);
00621             zSize += 2*ArrayBD(glob);
00622         }
00623         if (xSize != 0 && ySize != 0 && zSize != 0) {
00624             MPI_Type_vector(ySize, xSize, ArrayW(loc), elem, &blk2d);
00625             MPI_Type_hvector(zSize, 1,
00626                     ArrayW(loc)*ArrayH(loc)*sSize, blk2d, &blk3d);
00627             MPI_Type_commit(&blk3d);
00628             MPI_Recv((bdata) ? ArrayPB(loc) : ArrayCPB(loc), 1,
00629                     blk3d, root, SCAT_TAG, MPI_COMM_WORLD, &stat);
00630             MPI_Type_free(&blk3d);
00631             MPI_Type_free(&blk2d);
00632         }
00633     }
00634     MPI_Type_free(&elem);
00635 }
00636 
00637 
00638 template<class ArrayT>
00639 static inline void
00640 PxScatterArraySBT(ArrayT* glob, ArrayT* loc, int root, bool bdata)
00641 {
00642     /*** SBT (Spanning Binomial Tree) or hypercube scatter. Its  ***/
00643     /*** use is restricted slightly for implementation reasons.  ***/
00644     /*** NOTE: should work correctly, except:                    ***/
00645     /***                                                         ***/
00646     /*** - IF ("zCPUs" == 1) AND ("yCPUs" > 1) THEN              ***/
00647     /***   "xCPUs" MUST be a power of 2 (so: 1, 2, 4, 8, etc...) ***/
00648     /*** - IF ("zCPUs" > 1) THEN                                 ***/
00649     /***   "xCPUs" AND "yCPUs" both MUST be a power of 2         ***/
00650 
00651     int     *order = new int[_maxCPUs], myIndex, bufW, bufH, bufD;
00652     ArrayT  *buf;
00653 
00654     /*** Determine ordering of CPUs in Spanning Binomial Tree ***/
00655 
00656     PxGetSBTorder(root, _maxCPUs, order, &myIndex);
00657 
00658     /*** Create temporary array buffer (including borders) ***/
00659 
00660     if (_myCPU == root) {
00661         bufW = ArrayW(glob);
00662         bufH = ArrayH(glob);
00663         bufD = ArrayD(glob);
00664         buf  = glob;
00665     } else {
00666         bufW = PxLineRange(1, myIndex, order);
00667         bufH = PxLineRange(2, myIndex, order);
00668         bufD = PxLineRange(3, myIndex, order);
00669         buf  = PxArrayCreate<ArrayT>(bufW, bufH, bufD, ArrayBW(glob),
00670                                      ArrayBH(glob), ArrayBD(glob));
00671         bufW += 2*ArrayBW(glob);
00672         bufH += 2*ArrayBH(glob);
00673         bufD += 2*ArrayBD(glob);
00674     }
00675 
00676     /*** Scatter array data using Spanning Binomial Tree ***/
00677 
00678     int eSize = ArrayT::ElemSize();
00679     int sSize = eSize*sizeof(typename ArrayT::StorType);
00680     MPI_Datatype elem, blk2d, blk3d;
00681     MPI_Status stat;
00682     MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
00683 
00684     int mask = 1 << (_logCPUs-1);
00685     for (int i=0; i<_logCPUs; i++) {
00686         int partnerIndex = myIndex ^ mask;
00687         int partner = order[partnerIndex];
00688         if ((myIndex % mask == 0) && (partner < _nrCPUs)) {
00689             if (myIndex < partnerIndex) {   // send data to SBT child
00690                 int xSize  = PxLineRange(1, partnerIndex, order);
00691                 int ySize  = PxLineRange(2, partnerIndex, order);
00692                 int zSize  = PxLineRange(3, partnerIndex, order);
00693                 int offset = PxLclStart(partner, ArrayBW(glob),
00694                                     ArrayBH(glob), ArrayBD(glob));
00695                 if (_myCPU != root) {
00696                     offset -= (PxLclStart(_myCPU, ArrayBW(glob),
00697                                ArrayBH(glob), ArrayBD(glob)) -
00698                               (bufW*bufH*ArrayBD(glob) +
00699                                bufW*ArrayBH(glob) + ArrayBW(glob)));
00700                 }
00701                 if (bdata) {
00702                     xSize += 2*ArrayBW(glob);
00703                     ySize += 2*ArrayBH(glob);
00704                     zSize += 2*ArrayBD(glob);
00705                     offset -= (bufW*bufH*ArrayBD(glob) +
00706                                bufW*ArrayBH(glob) + ArrayBW(glob));
00707                 }
00708                 if (xSize != 0 && ySize != 0 && zSize != 0) {
00709                     MPI_Type_vector(ySize, xSize, bufW, elem, &blk2d);
00710                     MPI_Type_hvector(zSize, 1,
00711                                      bufW*bufH*sSize, blk2d, &blk3d);
00712                     MPI_Type_commit(&blk3d);
00713                     MPI_Send(ArrayPB(buf) + offset*eSize, 1, blk3d,
00714                              partner, SCAT_TAG, MPI_COMM_WORLD);
00715                     MPI_Type_free(&blk3d);
00716                     MPI_Type_free(&blk2d);
00717                 }
00718             } else {                    // receive data from SBT parent
00719                 int xSize = PxLineRange(1, myIndex, order);
00720                 int ySize = PxLineRange(2, myIndex, order);
00721                 int zSize = PxLineRange(3, myIndex, order);
00722                 if (bdata) {
00723                     xSize += 2*ArrayBW(glob);
00724                     ySize += 2*ArrayBH(glob);
00725                     zSize += 2*ArrayBD(glob);
00726                 }
00727                 if (xSize != 0 && ySize != 0 && zSize != 0) {
00728                     MPI_Type_vector(ySize, xSize, bufW, elem, &blk2d);
00729                     MPI_Type_hvector(zSize, 1,
00730                                      bufW*bufH*sSize, blk2d, &blk3d);
00731                     MPI_Type_commit(&blk3d);
00732                     MPI_Recv((bdata) ? ArrayPB(buf) : ArrayCPB(buf),
00733                             1, blk3d, partner, SCAT_TAG, MPI_COMM_WORLD,
00734                             &stat);
00735                     MPI_Type_free(&blk3d);
00736                     MPI_Type_free(&blk2d);
00737                 }
00738             }
00739         }
00740         mask >>= 1;
00741     }
00742     MPI_Type_free(&elem);
00743     PxLclArrayCopy(loc, FROM, buf, root, bdata);
00744     if (_myCPU != root) {
00745         delete buf;
00746     }
00747     delete [] order;
00748 }
00749 
00750 
00751 template<class ArrayT>
00752 static inline void
00753 PxScatterArrayMPI(ArrayT* glob, ArrayT* loc, int root, bool bdata=true)
00754 {
00755     /*** MPI-Scatter. May be faster than OFT/SBT scatter provided   ***/
00756     /*** above as specific communication subsystem capabilities may ***/
00757     /*** be incorporated in the implementation.                     ***/
00758     /***                                                            ***/
00759     /*** NOTE: This function can only be used if:                   ***/
00760     /***                                                            ***/
00761     /*** Borders must be sent as well OR all border sizes are 0 AND ***/
00762     /*** xcpus>=1 AND ArrayH(glob)=1 AND ArrayD(glob)=1, OR     ***/
00763     /*** xcpus=1 AND ycpus>=1 AND ArrayD(glob)=1, OR              ***/
00764     /*** xcpus=1 AND ycpus=1 AND zcpus>=1.                          ***/
00765     /***                                                            ***/
00766     /*** This is because it is not possible to specify a separate   ***/
00767     /*** derived datatype for each receiving CPU at the root. In    ***/
00768     /*** principle the layout of the array could be re-arranged     ***/
00769     /*** temporarily to fit the standard MPI call, but this would   ***/
00770     /*** require a costly copy operation and is not implemented.    ***/
00771 
00772     int *counts, *disps;
00773 
00774     /*** Determine size and position of each local array ***/
00775 
00776     if (_myCPU == root) {
00777         counts = new int[_nrCPUs];
00778         disps  = new int[_nrCPUs];
00779         int slice = 2*((ArrayW(glob)*ArrayH(glob)*ArrayBD(glob)) +
00780                        (ArrayW(glob)*ArrayBH(glob)));
00781         for (int i=0; i<_nrCPUs; i++) {
00782             counts[i] = (PxLclWidth(i)  + 2*ArrayBW(glob)) *
00783                         (PxLclHeight(i) + 2*ArrayBH(glob)) *
00784                         (PxLclDepth(i)  + 2*ArrayBD(glob));
00785             disps[i] = (i==0) ? 0 : disps[i-1] + counts[i-1] - slice; 
00786         }
00787     }
00788 
00789     /*** Scatter array data ***/
00790 
00791     int sSize = ArrayT::ElemSize()*sizeof(typename ArrayT::StorType);
00792     MPI_Datatype elem;
00793     MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
00794     MPI_Type_commit(&elem);
00795     MPI_Scatterv(ArrayPB(glob), counts, disps, elem, ArrayPB(loc),
00796                  ArrayW(loc)*ArrayH(loc)*ArrayD(loc), elem, root,
00797                  MPI_COMM_WORLD);
00798     MPI_Type_free(&elem);
00799 
00800     if (_myCPU == root) {
00801         delete [] counts;
00802         delete [] disps;
00803     }
00804 }
00805 
00806 
00807 template<class ArrayT>
00808 inline void
00809 PxScatterArray(ArrayT* glob, ArrayT* loc,
00810                int root=0, int dtype=PX_SBT, bool bdata=false)
00811 {
00812     /*** root  : rank of sending node                               ***/
00813     /*** dtype : distribution type (PX_OFT, PX_SBT, or PX_MPI)      ***/
00814     /*** bdata : indicates whether border data is scattered as well ***/
00815     
00816 if (dp) PX_COUT << "Scatter: " << "NrCPUs = " << PxNrCPUs() << PX_ENDL;
00817 
00818     switch (dtype) {
00819         case PX_MPI : PxScatterArrayMPI(glob, loc, root, true ); break;
00820         case PX_OFT : PxScatterArrayOFT(glob, loc, root, bdata); break;
00821         case PX_SBT :
00822         default         : PxScatterArraySBT(glob, loc, root, bdata);
00823     }
00824 }
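
A hedged sketch of a scatter call. ArrT stands for any array type providing the interface used above (StorType, ElemSize(), the Array*() accessors); the sizes and border widths are placeholders. Note that the OFT/SBT variants also read the border sizes from 'glob' on non-root CPUs, so every CPU passes a valid global array object even though only the root's data is sent.

    const int W = 1024, H = 768;
    ArrT* glob = PxArrayCreate<ArrT>(W, H, 1, 1, 1, 0);    // full-size everywhere
    if (PxMyCPU() == 0) {
        /* fill glob with the input data on the root */
    }
    ArrT* loc = PxArrayCreate<ArrT>(PxLclWidth(PxMyCPU()), PxLclHeight(PxMyCPU()),
                                    PxLclDepth(PxMyCPU()), 1, 1, 0);
    PxScatterArray(glob, loc, 0, PX_SBT, false);           // no border data sent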
00825 
00826 
00827 /**********************************************************************
00828  *** Gather Array Data                                              ***
00829  **********************************************************************/
00830 
00831 template<class ArrayT>
00832 static inline void
00833 PxGatherArrayOFT(ArrayT* glob, ArrayT* loc, int root, bool bdata)
00834 {
00835     /*** Gather array data using One-level Flat Tree ***/
00836 
00837     int eSize = ArrayT::ElemSize();
00838     int sSize = eSize*sizeof(typename ArrayT::StorType);
00839     MPI_Datatype elem, blk2d, blk3d;
00840     MPI_Status stat;
00841     MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
00842 
00843     if (_myCPU == root) {   // receive array data from all other CPUs
00844         PxLclArrayCopy(loc, TO, glob, root, bdata);
00845         for (int partner=0; partner<_nrCPUs; partner++) {
00846             if (partner != _myCPU) {
00847                 int xSize  = PxLclWidth(partner);
00848                 int ySize  = PxLclHeight(partner);
00849                 int zSize  = PxLclDepth(partner);
00850                 int offset = PxLclStart(partner,ArrayBW(glob),
00851                              ArrayBH(glob), ArrayBD(glob));
00852                 if (bdata) {
00853                     xSize += 2*ArrayBW(glob);
00854                     ySize += 2*ArrayBH(glob);
00855                     zSize += 2*ArrayBD(glob);
00856                     offset -=
00857                     (ArrayW(glob)*ArrayH(glob)*ArrayBD(glob) +
00858                      ArrayW(glob)*ArrayBH(glob) + ArrayBW(glob));
00859                 }
00860                 if (xSize != 0 && ySize != 0 && zSize != 0) {
00861                     MPI_Type_vector(ySize, xSize,
00862                                     ArrayW(glob), elem, &blk2d);
00863                     MPI_Type_hvector(zSize, 1,
00864                                     ArrayW(glob)*ArrayH(glob)*sSize,
00865                                     blk2d, &blk3d);
00866                     MPI_Type_commit(&blk3d);
00867                     MPI_Recv(ArrayPB(glob) + offset*eSize, 1, blk3d,
00868                              partner, GATH_TAG, MPI_COMM_WORLD, &stat);
00869                     MPI_Type_free(&blk3d);
00870                     MPI_Type_free(&blk2d);
00871                 }
00872             }
00873         }
00874     } else {                // send array data to root
00875         int xSize = PxLclWidth(_myCPU);
00876         int ySize = PxLclHeight(_myCPU);
00877         int zSize = PxLclDepth(_myCPU);
00878         if (bdata) {
00879             xSize += 2*ArrayBW(glob);
00880             ySize += 2*ArrayBH(glob);
00881             zSize += 2*ArrayBD(glob);
00882         }
00883         if (xSize != 0 && ySize != 0 && zSize != 0) {
00884             MPI_Type_vector(ySize, xSize, ArrayW(loc), elem, &blk2d);
00885             MPI_Type_hvector(zSize, 1,
00886                     ArrayW(loc)*ArrayH(loc)*sSize, blk2d, &blk3d);
00887             MPI_Type_commit(&blk3d);
00888             MPI_Send((bdata) ? ArrayPB(loc) : ArrayCPB(loc),
00889                      1, blk3d, root, GATH_TAG, MPI_COMM_WORLD);
00890             MPI_Type_free(&blk3d);
00891             MPI_Type_free(&blk2d);
00892         }
00893     }
00894     MPI_Type_free(&elem);
00895 }
00896 
00897 
00898 template<class ArrayT>
00899 static inline void
00900 PxGatherArraySBT(ArrayT* glob, ArrayT* loc, int root, bool bdata)
00901 {
00902     /*** SBT (Spanning Binomial Tree) or hypercube gather. Its   ***/
00903     /*** use is restricted slightly for implementation reasons.  ***/
00904     /*** NOTE: should work correctly, except:                    ***/
00905     /***                                                         ***/
00906     /*** - IF ("zCPUs" == 1) AND ("yCPUs" > 1) THEN              ***/
00907     /***   "xCPUs" MUST be a power of 2 (so: 1, 2, 4, 8, etc...) ***/
00908     /*** - IF ("zCPUs" > 1) THEN                                 ***/
00909     /***   "xCPUs" AND "yCPUs" both MUST be a power of 2         ***/
00910 
00911     int     *order = new int[_maxCPUs], myIndex, bufW, bufH, bufD;
00912     ArrayT  *buf;
00913 
00914     /*** Determine ordering of CPUs in Spanning Binomial Tree ***/
00915 
00916     PxGetSBTorder(root, _maxCPUs, order, &myIndex);
00917 
00918     /*** Create temporary array buffer (including borders) ***/
00919 
00920     if (_myCPU == root) {
00921         bufW = ArrayW(glob);
00922         bufH = ArrayH(glob);
00923         bufD = ArrayD(glob);
00924         buf  = glob;
00925     } else {
00926         bufW = PxLineRange(1, myIndex, order);
00927         bufH = PxLineRange(2, myIndex, order);
00928         bufD = PxLineRange(3, myIndex, order);
00929         buf  = PxArrayCreate<ArrayT>(bufW, bufH, bufD, ArrayBW(glob),
00930                                      ArrayBH(glob), ArrayBD(glob));
00931         bufW += 2*ArrayBW(glob);
00932         bufH += 2*ArrayBH(glob);
00933         bufD += 2*ArrayBD(glob);
00934     }
00935     PxLclArrayCopy(loc, TO, buf, root, bdata);
00936 
00937     /*** Gather array data using Spanning Binomial Tree ***/
00938 
00939     int eSize = ArrayT::ElemSize();
00940     int sSize = eSize*sizeof(typename ArrayT::StorType);
00941     MPI_Datatype elem, blk2d, blk3d;
00942     MPI_Status stat;
00943     MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
00944 
00945     int mask = 1;
00946     for (int i=0; i<_logCPUs; i++) {
00947         int partnerIndex = myIndex ^ mask;
00948         int partner = order[partnerIndex];
00949         if ((myIndex % mask == 0) && (partner < _nrCPUs)) {
00950             if (myIndex > partnerIndex) {   // send data to SBT parent
00951                 int xSize = PxLineRange(1, myIndex, order);
00952                 int ySize = PxLineRange(2, myIndex, order);
00953                 int zSize = PxLineRange(3, myIndex, order);
00954                 if (bdata) {
00955                     xSize += 2*ArrayBW(glob);
00956                     ySize += 2*ArrayBH(glob);
00957                     zSize += 2*ArrayBD(glob);
00958                 }
00959                 if (xSize != 0 && ySize != 0 && zSize != 0) {
00960                     MPI_Type_vector(ySize, xSize, bufW, elem, &blk2d);
00961                     MPI_Type_hvector(zSize, 1,
00962                                      bufW*bufH*sSize, blk2d, &blk3d);
00963                     MPI_Type_commit(&blk3d);
00964                     MPI_Send((bdata) ? ArrayPB(buf) : ArrayCPB(buf),
00965                            1, blk3d, partner, GATH_TAG, MPI_COMM_WORLD);
00966                     MPI_Type_free(&blk3d);
00967                     MPI_Type_free(&blk2d);
00968                 }
00969             } else {                    // receive data from SBT child
00970                 int xSize  = PxLineRange(1, partnerIndex, order);
00971                 int ySize  = PxLineRange(2, partnerIndex, order);
00972                 int zSize  = PxLineRange(3, partnerIndex, order);
00973                 int offset = PxLclStart(partner, ArrayBW(glob),
00974                                     ArrayBH(glob), ArrayBD(glob));
00975                 if (_myCPU != root) {
00976                     offset -= (PxLclStart(_myCPU, ArrayBW(glob),
00977                                ArrayBH(glob), ArrayBD(glob)) -
00978                               (bufW*bufH*ArrayBD(glob) +
00979                                bufW*ArrayBH(glob) + ArrayBW(glob)));
00980                 }
00981                 if (bdata) {
00982                     xSize += 2*ArrayBW(glob);
00983                     ySize += 2*ArrayBH(glob);
00984                     zSize += 2*ArrayBD(glob);
00985                     offset -= (bufW*bufH*ArrayBD(glob) +
00986                                bufW*ArrayBH(glob) + ArrayBW(glob));
00987                 }
00988                 if (xSize != 0 && ySize != 0 && zSize != 0) {
00989                     MPI_Type_vector(ySize, xSize, bufW, elem, &blk2d);
00990                     MPI_Type_hvector(zSize, 1,
00991                                      bufW*bufH*sSize, blk2d, &blk3d);
00992                     MPI_Type_commit(&blk3d);
00993                     MPI_Recv(ArrayPB(buf) + offset*eSize, 1, blk3d,
00994                              partner, GATH_TAG, MPI_COMM_WORLD, &stat);
00995                     MPI_Type_free(&blk3d);
00996                     MPI_Type_free(&blk2d);
00997                 }
00998             }
00999         }
01000         mask <<= 1;
01001     }
01002     MPI_Type_free(&elem);
01003     if (_myCPU != root) {
01004         delete buf;
01005     }
01006     delete [] order;
01007 }
01008 
01009 
01010 template<class ArrayT>
01011 static inline void
01012 PxGatherArrayMPI(ArrayT* glob, ArrayT* loc, int root, bool bdata=true)
01013 {
01014     /*** MPI-Gather. May be faster than OFT/SBT gather provided     ***/
01015     /*** above as specific communication subsystem capabilities may ***/
01016     /*** be incorporated in the implementation.                     ***/
01017     /***                                                            ***/
01018     /*** NOTE: This function can only be used if:                   ***/
01019     /***                                                            ***/
01020     /*** Borders must be sent as well OR all border sizes are 0 AND ***/
01021     /*** xcpus>=1 AND ArrayH(glob)=1 AND ArrayD(glob)=1, OR     ***/
01022     /*** xcpus=1 AND ycpus>=1 AND ArrayD(glob)=1, OR              ***/
01023     /*** xcpus=1 AND ycpus=1 AND zcpus>=1.                          ***/
01024     /***                                                            ***/
01025     /*** This is because it is not possible to specify a separate   ***/
01026     /*** derived datatype for each sending CPU at the root. In      ***/
01027     /*** principle the layout of the array could be re-arranged     ***/
01028     /*** temporarily to fit the standard MPI call, but this would   ***/
01029     /*** require a costly copy operation and is not implemented.    ***/
01030 
01031     int *counts, *disps;
01032 
01033     /*** Determine size and position of each local array ***/
01034 
01035     if (_myCPU == root) {
01036         counts = new int[_nrCPUs];
01037         disps  = new int[_nrCPUs];
01038         int slice = 2*((ArrayW(glob)*ArrayH(glob)*ArrayBD(glob)) +
01039                        (ArrayW(glob)*ArrayBH(glob)));
01040         for (int i=0; i<_nrCPUs; i++) {
01041             counts[i] = (PxLclWidth(i)  + 2*ArrayBW(glob)) *
01042                         (PxLclHeight(i) + 2*ArrayBH(glob)) *
01043                         (PxLclDepth(i)  + 2*ArrayBD(glob));
01044             disps[i] = (i==0) ? 0 : disps[i-1] + counts[i-1] - slice; 
01045         }
01046     }
01047 
01048     /*** Gather array data ***/
01049 
01050     int sSize = ArrayT::ElemSize()*sizeof(typename ArrayT::StorType);
01051     MPI_Datatype elem;
01052     MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
01053     MPI_Type_commit(&elem);
01054     MPI_Gatherv(ArrayPB(loc),
01055                 ArrayW(loc)*ArrayH(loc)*ArrayD(loc), elem,
01056                 ArrayPB(glob), counts, disps, elem, root,
01057                 MPI_COMM_WORLD);
01058     MPI_Type_free(&elem);
01059 
01060     if (_myCPU == root) {
01061         delete [] counts;
01062         delete [] disps;
01063     }
01064 }
01065 
01066 
01067 template<class ArrayT>
01068 inline void
01069 PxGatherArray(ArrayT* glob, ArrayT* loc,
01070               int root=0, int dtype=PX_SBT, bool bdata=false)
01071 {
01072     /*** root  : rank of receiving node                             ***/
01073     /*** dtype : distribution type (PX_OFT, PX_SBT, or PX_MPI)      ***/
01074     /*** bdata : indicates whether border data is gathered as well  ***/
01075     
01076 if (dp) PX_COUT << "Gather..." << PX_ENDL;
01077 
01078     switch (dtype) {
01079         case PX_MPI : PxGatherArrayMPI(glob, loc, root, true ); break;
01080         case PX_OFT : PxGatherArrayOFT(glob, loc, root, bdata); break;
01081         case PX_SBT :
01082         default         : PxGatherArraySBT(glob, loc, root, bdata);
01083     }
01084 }
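
Continuing the scatter sketch above: once the local partitions have been processed, the inverse call collects the results back on the root.

    /* ... apply the local operation to 'loc' ... */
    PxGatherArray(glob, loc, 0, PX_SBT, false);   // root's 'glob' now holds the result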
01085 
01086 
01087 /**********************************************************************
01088  *** Broadcast Array Data                                           ***
01089  **********************************************************************/
01090 
01091 template<class ArrayT>
01092 static inline void
01093 PxBcastArrayOFT(ArrayT* a, int root, bool bdata)
01094 {
01095     /*** Broadcast array data using One-level Flat Tree ***/
01096 
01097     int sSize = ArrayT::ElemSize()*sizeof(typename ArrayT::StorType);
01098     MPI_Datatype elem, blk2d, blk3d;
01099     MPI_Status stat;
01100     MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
01101     MPI_Type_commit(&elem);
01102 
01103     if (!bdata) {
01104         MPI_Type_vector(ArrayCH(a),
01105                         ArrayCW(a), ArrayW(a), elem, &blk2d);
01106         MPI_Type_hvector(ArrayCD(a), 1,
01107                         ArrayW(a)*ArrayH(a)*sSize, blk2d, &blk3d);
01108         MPI_Type_commit(&blk3d);
01109     }
01110     if (_myCPU == root) {       // send array data to all other CPUs
01111         for (int partner=0; partner<_nrCPUs; partner++) {
01112             if (partner != _myCPU) {
01113                 if (!bdata) {
01114                     MPI_Send(ArrayCPB(a), 1,
01115                              blk3d, partner, BCAST_TAG, MPI_COMM_WORLD);
01116                 } else {
01117                     MPI_Send(ArrayPB(a),
01118                              ArrayW(a)*ArrayH(a)*ArrayD(a),
01119                              elem, partner, BCAST_TAG, MPI_COMM_WORLD);
01120                 }
01121             }
01122         }
01123     } else {                    // receive array data from root
01124         if (!bdata) {
01125             MPI_Recv(ArrayCPB(a), 1, blk3d, root,
01126                      BCAST_TAG, MPI_COMM_WORLD, &stat);
01127         } else {
01128             MPI_Recv(ArrayPB(a), ArrayW(a)*ArrayH(a)*ArrayD(a),
01129                      elem, root, BCAST_TAG, MPI_COMM_WORLD, &stat);
01130         }
01131     }
01132     if (!bdata) {
01133         MPI_Type_free(&blk3d);
01134         MPI_Type_free(&blk2d);
01135     }
01136     MPI_Type_free(&elem);
01137 }
01138 
01139 
01140 template<class ArrayT>
01141 static inline void
01142 PxBcastArraySBT(ArrayT* a, int root, bool bdata)
01143 {
01144     /*** Determine ordering of CPUs in Spanning Binomial Tree ***/
01145 
01146     int *order = new int[_maxCPUs], myIndex;
01147     PxGetSBTorder(root, _maxCPUs, order, &myIndex);
01148 
01149     /*** Broadcast array data using Spanning Binomial Tree ***/
01150 
01151     int sSize = ArrayT::ElemSize()*sizeof(typename ArrayT::StorType);
01152     MPI_Datatype elem, blk2d, blk3d;
01153     MPI_Status stat;
01154     MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
01155     MPI_Type_commit(&elem);
01156 
01157     if (!bdata) {
01158         MPI_Type_vector(ArrayCH(a),
01159                         ArrayCW(a), ArrayW(a), elem, &blk2d);
01160         MPI_Type_hvector(ArrayCD(a), 1,
01161                         ArrayW(a)*ArrayH(a)*sSize, blk2d, &blk3d);
01162         MPI_Type_commit(&blk3d);
01163     }
01164     int mask = 1 << (_logCPUs-1);
01165     for (int i=0; i<_logCPUs; i++) {
01166         int partnerIndex = myIndex ^ mask;
01167         int partner = order[partnerIndex];
01168         if ((myIndex % mask == 0) && (partner < _nrCPUs)) {
01169             if (myIndex < partnerIndex) {   // send data to SBT child
01170                 if (!bdata) {
01171                     MPI_Send(ArrayCPB(a), 1, blk3d,
01172                              partner, BCAST_TAG, MPI_COMM_WORLD);
01173                 } else {
01174                     MPI_Send(ArrayPB(a),
01175                              ArrayW(a)*ArrayH(a)*ArrayD(a), elem,
01176                              partner, BCAST_TAG, MPI_COMM_WORLD);
01177                 }
01178             } else {                    // receive data from SBT parent
01179                 if (!bdata) {
01180                     MPI_Recv(ArrayCPB(a), 1, blk3d, partner,
01181                              BCAST_TAG, MPI_COMM_WORLD, &stat);
01182                 } else {
01183                     MPI_Recv(ArrayPB(a),
01184                              ArrayW(a)*ArrayH(a)*ArrayD(a), elem,
01185                              partner, BCAST_TAG, MPI_COMM_WORLD, &stat);
01186                 }
01187             }
01188         }
01189         mask >>= 1;
01190     }
01191     if (!bdata) {
01192         MPI_Type_free(&blk3d);
01193         MPI_Type_free(&blk2d);
01194     }
01195     MPI_Type_free(&elem);
01196     delete [] order;
01197 }
01198 
01199 
01200 template<class ArrayT>
01201 static inline void
01202 PxBcastArrayMPI(ArrayT* a, int root, bool bdata)
01203 {
01204     /*** MPI-Bcast. May be faster than OFT/SBT broadcast provided ***/
01205     /*** above, as specific communication subsystem capabilities  ***/
01206     /*** may be incorporated in the implementation.               ***/
01207 
01208     int sSize = ArrayT::ElemSize()*sizeof(typename ArrayT::StorType);
01209     MPI_Datatype elem, blk2d, blk3d;
01210     MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
01211     MPI_Type_commit(&elem);
01212 
01213     if (!bdata) {
01214         MPI_Type_vector(ArrayCH(a),
01215                         ArrayCW(a), ArrayW(a), elem, &blk2d);
01216         MPI_Type_hvector(ArrayCD(a), 1,
01217                          ArrayW(a)*ArrayH(a)*sSize, blk2d, &blk3d);
01218         MPI_Type_commit(&blk3d);
01219         MPI_Bcast(ArrayCPB(a), 1, blk3d, root, MPI_COMM_WORLD);
01220         MPI_Type_free(&blk3d);
01221         MPI_Type_free(&blk2d);
01222     } else {
01223         MPI_Bcast(ArrayPB(a), ArrayW(a)*ArrayH(a)*ArrayD(a),
01224                   elem, root, MPI_COMM_WORLD);
01225     }
01226     MPI_Type_free(&elem);
01227 }
01228 
01229 
01230 template<class ArrayT>
01231 inline void
01232 PxBcastArray(ArrayT* a, int root=0, int dtype=PX_MPI, bool bdata=false)
01233 {
01234     /*** root  : rank of broadcast root node                        ***/
01235     /*** dtype : distribution type (PX_OFT, PX_SBT, or PX_MPI)      ***/
01236     /*** bdata : indicates whether border data is broadcast as well ***/
01237     
01238 if (dp) PX_COUT << "Bcast: " << "NrCPUs = " << PxNrCPUs() << PX_ENDL;
01239 
01240     switch (dtype) {
01241         case PX_OFT : PxBcastArrayOFT(a, root, bdata); break;
01242         case PX_SBT : PxBcastArraySBT(a, root, bdata); break;
01243         case PX_MPI :
01244         default         : PxBcastArrayMPI(a, root, bdata);
01245     }
01246 }
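
A sketch of replicating a small array, e.g. a filter kernel, on every CPU; the 3x3 size and zero borders are placeholders.

    ArrT* kern = PxArrayCreate<ArrT>(3, 3, 1, 0, 0, 0);    // allocated on every CPU
    if (PxMyCPU() == 0) {
        /* fill the kernel on the root */
    }
    PxBcastArray(kern, 0, PX_MPI, false);                  // replicate root's contents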
01247 
01248 
01249 /**********************************************************************
01250  *** Redistribute Array Data                                        ***
01251  **********************************************************************/
01252 
01253 template<class ArrayT>
01254 inline void
01255 PxRedistArray(ArrayT** a,
01256               int xcpus, int ycpus, int zcpus, bool bdata=false)
01257 {
01258     /*** Complete redistribution of array data (incl. potential    ***/
01259     /*** borders) using the Direct Communication schema. This code ***/
01260     /*** is based on: 'Parallel Matrix Transpose Algorithms on     ***/
01261     /*** Distributed Memory Concurrent Computers' (Choi, Dongarra, ***/
01262     /*** Walker) Mathematical Sciences Section, Oak Ridge National ***/
01263     /*** Lab., and Dept. Computer Science, Univ. of Tennessee at   ***/
01264     /*** Knoxville, LAPACK Working Note 65 (CS-93-215), Oct 1993.  ***/
01265 
01266     int bw = ArrayBW(*a), bh = ArrayBH(*a), bd = ArrayBD(*a);
01267 
01268     /*** Save old local image characteristics ***/
01269 
01270     int* oldStartX = new int[_nrCPUs];
01271     int* oldStartY = new int[_nrCPUs];
01272     int* oldStartZ = new int[_nrCPUs];
01273     int* oldEndX = new int[_nrCPUs];
01274     int* oldEndY = new int[_nrCPUs];
01275     int* oldEndZ = new int[_nrCPUs];
01276 
01277     int i;
01278     for (i=0; i<_nrCPUs; i++) {
01279         oldStartX[i] = PxLclStartX(i);
01280         oldStartY[i] = PxLclStartY(i);
01281         oldStartZ[i] = PxLclStartZ(i);
01282         oldEndX[i]   = oldStartX[i] + PxLclWidth(i)  - 1;
01283         oldEndY[i]   = oldStartY[i] + PxLclHeight(i) - 1;
01284         oldEndZ[i]   = oldStartZ[i] + PxLclDepth(i)  - 1;
01285         if (bdata) {
01286             oldEndX[i] += 2*bw;
01287             oldEndY[i] += 2*bh;
01288             oldEndZ[i] += 2*bd;
01289         }
01290     }
01291 
01292     /*** Repartition according to new CPU grid ***/
01293 
01294     PxRePartition(xcpus, ycpus, zcpus);
01295 
01296     /*** Obtain new local image characteristics ***/
01297 
01298     int* newStartX = new int[_nrCPUs];
01299     int* newStartY = new int[_nrCPUs];
01300     int* newStartZ = new int[_nrCPUs];
01301     int* newEndX = new int[_nrCPUs];
01302     int* newEndY = new int[_nrCPUs];
01303     int* newEndZ = new int[_nrCPUs];
01304 
01305     for (i=0; i<_nrCPUs; i++) {
01306         newStartX[i] = PxLclStartX(i);
01307         newStartY[i] = PxLclStartY(i);
01308         newStartZ[i] = PxLclStartZ(i);
01309         newEndX[i]   = newStartX[i] + PxLclWidth(i)  - 1;
01310         newEndY[i]   = newStartY[i] + PxLclHeight(i) - 1;
01311         newEndZ[i]   = newStartZ[i] + PxLclDepth(i)  - 1;
01312         if (bdata) {
01313             newEndX[i] += 2*bw;
01314             newEndY[i] += 2*bh;
01315             newEndZ[i] += 2*bd;
01316         }
01317     }
01318     int newW = PxLclWidth(_myCPU);
01319     int newH = PxLclHeight(_myCPU);
01320     int newD = PxLclDepth(_myCPU);
01321 
01322     /*** Create new local array and temporary MPI buffer ***/
01323 
01324     ArrayT* nA = PxArrayCreate<ArrayT>(newW, newH, newD, bw, bh, bd);
01325     typedef typename ArrayT::StorType storT;
01326     int eSize = ArrayT::ElemSize();
01327     int sSize = eSize*sizeof(storT);
01328     int mpibuffsize = sizeof(MPI_BYTE) * sSize *
01329                       ArrayW(nA) * ArrayH(nA) * ArrayD(nA);
01330 //  int mpibuffsize = sizeof(MPI_BYTE) * sSize * newW * newH * newD;
01331     char *mpibuff = new char[mpibuffsize];
01332     MPI_Buffer_attach(mpibuff, mpibuffsize);
01333 
01334     /*** Send to/receive from all other CPUs (direct communication) ***/
01335 
01336     MPI_Datatype elem, blk2d, blk3d;
01337     MPI_Status stat;
01338     MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
01339 
01340     for (i=0; i<_nrCPUs; i++) {
01341 
01342         int partner = (_nrCPUs - _myCPU + i) % _nrCPUs;
01343 
01344         /*** Send data to partner ***/
01345 
01346         int maxStartX = (oldStartX[_myCPU] > newStartX[partner]) ?
01347                          oldStartX[_myCPU] : newStartX[partner];
01348         int maxStartY = (oldStartY[_myCPU] > newStartY[partner]) ?
01349                          oldStartY[_myCPU] : newStartY[partner];
01350         int maxStartZ = (oldStartZ[_myCPU] > newStartZ[partner]) ?
01351                          oldStartZ[_myCPU] : newStartZ[partner];
01352         int minEndX   = (oldEndX[_myCPU] < newEndX[partner]) ?
01353                          oldEndX[_myCPU] : newEndX[partner];
01354         int minEndY   = (oldEndY[_myCPU] < newEndY[partner]) ?
01355                          oldEndY[_myCPU] : newEndY[partner];
01356         int minEndZ   = (oldEndZ[_myCPU] < newEndZ[partner]) ?
01357                          oldEndZ[_myCPU] : newEndZ[partner];
01358         int xSize = minEndX - maxStartX + 1;
01359         int ySize = minEndY - maxStartY + 1;
01360         int zSize = minEndZ - maxStartZ + 1;
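              // xSize/ySize/zSize describe the overlap between my OLD subdomain
              // and the partner's NEW subdomain; only this intersection block
              // has to travel from this CPU to the partner.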
01361 
01362         if (xSize > 0 && ySize > 0 && zSize > 0) {
01363 
01364             int offset = eSize *
01365             ((maxStartZ-oldStartZ[_myCPU]) * ArrayW(*a)*ArrayH(*a) +
01366              (maxStartY-oldStartY[_myCPU]) * ArrayW(*a) +
01367              (maxStartX-oldStartX[_myCPU]));
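                  // 'offset' is the element offset of the intersection block
                  // within my old local array; the derived datatypes below pick
                  // it up as zSize planes of ySize rows of xSize contiguous
                  // elements, strided by the old array's row and plane sizes.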
01368 
01369             MPI_Type_vector(ySize, xSize, ArrayW(*a), elem, &blk2d);
01370             MPI_Type_hvector(zSize, 1,
01371                         ArrayW(*a)*ArrayH(*a)*sSize, blk2d, &blk3d);
01372             MPI_Type_commit(&blk3d);
01373             MPI_Bsend((bdata) ?
01374                      ArrayPB(*a) + offset : ArrayCPB(*a) + offset,
01375                      1, blk3d, partner, RDIST_TAG, MPI_COMM_WORLD);
01376             MPI_Type_free(&blk3d);
01377             MPI_Type_free(&blk2d);
01378         }
01379 
01380         /*** Receive data from partner ***/
01381 
01382         maxStartX = (oldStartX[partner] > newStartX[_myCPU]) ?
01383                      oldStartX[partner] : newStartX[_myCPU];
01384         maxStartY = (oldStartY[partner] > newStartY[_myCPU]) ?
01385                      oldStartY[partner] : newStartY[_myCPU];
01386         maxStartZ = (oldStartZ[partner] > newStartZ[_myCPU]) ?
01387                      oldStartZ[partner] : newStartZ[_myCPU];
01388         minEndX   = (oldEndX[partner] < newEndX[_myCPU]) ?
01389                      oldEndX[partner] : newEndX[_myCPU];
01390         minEndY   = (oldEndY[partner] < newEndY[_myCPU]) ?
01391                      oldEndY[partner] : newEndY[_myCPU];
01392         minEndZ   = (oldEndZ[partner] < newEndZ[_myCPU]) ?
01393                      oldEndZ[partner] : newEndZ[_myCPU];
01394         xSize = minEndX - maxStartX + 1;
01395         ySize = minEndY - maxStartY + 1;
01396         zSize = minEndZ - maxStartZ + 1;
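              // The receive side mirrors the send: the overlap of the partner's
              // OLD subdomain with my NEW subdomain, with the offset now computed
              // relative to the freshly created array nA.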
01397 
01398         if (xSize > 0 && ySize > 0 && zSize > 0) {
01399 
01400             int offset = eSize *
01401             ((maxStartZ-newStartZ[_myCPU]) * ArrayW(nA)*ArrayH(nA) +
01402              (maxStartY-newStartY[_myCPU]) * ArrayW(nA) +
01403              (maxStartX-newStartX[_myCPU]));
01404 
01405             MPI_Type_vector(ySize, xSize, ArrayW(nA), elem, &blk2d);
01406             MPI_Type_hvector(zSize, 1,
01407                         ArrayW(nA)*ArrayH(nA)*sSize, blk2d, &blk3d);
01408             MPI_Type_commit(&blk3d);
01409             MPI_Recv((bdata) ?
01410                     ArrayPB(nA) + offset : ArrayCPB(nA) + offset, 1,
01411                     blk3d, partner, RDIST_TAG, MPI_COMM_WORLD, &stat);
01412             MPI_Type_free(&blk3d);
01413             MPI_Type_free(&blk2d);
01414         }
01415     }
01416 
01417     /*** Clean up and make new local array permanent ***/
01418 
01419     delete [] oldStartX;
01420     delete [] oldStartY;
01421     delete [] oldStartZ;
01422     delete [] oldEndX;
01423     delete [] oldEndY;
01424     delete [] oldEndZ;
01425     delete [] newStartX;
01426     delete [] newStartY;
01427     delete [] newStartZ;
01428     delete [] newEndX;
01429     delete [] newEndY;
01430     delete [] newEndZ;
01431     MPI_Type_free(&elem);
01432     MPI_Buffer_detach(&mpibuff, &mpibuffsize);
01433     delete [] mpibuff;
01434     delete *a;
01435     *a = nA;
01436 }
01437 
01438 
01439 /**********************************************************************
01440  *** Array Border Data Exchange                                     ***
01441  **********************************************************************/
01442 
01443 template<class ArrayT>
01444 inline void
01445 PxBorderExchange(ArrayT* a, int divide, int extent)
01446 {
01447     int tw = ArrayW(a),  th = ArrayH(a),  td = ArrayD(a);
01448     int bw = ArrayBW(a), bh = ArrayBH(a), bd = ArrayBD(a);
01449     int cw = ArrayCW(a), ch = ArrayCH(a), cd = ArrayCD(a);
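          // Naming convention: t* is the total allocated size per axis (core
          // plus borders), b* the border width, and c* the core size.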
01450 
01451     typedef typename ArrayT::StorType storT;
01452     storT *ptr1, *ptr2, *ptr3, *ptr4;
01453     int part1, part2, xSize, ySize, zSize;
01454     int eSize = ArrayT::ElemSize();
01455 
01456     /*** Determine send/recv partners and buffer pointers & sizes ***/
01457 
01458     switch (divide) {
01459         case YZ_PLANE:                              // left <--> right
01460             part1 = PxMyLeftCPU();
01461             part2 = PxMyRightCPU();
01462             xSize = extent;
01463             ySize = ch;
01464             zSize = cd;
01465             ptr1 = ArrayPB(a) + (bd*tw*th+bh*tw+bw) * eSize;
01466             ptr2 = ArrayPB(a) + (bd*tw*th+bh*tw+bw+cw) * eSize;
01467             ptr3 = ArrayPB(a) + (bd*tw*th+bh*tw+bw+cw-xSize) * eSize;
01468             ptr4 = ArrayPB(a) + (bd*tw*th+bh*tw+bw-xSize) * eSize;
01469             break;
01470 
01471         case XZ_PLANE:                              // top <--> bottom
01472             part1 = PxMyUpCPU();
01473             part2 = PxMyDownCPU();
01474             xSize = tw;
01475             ySize = extent;
01476             zSize = cd;
01477             ptr1 = ArrayPB(a) + (bd*tw*th + bh*tw) * eSize;
01478             ptr2 = ArrayPB(a) + (bd*tw*th + (bh+ch)*tw) * eSize;
01479             ptr3 = ArrayPB(a) + (bd*tw*th + (bh+ch-ySize)*tw) * eSize;
01480             ptr4 = ArrayPB(a) + (bd*tw*th + (bh-ySize)*tw) * eSize;
01481             break;
01482 
01483         case XY_PLANE:                              // front <--> back
01484             part1 = PxMyFrontCPU();
01485             part2 = PxMyBackCPU();
01486             xSize = tw;
01487             ySize = th;
01488             zSize = extent;
01489             ptr1 = ArrayPB(a) + (bd*tw*th) * eSize;
01490             ptr2 = ArrayPB(a) + ((bd+cd)*tw*th) * eSize;
01491             ptr3 = ArrayPB(a) + ((bd+cd-zSize)*tw*th) * eSize;
01492             ptr4 = ArrayPB(a) + ((bd-zSize)*tw*th) * eSize;
01493             break;
01494 
01495         default:
01496             return;
01497     }
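          // ptr1/ptr3 point at the first and last 'extent'-wide slabs of the
          // core region (the data the neighbours need); ptr2/ptr4 point at the
          // border strips just outside the core on the high and low side,
          // which will be overwritten with the neighbours' data.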
01498 
01499     if ((xSize == 0) || (ySize == 0) || (zSize == 0)) {
01500         return;
01501     }
01502 
01503     MPI_Datatype elem, blk2d, blk3d;
01504     MPI_Status stat;
01505     int sSize = eSize*sizeof(typename ArrayT::StorType);
01506     MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
01507     MPI_Type_vector(ySize, xSize, tw, elem, &blk2d);
01508     MPI_Type_hvector(zSize, 1, tw*th*sSize, blk2d, &blk3d);
01509     MPI_Type_commit(&blk3d);
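          // blk2d is one xy-slice of the strip (ySize rows of xSize elements,
          // row stride tw elements); blk3d stacks zSize such slices with a byte
          // stride of one full xy-plane, so a single send/recv moves the whole
          // 3-D border strip.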
01510 
01511     /*** Send to first partner and receive from second partner ***/
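          // Note: the blocking MPI_Send/MPI_Recv ordering below relies on the
          // MPI implementation buffering border-sized messages internally; for
          // very large borders a combined MPI_Sendrecv would be the
          // deadlock-free alternative.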
01512 
01513     if (part1 != -1) {
01514         MPI_Send(ptr1, 1, blk3d, part1, BOR_TAG, MPI_COMM_WORLD);
01515     }
01516     if (part2 != -1) {
01517         MPI_Recv(ptr2, 1, blk3d, part2, BOR_TAG, MPI_COMM_WORLD, &stat);
01518 
01519     /*** Send to second partner and receive from first partner ***/
01520 
01521         MPI_Send(ptr3, 1, blk3d, part2, BOR_TAG, MPI_COMM_WORLD);
01522     }
01523     if (part1 != -1) {
01524         MPI_Recv(ptr4, 1, blk3d, part1, BOR_TAG, MPI_COMM_WORLD, &stat);
01525     }
01526 
01527     MPI_Type_free(&blk3d);
01528     MPI_Type_free(&blk2d);
01529     MPI_Type_free(&elem);
01530 }
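
      // A minimal usage sketch (not part of this file; 'img' is a hypothetical
      // array): a full halo update would exchange each plane in turn, using
      // the border widths as the exchange extents:
      //
      //     PxBorderExchange(img, YZ_PLANE, ArrayBW(img));  // left  <-> right
      //     PxBorderExchange(img, XZ_PLANE, ArrayBH(img));  // top   <-> bottom
      //     PxBorderExchange(img, XY_PLANE, ArrayBD(img));  // front <-> back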
01531 
01532 
01533 } // namespace Pattern
01534 } // namespace Array
01535 } // namespace Core
01536 } // namespace Impala
01537 
01538 #endif /* __PxDistribution_h_ */
