00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifndef __PxDistribution_h_
00010 #define __PxDistribution_h_
00011
00012
00013 #include "Core/Array/Pattern/ArrayFunc.h"
00014 #include "Core/Array/Pattern/PxArrayFunc.h"
00015 #include "Core/Array/Pattern/PxPartition.h"
00016 #include "Core/Array/Pattern/PxSystem.h"
00017 #include "mpi.h"
00018
00019 #include "PxxStream.h"
00020
00021 namespace Impala
00022 {
00023 namespace Core
00024 {
00025 namespace Array
00026 {
00027 namespace Pattern
00028 {
00029
00030
00031
00032
// Message tags distinguishing the communication phases in this file.
enum tags { INIT_TAG, BCAST_TAG, RDUCE_TAG,
            SCAT_TAG, GATH_TAG, RDIST_TAG, BOR_TAG };
// Selectable distribution algorithms: One Flat Tree, Spanning
// Binomial Tree, or MPI's native collectives.
enum dists { PX_OFT, PX_SBT, PX_MPI };
// Copy direction used by PxLclArrayCopy (local array <-> block).
enum dirs { TO, FROM };
// Axis-aligned planes of a 3-D volume.
enum planes { YZ_PLANE, XZ_PLANE, XY_PLANE };
00038
00039
00040
00041
// Cached process-topology parameters, filled in by PxInitDistribution().
static int _myCPU;      // rank of this process
static int _nrCPUs;     // total number of processes
static int _logCPUs;    // ceil(log2(_nrCPUs)), see PxInitDistribution()
static int _maxCPUs;    // smallest power of two >= _nrCPUs

// Debug-print flag for the dispatch routines below. Declared 'static':
// a non-static definition in a header causes multiple-definition link
// errors as soon as the header is included in more than one
// translation unit.
static int dp = 0;
00048
00049
00050
00051
00052
00053 inline void
00054 PxInitDistribution(int aw, int ah, int ad,
00055 int xcpus, int ycpus, int zcpus)
00056 {
00057 _myCPU = PxMyCPU();
00058 _nrCPUs = PxNrCPUs();
00059 _logCPUs = log((double)_nrCPUs)/log(2.0);
00060 _maxCPUs = pow((double)2, _logCPUs);
00061 if (_maxCPUs < _nrCPUs) {
00062 _logCPUs++;
00063 _maxCPUs *= 2;
00064 }
00065 PxInitPartition(aw, ah, ad, xcpus, ycpus, zcpus);
00066 }
00067
00068
template<class ArrayT>
static inline void
PxLclArrayCopy(ArrayT* loc, int dir, ArrayT* blk, int root, bool bdata)
{
    // Copies this CPU's partition between the local array 'loc' and
    // the block 'blk' in the direction given by 'dir' (TO copies
    // loc -> blk, FROM copies blk -> loc). When 'bdata' is true the
    // border (halo) data is included in the copy.
    typedef typename ArrayT::StorType storT;
    storT* lPtr = (bdata) ? ArrayPB(loc) : ArrayCPB(loc);
    storT* bPtr = (bdata) ? ArrayPB(blk) : ArrayCPB(blk);
    int eSize = ArrayT::ElemSize();

    // On the root, 'blk' is the full global array: position the block
    // pointer at this CPU's own partition. With border data, step the
    // pointer back by one full border offset (whole z-slices, y-rows,
    // and x-columns of border).
    if (_myCPU == root) {
        bPtr = ArrayPB(blk) + eSize*(PxLclStart(_myCPU,
                      ArrayBW(blk), ArrayBH(blk), ArrayBD(blk)));
        if (bdata) {
            bPtr -= (ArrayW(blk)*ArrayH(blk)*ArrayBD(blk) +
                     ArrayW(blk)*ArrayBH(blk) + ArrayBW(blk)) * eSize;
        }
    }

    int width  = (bdata) ? ArrayW(loc) : ArrayCW(loc);
    int height = (bdata) ? ArrayH(loc) : ArrayCH(loc);
    int depth  = (bdata) ? ArrayD(loc) : ArrayCD(loc);

    // Copy one x-run at a time. The two arrays may have different
    // row/slice strides, so the row pointers are recomputed for each
    // (y, z) pair from their respective widths/heights.
    for (int z=0; z<depth; z++) {
        for (int y=0; y<height; y++) {
            storT* locPtr = lPtr + (ArrayW(loc)*ArrayH(loc)*z +
                                    ArrayW(loc)*y) * eSize;
            storT* blkPtr = bPtr + (ArrayW(blk)*ArrayH(blk)*z +
                                    ArrayW(blk)*y) * eSize;
            for (int x=0; x<width*eSize; x++) {
                (dir == TO) ? (*blkPtr++ = *locPtr++) :
                              (*locPtr++ = *blkPtr++);
            }
        }
    }
}
00106
00107
00108 static inline int
00109 PxResponsibilityRange(int cpuNr)
00110 {
00111
00112
00113
00114 int range = 1;
00115
00116 for (int i=0; i<_logCPUs; i++) {
00117 if ((cpuNr & range) == range) {
00118 return range;
00119 }
00120 range <<= 1;
00121 }
00122 return range;
00123 }
00124
00125
static inline int
PxLineRange(int dimension, int firstIndex, int *order)
{
    // Returns the number of lines along 'dimension' (1 = x, 2 = y,
    // 3 = z) covered by the group of CPUs that the SBT node at
    // 'firstIndex' is responsible for; 'order' is the SBT numbering
    // produced by PxGetSBTorder(). Used to size the intermediate
    // subtree buffers in the SBT scatter/gather operations.

    int minNrLines = 0;
    int overflow = 0;

    // With only one CPU along the requested axis every node covers
    // the full extent in that direction.
    switch (dimension) {
        case 1: if (PxLogPartXcpus() == 1) {
                    return PxFullWidth();
                }
                minNrLines = PxMinLclWidth();
                overflow = PxOverflowX();
                break;
        case 2: if (PxLogPartYcpus() == 1) {
                    return PxFullHeight();
                }
                minNrLines = PxMinLclHeight();
                overflow = PxOverflowY();
                break;
        case 3: if (PxLogPartZcpus() == 1) {
                    return PxFullDepth();
                }
                minNrLines = PxMinLclDepth();
                overflow = PxOverflowZ();
                break;
        default:
            return 0;
    }

    // The node at 'firstIndex' is responsible for the consecutive
    // tree indexes [firstIndex, lastIndex]; map those onto CPU ranks
    // and clamp to the real number of CPUs.
    int lastIndex = firstIndex + PxResponsibilityRange(firstIndex) - 1;
    int nrIndexes = lastIndex - firstIndex + 1;
    int firstCPU = order[firstIndex];
    if (firstIndex == 0) {
        // NOTE(review): the tree root's first CPU is snapped down to
        // a multiple of 4 — presumably to align with the CPU grid
        // numbering; confirm against PxGetSBTorder's ordering.
        firstCPU = (firstCPU / 4) * 4;
    }
    int lastCPU = firstCPU + nrIndexes - 1;
    if (lastCPU >= _nrCPUs) {
        lastCPU = _nrCPUs - 1;
    }

    // Convert CPU ranks to grid coordinates along the requested axis.
    switch (dimension) {
        case 1: firstCPU = PxGridIndexX(firstCPU);
                lastCPU = PxGridIndexX(lastCPU);
                break;
        case 2: firstCPU = PxGridIndexY(firstCPU);
                lastCPU = PxGridIndexY(lastCPU);
                break;
        case 3: firstCPU = PxGridIndexZ(firstCPU);
                lastCPU = PxGridIndexZ(lastCPU);
                break;
    }
    // The first 'overflow' grid positions hold one extra line each;
    // sum the line counts over the covered positions accordingly.
    int nrBufCPUs = lastCPU - firstCPU + 1;
    if (firstCPU >= overflow) {
        return (minNrLines * nrBufCPUs);
    } else if (lastCPU < overflow) {
        return ((minNrLines + 1) * nrBufCPUs);
    } else {
        return (minNrLines * nrBufCPUs + overflow - firstCPU);
    }
}
00193
00194
static inline void
PxGetSBTorder(int root, int orderLength, int *order, int *myIndex)
{
    // Computes the spanning binomial tree (SBT) numbering of the
    // CPUs for a tree rooted at 'root'. On return, order[i] holds
    // the CPU occupying tree index i (order[0] == root) and *myIndex
    // holds this CPU's own index. 'orderLength' is expected to be
    // the power-of-two _maxCPUs; entries may name CPUs >= _nrCPUs,
    // which callers must skip (they test 'partner < _nrCPUs').
    if (root >= _nrCPUs) {
        return;
    }
    *myIndex = 0;
    order[0] = root;
    int rootTmp = root;
    int index = orderLength / 2;

    // Repeatedly halve the index range: the half not containing the
    // (shifted) root position is filled with consecutive CPU
    // numbers, then the process continues in the root's half.
    for (int i=0; i<_logCPUs; i++) {
        if (rootTmp < index) {
            for (int j=0; j<index; j++) {
                int cpu = index+j + root-rootTmp;
                if (cpu == _myCPU) {
                    *myIndex = index+j;
                }
                order[index+j] = cpu;
            }
        } else {
            for (int j=0; j<index; j++) {
                int cpu = j + root-rootTmp;
                if (cpu == _myCPU) {
                    *myIndex = index+j;
                }
                order[index+j] = cpu;
            }
            rootTmp -= index;
        }
        index /= 2;
    }
}
00230
00231
00232
00233
00234
00235
00236 inline void
00237 PxInitCommunication()
00238 {
00239 char dummy;
00240 MPI_Status stat;
00241
00242
00243
00244 if (PxMyUpCPU() != -1) {
00245 MPI_Send(&dummy, 1, MPI_BYTE,
00246 PxMyUpCPU(), INIT_TAG, MPI_COMM_WORLD);
00247 }
00248 if (PxMyDownCPU() != -1) {
00249 MPI_Recv(&dummy, 1, MPI_BYTE,
00250 PxMyDownCPU(), INIT_TAG, MPI_COMM_WORLD, &stat);
00251
00252
00253
00254 MPI_Send(&dummy, 1, MPI_BYTE,
00255 PxMyDownCPU(), INIT_TAG, MPI_COMM_WORLD);
00256 }
00257 if (PxMyUpCPU() != -1) {
00258 MPI_Recv(&dummy, 1, MPI_BYTE,
00259 PxMyUpCPU(), INIT_TAG, MPI_COMM_WORLD, &stat);
00260 }
00261 }
00262
00263
00264
00265
00266
00267
00268
00269 template<class ArithT>
00270 static inline ArithT
00271 PxBcastValueOFT(ArithT val, int root)
00272 {
00273
00274
00275 MPI_Datatype elem;
00276 MPI_Status stat;
00277 MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
00278 MPI_Type_commit(&elem);
00279
00280 if (_myCPU == root) {
00281 for (int partner=0; partner<_nrCPUs; partner++) {
00282 if (partner != _myCPU) {
00283 MPI_Send(&val,
00284 1, elem, partner, BCAST_TAG, MPI_COMM_WORLD);
00285 }
00286 }
00287 } else {
00288 MPI_Recv(&val, 1, elem, root, BCAST_TAG, MPI_COMM_WORLD, &stat);
00289 }
00290 MPI_Type_free(&elem);
00291 return val;
00292 }
00293
00294
00295 template<class ArithT>
00296 static inline ArithT
00297 PxBcastValueSBT(ArithT val, int root)
00298 {
00299
00300
00301 int *order = new int[_maxCPUs], myIndex;
00302 PxGetSBTorder(root, _maxCPUs, order, &myIndex);
00303
00304
00305
00306 MPI_Datatype elem;
00307 MPI_Status stat;
00308 MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
00309 MPI_Type_commit(&elem);
00310
00311 int mask = 1 << _logCPUs-1;
00312 for (int i=0; i<_logCPUs; i++) {
00313 int partnerIndex = myIndex ^ mask;
00314 int partner = order[partnerIndex];
00315 if ((myIndex % mask == 0) && (partner < _nrCPUs)) {
00316 if (myIndex < partnerIndex) {
00317 MPI_Send(&val, 1, elem,
00318 partner, BCAST_TAG, MPI_COMM_WORLD);
00319 } else {
00320 MPI_Recv(&val, 1, elem,
00321 partner, BCAST_TAG, MPI_COMM_WORLD, &stat);
00322 }
00323 }
00324 mask >>= 1;
00325 }
00326 MPI_Type_free(&elem);
00327 delete order;
00328 return val;
00329 }
00330
00331
00332 template<class ArithT>
00333 static inline ArithT
00334 PxBcastValueMPI(ArithT val, int root)
00335 {
00336
00337
00338
00339
00340 MPI_Datatype elem;
00341 MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
00342 MPI_Type_commit(&elem);
00343 MPI_Bcast(&val, 1, elem, root, MPI_COMM_WORLD);
00344 MPI_Type_free(&elem);
00345 return val;
00346 }
00347
00348
00349 template<class ArithT>
00350 inline ArithT
00351 PxBcastValue(ArithT val, int root=0, int dtype=PX_MPI)
00352 {
00353
00354
00355
00356 switch (dtype) {
00357 case PX_OFT : return PxBcastValueOFT(val, root); break;
00358 case PX_SBT : return PxBcastValueSBT(val, root); break;
00359 case PX_MPI :
00360 default : return PxBcastValueMPI(val, root);
00361 }
00362 }
00363
00364
00365
00366
00367
00368
00369 template<class ArithT, class RedOpT>
00370 static inline void
00371 PxReduce(ArithT *in, ArithT *inout, int *len, MPI_Datatype *dptr)
00372 {
00373
00374
00375 RedOpT op;
00376 op.DoIt(*inout, *in);
00377 }
00378
00379
00380 template<class ArithT, class RedOpT>
00381 static inline ArithT
00382 PxReduceValueToRootOFT(ArithT val, RedOpT redOp, int root)
00383 {
00384
00385
00386 ArithT result = ArithT(val);
00387 MPI_Datatype elem;
00388 MPI_Status stat;
00389 MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
00390 MPI_Type_commit(&elem);
00391
00392 if (_myCPU == root) {
00393 for (int partner=0; partner<_nrCPUs; partner++) {
00394 if (partner != _myCPU) {
00395 ArithT recvVal;
00396 MPI_Recv(&recvVal, 1, elem,
00397 partner, RDUCE_TAG, MPI_COMM_WORLD, &stat);
00398 redOp.DoIt(result, recvVal);
00399 }
00400 }
00401 } else {
00402 MPI_Send(&val, 1, elem, root, RDUCE_TAG, MPI_COMM_WORLD);
00403 }
00404 MPI_Type_free(&elem);
00405 return result;
00406 }
00407
00408
00409 template<class ArithT, class RedOpT>
00410 static inline ArithT
00411 PxReduceValueToRootSBT(ArithT val, RedOpT redOp, int root)
00412 {
00413
00414
00415 int *order = new int[_maxCPUs], myIndex;
00416 PxGetSBTorder(root, _maxCPUs, order, &myIndex);
00417
00418
00419
00420 ArithT result = ArithT(val);
00421 MPI_Datatype elem;
00422 MPI_Status stat;
00423 MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
00424 MPI_Type_commit(&elem);
00425
00426 int mask = 1;
00427 for (int i=0; i<_logCPUs; i++) {
00428 int partnerIndex = myIndex ^ mask;
00429 int partner = order[partnerIndex];
00430 if ((myIndex % mask == 0) && (partner < _nrCPUs)) {
00431 if (myIndex > partnerIndex) {
00432 MPI_Send(&result, 1, elem,
00433 partner, RDUCE_TAG, MPI_COMM_WORLD);
00434 } else {
00435 ArithT recvVal;
00436 MPI_Recv(&recvVal, 1, elem,
00437 partner, RDUCE_TAG, MPI_COMM_WORLD, &stat);
00438 redOp.DoIt(result, recvVal);
00439 }
00440 }
00441 mask <<= 1;
00442 }
00443 MPI_Type_free(&elem);
00444 delete order;
00445 return result;
00446 }
00447
00448
template<class ArithT, class RedOpT>
static inline ArithT
PxReduceValueToRootMPI(ArithT val, RedOpT redOp, int root)
{
    // All-to-one reduction using MPI_Reduce with a user-defined
    // operation built from RedOpT. The 'redOp' argument itself is
    // unused here: the operation is carried by the RedOpT type via
    // PxReduce<ArithT, RedOpT>. Per MPI_Reduce semantics the result
    // buffer is significant at 'root' only; other CPUs return their
    // locally initialized NeutralElement() value.

    int commute = 1;    // RedOpT is assumed commutative — TODO confirm
    ArithT result = RedOpT::NeutralElement();
    MPI_Op operation;
    MPI_Datatype elem;
    MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
    MPI_Type_commit(&elem);

    // The typed reduction function is cast to MPI's generic
    // MPI_User_function signature; the layout matches because 'elem'
    // is exactly sizeof(ArithT) bytes.
    void (*reduceOp)(ArithT*, ArithT*, int*, MPI_Datatype*) =
                          PxReduce<ArithT, RedOpT>;
    MPI_Op_create((void (*)(void*, void*, int*, MPI_Datatype*))reduceOp,
                  commute, &operation);
    MPI_Reduce(&val, &result, 1, elem, operation, root, MPI_COMM_WORLD);
    MPI_Op_free(&operation);
    MPI_Type_free(&elem);
    return result;
}
00476
00477
00478 template<class ArithT, class RedOpT>
00479 inline ArithT
00480 PxReduceValueToRoot(ArithT val, RedOpT redOp, int root=0, int dtype=PX_MPI)
00481 {
00482
00483
00484
00485
00486 switch (dtype) {
00487 case PX_OFT : return PxReduceValueToRootOFT(val, redOp, root);
00488 break;
00489 case PX_SBT : return PxReduceValueToRootSBT(val, redOp, root);
00490 break;
00491 case PX_MPI :
00492 default : return PxReduceValueToRootMPI(val, redOp, root);
00493 }
00494 }
00495
00496
00497
00498
00499
00500
template<class ArithT, class RedOpT>
static inline ArithT
PxReduceValueToAllOFT(ArithT val, RedOpT redOp)
{
    // All-to-all reduction over One Flat Tree: reduce everything to
    // CPU 0 first, then broadcast the result back the same way.
    ArithT reduced = PxReduceValueToRootOFT(val, redOp, 0);
    return PxBcastValueOFT(reduced, 0);
}
00509
00510
template<class ArithT, class RedOpT>
static inline ArithT
PxReduceValueToAllSBT(ArithT val, RedOpT redOp)
{
    // All-to-all reduction over a Spanning Binomial Tree: reduce to
    // CPU 0 first, then broadcast the result back the same way.
    ArithT reduced = PxReduceValueToRootSBT(val, redOp, 0);
    return PxBcastValueSBT(reduced, 0);
}
00519
00520
template<class ArithT, class RedOpT>
static inline ArithT
PxReduceValueToAllMPI(ArithT val, RedOpT redOp)
{
    // All-to-all reduction using MPI_Allreduce with a user-defined
    // operation built from RedOpT. The 'redOp' argument itself is
    // unused here: the operation is carried by the RedOpT type via
    // PxReduce<ArithT, RedOpT>. Every CPU returns the full result.

    int commute = 1;    // RedOpT is assumed commutative — TODO confirm
    ArithT result = RedOpT::NeutralElement();
    MPI_Op operation;
    MPI_Datatype elem;
    MPI_Type_contiguous(sizeof(ArithT), MPI_BYTE, &elem);
    MPI_Type_commit(&elem);

    // The typed reduction function is cast to MPI's generic
    // MPI_User_function signature; the layout matches because 'elem'
    // is exactly sizeof(ArithT) bytes.
    void (*reduceOp)(ArithT*, ArithT*, int*, MPI_Datatype*) =
                          PxReduce<ArithT, RedOpT>;
    MPI_Op_create((void (*)(void*, void*, int*, MPI_Datatype*))reduceOp,
                  commute, &operation);
    MPI_Allreduce(&val, &result, 1, elem, operation, MPI_COMM_WORLD);
    MPI_Op_free(&operation);
    MPI_Type_free(&elem);
    return result;
}
00548
00549
00550 template<class ArithT, class RedOpT>
00551 inline ArithT
00552 PxReduceValueToAll(ArithT val, RedOpT redOp, int dtype=PX_MPI)
00553 {
00554
00555
00556
00557 switch (dtype) {
00558 case PX_OFT : return PxReduceValueToAllOFT(val, redOp); break;
00559 case PX_SBT : return PxReduceValueToAllSBT(val, redOp); break;
00560 case PX_MPI :
00561 default : return PxReduceValueToAllMPI(val, redOp);
00562 }
00563 }
00564
00565
00566
00567
00568
00569
template<class ArrayT>
static inline void
PxScatterArrayOFT(ArrayT* glob, ArrayT* loc, int root, bool bdata)
{
    // Scatter of the global array 'glob' (significant at 'root')
    // over One Flat Tree: the root sends each CPU its partition
    // directly as a strided 3-D MPI datatype; the root's own part is
    // copied locally. 'bdata' includes the border (halo) data.
    int eSize = ArrayT::ElemSize();
    int sSize = eSize*sizeof(typename ArrayT::StorType);
    MPI_Datatype elem, blk2d, blk3d;
    MPI_Status stat;
    // 'elem' is only used as a building block for the derived block
    // datatypes below, so it needs no MPI_Type_commit of its own.
    MPI_Type_contiguous(sSize, MPI_BYTE, &elem);

    if (_myCPU == root) {
        for (int partner=0; partner<_nrCPUs; partner++) {
            if (partner != _myCPU) {
                int xSize = PxLclWidth(partner);
                int ySize = PxLclHeight(partner);
                int zSize = PxLclDepth(partner);
                int offset = PxLclStart(partner, ArrayBW(glob),
                                        ArrayBH(glob), ArrayBD(glob));
                if (bdata) {
                    // Extend the block with the borders and step the
                    // start offset back by one full border offset.
                    xSize += 2*ArrayBW(glob);
                    ySize += 2*ArrayBH(glob);
                    zSize += 2*ArrayBD(glob);
                    offset -=
                        (ArrayW(glob)*ArrayH(glob)*ArrayBD(glob) +
                         ArrayW(glob)*ArrayBH(glob) + ArrayBW(glob));
                }
                if (xSize != 0 && ySize != 0 && zSize != 0) {
                    MPI_Type_vector(ySize, xSize,
                                    ArrayW(glob), elem, &blk2d);
                    MPI_Type_hvector(zSize, 1,
                                     ArrayW(glob)*ArrayH(glob)*sSize,
                                     blk2d, &blk3d);
                    MPI_Type_commit(&blk3d);
                    MPI_Send(ArrayPB(glob) + offset*eSize, 1, blk3d,
                             partner, SCAT_TAG, MPI_COMM_WORLD);
                    MPI_Type_free(&blk3d);
                    MPI_Type_free(&blk2d);
                }
            }
        }
        // The root's own partition is copied, not sent.
        PxLclArrayCopy(loc, FROM, glob, root, bdata);

    } else {
        // Non-root: receive the partition (optionally with borders)
        // straight into the local array using a matching datatype.
        int xSize = PxLclWidth(_myCPU);
        int ySize = PxLclHeight(_myCPU);
        int zSize = PxLclDepth(_myCPU);
        if (bdata) {
            xSize += 2*ArrayBW(glob);
            ySize += 2*ArrayBH(glob);
            zSize += 2*ArrayBD(glob);
        }
        if (xSize != 0 && ySize != 0 && zSize != 0) {
            MPI_Type_vector(ySize, xSize, ArrayW(loc), elem, &blk2d);
            MPI_Type_hvector(zSize, 1,
                             ArrayW(loc)*ArrayH(loc)*sSize, blk2d, &blk3d);
            MPI_Type_commit(&blk3d);
            MPI_Recv((bdata) ? ArrayPB(loc) : ArrayCPB(loc), 1,
                     blk3d, root, SCAT_TAG, MPI_COMM_WORLD, &stat);
            MPI_Type_free(&blk3d);
            MPI_Type_free(&blk2d);
        }
    }
    MPI_Type_free(&elem);
}
00636
00637
00638 template<class ArrayT>
00639 static inline void
00640 PxScatterArraySBT(ArrayT* glob, ArrayT* loc, int root, bool bdata)
00641 {
00642
00643
00644
00645
00646
00647
00648
00649
00650
00651 int *order = new int[_maxCPUs], myIndex, bufW, bufH, bufD;
00652 ArrayT *buf;
00653
00654
00655
00656 PxGetSBTorder(root, _maxCPUs, order, &myIndex);
00657
00658
00659
00660 if (_myCPU == root) {
00661 bufW = ArrayW(glob);
00662 bufH = ArrayH(glob);
00663 bufD = ArrayD(glob);
00664 buf = glob;
00665 } else {
00666 bufW = PxLineRange(1, myIndex, order);
00667 bufH = PxLineRange(2, myIndex, order);
00668 bufD = PxLineRange(3, myIndex, order);
00669 buf = PxArrayCreate<ArrayT>(bufW, bufH, bufD, ArrayBW(glob),
00670 ArrayBH(glob), ArrayBD(glob));
00671 bufW += 2*ArrayBW(glob);
00672 bufH += 2*ArrayBH(glob);
00673 bufD += 2*ArrayBD(glob);
00674 }
00675
00676
00677
00678 int eSize = ArrayT::ElemSize();
00679 int sSize = eSize*sizeof(typename ArrayT::StorType);
00680 MPI_Datatype elem, blk2d, blk3d;
00681 MPI_Status stat;
00682 MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
00683
00684 int mask = 1 << (_logCPUs-1);
00685 for (int i=0; i<_logCPUs; i++) {
00686 int partnerIndex = myIndex ^ mask;
00687 int partner = order[partnerIndex];
00688 if ((myIndex % mask == 0) && (partner < _nrCPUs)) {
00689 if (myIndex < partnerIndex) {
00690 int xSize = PxLineRange(1, partnerIndex, order);
00691 int ySize = PxLineRange(2, partnerIndex, order);
00692 int zSize = PxLineRange(3, partnerIndex, order);
00693 int offset = PxLclStart(partner, ArrayBW(glob),
00694 ArrayBH(glob), ArrayBD(glob));
00695 if (_myCPU != root) {
00696 offset -= (PxLclStart(_myCPU, ArrayBW(glob),
00697 ArrayBH(glob), ArrayBD(glob)) -
00698 (bufW*bufH*ArrayBD(glob) +
00699 bufW*ArrayBH(glob) + ArrayBW(glob)));
00700 }
00701 if (bdata) {
00702 xSize += 2*ArrayBW(glob);
00703 ySize += 2*ArrayBH(glob);
00704 zSize += 2*ArrayBD(glob);
00705 offset -= (bufW*bufH*ArrayBD(glob) +
00706 bufW*ArrayBH(glob) + ArrayBW(glob));
00707 }
00708 if (xSize != 0 && ySize != 0 && zSize != 0) {
00709 MPI_Type_vector(ySize, xSize, bufW, elem, &blk2d);
00710 MPI_Type_hvector(zSize, 1,
00711 bufW*bufH*sSize, blk2d, &blk3d);
00712 MPI_Type_commit(&blk3d);
00713 MPI_Send(ArrayPB(buf) + offset*eSize, 1, blk3d,
00714 partner, SCAT_TAG, MPI_COMM_WORLD);
00715 MPI_Type_free(&blk3d);
00716 MPI_Type_free(&blk2d);
00717 }
00718 } else {
00719 int xSize = PxLineRange(1, myIndex, order);
00720 int ySize = PxLineRange(2, myIndex, order);
00721 int zSize = PxLineRange(3, myIndex, order);
00722 if (bdata) {
00723 xSize += 2*ArrayBW(glob);
00724 ySize += 2*ArrayBH(glob);
00725 zSize += 2*ArrayBD(glob);
00726 }
00727 if (xSize != 0 && ySize != 0 && zSize != 0) {
00728 MPI_Type_vector(ySize, xSize, bufW, elem, &blk2d);
00729 MPI_Type_hvector(zSize, 1,
00730 bufW*bufH*sSize, blk2d, &blk3d);
00731 MPI_Type_commit(&blk3d);
00732 MPI_Recv((bdata) ? ArrayPB(buf) : ArrayCPB(buf),
00733 1, blk3d, partner, SCAT_TAG, MPI_COMM_WORLD,
00734 &stat);
00735 MPI_Type_free(&blk3d);
00736 MPI_Type_free(&blk2d);
00737 }
00738 }
00739 }
00740 mask >>= 1;
00741 }
00742 MPI_Type_free(&elem);
00743 PxLclArrayCopy(loc, FROM, buf, root, bdata);
00744 if (_myCPU != root) {
00745 delete buf;
00746 }
00747 delete order;
00748 }
00749
00750
00751 template<class ArrayT>
00752 static inline void
00753 PxScatterArrayMPI(ArrayT* glob, ArrayT* loc, int root, int bdata=true)
00754 {
00755
00756
00757
00758
00759
00760
00761
00762
00763
00764
00765
00766
00767
00768
00769
00770
00771
00772 int *counts, *disps;
00773
00774
00775
00776 if (_myCPU == root) {
00777 counts = new int[_nrCPUs];
00778 disps = new int[_nrCPUs];
00779 int slice = 2*((ArrayW(glob)*ArrayH(glob)*ArrayBD(glob)) +
00780 (ArrayW(glob)*ArrayBH(glob)));
00781 for (int i=0; i<_nrCPUs; i++) {
00782 counts[i] = (PxLclWidth(i) + 2*ArrayBW(glob)) *
00783 (PxLclHeight(i) + 2*ArrayBH(glob)) *
00784 (PxLclDepth(i) + 2*ArrayBD(glob));
00785 disps[i] = (i==0) ? 0 : disps[i-1] + counts[i-1] - slice;
00786 }
00787 }
00788
00789
00790
00791 int sSize = ArrayT::ElemSize()*sizeof(typename ArrayT::StorType);
00792 MPI_Datatype elem;
00793 MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
00794 MPI_Type_commit(&elem);
00795 MPI_Scatterv(ArrayPB(glob), counts, disps, elem, ArrayPB(loc),
00796 ArrayW(loc)*ArrayH(loc)*ArrayD(loc), elem, root,
00797 MPI_COMM_WORLD);
00798 MPI_Type_free(&elem);
00799
00800 if (_myCPU == root) {
00801 delete counts;
00802 delete disps;
00803 }
00804 }
00805
00806
00807 template<class ArrayT>
00808 inline void
00809 PxScatterArray(ArrayT* glob, ArrayT* loc,
00810 int root=0, int dtype=PX_SBT, bool bdata=false)
00811 {
00812
00813
00814
00815
00816 if (dp) PX_COUT << "Scatter: " << "NrCPUs = " << PxNrCPUs() << PX_ENDL;
00817
00818 switch (dtype) {
00819 case PX_MPI : PxScatterArrayMPI(glob, loc, root, true ); break;
00820 case PX_OFT : PxScatterArrayOFT(glob, loc, root, bdata); break;
00821 case PX_SBT :
00822 default : PxScatterArraySBT(glob, loc, root, bdata);
00823 }
00824 }
00825
00826
00827
00828
00829
00830
template<class ArrayT>
static inline void
PxGatherArrayOFT(ArrayT* glob, ArrayT* loc, int root, bool bdata)
{
    // Gather of all local partitions into the global array 'glob'
    // (significant at 'root') over One Flat Tree — the inverse of
    // PxScatterArrayOFT: every CPU sends its partition straight to
    // the root, which receives each into the right position using a
    // strided 3-D MPI datatype. 'bdata' includes the borders.
    int eSize = ArrayT::ElemSize();
    int sSize = eSize*sizeof(typename ArrayT::StorType);
    MPI_Datatype elem, blk2d, blk3d;
    MPI_Status stat;
    // 'elem' is only used as a building block for the derived block
    // datatypes below, so it needs no MPI_Type_commit of its own.
    MPI_Type_contiguous(sSize, MPI_BYTE, &elem);

    if (_myCPU == root) {
        // The root's own partition is copied, not received.
        PxLclArrayCopy(loc, TO, glob, root, bdata);
        for (int partner=0; partner<_nrCPUs; partner++) {
            if (partner != _myCPU) {
                int xSize = PxLclWidth(partner);
                int ySize = PxLclHeight(partner);
                int zSize = PxLclDepth(partner);
                int offset = PxLclStart(partner,ArrayBW(glob),
                                        ArrayBH(glob), ArrayBD(glob));
                if (bdata) {
                    // Extend the block with the borders and step the
                    // start offset back by one full border offset.
                    xSize += 2*ArrayBW(glob);
                    ySize += 2*ArrayBH(glob);
                    zSize += 2*ArrayBD(glob);
                    offset -=
                        (ArrayW(glob)*ArrayH(glob)*ArrayBD(glob) +
                         ArrayW(glob)*ArrayBH(glob) + ArrayBW(glob));
                }
                if (xSize != 0 && ySize != 0 && zSize != 0) {
                    MPI_Type_vector(ySize, xSize,
                                    ArrayW(glob), elem, &blk2d);
                    MPI_Type_hvector(zSize, 1,
                                     ArrayW(glob)*ArrayH(glob)*sSize,
                                     blk2d, &blk3d);
                    MPI_Type_commit(&blk3d);
                    MPI_Recv(ArrayPB(glob) + offset*eSize, 1, blk3d,
                             partner, GATH_TAG, MPI_COMM_WORLD, &stat);
                    MPI_Type_free(&blk3d);
                    MPI_Type_free(&blk2d);
                }
            }
        }
    } else {
        // Non-root: send the partition (optionally with borders)
        // straight from the local array using a matching datatype.
        int xSize = PxLclWidth(_myCPU);
        int ySize = PxLclHeight(_myCPU);
        int zSize = PxLclDepth(_myCPU);
        if (bdata) {
            xSize += 2*ArrayBW(glob);
            ySize += 2*ArrayBH(glob);
            zSize += 2*ArrayBD(glob);
        }
        if (xSize != 0 && ySize != 0 && zSize != 0) {
            MPI_Type_vector(ySize, xSize, ArrayW(loc), elem, &blk2d);
            MPI_Type_hvector(zSize, 1,
                             ArrayW(loc)*ArrayH(loc)*sSize, blk2d, &blk3d);
            MPI_Type_commit(&blk3d);
            MPI_Send((bdata) ? ArrayPB(loc) : ArrayCPB(loc),
                     1, blk3d, root, GATH_TAG, MPI_COMM_WORLD);
            MPI_Type_free(&blk3d);
            MPI_Type_free(&blk2d);
        }
    }
    MPI_Type_free(&elem);
}
00896
00897
00898 template<class ArrayT>
00899 static inline void
00900 PxGatherArraySBT(ArrayT* glob, ArrayT* loc, int root, bool bdata)
00901 {
00902
00903
00904
00905
00906
00907
00908
00909
00910
00911 int *order = new int[_maxCPUs], myIndex, bufW, bufH, bufD;
00912 ArrayT *buf;
00913
00914
00915
00916 PxGetSBTorder(root, _maxCPUs, order, &myIndex);
00917
00918
00919
00920 if (_myCPU == root) {
00921 bufW = ArrayW(glob);
00922 bufH = ArrayH(glob);
00923 bufD = ArrayD(glob);
00924 buf = glob;
00925 } else {
00926 bufW = PxLineRange(1, myIndex, order);
00927 bufH = PxLineRange(2, myIndex, order);
00928 bufD = PxLineRange(3, myIndex, order);
00929 buf = PxArrayCreate<ArrayT>(bufW, bufH, bufD, ArrayBW(glob),
00930 ArrayBH(glob), ArrayBD(glob));
00931 bufW += 2*ArrayBW(glob);
00932 bufH += 2*ArrayBH(glob);
00933 bufD += 2*ArrayBD(glob);
00934 }
00935 PxLclArrayCopy(loc, TO, buf, root, bdata);
00936
00937
00938
00939 int eSize = ArrayT::ElemSize();
00940 int sSize = eSize*sizeof(typename ArrayT::StorType);
00941 MPI_Datatype elem, blk2d, blk3d;
00942 MPI_Status stat;
00943 MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
00944
00945 int mask = 1;
00946 for (int i=0; i<_logCPUs; i++) {
00947 int partnerIndex = myIndex ^ mask;
00948 int partner = order[partnerIndex];
00949 if ((myIndex % mask == 0) && (partner < _nrCPUs)) {
00950 if (myIndex > partnerIndex) {
00951 int xSize = PxLineRange(1, myIndex, order);
00952 int ySize = PxLineRange(2, myIndex, order);
00953 int zSize = PxLineRange(3, myIndex, order);
00954 if (bdata) {
00955 xSize += 2*ArrayBW(glob);
00956 ySize += 2*ArrayBH(glob);
00957 zSize += 2*ArrayBD(glob);
00958 }
00959 if (xSize != 0 && ySize != 0 && zSize != 0) {
00960 MPI_Type_vector(ySize, xSize, bufW, elem, &blk2d);
00961 MPI_Type_hvector(zSize, 1,
00962 bufW*bufH*sSize, blk2d, &blk3d);
00963 MPI_Type_commit(&blk3d);
00964 MPI_Send((bdata) ? ArrayPB(buf) : ArrayCPB(buf),
00965 1, blk3d, partner, GATH_TAG, MPI_COMM_WORLD);
00966 MPI_Type_free(&blk3d);
00967 MPI_Type_free(&blk2d);
00968 }
00969 } else {
00970 int xSize = PxLineRange(1, partnerIndex, order);
00971 int ySize = PxLineRange(2, partnerIndex, order);
00972 int zSize = PxLineRange(3, partnerIndex, order);
00973 int offset = PxLclStart(partner, ArrayBW(glob),
00974 ArrayBH(glob), ArrayBD(glob));
00975 if (_myCPU != root) {
00976 offset -= (PxLclStart(_myCPU, ArrayBW(glob),
00977 ArrayBH(glob), ArrayBD(glob)) -
00978 (bufW*bufH*ArrayBD(glob) +
00979 bufW*ArrayBH(glob) + ArrayBW(glob)));
00980 }
00981 if (bdata) {
00982 xSize += 2*ArrayBW(glob);
00983 ySize += 2*ArrayBH(glob);
00984 zSize += 2*ArrayBD(glob);
00985 offset -= (bufW*bufH*ArrayBD(glob) +
00986 bufW*ArrayBH(glob) + ArrayBW(glob));
00987 }
00988 if (xSize != 0 && ySize != 0 && zSize != 0) {
00989 MPI_Type_vector(ySize, xSize, bufW, elem, &blk2d);
00990 MPI_Type_hvector(zSize, 1,
00991 bufW*bufH*sSize, blk2d, &blk3d);
00992 MPI_Type_commit(&blk3d);
00993 MPI_Recv(ArrayPB(buf) + offset*eSize, 1, blk3d,
00994 partner, GATH_TAG, MPI_COMM_WORLD, &stat);
00995 MPI_Type_free(&blk3d);
00996 MPI_Type_free(&blk2d);
00997 }
00998 }
00999 }
01000 mask <<= 1;
01001 }
01002 MPI_Type_free(&elem);
01003 if (_myCPU != root) {
01004 delete buf;
01005 }
01006 delete order;
01007 }
01008
01009
01010 template<class ArrayT>
01011 static inline void
01012 PxGatherArrayMPI(ArrayT* glob, ArrayT* loc, int root, bool bdata=true)
01013 {
01014
01015
01016
01017
01018
01019
01020
01021
01022
01023
01024
01025
01026
01027
01028
01029
01030
01031 int *counts, *disps;
01032
01033
01034
01035 if (_myCPU == root) {
01036 counts = new int[_nrCPUs];
01037 disps = new int[_nrCPUs];
01038 int slice = 2*((ArrayW(glob)*ArrayH(glob)*ArrayBD(glob)) +
01039 (ArrayW(glob)*ArrayBH(glob)));
01040 for (int i=0; i<_nrCPUs; i++) {
01041 counts[i] = (PxLclWidth(i) + 2*ArrayBW(glob)) *
01042 (PxLclHeight(i) + 2*ArrayBH(glob)) *
01043 (PxLclDepth(i) + 2*ArrayBD(glob));
01044 disps[i] = (i==0) ? 0 : disps[i-1] + counts[i-1] - slice;
01045 }
01046 }
01047
01048
01049
01050 int sSize = ArrayT::ElemSize()*sizeof(typename ArrayT::StorType);
01051 MPI_Datatype elem;
01052 MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
01053 MPI_Type_commit(&elem);
01054 MPI_Gatherv(ArrayPB(loc),
01055 ArrayW(loc)*ArrayH(loc)*ArrayD(loc), elem,
01056 ArrayPB(glob), counts, disps, elem, root,
01057 MPI_COMM_WORLD);
01058 MPI_Type_free(&elem);
01059
01060 if (_myCPU == root) {
01061 delete counts;
01062 delete disps;
01063 }
01064 }
01065
01066
01067 template<class ArrayT>
01068 inline void
01069 PxGatherArray(ArrayT* glob, ArrayT* loc,
01070 int root=0, int dtype=PX_SBT, bool bdata=false)
01071 {
01072
01073
01074
01075
01076 if (dp) PX_COUT << "Gather..." << PX_ENDL;
01077
01078 switch (dtype) {
01079 case PX_MPI : PxGatherArrayMPI(glob, loc, root, true ); break;
01080 case PX_OFT : PxGatherArrayOFT(glob, loc, root, bdata); break;
01081 case PX_SBT :
01082 default : PxGatherArraySBT(glob, loc, root, bdata);
01083 }
01084 }
01085
01086
01087
01088
01089
01090
template<class ArrayT>
static inline void
PxBcastArrayOFT(ArrayT* a, int root, bool bdata)
{
    // Broadcast of the full array 'a' from 'root' over One Flat
    // Tree: the root sends to every other CPU directly. With 'bdata'
    // the whole array including borders is sent as one contiguous
    // run of elements; otherwise a strided 3-D datatype covering
    // only the interior is used.
    int sSize = ArrayT::ElemSize()*sizeof(typename ArrayT::StorType);
    MPI_Datatype elem, blk2d, blk3d;
    MPI_Status stat;
    MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
    MPI_Type_commit(&elem);

    // Interior-only transfers need the strided 3-D block type;
    // blk2d is only a building block and is not committed itself.
    if (!bdata) {
        MPI_Type_vector(ArrayCH(a),
                        ArrayCW(a), ArrayW(a), elem, &blk2d);
        MPI_Type_hvector(ArrayCD(a), 1,
                         ArrayW(a)*ArrayH(a)*sSize, blk2d, &blk3d);
        MPI_Type_commit(&blk3d);
    }
    if (_myCPU == root) {
        for (int partner=0; partner<_nrCPUs; partner++) {
            if (partner != _myCPU) {
                if (!bdata) {
                    MPI_Send(ArrayCPB(a), 1,
                             blk3d, partner, BCAST_TAG, MPI_COMM_WORLD);
                } else {
                    MPI_Send(ArrayPB(a),
                             ArrayW(a)*ArrayH(a)*ArrayD(a),
                             elem, partner, BCAST_TAG, MPI_COMM_WORLD);
                }
            }
        }
    } else {
        if (!bdata) {
            MPI_Recv(ArrayCPB(a), 1, blk3d, root,
                     BCAST_TAG, MPI_COMM_WORLD, &stat);
        } else {
            MPI_Recv(ArrayPB(a), ArrayW(a)*ArrayH(a)*ArrayD(a),
                     elem, root, BCAST_TAG, MPI_COMM_WORLD, &stat);
        }
    }
    if (!bdata) {
        MPI_Type_free(&blk3d);
        MPI_Type_free(&blk2d);
    }
    MPI_Type_free(&elem);
}
01138
01139
01140 template<class ArrayT>
01141 static inline void
01142 PxBcastArraySBT(ArrayT* a, int root, bool bdata)
01143 {
01144
01145
01146 int *order = new int[_maxCPUs], myIndex;
01147 PxGetSBTorder(root, _maxCPUs, order, &myIndex);
01148
01149
01150
01151 int sSize = ArrayT::ElemSize()*sizeof(typename ArrayT::StorType);
01152 MPI_Datatype elem, blk2d, blk3d;
01153 MPI_Status stat;
01154 MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
01155 MPI_Type_commit(&elem);
01156
01157 if (!bdata) {
01158 MPI_Type_vector(ArrayCH(a),
01159 ArrayCW(a), ArrayW(a), elem, &blk2d);
01160 MPI_Type_hvector(ArrayCD(a), 1,
01161 ArrayW(a)*ArrayH(a)*sSize, blk2d, &blk3d);
01162 MPI_Type_commit(&blk3d);
01163 }
01164 int mask = 1 << (_logCPUs-1);
01165 for (int i=0; i<_logCPUs; i++) {
01166 int partnerIndex = myIndex ^ mask;
01167 int partner = order[partnerIndex];
01168 if ((myIndex % mask == 0) && (partner < _nrCPUs)) {
01169 if (myIndex < partnerIndex) {
01170 if (!bdata) {
01171 MPI_Send(ArrayCPB(a), 1, blk3d,
01172 partner, BCAST_TAG, MPI_COMM_WORLD);
01173 } else {
01174 MPI_Send(ArrayPB(a),
01175 ArrayW(a)*ArrayH(a)*ArrayD(a), elem,
01176 partner, BCAST_TAG, MPI_COMM_WORLD);
01177 }
01178 } else {
01179 if (!bdata) {
01180 MPI_Recv(ArrayCPB(a), 1, blk3d, partner,
01181 BCAST_TAG, MPI_COMM_WORLD, &stat);
01182 } else {
01183 MPI_Recv(ArrayPB(a),
01184 ArrayW(a)*ArrayH(a)*ArrayD(a), elem,
01185 partner, BCAST_TAG, MPI_COMM_WORLD, &stat);
01186 }
01187 }
01188 }
01189 mask >>= 1;
01190 }
01191 if (!bdata) {
01192 MPI_Type_free(&blk3d);
01193 MPI_Type_free(&blk2d);
01194 }
01195 MPI_Type_free(&elem);
01196 delete order;
01197 }
01198
01199
01200 template<class ArrayT>
01201 static inline void
01202 PxBcastArrayMPI(ArrayT* a, int root, bool bdata)
01203 {
01204
01205
01206
01207
01208 int sSize = ArrayT::ElemSize()*sizeof(typename ArrayT::StorType);
01209 MPI_Datatype elem, blk2d, blk3d;
01210 MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
01211 MPI_Type_commit(&elem);
01212
01213 if (!bdata) {
01214 MPI_Type_vector(ArrayCH(a),
01215 ArrayCW(a), ArrayW(a), elem, &blk2d);
01216 MPI_Type_hvector(ArrayCD(a), 1,
01217 ArrayW(a)*ArrayH(a)*sSize, blk2d, &blk3d);
01218 MPI_Type_commit(&blk3d);
01219 MPI_Bcast(ArrayCPB(a), 1, blk3d, root, MPI_COMM_WORLD);
01220 MPI_Type_free(&blk3d);
01221 MPI_Type_free(&blk2d);
01222 } else {
01223 MPI_Bcast(ArrayPB(a), ArrayW(a)*ArrayH(a)*ArrayD(a),
01224 elem, root, MPI_COMM_WORLD);
01225 }
01226 MPI_Type_free(&elem);
01227 }
01228
01229
01230 template<class ArrayT>
01231 inline void
01232 PxBcastArray(ArrayT* a, int root=0, int dtype=PX_MPI, bool bdata=false)
01233 {
01234
01235
01236
01237
01238 if (dp) PX_COUT << "Bcast: " << "NrCPUs = " << PxNrCPUs() << PX_ENDL;
01239
01240 switch (dtype) {
01241 case PX_OFT : PxBcastArrayOFT(a, root, bdata); break;
01242 case PX_SBT : PxBcastArraySBT(a, root, bdata); break;
01243 case PX_MPI :
01244 default : PxBcastArrayMPI(a, root, bdata);
01245 }
01246 }
01247
01248
01249
01250
01251
01252
01253 template<class ArrayT>
01254 inline void
01255 PxRedistArray(ArrayT** a,
01256 int xcpus, int ycpus, int zcpus, bool bdata=false)
01257 {
01258
01259
01260
01261
01262
01263
01264
01265
01266 int bw = ArrayBW(*a), bh = ArrayBH(*a), bd = ArrayBD(*a);
01267
01268
01269
01270 int* oldStartX = new int[_nrCPUs];
01271 int* oldStartY = new int[_nrCPUs];
01272 int* oldStartZ = new int[_nrCPUs];
01273 int* oldEndX = new int[_nrCPUs];
01274 int* oldEndY = new int[_nrCPUs];
01275 int* oldEndZ = new int[_nrCPUs];
01276
01277 int i;
01278 for (i=0; i<_nrCPUs; i++) {
01279 oldStartX[i] = PxLclStartX(i);
01280 oldStartY[i] = PxLclStartY(i);
01281 oldStartZ[i] = PxLclStartZ(i);
01282 oldEndX[i] = oldStartX[i] + PxLclWidth(i) - 1;
01283 oldEndY[i] = oldStartY[i] + PxLclHeight(i) - 1;
01284 oldEndZ[i] = oldStartZ[i] + PxLclDepth(i) - 1;
01285 if (bdata) {
01286 oldEndX[i] += 2*bw;
01287 oldEndY[i] += 2*bh;
01288 oldEndZ[i] += 2*bd;
01289 }
01290 }
01291
01292
01293
01294 PxRePartition(xcpus, ycpus, zcpus);
01295
01296
01297
01298 int* newStartX = new int[_nrCPUs];
01299 int* newStartY = new int[_nrCPUs];
01300 int* newStartZ = new int[_nrCPUs];
01301 int* newEndX = new int[_nrCPUs];
01302 int* newEndY = new int[_nrCPUs];
01303 int* newEndZ = new int[_nrCPUs];
01304
01305 for (i=0; i<_nrCPUs; i++) {
01306 newStartX[i] = PxLclStartX(i);
01307 newStartY[i] = PxLclStartY(i);
01308 newStartZ[i] = PxLclStartZ(i);
01309 newEndX[i] = newStartX[i] + PxLclWidth(i) - 1;
01310 newEndY[i] = newStartY[i] + PxLclHeight(i) - 1;
01311 newEndZ[i] = newStartZ[i] + PxLclDepth(i) - 1;
01312 if (bdata) {
01313 newEndX[i] += 2*bw;
01314 newEndY[i] += 2*bh;
01315 newEndZ[i] += 2*bd;
01316 }
01317 }
01318 int newW = PxLclWidth(_myCPU);
01319 int newH = PxLclHeight(_myCPU);
01320 int newD = PxLclDepth(_myCPU);
01321
01322
01323
01324 ArrayT* nA = PxArrayCreate<ArrayT>(newW, newH, newD, bw, bh, bd);
01325 typedef typename ArrayT::StorType storT;
01326 int eSize = ArrayT::ElemSize();
01327 int sSize = eSize*sizeof(storT);
01328 int mpibuffsize = sizeof(MPI_BYTE) * sSize *
01329 ArrayW(nA) * ArrayH(nA) * ArrayD(nA);
01330
01331 char *mpibuff = new char[mpibuffsize];
01332 MPI_Buffer_attach(mpibuff, mpibuffsize);
01333
01334
01335
01336 MPI_Datatype elem, blk2d, blk3d;
01337 MPI_Status stat;
01338 MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
01339
01340 for (i=0; i<_nrCPUs; i++) {
01341
01342 int partner = (_nrCPUs - _myCPU + i) % _nrCPUs;
01343
01344
01345
01346 int maxStartX = (oldStartX[_myCPU] > newStartX[partner]) ?
01347 oldStartX[_myCPU] : newStartX[partner];
01348 int maxStartY = (oldStartY[_myCPU] > newStartY[partner]) ?
01349 oldStartY[_myCPU] : newStartY[partner];
01350 int maxStartZ = (oldStartZ[_myCPU] > newStartZ[partner]) ?
01351 oldStartZ[_myCPU] : newStartZ[partner];
01352 int minEndX = (oldEndX[_myCPU] < newEndX[partner]) ?
01353 oldEndX[_myCPU] : newEndX[partner];
01354 int minEndY = (oldEndY[_myCPU] < newEndY[partner]) ?
01355 oldEndY[_myCPU] : newEndY[partner];
01356 int minEndZ = (oldEndZ[_myCPU] < newEndZ[partner]) ?
01357 oldEndZ[_myCPU] : newEndZ[partner];
01358 int xSize = minEndX - maxStartX + 1;
01359 int ySize = minEndY - maxStartY + 1;
01360 int zSize = minEndZ - maxStartZ + 1;
01361
01362 if (xSize > 0 && ySize > 0 && zSize > 0) {
01363
01364 int offset = eSize *
01365 ((maxStartZ-oldStartZ[_myCPU]) * ArrayW(*a)*ArrayH(*a) +
01366 (maxStartY-oldStartY[_myCPU]) * ArrayW(*a) +
01367 (maxStartX-oldStartX[_myCPU]));
01368
01369 MPI_Type_vector(ySize, xSize, ArrayW(*a), elem, &blk2d);
01370 MPI_Type_hvector(zSize, 1,
01371 ArrayW(*a)*ArrayH(*a)*sSize, blk2d, &blk3d);
01372 MPI_Type_commit(&blk3d);
01373 MPI_Bsend((bdata) ?
01374 ArrayPB(*a) + offset : ArrayCPB(*a) + offset,
01375 1, blk3d, partner, RDIST_TAG, MPI_COMM_WORLD);
01376 MPI_Type_free(&blk3d);
01377 MPI_Type_free(&blk2d);
01378 }
01379
01380
01381
01382 maxStartX = (oldStartX[partner] > newStartX[_myCPU]) ?
01383 oldStartX[partner] : newStartX[_myCPU];
01384 maxStartY = (oldStartY[partner] > newStartY[_myCPU]) ?
01385 oldStartY[partner] : newStartY[_myCPU];
01386 maxStartZ = (oldStartZ[partner] > newStartZ[_myCPU]) ?
01387 oldStartZ[partner] : newStartZ[_myCPU];
01388 minEndX = (oldEndX[partner] < newEndX[_myCPU]) ?
01389 oldEndX[partner] : newEndX[_myCPU];
01390 minEndY = (oldEndY[partner] < newEndY[_myCPU]) ?
01391 oldEndY[partner] : newEndY[_myCPU];
01392 minEndZ = (oldEndZ[partner] < newEndZ[_myCPU]) ?
01393 oldEndZ[partner] : newEndZ[_myCPU];
01394 xSize = minEndX - maxStartX + 1;
01395 ySize = minEndY - maxStartY + 1;
01396 zSize = minEndZ - maxStartZ + 1;
01397
01398 if (xSize > 0 && ySize > 0 && zSize > 0) {
01399
01400 int offset = eSize *
01401 ((maxStartZ-newStartZ[_myCPU]) * ArrayW(nA)*ArrayH(nA) +
01402 (maxStartY-newStartY[_myCPU]) * ArrayW(nA) +
01403 (maxStartX-newStartX[_myCPU]));
01404
01405 MPI_Type_vector(ySize, xSize, ArrayW(nA), elem, &blk2d);
01406 MPI_Type_hvector(zSize, 1,
01407 ArrayW(nA)*ArrayH(nA)*sSize, blk2d, &blk3d);
01408 MPI_Type_commit(&blk3d);
01409 MPI_Recv((bdata) ?
01410 ArrayPB(nA) + offset : ArrayCPB(nA) + offset, 1,
01411 blk3d, partner, RDIST_TAG, MPI_COMM_WORLD, &stat);
01412 MPI_Type_free(&blk3d);
01413 MPI_Type_free(&blk2d);
01414 }
01415 }
01416
01417
01418
01419 delete oldStartX;
01420 delete oldStartY;
01421 delete oldStartZ;
01422 delete oldEndX;
01423 delete oldEndY;
01424 delete oldEndZ;
01425 delete newStartX;
01426 delete newStartY;
01427 delete newStartZ;
01428 delete newEndX;
01429 delete newEndY;
01430 delete newEndZ;
01431 MPI_Type_free(&elem);
01432 MPI_Buffer_detach(&mpibuff, &mpibuffsize);
01433 delete mpibuff;
01434 delete *a;
01435 *a = nA;
01436 }
01437
01438
01439
01440
01441
01442
template<class ArrayT>
inline void
PxBorderExchange(ArrayT* a, int divide, int extent)
{
    // Exchanges border (ghost) data of width 'extent' with the two
    // neighbouring CPUs across the given partition plane ('divide'):
    // YZ_PLANE <-> left/right neighbours, XZ_PLANE <-> up/down,
    // XY_PLANE <-> front/back. A neighbour id of -1 means "no
    // neighbour" (global domain boundary); that side is skipped.
    int tw = ArrayW(a), th = ArrayH(a), td = ArrayD(a);
    int bw = ArrayBW(a), bh = ArrayBH(a), bd = ArrayBD(a);
    int cw = ArrayCW(a), ch = ArrayCH(a), cd = ArrayCD(a);

    typedef typename ArrayT::StorType storT;
    storT *ptr1, *ptr2, *ptr3, *ptr4;
    int part1, part2, xSize, ySize, zSize;
    int eSize = ArrayT::ElemSize();

    // Per plane the four pointers select (in elements, scaled by eSize):
    //   ptr1 = low-side interior slab  -> sent to part1;
    //   ptr2 = high-side border region <- received from part2;
    //   ptr3 = high-side interior slab -> sent to part2;
    //   ptr4 = low-side border region  <- received from part1.
    // Note the XZ/XY cases span the FULL width (and height) including
    // borders, while YZ restricts to the interior height/depth --
    // presumably so corner data propagates across successive
    // exchanges; confirm against the partitioning scheme.
    switch (divide) {
    case YZ_PLANE:
        part1 = PxMyLeftCPU();
        part2 = PxMyRightCPU();
        xSize = extent;
        ySize = ch;
        zSize = cd;
        ptr1 = ArrayPB(a) + (bd*tw*th+bh*tw+bw) * eSize;
        ptr2 = ArrayPB(a) + (bd*tw*th+bh*tw+bw+cw) * eSize;
        ptr3 = ArrayPB(a) + (bd*tw*th+bh*tw+bw+cw-xSize) * eSize;
        ptr4 = ArrayPB(a) + (bd*tw*th+bh*tw+bw-xSize) * eSize;
        break;

    case XZ_PLANE:
        part1 = PxMyUpCPU();
        part2 = PxMyDownCPU();
        xSize = tw;
        ySize = extent;
        zSize = cd;
        ptr1 = ArrayPB(a) + (bd*tw*th + bh*tw) * eSize;
        ptr2 = ArrayPB(a) + (bd*tw*th + (bh+ch)*tw) * eSize;
        ptr3 = ArrayPB(a) + (bd*tw*th + (bh+ch-ySize)*tw) * eSize;
        ptr4 = ArrayPB(a) + (bd*tw*th + (bh-ySize)*tw) * eSize;
        break;

    case XY_PLANE:
        part1 = PxMyFrontCPU();
        part2 = PxMyBackCPU();
        xSize = tw;
        ySize = th;
        zSize = extent;
        ptr1 = ArrayPB(a) + (bd*tw*th) * eSize;
        ptr2 = ArrayPB(a) + ((bd+cd)*tw*th) * eSize;
        ptr3 = ArrayPB(a) + ((bd+cd-zSize)*tw*th) * eSize;
        ptr4 = ArrayPB(a) + ((bd-zSize)*tw*th) * eSize;
        break;

    default:
        // Unknown plane: nothing to exchange.
        return;
    }

    // Degenerate border: nothing to transfer.
    if ((xSize == 0) || (ySize == 0) || (zSize == 0)) {
        return;
    }

    // One strided 3-D sub-block datatype, reused for all four transfers
    // (send and receive regions have identical shape).
    MPI_Datatype elem, blk2d, blk3d;
    MPI_Status stat;
    int sSize = eSize*sizeof(typename ArrayT::StorType);
    MPI_Type_contiguous(sSize, MPI_BYTE, &elem);
    MPI_Type_vector(ySize, xSize, tw, elem, &blk2d);
    MPI_Type_hvector(zSize, 1, tw*th*sSize, blk2d, &blk3d);
    MPI_Type_commit(&blk3d);

    // Phase 1: push low-side data towards part1, receive into the
    // high-side border from part2; Phase 2: the reverse direction.
    // NOTE(review): this uses blocking MPI_Send in a chain; the chain
    // is broken where part1 == -1 (domain edge), which is what lets
    // the sends complete -- confirm for periodic/cyclic partitionings.
    if (part1 != -1) {
        MPI_Send(ptr1, 1, blk3d, part1, BOR_TAG, MPI_COMM_WORLD);
    }
    if (part2 != -1) {
        MPI_Recv(ptr2, 1, blk3d, part2, BOR_TAG, MPI_COMM_WORLD, &stat);

        // Reverse direction: only a CPU that has a part2 neighbour
        // sends back, after its own receive has completed.
        MPI_Send(ptr3, 1, blk3d, part2, BOR_TAG, MPI_COMM_WORLD);
    }
    if (part1 != -1) {
        MPI_Recv(ptr4, 1, blk3d, part1, BOR_TAG, MPI_COMM_WORLD, &stat);
    }

    MPI_Type_free(&blk3d);
    MPI_Type_free(&blk2d);
    MPI_Type_free(&elem);
}
01531
01532
01533 }
01534 }
01535 }
01536 }
01537
01538 #endif