00001 #ifndef Impala_Core_Training_ComputeKernelMatrix_h
00002 #define Impala_Core_Training_ComputeKernelMatrix_h
00003
00004 #ifdef CUDA
00005 #include <cuda.h>
00006 #include <cuda_runtime.h>
00007 #include <cublas.h>
00008 #endif
00009 #include "Link/Cuda/PrecomputeMatrixConfig.h"
00010 #include "Core/Feature/VirtualFeatureTableFactory.h"
00011 #include "Core/Vector/Chi2Distance.h"
00012 #include "Link/Cuda/Chi2Distance.h"
00013 #include "Core/Vector/HistogramIntersection.h"
00014 #include "Link/Cuda/HistogramIntersection.h"
00015 #include "Util/TimeStats.h"
00016
00017 namespace Impala
00018 {
00019 namespace Core
00020 {
00021 namespace Training
00022 {
00023
00024
// Fallback when cutil's CUDA_SAFE_CALL was not defined by an earlier header.
// In CUDA builds the wrapped call must still be EXECUTED (the original
// fallback "{}" silently discarded it, turning every cudaMalloc/cudaMemcpy
// into a no-op and leaving device pointers null). In non-CUDA builds the
// argument must be compiled out entirely, because the cuda* symbols inside
// the wrapped calls do not exist there.
#ifndef CUDA_SAFE_CALL
#ifdef CUDA
#define CUDA_SAFE_CALL(call) { call; }
#else
#define CUDA_SAFE_CALL(call) {}
#endif
#endif
00028
00032 template <class FType, class PrecomputeTaskT>
00033 void
00034 ComputeKernelMatrix(PrecomputeTaskT* pt, int slabWidth, bool GPU)
00035 {
00036 ILOG_VAR(Impala.Core.Training.ComputeKernelMatrix);
00037 ILOG_INFO("Computing matrix (fsize=" << sizeof(FType) << ")...");
00038
00039
00040 typedef Core::Matrix::VirtualMatrix VirtualMatrix;
00041 typedef Core::Feature::VirtualFeatureTable VirtualFeatureTable;
00042 typedef Core::Feature::VirtualFeatureTableFactory VFTFactory;
00043 std::vector<VirtualFeatureTable*> tablesA;
00044 std::vector<VirtualFeatureTable*> tablesB;
00045
00046
00047 Int64 vectorCountA = -1;
00048 Int64 vectorCountB = -1;
00049 int vectorSize = 0;
00050 ILOG_INFO("Opening 2 x " << pt->NrFeatures() << " feature tables");
00051 for (unsigned int feature = 0; feature < pt->NrFeatures(); feature++)
00052 {
00053 VirtualFeatureTable* tableA =
00054 VFTFactory::GetInstance().ConstructIOBufferReader
00055 (pt->GetFeatureLocatorA(feature), true);
00056 tablesA.push_back(tableA);
00057 VirtualFeatureTable* tableB =
00058 VFTFactory::GetInstance().ConstructIOBufferReader
00059 (pt->GetFeatureLocatorB(feature), false);
00060 tablesB.push_back(tableB);
00061 vectorSize = Impala::Max(vectorSize, tableA->GetVectorLength());
00062 if (vectorCountA == -1)
00063 vectorCountA = tableA->Size();
00064 if (vectorCountB == -1)
00065 vectorCountB = tableB->Size();
00066 if (vectorCountA != tableA->Size())
00067 {
00068 ILOG_ERROR("FATAL: Vector count mismatch");
00069 return;
00070 }
00071 if (vectorCountB != tableB->Size())
00072 {
00073 ILOG_ERROR("FATAL: Vector count mismatch");
00074 return;
00075 }
00076 ILOG_INFO("Did 2 x " << feature+1 << " feature(s): " <<
00077 Process::MemoryInfo::GetUsageString());
00078 }
00079 ILOG_INFO("vectorCountA(devel)=" << vectorCountA <<
00080 ", vectorCountB(test)=" << vectorCountB);
00081 VirtualMatrix* matrix = pt->GetWritableMatrix(vectorCountB, vectorCountA);
00082 int alignFactor = (pt->IsChi2()) ? VECTORCHUNK_SIZE : 32;
00083 int VECTOR_SIZE = IntAlignUp(vectorSize, alignFactor);
00084 int vectorChunkCount = VECTOR_SIZE / VECTORCHUNK_SIZE;
00085 ILOG_INFO("VECTOR_SIZE=" << VECTOR_SIZE << "; vectorSize=" << vectorSize <<
00086 "; chunkCount=" << vectorChunkCount <<
00087 "; chunkSize=" << VECTORCHUNK_SIZE);
00088 if (VECTOR_SIZE > 16384)
00089 {
00090 slabWidth = Impala::Min(slabWidth, 512);
00091 }
00092
00093
00094
00095
00096 unsigned int size_A = VECTOR_SIZE * slabWidth;
00097 unsigned int mem_size_A = sizeof(FType) * size_A;
00098 unsigned int size_B = VECTOR_SIZE * slabWidth;
00099 unsigned int mem_size_B = sizeof(FType) * size_B;
00100 FType* h_A = 0;
00101 FType* h_B = 0;
00102 #ifdef CUDA
00103 cudaHostAlloc((void**)&h_A, mem_size_A, cudaHostAllocDefault);
00104 cudaHostAlloc((void**)&h_B, mem_size_B, cudaHostAllocDefault);
00105 #else
00106 h_A = (FType*) malloc(mem_size_A);
00107 h_B = (FType*) malloc(mem_size_B);
00108 #endif
00109
00110 unsigned int size_C = slabWidth * slabWidth;
00111 unsigned int mem_size_C = sizeof(FType) * size_C;
00112
00113
00114 FType* h_C = 0;
00115 if (!GPU)
00116 h_C = (FType*) malloc(mem_size_C);
00117
00118 FType* h_D = (FType*) malloc(mem_size_C);
00119
00120
00121 FType* d_A = 0;
00122 FType* d_B = 0;
00123 FType* d_Bextra = 0;
00124 FType* d_C = 0;
00125 FType* d_D = 0;
00126 if (GPU)
00127 {
00128 CUDA_SAFE_CALL(cudaMalloc((void**) &d_A, mem_size_A));
00129 CUDA_SAFE_CALL(cudaMalloc((void**) &d_B, mem_size_B));
00130 CUDA_SAFE_CALL(cudaMalloc((void**) &d_Bextra, mem_size_B));
00131
00132 CUDA_SAFE_CALL(cudaMalloc((void**) &d_C, mem_size_C));
00133
00134 CUDA_SAFE_CALL(cudaMalloc((void**) &d_D, mem_size_C));
00135 }
00136
00137
00138
00139 Util::TimeStats statsSlab;
00140 statsSlab.AddGroup("read");
00141 statsSlab.AddGroup("process");
00142 Util::TimeStats statsOverall;
00143 statsOverall.AddGroupsFromSub(&statsSlab);
00144 statsOverall.AddGroup("copy");
00145 statsOverall.AddGroup("write");
00146
00147 for (int slabA = 0; slabA < vectorCountA; slabA += slabWidth)
00148 {
00149 int slabB = 0;
00150 if (pt->IsSymmetric())
00151 {
00152 slabB = slabA;
00153 }
00154 for ( ; slabB < vectorCountB; slabB += slabWidth)
00155 {
00156 statsOverall.MeasureFirst();
00157
00158 if (GPU)
00159 {
00160 CUDA_SAFE_CALL(cudaMemset(d_D, 0, mem_size_C));
00161 }
00162 else
00163 {
00164 for (int i = 0; i < size_C; i++)
00165 h_D[i] = 0.0;
00166 }
00167 int slabSizeA = Impala::Min((Int64)slabWidth, vectorCountA - slabA);
00168 int slabSizeB = Impala::Min((Int64)slabWidth, vectorCountB - slabB);
00169
00170 for (unsigned int feature = 0; feature < pt->NrFeatures(); feature++)
00171 {
00172
00173 statsSlab.MeasureFirst();
00174 tablesA[feature]->GetAlignedVectors(slabA, slabWidth, h_A, VECTOR_SIZE);
00175 tablesB[feature]->GetAlignedVectors(slabB, slabWidth, h_B, VECTOR_SIZE);
00176 statsSlab.MeasureNext();
00177
00178 if (GPU)
00179 {
00180 #ifdef CUDA
00181
00182 CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, mem_size_A,
00183 cudaMemcpyHostToDevice) );
00184 CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, mem_size_B,
00185 cudaMemcpyHostToDevice) );
00186
00187
00188 if (sizeof(FType) == 8)
00189 {
00190 ILOG_ERROR("Cannot have doubles in GPU mode!");
00191 }
00192 if (pt->IsChi2())
00193
00194 Link::Cuda::DoSlabChi2Distance(d_C, d_A, d_B, d_Bextra, VECTOR_SIZE, slabSizeA, slabSizeB, slabWidth);
00195 else
00196 Link::Cuda::DoSlabHistogramIntersection(d_C, d_A, d_B, d_Bextra, VECTOR_SIZE, slabSizeA, slabSizeB, slabWidth);
00197
00198
00199
00200 double alpha = pt->GetFeatureWeight(feature);
00201 if (pt->IsChi2())
00202 alpha = -alpha / pt->GetFeatureAverage(feature);
00203 cublasSaxpy(slabWidth * slabWidth, float(alpha), d_C, 1, d_D, 1);
00204
00205
00206 CUT_CHECK_ERROR("Kernel execution failed");
00207 #endif // CUDA
00208 }
00209 else
00210 {
00211 if (pt->IsChi2())
00212 Core::Vector::DoSlabChi2Distance(h_C, h_A, h_B, slabSizeA, slabSizeB, VECTOR_SIZE);
00213 else
00214 Core::Vector::DoSlabHistogramIntersection(h_C, h_A, h_B, slabSizeA, slabSizeB, VECTOR_SIZE);
00215
00216
00217
00218 double alpha = pt->GetFeatureWeight(feature);
00219 if (pt->IsChi2())
00220 alpha = -alpha / pt->GetFeatureAverage(feature);
00221 for (int i = 0; i < slabWidth * slabWidth; i++)
00222 h_D[i] += (FType)(alpha * h_C[i]);
00223 }
00224 statsSlab.MeasureLast();
00225 }
00226 ILOG_INFO("slab(" << slabA << "," << slabB << ") " <<
00227 statsSlab.AsString());
00228 statsOverall.MeasureFromSub(&statsSlab);
00229
00230 if (GPU)
00231 {
00232
00233 CUDA_SAFE_CALL(cudaMemcpy(h_D, d_D, mem_size_C,
00234 cudaMemcpyDeviceToHost) );
00235 }
00236 statsOverall.MeasureNext();
00237
00238 double divisor = pt->GetTotalFeatureWeight();
00239 for (Int64 b=0 ; b<slabSizeB ; b++)
00240 {
00241 FType* row = h_D + b * slabSizeA;
00242 if (pt->IsChi2())
00243 {
00244 for (Int64 a=0 ; a<slabSizeA ; a++)
00245 row[a] = exp(row[a] / divisor);
00246 }
00247 else
00248 {
00249 for (Int64 a=0 ; a<slabSizeA ; a++)
00250 row[a] = row[a] / divisor;
00251 }
00252 matrix->AddRowPart(slabB+b, slabA, row, slabSizeA);
00253 }
00254
00255 if ((slabA != slabB) && pt->IsSymmetric())
00256 {
00257 FType* tmpBuf = new FType[slabSizeB];
00258
00259 for (Int64 a = 0; a < slabSizeA; a++)
00260 {
00261 for (Int64 b = 0; b < slabSizeB; b++)
00262 tmpBuf[b] = h_D[b * slabSizeA + a];
00263 matrix->AddRowPart(slabA+a, slabB, tmpBuf, slabSizeB);
00264 }
00265 delete tmpBuf;
00266 }
00267 statsOverall.MeasureLast();
00268 ILOG_INFO("Overall " << statsOverall.AsString());
00269 }
00270 }
00271 pt->Finalize();
00272
00273
00274
00275 #ifdef CUDA
00276 cudaFreeHost(h_A);
00277 cudaFreeHost(h_B);
00278 #else
00279 free(h_A);
00280 free(h_B);
00281 #endif
00282 if (!GPU)
00283 free(h_C);
00284 free(h_D);
00285 if (GPU)
00286 {
00287 CUDA_SAFE_CALL(cudaFree(d_A));
00288 CUDA_SAFE_CALL(cudaFree(d_B));
00289 CUDA_SAFE_CALL(cudaFree(d_Bextra));
00290 CUDA_SAFE_CALL(cudaFree(d_C));
00291 CUDA_SAFE_CALL(cudaFree(d_D));
00292 }
00293 for (unsigned int feature=0 ; feature<pt->NrFeatures() ; feature++)
00294 {
00295 delete tablesA[feature];
00296 delete tablesB[feature];
00297 }
00298 tablesA.clear();
00299 tablesB.clear();
00300 }
00301
00302 }
00303 }
00304 }
00305
00306 #endif