00001 #ifndef Impala_Core_Training_ComputeKernelMatrix_h
00002 #define Impala_Core_Training_ComputeKernelMatrix_h
00003
00004 #ifdef CUDA
00005 #include <cuda.h>
00006 #include <cuda_runtime.h>
00007 #include <cublas.h>
00008 #endif
00009 #include "Link/Cuda/PrecomputeMatrixConfig.h"
00010 #include "Core/Feature/VirtualFeatureTableFactory.h"
00011 #include "Core/Vector/Chi2Distance.h"
00012 #include "Link/Cuda/Chi2Distance.h"
00013 #include "Core/Vector/HistogramIntersection.h"
00014 #include "Link/Cuda/HistogramIntersection.h"
00015 #include "Util/TimeStats.h"
00016
00017 namespace Impala
00018 {
00019 namespace Core
00020 {
00021 namespace Training
00022 {
00023
00024
// Fallback when cutil's CUDA_SAFE_CALL was not defined by an earlier header.
// In CUDA builds the wrapped call must still be EXECUTED (the original
// fallback "{}" silently discarded it, turning every cudaMalloc/cudaMemcpy
// into a no-op and leaving device pointers null). In non-CUDA builds the
// argument must be compiled out entirely, because the cuda* symbols inside
// the wrapped calls do not exist there.
#ifndef CUDA_SAFE_CALL
#ifdef CUDA
#define CUDA_SAFE_CALL(call) { call; }
#else
#define CUDA_SAFE_CALL(call) {}
#endif
#endif
00028
00032 template <class FType, class PrecomputeTaskT>
00033 void
00034 ComputeKernelMatrix(PrecomputeTaskT* pt, int slabWidth, bool GPU)
00035 {
00036 ILOG_VAR(Impala.Core.Training.ComputeKernelMatrix);
00037 ILOG_INFO("Computing matrix (fsize=" << sizeof(FType) << ")...");
00038
00039
00040 typedef Core::Matrix::VirtualMatrix VirtualMatrix;
00041 typedef Core::Feature::VirtualFeatureTable VirtualFeatureTable;
00042 typedef Core::Feature::VirtualFeatureTableFactory VFTFactory;
00043 std::vector<VirtualFeatureTable*> tablesA;
00044 std::vector<VirtualFeatureTable*> tablesB;
00045
00046
00047 Int64 vectorCountA = -1;
00048 Int64 vectorCountB = -1;
00049 int vectorSize = 0;
00050 ILOG_INFO("Opening 2 x " << pt->NrFeatures() << " feature tables");
00051 for (unsigned int feature = 0; feature < pt->NrFeatures(); feature++)
00052 {
00053 VirtualFeatureTable* tableA =
00054 VFTFactory::GetInstance().ConstructIOBufferReader
00055 (pt->GetFeatureLocatorA(feature), true);
00056 tablesA.push_back(tableA);
00057 VirtualFeatureTable* tableB =
00058 VFTFactory::GetInstance().ConstructIOBufferReader
00059 (pt->GetFeatureLocatorB(feature), false);
00060 tablesB.push_back(tableB);
00061 vectorSize = Impala::Max(vectorSize, tableA->GetVectorLength());
00062 if (vectorCountA == -1)
00063 vectorCountA = tableA->Size();
00064 if (vectorCountB == -1)
00065 vectorCountB = tableB->Size();
00066 if (vectorCountA != tableA->Size())
00067 {
00068 ILOG_ERROR("FATAL: Vector count mismatch");
00069 return;
00070 }
00071 if (vectorCountB != tableB->Size())
00072 {
00073 ILOG_ERROR("FATAL: Vector count mismatch");
00074 return;
00075 }
00076 ILOG_INFO("Did 2 x " << feature+1 << " feature(s): " <<
00077 Process::MemoryInfo::GetUsageString());
00078 }
00079 ILOG_INFO("vectorCountA(devel)=" << vectorCountA <<
00080 ", vectorCountB(test)=" << vectorCountB);
00081 VirtualMatrix* matrix = pt->GetWritableMatrix(vectorCountB, vectorCountA);
00082 int alignFactor = (pt->IsChi2()) ? VECTORCHUNK_SIZE : 32;
00083 int VECTOR_SIZE = IntAlignUp(vectorSize, alignFactor);
00084 int vectorChunkCount = VECTOR_SIZE / VECTORCHUNK_SIZE;
00085 ILOG_INFO("VECTOR_SIZE=" << VECTOR_SIZE << "; vectorSize=" << vectorSize <<
00086 "; chunkCount=" << vectorChunkCount <<
00087 "; chunkSize=" << VECTORCHUNK_SIZE);
00088 if (VECTOR_SIZE > 16384)
00089 {
00090 slabWidth = Impala::Min(slabWidth, 512);
00091 }
00092
00093
00094
00095
00096 unsigned int size_A = VECTOR_SIZE * slabWidth;
00097 unsigned int mem_size_A = sizeof(FType) * size_A;
00098 unsigned int size_B = VECTOR_SIZE * slabWidth;
00099 unsigned int mem_size_B = sizeof(FType) * size_B;
00100 FType* h_A = 0;
00101 FType* h_B = 0;
00102 #ifdef CUDA
00103 cudaHostAlloc((void**)&h_A, mem_size_A, cudaHostAllocDefault);
00104 cudaHostAlloc((void**)&h_B, mem_size_B, cudaHostAllocDefault);
00105 #else
00106 h_A = (FType*) malloc(mem_size_A);
00107 h_B = (FType*) malloc(mem_size_B);
00108 #endif
00109
00110 unsigned int size_C = slabWidth * slabWidth;
00111 unsigned int mem_size_C = sizeof(FType) * size_C;
00112
00113
00114 FType* h_C = 0;
00115 if (!GPU)
00116 h_C = (FType*) malloc(mem_size_C);
00117
00118 FType* h_D = (FType*) malloc(mem_size_C);
00119
00120
00121 FType* d_A = 0;
00122 FType* d_B = 0;
00123 FType* d_Bextra = 0;
00124 FType* d_C = 0;
00125 FType* d_D = 0;
00126 if (GPU)
00127 {
00128 CUDA_SAFE_CALL(cudaMalloc((void**) &d_A, mem_size_A));
00129 CUDA_SAFE_CALL(cudaMalloc((void**) &d_B, mem_size_B));
00130 CUDA_SAFE_CALL(cudaMalloc((void**) &d_Bextra, mem_size_B));
00131
00132 CUDA_SAFE_CALL(cudaMalloc((void**) &d_C, mem_size_C));
00133
00134 CUDA_SAFE_CALL(cudaMalloc((void**) &d_D, mem_size_C));
00135 }
00136
00137
00138
00139 Util::TimeStats statsSlab;
00140 statsSlab.AddGroup("read");
00141 statsSlab.AddGroup("process");
00142 Util::TimeStats statsOverall;
00143 statsOverall.AddGroupsFromSub(&statsSlab);
00144 statsOverall.AddGroup("copy");
00145 statsOverall.AddGroup("write");
00146
00147 for (int slabA = 0; slabA < vectorCountA; slabA += slabWidth)
00148 {
00149 int slabB = 0;
00150 if (pt->IsSymmetric())
00151 {
00152 slabB = slabA;
00153 }
00154 for ( ; slabB < vectorCountB; slabB += slabWidth)
00155 {
00156 statsOverall.MeasureFirst();
00157
00158 if (GPU)
00159 {
00160 CUDA_SAFE_CALL(cudaMemset(d_D, 0, mem_size_C));
00161 }
00162 else
00163 {
00164 for (int i = 0; i < size_C; i++)
00165 h_D[i] = 0.0;
00166 }
00167 int slabSizeA = Impala::Min((Int64)slabWidth, vectorCountA - slabA);
00168 int slabSizeB = Impala::Min((Int64)slabWidth, vectorCountB - slabB);
00169
00170 for (unsigned int feature = 0; feature < pt->NrFeatures(); feature++)
00171 {
00172
00173 statsSlab.MeasureFirst();
00174 tablesA[feature]->GetAlignedVectors(slabA, slabWidth, h_A, VECTOR_SIZE);
00175 tablesB[feature]->GetAlignedVectors(slabB, slabWidth, h_B, VECTOR_SIZE);
00176 statsSlab.MeasureNext();
00177
00178 if (GPU)
00179 {
00180 #ifdef CUDA
00181
00182 CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, mem_size_A,
00183 cudaMemcpyHostToDevice) );
00184 CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, mem_size_B,
00185 cudaMemcpyHostToDevice) );
00186
00187
00188 if (sizeof(FType) == 8)
00189 {
00190 ILOG_ERROR("Cannot have doubles in GPU mode!");
00191 }
00192 if (pt->IsChi2())
00193
00194 Link::Cuda::DoSlabChi2Distance(d_C, d_A, d_B, d_Bextra, VECTOR_SIZE, slabSizeA, slabSizeB, slabWidth);
00195 else
00196 Link::Cuda::DoSlabHistogramIntersection(d_C, d_A, d_B, d_Bextra, VECTOR_SIZE, slabSizeA, slabSizeB, slabWidth);
00197
00198
00199
00200 double alpha = pt->GetFeatureWeight(feature);
00201 if (pt->IsChi2())
00202 alpha = -alpha / pt->GetFeatureAverage(feature);
00203 cublasSaxpy(slabWidth * slabWidth, float(alpha), d_C, 1, d_D, 1);
00204
00205
00206 CUT_CHECK_ERROR("Kernel execution failed");
00207 #endif // CUDA
00208 }
00209 else
00210 {
00211 if (pt->IsChi2())
00212 Core::Vector::DoSlabChi2Distance(h_C, h_A, h_B, slabSizeA, slabSizeB, VECTOR_SIZE);
00213 else
00214 Core::Vector::DoSlabHistogramIntersection(h_C, h_A, h_B, slabSizeA, slabSizeB, VECTOR_SIZE);
00215
00216
00217
00218 double alpha = pt->GetFeatureWeight(feature);
00219 if (pt->IsChi2())
00220 alpha = -alpha / pt->GetFeatureAverage(feature);
00221 for (int i = 0; i < slabWidth * slabWidth; i++)
00222 h_D[i] += (FType)(alpha * h_C[i]);
00223 }
00224 statsSlab.MeasureLast();
00225 }
00226 ILOG_INFO("slab(" << slabA << "," << slabB << ") " <<
00227 statsSlab.AsString());
00228 statsOverall.MeasureFromSub(&statsSlab);
00229
00230 if (GPU)
00231 {
00232
00233 CUDA_SAFE_CALL(cudaMemcpy(h_D, d_D, mem_size_C,
00234 cudaMemcpyDeviceToHost) );
00235 }
00236 statsOverall.MeasureNext();
00237
00238 double divisor = pt->GetTotalFeatureWeight();
00239 for (Int64 b=0 ; b<slabSizeB ; b++)
00240 {
00241 FType* row = h_D + b * slabSizeA;
00242 if (pt->IsChi2())
00243 {
00244 for (Int64 a=0 ; a<slabSizeA ; a++)
00245 row[a] = exp(row[a] / divisor);
00246 }
00247 else
00248 {
00249 for (Int64 a=0 ; a<slabSizeA ; a++)
00250 row[a] = row[a] / divisor;
00251 }
00252 matrix->AddRowPart(slabB+b, slabA, row, slabSizeA);
00253 }
00254
00255 if ((slabA != slabB) && pt->IsSymmetric())
00256 {
00257 FType* tmpBuf = new FType[slabSizeB];
00258
00259 for (Int64 a = 0; a < slabSizeA; a++)
00260 {
00261 for (Int64 b = 0; b < slabSizeB; b++)
00262 tmpBuf[b] = h_D[b * slabSizeA + a];
00263 matrix->AddRowPart(slabA+a, slabB, tmpBuf, slabSizeB);
00264 }
00265 delete tmpBuf;
00266 }
00267 statsOverall.MeasureLast();
00268 ILOG_INFO("Overall " << statsOverall.AsString());
00269 }
00270 }
00271 pt->Finalize();
00272
00273
00274
00275 #ifdef CUDA
00276 cudaFreeHost(h_A);
00277 cudaFreeHost(h_B);
00278 #else
00279 free(h_A);
00280 free(h_B);
00281 #endif
00282 if (!GPU)
00283 free(h_C);
00284 free(h_D);
00285 if (GPU)
00286 {
00287 CUDA_SAFE_CALL(cudaFree(d_A));
00288 CUDA_SAFE_CALL(cudaFree(d_B));
00289 CUDA_SAFE_CALL(cudaFree(d_Bextra));
00290 CUDA_SAFE_CALL(cudaFree(d_C));
00291 CUDA_SAFE_CALL(cudaFree(d_D));
00292 }
00293 for (unsigned int feature=0 ; feature<pt->NrFeatures() ; feature++)
00294 {
00295 delete tablesA[feature];
00296 delete tablesB[feature];
00297 }
00298 tablesA.clear();
00299 tablesB.clear();
00300 }
00301
00302 }
00303 }
00304 }
00305
00306 #endif