template<class FType, class PrecomputeTaskT>
Kernel matrix computation: builds the precomputed kernel matrix between a development feature set (A) and a test feature set (B), slab by slab, on either the CPU or a CUDA GPU.
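Read off the listing below (this is a reading of the code, not a documented formula): for a development vector x_i and a test vector x_j, with per-feature weights w_f, per-feature distance averages \bar d_f, and total weight W = \sum_f w_f, the chi-square path effectively computes

\[
K(x_i, x_j) = \exp\!\left(-\frac{1}{W} \sum_f \frac{w_f}{\bar d_f}\, \chi^2_f(x_i, x_j)\right),
\qquad W = \sum_f w_f,
\]

while the histogram-intersection path skips the exponential and returns the weighted average \( \frac{1}{W}\sum_f w_f\, \mathrm{HI}_f(x_i, x_j) \).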
Definition at line 34 of file ComputeKernelMatrix.h.

References Impala::Util::TimeStats::AddGroup(), Impala::Util::TimeStats::AddGroupsFromSub(), Impala::Util::TimeStats::AsString(), CUDA_SAFE_CALL, Impala::Core::Vector::DoSlabChi2Distance(), Impala::Core::Vector::DoSlabHistogramIntersection(), Impala::Process::MemoryInfo::GetUsageString(), ILOG_ERROR, ILOG_INFO, ILOG_VAR, Impala::IntAlignUp(), Impala::Max(), Impala::Util::TimeStats::MeasureFirst(), Impala::Util::TimeStats::MeasureFromSub(), Impala::Util::TimeStats::MeasureLast(), Impala::Util::TimeStats::MeasureNext(), and Impala::Min().

Referenced by Impala::Core::Training::PrecomputeTask::Execute().

{
    ILOG_VAR(Impala.Core.Training.ComputeKernelMatrix);
    ILOG_INFO("Computing matrix (fsize=" << sizeof(FType) << ")...");

    // ** compute matrix
    typedef Core::Matrix::VirtualMatrix VirtualMatrix;
    typedef Core::Feature::VirtualFeatureTable VirtualFeatureTable;
    typedef Core::Feature::VirtualFeatureTableFactory VFTFactory;
    std::vector<VirtualFeatureTable*> tablesA;
    std::vector<VirtualFeatureTable*> tablesB;

    // check input sizes
    Int64 vectorCountA = -1;
    Int64 vectorCountB = -1;
    int vectorSize = 0;
    ILOG_INFO("Opening 2 x " << pt->NrFeatures() << " feature tables");
    for (unsigned int feature = 0; feature < pt->NrFeatures(); feature++)
    {
        VirtualFeatureTable* tableA =
            VFTFactory::GetInstance().ConstructIOBufferReader
                (pt->GetFeatureLocatorA(feature), true);
        tablesA.push_back(tableA);
        VirtualFeatureTable* tableB =
            VFTFactory::GetInstance().ConstructIOBufferReader
                (pt->GetFeatureLocatorB(feature), false);
        tablesB.push_back(tableB);
        vectorSize = Impala::Max(vectorSize, tableA->GetVectorLength());
        if (vectorCountA == -1)
            vectorCountA = tableA->Size();
        if (vectorCountB == -1)
            vectorCountB = tableB->Size();
        if (vectorCountA != tableA->Size())
        {
            ILOG_ERROR("FATAL: Vector count mismatch");
            return;
        }
        if (vectorCountB != tableB->Size())
        {
            ILOG_ERROR("FATAL: Vector count mismatch");
            return;
        }
        ILOG_INFO("Did 2 x " << feature+1 << " feature(s): " <<
                  Process::MemoryInfo::GetUsageString());
    }
    ILOG_INFO("vectorCountA(devel)=" << vectorCountA <<
              ", vectorCountB(test)=" << vectorCountB);
    VirtualMatrix* matrix = pt->GetWritableMatrix(vectorCountB, vectorCountA);
    int alignFactor = (pt->IsChi2()) ? VECTORCHUNK_SIZE : 32;
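    // Pad the vector length: the chi-square CUDA kernel consumes vectors in
    // chunks of VECTORCHUNK_SIZE elements, so chi-square runs align to a
    // whole chunk; the other paths only need 32-element alignment.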
    int VECTOR_SIZE = IntAlignUp(vectorSize, alignFactor);
    int vectorChunkCount = VECTOR_SIZE / VECTORCHUNK_SIZE;
    ILOG_INFO("VECTOR_SIZE=" << VECTOR_SIZE << "; vectorSize=" << vectorSize <<
              "; chunkCount=" << vectorChunkCount <<
              "; chunkSize=" << VECTORCHUNK_SIZE);
    if (VECTOR_SIZE > 16384)
    {
        slabWidth = Impala::Min(slabWidth, 512);
    }

    // ** allocs start

    // allocate host memory for matrices A and B
    unsigned int size_A = VECTOR_SIZE * slabWidth;
    unsigned int mem_size_A = sizeof(FType) * size_A;
    unsigned int size_B = VECTOR_SIZE * slabWidth;
    unsigned int mem_size_B = sizeof(FType) * size_B;
    FType* h_A = 0; //(FType*) malloc(mem_size_A);
    FType* h_B = 0; //(FType*) malloc(mem_size_B);
#ifdef CUDA
    cudaHostAlloc((void**)&h_A, mem_size_A, cudaHostAllocDefault);
    cudaHostAlloc((void**)&h_B, mem_size_B, cudaHostAllocDefault);
#else
    h_A = (FType*) malloc(mem_size_A);
    h_B = (FType*) malloc(mem_size_B);
#endif

    unsigned int size_C = slabWidth * slabWidth;
    unsigned int mem_size_C = sizeof(FType) * size_C;

    // allocate host memory for the intermediate result
    FType* h_C = 0;
    if (!GPU)
        h_C = (FType*) malloc(mem_size_C);
    // allocate host memory for the final result
    FType* h_D = (FType*) malloc(mem_size_C);

    // allocate device memory
    FType* d_A = 0;
    FType* d_B = 0;
    FType* d_Bextra = 0;
    FType* d_C = 0;
    FType* d_D = 0;
    if (GPU)
    {
        CUDA_SAFE_CALL(cudaMalloc((void**) &d_A, mem_size_A));
        CUDA_SAFE_CALL(cudaMalloc((void**) &d_B, mem_size_B));
        CUDA_SAFE_CALL(cudaMalloc((void**) &d_Bextra, mem_size_B));
        // allocate device memory for intermediate result
        CUDA_SAFE_CALL(cudaMalloc((void**) &d_C, mem_size_C));
        // allocate device memory for final result
        CUDA_SAFE_CALL(cudaMalloc((void**) &d_D, mem_size_C));
    }

    // ** allocs end

    Util::TimeStats statsSlab;
    statsSlab.AddGroup("read");
    statsSlab.AddGroup("process");
    Util::TimeStats statsOverall;
    statsOverall.AddGroupsFromSub(&statsSlab);
    statsOverall.AddGroup("copy");
    statsOverall.AddGroup("write");

    for (int slabA = 0; slabA < vectorCountA; slabA += slabWidth)
    {
        int slabB = 0;
        if (pt->IsSymmetric())
        {
            slabB = slabA;
        }
        for ( ; slabB < vectorCountB; slabB += slabWidth)
        {
            statsOverall.MeasureFirst();
            // initialize final result for slab to zero
            if (GPU)
            {
                CUDA_SAFE_CALL(cudaMemset(d_D, 0, mem_size_C));
            }
            else
            {
                for (int i = 0; i < size_C; i++)
                    h_D[i] = 0.0;
            }
            int slabSizeA = Impala::Min((Int64)slabWidth, vectorCountA - slabA);
            int slabSizeB = Impala::Min((Int64)slabWidth, vectorCountB - slabB);

            for (unsigned int feature = 0; feature < pt->NrFeatures(); feature++)
            {
                // read data
                statsSlab.MeasureFirst();
                tablesA[feature]->GetAlignedVectors(slabA, slabWidth, h_A, VECTOR_SIZE);
                tablesB[feature]->GetAlignedVectors(slabB, slabWidth, h_B, VECTOR_SIZE);
                statsSlab.MeasureNext();

                if (GPU)
                {
#ifdef CUDA
                    // copy host memory to device
                    CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, mem_size_A,
                                              cudaMemcpyHostToDevice));
                    CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, mem_size_B,
                                              cudaMemcpyHostToDevice));
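                    // h_A/h_B were allocated with cudaHostAlloc, so these
                    // host-to-device copies read from page-locked (pinned)
                    // memory and can be DMA-transferred directly.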
                    // execute the kernel
                    if (sizeof(FType) == 8)
                    {
                        ILOG_ERROR("Cannot have doubles in GPU mode!");
                    }
                    if (pt->IsChi2())
                        //DoSlabChi2DistanceSlow(VECTORCHUNK_SIZE, slabSizeA, slabSizeB, d_C, d_A, d_B, slabSizeA, vectorChunkCount);
                        Link::Cuda::DoSlabChi2Distance(d_C, d_A, d_B, d_Bextra, VECTOR_SIZE, slabSizeA, slabSizeB, slabWidth);
                    else
                        Link::Cuda::DoSlabHistogramIntersection(d_C, d_A, d_B, d_Bextra, VECTOR_SIZE, slabSizeA, slabSizeB, slabWidth);

                    // task: D = (-weight/avg) * C + D
                    // Y = alpha * X + Y
                    double alpha = pt->GetFeatureWeight(feature);
                    if (pt->IsChi2())
                        alpha = -alpha / pt->GetFeatureAverage(feature);
                    cublasSaxpy(slabWidth * slabWidth, float(alpha), d_C, 1, d_D, 1);

                    // check if kernel execution generated an error
                    CUT_CHECK_ERROR("Kernel execution failed");
#endif // CUDA
                }
                else
                {
                    if (pt->IsChi2())
                        Core::Vector::DoSlabChi2Distance(h_C, h_A, h_B, slabSizeA, slabSizeB, VECTOR_SIZE);
                    else
                        Core::Vector::DoSlabHistogramIntersection(h_C, h_A, h_B, slabSizeA, slabSizeB, VECTOR_SIZE);

                    // task: D = (-weight/avg) * C + D
                    // Y = alpha * X + Y
                    double alpha = pt->GetFeatureWeight(feature);
                    if (pt->IsChi2())
                        alpha = -alpha / pt->GetFeatureAverage(feature);
                    for (int i = 0; i < slabWidth * slabWidth; i++)
                        h_D[i] += (FType)(alpha * h_C[i]);
                }
                statsSlab.MeasureLast();
            }
            ILOG_INFO("slab(" << slabA << "," << slabB << ") " <<
                      statsSlab.AsString());
            statsOverall.MeasureFromSub(&statsSlab);

            if (GPU)
            {
                // copy result from device to host
                CUDA_SAFE_CALL(cudaMemcpy(h_D, d_D, mem_size_C,
                                          cudaMemcpyDeviceToHost));
            }
            statsOverall.MeasureNext();

            double divisor = pt->GetTotalFeatureWeight();
            for (Int64 b = 0; b < slabSizeB; b++)
            {
                FType* row = h_D + b * slabSizeA;
                if (pt->IsChi2())
                {
                    for (Int64 a = 0; a < slabSizeA; a++)
                        row[a] = exp(row[a] / divisor);
                }
                else
                {
                    for (Int64 a = 0; a < slabSizeA; a++)
                        row[a] = row[a] / divisor;
                }
                matrix->AddRowPart(slabB+b, slabA, row, slabSizeA);
            }

            if ((slabA != slabB) && pt->IsSymmetric())
            {
                FType* tmpBuf = new FType[slabSizeB];
                // do the symmetric write
                for (Int64 a = 0; a < slabSizeA; a++)
                {
                    for (Int64 b = 0; b < slabSizeB; b++)
                        tmpBuf[b] = h_D[b * slabSizeA + a];
                    matrix->AddRowPart(slabA+a, slabB, tmpBuf, slabSizeB);
                }
                delete [] tmpBuf;
            }
            statsOverall.MeasureLast();
            ILOG_INFO("Overall " << statsOverall.AsString());
        }
    }
    pt->Finalize();
    // ** compute matrix ends

    // clean up memory
#ifdef CUDA
    cudaFreeHost(h_A);
    cudaFreeHost(h_B);
#else
    free(h_A);
    free(h_B);
#endif
    if (!GPU)
        free(h_C);
    free(h_D);
    if (GPU)
    {
        CUDA_SAFE_CALL(cudaFree(d_A));
        CUDA_SAFE_CALL(cudaFree(d_B));
        CUDA_SAFE_CALL(cudaFree(d_Bextra));
        CUDA_SAFE_CALL(cudaFree(d_C));
        CUDA_SAFE_CALL(cudaFree(d_D));
    }
    for (unsigned int feature = 0; feature < pt->NrFeatures(); feature++)
    {
        delete tablesA[feature];
        delete tablesB[feature];
    }
    tablesA.clear();
    tablesB.clear();
}
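The per-feature update in both branches is a plain axpy step (Y = alpha * X + Y): cublasSaxpy on the GPU, an explicit loop on the CPU. A minimal self-contained sketch of the CPU variant; the helper name AccumulateSlab is hypothetical, not part of Impala:

    #include <cstddef>

    // D = alpha * C + D over one slabWidth x slabWidth block;
    // alpha is -weight/average for chi-square kernels, weight otherwise.
    template<class FType>
    void AccumulateSlab(FType* d, const FType* c, double alpha, std::size_t n)
    {
        for (std::size_t i = 0; i < n; i++)
            d[i] += static_cast<FType>(alpha * c[i]);
    }

After all features are accumulated, each row of D is divided by the total feature weight (and exponentiated in the chi-square case) before being written to the output matrix.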