Definition at line 19 of file KmeansClustering.h.
References ILOG_INFO, ILOG_VAR, MatDrawRandomRows(), MatE(), MatKeepSpecificRows(), MatNrCol(), MatNrRow(), Impala::Core::Array::SetVal(), Impala::Timer::SplitTimeStr(), and VectorQuantize().
Referenced by Impala::Application::mainConstructCodebook().

{
    // K-means clustering of the rows of clusterInput.
    //
    // The codebook starts as k randomly drawn rows of clusterInput. Each
    // iteration quantizes every input row against the current codebook,
    // records the average distance, and replaces every codebook row by the
    // mean of the input rows assigned to it. Codebook rows that received no
    // input rows are removed, so the returned codebook may have fewer than
    // k rows. Iteration stops as soon as the decrease in average distance
    // between two consecutive iterations is no longer above the threshold
    // (this includes the case where the average distance went up).
    //
    // Returns the final codebook as a raw pointer.
    // NOTE(review): the caller presumably takes ownership of the returned
    // matrix -- confirm against mainConstructCodebook().
    ILOG_VAR(Impala.Core.Matrix.KmeansClustering);
    Timer totalTimer;

    // initialize codebook by random drawing
    Mat* codebook = MatDrawRandomRows(clusterInput, k);

    double threshold = 1e-5;
    // start above the threshold so that the loop body runs at least once
    double diff = threshold + 1.0;
    std::vector<Real64> averageDistances;
    int n = MatNrRow(clusterInput);
    int d = MatNrCol(clusterInput);

    // Per-cluster counters. Held in a std::vector rather than a new[]'d
    // array: the original released the array with scalar `delete`, which is
    // undefined behaviour for memory obtained from new[].
    std::vector<int> numberAssigned(k, 0);

    while(diff > threshold)
    {
        Timer iterTimer;
        // assignment will contain indices (column 0) and
        // distortion/distance (column 1), one row per input row
        Mat* assignment = VectorQuantize(clusterInput, codebook);

        Real64 averageDist = 0.0;
        for(int j = 0; j < n; j++)
        {
            averageDist += *MatE(assignment, j, 1);
        }
        averageDist = averageDist / n;
        averageDistances.push_back(averageDist);

        // reset the codebook to all zeros, to prepare for update
        SetVal(codebook, 0.0);

        // Always k counters, even after the codebook has shrunk: the
        // assigned indices are then simply confined to the leading entries.
        numberAssigned.assign(k, 0);

        // sum the observations of the different clusters
        for(int j = 0; j < n; j++)
        {
            int cluster = static_cast<int>(*MatE(assignment, j, 0));
            numberAssigned[cluster]++;
            for(int i = 0; i < d; i++)
            {
                *MatE(codebook, cluster, i) += *MatE(clusterInput, j, i);
            }
        }

        // turn the sums into means and remember which clusters are in use
        const int nrClusters = MatNrRow(codebook);
        std::vector<int> usedClusters;
        usedClusters.reserve(nrClusters);
        for(int cluster = 0; cluster < nrClusters; cluster++)
        {
            if(numberAssigned[cluster] > 0)
            {
                for(int i = 0; i < d; i++)
                {
                    *MatE(codebook, cluster, i) /= numberAssigned[cluster];
                }
                usedClusters.push_back(cluster);
            }
        }

        if(usedClusters.size() < static_cast<std::vector<int>::size_type>(nrClusters))
        {
            // at least one of the clusters has no elements: remove it
            Mat* temp = MatKeepSpecificRows(codebook, usedClusters);
            delete codebook;
            codebook = temp;
        }

        // diff can only be computed from the second iteration onwards;
        // until then it keeps its initial above-threshold value
        if(averageDistances.size() > 1)
        {
            diff = averageDistances[averageDistances.size()-2] - averageDistances[averageDistances.size()-1];
            ILOG_INFO("K-means progress: " << averageDistances[averageDistances.size()-2] << " " << averageDistances[averageDistances.size()-1] << " " << diff << " " << MatNrRow(codebook) << " iterTime=" << iterTimer.SplitTimeStr() << " totalTimer=" << totalTimer.SplitTimeStr());
        }
        delete assignment;
    }
    return codebook;
}
Here is the call graph for this function:
|