00001 #ifndef Impala_Core_Matrix_KmeansClustering_h
00002 #define Impala_Core_Matrix_KmeansClustering_h
00003
00004 #include "Core/Matrix/MatFunc.h"
00005 #include "Core/Matrix/Mat.h"
00006 #include "Core/Matrix/MatDrawRandomRows.h"
00007 #include "Core/Matrix/MatKeepSpecificRows.h"
00008 #include "Core/Matrix/VectorQuantize.h"
00009
00010 namespace Impala
00011 {
00012 namespace Core
00013 {
00014 namespace Matrix
00015 {
00016
00017
00018 Mat*
00019 KmeansClustering(Mat* clusterInput, int k)
00020 {
00021 ILOG_VAR(Impala.Core.Matrix.KmeansClustering);
00022 Timer totalTimer;
00023
00024
00025 Mat* codebook = MatDrawRandomRows(clusterInput, k);
00026
00027 double threshold = 1e-5;
00028 double diff = threshold + 1.0;
00029 std::vector<Real64> averageDistances;
00030 int n = MatNrRow(clusterInput);
00031 int d = MatNrCol(clusterInput);
00032 int* numberAssigned = new int[k];
00033 while(diff > threshold)
00034 {
00035 Timer iterTimer;
00036
00037 Mat* assignment = VectorQuantize(clusterInput, codebook);
00038
00039 Real64 averageDist = 0.0;
00040 for(int j = 0; j < n; j++)
00041 {
00042 averageDist += *MatE(assignment, j, 1);
00043 }
00044 averageDist = averageDist / MatNrRow(clusterInput);
00045 averageDistances.push_back(averageDist);
00046
00047
00048 SetVal(codebook, 0.0);
00049
00050 for(int i = 0; i < k; i++) numberAssigned[i] = 0;
00051
00052
00053 for(int j = 0; j < n; j++)
00054 {
00055 int cluster = static_cast<int>(*MatE(assignment, j, 0));
00056 numberAssigned[cluster]++;
00057 for(int i = 0; i < d; i++)
00058 {
00059 *MatE(codebook, cluster, i) += *MatE(clusterInput, j, i);
00060 }
00061 }
00062
00063 std::vector<int> usedClusters;
00064 usedClusters.reserve(MatNrRow(codebook));
00065 for(int cluster = 0; cluster < MatNrRow(codebook); cluster++)
00066 {
00067 if(numberAssigned[cluster] > 0)
00068 {
00069 for(int i = 0; i < d; i++)
00070 {
00071 *MatE(codebook, cluster, i) /= numberAssigned[cluster];
00072 }
00073 usedClusters.push_back(cluster);
00074 }
00075 }
00076
00077 if(usedClusters.size() < MatNrRow(codebook))
00078 {
00079
00080 Mat* temp = MatKeepSpecificRows(codebook, usedClusters);
00081 delete codebook;
00082 codebook = temp;
00083 }
00084
00085 if(averageDistances.size() > 1)
00086 {
00087 diff = averageDistances[averageDistances.size()-2] - averageDistances[averageDistances.size()-1];
00088 ILOG_INFO("K-means progress: " << averageDistances[averageDistances.size()-2] << " " << averageDistances[averageDistances.size()-1] << " " << diff << " " << MatNrRow(codebook) << " iterTime=" << iterTimer.SplitTimeStr() << " totalTimer=" << totalTimer.SplitTimeStr());
00089 }
00090 delete assignment;
00091 }
00092 delete numberAssigned;
00093 return codebook;
00094 }
00095
00096
00097 }
00098 }
00099 }
00100
00101 #endif