00001 #ifndef Impala_Core_Training_LogisticRegression_h
00002 #define Impala_Core_Training_LogisticRegression_h
00003
#include "Core/Training/Classifier.h"

#include <cmath>
#include <iostream>
#include <vector>
00005
00006 namespace Impala
00007 {
00008 namespace Core
00009 {
00010 namespace Training
00011 {
00012
00015 class LogisticRegression : public Classifier
00016 {
00017 public:
00018 LogisticRegression()
00019 {
00020 }
00021
00022 virtual void SetTrainSet(const ConceptFeatureTableType* trainset)
00023 {
00024 }
00025
00026 virtual Model* Train(PropertySet* properties)
00027 {
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056 int sampleCount = mTrainSet->Size();
00057 int vectorLength = mTrainset->GetColumn3()->GetVectorLength(0);
00058
00059 std::vector<double> labels = GetDistinct(mTrainSet->GetColumn2());
00060 if(labels.size() != 2)
00061 std::cerr << "[LogisticRegression::Train] multiclass not supported (yet)" << std::endl;
00062 int class1 = CountSelect(mTrainSet->GetColumn2(), labels[0]);
00063
00064 double prior[2] = {(double)class1 / (double)sampleCount, (double)(sampleCount-class1) / (double)sampleCount};
00065
00066
00067 ScaleMeanAndVariance(mTrainSet->GetColumn3());
00068
00069
00070 VectorSet* x = new VectorSet(true, vectorLength+1, sampleCount);
00071 SetVal(x->GetStorage(), x->GetStorage(), 1);
00072 PatSet(x->GetStorage(), mTrainSet->GetColumn3()->GetStorage(), 0, 0, 0, vectorLength, 0, 0);
00073
00074
00075
00076
00077
00078 int i;
00079 for(i=0 ; i<sampleCount ; i++)
00080 if(mTrainSet->Get2(i) == -1)
00081 mTrainSet->Get3(i) = -mTrainSet->Get3(i);
00082
00083
00084 double alf = (double)class1 / (double)(sampleCount-class1);
00085
00086 double weights[vectorLength+1] = {0};
00087
00088
00089 double L = -1e100;
00090 double Lnew = 1e100;
00091
00092
00093 while(abs(Lnew - L) > 0.001)
00094 {
00095
00096
00097
00098
00099 Vector prob = MatMul(x, weights);
00100
00101 double pax[sampleCount];
00102 for(i=0 ; i<sampleCount ; ++i)
00103 pax[i] = 1. / 1.+exp(-weights2[i]);
00104
00105 double pbx[sampleCount];
00106 for(i=0 ; i<sampleCount ; ++i)
00107 pbx[i] = 1. - pax[i];
00108
00109 L = Lnew;
00110 Lnew = 0;
00111 for(i=0 ; i<sampleCount ; ++i)
00112 Lnew += log(pax[i] + 1e-100);
00113
00114 double p2x[sampleCount];
00115 for(i=0 ; i<sampleCount ; ++i)
00116 p2x[i] = sqrt(pax[i]*pbx[i]);
00117
00118 Matrix y = x;
00119 for(i=0 ; i<sampleCount ; ++i)
00120 y.GetColumn(i) *= p2x[i];
00121
00122 Matrtix weightupdate = MatMul(MatTranspose(pbx),MatMul(x,MatPseudoInverse(MatMul(MatTranspose(y), y)));
00123 for(i=0 ; i<vectorLength+1 ; ++i)
00124 weights[i] += weightupdate.Value(0, i);
00125 }
00126
00127
00128
00129
00130 double w0 = weights.Value(vectorLength) + log(alf*prior[0]/prior[1]);
00131
00132
00133
00134 }
00135
00136 virtual void SetTestSet(const ConceptFeatureTableType* testset)
00137 {
00138 }
00139
00140 virtual double Predict(const Impala::Core::Vector::VectorTem<double>* feature, const Model* model)
00141 {
00142 }
00143
00144 virtual RankingTableType* Rank(const Model* model)
00145 {
00146 }
00147 };
00148
00149 }
00150 }
00151 }
00152
00153 #endif