Home || Architecture || Video Search || Visual Search || Scripts || Applications || Important Messages || OGL || Src

TableDataSourceCSV.h

Go to the documentation of this file.
00001 #ifndef MediaTable_TableDataSourceCSV_h
00002 #define MediaTable_TableDataSourceCSV_h
00003 
00004 #include "WritableTableDataSource.h"
00005 
00006 #ifdef MEDIATABLE_BASIC
00007 #include "RemoteRetriever.h"
00008 #endif
00009 
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
00013 
00014 namespace Impala {
00015 namespace Application {
00016 namespace MediaTable {
00017 
00018 class TableDataSourceCSV : public WritableTableDataSource
00019 {
00020 public:
00021     TableDataSourceCSV(std::string filename):
00022         WritableTableDataSource()
00023     {
00024         Init();
00025         OpenCSV(filename);
00026     }
00027 
00028     ~TableDataSourceCSV()
00029     {
00030     }
00031 
00032     void OpenCSV(std::string filename) {
00033         std::ifstream is(filename.c_str());
00034 
00035                 /* From: http://stackoverflow.com/questions/1120140/csv-parser-in-c
00036                  * Can also use boost:
00037                 std::vector<std::string> vec;
00038                 using namespace boost;
00039                 tokenizer<escaped_list_separator<char> > tk(
00040                    line, escaped_list_separator<char>('\\', ',', '\"'));
00041                 for (tokenizer<escaped_list_separator<char> >::iterator i(tk.begin());
00042                    i!=tk.end();++i)
00043                 {
00044                    vec.push_back(*i);
00045                 }*/
00046 
00047         std::vector<int> columnType;
00048         std::vector<int> newColumnType;
00049         std::vector<std::vector<std::string> > data;
00050         if(!is.good()) return;
00051 
00052                 std::string line, cell;
00053         while(is.good()) {
00054                         //ILOG_DEBUG("is good");
00055                 std::vector<std::string> row;
00056                 std::getline(is, line);
00057                 if(line.length() == 0) continue;
00058                 std::stringstream lineStream(line);
00059                         //ILOG_INFO("LINE: " << line);
00060                         newColumnType = columnType;
00061                 while(std::getline(lineStream,cell,',')) {
00062                         if(cell.length() > 1 &&
00063                                         (cell.at(0) == 0x27 || cell.at(0) == 0x22))
00064                         {
00065                                         // Fix CR/LF issue on Unixes
00066                                         if(cell.at(cell.length()-1) == '\r')
00067                                                 cell = cell.substr(0, cell.length()-1);
00068                                 while(cell.at(cell.length()-1) != 0x27 &&
00069                                           cell.at(cell.length()-1) != 0x22)
00070                                 {
00071                                         std::string ncell;
00072                                         if(!std::getline(lineStream,ncell,','))
00073                                         {
00074                                                 // Error!
00075                                                 // CSV spec states that a new line is also allowed in a field.
00076                                         ILOG_DEBUG("Line " << data.size() + 1 << " has bad quoting.");
00077                                         row.clear();
00078                                                 break;
00079                                         } else
00080                                                 cell += ncell;
00081                                 }
00082                                 cell = cell.substr(1, cell.length()-2);
00083                         }
00084                         //printf(cell.c_str());
00085                         //printf(",");
00086                         row.push_back(cell);
00087                         if(data.size() == 0) {
00088                                 newColumnType.push_back(TYPE_INT);
00089                         } else {
00090                                 char result[100];
00091                                         if(row.size() > newColumnType.size())
00092                                         {
00093                                                 // skipping overflow row
00094 //                                              ILOG_DEBUG("Skipping overflow row");
00095                                                 continue;
00096                                         }
00097                                 if(newColumnType[row.size()-1] == TYPE_INT) {
00098                                         sprintf(result, "%d", atoi(cell.c_str()));
00099                                         if(std::string(result) != cell)
00100                                                 newColumnType[row.size()-1] = TYPE_DOUBLE;
00101                                 }
00102                                 if(newColumnType[row.size()-1] == TYPE_DOUBLE) {
00103                                         sprintf(result, "%f", atof(cell.c_str()));
00104                                         if(strncmp(cell.c_str(), result, cell.length()))
00105                                                 {
00106                                                         /*ILOG_DEBUG("On row " << data.size()+1 << " and column " << 
00107                                                                            data[0][row.size()-1] << ": " << 
00108                                                                            result << " != " << cell);*/
00109                                                 newColumnType[row.size()-1] = TYPE_IMAGE;
00110                                                 }
00111                                 }
00112                                 if(newColumnType[row.size()-1] == TYPE_IMAGE) {
00113                                                 ILOG_DEBUG(cell.substr(0, 7));
00114                                                 ILOG_DEBUG(cell.substr(cell.length()-4));
00115                                         if ((cell.length() < 13) || 
00116                                                     (cell.substr(0, 7) != "http://") ||
00117                                                    ((cell.substr(cell.length()-4) != ".jpg") && 
00118                                                         (cell.substr(cell.length()-5) != ".jpeg")))
00119                                                 {
00120                                                         /*ILOG_DEBUG("On row " << data.size()+1 << " and column " << 
00121                                                          data[0][row.size()-1] << ": " << 
00122                                                          result << " != " << cell);*/
00123                                                 newColumnType[row.size()-1] = TYPE_TEXT;
00124                                                 }
00125                                 }
00126                                 }
00127                 }
00128                         if(!row.size()) continue;
00129                 if(data.size() && (row.size() != columnType.size())) {
00130                         ILOG_DEBUG("Line " << data.size() + 1 << " has wrong number of cells (" <<
00131                                                    row.size() << " should be " << columnType.size() << ")");
00132                                 
00133                 } else {
00134                         //printf("\n");
00135                         data.push_back(row);
00136                                 // Only store columnType for valid rows
00137                                 columnType = newColumnType;
00138                 }
00139         }
00140         is.close();
00141 
00142                 ILOG_DEBUG("phase 2");
00143 
00144 #ifdef MEDIATABLE_BASIC
00145         int gotcha = 0;
00146         bool isFlickr = false;
00147         for(int col=0; col < columnType.size(); col++) {
00148                 if(data[0][col] == "id") gotcha++;
00149                 if(data[0][col] == "farm") gotcha++;
00150                 if(data[0][col] == "server") gotcha++;
00151                 if(data[0][col] == "secret") gotcha++;
00152         }
00153         if(gotcha == 4) {
00154                 isFlickr = true;
00155         }
00156                 mRemoteRetriever = RemoteRetriever::GetInstance();
00157         std::string id, farm, server, secret;
00158         std::string sizeSuffix = "t";
00159 #endif
00160                 ILOG_DEBUG("phase 3");
00161 
00162         for(int col=0; col < columnType.size(); col++) {
00163 #ifdef MEDIATABLE_BASIC
00164                 if(isFlickr)
00165                 {
00166                         if(data[0][col] == "id")
00167                                         columnType[col] = TYPE_TEXT;
00168                         if(data[0][col] == "farm" | data[0][col] == "server")
00169                                 continue;
00170                         if(data[0][col] == "secret")
00171                         {
00172                         AddStaticColumn("thumb", TYPE_IMAGE);
00173                         //AddStaticColumn("photo", TYPE_IMAGE);
00174                         continue;
00175                         }
00176                 }
00177 #endif
00178                 AddStaticColumn(data[0][col], columnType[col]);
00179                 ILOG_DEBUG("Added column " << data[0][col] << " of type " << columnType[col]);
00180         }
00181 
00182         for(int row=1; row < data.size(); row++) {
00183                 for(int col=0; col < columnType.size(); col++) {
00184 #ifdef MEDIATABLE_BASIC
00185                                 if(isFlickr)
00186                                 {
00187                                         if(data[0][col] == "id")
00188                                                 id = data[row][col];
00189                                         if(data[0][col] == "farm")
00190                                         {
00191                                                 farm = data[row][col];
00192                                                 continue;
00193                                         }
00194                                         if(data[0][col] == "server")
00195                                         {
00196                                                 server = data[row][col];
00197                                                 continue;
00198                                         }
00199                                         if(data[0][col] == "secret")
00200                                         {
00201                                                 secret = data[row][col];
00202                                         std::string imageUrl = "http://farm";
00203                                         imageUrl += farm + ".static.flickr.com/" + server + "/";
00204                                         imageUrl += id + "_" + secret;
00205                                                 //AddTextData("photo", imageUrl + ".jpg");
00206                                         if(sizeSuffix.length() > 0) imageUrl += "_" + sizeSuffix;
00207                                                 AddTextData("thumb", imageUrl + ".jpg");
00208                                                 continue;
00209                                         }
00210                                 }
00211 #endif
00212                         if(columnType[col] == TYPE_INT)         AddIntData(data[0][col], atoi(data[row][col].c_str()));
00213                         if(columnType[col] == TYPE_DOUBLE)      AddDoubleData(data[0][col], atof(data[row][col].c_str()));
00214                         if(columnType[col] == TYPE_IMAGE)       AddTextData(data[0][col], data[row][col]);
00215                         if(columnType[col] == TYPE_TEXT)        AddTextData(data[0][col], data[row][col]);
00216                 }
00217         }
00218     }
00219 
00220 #ifdef MEDIATABLE_BASIC
00221     Array2dVec3UInt8*
00222     GetImageDataByID(String column, int row)
00223     {
00224         std::string imageUrl = GetTextDataByID(column, row);
00225         Array2dVec3UInt8* ar = 0;
00226         ar = mRemoteRetriever->RetrieveImageData(imageUrl);
00227         return ar;
00228     }
00229 
00230     RemoteRetriever* mRemoteRetriever;
00231 
00232 #endif
00233 
00234 private:
00235 
00236     void Init()
00237     {
00238     }
00239 
00240     ILOG_VAR_DEC;
00241 };
00242 
00243 ILOG_VAR_INIT(TableDataSourceCSV, Application.MediaTable);
00244 
00245 } // namespace MediaTable
00246 } // namespace Application
00247 } // namespace Impala
00248 
00249 #endif // TableDataSourceCSV_h

Generated on Fri Mar 19 09:30:34 2010 for ImpalaSrc by  doxygen 1.5.1