00001
00002 #include <vector>
00003 #include <string>
00004 #include <fstream>
00005 #include <iostream>
00006
00007
00008 extern "C" {
00009 # include "svm_common.h"
00010 # include "plugin.h"
00011 }
00012 #include <limits.h>
00013
00014 #define fixed_point_shift USHRT_MAX
00015 #define fixed_point_type unsigned short
00016
00017 using namespace std;
00018
00019 void read_input_parameters(int, char **, char *, char *, char *,
00020 long *, long *, KERNEL_PARM *,
00021 long *, long *, long *);
00022
00023 void wait_any_key();
00024 void print_help();
00025
00026 void setValue(void * matrix, long i, long j, float value, long no_cols, long fixed_point)
00027 {
00028 if(!no_cols)
00029 if(fixed_point)
00030 ((fixed_point_type*)matrix)[j+i*(i+1)/2] = (fixed_point_type)(value * fixed_point_shift);
00031 else
00032 ((float*)matrix)[j+i*(i+1)/2] = value;
00033 else
00034 if(fixed_point)
00035 ((fixed_point_type*)matrix)[j+i*no_cols] = (fixed_point_type)(value * fixed_point_shift);
00036 else
00037 ((float*)matrix)[j+i*no_cols] = value;
00038 }
00039
00040 float getValue(void * matrix, long i, long j, long no_cols, long fixed_point)
00041 {
00042 if(!no_cols)
00043 if(fixed_point)
00044 return ((float*)matrix)[j+i*(i+1)/2]/fixed_point_shift;
00045 else
00046 return ((float*)matrix)[j+i*(i+1)/2];
00047 else
00048 if(fixed_point)
00049 return ((float*)matrix)[j+i*no_cols]/fixed_point_shift;
00050 else
00051 return ((float*)matrix)[j+i*no_cols];
00052 }
00053
00054 void printValue(FILE *fp,
00055 void *value,
00056 long offset,
00057 long fixed_point,
00058 long binary)
00059 {
00060 if(binary)
00061 if(fixed_point)
00062 fwrite(((fixed_point_type*)value)+offset, sizeof(fixed_point_type), 1, fp);
00063 else
00064 fwrite(((float*)value)+offset, sizeof(float), 1, fp);
00065 else
00066 if(fixed_point)
00067 fprintf(fp, "%hd ", *(((fixed_point_type*)value)+offset));
00068 else
00069 fprintf(fp, "%f ", *((float*)value+offset));
00070 }
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179 void free_documents(DOC ** docs, double * target, long totdoc)
00180 {
00181 for(long i=0;i<totdoc;i++)
00182 free_example(docs[i],1);
00183 free(docs);
00184 free(target);
00185 }
00186
00187 long getNoRows(const vector<string>& docfile)
00188 {
00189 long norows = 0;
00190 ifstream in;
00191 string buf;
00192
00193 for(uint i = 0; i < docfile.size(); i++){
00194 in.open(docfile[i].c_str());
00195 while(getline(in,buf))
00196 norows++;
00197 in.close();
00198 }
00199 return norows;
00200 }
00201
00202 float * symmetric_matrix(void * matrix, DOC ** docs, long totdoc, long offset,
00203 KERNEL_PARM * kernel_parm_p, long normalize, long fixed_point)
00204 {
00205 float * diag = NULL;
00206
00207 if(normalize){
00208 diag = (float*)malloc(sizeof(float)*totdoc);
00209 for(long i = 0; i < totdoc; i++)
00210 diag[i]=kernel(kernel_parm_p, docs[i], docs[i]);
00211 for(long i = 0; i < totdoc; i++)
00212 for(long j = 0; j < i; j++){
00213 float val = kernel(kernel_parm_p, docs[i], docs[j]);
00214 float den = sqrt(diag[i]*diag[j]);
00215 if(den != 0)
00216 val /= den;
00217 setValue(matrix,i+offset,j+offset, val, 0, fixed_point);
00218 }
00219 for(long i = 0; i < totdoc; i++)
00220 setValue(matrix,i+offset,i+offset,1,0,fixed_point);
00221 }
00222 else
00223 for(long i = 0; i < totdoc; i++)
00224 for(long j = 0; j <= i; j++)
00225 setValue(matrix,i+offset,j+offset, kernel(kernel_parm_p, docs[i], docs[j]), 0, fixed_point);
00226
00227 return diag;
00228 }
00229
00230 void symmetric_matrix(void * matrix, float * diag_c, DOC ** docs_r, long totdoc_r, long offset_r,
00231 DOC ** docs_c, long totdoc_c, long offset_c,
00232 KERNEL_PARM * kernel_parm_p, long normalize, long fixed_point)
00233 {
00234 if(normalize){
00235 float * diag_r = (float*)malloc(sizeof(float)*totdoc_r);
00236 for(long i = 0; i < totdoc_r; i++)
00237 diag_r[i]=kernel(kernel_parm_p, docs_r[i], docs_r[i]);
00238 for(long i = 0; i < totdoc_r; i++)
00239 for(long j = 0; j < totdoc_c; j++){
00240 float val = kernel(kernel_parm_p, docs_r[i], docs_c[j]);
00241 float den = sqrt(diag_r[i]*diag_c[j]);
00242 if(den != 0)
00243 val /= den;
00244 setValue(matrix,i+offset_r,j+offset_c, val, 0, fixed_point);
00245 }
00246 free(diag_r);
00247 }
00248 else
00249 for(long i = 0; i < totdoc_r; i++)
00250 for(long j = 0; j < totdoc_c; j++)
00251 setValue(matrix,i+offset_r,j+offset_c, kernel(kernel_parm_p, docs_r[i], docs_c[j]), 0, fixed_point);
00252 }
00253
00254 void * symmetric_matrix(const vector<string>& docfile,
00255 long verbosity,
00256 long normalize,
00257 KERNEL_PARM * kernel_parm_p,
00258 long fixed_point,
00259 long* no_rows)
00260 {
00261 uint blocks = docfile.size();
00262 DOC **docs[blocks];
00263 long totwords[blocks],totdoc[blocks],fulltotdoc,offset_r = 0, offset_c = 0;
00264 double *target[blocks];
00265 void *matrix;
00266
00267 fulltotdoc = getNoRows(docfile);
00268
00269 if(fixed_point)
00270 matrix = malloc(sizeof(fixed_point_type)*fulltotdoc*(fulltotdoc+1)/2);
00271 else
00272 matrix = malloc(sizeof(float)*fulltotdoc*(fulltotdoc+1)/2);
00273
00274 for(uint c = 0; c < docfile.size(); c++){
00275 read_documents(const_cast<char *>(docfile[c].c_str()),&docs[c],&target[c],&totwords[c],&totdoc[c]);
00276 float * diag_c = symmetric_matrix(matrix,docs[c],totdoc[c],offset_c, kernel_parm_p, normalize, fixed_point);
00277 offset_r = offset_c + totdoc[c];
00278 for(uint r = c+1; r < docfile.size(); r++){
00279 read_documents(const_cast<char *>(docfile[r].c_str()),&docs[r],&target[r],&totwords[r],&totdoc[r]);
00280 symmetric_matrix(matrix, diag_c, docs[r], totdoc[r], offset_r, docs[c], totdoc[c], offset_c,
00281 kernel_parm_p, normalize, fixed_point);
00282 free_documents(docs[r],target[r],totdoc[r]);
00283 offset_r += totdoc[r];
00284 }
00285 free_documents(docs[c],target[c],totdoc[c]);
00286 if(diag_c) free(diag_c);
00287 offset_c += totdoc[c];
00288 }
00289
00290 *no_rows = fulltotdoc;
00291 return matrix;
00292 }
00293
00294 void asymmetric_matrix(void * matrix, DOC ** docs_r, long totdoc_r, long offset_r,
00295 DOC ** docs_c, long totdoc_c, long offset_c, long nocols,
00296 KERNEL_PARM * kernel_parm_p, long normalize, long fixed_point)
00297 {
00298 if(normalize){
00299 float * diag_r = (float*)malloc(sizeof(float)*totdoc_r);
00300 float * diag_c = (float*)malloc(sizeof(float)*totdoc_c);
00301 for(long i = 0; i < totdoc_r; i++)
00302 diag_r[i]=kernel(kernel_parm_p, docs_r[i], docs_r[i]);
00303 for(long i = 0; i < totdoc_c; i++)
00304 diag_c[i]=kernel(kernel_parm_p, docs_c[i], docs_c[i]);
00305 for(long i = 0; i < totdoc_r; i++)
00306 for(long j = 0; j < totdoc_c; j++){
00307 float val = kernel(kernel_parm_p, docs_r[i], docs_c[j]);
00308 float den = sqrt(diag_r[i]*diag_c[j]);
00309 if(den != 0)
00310 val /= den;
00311 setValue(matrix, i+offset_r, j+offset_c, val, nocols, fixed_point);
00312 }
00313 free(diag_r);
00314 free(diag_c);
00315 }
00316 else
00317 for(long i = 0; i < totdoc_r; i++)
00318 for(long j = 0; j < totdoc_c; j++)
00319 setValue(matrix,i+offset_r,j+offset_c, kernel(kernel_parm_p, docs_r[i], docs_c[j]), nocols, fixed_point);
00320 }
00321
00322 void * asymmetric_matrix(const vector<string>& docfile_r,
00323 const vector<string>& docfile_c,
00324 long verbosity,
00325 long normalize,
00326 KERNEL_PARM * kernel_parm_p,
00327 long fixed_point,
00328 long *no_rows,
00329 long *no_cols)
00330 {
00331 uint blocks_r = docfile_r.size();
00332 DOC **docs_r[blocks_r];
00333 long totwords_r[blocks_r],totdoc_r[blocks_r],fulltotdoc_r,offset_r = 0;
00334 double *target_r[blocks_r];
00335 uint blocks_c = docfile_c.size();
00336 DOC **docs_c[blocks_c];
00337 long totwords_c[blocks_c],totdoc_c[blocks_c],fulltotdoc_c;
00338 double *target_c[blocks_c];
00339 void *matrix;
00340
00341 fulltotdoc_r = getNoRows(docfile_r);
00342 fulltotdoc_c = getNoRows(docfile_c);
00343
00344 if(fixed_point)
00345 matrix = malloc(sizeof(fixed_point_type)*fulltotdoc_r*fulltotdoc_c);
00346 else
00347 matrix = malloc(sizeof(float)*fulltotdoc_r*fulltotdoc_c);
00348
00349 for(uint r = 0; r < blocks_r; r++){
00350 read_documents(const_cast<char *>(docfile_r[r].c_str()),&docs_r[r],&target_r[r],&totwords_r[r],&totdoc_r[r]);
00351 long offset_c = 0;
00352 for(uint c = 0; c < blocks_c; c++){
00353 read_documents(const_cast<char *>(docfile_c[c].c_str()),&docs_c[c],&target_c[c],&totwords_c[c],&totdoc_c[c]);
00354 asymmetric_matrix(matrix, docs_r[r], totdoc_r[r], offset_r, docs_c[c], totdoc_c[c], offset_c,
00355 fulltotdoc_c, kernel_parm_p, normalize, fixed_point);
00356 free_documents(docs_c[c],target_c[c],totdoc_c[c]);
00357 offset_c += totdoc_c[c];
00358 }
00359 free_documents(docs_r[r],target_r[r],totdoc_r[r]);
00360 offset_r += totdoc_r[r];
00361 }
00362
00363 *no_rows = fulltotdoc_r;
00364 *no_cols = fulltotdoc_c;
00365 return matrix;
00366 }
00367
00368
00369 void print_matrix(const char * matrixfile, void * matrix, long fixed_point,
00370 long no_rows, long no_cols, long binary, long lower_traingular)
00371 {
00372 FILE* fp;
00373 int i,j;
00374
00375 fp = fopen(matrixfile,"w");
00376 if(fp == NULL){
00377 perror("ERROR in opening matrix file:");
00378 exit(1);
00379 }
00380
00381 if(!no_cols)
00382 if(lower_traingular)
00383 for(i = 0; i < no_rows; i++){
00384 for(j = 0; j <= i; j++)
00385 printValue(fp, matrix, (j+i*(i+1)/2), fixed_point, binary);
00386 if(!binary)
00387 fprintf(fp,"\n");
00388 }
00389 else
00390 for(i = 0; i < no_rows; i++){
00391 for(j = 0; j < no_rows; j++)
00392 printValue(fp, matrix, (j<=i) ? (j+i*(i+1)/2) : (i+j*(j+1)/2), fixed_point, binary);
00393 if(!binary)
00394 fprintf(fp,"\n");
00395 }
00396 else
00397 for(i = 0; i < no_rows; i++){
00398 for(j = 0; j < no_cols; j++)
00399 printValue(fp, matrix, (j+i*no_cols), fixed_point, binary);
00400 if(!binary)
00401 fprintf(fp,"\n");
00402 }
00403
00404 fclose(fp);
00405 }
00406
00407 void getStringList(string buf, vector<string>& list)
00408 {
00409 int beg = 0, end = 0;
00410 while((end = buf.find(":",beg)) != string::npos){
00411 list.push_back(buf.substr(beg,end-beg));
00412 beg = end+1;
00413 }
00414 list.push_back(buf.substr(beg));
00415 }
00416
00417
00418
00419 void read_input_parameters(int argc,char *argv[],string& matrixfile, vector<string>& docfile_r,
00420 vector<string>& docfile_c, long *verbosity,long *normalize,
00421 KERNEL_PARM *kernel_parm, long *fixed_point, long *binary,
00422 long *lower_triangular)
00423 {
00424 long i;
00425
00426
00427 strcpy (kernel_parm->plugin_file, "");
00428 (*verbosity)=1;
00429 (*normalize)=0;
00430 kernel_parm->kernel_type=0;
00431 kernel_parm->poly_degree=3;
00432 kernel_parm->rbf_gamma=1.0;
00433 kernel_parm->coef_lin=1;
00434 kernel_parm->coef_const=1;
00435 strcpy(kernel_parm->custom,"empty");
00436 (*fixed_point) = 0;
00437 (*binary) = 0;
00438 (*lower_triangular) = 0;
00439
00440 for(i=1;(i<argc) && ((argv[i])[0] == '-');i++) {
00441 switch ((argv[i])[1])
00442 {
00443 case '?': print_help(); exit(0);
00444 case 'v': i++; (*verbosity)=atol(argv[i]); break;
00445 case 'R': i++; getStringList(argv[i], docfile_r); break;
00446 case 'C': i++; getStringList(argv[i], docfile_c); break;
00447 case 't': i++; kernel_parm->kernel_type=atol(argv[i]); break;
00448 case 'd': i++; kernel_parm->poly_degree=atol(argv[i]); break;
00449 case 'g': i++; kernel_parm->rbf_gamma=atof(argv[i]); break;
00450 case 's': i++; kernel_parm->coef_lin=atof(argv[i]); break;
00451 case 'r': i++; kernel_parm->coef_const=atof(argv[i]); break;
00452 case 'u': i++; strcpy(kernel_parm->custom,argv[i]); break;
00453 case 'D': i++; strcpy(kernel_parm->plugin_file,argv[i]);
00454 if (kernel_parm->kernel_type==0) kernel_parm->kernel_type=4;
00455 break;
00456 case 'N': (*normalize)=1; break;
00457 case 'B': (*binary)=1; break;
00458 case 'F': (*fixed_point)=1; break;
00459 case 'L': (*lower_triangular)=1; break;
00460 default: printf("\nUnrecognized option %s!\n\n",argv[i]);
00461 print_help();
00462 exit(0);
00463 }
00464 }
00465 if(i>=argc) {
00466 printf("\nNot enough input parameters!\n\n");
00467 wait_any_key();
00468 print_help();
00469 exit(0);
00470 }
00471 if(*fixed_point && !(*normalize)){
00472 printf("\nFixed point can only be used with normalized kernels!\n\n");
00473 wait_any_key();
00474 print_help();
00475 exit(0);
00476 }
00477 matrixfile = argv[i++];
00478 }
00479
00480 void wait_any_key()
00481 {
00482 printf("\n(more)\n");
00483 (void)getc(stdin);
00484 }
00485
00486 void print_help()
00487 {
00488 printf("\nSVM-light %s: Support Vector Machine, Gram matrix computation module %s\n",VERSION,VERSION_DATE);
00489 copyright_notice();
00490 printf(" usage: svm_Dmatrix [options] matrix_file\n\n");
00491 printf("Arguments:\n");
00492 printf(" matrix_file-> file where kernel matrix will be saved\n");
00493 printf("General options:\n");
00494 printf(" -? -> this help\n");
00495 printf(" -v [0..3] -> verbosity level (default 1)\n");
00496 printf(" -R file1:..:fileN -> example files for rows (and cols if no -C -> i.e. symmetric)\n");
00497 printf(" -C file1:..:fileN -> example files for cols (asymmetric matrix)\n");
00498 printf("Kernel options:\n");
00499 printf(" -t int -> type of kernel function:\n");
00500 printf(" 0: linear (default if -D not given)\n");
00501 printf(" 1: polynomial (s a*b+c)^d\n");
00502 printf(" 2: radial basis function exp(-gamma ||a-b||^2)\n");
00503 printf(" 3: sigmoid tanh(s a*b + c)\n");
00504 printf(" 4: user defined kernel as specified in plugin (default when -D is given)\n");
00505 printf(" -d int -> parameter d in polynomial kernel\n");
00506 printf(" -g float -> parameter gamma in rbf kernel\n");
00507 printf(" -s float -> parameter s in sigmoid/poly kernel\n");
00508 printf(" -r float -> parameter c in sigmoid/poly kernel\n");
00509 printf(" -u string -> parameter of user defined kernel\n");
00510 printf(" -N -> normalize kernel (default=false)\n");
00511 printf(" -D string -> path to a dynamic library containing code for\n");
00512 printf(" custom data types and associated kernel\n");
00513 printf(" -B -> output matrix in binary form (instead of ascii)\n");
00514 printf(" -F -> use fixed point values (needs normalized kernel)\n");
00515 printf(" -L -> output lower triangular matrix only\n");
00516 wait_any_key();
00517 printf("\nMore details in:\n");
00518 printf("[1] T. Joachims, Making Large-Scale SVM Learning Practical. Advances in\n");
00519 printf(" Kernel Methods - Support Vector Learning, B. Schölkopf and C. Burges and\n");
00520 printf(" A. Smola (ed.), MIT Press, 1999.\n");
00521 printf("[2] T. Joachims, Estimating the Generalization performance of an SVM\n");
00522 printf(" Efficiently. International Conference on Machine Learning (ICML), 2000.\n");
00523 printf("[3] T. Joachims, Transductive Inference for Text Classification using Support\n");
00524 printf(" Vector Machines. International Conference on Machine Learning (ICML),\n");
00525 printf(" 1999.\n");
00526 printf("[4] K. Morik, P. Brockhausen, and T. Joachims, Combining statistical learning\n");
00527 printf(" with a knowledge-based approach - A case study in intensive care \n");
00528 printf(" monitoring. International Conference on Machine Learning (ICML), 1999.\n");
00529 printf("[5] T. Joachims, Learning to Classify Text Using Support Vector\n");
00530 printf(" Machines: Methods, Theory, and Algorithms. Dissertation, Kluwer,\n");
00531 printf(" 2002.\n\n");
00532 }
00533
00534 int main (int argc, char* argv[])
00535 {
00536 DOC **docs;
00537 long totwords,totdoc,i,j;
00538 double *target;
00539 KERNEL_PARM kernel_parm;
00540 long normalize;
00541 long fixed_point;
00542 long binary;
00543 long lower_triangular;
00544 void *matrix;
00545 long no_rows = 0, no_cols = 0;
00546 string matrixfile;
00547 vector<string> docfile_r;
00548 vector<string> docfile_c;
00549
00550 read_input_parameters(argc,argv,matrixfile,docfile_r,docfile_c,&verbosity,&normalize,&kernel_parm,
00551 &fixed_point, &binary, &lower_triangular);
00552 plugin_init(kernel_parm.plugin_file);
00553 plugin_kernel_setparm(kernel_parm.custom);
00554
00555
00556 if(docfile_c.size() == 0)
00557 matrix = symmetric_matrix(docfile_r,verbosity,normalize,&kernel_parm,fixed_point, &no_rows);
00558 else
00559 matrix = asymmetric_matrix(docfile_r,docfile_c,verbosity,normalize,&kernel_parm,fixed_point, &no_rows, &no_cols);
00560
00561 print_matrix(matrixfile.c_str(), matrix, fixed_point, no_rows, no_cols, binary, lower_triangular);
00562 free(matrix);
00563 return(0);
00564 }
00565