/************************************************************************
 *                                                                      *
 *              Program package  T O O L D I A G                        *
 *                                                                      *
 *                            Version 1.5                               *
 *                   Date: Tue Feb 8 13:39:03 1994                      *
 *                                                                      *
 *  NOTE: This program package is copyrighted in the sense that it     *
 *  may be used for scientific purposes. The package as a whole, or    *
 *  parts thereof, cannot be included or used in any commercial        *
 *  application without written permission granted by the author.      *
 *  No programs contained in this package may be copied for commercial *
 *  distribution.                                                       *
 *                                                                      *
 *  All comments concerning this program package may be sent to the    *
 *  e-mail address 'tr@fct.unl.pt'.                                     *
 *                                                                      *
 ************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#include "def.h"
#include "matrix.h"

extern universe *U;
extern bool verbose;
extern bool feat_description;
extern char **feature_desc;

extern float Euclidian_Distance();

#define MIN_DISPLAY 20
#define MAX_K       10
#define K_DEFAULT    1

static int K = K_DEFAULT;        /* The nearest neighbor K */
static int maxFeat = MAXFEAT, *best = NULL;
static FeatVector minErrBF = NULL, sortedminErrBF = NULL;
static bool show_confusion_matrix = FALSE;

typedef struct NearNeibor_ {
    float dist;                  /* distance to the sample being classified   */
    int   classIndex;            /* class of this neighbor (EMPTY = unused)   */
} NearNeibor;

static NearNeibor *NN = NULL;    /* table of the K nearest neighbors, sorted by distance */
static str80 buf, gnuFile;
static char cmd[200];


/*
 * Ask the user for the number of nearest neighbors K (valid range 1 <= K < MAX_K).
 */
void set_K()
{
    int knn;

    K = K_DEFAULT;
    printf( "Nearest neighbor K=%d\b", K );
    gets( buf );
    if( buf[0] != '\0' ) {
        sscanf( buf, "%d", &knn );
        if( knn <= 0 || knn >= MAX_K ) {
            printf("Value %d for K is invalid. Setting to default %d...",
                   knn, K_DEFAULT );
            knn = K_DEFAULT;
            gets( buf );
        }
        K = knn;
    }
    /* printf("K=%d",K);DBG; /**/
}

void initMINERR()
{
    set_K();
}

/*
 * Reset the nearest-neighbor table: all slots empty, at infinite distance.
 */
static void init_NN( NN, K )
NearNeibor *NN;
int K;
{
    int j;

    for( j = 0; j < K; j++ ) {
        NN[j].dist = INFINITY;           /* infinity */
        NN[j].classIndex = EMPTY;
    }
}

/*
 * A distance smaller than an existing entry was found:
 * shift the entries below 'slot' downward and insert the new neighbor,
 * keeping the table sorted by increasing distance.
 */
static void fill_NN_slot( NN, K, slot, classIndex, distance )
NearNeibor *NN;
int K, slot, classIndex;
float distance;
{
    int b;

    for( b = K-1; b > slot; b-- ) {
        NN[b].dist = NN[b-1].dist;
        NN[b].classIndex = NN[b-1].classIndex;
    }
    NN[slot].dist = distance;
    NN[slot].classIndex = classIndex;
    /* printf(" fill_NN_slot: \n");
       for( b = 0; b < K; b++ )
           printf("K[%d] = %d with %f\n...", b, NN[b].classIndex+1, NN[b].dist );
       gets(buf); /**/
}
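
/*
 * Illustrative sketch only, not part of the original module: the external
 * Euclidian_Distance() used below is defined elsewhere in TOOLDIAG and is
 * assumed here to compute the ordinary L2 distance over the first 'len'
 * components of two feature vectors, roughly as follows.  The sketch uses
 * plain float pointers, assuming FeatVector is an array of float.
 */
static float sketch_euclidean_distance( a, b, len )
float *a, *b;
int len;
{
    int    i;
    double d, sum = 0.0;

    for( i = 0; i < len; i++ ) {
        d = (double)a[i] - (double)b[i];
        sum += d * d;
    }
    return( (float)sqrt( sum ) );
}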
/*
 * Evaluate the NN table:
 * return the index of the class that occurs most often among the K
 * nearest neighbors, or EMPTY if the table contains no valid entry.
 */
static int evaluate_KNN( NN, K, info )
NearNeibor *NN;
int K;
bool info;
{
    int i, a, maxFreq, Freq, maxIndex;

    if( info ) {
        printf("\n--------- Classification result -------------\n");
        for( a = 0; a < K; a++ ) {
            i = NN[a].classIndex;
            if( i >= 0 && i < U->nrClass )
                printf(" %3d. Neighbor = %20s with distance %.6f\n",
                       a+1, U->C[i].name, NN[a].dist);
        }
    }

    /* look for the class with the maximum frequency */
    /* Note: it could happen that another class with the same frequency as
       the most frequent class has some samples nearer to the unknown one */
    maxFreq = 0;
    maxIndex = EMPTY;
    for( i = 0; i < U->nrClass; i++ ) {
        Freq = 0;
        for( a = 0; a < K; a++ )
            if( NN[a].classIndex == i )
                Freq++;
        if( Freq > maxFreq ) {
            maxFreq = Freq;
            maxIndex = i;
        }
    }
    return( maxIndex );
}

/*
 * Classify the sample 'leave_out_Sample' with the K-NN rule, comparing it
 * against all samples of the universe except the one with index 'leave_out'
 * (EMPTY = compare against every sample).  Only the 'nrSelFeat' features
 * listed in FSV are used.  Returns the index of the winning class.
 */
static int est_err_KNN( nrSelFeat, K, leave_out, leave_out_Sample, FSV )
int nrSelFeat, K, leave_out;
FeatVector leave_out_Sample;
FeatSelectVector FSV;
{
    int i, j, k, a, actual_sample_index;
    bool smaller;
    FeatVector Sample;
    float distance;

    if( (Sample = (FeatVector)malloc(nrSelFeat*sizeof(FeatVector*))) == NULL ) {
        printf("\nNo space for buffer 'Sample'... Exitus...\n");
        exit(1);
    }

    actual_sample_index = 0;
    init_NN( NN, K );
    for( i = 0; i < U->nrClass; i++ ) {
        for( j = 0; j < U->C[i].numSampl; j++ ) {
            if( actual_sample_index != leave_out ) {
                for( k = 0; k < nrSelFeat; k++ )
                    Sample[k] = U->C[i].S[j*U->nrFeat+FSV[k].rank];
                distance = Euclidian_Distance(leave_out_Sample, Sample, nrSelFeat);
                /* compare the distance with the distances already calculated */
                a = 0;
                smaller = FALSE;
                while( !smaller && (a < K) ) {
                    smaller = ( distance < NN[a].dist );
                    a++;
                }
                if( smaller )
                    fill_NN_slot( NN, K, a-1, i, distance );
            }
            actual_sample_index++;
        }
    }
    FREE( Sample );

    /* distances are filled in; evaluate */
    return( evaluate_KNN( NN, K, FALSE ) );
}
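
/*
 * Worked example (illustrative values): with K = 5 and a NN table holding
 * the class indices { 0, 1, 0, 0, 1 }, evaluate_KNN() counts three votes for
 * class 0 and two for class 1 and returns 0.  If two classes tie on the vote
 * count, the one with the lower index wins, since 'Freq > maxFreq' replaces
 * the current maximum only on a strictly larger count; the neighbor distances
 * themselves never break the tie (this is the situation the note inside
 * evaluate_KNN() points out).
 */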
/*
 * ERROR ESTIMATION FOR THE K-NEAREST NEIGHBOR CLASSIFIER
 * BY THE LEAVE-ONE-OUT METHOD
 *
 * Each sample is left out in turn and classified with the remaining ones,
 * using the first 'feat' features of the selection vector FSV.
 * Returns the estimated error rate in [0,1].
 */
float estimate_error_KNN( feat, FSV )
int feat;
FeatSelectVector FSV;
{
    float global_error;
    int nrSampl, i, j, k, leave_out, id_class;
    FeatVector Sample;
    Matrix confusionMat;
    FeatVector FV = NULL;

    init_Matrix( &confusionMat );
    confusionMat.row = U->nrClass;
    confusionMat.col = U->nrClass;
    Matrix_Alloc( &confusionMat );
    Reset_Matrix( &confusionMat );

    if( (Sample = (FeatVector)malloc(feat*sizeof(FeatVector*))) == NULL ) {
        printf("No space for buffer 'Sample'. Exitus...\n");
        exit(1);
    }
    if( (FV = (FeatVector)malloc(feat*sizeof(FeatVector*))) == NULL ) {
        printf("\nNo space for buffer 'FV'. Exitus...\n");
        exit(1);
    }
    if( (NN = (NearNeibor*)malloc( K*sizeof(struct NearNeibor_))) == NULL ) {
        printf("\nNo space for buffer 'NN'. Exitus...\n");
        exit(1);
    }

    nrSampl = 0;
    for( i = 0; i < U->nrClass; i++ )
        nrSampl += U->C[i].numSampl;

    leave_out = 0;
    global_error = 0.0;
    if( verbose ) {
        printf("# Features: %2d - K: %2d - Total number of samples: %7d\n",
               feat, K, nrSampl);
        printf(" Samples to leave out:");
    }
    for( i = 0; i < U->nrClass; i++ ) {
        if( U->C[i].numSampl < K ) {
            printf("Warning!: Class %d = %s has fewer samples than K = %d...",
                   i+1, U->C[i].name, K );
            gets( buf );
        }
        for( j = 0; j < U->C[i].numSampl; j++ ) {
            for( k = 0; k < feat; k++ )
                Sample[k] = U->C[i].S[j*U->nrFeat+FSV[k].rank];
            id_class = est_err_KNN( feat, K, leave_out, Sample, FSV );

            /* write the result into the confusion matrix
               (skip the matrix entry if no class could be assigned at all) */
            if( id_class != EMPTY )
                confusionMat.Elem[i*U->nrClass+id_class] +=
                    1.0 / (float)U->C[i].numSampl;
            if( i != id_class )
                global_error += 1.0;

            leave_out++;
            if( verbose ) {
                printf("%7d\b\b\b\b\b\b\b", nrSampl-leave_out);
                fflush(stdout);
            }
        }
    }
    if( verbose )
        printf("\n");

    global_error /= (float)nrSampl;
    if( show_confusion_matrix ) {
        printf("\n --- Confusion Matrix ---\n");
        print_Matrix( &confusionMat );
    }
    Matrix_Free( &confusionMat );
    FREE( FV );
    FREE( Sample );
    FREE( NN );
    /* printf("Returning %f\n",global_error); /**/
    return( global_error );
}

/*
 * Classify one labelled sample with the K-NN rule over the currently
 * selected features.  Returns 0 if the recognized class matches the
 * expected class 'name', 1 otherwise.
 */
int id_one_sample_KNN( sample, name )
FeatVector sample;
char *name;
{
    int i, expected, recognized;
    FeatVector sampleSel = NULL;

    if( (NN = (NearNeibor*)malloc( K*sizeof(struct NearNeibor_))) == NULL ) {
        printf("\nNo space for buffer 'NN'. Exitus...\n");
        exit(1);
    }
    sampleSel = (FeatVector) malloc(sizeof(FeatVector*) * U->nrSelFeat);
    for( i = 0; i < U->nrSelFeat; i++ )
        sampleSel[i] = sample[ U->FSV[i].rank ];

    /* EMPTY means no sample is left out here: compare to every sample */
    recognized = est_err_KNN( U->nrSelFeat, K, EMPTY, sampleSel, U->FSV );
    FREE( sampleSel );
    FREE( NN );

    expected = EMPTY;
    for( i = 0; i < U->nrClass; i++ )
        if( strcmp(U->C[i].name, name) == 0 )
            expected = i;
    if( expected == EMPTY || expected >= U->nrClass ) {
        printf("\nUnknown class: \"%s\" - Exitus...\n", name );
        exit(1);
    }

    if( expected == recognized )
        return( 0 );
    else {
        fprintf( stderr, "Confused %s with expected class %s\n",
                 U->C[recognized].name, U->C[expected].name );
        return( 1 );
    }
}

#ifdef DOS
#define EXEC ""
#else
#define EXEC "exec "
#endif

#define TOL (10.0)      /* padding of the y range: dE / TOL on each side */
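
/*
 * For reference, the batch file written by gen_gnuplot() below looks roughly
 * like this (the universe name and the numbers are purely illustrative, e.g.
 * 5 selected features and errors between 0 % and 12 %):
 *
 *     #
 *     # Batch file to visualize error estimate
 *     # Generated automatically !
 *     #
 *     # Universe example
 *     set xrange [1:5]
 *     set xtics 0,1,5
 *     set yrange [0:14]
 *     set title "ERROR ESTIMATION: LEAVE-ONE-OUT"
 *     set xlabel "Number of selected features"
 *     set ylabel "Estimated error [%]"
 *     plot "_error_" with linespoints
 *     pause -1 "Hit return to exit..."
 */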
/*
 * Write the error curve to the file "_error_" in DATA_DIR, generate a
 * gnuplot batch file "_tmp.gnu" for it and run gnuplot on that file.
 */
void gen_gnuplot( errInfo, nrSelFeat )
float *errInfo;
int nrSelFeat;
{
    int i;
    float maxErr, minErr, dE;
    FILE *gf = NULL;

    strcpy( gnuFile, DATA_DIR );
    strcat( gnuFile, "_error_" );
    gf = fopen( gnuFile, f_open_text_w );
    if( gf == NULL ) {
        printf("Cannot open %s! Exitus...\n", gnuFile );
        exit(1);
    }
    if( verbose )
        printf("Dumping %s\n", gnuFile );

    maxErr = 0.0;
    minErr = INFINITY;
    for( i = 0; i < nrSelFeat; i++ ) {
        fprintf( gf, "%3d %7.5f\n", i+1, errInfo[i] );
        if( errInfo[i] > maxErr ) maxErr = errInfo[i];
        if( errInfo[i] < minErr ) minErr = errInfo[i];
    }
    fclose( gf );

    /* pad the y range a little around the observed error values */
    dE = maxErr - minErr;
    if( maxErr == 0.0 ) {
        maxErr = 1.0;
        minErr = -1.0;
    }
    else {
        minErr -= dE / TOL;
        if( minErr < 0.0 )   minErr = 0.0;
        maxErr += dE / TOL;
        if( maxErr > 100.0 ) maxErr = 100.0;
    }
    minErr = 0.0;       /* always start the y axis at zero */

    /* generate the gnuplot batch file */
    strcpy( gnuFile, DATA_DIR );
    strcat( gnuFile, "_tmp.gnu" );
    gf = fopen( gnuFile, f_open_text_w );
    if( gf == NULL ) {
        printf("Cannot open %s! Exitus...\n", gnuFile );
        exit(1);
    }
    if( verbose )
        printf("Dumping %s\n", gnuFile );

    fprintf( gf, "#\n# Batch file to visualize error estimate\n" );
    fprintf( gf, "# Generated automatically !\n#\n" );
    fprintf( gf, "# Universe %s\n", U->name );
    if( nrSelFeat > 1 )
        fprintf( gf, "set xrange [1:%d]\n", nrSelFeat );
    else
        fprintf( gf, "set xrange [0.9:1.1]\n" );
    fprintf( gf, "set xtics 0,1,%d\n", nrSelFeat );
    fprintf( gf, "set yrange [%d:%d]\n", (int)minErr, (int)(maxErr)+1 );
    fprintf( gf, "set title \"ERROR ESTIMATION: LEAVE-ONE-OUT\"\n");
    fprintf( gf, "set xlabel \"Number of selected features\"\n");
    fprintf( gf, "set ylabel \"Estimated error [%%]\"\n");
    fprintf( gf, "plot \"_error_\" with linespoints\n" );
    fprintf( gf, "pause -1 \"Hit return to exit...\"\n" );
    fclose( gf );

    sprintf( cmd, "cd %s\n\t%sgnuplot %s", DATA_DIR, EXEC, gnuFile );
    if( verbose )
        printf("\n --- Execute:\n\t%s\n", cmd );
#ifdef DOS
    /* under DOS the command is only printed, not executed */
#else
    system( cmd );
#endif
}

/*
 * Interactive front end: estimate the leave-one-out error of the K-NN
 * classifier for 1, 2, ..., nrSelFeat of the already selected features,
 * optionally write a protocol file, and plot the error curve.
 */
void est_err_nearneib()
{
    int i;
    bool protocolFile;
    float err, *errInfo = NULL;
    FILE *ef = NULL;
    str80 errFile, name;

    if( U->nrSelFeat == 0 ) {
        printf("Select features first...");
        gets( buf );
        return;
    }
    set_K();

    strcpy( errFile, DATA_DIR );
    printf("Saving the error estimation protocol to file:\n\t\t%s", errFile );
    gets( name );
    protocolFile = name[0] != '\0';
    if( protocolFile ) {
        strcat( errFile, name );
        ef = fopen( errFile, f_open_text_w );
        if( ef == NULL ) {
            printf("Cannot open %s! Exitus...\n", errFile );
            exit(1);
        }
        fprintf( ef, "Universe: %s\n", U->name );
    }
    else
        printf("\t\tNothing saved...\n");

    show_confusion_matrix = FALSE;
    printf(" --- Show confusion matrix (y/n)?n\b" );
    gets( buf );
    if( buf[0] == 'y' || buf[0] == 'Y' )
        show_confusion_matrix = TRUE;

    errInfo = (float*) malloc( U->nrSelFeat * sizeof(float) );
    printf("- Estimating error using leave-one-out\n");
    for( i = 1; i <= U->nrSelFeat; i++ ) {
        err = estimate_error_KNN( i, U->FSV );
        errInfo[i-1] = 100.0*err;
        printf(" ---> Error estimate = %5.2f%% using %4d of %4d features\n",
               100.0*err, i, U->nrSelFeat );
        if( protocolFile )
            fprintf( ef,
              "Nr. Features: %3d - Estimated error: %5.2f %% - Accuracy: %5.2f %%\n",
              i, 100.0*err, 100.0*(1.0-err) );
    }
    if( protocolFile )
        fclose( ef );

    gen_gnuplot( errInfo, U->nrSelFeat );
    FREE( errInfo );
}

/*
 * Selection criterion: accuracy = 1 - leave-one-out error of the K-NN
 * classifier over the first 'len' features of FSV.
 */
float select_multivariate_minerr( FSV, len )
FeatSelectVector FSV;
int len;
{
    float accuracy;

    accuracy = 1.0 - estimate_error_KNN( len, FSV );
    return( accuracy );
}

/*
 * --- old algorithm ---
 * Sequential forward selection (SFS): in every round, each feature that is
 * not yet selected is tried in turn, and the one whose addition gives the
 * lowest leave-one-out K-NN error is appended to the selected set.
 */
void featSelectMinErr_SFS()
{
    int i, j, nrSelectFeat, feat, minFeat;
    bool protocolFile, *already = NULL;
    float err, *errInfo = NULL, minErr;
    FILE *ef = NULL;
    str80 errFile, name;
    FeatSelectVector FSV = NULL;

    set_K();

    strcpy( errFile, DATA_DIR );
    printf("Saving the error estimation protocol to file:\n\t\t%s", errFile );
    gets( name );
    protocolFile = name[0] != '\0';
    if( protocolFile ) {
        strcat( errFile, name );
        ef = fopen( errFile, f_open_text_w );
        if( ef == NULL ) {
            printf("Cannot open %s! Exitus...\n", errFile );
            exit(1);
        }
        fprintf( ef, "Universe: %s\n", U->name );
    }
    else
        printf("\t\tNothing saved...\n");

    printf("Maximum number of features to be selected (1-%d)? ", U->nrFeat);
    get_d_range( &nrSelectFeat, 1, U->nrFeat, LEFT_CLOSED__RIGHT_CLOSED );

    show_confusion_matrix = FALSE;
    printf(" --- Show confusion matrix (y/n)?n\b" );
    gets( buf );
    if( buf[0] == 'y' || buf[0] == 'Y' )
        show_confusion_matrix = TRUE;

    already = (bool*) malloc( U->nrFeat * sizeof(bool) );
    for( i = 0; i < U->nrFeat; i++ )
        already[i] = FALSE;
    errInfo = (float*) malloc( U->nrFeat * sizeof(float) );
    FSV = (feature*) malloc( U->nrFeat * sizeof(struct feature_) );

    feat = 0;
    for( i = 0; i < nrSelectFeat; i++ ) {
        /* for all features not yet considered, test the error rate */
        minErr = 1.0;
        for( j = 0; j < U->nrFeat; j++ ) {
            if( ! already[j] ) {
                FSV[ feat ].rank = j;
                printf("Considering feature nr. %4d of %4d-%4d as candidate nr. %4d\r",
                       j+1, U->nrFeat, feat, i+1 );
                fflush( stdout );
                if( verbose )
                    printf("\n" );
                err = estimate_error_KNN( feat+1, FSV );
                if( err < minErr ) {
                    minErr = err;
                    minFeat = j;
                }
            }
        }
        /* which one had the best performance ? */
        FSV[ feat ].rank = minFeat;
        FSV[ feat ].crit = minErr;
        already[ minFeat ] = TRUE;
        errInfo[ feat ] = 100.0*minErr;
        feat++;

        printf("\n ---> Error estimate = %5.2f %%\n", 100.0*minErr );
        printf("For feature vector:\n\t( ");
        for( j = 0; j < feat; j++ )
            printf("%d ", FSV[j].rank );
        printf(")\n\n");
        if( protocolFile )
            fprintf( ef,
              "Nr. Features: %3d - Estimated error: %5.2f %% - Accuracy: %5.2f %%\n",
              i+1, 100.0*minErr, 100.0*(1.0-minErr) );
    }

    U->FSV = (feature*) malloc( feat * sizeof(struct feature_) );
    printf("\n--- SELECTED FEATURES --- : %d\n", feat);
    for( j = 0; j < feat; j++ ) {
        U->FSV[j].rank = FSV[j].rank;
        U->FSV[j].crit = FSV[j].crit;
        printf("Nr. %3d = %3d --- Criterion = %7.5f",
               j+1, U->FSV[j].rank, U->FSV[j].crit );
        if( feat_description )
            printf("\tName: %s", feature_desc[U->FSV[j].rank] );
        printf("\n");
    }
    U->nrSelFeat = feat;
    save_selected_feat();

    if( protocolFile )
        fclose( ef );
    gen_gnuplot( errInfo, U->nrSelFeat );
    FREE( errInfo );
    FREE( FSV );
    FREE( already );
}