/* ====================================================================== David W. Aha ibl.c: Implementation of Instance-Based Learning Algorithms Topic: IB1, IB2, IB3, and now IB4 March 1994 Done: 1. Implemented ib4 -- Assumes that, if best_concept_only, there is only 1 predictee and that it is the last. Always put IB4's predictee last! -- Need to implement k updates to similarity fn per classification -- Need a translation process from multi-discrete-value to multi-binary-attributes. Not done automatically? Right. 2. k>=1 3. -norm_none, -norm_linear, and -norm_sd options 4. -missing_maxdiff, -missing-ave, -missing_ignore 5. 3/9/94: -storeall option implemented - Tells IBn (n>1) to save (initially) all instances 6. 3/9/94: -multiline option implemented - Allows alternative format for instances in input files - CRLF's allowed between attribute values within an instance - Each new instance must begin on a new line ====================================================================== */ /*==== Accessing the declarations. ====*/ #define MAIN 1 #include "datastructures.h" #undef MAIN /* ====================================================================== Main Function ====================================================================== */ main (argc, argv) int argc; char *argv[]; { int able_to_interpret_options(); void experiment(), print_usage(); if (argc < (1 + NUMBER_OF_REQUIRED_INPUTS)) /*==== Missing some required inputs ====*/ print_usage(); else { /*==== Read the required input parameters ====*/ strcpy(descriptionfile,argv[1]); strcpy(namesfile,argv[2]); strcpy(trainingfile,argv[3]); strcpy(testingfile,argv[4]); strcpy(outputfile,argv[5]); srandom((unsigned)atoi(argv[6])); /*==== Interpret the options, if possible ====*/ if (able_to_interpret_options(&argv[0],argc)) experiment(); else print_usage(); } } /* ====================================================================== Prints usage information for the program. ====================================================================== */ void print_usage() { printf("Usage: description namesfile trainfile testfile outputfile seed [options]\n"); printf("\nRequired Parameters:\n"); printf(" descriptionfile contains the predictee/predictor info\n"); printf(" namesfile contains the datafile's format information\n"); printf(" trainfile contains training instances\n"); printf(" testfile contains testing instances\n"); printf(" outputfile will contain the experiment's results\n"); printf(" seed is used to initialize random variable generator\n"); printf("\nUser Parameters: (name, default, and brief description)\n"); printf(" -signif_accept (75 confidence) above class frequency\n"); printf(" -signif_drop (75 confidence) below class frequency\n"); printf(" -ib1 (off) Act like IB1 rather than IB3\n"); printf(" -ib2 (on) Act like IB2 rather than IB3\n"); printf(" -ib3 (off) Act like IB3\n"); printf(" -ib4 (off) Act like IB4 rather than IB3\n"); printf(" -k (1) Number of nearest acceptables wanted\n"); printf(" -norm_none (off) No normalization option\n"); printf(" -norm_linear (on) Linear normalization option\n"); printf(" -norm_sd (off) Standard deviation normalize option\n"); printf(" -missing_maxdiff (on) Assume maximum possible difference\n"); printf(" -missing_ave (off) Assume average or most frequent value\n"); printf(" -missing_ignore (off) Ignore, Normalize the sim'y results\n"); printf(" -storeall (off) IBn>1: Save (initially) all instances\n"); printf(" -multiline (off) Allows alternative input format\n"); printf(" - Instances can then appear on multiple lines\n"); printf(" - Final line per instance must contain only one symbol: <\n"); printf("\nConvenience Options:\n"); printf(" -testrate (100) How often to run on test set\n"); printf(" -reportrate (25) How often to report on things\n"); printf(" -startup (0) When to start testing/reporting\n"); printf(" -overlap (off) IB4: set of binary concepts\n"); printf(" -best_concept_only (off) IB4: if overlap, give only best\n"); printf(" -probability_weights (off) IB4: Toggles between 2 methods\n"); printf(" -printweights (off) IB4: always print attribute weights\n"); printf(" -testlast (off) Test after finished?\n"); } /* ====================================================================== Tries to interpret the options. Yields 0 iff something is wrong. Else 1. ====================================================================== */ int able_to_interpret_options(argument,argc) int *argument,argc; { /*==== Subfunctions ====*/ extern int read_data_format(); /*==== Utility ====*/ extern void read_test_set(); /*==== Testing ====*/ int known_significance_level(), good_value_for_k(); void print_known_significance_levels(); double significance_level(); /*==== Local Variables ====*/ register int i,arg, startup_set=FALSE; char argstr[30]; /*==== Setup optional argument defaults. ====*/ signif_accept_level = 75.0; /*==== 75% level of significance. ====*/ signif_drop_level = 75.0; /*==== 75% level of significance. ====*/ testrate = 50; reportrate = 25; ib1 = FALSE; ib2 = FALSE; ib3 = TRUE; ib4= FALSE; norm_none = FALSE; norm_linear = TRUE; norm_sd = FALSE; missing_maxdiff = TRUE; missing_ave = FALSE; missing_ignore = FALSE; storeall = FALSE; multiline = FALSE; number_of_predictors = number_of_predictees = 0; k=1; overlap = FALSE; best_concept_only = FALSE; weight_method = SIMPLE; printweights = FALSE; testlast = FALSE; learning_rate = 0.1; /*==== Option values will be divided by 100.0 ====*/ /*==== Now process the options. ====*/ for(i=NUMBER_OF_REQUIRED_INPUTS+1; i 20000)) { printf("Test rate argument x: 1 <= x <= 20000.\n"); return(FALSE); } else { testrate = arg; i++; } } } else if (strcmp(*(argument+i),"-reportrate") == 0) { if (i == (argc)) { printf("Report rate option need integer argument.\n"); return(FALSE); } else { arg = atoi(*(argument+i+1)); if ((arg<1) || (arg > 10000)) { printf("Report rate argument x: 1 <= x <= 10000.\n"); return(FALSE); } else { reportrate = arg; i++; } } } else if (strcmp(*(argument+i),"-signif_accept") == 0) { if (i == (argc)) { printf("Frequency significance option needs integer arg.\n"); return(FALSE);} else { arg = atoi(*(argument+i+1)); if (!known_significance_level(arg)) { printf("Frequency Significance argument: "); print_known_significance_levels(); return(FALSE); } else { signif_accept_level = (double)arg; i++; } } } else if (strcmp(*(argument+i),"-signif_drop") == 0) { if (i == (argc)) { printf("Instance record significance option needs int arg.\n"); return(FALSE);} else { arg = atoi(*(argument+i+1)); if (!known_significance_level(arg)) { printf("Instance record significance argument: "); print_known_significance_levels(); return(FALSE); } else { signif_drop_level = (double)arg; i++; } } } else if (strcmp(*(argument+i),"-ib1") == 0) { ib1 = TRUE; ib2 = FALSE; ib3 = FALSE; ib4 = FALSE; } else if (strcmp(*(argument+i),"-ib2") == 0) { ib2 = TRUE; ib1 = FALSE; ib3 = FALSE; ib4 = FALSE; } else if (strcmp(*(argument+i),"-ib3") == 0) { ib3 = TRUE; ib1 = FALSE; ib2 = FALSE; ib4 = FALSE; } else if (strcmp(*(argument+i),"-ib4") == 0) { ib3 = FALSE; ib1 = FALSE; ib2 = FALSE; ib4 = TRUE; } else if (strcmp(*(argument+i),"-norm_none") == 0) { norm_none = TRUE; norm_linear = FALSE; norm_sd = FALSE; } else if (strcmp(*(argument+i),"-norm_linear") == 0) { norm_none = FALSE; norm_linear = TRUE; norm_sd = FALSE; } else if (strcmp(*(argument+i),"-norm_sd") == 0) { norm_none = FALSE; norm_linear = FALSE; norm_sd = TRUE; } else if (strcmp(*(argument+i),"-missing_maxdiff") == 0) { missing_maxdiff = TRUE; missing_ave = FALSE; missing_ignore = FALSE; } else if (strcmp(*(argument+i),"-missing_ave") == 0) { missing_maxdiff = FALSE; missing_ave = TRUE; missing_ignore = FALSE; } else if (strcmp(*(argument+i),"-missing_ignore") == 0) { missing_maxdiff = FALSE; missing_ave = FALSE; missing_ignore = TRUE; } else if (strcmp(*(argument+i),"-storeall") == 0) storeall = TRUE; else if (strcmp(*(argument+i),"-multiline") == 0) multiline = TRUE; else if (strcmp(*(argument+i),"-k") == 0) { if (i == (argc)) { printf("k option needs an odd integer argument.\n"); return(FALSE); } else { arg = atoi(*(argument+i+1)); if (!good_value_for_k(arg)) { printf("k argument: integer less than %d.\n", MAX_VALUE_OF_K); return(FALSE); } else { k = arg; i++; } } } else if (strcmp(*(argument+i),"-startup") == 0) { if (i == (argc)) { printf("Startup option needs int arg.\n"); return(FALSE);} else { startup = atoi(*(argument+i+1)); startup_set = TRUE; i++; } } else if (strcmp(*(argument+i),"-overlap") == 0) overlap = TRUE; else if (strcmp(*(argument+i),"-best_concept_only") == 0) best_concept_only = TRUE; else if (strcmp(*(argument+i),"-probability_weights") == 0) weight_method = PROBABILITY_WEIGHTS; else if (strcmp(*(argument+i),"-printweights") == 0) printweights = TRUE; else if (strcmp(*(argument+i),"-testlast") == 0) testlast = TRUE; else if (strcmp(*(argument+i),"-learning_rate") == 0) { if (i == (argc)) { printf("Learning rate option needs integer arg.\n"); return(FALSE);} else { arg = atoi(*(argument+i+1)); if ((arg<1) || (arg>25)) { printf("Learning rate option must be in range [1,25].\n"); return(FALSE); } else { learning_rate = 0.01 * (double)arg; i++; } } } else { printf("Unknown argument to ibl: %s\n",*(argument+i)); return(FALSE); } } /*==== Initialization stuff that wound up here ====*/ if (read_data_format() == INCORRECT) { printf("Bad data format.\n"); return(FALSE); } unknown_value = -1000.1234; /*==== Unknown attribute val placeholder ====*/ read_test_set(); /*==== Reads test set ====*/ signif_accept = significance_level(signif_accept_level); signif_drop = significance_level(signif_drop_level); if (!startup_set) { startup = reportrate; if (reportrate > testrate) startup = testrate; } return(TRUE); } /* ====================================================================== Outlines the 4 experiments. ====================================================================== */ void experiment() { void initialization(); extern void train_and_test(); /* Training */ printf("Initializing variables...\n"); initialization(); printf("Training and testing...\n"); train_and_test(); printf("Finished Experiment.\n"); } /* ====================================================================== Initializes system variables and windows. ====================================================================== */ void initialization() { register int i, attribute, j, predictee_a; /*==== Initialize the algorithm string name. ====*/ if (ib1) sprintf(algorithm_string,"%d-IB1",k); else if (ib2) sprintf(algorithm_string,"%d-IB2",k); else if (ib3) sprintf(algorithm_string,"%d-IB3",k); else sprintf(algorithm_string,"%d-IB4",k); /*==== Initialize the normalization string name. ====*/ if (norm_none) strcpy(normalization_string,"None"); else if (norm_linear) strcpy(normalization_string,"Linear"); else strcpy(normalization_string,"Standard Deviation"); /*==== Initialize the missing attribute values string name. ====*/ if (missing_maxdiff) strcpy(missing_string,"Maximum Difference"); else if (missing_ave) strcpy(missing_string,"Average or Most Frequent Value"); else strcpy(missing_string,"Ignore and Normalize Similarity Results"); /*==== Simple variables ====*/ instance_number = 0; /*==== Instance counter ====*/ my_infinity = 999999.0; /*==== Occasional usage. ====*/ initusages = 0.0; /*==== Initial number of usages. ====*/ initcorrectcount = 0.0; /*==== Initial count value. ====*/ initincorrectcount = 0.0; /*==== Initial count value. ====*/ /*==== Others are initialized in setup_for_testing. ====*/ num_data_in_alg = 0; num_dropped_in_alg = 0; for(i=0; i= 1) && (value <= MAX_VALUE_OF_K)) return(TRUE); else return(FALSE); }