/* ====================================================================== David W. Aha ibl.c: Implementation of Instance-Based Learning Algorithms Topic: IB1, IB2, IB3, and now IB4 March 1994 Done: 1. Implemented ib4 -- Assumes that, if best_concept_only, there is only 1 predictee and that it is the last. Always put IB4's predictee last! -- Need to implement k updates to similarity fn per classification -- Need a translation process from multi-discrete-value to multi-binary-attributes. Not done automatically? Right. 2. k>=1 3. -norm_none, -norm_linear, and -norm_sd options 4. -missing_maxdiff, -missing-ave, -missing_ignore 5. 3/9/94: -storeall option implemented - Tells IBn (n>1) to save (initially) all instances 6. 3/9/94: -multiline option implemented - Allows alternative format for instances in input files - CRLF's allowed between attribute values within an instance - Each new instance must begin on a new line ====================================================================== */ /*==== Accessing the declarations. ====*/ #define MAIN 1 #include "datastructures.h" #undef MAIN /* ====================================================================== Main Function ====================================================================== */ main (argc, argv) int argc; char *argv[]; { int able_to_interpret_options(); void experiment(), print_usage(); if (argc < (1 + NUMBER_OF_REQUIRED_INPUTS)) /*==== Missing some required inputs ====*/ print_usage(); else { /*==== Read the required input parameters ====*/ strcpy(descriptionfile,argv[1]); strcpy(namesfile,argv[2]); strcpy(trainingfile,argv[3]); strcpy(testingfile,argv[4]); strcpy(outputfile,argv[5]); srandom((unsigned)atoi(argv[6])); /*==== Interpret the options, if possible ====*/ if (able_to_interpret_options(&argv[0],argc)) experiment(); else print_usage(); } } /* ====================================================================== Prints usage information for the program. ====================================================================== */ void print_usage() { printf("Usage: description namesfile trainfile testfile outputfile seed [options]\n"); printf("\nRequired Parameters:\n"); printf(" descriptionfile contains the predictee/predictor info\n"); printf(" namesfile contains the datafile's format information\n"); printf(" trainfile contains training instances\n"); printf(" testfile contains testing instances\n"); printf(" outputfile will contain the experiment's results\n"); printf(" seed is used to initialize random variable generator\n"); printf("\nUser Parameters: (name, default, and brief description)\n"); printf(" -signif_accept (75 confidence) above class frequency\n"); printf(" -signif_drop (75 confidence) below class frequency\n"); printf(" -ib1 (off) Act like IB1 rather than IB3\n"); printf(" -ib2 (on) Act like IB2 rather than IB3\n"); printf(" -ib3 (off) Act like IB3\n"); printf(" -ib4 (off) Act like IB4 rather than IB3\n"); printf(" -k (1) Number of nearest acceptables wanted\n"); printf(" -norm_none (off) No normalization option\n"); printf(" -norm_linear (on) Linear normalization option\n"); printf(" -norm_sd (off) Standard deviation normalize option\n"); printf(" -missing_maxdiff (on) Assume maximum possible difference\n"); printf(" -missing_ave (off) Assume average or most frequent value\n"); printf(" -missing_ignore (off) Ignore, Normalize the sim'y results\n"); printf(" -storeall (off) IBn>1: Save (initially) all instances\n"); printf(" -multiline (off) Allows alternative input format\n"); printf(" - Instances can then appear on multiple lines\n"); printf(" - Final line per instance must contain only one symbol: <\n"); printf("\nConvenience Options:\n"); printf(" -testrate (100) How often to run on test set\n"); printf(" -reportrate (25) How often to report on things\n"); printf(" -startup (0) When to start testing/reporting\n"); printf(" -overlap (off) IB4: set of binary concepts\n"); printf(" -best_concept_only (off) IB4: if overlap, give only best\n"); printf(" -probability_weights (off) IB4: Toggles between 2 methods\n"); printf(" -printweights (off) IB4: always print attribute weights\n"); printf(" -testlast (off) Test after finished?\n"); } /* ====================================================================== Tries to interpret the options. Yields 0 iff something is wrong. Else 1. ====================================================================== */ int able_to_interpret_options(argument,argc) int *argument,argc; { /*==== Subfunctions ====*/ extern int read_data_format(); /*==== Utility ====*/ extern void read_test_set(); /*==== Testing ====*/ int known_significance_level(), good_value_for_k(); void print_known_significance_levels(); double significance_level(); /*==== Local Variables ====*/ register int i,arg, startup_set=FALSE; char argstr[30]; /*==== Setup optional argument defaults. ====*/ signif_accept_level = 75.0; /*==== 75% level of significance. ====*/ signif_drop_level = 75.0; /*==== 75% level of significance. ====*/ testrate = 50; reportrate = 25; ib1 = FALSE; ib2 = FALSE; ib3 = TRUE; ib4= FALSE; norm_none = FALSE; norm_linear = TRUE; norm_sd = FALSE; missing_maxdiff = TRUE; missing_ave = FALSE; missing_ignore = FALSE; storeall = FALSE; multiline = FALSE; number_of_predictors = number_of_predictees = 0; k=1; overlap = FALSE; best_concept_only = FALSE; weight_method = SIMPLE; printweights = FALSE; testlast = FALSE; learning_rate = 0.1; /*==== Option values will be divided by 100.0 ====*/ /*==== Now process the options. ====*/ for(i=NUMBER_OF_REQUIRED_INPUTS+1; i<argc; i++) { strcpy(argstr,*(argument+i)); if (strcmp(*(argument+i),"-testrate") == 0) { if (i == (argc)) { printf("Test rate option need integer argument.\n"); return(FALSE); } else { arg = atoi(*(argument+i+1)); if ((arg<1) || (arg > 20000)) { printf("Test rate argument x: 1 <= x <= 20000.\n"); return(FALSE); } else { testrate = arg; i++; } } } else if (strcmp(*(argument+i),"-reportrate") == 0) { if (i == (argc)) { printf("Report rate option need integer argument.\n"); return(FALSE); } else { arg = atoi(*(argument+i+1)); if ((arg<1) || (arg > 10000)) { printf("Report rate argument x: 1 <= x <= 10000.\n"); return(FALSE); } else { reportrate = arg; i++; } } } else if (strcmp(*(argument+i),"-signif_accept") == 0) { if (i == (argc)) { printf("Frequency significance option needs integer arg.\n"); return(FALSE);} else { arg = atoi(*(argument+i+1)); if (!known_significance_level(arg)) { printf("Frequency Significance argument: "); print_known_significance_levels(); return(FALSE); } else { signif_accept_level = (double)arg; i++; } } } else if (strcmp(*(argument+i),"-signif_drop") == 0) { if (i == (argc)) { printf("Instance record significance option needs int arg.\n"); return(FALSE);} else { arg = atoi(*(argument+i+1)); if (!known_significance_level(arg)) { printf("Instance record significance argument: "); print_known_significance_levels(); return(FALSE); } else { signif_drop_level = (double)arg; i++; } } } else if (strcmp(*(argument+i),"-ib1") == 0) { ib1 = TRUE; ib2 = FALSE; ib3 = FALSE; ib4 = FALSE; } else if (strcmp(*(argument+i),"-ib2") == 0) { ib2 = TRUE; ib1 = FALSE; ib3 = FALSE; ib4 = FALSE; } else if (strcmp(*(argument+i),"-ib3") == 0) { ib3 = TRUE; ib1 = FALSE; ib2 = FALSE; ib4 = FALSE; } else if (strcmp(*(argument+i),"-ib4") == 0) { ib3 = FALSE; ib1 = FALSE; ib2 = FALSE; ib4 = TRUE; } else if (strcmp(*(argument+i),"-norm_none") == 0) { norm_none = TRUE; norm_linear = FALSE; norm_sd = FALSE; } else if (strcmp(*(argument+i),"-norm_linear") == 0) { norm_none = FALSE; norm_linear = TRUE; norm_sd = FALSE; } else if (strcmp(*(argument+i),"-norm_sd") == 0) { norm_none = FALSE; norm_linear = FALSE; norm_sd = TRUE; } else if (strcmp(*(argument+i),"-missing_maxdiff") == 0) { missing_maxdiff = TRUE; missing_ave = FALSE; missing_ignore = FALSE; } else if (strcmp(*(argument+i),"-missing_ave") == 0) { missing_maxdiff = FALSE; missing_ave = TRUE; missing_ignore = FALSE; } else if (strcmp(*(argument+i),"-missing_ignore") == 0) { missing_maxdiff = FALSE; missing_ave = FALSE; missing_ignore = TRUE; } else if (strcmp(*(argument+i),"-storeall") == 0) storeall = TRUE; else if (strcmp(*(argument+i),"-multiline") == 0) multiline = TRUE; else if (strcmp(*(argument+i),"-k") == 0) { if (i == (argc)) { printf("k option needs an odd integer argument.\n"); return(FALSE); } else { arg = atoi(*(argument+i+1)); if (!good_value_for_k(arg)) { printf("k argument: integer less than %d.\n", MAX_VALUE_OF_K); return(FALSE); } else { k = arg; i++; } } } else if (strcmp(*(argument+i),"-startup") == 0) { if (i == (argc)) { printf("Startup option needs int arg.\n"); return(FALSE);} else { startup = atoi(*(argument+i+1)); startup_set = TRUE; i++; } } else if (strcmp(*(argument+i),"-overlap") == 0) overlap = TRUE; else if (strcmp(*(argument+i),"-best_concept_only") == 0) best_concept_only = TRUE; else if (strcmp(*(argument+i),"-probability_weights") == 0) weight_method = PROBABILITY_WEIGHTS; else if (strcmp(*(argument+i),"-printweights") == 0) printweights = TRUE; else if (strcmp(*(argument+i),"-testlast") == 0) testlast = TRUE; else if (strcmp(*(argument+i),"-learning_rate") == 0) { if (i == (argc)) { printf("Learning rate option needs integer arg.\n"); return(FALSE);} else { arg = atoi(*(argument+i+1)); if ((arg<1) || (arg>25)) { printf("Learning rate option must be in range [1,25].\n"); return(FALSE); } else { learning_rate = 0.01 * (double)arg; i++; } } } else { printf("Unknown argument to ibl: %s\n",*(argument+i)); return(FALSE); } } /*==== Initialization stuff that wound up here ====*/ if (read_data_format() == INCORRECT) { printf("Bad data format.\n"); return(FALSE); } unknown_value = -1000.1234; /*==== Unknown attribute val placeholder ====*/ read_test_set(); /*==== Reads test set ====*/ signif_accept = significance_level(signif_accept_level); signif_drop = significance_level(signif_drop_level); if (!startup_set) { startup = reportrate; if (reportrate > testrate) startup = testrate; } return(TRUE); } /* ====================================================================== Outlines the 4 experiments. ====================================================================== */ void experiment() { void initialization(); extern void train_and_test(); /* Training */ printf("Initializing variables...\n"); initialization(); printf("Training and testing...\n"); train_and_test(); printf("Finished Experiment.\n"); } /* ====================================================================== Initializes system variables and windows. ====================================================================== */ void initialization() { register int i, attribute, j, predictee_a; /*==== Initialize the algorithm string name. ====*/ if (ib1) sprintf(algorithm_string,"%d-IB1",k); else if (ib2) sprintf(algorithm_string,"%d-IB2",k); else if (ib3) sprintf(algorithm_string,"%d-IB3",k); else sprintf(algorithm_string,"%d-IB4",k); /*==== Initialize the normalization string name. ====*/ if (norm_none) strcpy(normalization_string,"None"); else if (norm_linear) strcpy(normalization_string,"Linear"); else strcpy(normalization_string,"Standard Deviation"); /*==== Initialize the missing attribute values string name. ====*/ if (missing_maxdiff) strcpy(missing_string,"Maximum Difference"); else if (missing_ave) strcpy(missing_string,"Average or Most Frequent Value"); else strcpy(missing_string,"Ignore and Normalize Similarity Results"); /*==== Simple variables ====*/ instance_number = 0; /*==== Instance counter ====*/ my_infinity = 999999.0; /*==== Occasional usage. ====*/ initusages = 0.0; /*==== Initial number of usages. ====*/ initcorrectcount = 0.0; /*==== Initial count value. ====*/ initincorrectcount = 0.0; /*==== Initial count value. ====*/ /*==== Others are initialized in setup_for_testing. ====*/ num_data_in_alg = 0; num_dropped_in_alg = 0; for(i=0; i<number_of_predictees; i++) { num_train_in_concept[i] = 0; num_data_in_concept[i] = 0; num_dropped_in_concept[i] = 0; num_training_correct_in_concept[i] = 0; num_training_incorrect_in_concept[i] = 0; for(j=0; j<MAX_NUMBER_OF_VALUES; j++) num_train_with_value[i][j] = 0; } /*==== These are used for print_recent_training_accuracy ====*/ num_training_correct_in_alg = 0; num_training_incorrect_in_alg = 0; for(i=0; i<2; i++) /*==== Indices mean: reporting and testing ====*/ { num_training_correct_in_alg_previous[i] = 0; num_training_incorrect_in_alg_previous[i] = 0; for(j=0; j <number_of_predictees; j++) { num_training_correct_in_concept_previous[i][j] = 0; num_training_incorrect_in_concept_previous[i][j] = 0; } } /*==== Initialize counts and usages. ====*/ for(i=0; i <MAX_NUMBER_OF_INSTANCES; i++) { num_uses[i] = 0; for(j=0; j<number_of_predictees; j++) { counts[j][i][CORRECT] = 0.0; counts[j][i][INCORRECT] = 0.0; usages[j][i] = 0.0; } } /*==== Attribute scales are updated dynamically during learning ====*/ for(attribute=0; attribute<number_of_attributes; attribute++) { scalemin[attribute] = my_infinity; /* Impossibly high */ scalemax[attribute] = -1.0 * my_infinity; /* Impossibly low */ } /*==== These used to track the attribute weights ====*/ for(predictee_a=0; predictee_a<number_of_predictees; predictee_a++) for(attribute=0; attribute<number_of_predictors; attribute++) { /*==== 1. Simple weighting method ====*/ attribute_weight_total[predictee_a][attribute] = 0.01; weight_size[predictee_a][attribute] = 0.01; attribute_weight[predictee_a][attribute] = 0.01; attribute_weight_used[predictee_a][attribute] = 0.5; if (weight_method == SIMPLE) attribute_weight_used[predictee_a][attribute] = 1.0/(double)number_of_predictors; /*==== 2. Conditional probability weighting method ====*/ conditional_probability[predictee_a][attribute] = 0.5; } /*==== Initialize stuff for norm_sd if it's set ====*/ if (norm_sd) for(attribute=0; attribute<number_of_predictors; attribute++) { sum_values[attribute] = 0.0; sum_squared_values[attribute] = 0.0; number_of_known_values[attribute] = 0.0; standard_deviation[attribute] = 1.0; } /*==== Initialize variables needed for missing_ave ====*/ if (missing_ave) for(attribute=0; attribute<number_of_attributes; attribute++) { num_predictor_training_values[attribute] = 0; sum_predictor_values[attribute] = 0.0; for(i=0; i<MAX_NUMBER_OF_VALUES; i++) num_training_instances_with_value[attribute][i] = 0; } } /* ====================================================================== Prints known significance levels. ====================================================================== */ void print_known_significance_levels() { printf("{55,60,67,70,75,80,85,90,95,97,99}\n"); } /* ====================================================================== True iff the given is a known significance level. ====================================================================== */ int known_significance_level(arg) int arg; { register int result; switch (arg) { case 55: case 60: case 67: case 70: case 75: case 80: case 85: case 90: case 95: case 97: case 99: result = TRUE; break; default: result = FALSE; } return(result); } /* ====================================================================== Yields significance level value z for given significance level. Assumes that the value v is a known significance level. ====================================================================== */ double significance_level(v) double v; { if (v == 55.0) return(0.125); else if (v == 60.0) return(0.255); else if (v == 67.0) return(0.44); else if (v == 70.0) return(0.525); else if (v == 75.0) return(0.675); else if (v == 80.0) return(0.843); else if (v == 85.0) return(1.037); else if (v == 90.0) return(1.285); else if (v == 95.0) return(1.645); else if (v == 97.0) return(1.881); else if (v == 99.0) return(2.325); else { printf("FAILURE IN SIGNIFICANCE LEVEL (with value %f)!!!\n",v); return(-1.0); } } /* ====================================================================== TRUE iff the argument is a good setting for k. ======================================================================*/ int good_value_for_k(value) int value; { if ((value >= 1) && (value <= MAX_VALUE_OF_K)) return(TRUE); else return(FALSE); }