/* ====================================================================== David W. Aha training.c: Source file for training set processing procedures Topic: IB1, IB2, IB3, and now IB4 July, 1990 ====================================================================== */ #include "datastructures.h" /* ====================================================================== Reading the examples into the data_set array of example lists. ====================================================================== */ void train_and_test() { /*==== Subfunctions ====*/ extern ptr_instance translate_instance(); /*==== Utility ====*/ extern void process_new_training_instance(), /*==== Utility ====*/ print_general_info(), /*==== Printing ====*/ update_confidence_intervals(), /*==== Utility ====*/ print_classification_results(); /*==== Printing ====*/ void run_algorithms(), store_current_instance(); /*==== Local Variables ====*/ FILE *fopen(), *training_set_file, *output_file; char raw_training_instance[MAXLINE]; register int c; /*==== 1. Open files, print default output. ====*/ training_set_file = fopen(trainingfile,"r"); output_file = fopen(outputfile,"w"); print_general_info(output_file); /*==== 2. Process first instance separately. ====*/ fgets(raw_training_instance, MAXLINE, training_set_file); process_new_training_instance(translate_instance(raw_training_instance, training_set_file)); for(c=0; cattributes[predictee[predictee_a]]; /*==== Nothing if value isn't known!! (Must be supervised) ====*/ if (predictee_val != unknown_value) { if (ib1) store_current_instance(predictee_a); else { incorporate(predictee_a,predictee_val); if (ib3 || ib4) { if (ib4) update_similarity_definition(predictee_a); update_classification_records(predictee_a); } } } } /*==== 2. Note that we processed another training instance ====*/ instance_number++; if (instance_number > MAX_NUMBER_OF_INSTANCES) { printf("FATAL ERROR! Too many training instances (> %d)\n", MAX_NUMBER_OF_INSTANCES); instance_number = 0; /*==== Sabatoged! ====*/ } /*==== 3. Test and summarize for me periodically ====*/ if (instance_number >= startup) if (((instance_number % reportrate) == 0) || ((instance_number % testrate) == 0)) print_classification_results(output_file,0); } /* ====================================================================== Given an instance represented as a record, stores it in the appropriate list of data examples in data_set under the concept name's index. ====================================================================== */ void store_current_instance(predictee_a) register int predictee_a; /*==== Target concept index ====*/ { /*==== 1. Store new instance index ====*/ data_set[predictee_a][num_data_in_concept[predictee_a]] = instance_number; num_uses[instance_number]++; /*=== 2. Update size counts for alg and this alg/concept pair ====*/ num_data_in_alg++; num_data_in_concept[predictee_a]++; } /* ====================================================================== Incorporates a training instance for one of IB5's concept's description. Invariant: instances[instance_number]->attributes[concept] is known. Invariant: at least one instance has been saved for this concept. ====================================================================== */ void incorporate(predictee_a,predictee_val) register int predictee_a; /*==== Target concept index ====*/ double predictee_val; /*== Value of current instance to predict! ====*/ { /*==== Subfunctions ====*/ void find_similar_neighbors(), store_current_instance(); int correct_classification(); /*==== 1. Find nearest acceptable n'bor; update nearby instances' wts ====*/ find_similar_neighbors(predictee_a); /*==== 2. Classify new training instance; update training accuracy ====*/ if (correct_classification(predictee_a,predictee_val)) { /*==== CORRECT CLASSIFICATION of current instance: discard ====*/ if (storeall) store_current_instance(predictee_a); num_training_correct_in_concept[predictee_a]++; num_training_correct_in_alg++; } else /*==== MISCLASSIFIED current instance: store ====*/ { store_current_instance(predictee_a); num_training_incorrect_in_concept[predictee_a]++; num_training_incorrect_in_alg++; } } /* ====================================================================== Computes similarity, orders instances, and updates weights. ====================================================================== */ void find_similar_neighbors(predictee_a) register int predictee_a; /*==== Weight type to do this for ====*/ { /*==== Subfunctions ====*/ void compute_similarities_to_current_instance(), order_instances_by_similarity(); /*==== 1. Calculate similarities to current training instance ====*/ compute_similarities_to_current_instance(predictee_a); /*==== 2. Order the instances by non-increasing similarity ====*/ order_instances_by_similarity(predictee_a); /*==== 3. If none accepted, update only some most similar instances ====*/ if (num_accepted == 0) if (num_rejected == 0) { printf("Failure in find_similar_neighbors: 0 accepted/rejected!\n"); num_accepted = num_rejected = -1; /*==== Sabatoge! ====*/ } else num_rejected = ((int)random() % num_rejected)+1; } /* ====================================================================== Computing all information for calculating similarities from new instance to all previous training instances. Side effects: values saved in the array named distances. Used only during training. Now assuming that no attributes are missing. Assumes for the moment that all attribute values are numeric. ====================================================================== */ void compute_similarities_to_current_instance(predictee_a) register int predictee_a; { /*==== Subfunctions ====*/ extern double diff_with_unknowns(); /*==== Utility ====*/ /*==== Local Variables ====*/ register int data_id, predictor_a, a; double sum, diff, normalizer; struct instance *oldinst, *newinst = instances[instance_number]; /*==== 1. Dynamic determination of weighted distance ====*/ for(data_id=0; data_idattributes[predictee[predictee_a]] == unknown_value) distances[data_id] = my_infinity; /*==== Preventing 0 division ====*/ else { for(predictor_a=0; predictor_aattributes[a] == unknown_value) || (oldinst->attributes[a] == unknown_value)) { if (ib4) normalizer -= attribute_weight_used[predictee_a][predictor_a]; else normalizer -= 1.0; diff = diff_with_unknowns(a,newinst->attributes[a], oldinst->attributes[a]); } else if (attribute_type[a] == NOMINAL_TYPE) diff = (double)(newinst->attributes[a] != oldinst->attributes[a]); else /*==== This is a pair of known numeric values. ====*/ diff = newinst->attributes[a] - oldinst->attributes[a]; if (ib4) sum += diff * diff * attribute_weight_used[predictee_a][predictor_a]; else sum += diff * diff; } if (!missing_ignore) distances[data_id] = sqrt(sum); else if (normalizer == 0.0) distances[data_id] = my_infinity; else distances[data_id] = sqrt(sum)/sqrt(normalizer); } } } /* ====================================================================== Handles multiple ties. Assumes that saved similarities are in "distances" array. ====================================================================== */ void order_instances_by_similarity(predictee_a) register int predictee_a; { /*==== Subfunctions ====*/ void insert_accepted(), insert_rejected(), randomize_ordered(); int concept_instance_accepted(); /*==== Local Variables ====*/ register int i, num_required = k; /*==== An odd positive integer ====*/ /*==== 1. Setup ====*/ num_accepted = num_rejected = 0; /*==== 2. Find required # of accepted instances and those as close ====*/ for(i=0; i= num_required) && (distances[i] > accepted_distance[num_accepted-1]))) continue; else if (concept_instance_accepted(predictee_a,i)) insert_accepted(i,distances[i],num_required); else insert_rejected(i,distances[i]); } /*==== 3. Ensure randomly chosen first instance among ties. ====*/ randomize_ordered(); } /* ====================================================================== This instance is acceptable. Also, either not all required acceptable instances have been found OR this is at least as similar as one of the most similar num_required instances we've yet found. ====================================================================== */ void insert_accepted(alg_data_index,value,num_required) register int alg_data_index, /*==== index of saved instance ====*/ num_required; /*==== # needed to make classifications ====*/ double value; { /*==== Local Variables ====*/ register int n,j; /*==== 1. Locate position in acceptance array for this one ====*/ for(n=num_accepted-1; n>=0; n--) if (accepted_distance[n] <= value) break; /*==== 2. Push back rest of array ====*/ for(j=num_accepted; j>n+1; j--) { accepted_distance[j] = accepted_distance[j-1]; accepted_dataid[j] = accepted_dataid[j-1]; } /*==== 3. Add new value into the array. ====*/ accepted_dataid[n+1] = alg_data_index; accepted_distance[n+1] = value; num_accepted++; /*==== 4. Remove parts of both arrays that are no longer needed ====*/ if ((num_accepted > num_required) && (accepted_distance[num_required-1] < accepted_distance[num_required])) num_accepted = num_required; for(n=num_rejected-1; n>=0; n--) if (rejected_distance[n] <= accepted_distance[num_required-1]) break; num_rejected = n+1; } /* ====================================================================== Inserts a rejected instance into that array. All in this array are assumed to be at least as similar to the probe as is the least similar but required, acceptable instance. ====================================================================== */ void insert_rejected(alg_data_index,value) register int alg_data_index; /*==== Index of inst in description ====*/ double value; /*==== Its distance to the new inst ====*/ { /*==== Local Variables ====*/ register int n,j; /*==== 1. Locate position in rejection array for this one ====*/ for(n=num_rejected-1; n>=0; n--) if (rejected_distance[n] <= value) break; /*==== 2. Push back rest of array ====*/ for(j=num_rejected; j>n+1; j--) { rejected_distance[j] = rejected_distance[j-1]; rejected_dataid[j] = rejected_dataid[j-1]; } /*==== 3. Add new index and value into the array. ====*/ rejected_dataid[n+1] = alg_data_index; rejected_distance[n+1] = value; num_rejected++; } /* ====================================================================== Random selection of first instances for the accepted array (if needed). Purpose: use, in case of ties for most similar, a randomly-selected instance be used to update the tr classification accuracy. Also now needed for rejected instances if none accepted...because the first rejected instance is used to update the similarity function. ====================================================================== */ void randomize_ordered() { /*==== Local Variables ====*/ register int i, j, temp_id, chosen; double temp_distance; /*==== If a tie for most similar among accepteds, pick one randomly. ====*/ if ((num_accepted>k) && (accepted_distance[k-1] == accepted_distance[k])) { for(i=k-1; i>0; i--) { /*==== Find first instance with same value ====*/ if (accepted_distance[i-1] < accepted_distance[i]) break; } for(j=k; j+1=k-num_accepted) && (rejected_distance[k-1-num_accepted] == rejected_distance[k-num_accepted])) { for(i=k-1-num_accepted; i>0; i--) { /*==== Find first instance with same value ====*/ if (rejected_distance[i-1] < rejected_distance[i]) break; } for(j=k-num_accepted; j+1= class_accept_threshold[predictee_a][(int)instances[i_num]->attributes[predictee[predictee_a]]]) return(TRUE); else return(FALSE); } /* ====================================================================== Yields TRUE iff the current instance was correctly classified. The target is assumed to be either Boolean of Nominal-valued. Now works for k>=1. Given: first k instances have been "untied" via randomize_ordered. ====================================================================== */ int correct_classification(predictee_a,predictee_val) register int predictee_a; double predictee_val; { /*==== Local Variables ====*/ double guess; int guesses[MAX_NUMBER_OF_VALUES], guesses_in_max[MAX_NUMBER_OF_VALUES]; register int i, c = predictee[predictee_a], max, num_guesses_in_max; /*==== 0. Setup: Guesses must be initialized ====*/ if (num_values[c] < 2) { printf("FATAL ERROR! The predictee has less than 2 possible values.\n"); printf("Did you mistakenly give it numeric type in the doc file?\n"); num_values[c] = 2; } for(i=0; iattributes[c]]++; /*==== 2. Fetch guesses from contributing rejected instances ====*/ if (k>num_accepted) for(i=0; iattributes[c]]++; /*==== 3. Find set of guesses with max number ====*/ max = guesses[0]; guesses_in_max[0] = 0; num_guesses_in_max = 1; for(i=1; imax) { max = guesses[i]; guesses_in_max[0] = i; num_guesses_in_max = 1; } else if (guesses[i] == max) { guesses_in_max[num_guesses_in_max] = i; num_guesses_in_max++; } /*==== 4. Finally, randomly choose from amongst those tied ====*/ guess = (double)guesses_in_max[(int)(random() % num_guesses_in_max)]; /*==== 5. Was the classification correct?? ====*/ if (guess == predictee_val) return(TRUE); else return(FALSE); } /* ====================================================================== Postprocessing steps concern dropping instances that have been detected as noisy or have been introduced by noisy instances. ====================================================================== */ void update_classification_records(c) register int c; /*==== c is a predictee ====*/ { /*==== Subfunctions ====*/ extern void drop_instance(), /*==== Utility ====*/ update_confidence_intervals(); /*==== Utility ====*/ void update_usage(); /*==== Local Variables ====*/ register int ordered_index, i; double new_val = instances[instance_number]->attributes[predictee[c]]; /*==== 0. Setup: update conf. intervals AFTER classification ====*/ update_confidence_intervals(); /*==== 1. Process all required, accepted instances. ====*/ num_to_be_dropped = 0; for(ordered_index=0; ordered_index=0; i--) drop_instance(c,to_be_dropped[i]); } /* ====================================================================== Updates usages, counts, and dropping arrays for this instance. Globals: num_to_be_dropped and to_be_dropped[] are global. ======================================================================*/ void update_usage(c,data_id,new_val) int c, data_id; /*==== c is a predictee ====*/ double new_val; { /*==== Subfunctions ====*/ extern int noisy_instance(); /*==== Utility ====*/ /*==== Local variables ====*/ register int i, j; /*==== Do the work ====*/ usages[c][data_id]++; if (new_val == instances[data_set[c][data_id]]->attributes[predictee[c]]) counts[c][data_id][CORRECT]++; else { counts[c][data_id][INCORRECT]++; if (noisy_instance(c,data_id)) { /*==== They must be ordered here!!!! ====*/ for(i=0; ii; j--) to_be_dropped[j] = to_be_dropped[j-1]; to_be_dropped[i] = data_id; num_to_be_dropped++; } } }