/* ======================================================================
   David W. Aha
   utility.c: General utility programs
   Topic: IB1, IB2, IB3, and now IB4 (symbolics only at the moment)
   July, 1990
   ====================================================================== */

#include "datastructures.h"

/* ======================================================================
   Reading in the data file's format: (by record)
   For each record:
   --- attribute-name : attribute-type    -- no ":" in the attribute name!!!
   --- where attribute type is either
       -- numeric (then always made into a double)
       -- nominal
   --- if the attribute is nominally valued, then it will be followed
       by a list of attribute values.  Example: t,f
   Notes:
   -- class value always comes last for each instance!!!!
   -- Number of classes is the same as the number of attributes
   This sets values for
   1. number_of_attributes
   2. int attribute_type[number_of_attributes]
      -- either NUMERIC_TYPE, NOMINAL_TYPE, or BOOLEAN_TYPE
   3. char value_name[number_of_attributes][number_of_values][]
      -- Boolean attributes always have exactly 2 values
   12/7/88: Modified to handle non-numeric attributes.
   ====================================================================== */
int read_data_format()
{
  /*==== Subfunctions and Local Variables ====*/
  FILE *fopen(), *names_file;         /* (namesfile) */
  int read_description();
  char names_file_line[MAXLINE];
  int i,                  /* Counter for characters in a line */
      char_num = 0,       /* Counter for characters in a name */
      attribute_num = 0,  /* Counter for attribute number */
      value_num = 0;      /* Counter for attribute values */

  /*==== 1. Setup ====*/
  printf("Reading data format file (namesfile)...\n");
  names_file = fopen(namesfile,"r");

  /*==== 2. Read the attribute lines. ====*/
  while (NULL != fgets(names_file_line, MAXLINE, names_file))
    { /*==== 2.1 Discard everything up to the attribute type. ====*/
      for(i=0; names_file_line[i] != ':'; i++);
      /*==== 2.2 Read attribute type (1 letter suffices). ====*/
      if (names_file_line[i+3] == 'u')
        attribute_type[attribute_num] = NUMERIC_TYPE;
      else if (names_file_line[i+4] == 'o')
        { attribute_type[attribute_num] = BOOLEAN_TYPE;
          /*==== 2.2.1 Read the set of 2 attribute values now. ====*/
          value_num = 0;
          char_num = 0;
          for(i=i+10;; i++)
            if ((names_file_line[i] == ',') || (names_file_line[i] == '\n'))
              /*==== End of an attribute value name ====*/
              { value_name[attribute_num][value_num][char_num] = '\0';
                if (char_num == 0)
                  printf("Fatal! Bad names file: missing Boolean val\n");
                value_num++;
                if (value_num == 2)
                  break;
                char_num = 0;
              }
            else
              /*==== Middle of an attribute value name ====*/
              { value_name[attribute_num][value_num][char_num] = names_file_line[i];
                char_num++;
              }
          num_values[attribute_num] = 2;
        }
      else
        { attribute_type[attribute_num] = NOMINAL_TYPE;
          /*==== Read the set of attribute values now. ====*/
          value_num = 0;
          char_num = 0;
          for(i=i+10;; i++)
            if ((names_file_line[i] == ',') || (names_file_line[i] == '\n'))
              /*==== End of an attribute value name ====*/
              { value_name[attribute_num][value_num][char_num] = '\0';
                if (char_num == 0)
                  printf("Fatal! Bad names file: missing nominal val\n");
                value_num++;
                if (names_file_line[i] == '\n')
                  break;
                char_num = 0;
              }
            else
              /*==== Middle of an attribute value name ====*/
              { value_name[attribute_num][value_num][char_num] = names_file_line[i];
                char_num++;
              }
          num_values[attribute_num] = value_num;
        }
      attribute_num++;
    }
  number_of_attributes = attribute_num;

  /*==== 3. Cleanup and finish. ====*/
  fclose(names_file);
  return(read_description());
}
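/* ----------------------------------------------------------------------
   Illustrative sketch only (not from the original distribution): given
   the character offsets used by read_data_format() above, a names file
   is assumed to look roughly like the following hypothetical
   three-attribute domain, with one "name: type" record per line, the
   exact keywords "numeric", "boolean", and "nominal", a single space
   after the colon, and the class attribute last:

       temperature: numeric
       windy: boolean t,f
       outlook: nominal sunny,overcast,rain

   Attribute names must not contain ':'; a Boolean attribute lists
   exactly two values, a nominal attribute lists all of its values up to
   the end of the line.
   ---------------------------------------------------------------------- */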
/* ======================================================================
   Reads descriptionfile.  Determines who the predictees and predictors
   are.  Takes care of IB4 overlap situations also.
   ====================================================================== */
int read_description()
{
  /*==== Subfunctions and Local Variables ====*/
  FILE *fopen(), *description;
  int get_int();
  char descriptionline[MAXLINE];
  register int a, i, val;

  /*==== 1. Setup ====*/
  printf("Reading description file...\n");
  description = fopen(descriptionfile,"r");
  num_attributes_to_translate = number_of_attributes;

  /*==== 2. Determine predictees ====*/
  fgets(descriptionline,MAXLINE,description);
  for(a=0; a<number_of_attributes; a++)
    { val = get_int(a,descriptionline);
      if (val == 1)
        { predictee[number_of_predictees] = a;
          number_of_predictees++;
          if (number_of_predictees > MAX_NUMBER_OF_PREDICTEES)
            { printf("Too many predictees (at least %d).\n", number_of_predictees);
              return(INCORRECT);
            }
        }
      else if (val != 0)
        { printf("Predictees argument x: 0 <= x <= 1.\n");
          return(INCORRECT);
        }
    }

  /*==== 3. Determine predictors ====*/
  fgets(descriptionline,MAXLINE,description);
  for(a=0; a<number_of_attributes; a++)
    { val = get_int(a,descriptionline);
      if (val == 1)
        { predictor[number_of_predictors] = a;
          number_of_predictors++;
          if (number_of_predictors > MAX_NUMBER_OF_PREDICTORS)
            { printf("Too many predictors (at least %d).\n", number_of_predictors);
              return(INCORRECT);
            }
        }
      else if (val != 0)
        { printf("Predictors argument x: 0 <= x <= 1.\n");
          return(INCORRECT);
        }
    }

  /*==== 4. Cleanup ====*/
  fclose(description);
  return(CORRECT);
}

/* ======================================================================
   Gets an int and returns it.  Assumes that a (empty or nonempty) series
   of blanks precedes each entity.  Assumes that the line contains
   something in this position!
   ====================================================================== */
int get_int(position,resultline)
int position;
char resultline[MAXLINE];
{
  char intstr[MAXLINE];
  int curpos, char_num = 0, intchar = 0;

  /*==== Skip the first "position" blank-separated entities. ====*/
  for(curpos=0; curpos<position; curpos++)
    { while (resultline[char_num] == ' ')
        char_num++;
      while ((resultline[char_num] != ' ') && (resultline[char_num] != '\n'))
        char_num++;
    }

  /*==== Copy this entity's characters and convert. ====*/
  while (resultline[char_num] == ' ')
    char_num++;
  while ((resultline[char_num] != ' ') && (resultline[char_num] != '\n'))
    { intstr[intchar] = resultline[char_num];
      intchar++;
      char_num++;
    }
  intstr[intchar] = '\0';
  return(atoi(intstr));
}

/* ======================================================================
   Translates one line of the data file into an instance.  Numeric values
   go through atodouble(); nominal and Boolean values are mapped to the
   index of their value name; unknown values become unknown_value.
   ====================================================================== */
ptr_instance translate_instance(instanceline)
char instanceline[MAXLINE];
{
  /*==== Subfunctions ====*/
  double atodouble();
  int name_ending_char();
  char *malloc();
  /*==== Local Variables ====*/
  register int i = 0, j, char_num, attribute_num, value_num, predictee_a;
  char value[MAXLINE];
  ptr_instance inst;

  /*==== 1. Allocate the instance. ====*/
  inst = (ptr_instance)malloc(sizeof(struct instance));

  /*==== 2. Translate each attribute value on the line. ====*/
  for(attribute_num=0; attribute_num<num_attributes_to_translate; attribute_num++)
    { /*==== 2.1 Copy the next value into value[]. ====*/
      char_num = 0;
      while (!name_ending_char(instanceline[i]))
        { value[char_num] = instanceline[i];
          char_num++;
          i++;
        }
      value[char_num] = '\0';
      i++;
      /*==== 2.2 Translate it. ====*/
      if (strcmp(value,"?") == 0)
        inst->attributes[attribute_num] = unknown_value;
      else if (attribute_type[attribute_num] == NUMERIC_TYPE)
        inst->attributes[attribute_num] = atodouble(value);
      else
        /*==== Nominal or Boolean Type Attribute Value ====*/
        { for(value_num=0; value_num<num_values[attribute_num]; value_num++)
            if (strcmp(value,value_name[attribute_num][value_num]) == 0)
              { inst->attributes[attribute_num] = (double)value_num;
                break;
              }
          if (value_num == num_values[attribute_num])
            if (ib4 && overlap && (attribute_num == (num_attributes_to_translate-1)))
              inst->attributes[attribute_num] = (double)value_num;
            else
              { printf("FATAL ERROR in translate_instance.\n");
                printf("Cannot interpret attribute #%d (zero indexing).\n",
                       attribute_num);
                printf("Current line: %s\n",instanceline);
                printf("Illegal value read: %s\n",value);
                printf("Attributes read so far for this instance:\n");
                for(j=0; j<attribute_num; j++)
                  printf("%lf\n",inst->attributes[j]);
              }
        }
    }

  /*==== Overlap option: add concept values as attributes. ====*/
  if (ib4 && overlap)
    /*==== Assumes that the only predictee is at end of attribute list ====*/
    for(predictee_a=0; predictee_a<number_of_predictees; predictee_a++)
      inst->attributes[number_of_predictors+predictee_a] =
        (double)(strcmp(value,predictee_value_name[predictee_a]) == 0);

  /*==== 3. Assign the instance number and finish. ====*/
  inst->instance_number = instance_number;
  return(inst);
}
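/* ----------------------------------------------------------------------
   Illustrative sketch only: with the hypothetical names file shown after
   read_data_format() above, the data line

       71.0,t,rain

   would be translated into attributes[0] = 71.0 (numeric), attributes[1]
   = 0 (the index of "t"), and attributes[2] = 2 (the index of "rain").
   Treating a lone "?" as the unknown-value marker is an assumption, not
   something the names file format itself specifies; such a value is
   mapped to unknown_value.
   ---------------------------------------------------------------------- */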
/* ======================================================================
   Takes a string, yields a floating point number (a double).  The number
   doesn't require a decimal point -- handles integers or doubles.
   Doesn't require a leading zero for fractions.
   Modification: the value can now be negative.
   ====================================================================== */
double atodouble(numeric_string)
char numeric_string[20];
{
  register int i, integer_part, fraction_part, start_column;
  double result, fraction_size, negative_flag = 1.0;

  integer_part = 0;
  if (numeric_string[0] == '-')
    { start_column = 1;
      negative_flag = -1.0;
    }
  else
    start_column = 0;
  for(i=start_column; numeric_string[i]>='0' && numeric_string[i]<='9'; i++)
    integer_part = 10 * integer_part + numeric_string[i] - '0';
  if (numeric_string[i] == '.')
    { fraction_part = 0;
      fraction_size = 1.0;
      for(i=i+1; numeric_string[i]>='0' && numeric_string[i]<='9'; i++)
        { fraction_part = 10 * fraction_part + numeric_string[i] - '0';
          fraction_size *= 0.1;
        }
      result = (double)integer_part + ((double)fraction_part * fraction_size);
    }
  else
    /*==== This was an integer ====*/
    result = (double)integer_part;
  return(result*negative_flag);
}

/* ======================================================================
   Normalization for the domain.  Only for numeric attributes.  Both for
   norm_linear and norm_sd.  Assumes not norm_none.
   ====================================================================== */
ptr_instance normalize(inst)
struct instance *inst;
{
  /*==== Subfunctions ====*/
  double scale();
  /*==== Local Variables ====*/
  register int i;

  /*==== Normalize all numeric-valued attributes' values ====*/
  if (norm_linear)
    { for(i=0; i < number_of_attributes; i++)
        if ((attribute_type[i] == NUMERIC_TYPE) &&
            (inst->attributes[i] != unknown_value))
          inst->attributes[i] = scale(inst->attributes[i],scalemin[i],scalemax[i]);
    }
  else if (norm_sd)
    { for(i=0; i < number_of_attributes; i++)
        if ((attribute_type[i] == NUMERIC_TYPE) &&
            (inst->attributes[i] != unknown_value))
          inst->attributes[i] = inst->attributes[i]/standard_deviation[i];
    }
  return(inst);
}

/* ======================================================================
   Linearly maps the given value to the given scale (min and max found).
   Used by the normalize function of the data-dependent file.
   ====================================================================== */
double scale(Value,Min,Max)
double Value,Min,Max;
{
  if (Value == unknown_value)
    return(Value);
  else if (Value >= Max)
    return(1.0);
  else if (Value <= Min)
    return(0.0);
  else
    return((Value - Min)/(Max - Min));
}

/* ======================================================================
   PREDICATE: True iff the given instance is thought of as noisy.
   Assumes either ib3 or ib4.
   ====================================================================== */
int noisy_instance(predictee_a,id)
register int predictee_a,     /*==== Specifies predictee ====*/
             id;              /*==== Specifies concept instance ====*/
{
  /*==== Subfunctions ====*/
  double high_end_of_confidence_interval();
  /*==== Local Variables ====*/
  register int i_num = data_set[predictee_a][id];

  /*==== Check: is the accuracy interval's high end above the class interval's low end? ====*/
  /*==== Protection against 0 usages of this saved instance ====*/
  if (usages[predictee_a][id] == 0)
    return(FALSE);
  else if (high_end_of_confidence_interval(counts[predictee_a][id][CORRECT],
                                           usages[predictee_a][id],signif_drop)
           <= class_drop_threshold[predictee_a]
                [(int)instances[i_num]->attributes[predictee[predictee_a]]])
    return(TRUE);
  else
    return(FALSE);
}
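/* ----------------------------------------------------------------------
   Illustrative sketch only.  high_end_of_confidence_interval() and
   low_end_of_confidence_interval() are defined elsewhere in this program;
   the helper below is a hypothetical stand-in showing the standard
   confidence-interval-of-a-proportion formula that IB3-style noise
   filtering is usually described with (s successes out of n uses, at
   confidence factor z).  The name, signature, and exact formula here are
   assumptions, not this file's code.
   ---------------------------------------------------------------------- */
#if 0
#include <math.h>

static double interval_endpoint(s,n,z,want_high_end)
double s,n,z;
int want_high_end;
{
  double p, center, half_width, denominator;

  if (n == 0.0)                          /* no uses yet: widest interval */
    return(want_high_end ? 1.0 : 0.0);
  p = s/n;                               /* observed proportion          */
  center = p + (z*z)/(2.0*n);
  half_width = z * sqrt((p*(1.0 - p))/n + (z*z)/(4.0*n*n));
  denominator = 1.0 + (z*z)/n;
  if (want_high_end)
    return((center + half_width)/denominator);
  else
    return((center - half_width)/denominator);
}

/* noisy_instance() above asks the analogous question: drop when the HIGH
   end of the instance's accuracy interval falls at or below the LOW end
   of its class's observed-frequency interval. */
#endif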
/* ======================================================================
   Drops a noisy instance from the given algorithm.  This involves
   adjusting (1) data_set (and num_data) (2) counts (and introduced)
   ====================================================================== */
void drop_instance(predictee_a,alg_data_index)
register int predictee_a,       /* Specifies concept description */
             alg_data_index;    /* Specifies which concept instance */
{
  /*==== Local Variables ====*/
  register int i, instance_num = data_set[predictee_a][alg_data_index];

  /*==== Now remove the instance from memory ====*/
  num_uses[instance_num]--;     /*==== Used only in update_all_norm... ====*/
  num_dropped_in_concept[predictee_a]++;
  num_dropped_in_alg++;
  num_data_in_alg--;
  for(i=alg_data_index; i<(num_data_in_concept[predictee_a]-1); i++)
    { data_set[predictee_a][i] = data_set[predictee_a][i+1];
      counts[predictee_a][i][CORRECT] = counts[predictee_a][i+1][CORRECT];
      counts[predictee_a][i][INCORRECT] = counts[predictee_a][i+1][INCORRECT];
      usages[predictee_a][i] = usages[predictee_a][i+1];
    }
  num_data_in_concept[predictee_a]--;
}

/* ======================================================================
   Updates the confidence intervals AFTER prediction, BEFORE updating
   classification records.  This seems appropriate.
   ====================================================================== */
void update_confidence_intervals()
{
  /*==== Subfunctions ====*/
  double high_end_of_confidence_interval(), low_end_of_confidence_interval();
  /*==== Local Variables ====*/
  register int predictee_a, value;

  /*==== Set both thresholds for each potential predictee value ====*/
  for(predictee_a=0; predictee_a<number_of_predictees; predictee_a++)
    for(value=0; value<num_values[predictee[predictee_a]]; value++)
      { class_drop_threshold[predictee_a][value] =
          low_end_of_confidence_interval(num_train_with_value[predictee_a][value],
                                         num_train_in_concept[predictee_a],
                                         signif_drop);
        class_accept_threshold[predictee_a][value] =
          high_end_of_confidence_interval(num_train_with_value[predictee_a][value],
                                          num_train_in_concept[predictee_a],
                                          signif_accept);
      }
}

/* ======================================================================
   Copies an instance into a newly allocated instance and returns it.
   ====================================================================== */
ptr_instance copy_instance(inst)
struct instance *inst;
{
  /*==== Subfunctions ====*/
  char *malloc();
  /*==== Local Variables ====*/
  register int i;
  ptr_instance newinst;

  newinst = (ptr_instance)malloc(sizeof(struct instance));
  for(i=0; i<number_of_attributes; i++)
    newinst->attributes[i] = inst->attributes[i];
  newinst->instance_number = inst->instance_number;
  return(newinst);
}

/* ======================================================================
   TRUE iff the character is a name-ending character.
   ====================================================================== */
int name_ending_char(x)
char x;
{
  if ((x == ',') || (x == ' ') || (x == '\n') || (x == '#'))
    return(TRUE);
  else
    return(FALSE);
}

/* ======================================================================
   Process the new training instance.
   ====================================================================== */
void process_new_training_instance(newinst)
struct instance *newinst;         /*==== New training instance ====*/
{
  /*==== Subfunctions ====*/
  ptr_instance copy_instance(), normalize();
  void update_normalization_info(), update_training_counts(),
       update_only_maxs_and_mins();

  /*==== 1. Save raw instance.  Need it for re-normalization/printing. ====*/
  raw_instances[instance_number] = copy_instance(newinst);

  /*==== 2. Update the attribute normalization scales (if needed) ====*/
  /*==== 3. Normalize and save the normalized new training instance ====*/
  if (!norm_linear)
    update_only_maxs_and_mins(newinst);
  if (!norm_none)
    { update_normalization_info(newinst);
      newinst = normalize(newinst);
    }
  instances[instance_number] = newinst;

  /*==== 4. Update the training counts for the saved instances ====*/
  update_training_counts();

  /*==== Updates to the confidence intervals are done AFTER classification
         but before figuring out whether we should drop something ====*/
}
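/* ----------------------------------------------------------------------
   Worked example for the linear normalization above (scale(), as called
   from normalize() and from update_all_normalized_instances() below).
   The numbers are made up: with scalemin = 10.0 and scalemax = 30.0 for
   some numeric attribute,

       scale(20.0,10.0,30.0) yields 0.5
       scale( 5.0,10.0,30.0) yields 0.0   -- clamped at the low end
       scale(45.0,10.0,30.0) yields 1.0   -- clamped at the high end

   and a value equal to unknown_value passes through unchanged.
   ---------------------------------------------------------------------- */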
/* ======================================================================
   If norm_linear, updates the maxes and mins for all attributes.
   If norm_sd, recalculates the standard deviation for each numeric
   attribute.
   ====================================================================== */
void update_normalization_info(inst)
struct instance *inst;
{
  /*==== Subfunctions ====*/
  void update_maxs_and_mins(), update_standard_deviations();

  /*==== Choice of normalization function ====*/
  if (norm_linear)
    update_maxs_and_mins(inst);
  else if (norm_sd)
    update_standard_deviations(inst);
}

/* ======================================================================
   Norm_linear: update the maxes and mins for all attributes.
   ====================================================================== */
void update_maxs_and_mins(inst)
struct instance *inst;
{
  /*==== Subfunctions ====*/
  void update_all_normalized_instances();
  /*==== Local Variables ====*/
  register int attribute_num;

  /*==== Re-scale each numeric attribute that needs to be re-scaled. ====*/
  for(attribute_num=0; attribute_num<number_of_attributes; attribute_num++)
    if ((inst->attributes[attribute_num] != unknown_value) &&
        (attribute_type[attribute_num] == NUMERIC_TYPE))
      { if (inst->attributes[attribute_num] < scalemin[attribute_num])
          { scalemin[attribute_num] = inst->attributes[attribute_num];
            update_all_normalized_instances(attribute_num);
          }
        if (inst->attributes[attribute_num] > scalemax[attribute_num])
          { scalemax[attribute_num] = inst->attributes[attribute_num];
            update_all_normalized_instances(attribute_num);
          }
      }
}

/* ======================================================================
   Dynamically update all normalized instances for this attribute.
   Called only for numeric attributes.
   ====================================================================== */
void update_all_normalized_instances(a)
register int a;            /*==== attribute ====*/
{
  /*==== Subfunctions ====*/
  double scale();
  /*==== Local Variables ====*/
  register int i;
  double min = scalemin[a],
         max = scalemax[a];

  /*==== Re-scale this attribute's value for all saved instances ====*/
  /*==== Why?  Its range was expanded by the current instance ====*/
  for(i=0; i<instance_number; i++)
    if ((num_uses[i] > 0) && (instances[i]->attributes[a] != unknown_value))
      instances[i]->attributes[a] = scale(raw_instances[i]->attributes[a],min,max);
}

/* ======================================================================
   Norm_sd: 1. Update the standard deviations for all numeric predictors.
            2. Reset the values for all stored instances' values!
   ====================================================================== */
void update_standard_deviations(inst)
struct instance *inst;
{
  /*==== Local Variables ====*/
  register int predictor_a, a, i;
  double val, numerator, denominator;

  /*==== Update the standard deviation for each numeric predictor. ====*/
  for(predictor_a=0; predictor_a<number_of_predictors; predictor_a++)
    { a = predictor[predictor_a];
      if ((inst->attributes[a] != unknown_value) &&
          (attribute_type[a] == NUMERIC_TYPE))
        { /*==== 1. Compute the new standard deviation ====*/
          val = inst->attributes[a];
          sum_values[a] = sum_values[a] + val;
          sum_squared_values[a] = sum_squared_values[a] + (val*val);
          number_of_known_values[a] = number_of_known_values[a] + 1.0;
          numerator = (number_of_known_values[a] * sum_squared_values[a]) -
                      (sum_values[a] * sum_values[a]);
          denominator = number_of_known_values[a] *
                        (number_of_known_values[a] - 1.0);
          if (denominator == 0.0)
            standard_deviation[a] = my_infinity;
          else
            standard_deviation[a] = sqrt(numerator/denominator);

          /*==== 2. Update the old instances' values.  Avoid 0 divide. ====*/
          if (standard_deviation[a] == 0.0)
            standard_deviation[a] = 1.0/my_infinity;
          for(i=0; i<instance_number; i++)
            if ((num_uses[i] > 0) && (instances[i]->attributes[a] != unknown_value))
              instances[i]->attributes[a] =
                raw_instances[i]->attributes[a]/standard_deviation[a];
        }
    }
}
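/* ----------------------------------------------------------------------
   Worked example (made-up numbers) for the running-sums formula above:
   sd = sqrt((n*sum_sq - sum*sum) / (n*(n-1))).  After seeing the values
   2, 4, and 6 for one attribute: n = 3, sum = 12, sum_sq = 56, so

       numerator   = 3*56 - 12*12 = 24
       denominator = 3*2          = 6
       sd          = sqrt(4)      = 2.0

   which matches the sample standard deviation of {2,4,6}.
   ---------------------------------------------------------------------- */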
/* ======================================================================
   Update only the maxs and mins for this new training instance.
   ====================================================================== */
void update_only_maxs_and_mins(inst)
struct instance *inst;
{
  /*==== Subfunctions ====*/
  double scale();
  /*==== Local Variables ====*/
  register int attribute_num;

  /*==== Do it for all numeric values. ====*/
  for(attribute_num=0; attribute_num<number_of_attributes; attribute_num++)
    if ((inst->attributes[attribute_num] != unknown_value) &&
        (attribute_type[attribute_num] == NUMERIC_TYPE))
      { if (inst->attributes[attribute_num] < scalemin[attribute_num])
          scalemin[attribute_num] = inst->attributes[attribute_num];
        if (inst->attributes[attribute_num] > scalemax[attribute_num])
          scalemax[attribute_num] = inst->attributes[attribute_num];
      }
}

/* ======================================================================
   Updates the training counts for the current instance.
   Used by IB3 to estimate each class's relative frequency.
   ====================================================================== */
void update_training_counts()
{
  /*==== Local Variables ====*/
  register int predictee_a, predictor_a, a;
  double val;

  /*==== For each target concept, if its value is known, update the counts ====*/
  for(predictee_a=0; predictee_a<number_of_predictees; predictee_a++)
    { val = instances[instance_number]->attributes[predictee[predictee_a]];
      if (val != unknown_value)
        { num_train_with_value[predictee_a][(int)val]++;
          num_train_in_concept[predictee_a]++;
        }
    }

  /*==== For keeping track of predictor averages and most frequent values ====*/
  if (missing_ave)
    for(predictor_a=0; predictor_a<number_of_predictors; predictor_a++)
      { a = predictor[predictor_a];
        val = raw_instances[instance_number]->attributes[a];
        if (val != unknown_value)
          { num_predictor_training_values[a]++;
            if (attribute_type[a] == NUMERIC_TYPE)
              sum_predictor_values[a] += val;
            else
              { val = instances[instance_number]->attributes[a];
                num_training_instances_with_value[a][(int)val]++;
              }
          }
      }
}

/* ======================================================================
   Updates a similarity definition for IB4.  Version 1!  (no Pr's)
   ====================================================================== */
void update_similarity_definition(predictee_a)
register int predictee_a;
{
  /*==== Subfunctions ====*/
  void update_attribute_weights(), normalize_all_weights();
  /*==== Local Variables ====*/
  register int i;

  /*==== 1. Process with all accepted instances and rejected up to k ====*/
  for(i=0; i<num_data_in_concept[predictee_a]; i++)
    update_attribute_weights(predictee_a,i);

  /*==== 2. Re-normalize the attribute weights ====*/
  normalize_all_weights(predictee_a);
}

/* ======================================================================
   Updates the attribute weight accumulators (numerators and denominators)
   by comparing the current training instance with one saved instance of
   this concept description.
   ====================================================================== */
void update_attribute_weights(predictee_a,data_id)
register int predictee_a,     /*==== Specifies concept description ====*/
             data_id;         /*==== Index into this concept's data_set ====*/
{
  /*==== Subfunctions ====*/
  double value_diff(), scale();
  /*==== Local Variables ====*/
  register int predictor_a, a, val;
  double diff, frequency_of_value, frequency_high;
  ptr_instance inst = instances[data_set[predictee_a][data_id]];

  /*==== 1. Get the observed frequency of the saved instance's value ====*/
  val = (int)inst->attributes[predictee[predictee_a]];
  frequency_of_value = (double)(num_train_with_value[predictee_a][val])/
                       (double)(num_train_in_concept[predictee_a]);
  if (frequency_of_value > 0.5)
    frequency_high = frequency_of_value;
  else
    frequency_high = 1.0 - frequency_of_value;

  /*==== 2. Update the attribute weight appropriately (num & denom) ====*/
  if (instances[instance_number]->attributes[predictee[predictee_a]] == (double)val)
    /*==== 2a. They have the same target value ====*/
    for(predictor_a=0; predictor_a<number_of_predictors; predictor_a++)
      { a = predictor[predictor_a];
        if (((instances[instance_number]->attributes[a] != unknown_value) &&
             (inst->attributes[a] != unknown_value)))
          { /*==== Get absolute difference of attribute values ====*/
            if (!norm_linear)
              diff = value_diff(a,
                       scale(raw_instances[instance_number]->attributes[a],
                             scalemin[a],scalemax[a]),
                       scale(raw_instances[data_set[predictee_a][data_id]]->attributes[a],
                             scalemin[a],scalemax[a]));
            else
              diff = value_diff(a,instances[instance_number]->attributes[a],
                                inst->attributes[a]);
            attribute_weight_total[predictee_a][predictor_a] +=
              (1.0 - frequency_of_value) * (1.0 - diff);
            weight_size[predictee_a][predictor_a] += (1.0 - frequency_of_value);
          }
      }
  else
    /*==== 2b. They have different target values ====*/
    for(predictor_a=0; predictor_a<number_of_predictors; predictor_a++)
      { a = predictor[predictor_a];
        if (((instances[instance_number]->attributes[a] != unknown_value) &&
             (inst->attributes[a] != unknown_value)))
          { if (!norm_linear)
              diff = value_diff(a,
                       scale(raw_instances[instance_number]->attributes[a],
                             scalemin[a],scalemax[a]),
                       scale(raw_instances[data_set[predictee_a][data_id]]->attributes[a],
                             scalemin[a],scalemax[a]));
            else
              diff = value_diff(a,instances[instance_number]->attributes[a],
                                inst->attributes[a]);
            attribute_weight_total[predictee_a][predictor_a] +=
              (1.0 - frequency_high) * diff;
            weight_size[predictee_a][predictor_a] += (1.0 - frequency_high);
          }
      }
}
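/* ----------------------------------------------------------------------
   Worked example (made-up numbers) for the update above: suppose the
   saved instance's class has observed frequency 0.7, the two instances
   have the SAME class, and their (normalized) values for some predictor
   differ by diff = 0.2.  Then

       attribute_weight_total += (1.0 - 0.7) * (1.0 - 0.2) = 0.24
       weight_size            += (1.0 - 0.7)               = 0.30

   so frequently-seen classes contribute less, and small differences
   between same-class instances push the attribute's weight up.  With
   DIFFERENT classes the increment uses diff itself, so large differences
   push the weight up instead.
   ---------------------------------------------------------------------- */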
/* ======================================================================
   Absolute value of their difference, but either value may be unknown.
   Yield maxdiff in that case.
   ====================================================================== */
double value_diff(a,val1,val2)
double val1,val2;
register int a;            /*==== A predictor attribute ====*/
{
  /*==== Subfunctions ====*/
  double diff_with_unknowns(), absd();

  /*==== Get their difference ====*/
  if ((val1 == unknown_value) || (val2 == unknown_value))
    return(diff_with_unknowns(a,val1,val2));
  else if (attribute_type[a] == NOMINAL_TYPE)
    return((double)(val1 != val2));
  else
    return(absd(val1-val2));
}

/* ======================================================================
   Yields the maximum possible difference between this value and one in
   the range [0,1].  Returns a value in [0,1].
   ====================================================================== */
double maxdiff(a,value)
int a;                     /*==== An attributes[] index ====*/
double value;
{
  if (attribute_type[a] == NOMINAL_TYPE)
    return(1.0);
  else if (value > 0.5)
    return(value);
  else
    return(1.0-value);
}

/* ======================================================================
   Absolute value for doubles.
   ====================================================================== */
double absd(val)
double val;
{
  if (val < 0.0)
    return(-val);
  else
    return(val);
}

/* ======================================================================
   Normalization and fixing 0 weightings (default case).
   ====================================================================== */
void normalize_all_weights(predictee_a)
register int predictee_a;
{
  /*==== Sub-functions used here ====*/
  double absd();
  /*==== Local variable declarations ====*/
  register int a;
  double total_weight;

  /*==== 1. Get the total weight ====*/
  total_weight = 0.0;
  for(a=0; a<number_of_predictors; a++)
    { if (weight_size[predictee_a][a] == 0.0)
        attribute_weight[predictee_a][a] = 0.0;
      else
        attribute_weight[predictee_a][a] =
          attribute_weight_total[predictee_a][a]/weight_size[predictee_a][a];
      total_weight += absd(attribute_weight[predictee_a][a]);
    }

  /*==== 2. Normalize; if all weights are 0, fall back to equal weights ====*/
  if (total_weight == 0.0)
    for(a=0; a<number_of_predictors; a++)
      attribute_weight[predictee_a][a] = 1.0/(double)number_of_predictors;
  else
    for(a=0; a<number_of_predictors; a++)
      attribute_weight[predictee_a][a] =
        attribute_weight[predictee_a][a]/total_weight;
}
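/* ----------------------------------------------------------------------
   Worked example (made-up numbers) for value_diff() and maxdiff() above:
   for a NOMINAL attribute, value_diff() is simply 0.0 when two known
   values match and 1.0 when they differ, and maxdiff() is always 1.0.
   For a numeric attribute already scaled into [0,1], maxdiff(a,0.3)
   yields 0.7 -- the farthest a known value of 0.3 can be from any other
   value in [0,1].  Per the value_diff() banner, that pessimistic bound is
   the fallback when one of the values is unknown (see
   diff_with_unknowns() below).
   ---------------------------------------------------------------------- */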
/* ======================================================================
   Difference between two attribute values when at least one is unknown.
   If averages/most-frequent substitution (missing_ave) is off, or the
   attribute is numeric, yield the maximum possible difference; otherwise
   compare the known value against the attribute's most frequent training
   value (ties broken randomly).
   ====================================================================== */
double diff_with_unknowns(a,val1,val2)
register int a;            /*==== A predictor attribute ====*/
double val1,val2;
{
  /*==== Subfunctions ====*/
  double maxdiff();
  /*==== Local Variables ====*/
  register int i, num_tied_with_most_frequent;
  int most_frequent[MAXLINE];
  double diff;

  if ((!missing_ave) || (attribute_type[a] == NUMERIC_TYPE))
    { /*==== Yield the maximum possible difference ====*/
      if (val1 == unknown_value)
        { if (val2 == unknown_value)
            diff = 1.0;
          else
            diff = maxdiff(a,val2);
        }
      else /*==== Assume val2 is unknown ====*/
        diff = maxdiff(a,val1);
    }
  else
    { /*==== 1. Find this attribute's most frequent training value ====*/
      most_frequent[0] = 0;
      num_tied_with_most_frequent = 1;
      for(i=1; i<num_values[a]; i++)
        if (num_training_instances_with_value[a][i] >
            num_training_instances_with_value[a][most_frequent[0]])
          { most_frequent[0] = i;
            num_tied_with_most_frequent = 1;
          }
        else if (num_training_instances_with_value[a][i] ==
                 num_training_instances_with_value[a][most_frequent[0]])
          { most_frequent[num_tied_with_most_frequent] = i;
            num_tied_with_most_frequent++;
          }
      most_frequent[0] = most_frequent[(int)(random() % num_tied_with_most_frequent)];

      /*==== 2. Use as needed ====*/
      if (val1 == unknown_value)
        { if (val2 == unknown_value)
            diff = 0.0;
          else
            diff = (double)(val2 == (double)most_frequent[0]);
        }
      else /*==== Assume val2 is unknown ====*/
        diff = (double)(val1 == (double)most_frequent[0]);
    }
  return(diff);
}