/* ======================================================================
   David W. Aha
   NGE: utility.c: Utility files for NGE
   ====================================================================== */

#include "nge.h"

/* ======================================================================
   Reading in the data file's format: (by record)

   For each record:
     Format:  name: type        (no ':' in name)
     where the attribute type is either 'numeric' or 'nominal' and
     if the latter, it must be followed by an integer denoting the number
     of possible symbolic values that this attribute can take on.

     The parser is positional: relative to the ':' it looks at the
     character 3 places later ('u' selects numeric, 'o' selects nominal)
     and, for nominal attributes, reads the value count starting
     10 places later, up to the next blank or end of line.

   Note: Class value always comes last for each instance!!!!
         Symbolic values must be encoded between 0 and (num_values[a]-1)

   This sets values for
     1. number_of_attributes
     2. attribute_type[number_of_attributes]
     3. num_values[number_of_attributes]

   Yields 'TRUE' if all is well, else 'FALSE'.  'FALSE' is returned when
   the format file cannot be opened, when a line has no ':', when the
   attribute type is not recognized, or when there are more lines than
   MAX_NUMBER_OF_ATTRIBUTES.  The file is closed on every path.
   ====================================================================== */
int read_data_format()
{
  /*==== Local Variables ====*/
  FILE *fopen(), *names_file;             /* (namesfile) */
  char names_file_line[MAX_LINE_LENGTH];  /* A record in this namesfile */
  char value_name[MAX_LINE_LENGTH];       /* Holds #values for symbolic atts */
  char type_letter;                       /* Letter that selects the type */
  int i,                                  /* Position of ':' in the line */
      k,                                  /* Scan position past the ':' */
      char_num,                           /* Index for value_name */
      status = TRUE;                      /* Value handed back to caller */

  /*==== 1. Setup ====*/
  /* printf("Reading data format from file named '%s'...\n",namesfile); */
  names_file = fopen(namesfile,"r");
  if (names_file == NULL)
    {
      /* Without this check a missing file would hand NULL to fgets. */
      printf("Fatal error: cannot open format file '%s'.\n",namesfile);
      return(FALSE);
    }

  /*==== 2. Read the attribute lines. ====*/
  while (NULL != fgets(names_file_line, MAX_LINE_LENGTH, names_file))
    {
      /*==== 2.0 First check for errors ====*/
      if (number_of_attributes == MAX_NUMBER_OF_ATTRIBUTES)
        {
          printf("Fatal error: MAX_NUMBER_OF_ATTRIBUTES exceeded.\n");
          status = FALSE;
          break;
        }

      /*==== 2.1 Discard everything up to attribute type. ====*/
      for(i=0; names_file_line[i] != ':' && names_file_line[i] != '\0'; i++)
        ;
      if (names_file_line[i] == '\0')
        {
          /* No ':' on this line: malformed record. */
          status = FALSE;
          break;
        }

      /*==== 2.2 Read attribute type (1 letter suffices). ====*/
      /* Only look 3 places past the ':' when the line really is that
         long; otherwise the index could land beyond the terminator. */
      type_letter = '\0';
      if (names_file_line[i+1] != '\0' && names_file_line[i+2] != '\0')
        type_letter = names_file_line[i+3];

      if (type_letter == 'u')
        attribute_type[number_of_attributes] = NUMERIC_TYPE;
      else if (type_letter == 'o')
        {
          attribute_type[number_of_attributes] = SYMBOLIC_TYPE;

          /*==== Must compute number of values for symbolic attribute ====*/
          /* Step from the type letter to the count field (10 places
             past the ':'), stopping early if the line ends first. */
          for(k = i+3; k < i+10 && names_file_line[k] != '\0'; k++)
            ;

          /* Copy the count up to a blank, newline or end of string.
             value_name is as large as the line buffer, so it cannot
             overflow once the scan is bounded by the terminator. */
          char_num = 0;
          for(; names_file_line[k] != '\n' &&
                names_file_line[k] != ' '  &&
                names_file_line[k] != '\0'; k++)
            {
              value_name[char_num] = names_file_line[k];
              char_num++;
            }
          value_name[char_num] = '\0';

          /* An empty count field converts to 0, as atoi("") does. */
          num_values[number_of_attributes] = atoi(value_name);
        }
      else
        {
          /*==== Fatal error while reading ====*/
          printf("Fatal error: invalid attribute type in format file.\n");
          status = FALSE;
          break;
        }

      number_of_attributes++;
    }

  /*==== 3. Cleanup ====*/
  /* printf("Done reading data format file.\n"); */
  /* printf("Number of attributes is %d\n",number_of_attributes); */
  fclose(names_file);
  return(status);
}

/* ======================================================================
   Reads and normalizes the training and testing data.

   Opens trainingfile, then testingfile, and passes each stream to
   read_data_and_normalize() with mode TRAINING or TESTING.

   Yields TRUE if all okay, FALSE otherwise (a file could not be opened,
   or read_data_and_normalize() reported FALSE).
   ====================================================================== */
int read_and_normalize_data()
{
  /*==== Local variables ====*/
  FILE *training_file, *testing_file;
  int temp;

  /*==== Subfunctions ====*/
  int read_data_and_normalize();

  /*==== 1. Read the training set data ====*/
  training_file = fopen(trainingfile,"r");
  if (training_file == NULL)
    {
      /* Do not pass a NULL stream on to the reader or to fclose. */
      printf("Fatal error: cannot open training file '%s'.\n",trainingfile);
      return(FALSE);
    }
  temp = read_data_and_normalize(training_file,TRAINING);
  fclose(training_file);
  /* printf("Number of training instances is %d\n",num_training_instances); */
  if (temp == FALSE)
    return(FALSE);

  /*==== 2. Read the testing set data ====*/
  testing_file = fopen(testingfile,"r");
  if (testing_file == NULL)
    {
      printf("Fatal error: cannot open testing file '%s'.\n",testingfile);
      return(FALSE);
    }
  temp = read_data_and_normalize(testing_file,TESTING);
  fclose(testing_file);
  /* printf("Number of testing instances is %d\n",num_testing_instances); */
  if (temp == FALSE)
    return(FALSE);

  /*==== 3. All clear ====*/
  return(TRUE);
}

/* ======================================================================
   Reads a data file's information (either training or testing file)
   -- Should now work for any number of numeric predictors and a letter class
   -- Uses num_predictors to determine the number of attributes per instance
   -- Assumes class_index is after all of the predictor attributes' indices

   NOTE(review): this function is incomplete in this copy of the file.
   Source text appears to have been lost in two places (marked below),
   presumably everything between a '<' and a later '>'.  The surviving
   tokens are kept exactly as found; restore the missing text from a
   pristine copy of utility.c before building.
   ======================================================================*/
int read_data_and_normalize(data_file,mode)
   FILE *data_file;
   register int mode;
{
  /*==== Subfunctions ====*/
  float atofloat(), scale();

  /*==== Local variables ====*/
  register int inst_num, attribute_num, name_size, i;
  char data_file_line[MAX_LINE_LENGTH],value[MAX_NAME_LENGTH];
  float this_data,
        min[MAX_NUMBER_OF_ATTRIBUTES],
        max[MAX_NUMBER_OF_ATTRIBUTES];

  /*==== 1. Setup for normalization ====*/
  for(attribute_num=0; attribute_num
  /* NOTE(review): source text missing here (first gap). */
                   min[attribute_num])
                  max[attribute_num] = this_data;
              }
          }

          /*==== 2.3 Store the data ====*/
          if (mode == TRAINING)
            {
              if (inst_num == MAX_NUMBER_OF_TRAINING_INSTANCES)
                {
                  printf("Max number of training instances exceeded.\n");
                  printf("Max allowed is %d\n",
                         MAX_NUMBER_OF_TRAINING_INSTANCES);
                  return(FALSE);
                }
              else
                training_instances[inst_num][attribute_num] = this_data;
            }
          else
            {
              if (inst_num == MAX_NUMBER_OF_TESTING_INSTANCES)
                {
                  printf("Max number of testing instances exceeded.\n");
                  return(FALSE);
                }
              else
                testing_instances[inst_num][attribute_num] = this_data;
            }
        }
    }

  /*==== 3. Record the number of instances and normalize ====*/
  if (mode == TRAINING)
    {
      num_training_instances = inst_num + 1;
      for(attribute_num=0;attribute_num
      /* NOTE(review): source text missing here (second gap).  The rest of
         read_data_and_normalize and the beginning of atofloat (its header,
         local declarations and anything done before the digit loop) are
         absent.  What follows is the surviving tail of atofloat: it
         accumulates the integer digits, then an optional fraction after
         a '.', and returns the result multiplied by negative_flag. */
                                        ='0' && numeric_string[i]<='9'; i++)
    integer_part = 10 * integer_part + numeric_string[i] - '0';

  if (numeric_string[i] == '.')
    {
      fraction_part = 0;
      fraction_size = 1.0;
      for(i=i+1; numeric_string[i]>='0' && numeric_string[i]<='9'; i++)
        {
          fraction_part = 10 * fraction_part + numeric_string[i] - '0';
          fraction_size *= 0.1;
        }
      result = (float)integer_part + ((float)fraction_part * fraction_size);
    }
  else
    /*==== This was an integer ====*/
    result = (float)integer_part;

  return(result*negative_flag);
}

/* ======================================================================
   Normalizes a given value between min and max

   Value: the value to normalize
   Min:   lower bound; any Value at or below it maps to 0.0
   Max:   upper bound; any Value at or above it maps to 1.0

   Yields Value unchanged when it equals UNKNOWN_VALUE; otherwise a
   number in [0,1], linear in Value between Min and Max.  The bound
   checks run before the division, so the division is never reached
   when Min equals Max.

   Kept as an old-style (non-prototype) definition on purpose: callers
   that declare it as "float scale();" pass promoted arguments, which
   only matches this form of definition.
   ====================================================================== */
float scale(Value,Min,Max)
   float Value,Min,Max;
{
  if (Value == UNKNOWN_VALUE)
    return(Value);
  else if (Value >= Max)
    return(1.0);
  else if (Value <= Min)
    return(0.0);
  else
    return((Value - Min)/(Max - Min));
}