/****************************************************************/
/* Copyright 1993 : Johns Hopkins University                    */
/*                  Department of Computer Science              */
/****************************************************************/
/* Contact : murthy@cs.jhu.edu                                  */
/****************************************************************/
/* File Name : load_data.c                                      */
/* Author : Sreerama K. Murthy                                  */
/* Last modified : October 1993                                 */
/* Contains modules : load_points                               */
/*                    allocate_point_array                      */
/*                    shuffle_points                            */
/* Uses modules in : oc1.h                                      */
/*                   util.c                                     */
/* Is used by modules in : mktree.c                             */
/* Remarks : Routines in this file assume that the              */
/*           dimensionality of the data, and the                */
/*           number of classes are known. So, these             */
/*           should be set in mktree.c.                         */
/*           (load_points additionally infers either            */
/*           value from the data when it is 0 on entry.)        */
/*           Throughout this program and other files            */
/*           in OC1, "points" refer to data samples             */
/*           or examples. This is due to the                    */
/*           geometrical interpretation of the                  */
/*           parametric space used by OC1.                      */
/*           All arrays handled here are indexed from 1.        */
/****************************************************************/
#include "oc1.h"

extern int no_of_dimensions;
extern int no_of_categories;
extern int unlabeled;

/************************************************************************/
/* Module name : load_points                                            */
/* Functionality : Reads data points from the input file into a         */
/*                 freshly allocated, 1-indexed array of POINTs.        */
/*                 Dynamically allocates space for the points read      */
/*                 (10 points first, doubling afterwards) and trims     */
/*                 the array to the number of points actually read.     */
/*                 Counts the number of points read.                    */
/* Parameters : infile : File pointer to the input file.                */
/*              points_ptr : receives the array into which the          */
/*                           data points were loaded.                   */
/* Returns : Number of points read.                                     */
/* Side effects : If no_of_dimensions is 0 on entry it is set to        */
/*                MAX_DIMENSIONS and then to the number of attributes   */
/*                found on the first line of the file.                  */
/*                If the data is labeled and no_of_categories is 0 on   */
/*                entry, it is set to the number of distinct labels     */
/*                seen; labels are renumbered to 1..no_of_categories    */
/*                (in order of first appearance) when any label falls   */
/*                outside that range.                                   */
/* Calls modules : error (util.c)                                       */
/*                 allocate_point_array                                 */
/* Is called by modules : read_data (mktree.c)                          */
/*                        main (display.c)                              */
/* Remarks : When "unlabeled" is not TRUE, the last entry of every line */
/*           is taken as the category of the point.                     */
/*           NOTE(review): the control flow assumes that error() does   */
/*           not return -- confirm against util.c.                      */
/************************************************************************/
int load_points(infile,points_ptr)
     FILE *infile;
     POINT ***points_ptr;
{
  int points_allocated = 0,i,j,categories_unknown = FALSE;
  int first_category_recorded = FALSE; /* category_array[1] holds a label */
  int end_of_data = FALSE;
  int point_count,status;
  int *category_array = NULL,*grown_categories;
  float *shrunk;
  POINT **allocate_point_array();
  POINT **array_name = NULL;

  if (!unlabeled && !no_of_categories)
    {
      categories_unknown = TRUE;
      no_of_categories = 1;
      /* Slot 0 is left unused so that labels live in 1..no_of_categories. */
      category_array = (int *)malloc((no_of_categories + 1) * sizeof(int));
      if (category_array == NULL)
        error("LOAD_POINTS: Memory Allocation Failure.");
    }

  if (!no_of_dimensions) no_of_dimensions = MAX_DIMENSIONS;

  point_count = 0;
  while (TRUE)
    {
      point_count++;

      /* Grow the point array: 10 entries first, then double each time. */
      if (point_count > points_allocated)
        {
          if (points_allocated == 0)
            {
              array_name = allocate_point_array(array_name,10,0);
              points_allocated = 10;
            }
          else
            {
              array_name = allocate_point_array(array_name,points_allocated*2,
                                                points_allocated);
              points_allocated *= 2;
            }
        }

      if (point_count == 1 && no_of_dimensions == MAX_DIMENSIONS)
        /* Count the number of dimensions of the first line in the datafile,
           and set it as the no_of_dimensions. Use this value for reading in
           the subsequent lines. The last entry of any line is taken as the
           category value, if "unlabeled != TRUE". */
        {
          int ch;                 /* int, so that EOF is distinguishable   */
          int dim = 0;            /* attributes stored so far              */
          int have_value = FALSE; /* "value" holds a number not yet stored */
          float value = 0;

          while (TRUE)
            {
              ch = getc(infile);

              /* End of file is treated like end of line, so a single line
                 without a trailing newline is still accepted. */
              if (ch == '\n' || ch == EOF)
                {
                  if (have_value)
                    {
                      /* The pending number is the last entry on the line. */
                      if (unlabeled == TRUE)
                        {
                          if (++dim > no_of_dimensions)
                            error("LOAD_POINTS: Too many dimensions. Adjust the constant MAX_DIMENSIONS.");
                          array_name[1]->dimension[dim] = value;
                        }
                      else
                        {
                          i = array_name[1]->category = (int)value;
                          if (value - i != 0)
                            error("LOAD_POINTS: Float category values unacceptable.");

                          if (categories_unknown)
                            {
                              category_array[1] = i;
                              first_category_recorded = TRUE;
                            }
                          else if (i < 1 || i > no_of_categories)
                            {
                              printf ("Only category values between 1 and %d allowed.\n",
                                      no_of_categories);
                              error("LOAD_POINTS: Data point with invalid category.");
                            }
                        }

                      no_of_dimensions = dim;

                      /* Shrink every allocated attribute vector from
                         MAX_DIMENSIONS to the real size.  The vectors are
                         1-indexed, so the heap block starts at dimension+1.
                         Skipped for zero dimensions to avoid realloc(p,0). */
                      if (no_of_dimensions > 0)
                        {
                          for (i=1;i<=points_allocated;i++)
                            {
                              shrunk = (float *)realloc
                                (array_name[i]->dimension + 1,
                                 no_of_dimensions * sizeof(float));
                              if (shrunk == NULL)
                                error("LOAD_POINTS: Memory Allocation Failure.");
                              else
                                array_name[i]->dimension = shrunk - 1;
                            }
                        }
                      break;
                    }

                  if (ch == EOF)
                    {
                      /* Nothing but white space in the file: no points. */
                      end_of_data = TRUE;
                      break;
                    }
                  continue; /* blank line before the first point */
                }

              if (isspace(ch)) continue;
              if (isalpha(ch))
                error("LOAD_POINTS: Invalid character in datafile.");

              /* Another token follows, so the pending number was an
                 attribute and not the category: store it. */
              if (have_value)
                {
                  if (++dim > no_of_dimensions)
                    error("LOAD_POINTS: Too many dimensions. Adjust the constant MAX_DIMENSIONS.");
                  array_name[point_count]->dimension[dim] = value;
                }

              ungetc(ch,infile);
              if (fscanf(infile,"%f",&value) != 1)
                error("LOAD_POINTS: Invalid character in datafile.");
              have_value = TRUE;
            }

          if (end_of_data) break;
        }
      else
        {
          /* status stays 0 when there are no attributes to read, which
             ends the input loop below. */
          status = 0;
          for (j=1;j<=no_of_dimensions;j++)
            {
              status = fscanf(infile,"%f",
                              &(array_name[point_count]->dimension[j]));
              if (status != 1)
                {
                  if (j>1)
                    error("LOAD_POINTS : Lines containing unequal number of attributes in datafile.");
                  else break; /* nothing more to read */
                }
            }
          if (status != 1) break;

          if (unlabeled == TRUE)
            {
              array_name[point_count]->val = (double)0.0;
              continue;
            }

          if (fscanf(infile,"%d",&i) != 1)
            error("LOAD_POINTS : Lines containing unequal number of attributes in datafile.");

          if (categories_unknown)
            {
              if (!first_category_recorded)
                {
                  /* The first label fills the slot reserved at start-up. */
                  category_array[1] = i;
                  first_category_recorded = TRUE;
                }
              else
                {
                  for (j=1;j<=no_of_categories;j++)
                    if (i == category_array[j]) break;

                  if (j > no_of_categories)
                    {
                      /* A label not seen before: append it. */
                      grown_categories = (int *)realloc
                        (category_array,(no_of_categories + 2) * sizeof(int));
                      if (grown_categories == NULL)
                        error("LOAD_POINTS: Memory Allocation Failure.");
                      else
                        {
                          category_array = grown_categories;
                          no_of_categories++;
                          category_array[no_of_categories] = i;
                        }
                    }
                }
            }
          else if (i<1 || i>no_of_categories)
            {
              printf ("Only category values between 1 and %d allowed.\n",
                      no_of_categories);
              error("LOAD_POINTS: Data point with invalid category.");
            }

          array_name[point_count]->category = i;
        }

      array_name[point_count]->val = (double)0.0;
    }

  /* The last slot was claimed but never filled. */
  point_count--;
  if (point_count != points_allocated)
    array_name = allocate_point_array(array_name,point_count,points_allocated);

  if ( !unlabeled && categories_unknown)
    {
      if (first_category_recorded)
        {
          /* There are no_of_categories classes. If all these numbers are
             between 1 and no_of_categories, then we don't need any
             remapping. */
          for (i=1;i<=no_of_categories;i++)
            if (category_array[i] < 1 || category_array[i] > no_of_categories)
              break;

          if (i <= no_of_categories)
            {
              printf("Remapping class numbers:\n");
              for (i=1;i<=no_of_categories;i++)
                if (i != category_array[i])
                  printf("\t%d To %d\n",category_array[i],i);

              for (i=1;i<=point_count;i++)
                for (j=1;j<=no_of_categories;j++)
                  if (category_array[j] == array_name[i]->category)
                    {
                      array_name[i]->category = j;
                      break;
                    }
            }
        }
      free(category_array);
    }

  *points_ptr = array_name;
  return(point_count);
}

/************************************************************************/
/* Module name : allocate_point_array                                   */
/* Functionality : Allocates or reallocates "array_name" to be an       */
/*                 array of pointers (to POINT structures), of          */
/*                 size "size". Fully allocates all the new POINT       */
/*                 structures and their attribute vectors also.         */
/* Parameters : array_name : name of the array to be (re)allocated.     */
/*              size : number of points to be allocated.                */
/*              prev_size : 0 if array_name doesn't exist already       */
/*                          current size otherwise.                     */
/* Returns : pointer to the allocated array (indexed from 1).           */
/* Calls modules : error (util.c)                                       */
/*                 vector (util.c)                                      */
/* Is called by modules : load_points                                   */
/* Remarks : With prev_size == 0 a non-NULL array_name is released      */
/*           (the pointer array only, not the points in it) before      */
/*           the new array is built.                                    */
/*           NOTE(review): when shrinking (prev_size >= size) the       */
/*           points beyond "size" are not freed here.                   */
/*           NOTE(review): size == 0 reaches malloc/realloc with a      */
/*           zero byte count, whose result is implementation-defined.   */
/************************************************************************/
POINT **allocate_point_array(array_name,size,prev_size)
     POINT **array_name;
     int size,prev_size;
{
  int i;
  struct point **resized;

  if (prev_size == 0)
    {
      /* free() has no return value in ISO C, so there is nothing to test. */
      if (array_name != NULL)
        free((char *)(array_name+1));

      array_name = (struct point **)malloc
        ((unsigned)size * sizeof(struct point *));
      if (!array_name)
        error("ALLOCATE_POINT_ARRAY: Memory Allocation Failure 1.");

      array_name -= 1; /* All indices start from 1*/

      for (i=1;i<=size;i++)
        {
          array_name[i] = (struct point *)malloc((unsigned)
                                                 sizeof(struct point));
          if (!array_name[i])
            error("ALLOCATE_POINT_ARRAY : Memory Allocation failure 2.");
        }

      for (i=1;i<=size;i++)
        array_name[i]->dimension = vector(1,no_of_dimensions);
    }
  else
    {
      /* Undo the 1-based offset to get back the pointer malloc returned. */
      resized = (struct point **)realloc
        (array_name + 1,(unsigned)size * sizeof(struct point *));
      if (!resized)
        error("ALLOCATE_POINT_ARRAY: Memory Allocation Failure 3.");

      array_name = resized - 1; /* All indices start from 1*/

      if (prev_size >= size) return(array_name);

      /* Only the newly added tail needs POINT structures. */
      for (i=prev_size+1;i<=size;i++)
        {
          array_name[i] = (struct point *)malloc((unsigned)
                                                 sizeof(struct point));
          if (!array_name[i])
            error("ALLOCATE_POINT_ARRAY : Memory Allocation failure 4.");
        }

      for (i=prev_size+1;i<=size;i++)
        array_name[i]->dimension = vector(1,no_of_dimensions);
    }

  return(array_name);
}

/************************************************************************/
/* Module name : shuffle_points                                         */
/* Functionality : Pseudo-randomly shuffles the points in the           */
/*                 array "array_name".                                  */
/*                 for i = 1 to n, do                                   */
/*                    swap point i with the point at a random           */
/*                    position between 1 and n.                         */
/* Parameters : array_name : Point array which is to be shuffled        */
/*                           (indexed from 1).                          */
/*              count : Number of entries in the array.                 */
/* Returns : Always 0; the value carries no information. The return     */
/*           type is int because the original definition relied on      */
/*           implicit int.                                              */
/* Calls modules : myrandom (util.c)                                    */
/* Is called by modules : not called from within this file.             */
/* Remarks : Achieves shuffling just by swapping pointers, thus         */
/*           not spending time on allocation/deallocation.              */
/*           Original author's note: only training points are shuffled. */
/*           NOTE(review): relies on myrandom(1,count) yielding a       */
/*           position within 1..count -- confirm against util.c.        */
/************************************************************************/
int shuffle_points(array_name,count)
     POINT **array_name;
     int count;
{
  int i,newposition;
  POINT *temp_point;

  for (i=1;i<=count;i++)
    {
      newposition = (int)myrandom(1,count);

      /* shuffle position "i" with "newposition" */
      temp_point = array_name[i];
      array_name[i] = array_name[newposition];
      array_name[newposition] = temp_point;
    }

  return(0);
}

/************************************************************************/
/************************************************************************/