/* /--------------------------------------------------------------------\ | File : PREPARE.C | |--------------------------------------------------------------------| | Written by : Umit CATALYUREK | | Date : 1.6.1993 | |--------------------------------------------------------------------| | Description : Implementation of CN2 for Machine Learning Course. | | File that contains functions which reads data, | | eliminates all unknown data etc. | |--------------------------------------------------------------------| | Last Modification Date : | | By : | | Description : | \____________________________________________________________________/ */ #include #include #include #include #include #include "prepare.h" Description Dscr; DataList Data; int mindiscr = 200; int maxdiscr = 0; int totdiscr = 0; char *killspace(str) char *str; { char *p=str, *s; while (*p==' ' || *p=='\t') p++; if (p != str) { s=str; while (*s++=*p++); } p = str + strlen(str) - 1; while (p>=str && (*p == ' ' || *p=='\t')) { *p = 0; --p; } return str; } void eliminatecomment(s) char *s; { char *com; int l; killspace(s); com = strchr(s, '|'); if (com != NULL) *com = 0; l = strlen(s); if (*(s+l-1)== '\n') *(s+l-1) = 0; l = strlen(s); if (*(s+l-1)== '.') *(s+l-1) = 0; killspace(s); l = strlen(s); if (*(s+l-1)== '.') *(s+l-1) = 0; } char *gettoken(st) char **st; { static char token[100]; char *p=*st, *s=token; while (*p && *p != ':' && *p != ',') *s++ = *p++; *s = 0; s = *st; if (!*p) *s = 0; else { p++; while (*s++ = *p++); } killspace(token); return token; } float zatof(s) char *s; { char buf[50]; strcpy(buf, s); if (strchr(buf, '.')==NULL) strcat(buf, ".0"); return (atof(buf)); } void Initialize() { int i; Dscr.numofclasses = 0; Dscr.numofattributes = 0; for (i=0; ivl[d].discr; } char *GetClassName(c) int c; { return Dscr.classes[c]; } void printinstance(I) PInstance I; { int j; for (j=0; jhead)+j); if (Dscr.attrs[j]->flag==ATTR_CONTINUOUS) printf("%6.2f,", D->val.value); else printf("%s,", GetDiscrName(j, D->val.discr)); } printf("%s\n", GetClassName(I->class)); } void AddData(P) PInstance P; { if (Data.tail == NULL) Data.head=Data.tail=P; else { Data.tail->next = P; P->prev = Data.tail; Data.tail = P; } Data.numofinstances++; } void DeleteData(P) PInstance P; { PInstance q; if (P==Data.tail) Data.tail = P->prev; if (P==Data.head) { Data.head = P->next; if (Data.head != NULL) Data.head->prev = NULL; } else { q = P->prev; q->next = P->next; if (P->next != NULL) P->next->prev = q; } free(P->head); free(P); Data.numofinstances--; } int FindDiscr(A, s) PAttribute A; char *s; { int j; for (j=0; strcmp(s, A->vl[j].discr) && jnumofdiscrval; j++); if (j==(A->numofdiscrval)) return -1; else return j; } int FindClass(cl) char *cl; { int j; for (j=0; strcmp(cl, Dscr.classes[j]) && jnumofdiscrval = 0; A->attrname = (char *) malloc(strlen(token)+1); strcpy(A->attrname, token); token = gettoken(&s); if (!strcmp(token, "continuous")) { A->flag = ATTR_CONTINUOUS; token = gettoken(&s); while (*token) { A->vl[A->numofdiscrval].value = zatof(token); A->occurs[A->numofdiscrval] = 0; A->numofdiscrval++; token = gettoken(&s); } if (A->numofdiscrval==0) { printf("\n No sub-range bound for attribute : %s", A->attrname); exit(0); } A->occurs[A->numofdiscrval] = 0; } else { A->flag = ATTR_DISCRETE; while (*token) { A->vl[A->numofdiscrval].discr = (char *) malloc(strlen(token)+1); strcpy(A->vl[A->numofdiscrval].discr, token); A->occurs[A->numofdiscrval] = 0; A->numofdiscrval++; token = gettoken(&s); } } if (A->numofdiscrval < mindiscr) mindiscr = A->numofdiscrval; if (A->numofdiscrval > maxdiscr) maxdiscr = A->numofdiscrval; totdiscr += A->numofdiscrval; Dscr.numofattributes++; } void ReadDescrFile(namesf) FILE *namesf; { Line st; int linecnt=0; while (!feof(namesf)) { if (fgets(st, LINELENGTH, namesf) != NULL) { eliminatecomment(st); if (*st) { if (!linecnt) getclasses(st); else getattribute(st); linecnt++; } } } } void printdescription() { int i; printf("\n Num of classes : %d", Dscr.numofclasses); for (i=0; i", Dscr.classes[i]); printf("\n Numer of Attributes : %d", Dscr.numofattributes); for (i=0; i ", A->attrname); if (A->flag==ATTR_DISCRETE) { int j; printf(" discrete attr :\n"); for (j=0; jnumofdiscrval; j++) printf("[%s] ", A->vl[j].discr); } else printf(" continous"); } printf("\n #Discr Val Min : %d Max : %d Avg :%8.3f", mindiscr, maxdiscr, (float) totdiscr/ (float) Dscr.numofattributes); } void getinstance(s) char *s; { char *token; int i; PAttribute A; PInstance I; PAttrData D; I = (PInstance) malloc(sizeof(Instance)); I->next = I->prev = NULL; I->head = (PAttrData) malloc(sizeof(AttrData)*Dscr.numofattributes); for (i=0; ihead)+i); if (*token=='?') I->flag = D->flag = DATA_UNKNOWN; else if (A->flag == ATTR_CONTINUOUS) { D->val.value = zatof(token); if (!A->numofdiscrval) { if (D->val.value>A->vl[1].value) A->vl[1].value = D->val.value; if (D->val.valuevl[0].value) A->vl[0].value = D->val.value; } } else if ((D->val.discr=FindDiscr(A, token)) == -1) I->flag = D->flag = DATA_UNKNOWN; else A->occurs[D->val.discr]++; } token = gettoken(&s); if ((I->class = FindClass(token))==-1) I->flag = DATA_UNKNOWN; else Dscr.occurs[I->class]++; AddData(I); } void ReadDataFile(dataf) FILE *dataf; { Line st; int i, j; PAttribute A; PInstance I; while (!feof(dataf)) { if (fgets(st, LINELENGTH, dataf) != NULL) { eliminatecomment(st); if (*st) getinstance(st); } } if (Data.numofinstances>0) for (i=0; inext, i++) { fl |= I->flag; #ifdef _DEBUG printinstance(I); #endif } if (fl) printf("\n Data has unknown value"); for (i=0; iattrname); for (j=0; jnumofdiscrval; j++) if (Dscr.attrs[i]->flag == ATTR_DISCRETE) printf("\n %s occurs %d times", Dscr.attrs[i]->vl[j].discr, Dscr.attrs[i]->occurs[j]); else printf("\n Range %d (Val <= %8.2f) occurs %d times", j, Dscr.attrs[i]->vl[j].value, Dscr.attrs[i]->occurs[j]); if (Dscr.attrs[i]->flag == ATTR_CONTINUOUS) { j = Dscr.attrs[i]->numofdiscrval; printf("\n Range %d (Val > %8.2f) occurs %d times", j, Dscr.attrs[i]->vl[j-1].value, Dscr.attrs[i]->occurs[j]); } } printf("\n"); } void WriteOccurences(o) int o; { int i; write(o, Dscr.occurs, sizeof(int)*Dscr.numofclasses); write(o, Dscr.expect, sizeof(double)*Dscr.numofclasses); for (i=0; inumofdiscrval; j++) write(o, &(A->occurs[j]), sizeof(int)); } } void ReadOccurences(in) int in; { int i; read(in, Dscr.occurs, sizeof(int)*Dscr.numofclasses); read(in, Dscr.expect, sizeof(double)*Dscr.numofclasses); for (i=0; inumofdiscrval; j++) read(in, &(A->occurs[j]), sizeof(int)); } } void EliminateUnknownData() { PInstance I; int mostoccured[MAXATTRIBUTE], moc, i, j; PAttribute A; PAttrData D; for (i=0; inumofdiscrval; j++) if (A->occurs[j] > A->occurs[mostoccured[i]]) mostoccured[i] = j; if (A->flag == ATTR_CONTINUOUS && A->occurs[A->numofdiscrval] > A->occurs[mostoccured[i]]) mostoccured[i] = A->numofdiscrval; } moc = 0; for (i=1; iDscr.occurs[moc]) moc = i; for (I=Data.head; I != NULL; I=I->next) if (I->flag==DATA_UNKNOWN) { if (I->class == -1) I->class = moc; for (i=0; ihead)+i; if (D->flag==DATA_UNKNOWN) { if (A->flag==ATTR_DISCRETE) D->val.discr = mo; else if (mo==0) D->val.value = A->vl[0].value-1.0; else if (mo==A->numofdiscrval) D->val.value = A->vl[A->numofdiscrval-1].value+1.0; else D->val.value = (A->vl[mo-1].value + A->vl[mo].value) / 2.0; D->flag = DATA_VALID; } } I->flag = DATA_VALID; } } void Deallocate() { int i; for (i=0; iattrname); if (Dscr.attrs[i]->flag == ATTR_DISCRETE) { int j; for (j=0; jnumofdiscrval; j++) free(Dscr.attrs[i]->vl[j].discr); } free(Dscr.attrs[i]); } while (Data.head != NULL) { Data.tail = Data.head; Data.head = Data.head->next; free(Data.tail->head); free(Data.tail); } }