#include #include #include #include #include // CHANGES // 07.12.2001 // initial_calc(), decoupling_coe_calc(), calculate_nc_and_powers() // has changed to read from dvector file. dd structure is disabled! // 09.12.2001 // read_old_cluster_index (=new_cluester_index) // cluster_index_sort() // create_iisd(): new_cluster_index'i kullaniyor ve dokuman numarasina // gore eslesen termleri buluyor (random dvector scan?!) // cluster_document() // iic.txt -> cluster: seed documanlar // TO DO: // seed secerken esit powera sahipler arasindan bosta kalan termleri // cover edeni bul ve sec! typedef long INT; typedef struct { INT seed, address; } cpointerrec; typedef struct { float power; INT document; } seedpowertuple; typedef struct { INT term, weight; } dvectorrec; typedef struct tagdocumentnode{ INT term, weight; tagdocumentnode *link; } documentnode, *documentpointer; typedef struct tagiisdclusternode { long cluster; float weight; tagiisdclusternode *link; } iisdclusternode, *iisdclusterpointer; typedef struct tagclusternode{ INT document; tagclusternode *link; } clusternode, *clusterpointer; typedef struct { INT term; float weight; } centroidtuple; const INT centroid_max = 250; // centroid size const INT max_part_size = 200; // while computing centroids, partition data into smaller chunks const INT max_overlap = 1; // maximum number of clusters that a document can be assigned const float overlap_coverage_ratio = 0.9; // ratio of the coverage / highest coverage const INT m_max = 220000; // maximum number of documents const INT n_max = 230000; // maximum number of terms const INT depth_max = 30000; // maximum number of terms in a document const float decoupling_max = (float)0.8; const float seed_power_diff_min = (float)0.0001; const float cover_diff_min = (float)0.001; const int TRUE = 1; const int FALSE = 0; const int binary_file = FALSE; const int terms_per_row = 15; INT m, n; INT debug; INT nc; INT term_presence[n_max + 1]; float alpha[m_max + 1], decoupling_document[m_max + 1]; float beta[n_max + 1]; float decoupling_term[n_max + 1]; seedpowertuple seed_power[m_max + 1]; int valid_seed[m_max + 1]; // new definitions iisdclusterpointer iisd[n_max+1]; // inverted index (terms) for seed documents clusternode *iic; // inverted index (docs) for clusters INT **iid; // inverted index (clusters) for documents INT *cluster_index; // holds the seed of the corresponding cluster centroidtuple **centroid_terms; // holds the centroid vectors of the clusters iisdclusterpointer ic[n_max]; // inverted index (terms) for centroids FILE *iid_file; // inverted index (clusters) for documents FILE *iic_file; // inverted index (terms) for clusters FILE *ic_file; // inverted index (terms) for centroids FILE *logfile; FILE *newcpnt; /* FILE of cpointerrec; */ FILE *dpointer; /* FILE of INTEGER; */ FILE *dvector; /* FILE of dvectorrec; */ FILE *icolsum; /* FILE of REAL; */ FILE *param; /* TEXT; */ FILE *seedpow; /* FILE OF REAL; */ FILE *idocsum; /* FILE OF REAL;*/ FILE *rectime; /* FILE OF TIME; */ int CmpFunc_long (const void *_a, const void *_b) { const long *a = (long *)_a; const long *b = (long *)_b; if ( *a < *b ) return (-1); else if (*a == *b ) return (0); else return (1); } int CmpFunc_dvectorrec(const void* _a, const void* _b) { // you've got to explicitly cast to the correct type const dvectorrec* a = (const dvectorrec*) _a; const dvectorrec* b = (const dvectorrec*) _b; if(a->weight < b->weight) return 1; else if(a->weight == b->weight) return 0; else return -1; } void preparation(INT& m, INT& n, INT& debug) { //INT document; INT term; INT i; //Initialize all array variables for ( i = 0; i <= m_max; i++ ) { alpha[i] = 0; decoupling_document[i] = 0; seed_power[i].document = 0; seed_power[i].power = 0; } for ( i = 0; i <= n_max; i++ ) { term_presence[i] = 0; decoupling_term[i] = 0; beta[i] = 0; } //open file of param.txt, read only param = fopen( "param.txt", "r" ); if (param == NULL) { printf("Error open file 'param.txt'.\n"); exit(1); } fscanf(param, "%ld%ld", &m, &n); /* "%ld" should be changed according to the definition of INT; */ /* in the current state, it is refer to variable LONG. */ printf("D matrix is %ld X %ld.\n", m, n); printf("m equals to %ld\n", m); printf("n equals to %ld\n", n); fprintf(logfile,"D matrix is %ld X %ld.\n", m, n); fprintf(logfile,"number of documents: %ld\n", m); fprintf(logfile,"number of terms %ld\n", n); debug = 0; fscanf(param, "%ld", &debug); if (debug == 1) { printf("Debugging mode in ON.\n"); } else { printf("Debugging mode is OFF.\n"); } fclose(param); } /* End of preparation */ void read_file_no(INT *file_no) { fscanf(dvector, "%6d", file_no); } int read_in_line( INT line[]) { int i; for( i = 0; i < terms_per_row; i++) { fscanf(dvector, "%7d", &line[i]); } fscanf(dvector,"\n"); if (feof(dvector)) { return EOF; } return(1); } int read_in_dvector(INT *file_no, INT *file_no2, INT term[], INT weight[]) { int i, j; char temp[8]; int if_EOF; read_file_no(file_no); //read in the odd row which contains term information if_EOF = read_in_line(term); read_file_no(file_no2); //read in the even row which contains weight information if_EOF = read_in_line(weight); return(if_EOF); } /* End of read_in_dvector */ void initial_calc() //calculate alpha value, getting prepared for beta value { INT depth, depth_counter; INT document; INT term[terms_per_row], weight[terms_per_row]; int loop = TRUE; int i, current; INT file_no; INT file_no2; dvectorrec document_vector[depth_max+1]; int flag = TRUE; if (debug == 1) { printf("Calculation alpha values, starting to calculate beta " "and term decoupling values.\n"); } //open document row sum, getting alpha value for similary calculation. idocsum = fopen("idocsum.txt", "w"); if (idocsum == NULL) { printf("Error open file idocsum.\n"); exit(1); } dvector = fopen("dvector.txt", "r"); if (dvector == NULL) { printf("Error open file dvector.\n"); exit(1); } read_file_no(&file_no); for (document=1; document<=m; document++) { depth = 0; alpha[document] = 0; do { current = read_in_line(term); read_file_no(&file_no2); current = read_in_line(weight); if (current == EOF) { loop = FALSE; } else { read_file_no(&file_no); } for(i = 0; i < terms_per_row; i++) { if (term[i] != 0 ) { // do an operation for each term-weight // combination if != 0 //add up column sum, as for calculating alpha value alpha[document] += weight[i]; //prepare for calculation of beta value beta[term[i]] += weight[i]; //add term to the term list term_presence[term[i]] ++; //move down depth ++; //store the term and weight into array for later calculation document_vector[depth].term = term[i]; document_vector[depth].weight = weight[i]; } } } while (file_no == document && loop == TRUE); if (alpha[document] != 0) { //alpha = 1 / row_sum alpha[document] = 1 / alpha[document]; fprintf(idocsum, "%f%c", alpha[document], ' '); for (depth_counter = 1; depth_counter <= depth; depth_counter ++) { // prepare to calculate term decoupling value decoupling_term[document_vector[depth_counter].term] = decoupling_term[document_vector[depth_counter].term] + document_vector[depth_counter].weight * document_vector[depth_counter].weight * alpha[document]; } } } fclose(dvector); } /* End of start_calc */ void finish_calc() { //finish calculating beta values and term decoupling values INT nonzero_entries, unique_terms; INT term; int i; if (debug == 1) { printf("Finishing calculating betas and term decoupling values.\n"); } //icolsum = fopen("icolsum.bin", "wb"); icolsum = fopen("icolsum.txt", "w"); if (icolsum == NULL) { printf("Error open file icolsum.\n"); exit(1); } nonzero_entries = 0; unique_terms = 0; for(term = 1; term <= n; term ++) { if (beta[term] != 0) { //calculate beta value beta[term] = 1 / beta[term]; } //finish calculating decoupling_term value //decoupling value from term point of view decoupling_term[term] *= beta[term]; //fwrite(beta + term, sizeof(float), 1, icolsum); fprintf(icolsum, "%f%c", beta[term], ' '); if (term_presence[term] != 0) { //count the unique terms unique_terms ++; nonzero_entries += term_presence[term]; } } //print out some information about terms printf("There are %ld non-zero entries in the D matrix.\n", nonzero_entries); printf("There are %ld unique terms.\n", unique_terms); printf("Average depth of indexing (xd) is %f.\n", float(float(nonzero_entries)/float(m))); printf("Average term generality (tg) is %f.\n", float(float(nonzero_entries)/float(unique_terms))); fprintf(logfile,"There are %ld non-zero entries in the D matrix.\n", nonzero_entries); fprintf(logfile,"There are %ld unique terms.\n", unique_terms); fprintf(logfile,"Average depth of indexing (xd) is %f.\n", float(float(nonzero_entries)/float(m))); fprintf(logfile,"Average term generality (tg) is %f.\n", float(float(nonzero_entries)/float(unique_terms))); } /* End of finish_calc */ void decoupling_coe_calc() { //calculate decoupling coefficient value from document point of view INT document; INT term[terms_per_row], weight[terms_per_row]; int loop = TRUE; int current; INT file_no; INT file_no2; if (debug == 1) { printf("Calculating decoupling coefficients for documents.\n"); } dvector = fopen("dvector.txt", "r"); if (dvector == NULL) { printf("Error open file dvector.\n"); exit(1); } read_file_no(&file_no); for (document=1; document<=m; document++) { decoupling_document[document] = 0; do { current = read_in_line(term); read_file_no(&file_no2); current = read_in_line(weight); if (current == EOF) { loop = FALSE; } else { read_file_no(&file_no); } for( int i = 0; i < terms_per_row; i++) { if (term[i] != 0) { decoupling_document[document] += weight[i] * weight[i] * beta[term[i]]; } } }while (file_no == document && loop == TRUE); decoupling_document[document] *= alpha[document]; } fclose(dvector); } /* End of dd_calc */ void calculate_nc_and_powers(INT &nc) { // calculate nc and seed power value INT document = 1; // only true when the first document has only one line of data; float _nc; INT term[terms_per_row], weight[terms_per_row]; int loop = TRUE; int current, i, j; INT file_no; INT file_no2; if (debug == 1) { printf("Calculating seed powers and nc.\n"); } _nc = 0; dvector = fopen("dvector.txt", "r"); if (dvector == NULL) { printf("Error open file dvector.\n"); exit(1); } read_file_no(&file_no); for (document=1; document<=m; document++) { seed_power[document].power = 0; do { current = read_in_line(term); read_file_no(&file_no2); current = read_in_line(weight); if (current == EOF) { loop = FALSE; } else { read_file_no(&file_no); } for(i = 0; i < terms_per_row; i++) { if (term[i] != 0) { if (binary_file == TRUE) { seed_power[document].power += weight[i]; } else { seed_power[document].power += weight[i] * decoupling_term[term[i]] * (1 - decoupling_term[term[i]]); } } } }while (file_no == document && loop == TRUE); if (seed_power[document].power != 0) { seed_power[document].power *= decoupling_document[document] * (1 - decoupling_document[document]); } //if ((seed_power[document].power == 0)&&(decoupling_document[document] == 1)) // { // seed_power[document].power = 1.0; // } // sum up all Cii _nc += decoupling_document[document]; seed_power[document].document = document; } fclose(dvector); // calculate nc, determine number of seeds need to be selected nc = int(_nc + .5); // allocate cluster index structure cluster_index = (long *)calloc(nc,sizeof(long)); centroid_terms = (centroidtuple **)malloc(nc * sizeof(centroidtuple *)); for(i=0; i x.power); if (i < j) { if (seed_power[i].power != seed_power[j].power) { temp = seed_power[j]; seed_power[j] = seed_power[i]; seed_power[i] = temp; } } else { done = TRUE; } } return j; } void sort_seed_power(INT p, INT r) /* using recursion to sort the seed_power. */ { INT q; if (p < r) { q = seed_power_partition(p, r); sort_seed_power(p, q); sort_seed_power(q+1, r); } } /* End seed_power_sort */ void cover_coefficients(INT i, INT j, float& cij, float& cji) { int i_length, i_runner, j_length, j_runner; //INT position; // This line is added in debug float sum; dvectorrec document_vector_i[depth_max + 1], document_vector_j[depth_max + 1]; dvectorrec term_frequency; INT file_no = 0; INT file_no2 = 0; INT term[terms_per_row], weight[terms_per_row]; int l; int current; // Initialize the common summation of cij and cji sum = 0; //read document vectors to use into arrays //position = address[i]; //calculate the length of document i in terms i_length = 0; j_length = 0; //read terms covered by document into an array //fseek(dvector, (position - 1) * sizeof(dvectorrec), SEEK_SET); dvector = fopen("dvector.txt", "r"); if (dvector == NULL) { printf("Error open file dvector.\n"); exit(1); } //to find the position of document i current = read_in_dvector(&file_no, &file_no2, term, weight); if (i < j) { while (file_no < i) { if (current != EOF && file_no != 0) { current = read_in_dvector(&file_no, &file_no2, term, weight); } else file_no = m + 1; } while ( file_no == i ) { if (file_no != file_no2) { printf("Error reading the dvector files. \n"); printf(" file_no %ld not equal to file_no2 %ld ", file_no, file_no2); exit (1); } for( l = 0; l < terms_per_row; l++) { i_length++; term_frequency.term = term[l]; term_frequency.weight = weight[l]; document_vector_i[i_length].term = term_frequency.term; document_vector_i[i_length].weight = term_frequency.weight; } if (current != EOF && file_no != 0) { current = read_in_dvector(&file_no, &file_no2, term, weight); } else file_no = m + 1; } while (file_no < j) { if ( current != EOF && file_no != 0) { current = read_in_dvector(&file_no, &file_no2, term, weight); } else file_no = m + 1; } while (file_no == j) { if (file_no != file_no2) { printf("Error reading the dvector files. \n"); printf(" file_no %ld not equal to file_no2 %ld ", file_no, file_no2); exit (1); } for( l = 0; l < terms_per_row; l++) { j_length++; term_frequency.term = term[l]; term_frequency.weight = weight[l]; document_vector_j[j_length].term = term_frequency.term; document_vector_j[j_length].weight = term_frequency.weight; } if (current != EOF && file_no != 0) { read_in_dvector(&file_no, &file_no2, term, weight); } else file_no = m + 1; } } // if (i < j) else { while (file_no < j) { if ( current != EOF && file_no != 0) { current = read_in_dvector(&file_no, &file_no2, term, weight); } else file_no = m + 1; } while (file_no == j) { if (file_no != file_no2) { printf("Error reading the dvector files. \n"); printf(" file_no %ld not equal to file_no2 %ld ", file_no, file_no2); exit (1); } for( l = 0; l < terms_per_row; l++) { j_length++; term_frequency.term = term[l]; term_frequency.weight = weight[l]; document_vector_j[j_length].term = term_frequency.term; document_vector_j[j_length].weight = term_frequency.weight; } if (current != EOF && file_no != 0) { read_in_dvector(&file_no, &file_no2, term, weight); } else file_no = m + 1; } while (file_no < i) { if (current != EOF && file_no != 0) { current = read_in_dvector(&file_no, &file_no2, term, weight); } else file_no = m + 1; } while ( file_no == i ) { if (file_no != file_no2) { printf("Error reading the dvector files. \n"); printf(" file_no %ld not equal to file_no2 %ld ", file_no, file_no2); exit (1); } for( l = 0; l < terms_per_row; l++) { i_length++; term_frequency.term = term[l]; term_frequency.weight = weight[l]; document_vector_i[i_length].term = term_frequency.term; document_vector_i[i_length].weight = term_frequency.weight; } if (current != EOF && file_no != 0) { current = read_in_dvector(&file_no, &file_no2, term, weight); } else file_no = m + 1; } } // end else ( j < i) fclose(dvector); //read terms covered by document j into an array //fseek(dvector, (position - 1) * sizeof(dvectorrec), SEEK_SET); //need to change back, fseek usually don't work well with .txt file //calculate the common summation factor of the cover coefficients i_runner = 1; j_runner = 1; while (i_runner <= i_length && j_runner <= j_length) { // debug //if (document_vector_i[i_runner].term] == document_vector_j[j_runner].term) { if (document_vector_i[i_runner].term == document_vector_j[j_runner].term) { sum += document_vector_i[i_runner].weight * beta[document_vector_i[i_runner].term] * document_vector_j[j_runner].weight; i_runner ++; j_runner ++; // debug //} else if (document_vector_i[i_runner].term < document_vector_j[j_runner.term]) { } else if (document_vector_i[i_runner].term < document_vector_j[j_runner].term) { /* one term number must be less than the other, so increment the pointer of the documnent with the lesser term */ i_runner ++; } else { j_runner ++; } } cij = alpha[i] * sum; cji = alpha[j] * sum; } void select_seed(INT document, float power, INT &seeds_found) { cpointerrec cluster_pointer; // cluster index in memory is set cluster_index[seeds_found] = document; //increment number of seeds found seeds_found ++; //add new document to the cluster index on disk cluster_pointer.seed = document; cluster_pointer.address = 0; //fwrite(&cluster_pointer, sizeof(cpointerrec), 1, newcpnt); fprintf(newcpnt, "%ld%c%ld%c", cluster_pointer.seed, ' ', cluster_pointer.address, ' '); //save the cluster seed power //fwrite(&power, sizeof(float), 1, seedpow); fprintf(seedpow, "%f%c%c", power, ' ', ' '); } void determine_seeds() { //determine cluster seeds INT counter, document, first_document, last_document, seeds_found; float cij, cji; cpointerrec cluster_pointer; //sort the seed power sort_seed_power(1, m); if (debug == 1) { printf("Determining cluster seeds.\n"); } for (document = 1; document <= m_max; document ++) { valid_seed[document] = TRUE; } //newcpnt = fopen("newcpnt.bin", "wb"); newcpnt = fopen("newcpnt.txt", "w"); if (newcpnt == NULL) { printf("Error open file newcpnt.\n"); exit(1); } //seedpow = fopen("seedpow.bin", "wb"); seedpow = fopen("seedpow.txt", "w"); if (seedpow == NULL) { printf("Error open file seedpow.\n"); exit(1); } document = 1; seeds_found = 0; do { first_document = document; last_document = document; //find the last document that has the same seed power as the first while (fabs(seed_power[last_document + 1].power - seed_power[first_document].power) < seed_power_diff_min && last_document < m) { last_document ++; } //counter = first_document; // debug //for (document = first_document + 1; document <= last_document; document ++) { // debug for (document = first_document; document <= last_document; document ++) { //check if the document to compare is valid if (valid_seed[document]) { /* debug if (abs(decoupling_document[seed_power[document].document - decoupling_document[seed_power[counter].document) */ for (counter = document + 1; counter <= last_document; counter ++){ // if deco[doc] == deco[counter] == cij == cji // check if the document to compare is still valid if (fabs(decoupling_document[seed_power[document].document] - decoupling_document[seed_power[counter].document]) < cover_diff_min) { cover_coefficients(seed_power[document].document, seed_power[counter].document, cij, cji); //determine if the documents are equal if (fabs(cij - cji) < cover_diff_min) if (fabs(cij - decoupling_document[seed_power[document].document]) < cover_diff_min) if (fabs(cij - decoupling_document[seed_power[counter].document]) < cover_diff_min) if (fabs(cji - decoupling_document[seed_power[document].document]) < cover_diff_min) if (fabs(cji - decoupling_document[seed_power[counter].document]) < cover_diff_min) { //valid_seed[counter] = FALSE; valid_seed[document] = FALSE; // debug } } } } } //select all the valid seeds for (document = first_document; document <= last_document; document ++) { if (valid_seed[document] && seeds_found < nc) { // debug //select_seed(seed_powerdocument.document, seed_power[document.power, seeds_found); select_seed(seed_power[document].document, seed_power[document].power, seeds_found); } } document = last_document + 1; } while (!(seeds_found == nc || document > m)); //until all the required no. of seed found or all the documents readed printf("%d seeds were selected from top %d candidates.\n", seeds_found, last_document); fprintf(logfile,"%d seeds were selected from top %d candidates.\n", seeds_found, last_document); printf("%ld, cluster seeds were selected.\n", seeds_found); fprintf(logfile,"%ld, cluster seeds were selected.\n", seeds_found); cluster_pointer.seed = 0; cluster_pointer.address = 0; //fwrite(&cluster_pointer, sizeof(cpointerrec), 1, newcpnt); fprintf(newcpnt, "%ld%c%ld%c", cluster_pointer.address, ' ', cluster_pointer.seed, ' '); } /* End of determine_seeds */ void add_to_iisd(long term, long cluster, long weight, long& total_iisd_entries) { //add a cluster and weight to the IISD iisdclusterpointer new_iisd_cluster_node; //increment the total iisd entries total_iisd_entries ++; //make a new node new_iisd_cluster_node = new iisdclusternode; //copy the cluster number, weight and pointer to the list if(new_iisd_cluster_node == NULL) { printf("Error allocating memory.\n"); exit(2); } new_iisd_cluster_node -> cluster = cluster; new_iisd_cluster_node -> weight = weight; new_iisd_cluster_node -> link = iisd[term]; iisd[term] = new_iisd_cluster_node; } void create_iisd() { INT document; INT term[terms_per_row], weight[terms_per_row]; int loop = TRUE, is_seed = FALSE; int current; INT file_no; INT file_no2; INT terms_used_in_iisd = 0, total_iisd_entries = 0; int i, cluster; if (debug == 1) { printf("Creating iisd.\n"); } dvector = fopen("dvector.txt", "r"); if (dvector == NULL) { printf("Error open file dvector.\n"); exit(1); } // iisd = (iisdclusternode *)malloc((n+1)*sizeof(iisdclusternode)) for (i=0 ; i < n; i++) { iisd[i] = NULL; } cluster = 1; read_file_no(&file_no); for (document=1; document<=m; document++) { do { current = read_in_line(term); read_file_no(&file_no2); current = read_in_line(weight); if (current == EOF) { loop = FALSE; } else { read_file_no(&file_no); } if ( file_no2 == cluster_index[cluster-1] ) { is_seed = TRUE; for(i=0; i < terms_per_row; i++) { if (term[i] != 0 ) { // do an operation for each term-weight // combination if != 0 add_to_iisd(term[i],cluster,weight[i],total_iisd_entries); } } } } while (file_no == document && loop == TRUE); if ( is_seed == TRUE) { cluster++; is_seed = FALSE; } } for(i=1; i <= n; i++) { if (iisd[i] != NULL) { terms_used_in_iisd++; } } fclose(dvector); if (debug == 1) { printf("%ld terms are used in the IISD.\n", terms_used_in_iisd); printf("%ld entries are present in the IISD.\n", total_iisd_entries); fprintf(logfile,"%ld terms are used in the IISD.\n", terms_used_in_iisd); fprintf(logfile,"%ld entries are present in the IISD.\n", total_iisd_entries); } } // end create_iisd void cluster_documents() { INT document; INT term[terms_per_row], weight[terms_per_row]; int loop = TRUE, is_seed = FALSE, is_set = FALSE; int current, min, max; float avg; INT sum; INT file_no; INT file_no2; int i, pos; long cluster, highest_cluster; iisdclusterpointer iisd_runner; float doc_highest, highest, *coverage; clusterpointer new_doc_node; if (debug == 1) { printf("Clustering all documents.\n"); } dvector = fopen("dvector.txt", "r"); if (dvector == NULL) { printf("Error open file dvector.\n"); exit(1); } coverage = (float *)calloc(nc+1, sizeof(float)); iic = (clusterpointer)malloc((nc+1)*sizeof(clusternode)); for (i=0 ; i <= nc; i++) { iic[i].document = 0; iic[i].link = NULL; } iid = (INT **)malloc((m+1)*sizeof(INT)); for(i=0; icluster] += weight[i] * beta[term[i]] * iisd_runner->weight; iisd_runner = iisd_runner->link; } } } } else { is_seed = TRUE; } } while (file_no == document && loop == TRUE); if ( is_seed == TRUE) { cluster++; is_seed = FALSE; // new_doc_node = new clusternode; // if ( new_doc_node == NULL) { // printf("Error allocating memory for iic.\n"); // exit(2); // } // new_doc_node->document = document; // new_doc_node->link = iic[highest_cluster].link; // iic[iid[document][0]].link = new_doc_node; // iic[iid[document][0]].document++; } else { pos = 0; doc_highest = 0; do { highest = 0; highest_cluster = 0; for(i = 1; i <= nc ; i++) { if (coverage[i] > highest) { highest = coverage[i]; highest_cluster = i; } else { //Or if it has equal coverage, check to see if //this cluster has a greater seed power if (highest_cluster != 0) { if (coverage[i] == highest && seed_power[i].power > seed_power[highest_cluster].power) { highest = coverage[i]; highest_cluster = i; } } } } coverage[highest_cluster] = 0.0; if ( highest > doc_highest) { doc_highest = highest; } if (doc_highest * overlap_coverage_ratio <= highest) { new_doc_node = new clusternode; if ( new_doc_node == NULL) { printf("Error allocating memory for iic.\n"); exit(2); } new_doc_node->document = document; new_doc_node->link = iic[highest_cluster].link; iic[highest_cluster].link = new_doc_node; iic[highest_cluster].document++; iid[document][pos] = highest_cluster; } pos++; } while ( (pos < max_overlap) && (doc_highest * overlap_coverage_ratio < highest) ); } // end else } // end for document fclose(dvector); if (debug == 1) { // find cluster max_size, min_size, average min = max = iic[1].document; sum = iic[1].document; for (i=2; i<=nc; i++) { if (iic[i].document > max) { max = iic[i].document; } if (iic[i].document < min) { min = iic[i].document; } sum += iic[i].document; } avg = (float)sum / (float)nc; printf("Cluster min_size: %d, avg_size: %f, max_size: %d\n",min,avg,max); fprintf(logfile,"Cluster min_size: %d, avg_size: %f, max_size: %d\n",min,avg,max); } } void compute_centroids() { dvectorrec **terms; int i, j, k, pos, loop = TRUE, current; int min, max, iter, stop = 0; float avg; INT file_no, file_no2, document; INT term[terms_per_row], weight[terms_per_row]; INT cluster, len, first, last; clusternode *temp; dvectorrec *document_vector; if (debug == 1) { printf("Computing centroid vectors\n"); max = 0; avg = 0.0; min = n_max; } terms = (dvectorrec **)malloc((max_part_size)*sizeof(dvectorrec *)); for (i=0; i= (k*max_part_size) )&&(cluster < ((k+1)*max_part_size)) ) { for(i = 0; i < terms_per_row; i++) { if (term[i] != 0 ) { // do an operation for each term-weight // combination if != 0 terms[cluster%max_part_size][term[i]-1].term = term[i]; terms[cluster%max_part_size][term[i]-1].weight += weight[i]; } } } pos++; } } while (file_no == document && loop == TRUE); } if (k == (iter-1)) { stop = nc % max_part_size; } else { stop = max_part_size; } for(i=0; i < stop; i++) { // move the elements to the first part of the array first = 0; last = n_max-1; while (first < last) { while ((terms[i][first].term != 0)&&(first < n_max-1)) { first++; } while ((terms[i][last].term == 0)&&(last > 0)) { last--; } terms[i][first].term = terms[i][last].term; terms[i][first].weight = terms[i][last].weight; terms[i][last].term = 0; terms[i][last].weight = 0; } // sort by weight between 0 and last elements, and take highest // centroid_max terms // sort(0, last, terms); qsort((void *)(terms[i]), first, sizeof(dvectorrec), CmpFunc_dvectorrec); for(j=0; j max) { max = first; } if (first < min) { min = first; } avg += first; } } // end for(i=0 , i cluster = cluster; new_iisd_cluster_node -> weight = weight; new_iisd_cluster_node -> link = ic[term-1]; ic[term-1] = new_iisd_cluster_node; } // inverted index for centroids void create_ic() { int i, j; INT total_ic_entries = 0, distinct_terms = 0; dvectorrec *temp; if (debug == 1) { printf("Creating ic\n"); } for(i=0; ilink; head = a1; while (a2 != NULL) { a1->link = a2->link; a2->link = head; head = a2; a2 = a1->link; } iic[i].link = head; } } } void reverse_ic(){ iisdclusternode *mylist, *head, *a1, *a2; int i; for (i=0; i< n_max; i++) { mylist = ic[i]; if (mylist != NULL){ a1 = mylist; a2 = mylist->link; head = a1; while (a2 != NULL) { a1->link = a2->link; a2->link = head; head = a2; a2 = a1->link; } ic[i] = head; } } } // write the inverted index for clusters by documents void write_iic() { int i, cluster = 0, items; clusternode *temp; if (debug == 1) { printf("Writing iic\n"); } iic_file = fopen("iic_text.txt","w"); if (iic_file == NULL) { printf("IIC file write error!\n"); exit(2); } reverse_iic(); if (iic[0].document != 0){ temp = iic[0].link; printf("Unclustered documents:"); while( temp != NULL){ printf("%ld ", temp->document); temp = temp->link; } printf("\n"); } for (cluster=1; cluster <= nc; cluster++) { fprintf(iic_file,"#\n"); fprintf(iic_file,"%d %ld %ld\n", cluster, iic[cluster].document, cluster_index[cluster-1]); items = 0; temp = iic[cluster].link; while (temp != NULL) { fprintf(iic_file,"%ld", temp->document); items++; if (items % terms_per_row == 0) { fprintf(iic_file,"\n"); } else { fprintf(iic_file," "); } temp = temp->link; } items = terms_per_row - (items % terms_per_row); if (items < 15) { for(i=0; i < items; i++) { fprintf(iic_file,"0"); if (i == items-1) { fprintf(iic_file,"\n"); } else { fprintf(iic_file," "); } } } } fclose(iic_file); } // write the inverted index for documents by clusters void write_iid() { int i,j,count; if (debug == 1) { printf("Writing iid\n"); } iid_file = fopen("iid_text.txt","w"); if (iid_file == NULL) { printf("Error writing iid_file\n"); exit(2); } for (i=1; i<=m; i++) { fprintf(iid_file,"%d",i); count = 0; for(j=0; jcluster; weight[items] = temp->weight; temp = temp->link; items++; } if (items > 0) { while (items < terms_per_row) { cluster[items] = 0; weight[items] = 0; items++; } fprintf(ic_file,"%ld",i+1); for(j=0; j< terms_per_row; j++) { fprintf(ic_file," %ld", cluster[j]); } fprintf(ic_file,"\n"); fprintf(ic_file,"%ld",i+1); for(j=0; j< terms_per_row; j++) { fprintf(ic_file," %f",weight[j]); } fprintf(ic_file,"\n"); } } // end if } // end for fclose(ic_file); printf("CM ic done\n"); } void main() { time_t atime, btime, ctime, dtime, etime; int i; iisdclusterpointer temp; logfile = fopen("c3m.log", "w"); if (logfile == NULL) { printf("Error open file 'c3m.log'.\n"); exit(1); } time(&atime); //Read from the input document and prepare for the initial calculations preparation(m, n, debug); //calculate alpha value, getting prepared for beta value initial_calc(); //finish calculating beta values and term decoupling values finish_calc(); //calculate decoupling coefficient value from document point of view decoupling_coe_calc(); // calculate nc and seed power value calculate_nc_and_powers(nc); if (debug == 1) { printf("Sorting documents by seed power value.\n"); } time(&btime); //determine cluster seeds. determine_seeds(); time(&dtime); //for(i=0; icluster, temp->weight); temp = temp->link; } printf("\n"); } */ cluster_documents(); time(&ctime); compute_centroids(); compute_weight(1); create_ic(); time(&etime); write_iic(); write_iid(); write_ic(); rectime = fopen("initialtime.txt","w"); fprintf(rectime,"seed determination time: %ld secs. \n",dtime-btime); fprintf(rectime, "clustering takes %ld secs.\n", ctime-atime); fprintf(rectime, "centroid creation takes %ld secs.\n", etime-ctime); fprintf(rectime, "clustering and centroid determination takes %ld secs.\n", etime-atime); fclose(rectime); param = fopen( "param.txt", "w" ); if (param == NULL) { printf("Error open file 'param.txt'.\n"); exit(1); } fprintf(param, "%ld%1c%ld%1c%ld%1c%ld%1c", m,' ',n,' ', debug, ' ', nc, ' '); fclose(param); fclose(logfile); }