#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>

// CHANGES
// 07.12.2001
// initial_calc(), decoupling_coe_calc(), calculate_nc_and_powers()
// has changed to read from dvector file. dd structure is disabled!
// 09.12.2001
// read_old_cluster_index (=new_cluester_index)
// cluster_index_sort()
// create_iisd(): new_cluster_index'i kullaniyor ve dokuman numarasina
// gore eslesen termleri buluyor (random dvector scan?!)
// cluster_document()

// iic.txt -> cluster: seed documanlar
// TO DO:
// seed secerken esit powera sahipler arasindan bosta kalan termleri
// cover edeni bul ve sec!


typedef long INT;

typedef struct {
    INT seed, address;
} cpointerrec;

typedef struct {
    float power;
    INT document;
} seedpowertuple;

typedef struct {
    INT term, weight;
} dvectorrec;

typedef struct tagdocumentnode{
    INT term, weight;
    tagdocumentnode *link;
} documentnode, *documentpointer;

typedef struct tagiisdclusternode {
    long cluster;
        float weight;
    tagiisdclusternode *link;
} iisdclusternode, *iisdclusterpointer;

typedef struct tagclusternode{
        INT document;
    tagclusternode *link;
} clusternode, *clusterpointer;

typedef struct {
  INT term;
  float weight;
} centroidtuple;

const INT centroid_max = 250; // centroid size
const INT max_part_size = 200; // while computing centroids, partition data into smaller chunks
const INT max_overlap = 1; // maximum number of clusters that a document can be assigned
const float overlap_coverage_ratio = 0.9; // ratio of the coverage / highest coverage

const INT m_max = 220000; // maximum number of documents
const INT n_max = 230000; // maximum number of terms
const INT depth_max = 30000; // maximum number of terms in a document
const float decoupling_max = (float)0.8;
const float seed_power_diff_min = (float)0.0001;
const float cover_diff_min = (float)0.001;

const int TRUE = 1;
const int FALSE = 0;
const int binary_file = FALSE;
const int terms_per_row = 15;


INT m, n;
INT debug;
INT nc;
INT term_presence[n_max + 1];
float alpha[m_max + 1], decoupling_document[m_max + 1];
float beta[n_max + 1];
float decoupling_term[n_max + 1];
seedpowertuple seed_power[m_max + 1];
int valid_seed[m_max + 1];

// new definitions
iisdclusterpointer iisd[n_max+1]; // inverted index (terms) for seed documents
clusternode *iic; // inverted index (docs) for clusters
INT **iid; // inverted index (clusters) for documents
INT *cluster_index; // holds the seed of the corresponding cluster
centroidtuple **centroid_terms; // holds the centroid vectors of the clusters
iisdclusterpointer ic[n_max]; // inverted index (terms) for centroids
FILE *iid_file; // inverted index (clusters) for documents
FILE *iic_file; // inverted index (terms) for clusters
FILE *ic_file; // inverted index (terms) for centroids
FILE *logfile;


FILE *newcpnt;  /* FILE of cpointerrec; */
FILE *dpointer; /* FILE of INTEGER; */
FILE *dvector;  /* FILE of dvectorrec; */
FILE *icolsum;  /* FILE of REAL; */
FILE *param;    /* TEXT; */
FILE *seedpow;  /* FILE OF REAL; */
FILE *idocsum;  /* FILE OF REAL;*/
FILE *rectime;  /* FILE OF TIME; */

int CmpFunc_long (const void *_a, const void *_b) {
  const long *a = (long *)_a;
  const long *b = (long *)_b;


  if ( *a < *b ) return (-1);
  else if (*a == *b ) return (0);
  else return (1);
}

int CmpFunc_dvectorrec(const void* _a, const void* _b) {
      // you've got to explicitly cast to the correct type
      const dvectorrec* a = (const dvectorrec*) _a;
      const dvectorrec* b = (const dvectorrec*) _b;

      if(a->weight < b->weight) return 1;
      else
         if(a->weight == b->weight) return  0;
         else         return -1;
}

void preparation(INT& m, INT& n, INT& debug)
{
    //INT document;
    INT term;
    INT i;

    //Initialize all array variables
    for ( i = 0; i <= m_max; i++ )
    {
        alpha[i] = 0;
        decoupling_document[i] = 0;
        seed_power[i].document = 0;
        seed_power[i].power = 0;
    }

    for ( i = 0; i <= n_max; i++ )
    {
        term_presence[i] = 0;
        decoupling_term[i] = 0;
        beta[i] = 0;
    }

    //open file of param.txt, read only
    param = fopen( "param.txt", "r" );
    if (param == NULL) {
        printf("Error open file 'param.txt'.\n");
        exit(1);
    }

    fscanf(param, "%ld%ld", &m, &n);
    /* "%ld" should be changed according to the definition of INT; */
    /* in the current state, it is refer to variable LONG. */
    printf("D matrix is %ld X %ld.\n", m, n);
    printf("m equals to %ld\n", m);
    printf("n equals to %ld\n", n);

    fprintf(logfile,"D matrix is %ld X %ld.\n", m, n);
    fprintf(logfile,"number of documents: %ld\n", m);
    fprintf(logfile,"number of terms %ld\n", n);


    debug = 0;
    fscanf(param, "%ld", &debug);
    if (debug == 1) {
        printf("Debugging mode in ON.\n");
    } else {
        printf("Debugging mode is OFF.\n");
    }

    fclose(param);

} /* End of preparation */


void read_file_no(INT *file_no)
{
        fscanf(dvector, "%6d", file_no);
}

int read_in_line( INT line[])
{
    int i;

    for( i = 0; i < terms_per_row; i++)
        {
            fscanf(dvector, "%7d", &line[i]);
        }
    fscanf(dvector,"\n");
    if (feof(dvector)) {
      return EOF;
    }
    return(1);
}

int read_in_dvector(INT *file_no, INT *file_no2, INT term[], INT weight[])
{
    int i, j;
    char temp[8];
    int if_EOF;

    read_file_no(file_no);
    //read in the odd row which contains term information
    if_EOF = read_in_line(term);

    read_file_no(file_no2);
    //read in the even row which contains weight information
    if_EOF = read_in_line(weight);

    return(if_EOF);

} /* End of read_in_dvector */

void initial_calc()
//calculate alpha value, getting prepared for beta value
{
    INT depth, depth_counter;
    INT document;

    INT term[terms_per_row], weight[terms_per_row];
    int loop = TRUE;
    int i, current;
    INT file_no;
    INT file_no2;

    dvectorrec document_vector[depth_max+1];
    int flag = TRUE;

    if (debug == 1) {
        printf("Calculation alpha values, starting to calculate beta "
                "and term decoupling values.\n");
    }

    //open document row sum, getting alpha value for similary calculation.
    idocsum = fopen("idocsum.txt", "w");
    if (idocsum == NULL) {
        printf("Error open file idocsum.\n");
        exit(1);
    }

    dvector = fopen("dvector.txt", "r");
    if (dvector == NULL) {
        printf("Error open file dvector.\n");
        exit(1);
    }

    read_file_no(&file_no);
    for (document=1; document<=m; document++)
    {
            depth = 0;
            alpha[document] = 0;
            do {
            current = read_in_line(term);
            read_file_no(&file_no2);
            current = read_in_line(weight);
            if (current == EOF)
            {
                loop = FALSE;
            }
            else
            {
                read_file_no(&file_no);
            }
            for(i = 0; i < terms_per_row; i++)
            {
              if (term[i] != 0 ) {
                // do an operation for each term-weight
                // combination if != 0
                //add up column sum, as for calculating alpha value

                alpha[document] += weight[i];
                //prepare for calculation of beta value
                beta[term[i]] += weight[i];
                //add term to the term list
                term_presence[term[i]] ++;
                //move down
                depth ++;
                //store the term and weight into array for later calculation

                document_vector[depth].term = term[i];
                document_vector[depth].weight = weight[i];

              }

            }
        } while (file_no == document && loop == TRUE);
        if (alpha[document] != 0) {
            //alpha = 1 / row_sum
            alpha[document] = 1 / alpha[document];
            fprintf(idocsum, "%f%c", alpha[document], ' ');
            for (depth_counter = 1; depth_counter <= depth; depth_counter ++) {
                // prepare to calculate term decoupling value
                decoupling_term[document_vector[depth_counter].term] =
                    decoupling_term[document_vector[depth_counter].term] +
                    document_vector[depth_counter].weight *
                    document_vector[depth_counter].weight *
                    alpha[document];
            }
        }

    }
    fclose(dvector);

} /* End of start_calc */

void finish_calc()
{
    //finish calculating beta values and term decoupling values
    INT nonzero_entries, unique_terms;
    INT term;
    int i;

    if (debug == 1) {
        printf("Finishing calculating betas and term decoupling values.\n");
    }

    //icolsum = fopen("icolsum.bin", "wb");
    icolsum = fopen("icolsum.txt", "w");
    if (icolsum == NULL) {
        printf("Error open file icolsum.\n");
        exit(1);
    }
    nonzero_entries = 0;
    unique_terms = 0;
    for(term = 1; term <= n; term ++) {
        if (beta[term] != 0) {
            //calculate beta value
            beta[term] = 1 / beta[term];
        }
        //finish calculating decoupling_term value
        //decoupling value from term point of view
        decoupling_term[term] *= beta[term];
        //fwrite(beta + term, sizeof(float), 1, icolsum);
        fprintf(icolsum, "%f%c", beta[term], ' ');
        if (term_presence[term] != 0) {
            //count the unique terms
            unique_terms ++;
            nonzero_entries += term_presence[term];
        }
    }

    //print out some information about terms
    printf("There are %ld non-zero entries in the D matrix.\n", nonzero_entries);
    printf("There are %ld unique terms.\n", unique_terms);
    printf("Average depth of indexing (xd) is %f.\n", float(float(nonzero_entries)/float(m)));
    printf("Average term generality (tg) is %f.\n", float(float(nonzero_entries)/float(unique_terms)));

    fprintf(logfile,"There are %ld non-zero entries in the D matrix.\n", nonzero_entries);
    fprintf(logfile,"There are %ld unique terms.\n", unique_terms);
    fprintf(logfile,"Average depth of indexing (xd) is %f.\n", float(float(nonzero_entries)/float(m)));
    fprintf(logfile,"Average term generality (tg) is %f.\n", float(float(nonzero_entries)/float(unique_terms)));

} /* End of finish_calc */


void decoupling_coe_calc()
{
    //calculate decoupling coefficient value from document point of view

    INT document;
    INT term[terms_per_row], weight[terms_per_row];
    int loop = TRUE;
    int current;
    INT file_no;
    INT file_no2;

    if (debug == 1)
    {
        printf("Calculating decoupling coefficients for documents.\n");
    }

    dvector = fopen("dvector.txt", "r");
    if (dvector == NULL) {
        printf("Error open file dvector.\n");
        exit(1);
    }

    read_file_no(&file_no);
    for (document=1; document<=m; document++)
    {
        decoupling_document[document] = 0;
        do {
            current = read_in_line(term);
            read_file_no(&file_no2);
            current = read_in_line(weight);
            if (current == EOF)
            {
                loop = FALSE;
            }
            else
            {
                read_file_no(&file_no);
            }
            for( int i = 0; i < terms_per_row; i++)
            {
              if (term[i] != 0) {
                decoupling_document[document] += weight[i] * weight[i] * beta[term[i]];
              }
            }
        }while (file_no == document && loop == TRUE);
        decoupling_document[document] *= alpha[document];
    }
    fclose(dvector);


} /* End of dd_calc */


void calculate_nc_and_powers(INT &nc) {
    // calculate nc and seed power value
    INT document = 1; // only true when the first document has only one line of data;
    float _nc;
    INT term[terms_per_row], weight[terms_per_row];
    int loop = TRUE;
    int current, i, j;
    INT file_no;
    INT file_no2;

    if (debug == 1) {
        printf("Calculating seed powers and nc.\n");
    }

    _nc = 0;


    dvector = fopen("dvector.txt", "r");
    if (dvector == NULL) {
        printf("Error open file dvector.\n");
        exit(1);
    }

    read_file_no(&file_no);
    for (document=1; document<=m; document++)
    {
        seed_power[document].power = 0;
        do {
            current = read_in_line(term);
            read_file_no(&file_no2);
            current = read_in_line(weight);
            if (current == EOF)
            {
                loop = FALSE;
            }
            else
            {
                read_file_no(&file_no);
            }
            for(i = 0; i < terms_per_row; i++)
            {
              if (term[i] != 0) {
                  if (binary_file == TRUE)
                  {
                    seed_power[document].power += weight[i];
                  }
                  else
                  {
                    seed_power[document].power += weight[i] * decoupling_term[term[i]] * (1 - decoupling_term[term[i]]);
                  }
              }

            }
        }while (file_no == document && loop == TRUE);
        if (seed_power[document].power != 0)
        {
            seed_power[document].power *=
            decoupling_document[document] * (1 - decoupling_document[document]);
        }
        //if ((seed_power[document].power == 0)&&(decoupling_document[document] == 1))
        //  {
        //       seed_power[document].power = 1.0;
        //  }

        // sum up all Cii
        _nc += decoupling_document[document];
        seed_power[document].document = document;

    }
    fclose(dvector);

    // calculate nc, determine number of seeds need to be selected
    nc = int(_nc + .5);

    // allocate cluster index structure
    cluster_index = (long *)calloc(nc,sizeof(long));

    centroid_terms = (centroidtuple **)malloc(nc * sizeof(centroidtuple *));
    for(i=0; i<nc; i++) {
      centroid_terms[i] = (centroidtuple *)malloc(centroid_max * sizeof(centroidtuple));
    }

    printf ("nc equals to %ld \n", nc);
    fprintf(param, "%c%f%c", ' ', nc, ' ');

    fprintf (logfile,"nc equals to %ld \n", nc);

} /* End of find_nc_and_powers */


INT seed_power_partition(INT p, INT r)
{
    int done;
    INT i, j;
    seedpowertuple x, temp;

    x = seed_power[p];
    i = p - 1;
    j = r + 1;

    done = FALSE;
    while (!done) {
        do {
            j --;
        } while (seed_power[j].power < x.power);
        do {
            i ++;
        } while (seed_power[i].power > x.power);

        if (i < j) {
            if (seed_power[i].power != seed_power[j].power) {
                temp = seed_power[j];
                seed_power[j] = seed_power[i];
                seed_power[i] = temp;
            }
        } else {
            done = TRUE;
        }
    }
    return j;
}

void sort_seed_power(INT p, INT r)
/* using recursion to sort the seed_power. */
{
    INT q;

    if (p < r) {
        q = seed_power_partition(p, r);
        sort_seed_power(p, q);
        sort_seed_power(q+1, r);
    }
} /* End seed_power_sort */


void cover_coefficients(INT i, INT j, float& cij, float& cji)
{
    int i_length, i_runner, j_length, j_runner;
    //INT position; // This line is added in debug
    float sum;
    dvectorrec document_vector_i[depth_max + 1], document_vector_j[depth_max + 1];
    dvectorrec term_frequency;
    INT file_no = 0;
    INT file_no2 = 0;
    INT term[terms_per_row], weight[terms_per_row];
    int l;
    int current;

    // Initialize the common summation of cij and cji
    sum = 0;

    //read document vectors to use into arrays
    //position = address[i];

    //calculate the length of document i in terms
    i_length = 0;
    j_length = 0;

    //read terms covered by document  into an array
    //fseek(dvector, (position - 1) * sizeof(dvectorrec), SEEK_SET);
    dvector = fopen("dvector.txt", "r");
    if (dvector == NULL) {
        printf("Error open file dvector.\n");
        exit(1);
    }

    //to find the position of document i
    current = read_in_dvector(&file_no, &file_no2, term, weight);

    if (i < j) {
        while (file_no < i)
        {
            if (current != EOF && file_no != 0)
            {
                current = read_in_dvector(&file_no, &file_no2, term, weight);
            }
            else
                file_no = m + 1;
        }
        while ( file_no == i )
        {
            if (file_no != file_no2)
            {
                printf("Error reading the dvector files. \n");
                printf(" file_no %ld not equal to file_no2 %ld ", file_no, file_no2);
                exit (1);
            }
            for( l = 0; l < terms_per_row; l++)
            {
                i_length++;
                term_frequency.term = term[l];
                term_frequency.weight = weight[l];
                document_vector_i[i_length].term = term_frequency.term;
                document_vector_i[i_length].weight = term_frequency.weight;
            }

            if (current != EOF && file_no != 0)
            {
                current = read_in_dvector(&file_no, &file_no2, term, weight);
            }
            else
                file_no = m + 1;
        }
        while (file_no < j)
        {
            if ( current != EOF && file_no != 0)
            {
                current = read_in_dvector(&file_no, &file_no2, term, weight);
            }
            else
                file_no = m + 1;
        }
        while (file_no == j)
        {
            if (file_no != file_no2)
            {
                printf("Error reading the dvector files. \n");
                printf(" file_no %ld not equal to file_no2 %ld ", file_no, file_no2);
                exit (1);
            }
            for( l = 0; l < terms_per_row; l++)
            {
                j_length++;
                term_frequency.term = term[l];
                term_frequency.weight = weight[l];
                document_vector_j[j_length].term = term_frequency.term;
                document_vector_j[j_length].weight = term_frequency.weight;
            }

            if (current != EOF && file_no != 0)
            {
                read_in_dvector(&file_no, &file_no2, term, weight);
            }
            else
                file_no = m + 1;
        }
    } // if (i < j)
    else {
        while (file_no < j)
        {
            if ( current != EOF && file_no != 0)
            {
                current = read_in_dvector(&file_no, &file_no2, term, weight);
            }
            else
                file_no = m + 1;
        }
        while (file_no == j)
        {
            if (file_no != file_no2)
            {
                printf("Error reading the dvector files. \n");
                printf(" file_no %ld not equal to file_no2 %ld ", file_no, file_no2);
                exit (1);
            }
            for( l = 0; l < terms_per_row; l++)
            {
                j_length++;
                term_frequency.term = term[l];
                term_frequency.weight = weight[l];
                document_vector_j[j_length].term = term_frequency.term;
                document_vector_j[j_length].weight = term_frequency.weight;
            }

            if (current != EOF && file_no != 0)
            {
                read_in_dvector(&file_no, &file_no2, term, weight);
            }
            else
                file_no = m + 1;
        }
       while (file_no < i)
        {
            if (current != EOF && file_no != 0)
            {
                current = read_in_dvector(&file_no, &file_no2, term, weight);
            }
            else
                file_no = m + 1;
        }
        while ( file_no == i )
        {
            if (file_no != file_no2)
            {
                printf("Error reading the dvector files. \n");
                printf(" file_no %ld not equal to file_no2 %ld ", file_no, file_no2);
                exit (1);
            }
            for( l = 0; l < terms_per_row; l++)
            {
                i_length++;
                term_frequency.term = term[l];
                term_frequency.weight = weight[l];
                document_vector_i[i_length].term = term_frequency.term;
                document_vector_i[i_length].weight = term_frequency.weight;
            }

            if (current != EOF && file_no != 0)
            {
                current = read_in_dvector(&file_no, &file_no2, term, weight);
            }
            else
                file_no = m + 1;
        }

    } // end else ( j < i)

    fclose(dvector);

    //read terms covered by document j into an array
    //fseek(dvector, (position - 1) * sizeof(dvectorrec), SEEK_SET);
    //need to change back, fseek usually don't work well with .txt file

    //calculate the common summation factor of the cover coefficients
    i_runner = 1;
    j_runner = 1;

    while (i_runner <= i_length && j_runner <= j_length) {
        // debug //if (document_vector_i[i_runner].term] == document_vector_j[j_runner].term) {
        if (document_vector_i[i_runner].term == document_vector_j[j_runner].term)
{
            sum += document_vector_i[i_runner].weight *
                beta[document_vector_i[i_runner].term] *
                document_vector_j[j_runner].weight;
            i_runner ++;
            j_runner ++;
        // debug //} else if (document_vector_i[i_runner].term < document_vector_j[j_runner.term]) {
        } else if (document_vector_i[i_runner].term < document_vector_j[j_runner].term) {
            /* one term number must be less than the other, so increment the
               pointer of the documnent with the lesser term */
            i_runner ++;
        } else {
            j_runner ++;
        }
    }

    cij = alpha[i] * sum;
    cji = alpha[j] * sum;
}

void select_seed(INT document, float power, INT &seeds_found)
{
    cpointerrec cluster_pointer;

    // cluster index in memory is set
    cluster_index[seeds_found] = document;

    //increment number of seeds found
    seeds_found ++;

    //add new document to the cluster index on disk
    cluster_pointer.seed = document;
    cluster_pointer.address = 0;
    //fwrite(&cluster_pointer, sizeof(cpointerrec), 1, newcpnt);
    fprintf(newcpnt, "%ld%c%ld%c", cluster_pointer.seed, ' ', cluster_pointer.address, ' ');
    //save the cluster seed power
    //fwrite(&power, sizeof(float), 1, seedpow);
    fprintf(seedpow, "%f%c%c", power, ' ', ' ');
}

void determine_seeds()
{
    //determine cluster seeds
    INT counter, document, first_document, last_document, seeds_found;
    float cij, cji;
    cpointerrec cluster_pointer;

    //sort the seed power
    sort_seed_power(1, m);

    if (debug == 1) {
        printf("Determining cluster seeds.\n");
    }

    for (document = 1; document <= m_max; document ++) {
        valid_seed[document] = TRUE;
    }

    //newcpnt = fopen("newcpnt.bin", "wb");
    newcpnt = fopen("newcpnt.txt", "w");
    if (newcpnt == NULL) {
        printf("Error open file newcpnt.\n");
        exit(1);
    }
    //seedpow = fopen("seedpow.bin", "wb");
    seedpow = fopen("seedpow.txt", "w");
    if (seedpow == NULL) {
        printf("Error open file seedpow.\n");
        exit(1);
    }

    document = 1;
    seeds_found = 0;
    do {
        first_document = document;
        last_document = document;

        //find the last document that has the same seed power as the first
        while (fabs(seed_power[last_document + 1].power -
                   seed_power[first_document].power) < seed_power_diff_min
            && last_document < m)
        {
            last_document ++;
        }


        //counter = first_document; // debug
        //for (document = first_document + 1; document <= last_document; document ++) {
        // debug
        for (document = first_document; document <= last_document; document ++) {
            //check if the document to compare is valid
            if (valid_seed[document]) {
                /* debug
                if (abs(decoupling_document[seed_power[document].document -
                        decoupling_document[seed_power[counter].document)
                */
                for (counter = document + 1; counter <= last_document; counter ++){
                    // if deco[doc] == deco[counter] == cij == cji
                    // check if the document to compare is still valid
                    if (fabs(decoupling_document[seed_power[document].document] -
                            decoupling_document[seed_power[counter].document])
                        < cover_diff_min)
                    {
                        cover_coefficients(seed_power[document].document, seed_power[counter].document, cij, cji);

                    //determine if the documents are equal
                        if (fabs(cij - cji) < cover_diff_min)
                            if (fabs(cij - decoupling_document[seed_power[document].document])
                            < cover_diff_min)
                                if (fabs(cij - decoupling_document[seed_power[counter].document])
                                < cover_diff_min)
                                    if (fabs(cji - decoupling_document[seed_power[document].document])
                                    < cover_diff_min)
                                        if (fabs(cji - decoupling_document[seed_power[counter].document])
                                        < cover_diff_min)
                                        {
                                            //valid_seed[counter] = FALSE;
                                            valid_seed[document] = FALSE; // debug
                                        }
                    }
                }
            }
        }
        //select all the valid seeds
        for (document = first_document; document <= last_document; document ++) {
            if (valid_seed[document] && seeds_found < nc) {
                // debug //select_seed(seed_powerdocument.document, seed_power[document.power, seeds_found);
                select_seed(seed_power[document].document, seed_power[document].power, seeds_found);
            }
        }
        document = last_document + 1;
    } while (!(seeds_found == nc || document > m));
    //until all the required no. of seed found or all the documents readed

    printf("%d seeds were selected from top %d candidates.\n", seeds_found, last_document);
    fprintf(logfile,"%d seeds were selected from top %d candidates.\n", seeds_found, last_document);

    printf("%ld, cluster seeds were selected.\n", seeds_found);

    fprintf(logfile,"%ld, cluster seeds were selected.\n", seeds_found);

    cluster_pointer.seed = 0;
    cluster_pointer.address = 0;
    //fwrite(&cluster_pointer, sizeof(cpointerrec), 1, newcpnt);
    fprintf(newcpnt, "%ld%c%ld%c", cluster_pointer.address, ' ', cluster_pointer.seed, ' ');
} /* End of determine_seeds */


void add_to_iisd(long term, long cluster, long weight, long& total_iisd_entries)
{
    //add a cluster and weight to the IISD
    iisdclusterpointer new_iisd_cluster_node;

    //increment the total iisd entries
    total_iisd_entries ++;
    //make a new node
    new_iisd_cluster_node = new iisdclusternode;
    //copy the cluster number, weight and pointer to the list
    if(new_iisd_cluster_node == NULL) {
        printf("Error allocating memory.\n");
        exit(2);
    }
    new_iisd_cluster_node -> cluster = cluster;
    new_iisd_cluster_node -> weight = weight;
    new_iisd_cluster_node -> link = iisd[term];
    iisd[term] = new_iisd_cluster_node;
}

void create_iisd() {

    INT document;
    INT term[terms_per_row], weight[terms_per_row];
    int loop = TRUE, is_seed = FALSE;
    int current;
    INT file_no;
    INT file_no2;
    INT terms_used_in_iisd = 0, total_iisd_entries = 0;
    int i, cluster;

    if (debug == 1) {
        printf("Creating iisd.\n");
    }

    dvector = fopen("dvector.txt", "r");
    if (dvector == NULL) {
        printf("Error open file dvector.\n");
        exit(1);
    }

    //  iisd = (iisdclusternode *)malloc((n+1)*sizeof(iisdclusternode))
    for (i=0 ; i < n; i++) {
      iisd[i] = NULL;
    }

    cluster = 1;
    read_file_no(&file_no);
    for (document=1; document<=m; document++)
    {
            do {
            current = read_in_line(term);
            read_file_no(&file_no2);
            current = read_in_line(weight);
            if (current == EOF)
            {
                loop = FALSE;
            }
            else
            {
                read_file_no(&file_no);
            }
            if ( file_no2 == cluster_index[cluster-1] ) {
                 is_seed = TRUE;
                 for(i=0; i < terms_per_row; i++)
                 {
                      if (term[i] != 0 ) {
                      // do an operation for each term-weight
                      // combination if != 0
                    add_to_iisd(term[i],cluster,weight[i],total_iisd_entries);
                  }
                 }
            }
        } while (file_no == document && loop == TRUE);
        if ( is_seed == TRUE) {
          cluster++;
          is_seed = FALSE;
        }
    }

    for(i=1; i <= n; i++) {
        if (iisd[i] != NULL) {
           terms_used_in_iisd++;
        }
    }
    fclose(dvector);

    if (debug == 1) {
      printf("%ld terms are used in the IISD.\n", terms_used_in_iisd);
      printf("%ld entries are present in the IISD.\n", total_iisd_entries);

      fprintf(logfile,"%ld terms are used in the IISD.\n", terms_used_in_iisd);
      fprintf(logfile,"%ld entries are present in the IISD.\n", total_iisd_entries);
    }

} // end create_iisd


void cluster_documents() {

    INT document;
    INT term[terms_per_row], weight[terms_per_row];
    int loop = TRUE, is_seed = FALSE, is_set = FALSE;
    int current, min, max;
    float avg;
	INT sum;
    INT file_no;
    INT file_no2;
    int i, pos;
    long cluster, highest_cluster;
    iisdclusterpointer iisd_runner;
    float doc_highest, highest, *coverage;
    clusterpointer new_doc_node;

    if (debug == 1) {
        printf("Clustering all documents.\n");
    }

    dvector = fopen("dvector.txt", "r");
    if (dvector == NULL) {
        printf("Error open file dvector.\n");
        exit(1);
    }

    coverage = (float *)calloc(nc+1, sizeof(float));
    iic = (clusterpointer)malloc((nc+1)*sizeof(clusternode));
    for (i=0 ; i <= nc; i++) {
      iic[i].document = 0;
      iic[i].link = NULL;
    }

    iid = (INT **)malloc((m+1)*sizeof(INT));
    for(i=0; i<m+1; i++){
      iid[i] = (INT *)calloc(max_overlap,sizeof(INT));
    }

    for (i=0 ; i < nc; i++) {
      iid[cluster_index[i]][0] = i+1;
    }

    cluster = 1;
    read_file_no(&file_no);
    for (document=1; document<=m; document++)
      {
            if (is_set == TRUE) {
                is_set = FALSE;
                for(i=1; i<=nc; i++) {
                coverage[i] = 0;
            }
            }
            do {
            current = read_in_line(term);
            read_file_no(&file_no2);
            current = read_in_line(weight);
            if (current == EOF)
            {
                loop = FALSE;
            }
            else
            {
                read_file_no(&file_no);
            }
            if ( file_no2 != cluster_index[cluster-1] ) {
                 is_seed = FALSE;
                 is_set = TRUE;
                 for(i=0; i < terms_per_row; i++)
                 {
                      if (term[i] != 0 ) {
                    iisd_runner = iisd[term[i]];
                    while (iisd_runner != NULL) {
                      coverage[iisd_runner->cluster] +=
                    weight[i] * beta[term[i]] * iisd_runner->weight;
                      iisd_runner = iisd_runner->link;
                    }
                  }
                 }
            } else {
                 is_seed = TRUE;
            }
        } while (file_no == document && loop == TRUE);

        if ( is_seed == TRUE) {

            cluster++;
            is_seed = FALSE;
//              new_doc_node = new clusternode;
//              if ( new_doc_node == NULL) {
//                printf("Error allocating memory for iic.\n");
//                exit(2);
//              }
//              new_doc_node->document = document;
//              new_doc_node->link = iic[highest_cluster].link;
//              iic[iid[document][0]].link = new_doc_node;
//              iic[iid[document][0]].document++;
        } else {
          pos = 0;
          doc_highest = 0;
          do {
            highest = 0;
            highest_cluster = 0;
            for(i = 1; i <= nc ; i++) {
               if (coverage[i] > highest)
               {
                highest = coverage[i];
                highest_cluster = i;
               } else {
                //Or if it has equal coverage, check to see if
                //this cluster has a greater seed power
                if (highest_cluster != 0)
                {
                  if (coverage[i] == highest &&
                      seed_power[i].power >
                      seed_power[highest_cluster].power) {
                    highest = coverage[i];
                    highest_cluster = i;
                  }
                }
               }
            }

            coverage[highest_cluster] = 0.0;
            if ( highest > doc_highest) {
              doc_highest = highest;
            }

            if (doc_highest * overlap_coverage_ratio <= highest) {
              new_doc_node = new clusternode;
              if ( new_doc_node == NULL) {
            printf("Error allocating memory for iic.\n");
            exit(2);
              }
              new_doc_node->document = document;
              new_doc_node->link = iic[highest_cluster].link;
              iic[highest_cluster].link = new_doc_node;
              iic[highest_cluster].document++;

              iid[document][pos] = highest_cluster;
            }
            pos++;

          } while ( (pos < max_overlap) && (doc_highest * overlap_coverage_ratio < highest) );

            } // end else
    } // end for document

    fclose(dvector);

    if (debug == 1) {
      // find cluster max_size, min_size, average
      min =  max = iic[1].document;
      sum = iic[1].document;
      for (i=2; i<=nc; i++) {
        if (iic[i].document > max) {
          max = iic[i].document;
        }
        if (iic[i].document < min) {
          min = iic[i].document;
        }
        sum += iic[i].document;
      }
      avg = (float)sum / (float)nc;

      printf("Cluster min_size: %d, avg_size: %f, max_size: %d\n",min,avg,max);

      fprintf(logfile,"Cluster min_size: %d, avg_size: %f, max_size: %d\n",min,avg,max);
    }

}

void compute_centroids() {
  dvectorrec **terms;
  int i, j, k, pos, loop = TRUE, current;
  int min, max, iter, stop = 0;
  float avg;
  INT file_no, file_no2, document;
  INT term[terms_per_row], weight[terms_per_row];
  INT cluster, len, first, last;
  clusternode *temp;
  dvectorrec *document_vector;

  if (debug == 1) {
    printf("Computing centroid vectors\n");
    max = 0;
    avg = 0.0;
    min = n_max;
  }

  terms = (dvectorrec **)malloc((max_part_size)*sizeof(dvectorrec *));
  for (i=0; i<max_part_size; i++) {
    terms[i]  = (dvectorrec *)malloc((n_max)*sizeof(dvectorrec));
  }

  for(i=0; i<nc; i++) {
    for(j=0; j< centroid_max; j++) {
      centroid_terms[i][j].term = 0;
      centroid_terms[i][j].weight = 0.0;
    }
  }

    dvector = fopen("dvector.txt", "r");
    if (dvector == NULL) {
      printf("Error open file dvector.\n");
      exit(1);
    }

  iter = nc/max_part_size;
  if (nc % max_part_size != 0){
    iter++;
  }

  for (k=0; k< iter; k++) {

    for (i=0; i<max_part_size; i++) {
      for(j=0; j<n_max; j++) {
        terms[i][j].term = 0;
        terms[i][j].weight = 0;
      }
    }

       rewind(dvector);

       loop = TRUE;
       read_file_no(&file_no);
       for (document=1; document<=m; document++) {
            do {
            current = read_in_line(term);
            read_file_no(&file_no2);
            current = read_in_line(weight);
            if (current == EOF)
            {
                loop = FALSE;
            }
            else
            {
                read_file_no(&file_no);
            }
            pos = 0;
            while ((iid[file_no2][pos] != 0)&&(pos < max_overlap)) {
              cluster = iid[file_no2][pos] -1;
              if ( (cluster >= (k*max_part_size) )&&(cluster < ((k+1)*max_part_size)) ) {
                for(i = 0; i < terms_per_row; i++) {
                  if (term[i] != 0 ) {
                // do an operation for each term-weight
                // combination if != 0
                terms[cluster%max_part_size][term[i]-1].term = term[i];
                terms[cluster%max_part_size][term[i]-1].weight += weight[i];
                  }
                }
              }
              pos++;
            }
        } while (file_no == document && loop == TRUE);
    }

    if (k == (iter-1))  {
        stop = nc % max_part_size;
    } else {
        stop = max_part_size;
    }

    for(i=0; i < stop; i++) {
      // move the elements to the first part of the array

      first = 0;
      last = n_max-1;
      while (first < last) {
        while ((terms[i][first].term != 0)&&(first < n_max-1)) {
          first++;
        }
        while ((terms[i][last].term == 0)&&(last > 0)) {
          last--;
        }
        terms[i][first].term = terms[i][last].term;
        terms[i][first].weight = terms[i][last].weight;
        terms[i][last].term = 0;
        terms[i][last].weight = 0;
      }
      // sort by weight between 0 and last elements, and take highest
      // centroid_max terms
      // sort(0, last, terms);

      qsort((void *)(terms[i]), first, sizeof(dvectorrec), CmpFunc_dvectorrec);

      for(j=0; j<centroid_max; j++) {
        centroid_terms[i+(k*max_part_size)][j].term = terms[i][j].term;
        centroid_terms[i+(k*max_part_size)][j].weight = terms[i][j].weight;
      }

      if (debug == 1) {
        if (first > max) {
          max = first;
        }
        if (first < min) {
          min = first;
        }
        avg += first;
      }

    } // end for(i=0 , i<stop, i++)

  } // end for(k=0; k<iter, k++)

  fclose(dvector);

  if (debug == 1) {
    printf("Terms per cluster min_size: %d, avg_size: %f, max_size: %d\n",min,avg/nc,max);

    fprintf(logfile,"Terms per cluster min_size: %d, avg_size: %f, max_size: %d\n",min,avg/nc,max);
  }

}

void compute_weight(int flag){
  float t, f;
  float term_count[n_max];
  float sqrt_sum, sum = 0.0;

  int i, j;

  for(i=0; i<n_max; i++) {
    term_count[i] = 0;
  }
  // just related with centroid vectors
  if (flag == 1){
    t = 1;
    for (i=0; i<nc; i++) {
      for (j=0; j<centroid_max; j++){
    if (centroid_terms[i][j].weight != 0) {
      term_count[centroid_terms[i][j].term - 1]++;
    }
      }
    }
    for (i=0; i<nc; i++) {
      sum = 0;
      for (j=0; j<centroid_max; j++){
    if (centroid_terms[i][j].weight != 0) {
      f = log((float)nc/(float)term_count[centroid_terms[i][j].term - 1])+1.0;
      centroid_terms[i][j].weight = t*f;
      sum += t*f*t*f;
    }
      }
      sqrt_sum = sqrt(sum);
      for (j=0; j<centroid_max; j++){
      centroid_terms[i][j].weight /= sqrt_sum;

      }
    }
    // related with centriods including cluster info
  } else if (flag == 2) {
    for (i=0; i<nc; i++) {
      for (j=0; j<centroid_max; j++){
    if (centroid_terms[i][j].weight != 0) {
      term_count[centroid_terms[i][j].term - 1] += centroid_terms[i][j].weight;
    }
      }
    }
    for (i=0; i<nc; i++) {
      sum = 0;
      for (j=0; j<centroid_max; j++){
    if (centroid_terms[i][j].weight != 0) {
      t = centroid_terms[i][j].weight;
      f = log((float)term_count[centroid_terms[i][j].term - 1]/(float)centroid_terms[i][j].weight)+1.0;
      centroid_terms[i][j].weight = t*f;
      sum += t*f*t*f;
    }
      }
      sqrt_sum = sqrt(sum);
      for (j=0; j<centroid_max; j++){
      centroid_terms[i][j].weight /= sqrt_sum;
      }

    }
  } // end if
}


void add_to_ic(long term, long cluster, float weight, long& total_ic_entries)
{
    //add a cluster and weight to the IISD
    iisdclusterpointer new_iisd_cluster_node;

    //increment the total iisd entries
    total_ic_entries++;
    //make a new node
    new_iisd_cluster_node = (iisdclusternode *)malloc(sizeof(iisdclusternode));
    //copy the cluster number, weight and pointer to the list
    if(new_iisd_cluster_node == NULL) {
        printf("Error allocating memory.\n");
        exit(2);
    }
    new_iisd_cluster_node -> cluster = cluster;
    new_iisd_cluster_node -> weight = weight;
    new_iisd_cluster_node -> link = ic[term-1];
    ic[term-1] = new_iisd_cluster_node;
}


// inverted index for centroids
void create_ic() {

  int i, j;
  INT total_ic_entries = 0, distinct_terms = 0;
  dvectorrec *temp;

  if (debug == 1) {
    printf("Creating ic\n");
  }

  for(i=0; i<n_max; i++) {
    ic[i] = NULL;
  }

  for(i=0; i<nc; i++) {
    for(j=0; j<centroid_max; j++){
      if (centroid_terms[i][j].term != 0) {
    add_to_ic(centroid_terms[i][j].term, i+1 , centroid_terms[i][j].weight,
        total_ic_entries);
      }
    }
  }

  for(i=0; i<n_max; i++) {
    if (ic[i] != NULL)
      distinct_terms++;
  }
  printf("%ld distinct terms are present in the IC.\n", distinct_terms);

  fprintf(logfile,"%ld distinct terms are present in the IC.\n", distinct_terms);

  if (debug == 1) {
       printf("%ld entries are present in the IC.\n", total_ic_entries);

       fprintf(logfile,"%ld entries are present in the IC.\n", total_ic_entries);
  }

}

void reverse_iic(){
  clusternode *mylist, *head, *a1, *a2;
  int i;

  for (i=1; i<= nc; i++) {
    mylist = iic[i].link;
    if (mylist != NULL){
      a1 = mylist;
      a2 = mylist->link;
      head = a1;
      while (a2 != NULL) {
    a1->link = a2->link;
    a2->link = head;
    head = a2;
    a2 = a1->link;
      }
    iic[i].link = head;
    }
  }
}

void reverse_ic(){
  iisdclusternode *mylist, *head, *a1, *a2;
  int i;

  for (i=0; i< n_max; i++) {
    mylist = ic[i];
    if (mylist != NULL){
      a1 = mylist;
      a2 = mylist->link;
      head = a1;
      while (a2 != NULL) {
    a1->link = a2->link;
    a2->link = head;
    head = a2;
    a2 = a1->link;
      }
      ic[i] = head;
    }
  }
}

// write the inverted index for clusters by documents
void write_iic() {

  int i, cluster = 0, items;
  clusternode *temp;

  if (debug == 1) {
    printf("Writing iic\n");
  }

  iic_file = fopen("iic_text.txt","w");
  if (iic_file == NULL) {
      printf("IIC file write error!\n");
      exit(2);
  }

  reverse_iic();

  if (iic[0].document != 0){
    temp = iic[0].link;
    printf("Unclustered documents:");
    while( temp != NULL){
      printf("%ld ", temp->document);
      temp = temp->link;
    }
    printf("\n");
  }

  for (cluster=1; cluster <= nc; cluster++) {
      fprintf(iic_file,"#\n");
      fprintf(iic_file,"%d %ld %ld\n", cluster, iic[cluster].document, cluster_index[cluster-1]);

      items = 0;
      temp = iic[cluster].link;
      while (temp != NULL) {
    fprintf(iic_file,"%ld", temp->document);
    items++;
    if (items % terms_per_row == 0) {
      fprintf(iic_file,"\n");
    } else {
      fprintf(iic_file," ");
    }
    temp = temp->link;
      }
      items = terms_per_row - (items % terms_per_row);
      if (items < 15) {
    for(i=0; i < items; i++) {
      fprintf(iic_file,"0");
      if (i == items-1) {
        fprintf(iic_file,"\n");
      } else {
        fprintf(iic_file," ");
      }
    }
      }
  }

  fclose(iic_file);
}

// write the inverted index for documents by clusters
void write_iid() {
  int i,j,count;

  if (debug == 1) {
    printf("Writing iid\n");
  }

  iid_file = fopen("iid_text.txt","w");

  if (iid_file == NULL) {
    printf("Error writing iid_file\n");
    exit(2);
  }

  for (i=1; i<=m; i++) {
    fprintf(iid_file,"%d",i);
    count = 0;
    for(j=0; j<max_overlap; j++) {
      if (iid[i][j] != 0)
    count++;
    }
    fprintf(iid_file," %d",count);
    for(j=0; j<max_overlap; j++) {
      if (iid[i][j] != 0)
    fprintf(iid_file," %d",iid[i][j]);
    }
    fprintf(iid_file,"\n");
  }

  fclose(iid_file);
}

void write_ic() {

  int i, j, items, size;
  iisdclusterpointer temp;
  INT cluster[terms_per_row];
  float weight[terms_per_row];

  if (debug == 1) {
    printf("Writing ic\n");
  }

  ic_file = fopen("ic_text.txt", "w");
  if (ic_file == NULL) {
    printf("Error writing ic_text\n");
    exit(2);
  }

  for (i=0; i<terms_per_row; i++) {
    cluster[i] = 0;
    weight[i] = 0;
  }

  reverse_ic();

  for(i=0; i<n_max; i++) {
     items = 0;
     temp = ic[i];
     if (temp != NULL) {
       while (temp != NULL) {
     if (items == terms_per_row) {
       items = 0;
       fprintf(ic_file,"%ld",i+1);
       for(j=0; j< terms_per_row; j++) {
         fprintf(ic_file," %ld", cluster[j]);
       }
       fprintf(ic_file,"\n");
       fprintf(ic_file,"%ld",i+1);
       for(j=0; j< terms_per_row; j++) {
         fprintf(ic_file," %f",weight[j]);
       }
       fprintf(ic_file,"\n");
     }

     cluster[items] = temp->cluster;
     weight[items] = temp->weight;
     temp = temp->link;
     items++;
       }

       if (items > 0) {
     while (items < terms_per_row) {
       cluster[items] = 0;
       weight[items] = 0;
       items++;
     }
     fprintf(ic_file,"%ld",i+1);
     for(j=0; j< terms_per_row; j++) {
       fprintf(ic_file," %ld", cluster[j]);
     }
     fprintf(ic_file,"\n");
     fprintf(ic_file,"%ld",i+1);
     for(j=0; j< terms_per_row; j++) {
       fprintf(ic_file," %f",weight[j]);
     }
     fprintf(ic_file,"\n");
       }
     } // end if
  } // end for

   fclose(ic_file);

   printf("CM ic done\n");
}


void main()
{
    time_t atime, btime, ctime, dtime, etime;
    int i;
    iisdclusterpointer temp;


    logfile = fopen("c3m.log", "w");
    if (logfile == NULL) {
        printf("Error open file 'c3m.log'.\n");
        exit(1);
    }


    time(&atime);
    //Read from the input document and prepare for the initial calculations
    preparation(m, n, debug);

    //calculate alpha value, getting prepared for beta value
    initial_calc();

    //finish calculating beta values and term decoupling values
    finish_calc();


    //calculate decoupling coefficient value from document point of view
    decoupling_coe_calc();

    // calculate nc and seed power value
    calculate_nc_and_powers(nc);
    if (debug == 1) {
        printf("Sorting documents by seed power value.\n");
    }

	time(&btime);
    //determine cluster seeds.
    determine_seeds();

	time(&dtime);

	//for(i=0; i<nc; i++)
    //  printf("cluster_index[%d]:%d\n",i,cluster_index[i]);

    //sort the cluster index by seed_document numbers
    qsort(cluster_index, nc, sizeof(INT), CmpFunc_long );

    create_iisd();
    /*
    for(i=0; i<nc; i++) {
      printf("%ld ",cluster_index[i]);
    }
    printf("\n");

    for(i=1; i<=n; i++) {
      temp = iisd[i];
      while (temp != NULL) {
        printf("%d:%ld ",temp->cluster, temp->weight);
        temp = temp->link;
      }
      printf("\n");
    }
    */
    cluster_documents();

    time(&ctime);

    compute_centroids();

    compute_weight(1);

    create_ic();

    time(&etime);

    write_iic();

    write_iid();

    write_ic();

    rectime = fopen("initialtime.txt","w");
	fprintf(rectime,"seed determination time: %ld secs. \n",dtime-btime);
	fprintf(rectime, "clustering takes %ld secs.\n", ctime-atime);
    fprintf(rectime, "centroid creation takes %ld secs.\n", etime-ctime);
    fprintf(rectime, "clustering and centroid determination takes %ld secs.\n", etime-atime);
    fclose(rectime);

    param = fopen( "param.txt", "w" );
    if (param == NULL) {
        printf("Error open file 'param.txt'.\n");
        exit(1);
    }
    fprintf(param, "%ld%1c%ld%1c%ld%1c%ld%1c", m,' ',n,' ', debug, ' ', nc, ' ');
    fclose(param);

    fclose(logfile);
}