/* ======================================================================
   David W. Aha
   ibl.c: Implementation of Instance-Based Learning Algorithms
   Topic: IB1, IB2, IB3, and now IB4
   March 1994

   Done:
     1. Implemented ib4
        -- Assumes that, if best_concept_only, there is only 1 predictee
           and that it is the last.   Always put IB4's predictee last!
        -- Need to implement k updates to similarity fn per classification
        -- Need a translation process from multi-discrete-value to 
           multi-binary-attributes.  Not done automatically?  Right.
     2. k>=1
     3. -norm_none, -norm_linear, and -norm_sd options
     4. -missing_maxdiff, -missing_ave, -missing_ignore
     5. 3/9/94: -storeall option implemented
        - Tells IBn (n>1) to save (initially) all instances
     6. 3/9/94: -multiline option implemented
        - Allows alternative format for instances in input files
        - CRLF's allowed between attribute values within an instance
        - Each new instance must begin on a new line
   ====================================================================== */

/*==== Accessing the declarations. ====*/
#define MAIN 1
#include "datastructures.h"
#undef MAIN

/* ======================================================================
   Main Function
   ====================================================================== */
/*
 * Program entry point.  Reads the six required positional arguments
 * (description, names, training, testing, and output file names plus a
 * random seed), then hands the remaining argv entries to the option
 * parser.  Runs the experiment only if everything parses cleanly.
 *
 * Returns 0 on success, 1 when usage information had to be printed
 * (missing required inputs or a bad option).  The original fell off the
 * end of an implicit-int main, leaving the exit status unspecified.
 */
int main(int argc, char *argv[])
{
   int able_to_interpret_options();
   void experiment(), print_usage();

   if (argc < (1 + NUMBER_OF_REQUIRED_INPUTS))
     { /*==== Missing some required inputs ====*/
       print_usage();
       return 1;
     }

   /*==== Read the required input parameters ====*/
   strcpy(descriptionfile, argv[1]);
   strcpy(namesfile, argv[2]);
   strcpy(trainingfile, argv[3]);
   strcpy(testingfile, argv[4]);
   strcpy(outputfile, argv[5]);
   srandom((unsigned)atoi(argv[6]));   /* seed the RNG before any sampling */

   /*==== Interpret the options, if possible ====*/
   if (able_to_interpret_options(&argv[0], argc))
     experiment();
   else
     { print_usage();
       return 1;
     }
   return 0;
}

/* ======================================================================
   Prints usage information for the program.
   ====================================================================== */
/*
 * Prints usage information for the program.
 * Defaults shown here are kept in sync with the defaults actually set in
 * able_to_interpret_options(): testrate=50, ib3 on (not ib2), startup =
 * min(testrate, reportrate).  The previously undocumented -learning_rate
 * option is now listed.
 */
void print_usage()
{
  printf("Usage: description namesfile trainfile testfile outputfile seed [options]\n");
  printf("\nRequired Parameters:\n");
  printf("   descriptionfile contains the predictee/predictor info\n");
  printf("   namesfile contains the datafile's format information\n");
  printf("   trainfile contains training instances\n");
  printf("   testfile contains testing instances\n");
  printf("   outputfile will contain the experiment's results\n");
  printf("   seed is used to initialize random variable generator\n");
  printf("\nUser Parameters: (name, default, and brief description)\n");
  printf("  -signif_accept   (75 confidence) above class frequency\n");
  printf("  -signif_drop     (75 confidence) below class frequency\n");
  printf("  -ib1             (off) Act like IB1 rather than IB3\n");
  printf("  -ib2             (off) Act like IB2 rather than IB3\n");
  printf("  -ib3             (on) Act like IB3\n");
  printf("  -ib4             (off) Act like IB4 rather than IB3\n");
  printf("  -k               (1) Number of nearest acceptables wanted\n");
  printf("  -norm_none       (off) No normalization option\n");
  printf("  -norm_linear     (on)  Linear normalization option\n");
  printf("  -norm_sd         (off) Standard deviation normalize option\n");
  printf("  -missing_maxdiff (on) Assume maximum possible difference\n");
  printf("  -missing_ave     (off) Assume average or most frequent value\n");
  printf("  -missing_ignore  (off) Ignore, Normalize the sim'y results\n");
  printf("  -storeall        (off) IBn>1: Save (initially) all instances\n");
  printf("  -multiline       (off) Allows alternative input format\n");
  printf("   - Instances can then appear on multiple lines\n");
  printf("   - Final line per instance must contain only one symbol: <\n");
  printf("\nConvenience Options:\n");
  printf("  -testrate            (50) How often to run on test set\n");
  printf("  -reportrate          (25) How often to report on things\n");
  printf("  -startup             (min of testrate/reportrate) When to start testing/reporting\n");
  printf("  -learning_rate       (10) IB4: step size; value is divided by 100\n");
  printf("  -overlap             (off) IB4: set of binary concepts\n");
  printf("  -best_concept_only   (off) IB4: if overlap, give only best\n");
  printf("  -probability_weights (off) IB4: Toggles between 2 methods\n");
  printf("  -printweights        (off) IB4: always print attribute weights\n");
  printf("  -testlast            (off) Test after finished?\n");
}

/* ======================================================================
   Tries to interpret the options. Yields 0 iff something is wrong. Else 1.
   ====================================================================== */
int able_to_interpret_options(argument,argc)
  int *argument,argc;
{
  /*==== Subfunctions ====*/
  extern int read_data_format();                      /*==== Utility ====*/
  extern void read_test_set();                        /*==== Testing ====*/
  int known_significance_level(), good_value_for_k();
  void print_known_significance_levels();
  double significance_level();
 
  /*==== Local Variables ====*/
  register int i,arg, startup_set=FALSE;
  char argstr[30];

  /*==== Setup optional argument defaults. ====*/
  signif_accept_level = 75.0;  /*==== 75% level of significance. ====*/ 
  signif_drop_level = 75.0;    /*==== 75% level of significance. ====*/
  testrate = 50;                     
  reportrate = 25;
  ib1 = FALSE; ib2 = FALSE;  ib3 = TRUE;  ib4= FALSE;
  norm_none = FALSE; norm_linear = TRUE; norm_sd = FALSE;
  missing_maxdiff = TRUE; missing_ave = FALSE; missing_ignore = FALSE;
  storeall = FALSE;
  multiline = FALSE;
  number_of_predictors = number_of_predictees = 0;
  k=1; 
  overlap = FALSE;
  best_concept_only = FALSE;
  weight_method = SIMPLE;
  printweights = FALSE;
  testlast = FALSE;
  learning_rate = 0.1;     /*==== Option values will be divided by 100.0 ====*/

  /*==== Now process the options. ====*/
  for(i=NUMBER_OF_REQUIRED_INPUTS+1; i<argc; i++)
    { strcpy(argstr,*(argument+i));
        if (strcmp(*(argument+i),"-testrate") == 0)
	 { if (i == (argc))
	     { printf("Test rate option need integer argument.\n");
	       return(FALSE);
             }
	   else
	     { arg = atoi(*(argument+i+1));
	       if ((arg<1) || (arg > 20000))
		  { printf("Test rate argument x: 1 <= x <= 20000.\n");
		    return(FALSE);
		  }
               else
		 { testrate = arg; i++; }
             }
	 }
      else if (strcmp(*(argument+i),"-reportrate") == 0)
	 { if (i == (argc))
	     { printf("Report rate option need integer argument.\n");
	       return(FALSE);
             }
	   else
	     { arg = atoi(*(argument+i+1));
	       if ((arg<1) || (arg > 10000))
		  { printf("Report rate argument x: 1 <= x <= 10000.\n");
		    return(FALSE);
		  }
               else
		 { reportrate = arg; i++; }
             }
	 }
      else if (strcmp(*(argument+i),"-signif_accept") == 0)
         { if (i == (argc))
            { printf("Frequency significance option needs integer arg.\n"); 
              return(FALSE);}
           else
            { arg = atoi(*(argument+i+1));
              if (!known_significance_level(arg))
                { printf("Frequency Significance argument: ");
                  print_known_significance_levels();
                  return(FALSE); }
              else
                { signif_accept_level = (double)arg; i++; }
	    }
	 }
      else if (strcmp(*(argument+i),"-signif_drop") == 0)
         { if (i == (argc))
            { printf("Instance record significance option needs int arg.\n"); 
              return(FALSE);}
           else
            { arg = atoi(*(argument+i+1));
              if (!known_significance_level(arg))
                { printf("Instance record significance argument: ");
                  print_known_significance_levels();

                  return(FALSE); }
              else
                { signif_drop_level = (double)arg; i++; }
	    }
	 }
      else if (strcmp(*(argument+i),"-ib1") == 0)
         { ib1 = TRUE; ib2 = FALSE; ib3 = FALSE; ib4 = FALSE;
         }
      else if (strcmp(*(argument+i),"-ib2") == 0)
         { ib2 = TRUE; ib1 = FALSE;  ib3 = FALSE; ib4 = FALSE;
         }
      else if (strcmp(*(argument+i),"-ib3") == 0)
         { ib3 = TRUE;  ib1 = FALSE; ib2 = FALSE; ib4 = FALSE;
         }
      else if (strcmp(*(argument+i),"-ib4") == 0)
         { ib3 = FALSE;  ib1 = FALSE; ib2 = FALSE; ib4 = TRUE;
         }
      else if (strcmp(*(argument+i),"-norm_none") == 0)
         { norm_none = TRUE; norm_linear = FALSE; norm_sd = FALSE; 
         }
      else if (strcmp(*(argument+i),"-norm_linear") == 0)
         { norm_none = FALSE; norm_linear = TRUE; norm_sd = FALSE; 
         }
      else if (strcmp(*(argument+i),"-norm_sd") == 0)
         { norm_none = FALSE; norm_linear = FALSE; norm_sd = TRUE; 
         }
      else if (strcmp(*(argument+i),"-missing_maxdiff") == 0)
         { missing_maxdiff = TRUE; missing_ave = FALSE; missing_ignore = FALSE;
         }
      else if (strcmp(*(argument+i),"-missing_ave") == 0)
         { missing_maxdiff = FALSE; missing_ave = TRUE; missing_ignore = FALSE;
         }
      else if (strcmp(*(argument+i),"-missing_ignore") == 0)
         { missing_maxdiff = FALSE; missing_ave = FALSE; missing_ignore = TRUE;
         }
      else if (strcmp(*(argument+i),"-storeall") == 0)
        storeall = TRUE;
      else if (strcmp(*(argument+i),"-multiline") == 0)
        multiline = TRUE;
      else if (strcmp(*(argument+i),"-k") == 0)
	 { if (i == (argc))
	     { printf("k option needs an odd integer argument.\n");
	       return(FALSE);
             }
	   else
	     { arg = atoi(*(argument+i+1));
	       if (!good_value_for_k(arg))
		  { printf("k argument: integer less than %d.\n",
                           MAX_VALUE_OF_K);
		    return(FALSE);
		  }
               else
		 { k = arg; i++; }
             }
	 }
      else if (strcmp(*(argument+i),"-startup") == 0)
         { if (i == (argc))
            { printf("Startup option needs int arg.\n"); 
              return(FALSE);}
           else
            { startup = atoi(*(argument+i+1));
              startup_set = TRUE;
              i++; 
	    }
	 }
      else if (strcmp(*(argument+i),"-overlap") == 0)
         overlap = TRUE;
      else if (strcmp(*(argument+i),"-best_concept_only") == 0)
         best_concept_only = TRUE;
      else if (strcmp(*(argument+i),"-probability_weights") == 0)
         weight_method = PROBABILITY_WEIGHTS;
      else if (strcmp(*(argument+i),"-printweights") == 0)
         printweights = TRUE;
      else if (strcmp(*(argument+i),"-testlast") == 0)
         testlast = TRUE;
      else if (strcmp(*(argument+i),"-learning_rate") == 0)
         { if (i == (argc))
            { printf("Learning rate option needs integer arg.\n"); 
              return(FALSE);}
           else
            { arg = atoi(*(argument+i+1));
              if ((arg<1) || (arg>25))
                { printf("Learning rate option must be in range [1,25].\n");
                  return(FALSE); }
              else
                { learning_rate = 0.01 * (double)arg; i++; }
	    }
	 }
      else
	{ printf("Unknown argument to ibl: %s\n",*(argument+i));
          return(FALSE);
        }
    }

  /*==== Initialization stuff that wound up here ====*/
  if (read_data_format() == INCORRECT)
    { printf("Bad data format.\n");
      return(FALSE);
    }
  unknown_value = -1000.1234;   /*==== Unknown attribute val placeholder ====*/
  read_test_set();     /*==== Reads test set ====*/
  signif_accept = significance_level(signif_accept_level);
  signif_drop = significance_level(signif_drop_level);
  if (!startup_set)
    { startup = reportrate;
      if (reportrate > testrate)
        startup = testrate;
    }
  return(TRUE);
} 

/* ======================================================================
   Outlines the 4 experiments.  
   ====================================================================== */
/* ======================================================================
   Runs one complete experiment: initialize the system state, then
   train and test the selected IBL algorithm, reporting progress.
   ====================================================================== */
void experiment()
{
   void initialization();
   extern void train_and_test();             /* Training */

   puts("Initializing variables...");
   initialization();
   puts("Training and testing...");
   train_and_test();
   puts("Finished Experiment.");
}

/* ======================================================================
   Initializes system variables and windows.
   ====================================================================== */
void initialization()
{ 
  register int i, attribute, j, predictee_a;

  /* Builds the human-readable run description (algorithm, normalization,
     and missing-value strategy strings), then zeroes/seeds every global
     counter, weight, and scale used during training.  All variables set
     here are globals declared in datastructures.h. */

  /*==== Initialize the algorithm string name. ====*/
  /* Format is "<k>-IBn", e.g. "1-IB3"; IB4 is the fall-through case. */
  if (ib1)
    sprintf(algorithm_string,"%d-IB1",k);
  else if (ib2)
    sprintf(algorithm_string,"%d-IB2",k);
  else if (ib3)
    sprintf(algorithm_string,"%d-IB3",k);
  else
    sprintf(algorithm_string,"%d-IB4",k);

  /*==== Initialize the normalization string name. ====*/
  if (norm_none)
    strcpy(normalization_string,"None");
  else if (norm_linear)
    strcpy(normalization_string,"Linear");
  else 
    strcpy(normalization_string,"Standard Deviation");

  /*==== Initialize the missing attribute values string name. ====*/
  if (missing_maxdiff)
    strcpy(missing_string,"Maximum Difference");
  else if (missing_ave)
    strcpy(missing_string,"Average or Most Frequent Value");
  else
    strcpy(missing_string,"Ignore and Normalize Similarity Results");

  /*==== Simple variables ====*/
  instance_number = 0;          /*==== Instance counter ====*/
  my_infinity = 999999.0;       /*==== Occasional usage. ====*/
  initusages = 0.0;             /*==== Initial number of usages. ====*/
  initcorrectcount = 0.0;       /*==== Initial count value. ====*/
  initincorrectcount = 0.0;     /*==== Initial count value. ====*/

  /*==== Others are initialized in setup_for_testing. ====*/
  /* Per-concept (predictee) training/storage counters. */
  num_data_in_alg = 0;
  num_dropped_in_alg = 0;
  for(i=0; i<number_of_predictees; i++)
    { num_train_in_concept[i] = 0;
      num_data_in_concept[i] = 0;
      num_dropped_in_concept[i] = 0;
      num_training_correct_in_concept[i] = 0;
      num_training_incorrect_in_concept[i] = 0;
      for(j=0; j<MAX_NUMBER_OF_VALUES; j++)
        num_train_with_value[i][j] = 0;
    }

  /*==== These are used for print_recent_training_accuracy ====*/
  num_training_correct_in_alg = 0;  
  num_training_incorrect_in_alg = 0;
  for(i=0; i<2; i++)   /*==== Indices mean: reporting and testing ====*/
    { num_training_correct_in_alg_previous[i] = 0;  
      num_training_incorrect_in_alg_previous[i] = 0;
      for(j=0; j <number_of_predictees; j++)
        { num_training_correct_in_concept_previous[i][j] = 0;
          num_training_incorrect_in_concept_previous[i][j] = 0;
        }
    }
       
  /*==== Initialize counts and usages. ====*/
  /* counts[] appears to be indexed [predictee][instance][outcome] and
     usages[] as [predictee][instance] -- confirm in datastructures.h. */
  for(i=0; i <MAX_NUMBER_OF_INSTANCES; i++)
    { num_uses[i] = 0;
      for(j=0; j<number_of_predictees; j++)
        { counts[j][i][CORRECT] = 0.0;
          counts[j][i][INCORRECT] = 0.0;
          usages[j][i] = 0.0;
	}
    }

  /*==== Attribute scales are updated dynamically during learning ====*/
  for(attribute=0; attribute<number_of_attributes; attribute++)
    { scalemin[attribute] = my_infinity;         /* Impossibly high */
      scalemax[attribute] = -1.0 * my_infinity;  /* Impossibly low */
    }

  /*==== These used to track the attribute weights ====*/
  /* Small nonzero seeds (0.01) avoid divide-by-zero when weights are
     first updated.  With SIMPLE weighting the used weight starts out
     uniform across predictors; with PROBABILITY_WEIGHTS it starts at
     0.5, as does the conditional probability estimate. */
  for(predictee_a=0; predictee_a<number_of_predictees; predictee_a++)
    for(attribute=0; attribute<number_of_predictors; attribute++)
       { /*==== 1. Simple weighting method ====*/
         attribute_weight_total[predictee_a][attribute] = 0.01;
         weight_size[predictee_a][attribute] = 0.01;     
         attribute_weight[predictee_a][attribute] = 0.01;
         attribute_weight_used[predictee_a][attribute] = 0.5;
         if (weight_method == SIMPLE)
           attribute_weight_used[predictee_a][attribute] = 
             1.0/(double)number_of_predictors;

         /*==== 2. Conditional probability weighting method ====*/
         conditional_probability[predictee_a][attribute] = 0.5;
       }

   /*==== Initialize stuff for norm_sd if it's set ====*/
   /* Running sums for incremental standard-deviation computation. */
   if (norm_sd)
     for(attribute=0; attribute<number_of_predictors; attribute++)
       { sum_values[attribute] = 0.0;
         sum_squared_values[attribute] = 0.0;
         number_of_known_values[attribute] = 0.0;
         standard_deviation[attribute] = 1.0;
       }

   /*==== Initialize variables needed for missing_ave ====*/
   /* Per-attribute tallies used to estimate average / most frequent
      values for missing-data substitution. */
   if (missing_ave)
     for(attribute=0; attribute<number_of_attributes; attribute++)
       { num_predictor_training_values[attribute] = 0;
         sum_predictor_values[attribute] = 0.0;
         for(i=0; i<MAX_NUMBER_OF_VALUES; i++)
  	   num_training_instances_with_value[attribute][i] = 0;
       }
}   

/* ====================================================================== 
   Prints known significance levels.
   ====================================================================== */
/* ======================================================================
   Lists the confidence levels accepted by known_significance_level().
   ====================================================================== */
void print_known_significance_levels()
{
   puts("{55,60,67,70,75,80,85,90,95,97,99}");
}

/* ======================================================================
   True iff the given is a known significance level.
   ====================================================================== */
/* ======================================================================
   True iff the given value is one of the known significance levels.
   ====================================================================== */
int known_significance_level(int arg)
{
   static const int known_levels[] =
     { 55, 60, 67, 70, 75, 80, 85, 90, 95, 97, 99 };
   register int idx;

   for (idx = 0; idx < (int)(sizeof known_levels / sizeof known_levels[0]); idx++)
     if (known_levels[idx] == arg)
       return(TRUE);
   return(FALSE);
}

/* ====================================================================== 
   Yields significance level value z for given significance level.
   Assumes that the value v is a known significance level.
   ====================================================================== */
/* ======================================================================
   Yields significance level value z for the given significance level.
   Assumes the value v is a known significance level; otherwise prints
   a failure message and returns -1.0.
   ====================================================================== */
double significance_level(double v)
{
   /* (confidence level, z-score) pairs for the supported levels. */
   static const struct { double level; double z; } z_table[] = {
     { 55.0, 0.125 }, { 60.0, 0.255 }, { 67.0, 0.44  }, { 70.0, 0.525 },
     { 75.0, 0.675 }, { 80.0, 0.843 }, { 85.0, 1.037 }, { 90.0, 1.285 },
     { 95.0, 1.645 }, { 97.0, 1.881 }, { 99.0, 2.325 }
   };
   register int idx;

   for (idx = 0; idx < (int)(sizeof z_table / sizeof z_table[0]); idx++)
     if (z_table[idx].level == v)
       return(z_table[idx].z);

   printf("FAILURE IN SIGNIFICANCE LEVEL (with value %f)!!!\n", v);
   return(-1.0);
}

/* ======================================================================
   TRUE iff the argument is a good setting for k.
   ======================================================================*/
/* ======================================================================
   TRUE iff the argument is a good setting for k (in [1, MAX_VALUE_OF_K]).
   ======================================================================*/
int good_value_for_k(int value)
{
   return (value >= 1 && value <= MAX_VALUE_OF_K) ? TRUE : FALSE;
}