/* Handling command-line options that apply across the whole of libbow. */ #include #include /* For mkdir() and stat() */ #include #include #include #include /* Global variables whose value is set by bow_argp functions, but which must be examined by some other function (called later) in order to have any effect. */ /* Remove all but the top N words by selecting words with highest information gain */ int bow_prune_vocab_by_infogain_n = 0; /* Remove words that occur less than N times */ int bow_prune_vocab_by_occur_count_n = 0; /* The weight-setting and scoring method set on the command-line. */ bow_method *bow_argp_method = NULL; /* The directory in which we'll store word-vector data. */ const char *bow_data_dirname = NULL; /* If non-zero, print to stdout the contribution of each word to each class. */ int bow_print_word_scores = 0; /* If non-zero, use equal prior probabilities on classes when setting weights, calculating infogain and scoring */ int bow_uniform_class_priors = 0; /* If non-zero, use binary occurrence counts for words. */ int bow_binary_word_counts = 0; /* Don't lex any files with names matching this. */ const char *bow_exclude_filename = NULL; /* Value added to key to get the key of the opposite option. For example "do not use stoplist" has key 's'; "use stoplist" has key 's'+KEY_OPPOSITE. */ #define KEY_OPPOSITE 256 enum { APPEND_STOPLIST_FILE_KEY = 10000, PRINT_WORD_SCORES_KEY, UNIFORM_CLASS_PRIORS_KEY, NAIVEBAYES_SCORE_WITH_LOG_PROBS_KEY, BINARY_WORD_COUNTS_KEY, EXCLUDE_FILENAME_KEY }; static struct argp_option bow_options[] = { {0, 0, 0, 0, "General options", 1}, {"verbosity", 'v', "LEVEL", 0, "Set amount of info printed while running; " "(0=silent, 1=quiet, 2=show-progess,...5=max)"}, {"no-backspaces", 'b', 0, 0, "Don't use backspace when verbosifying progress (good for use in emacs)"}, {"data-dir", 'd', "DIR", 0, "Set the directory in which to read/write word-vector data " "(default=~/.)."}, {0, 0, 0, 0, "Lexing options", 2}, {"skip-header", 'h', 0, 0, "Avoid lexing news/mail headers by scanning forward until two newlines."}, {"no-stoplist", 's', 0, 0, "Do not toss lexed words that appear in the stoplist."}, {"use-stoplist", 's'+KEY_OPPOSITE, 0, 0, "Toss lexed words that appear in the stoplist. " "(usually the default, depending on lexer)"}, {"append-stoplist-file", APPEND_STOPLIST_FILE_KEY, "FILE", 0, "Add words in FILE to the stoplist."}, {"no-stemming", 'S'+KEY_OPPOSITE, 0, 0, "Do not modify lexed words with a stemming function. " "(usually the default, depending on lexer)"}, {"use-stemming", 'S', 0, 0, "Modify lexed words with the `Porter' stemming function."}, {"gram-size", 'g', "N", 0, "Create tokens for all 1-grams,... N-grams."}, {"exclude-filename", EXCLUDE_FILENAME_KEY, "FILENAME", 0, "When scanning directories for text files, skip files with name " "matching FILENAME."}, {0, 0, 0, 0, "Mutually exclusive choice of lexers", 3}, {"skip-html", 'H', 0, 0, "Skip HTML tokens when lexing."}, {"keep-html", 'H'+KEY_OPPOSITE, 0, OPTION_HIDDEN, "Treat HTML tokens the same as any other chars when lexing. (default)"}, {"lex-for-usenet", 'U', 0, 0, "Use a special lexer for UseNet articles, ignore some headers and " "uuencoded blocks."}, {0, 0, 0, 0, "Feature-selection options", 4}, {"prune-vocab-by-infogain", 'T', "N", 0, "Remove all but the top N words by selecting words with highest " "information gain."}, {"prune-vocab-by-occur-count", 'O', "N", 0, "Remove words that occur less than N times."}, {0, 0, 0, 0, "Weight-vector setting/scoring method options", 5}, {"method", 'm', "METHOD", 0, "Set the weight-method; METHOD may be one of: "}, {"print-word-scores", PRINT_WORD_SCORES_KEY, 0, 0, "During scoring, print the contribution of each word to each class."}, {"uniform-class-priors", UNIFORM_CLASS_PRIORS_KEY, 0, 0, "When setting weights, calculating infogain and scoring, use equal prior " "probabilities on classes."}, {"naivebayes-score-with-log-probs", NAIVEBAYES_SCORE_WITH_LOG_PROBS_KEY, 0, 0, "In NaiveBayes, when outputing scores, use log-probabilities, not " "probabilities."}, {"binary-word-counts", BINARY_WORD_COUNTS_KEY, 0, 0, "Instead of using integer occurrence counts of words to set weights, " "use binary absence/presence."}, {0, 0} }; static int parse_bow_opt (int opt, char *arg, struct argp_state *state) { switch (opt) { case 'v': bow_verbosity_level = atoi (optarg); break; case 'b': /* Don't print backspaces when verbosifying at level BOW_PROGRESS. */ bow_verbosity_use_backspace = 0; break; case 'd': /* Set name of the directory in which we'll store word-vector data. */ bow_data_dirname = arg; break; /* Lexing options. */ case 'h': /* Avoid lexing news/mail headers by scanning fwd until two newlines */ bow_default_lexer->document_start_pattern = "\n\n"; break; case 's': /* Do not toss lexed words that appear in the stoplist */ bow_default_lexer_simple->stoplist_func = NULL; break; case 's'+KEY_OPPOSITE: /* Toss lexed words that appear in the stoplist */ bow_default_lexer_simple->stoplist_func = bow_stoplist_present; break; case 'S': /* Modify lexed words with the `Porter' stemming function */ bow_default_lexer_simple->stem_func = bow_stem_porter; break; case 'S'+KEY_OPPOSITE: /* Do not modify lexed words with a stemming function. (default) */ bow_default_lexer_simple->stem_func = NULL; break; case APPEND_STOPLIST_FILE_KEY: bow_stoplist_add_from_file (arg); break; case 'g': /* Create tokens for all 1-grams,... N-grams */ { int n = atoi (arg); if (n == 0) { fprintf (stderr, "--gram-size, -N: gram size must be a positive int\n"); return ARGP_ERR_UNKNOWN; } bow_default_lexer_gram->gram_size = n; } case 'H': /* Skip HTML tokens when lexing */ bow_default_lexer_indirect->underlying_lexer = (bow_lexer*) bow_default_lexer_html; break; case 'H'+KEY_OPPOSITE: /* Treat HTML tokens the same as any other chars when lexing (default) */ bow_default_lexer_indirect->underlying_lexer = (bow_lexer*) bow_default_lexer_simple; break; case 'U': /* Use a special lexer for UseNet articles, ignore some headers and uuencoded blocks. */ bow_default_lexer_indirect->underlying_lexer = (bow_lexer*) bow_default_lexer_email; break; case EXCLUDE_FILENAME_KEY: bow_exclude_filename = arg; break; /* Feature selection options. */ case 'T': /* Remove all but the top N words by selecting words with highest information gain */ bow_prune_vocab_by_infogain_n = atoi (arg); break; case 'O': /* Remove words that occur less than N times */ bow_prune_vocab_by_occur_count_n = atoi (arg); break; case 'm': bow_argp_method = bow_method_at_name (arg); break; case PRINT_WORD_SCORES_KEY: bow_print_word_scores = 1; break; case UNIFORM_CLASS_PRIORS_KEY: bow_uniform_class_priors = 1; break; case NAIVEBAYES_SCORE_WITH_LOG_PROBS_KEY: /* Have NaiveBayes score using log probabilities */ ((bow_params_naivebayes*)(bow_method_naivebayes.params)) ->score_with_log_probabilities = bow_yes; break; case BINARY_WORD_COUNTS_KEY: /* Use binary absence/presence, instead of integer occurrence counts for words. */ bow_binary_word_counts = 1; break; case ARGP_KEY_INIT: /* Things to do before any arguments are processed. */ /* If the file ./.bow-stopwords exists, load the extra words into the stoplist. */ bow_stoplist_add_from_file ("./.bow-stopwords"); /* If the file ~/.bow-stopwords exists, load the extra words into the stoplist. */ { const char sw_fn[] = "/.bow-stopwords"; const char *home = getenv ("HOME"); char filename[strlen (home) + strlen (sw_fn) + 1]; strcpy (filename, home); strcat (filename, sw_fn); bow_stoplist_add_from_file (filename); } /* Build the default data directory name, in case it wasn't specified on the command line. */ assert (program_invocation_short_name); if (!bow_data_dirname) { char *homedir = getenv ("HOME"); char *dirname = bow_malloc (strlen (homedir) + strlen ("/.") + strlen (program_invocation_short_name) + 1); strcpy (dirname, homedir); strcat (dirname, "/."); strcat (dirname, program_invocation_short_name); bow_data_dirname = dirname; } case ARGP_KEY_ARG: break; case ARGP_KEY_END: /* Create the data directory, if it doesn't exist already. */ { struct stat st; int e; e = stat (bow_data_dirname, &st); if (e == 0) { /* Assume this means the file exists. */ if (!S_ISDIR (st.st_mode)) bow_error ("`%s' already exists, but is not a directory"); } else { if (mkdir (bow_data_dirname, 0777) == 0) bow_verbosify (bow_quiet, "Created directory `%s'.\n", bow_data_dirname); else if (errno != EEXIST) bow_error ("Couldn't create default data directory `%s'", bow_data_dirname); } } default: return ARGP_ERR_UNKNOWN; } return 0; } static char * _help_filter (int key, const char *text, void *input) { char *ret; /* Add the names of the available methods to the help text. */ if (key == 'm') { static const len = 1024; char methodnames[len]; int i; bow_method *m; methodnames[0] = '\0'; for (i = bow_methods->array->length-1; i >= 0; i--) { m = bow_sarray_entry_at_index (bow_methods, i); strcat (methodnames, m->name); strcat (methodnames, ", "); } strcat (methodnames, "default=naivebayes."); assert (strlen (methodnames) < len); ret = malloc (strlen (text) + len); strcpy (ret, text); strcat (ret, methodnames); return ret; } return (char*)text; } /* This may be used with argp_parse to parse standard libbow startup options, possible chained onto the end of a user argp structure. */ const struct argp bow_argp = { bow_options, /* data structure describing cmdline options */ parse_bow_opt, /* the function to handle the options */ 0, /* non-option argument documention */ 0, /* extra text printed before and after help */ 0, /* argp children */ _help_filter }; #define MAX_NUM_CHILDREN 10 struct argp_child bow_argp_children[MAX_NUM_CHILDREN] = { { &bow_argp, /* the argp structure */ 0, /* flags for child */ "Libbow options", /* optional header */ 999 /* group (general lib flags at end of help)*/ }, {0} }; /* The number of children already initialized in the const assignment above. */ static int bow_argp_children_length = 1; void bow_argp_add_child (struct argp_child *child) { #if 0 if (bow_argp_children_size == 0) { bow_argp_children_size = 5; bow_argp_children = bow_malloc (sizeof (struct argp_child) * bow_argp_children_size); bow_argp.children = bow_argp_children; } else if (bow_argp_children_length + 1 >= bow_argp_children_size) { bow_argp_children_size *= 2; bow_argp_children = bow_realloc (bow_argp_children, (sizeof (struct argp_child) * bow_argp_children_size)); bow_argp.children = bow_argp_children; } #endif assert (bow_argp_children_length < MAX_NUM_CHILDREN); memcpy (bow_argp_children + bow_argp_children_length, child, sizeof (struct argp_child)); bow_argp_children_length++; } static void _print_version (FILE *stream, struct argp_state *state) { if (argp_program_version) /* If this is non-zero, then the program's probably defined it, so let that take precedence over the default. */ fprintf (stream, "%s\n", argp_program_version); /* And because libbow is a changing, integral part, put our information out too. */ fprintf (stream, "libbow %d.%d\n", BOW_MAJOR_VERSION, BOW_MINOR_VERSION); } void (*argp_program_version_hook) (FILE *stream, struct argp_state *state) = _print_version;