/* istext.c - test if a file contains text or not. */

/* Copyright (C) 1997 Andrew McCallum

   Written by:  Andrew Kachites McCallum

   This file is part of the Bag-Of-Words Library, `libbow'.

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License
   as published by the Free Software Foundation, version 2.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */

/* NOTE(review): the header names on the two #include lines were lost in
   this copy of the file.  The standard headers below are the ones the
   code visibly needs; confirm whether the project's own header should
   be included here as well. */
#include <stdio.h>		/* for FILE, fread(), ftell(), fseek() */
#include <ctype.h>		/* for isprint(), etc. */

/* The percentage of characters that must be exceeded by text-like
   characters in order for us to say this is a text file. */
#define TEXT_PRINTABLE_PERCENT 95

/* Examine up to the first NUM_TEST_CHARS characters of `fp', starting at
   its current position, and return a non-zero value iff the sample looks
   like text.

   The sample is rejected (0 is returned) when
     - nothing could be read, or
     - the integer percentage of printable-or-whitespace characters is
       not strictly greater than TEXT_PRINTABLE_PERCENT, or
     - it looks like a uuencoded block: more than a third of its complete
       lines share one length AND fewer than a tenth of its characters
       are whitespace.

   The stream position is put back where it was on entry whenever
   ftell() was able to report it. */
int
bow_fp_is_text (FILE *fp)
{
  /* Enumeration constants are true integer constant expressions, so the
     arrays below are ordinary fixed-size arrays rather than VLAs. */
  enum
  {
    NUM_TEST_CHARS = 4096,
    /* N characters can contain N newlines, i.e. N complete lines plus
       one (possibly empty) trailing line, hence the extra slot. */
    NUM_LINE_LENGTHS = NUM_TEST_CHARS + 1
  };
  /* unsigned char: the <ctype.h> predicates are only defined for values
     representable as unsigned char (or EOF). */
  unsigned char buf[NUM_TEST_CHARS];
  int line_lengths[NUM_LINE_LENGTHS];
  int line_length_histogram[NUM_LINE_LENGTHS] = { 0 };
  int max_line_length_histogram_height = 0;
  int num_read;
  int num_printable = 0;
  int num_spaces = 0;
  int num_newlines = 0;
  long fpos;
  int i;

  /* Peek at the data, then restore the position.  If the stream cannot
     report a position (ftell() < 0) there is nothing to seek back to.
     The restore is best effort: this function has no way to report a
     seek failure to its caller. */
  fpos = ftell (fp);
  num_read = (int) fread (buf, sizeof buf[0], NUM_TEST_CHARS, fp);
  if (fpos >= 0)
    (void) fseek (fp, fpos, SEEK_SET);

  if (num_read <= 0)
    return 0;

  for (i = 0; i < num_read; i++)
    {
      int is_space = isspace (buf[i]);

      if (is_space || isprint (buf[i]))
	num_printable++;
      if (is_space)
	num_spaces++;
    }

  /* Integer division: the truncated percentage must exceed the
     threshold. */
  if ((100 * num_printable) / num_read <= TEXT_PRINTABLE_PERCENT)
    return 0;

  /* Test for uuencoded blocks by seeing if over 1/3 of the lines have
     identical length.  line_lengths[k] is the length of the k-th line,
     not counting its newline; the entry at index num_newlines is the
     unterminated tail and is not counted below. */
  line_lengths[0] = 0;
  for (i = 0; i < num_read; i++)
    {
      if (buf[i] == '\n')
	line_lengths[++num_newlines] = 0;
      else
	line_lengths[num_newlines]++;
    }

  /* Build the histogram of complete-line lengths and track its tallest
     bar as we go. */
  for (i = 0; i < num_newlines; i++)
    {
      int height = ++line_length_histogram[line_lengths[i]];

      if (height > max_line_length_histogram_height)
	max_line_length_histogram_height = height;
    }

  /* If over a 1/3 of the lines have the same length, and there is little
     whitespace, this file probably contains a uuencoded block.  Note
     that with fewer than three complete lines `num_newlines / 3' is
     zero, so any complete line satisfies the first condition. */
  if (max_line_length_histogram_height > num_newlines / 3
      && num_spaces < num_read / 10)
    return 0;

  return 1;
}