/* Implementation of the ?? heap for the bow library */ /* Copyright (C) 1997 Andrew McCallum Written by: Sean Slattery This file is part of the Bag-Of-Words Library, `libbow'. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation, version 2. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */ #include /* Some HEAP thingys */ #define PARENT(i) ((i) / 2) #define LEFT(i) (2 * (i)) #define RIGHT(i) (2 * (i) + 1) /* We index from 1->length - do the translation back to 0-(length-1) here */ #define HEAP_ELT(i) (heap->entry[i-1]) #define HEAP_KEY(i) (HEAP_ELT(i).current_di) #define HEAP_WI(i) (HEAP_ELT(i).wi) /* Return true if a belongs higher up the heap than b. For a Max Heap the compare should be >=, and for a Min Heap it should be <= */ #define HEAP_CMP(a,b) ((a) < (b)) /* Comparing the wi's mean we get the words out in order from lowest to highest. */ #define HEAP_HIGHER(a,b) (HEAP_CMP(HEAP_KEY(a), HEAP_KEY(b)) || ((HEAP_KEY(a) == HEAP_KEY(b)) && (HEAP_WI(a) < HEAP_WI(b)))) /* Function to turn an array of bow_dv_heap_elements into a proper heap */ void bow_heapify(bow_dv_heap *heap, int i) { int l, r, highest; bow_dv_heap_element tmp; l = LEFT(i); r = RIGHT(i); if ((l <= heap->length) && HEAP_HIGHER(l, i)) highest = l; else highest = i; if ((r <= heap->length) && HEAP_HIGHER(r, highest)) highest = r; if (highest != i) { tmp = HEAP_ELT(i); HEAP_ELT(i) = HEAP_ELT(highest); HEAP_ELT(highest) = tmp; /* Need to recurse to fix the subtree */ bow_heapify (heap, highest); } } /* Function to create a heap of the vectors of documents associated with each word in the word vector. */ bow_dv_heap * bow_make_dv_heap_from_wv (bow_wi2dvf *wi2dvf, bow_wv *wv) { int wv_index; /* an index into the "word vector" */ int heap_index; /* an index into the heap we are creating */ int wi, i; bow_dv_heap *heap; bow_dv *dv; heap = bow_malloc (sizeof (bow_dv_heap) + (sizeof (bow_dv_heap_element) * (wv->num_entries))); /* Dump all the elements into the heap */ heap_index = 0; for (wv_index = 0; wv_index < wv->num_entries; wv_index++) { /* Get the word index */ wi = wv->entry[wv_index].wi; /* Now fetch the list of documents associated with this word */ /* Use this function instead of accessing the wi2dvf->entry directly because the dv may need to be read from a file. */ dv = bow_wi2dvf_dv (wi2dvf, wi); if (!dv) continue; heap->entry[heap_index].dv = dv; assert (dv->idf == dv->idf); /* Check for NaN */ heap->entry[heap_index].wi = wi; heap->entry[heap_index].index = 0; heap->entry[heap_index].current_di = dv->entry[0].di; heap_index++; } /* Initialise the heap */ heap->length = heap_index; /* Now need to make this baby into a heap. We'll use i to index into a conceptual array with indices 1..length and convert those references to index an array of 0..(length-1) when required. */ for (i = (heap->length)/2; i > 0; i--) bow_heapify (heap, i); return heap; } /* Function to take the top element of the heap - move it's index along and place it back in the heap. */ void bow_dv_heap_update (bow_dv_heap *heap) { bow_dv_heap_element *top = &(heap->entry[0]); /* Increment the index */ (top->index)++; /* Check to make sure we have elements left to look at */ if (top->index < top->dv->length) { top->current_di = top->dv->entry[top->index].di; /* Heapify!! */ bow_heapify (heap, 1); } else { /* Here we draft in the end of the heap and Heapify */ heap->entry[0] = heap->entry[(heap->length) - 1]; (heap->length)--; bow_heapify (heap, 1); } } /* Function to make a heap from all the vectors of documents in the big data structure we've built - I hope it all fits.... */ bow_dv_heap * bow_make_dv_heap_from_wi2dvf(bow_wi2dvf *wi2dvf) { int wi; /* a "word index", index into WI2DVF */ int max_wi; /* the highest "word index" */ int hi; /* a "heap index", an index into the heap */ bow_dv_heap *heap; /* what we are creating and returning */ bow_dv *dv; max_wi = MIN (wi2dvf->size, bow_num_words ()); heap = bow_malloc (sizeof (bow_dv_heap) + (sizeof (bow_dv_heap_element) * (max_wi))); /* Dump all the vectors of documents into the array */ hi = 0; for (wi = 0; wi < max_wi; wi++) { dv = bow_wi2dvf_dv (wi2dvf, wi); if (dv) { heap->entry[hi].dv = dv; heap->entry[hi].wi = wi; heap->entry[hi].index = 0; heap->entry[hi].current_di = dv->entry[0].di; /* xxx It would be nice to check for values too high, also.*/ assert (dv->entry[0].di >= 0); hi++; } } /* Initialise the Heap */ heap->length = hi; for (hi = (heap->length)/2; hi > 0; hi--) bow_heapify (heap, hi); return heap; } void bow_dv_heap_free (bow_dv_heap *heap) { bow_free (heap); }