;*------------------------------------------------------------------08/Mar/89-* ;* ;* CLASSWEB John Gennari ;* ;* Programmers: Patrick Young, Kevin Thompson, John Gennari, John Allen. ;* ;* This version of Classit and Cobweb is mostly colored by my style of ;* programming and documentation - John Gennari. See below for history ;* ;* For the reader, a warning - the first 2 or 3 (or 4?) pages are ;* initialization garbage: the `real' code begins with functions main-loop ;* and classify. ;*---------------------------------------------------------------------JHG----* #|***************************************************************************** IMPLEMENTATION OF CLASSIFY (COBWEB/CLASSIT); Patrick Young, and Kevin Thompson 5-9-88. Plus a few additions by John Allen, 2-89. History: 5/13 added missing attributes information. 6/3 salience on missing attributes. 6/3 extra value for outfile "t" to default name. 6/4 print in outfile some version, what the infile was, and time. 6/4 return tree again. 6/15 changed so that missing attributes (e.g. those with acount=0) aren't used, so I average by only those that have values. prob-att=value. 6/15 param for how nodes are printed. 6/15 ability to name each attribute yourself. 6/15 add ability for comments in input files. 2/14/89 Added :TREE to keyword list of run. Allow the user to pass in a pre-built tree. JAA. 2/14/89 Added :learning to keyword list of run. Allows you to specify learning or testing modes. Note that testing mode returns a list of lists, each list containing the name of the test object, the node it matches, and a list of all the predicted values specified by *PRED-ATTS* JAA. 2/15/89 Added *SPLIT* and *MERGE* to control the availability of the split and merge operators. JAA. 3/14/89 Rewrote and cleaned up "Classify", the top level classification routine. Added ModifyTree. Removed the subtracting instance business (see node-atts.lsp). Some pretty major re-writing here. *****************************************************************************|# (provide "classweb") (require "loop" "/ci/ua/gennari/bin/loop") (use-package "LOOP") (require "struct") (require "node-compute") (require "node-atts") (require "read-inst") (require "get-time") (require "globals") ; (proclaim '(optimize (safety 0) (speed 3))) #|***************************************************************************** T O P L E V E L P R O C E D U R E A N D U S E R I N T E R F A C E *****************************************************************************|# #|----------------------------------------------------------------15/Jun/88---- Function - USAGE Prints out a message describing the current usage of the function run. -------------------------------------------------------------------KThompso--|# (defun usage () (mapc #'(lambda (str) (format t (concatenate 'string "--- " str "~%"))) '( "USAGE: run " "and optional keyword arguments:" "OUTFILE: the name of an outfile for the results" " (defaults to standard-io), giving t as argument will create" " .out, e.g. infile soybean.data --> soybean.data.out" "PRINT-EACH: whether to print out the tree in between each iteration" " (defaults to nil)" "PRINT-FUNCTION: how to print out nodes; can be :short or :long" " (defaults to :short)" "ATT-TYPE: whether the attributes are :nominal or :numeric or :mixed" " (defaults to :nominal)" " if set to :mixed, infile is expected to have on first line of" " data a line of n or s, for numeric and symbolic, for each att" " " "ATT-NAMES: list of att-names (length num-atts) given. Must not have" " duplicates. Names of form (att1 att2 ...) generated" " automatically otherwise." "BREAKER: whether to stop inside the main loop of run to inspect the tree" " (defaults to nil)" "DEBUG: whether to print out debug statements about the program's innards" " (defaults to nil)" "" "TREE: Allows you to pass in a pre-built tree. (defaults to nil)" "" "LEARNING: Allows user to specify learning (t) or testing (nil) mode." " (defaults to t)" "" "PRED-ATTS: List of attribute to be predicted during testing mode. Every" " element must be found in ATT-NAMES (defaults to *PRED-ATTS*)" "" "THRESHOLD: Sets the Threshold parameter (defaults to 1.0, meaning" " no cutoff at all)" "" "Note that each line of the infile is expected to be in the form" " A1 A2 A3 A4 A5 . . . An cat-info" " where A1..An are attribute values, and cat-info is a label" " Lines that begin with a % are ignored." )) (values)) #|----------------------------------------------------------------04/Jun/88---- Function - PRINT-OUT-BANNER Inputs -> infile, outfile prints out banner to *OUTPUT-STREAM* -------------------------------------------------------------------KThompso--|# (defun print-out-banner (infile outfile) (pr-out "CLASSIFY running at ~A~%INFILE: ~A" (get-time) infile) (if outfile (pr-out "~%OUTFILE: ~A~3%" outfile) (pr-out "~3%")) (values)) #|----------------------------------------------------------------15/Jun/88---- Function - run Inputs -> input file, optional output file Returns -> the final tree. -------------------------------------------------------------------KThompso--|# (defun run (&optional (infile nil) (num-atts 0) ;number of attributes in input set. &key (outfile nil) (breaker nil) (debug nil) (att-type :nominal) (att-names nil) (print-function :short) (print-each nil) (tree nil) (threshold 1.0) (acuity 1.0) (learning t) (pred-atts *PRED-ATTS*)) (unless infile (usage) (return-from run (values))) (setq *LEARNING* learning) (setq *ACUITY* acuity) ;* Note that *THRESHOLD* is tied to acuity. -JHG (setq *THRESHOLD* (* threshold (inverse (sqrt acuity))) ) (setq *PRED-ATTS* pred-atts) (setup-node-printer print-function) (let ((input-stream (open infile :direction :input)) (outfile-name (if (eq outfile t) (concatenate 'string infile ".out") outfile))) ;*setup output files (setf *GLOBAL-DEBUG* debug) (setq *OUTPUT-STREAM* (if outfile (open outfile-name :direction :output :if-exists :new-version) *standard-output*)) (print-out-banner infile outfile-name) (setf *TYPE-LIST* (case att-type (:nominal (make-list num-atts :initial-element :nominal)) (:numeric (make-list num-atts :initial-element :numeric)) (:mixed (read-att-type input-stream num-atts)))) ;* error-check att-names (when att-names (assert (and (= num-atts (length att-names)) (equal (remove-duplicates att-names) att-names)) (att-names) "List of att-names ~A ~% is invalid -- must be length ~ ~D and have no duplicates~%:r to continue" att-names num-atts)) ;* create att-names (setq *ATT-NAMES* (or att-names *ATT-NAMES* (make-att-names num-atts))) ;* loop through remaining instances (setf tree (main-loop print-each breaker *ATT-NAMES* input-stream outfile num-atts tree)) ;* finished with instances (close input-stream) (let ((str (concatenate 'string "********************** Finished with instances at " (get-time)))) (pr-out "~%~A~2%" str) (if outfile (format t "~%~A~2%" str))) (if outfile (pr-out "~%~A~%" tree)) ;don't if to term; will be returned. (when outfile (close *OUTPUT-STREAM*)) tree ;return the tree. )) #|----------------------------------------------------------------15/Jun/88---- Function - MAIN-LOOP Inputs -> print-each breaker *ATT-NAMES* input-stream outfile num-atts tree Returns -> the tree resulting from looping through the instances. Or, if prediction is on, a list of attributes and predicted values. Actions -> Here is the basic incremental loop: an instance is read (into the global property list associated with 'obj), and it is incorporated into the hierarchy of concepts. Note that the first instance is handled slightly differently. -JHG 3/13/89 -------------------------------------------------------------------KThompso--|# (defun main-loop (print-each breaker att-names input-stream outfile num-atts tree) (let ((HALT (not (readinstance input-stream num-atts))) (instance-cntr 1) predicted ) (if (null tree) ; the tree is sometimes pre-set by the user. (progn (setq tree (create-node att-names)) (setq HALT (not (readinstance input-stream num-atts))) (setq instance-cntr (1+ instance-cntr))) ) (loop until HALT do (pr-out "~v%**** INCORPORATING new instance ~4D ~A~%" (if print-each 3 1) instance-cntr (given-name 'OBJ)) (if outfile (format t "~4D" instance-cntr)) (if *LEARNING* (setq tree (classify tree att-names 0)) (setq predicted (cons (cons (given-name 'obj) (classify tree att-names 0)) predicted))) (if print-each (pr-out "~%~A" tree)) (if breaker (break "stopping in run")) (setf (symbol-plist 'obj) nil) (setq instance-cntr (1+ instance-cntr)) (setq HALT (not (readinstance input-stream num-atts))) finally (if *LEARNING* (return tree) (return (reverse predicted))) ) )) #|***************************************************************************** C O N T R O L S T R U C T U R E F O R C L A S S W E B ---------------------------------------------------------------------------* (documented and modified 3/14/89, -JHG) CLASSIFY -- store a instance in the tree. Inputs: the "root" (the current parent), the list of attnames, and the level in the global tree (for debugging). Outputs: the new tree Algorithm: The action performed depends on the tree handed to store in. (1) The tree handed to classify is nil, in this case, we place the obj in at the top level. (This only happens with the first instance.) (2) The tree handed to classify is a single node with no children. In this case, classify will generalize the current node and give it two children, the original node and the new obj (3) The tree has children, in which case, we consider the category utility of (i) place the obj in a new child node, (ii) place the obj in one of the child nodes, (iii) merge the best two children in (ii) and place the obj with them, or (iv) consider splitting the children of the best node to the current level, placing the obj with the best of the children. Unless the obj is placed in a new child node, the algorithm is called recursively. As storage is called recursively, the root given will already have the new instance integrated (thus the call to integrate-obj if the level is zero, but not otherwise). However, if the new root has no children, then root will not have obj incorporated (see ModifyTree) This avoids having to "subtract" an instance from a concept. ------------------------------------------------------------------Pyoung----|# (defun classify (root att-names level) (cond ((null (node-children root)) ; root shouldn't have 'obj incorporated! (if *LEARNING* (fork-node root att-names) (predicted-values root))) ((> (node-attscores root) *THRESHOLD*) (if *GLOBAL-DEBUG* (pr-out " choosing not to continue...")) (if *LEARNING* root (predicted-values root)) ) (t (if (= level 0) (setq root (integrate-obj root att-names))) (store root att-names level)) ) ) #|----------------------------------------------------------------04/May/88---- Function - STORE (documented and modified 3/14/89, -JHG) Inputs -> root, the parent node, the list of attnames, and a level counter (for tracing). Returns -> If learning is on, the modified parent is returned (with obj usually incorporated). Otherwise, the tree is not modified, and a set of predictions are returned. Actions -> Here, the four options, merge, split, new disjunct, or best class are chosen amoung. For each of these,a "compute-CU-?" function is called (see node-compute.lsp). These return many things, including a pointer to a modified tree and a score. These scores are compard to choose an action. Finally, the tree is modified and a recursive call (to classify) is made -- See ModifyTree. -------------------------------------------------------------------KThompso--|# (defun store (root att-names level) (let (CU-self CU-best-child ;category-utility scores (CU-merge -1000) (CU-split -1000) winner ; winner from ordered list of CU scores self-tree best-child-tree ;resultant trees merge-tree split-tree self-node ;new node with only obj in it best-node second-node ;best (with obj integrated) and ; second best nodes to place obj in. old-best-node ;best node without obj integrated oldsplit-node ;best node of split w/o obj integrated merge-node split-node ;child nodes with obj for merge and ; split ) (multiple-value-setq (CU-self self-tree self-node) (compute-CU-self root att-names)) (multiple-value-setq (CU-best-child best-child-tree best-node second-node old-best-node) (compute-CU-best root att-names)) (if *LEARNING* ;dont do these if testing. (progn (if *MERGE* (multiple-value-setq (CU-merge merge-tree merge-node) (compute-CU-merge root best-node second-node old-best-node att-names))) (if *SPLIT* (multiple-value-setq (CU-split split-tree split-node oldsplit-node) (compute-CU-split root best-node old-best-node att-names))) )) (setq winner (max CU-self CU-best-child CU-merge CU-split)) (if *GLOBAL-DEBUG* (pr-out "===== Scores:Self = ~5F Best = ~5F Merge = ~5F Split = ~5F~%" CU-self CU-best-child CU-merge CU-split)) (pr-out "~VTlevel ~D:" (* level 3) level) (cond ((eq CU-self winner) (pr-out "putting instance by itself~%") (if *LEARNING* ; when testing return the best node and predicted self-tree ; values. (values #|(copy-node root)|# (predicted-values root)))) ((eq CU-best-child winner) (pr-out "putting with child ~A~%" (node-members old-best-node)) (if *LEARNING* (ModifyTree best-child-tree best-node old-best-node att-names level) (values-list (multiple-value-list (classify best-node att-names (1+ level)))) )) ((eq CU-merge winner) (pr-out "merging children ~A and ~A~%" (node-members old-best-node) (node-members second-node)) (ModifyTree merge-tree merge-node -999 att-names level) ) ; See below for "-999" explanation. ((eq CU-split winner) (pr-out "splitting child ~A~%" (node-members old-best-node)) (ModifyTree split-tree split-node oldsplit-node att-names level) ) ) )) ;*------------------------------------------------------------------15/Mar/89-* ;* Function - MODIFYTREE ;* ;* Inputs -> the tree (current root node), the node (the chosen child of ;* tree), the oldnode (the node w/o 'OBJ incorporated), the list of ;* attnames, and the level. ;* ;* Returns -> the new modified tree. ;* Actions -> This is where the hierarchy actually gets modified (via a ;* setf statement). It is called only from store, and only if the system ;* is about to recurse (not in the disjunct case). The If clause is a ;* look-ahead: if the node we are about to recurse on has no children, ;* then recurse on a node without the new obj incorporated, since we ;* are about to do a "fork-node". Note that this should never happen ;* if Merge is chosen (hence -999, above). ;*---------------------------------------------------------------------JHG----* (defun ModifyTree (tree node oldnode att-names level) (if (null (node-children node)) (setf (elt (node-children tree) (position node (node-children tree))) (classify oldnode att-names (1+ level))) (setf (elt (node-children tree) (position node (node-children tree))) (classify node att-names (1+ level))) ) tree ;return the new root ) #|----------------------------------------------------------------14/Feb/89---- Function - Predicted-values Input - node that the test case most closely matches. (Also uses *PRED-ATTS*) Output - a list of the predicted attributes in the given node. The variable *PRED-ATTS* specifies the attributes. -------------------------------------------------------------------Allen-----|# (defun predicted-values (node) (loop for att-name in *PRED-ATTS* for att = (find-att att-name (node-atts node)) if att collect att into answer else do (pr-out "can't predict about attribute ~A~%" att-name) finally (return answer) ) )