;;;; ID3.LISP ;;;; Quinlan's ID3 Algorithm for constructing decision trees ;;;; for learning from examples. ;;;; (See article by Quinlan in ML 1:1, 1986 or Mitchell's Decision Tree chapter.) ;;;; Copyright (C) 1988, 1989, 1995 by Jude William Shavlik. ;;;; This program may be freely copied, used, or ;;;; modified provided that this copyright notice ;;;; is included in each copy of this code and parts thereof. ;;;; This implementation of ID3 produces decision trees descriminating positive ;;;; and negative instances. Instances are represented by simple nominal features. ;;;; A keyword rather than a positional representation is used. See the file ;;;; FIGS.DATA for an example. Useful example-building programs are contained ;;;; in BUILD-EXAMPLES.LISP. RUN-ID3 provides a nice user interface. ;;;; Use MEASURE-CORRECTNESS to estimate the effectiveness of ID3. ;;;; Make sure the proper lisp is being used. (unless (member "AW" (mapcar #'package-name (list-all-packages)) :test #'EQUAL) (error "You must use lispAW, not lisp. (/p/course/cs540-shavlik/public/agent-world/lispAW)")) ;;;; Some lines in this file are longer than 80 characters. ;;;; You may wish to use "enscript -rG -fCourier7 -Plaser ID3.LISP" to print. ;;;; (At UW-Madison only, you can use: "print -landscape -Plaser ID3.LISP") ;;; --------------------------------------------------------------------------- ;;; Global variables used by ID3. ;;; --------------------------------------------------------------------------- (defvar *trace-id3* t "Produces a trace of operation if set.") (defvar *current-tree* nil "The most recently learned decision tree.") ;;; --------------------------------------------------------------------------- ;;; The Main Functions ;;; --------------------------------------------------------------------------- (defun id3 (examples possible-features splitting-function &optional (majority-class-of-parent '-)) "A simple version of Quinlan's ID3 Program - see Machine Learning 1:1, 1986. Numeric feature values, noisy data, and missing values are not handled." ;;; This function produces a decision tree that classifies the examples ;;; provided, using these features. The splitting function determines ;;; which feature should be used to split a collection of + and - examples. ;;; If all the examples are of the same type, a leaf node is returned. ;;; The resulting decision tree is a list of the form ;;; (feature (value1 subtree1) (value2 subtree2) ... ) ;;; or (decision #-of-examples-in-training-set-located-here) ;;; In the first case, depending on the value of feature, another decision ;;; tree must be traversed to make a decision. In the second case, a ;;; decision is recorded, along with the number of examples from the ;;; training set that would be placed at this node. ;;; See RUN-ID3 for a nice user interface to this function. ;;; It is assumed that every example has a valid value for each feature. ;;; The function VALIDATE-EXAMPLES can be used to pre-process examples. (cond ((null examples) ; No more examples, an "undecided" node. (list majority-class-of-parent 0)) ;Use the majority class at the parent node. ((all-positive? examples) `(+ , (length examples))) ((all-negative? examples) `(- , (length examples))) ((null possible-features) ;Used all the features, yet not all same class. (error "Out of features - inconsistent data:~% ~A" examples)) (t (let* ((chosen-feature (choose-feature examples possible-features splitting-function)) (remaining-features (remove chosen-feature possible-features)) (current-majority-class (if (> (count-positives examples) (count-negatives examples)) '+ '-))) ;used to fill `NULL' nodes (cons chosen-feature (mapcar #'(lambda (value) (list value (id3 (collect-examples-with-this-value examples chosen-feature value) remaining-features splitting-function current-majority-class))) (get-base-values chosen-feature))))))) (defun choose-feature (examples features splitting-function) "Choose an feature to split these examples into sub-groups. The method of doing so is specified by the third argument." (case splitting-function (random (choose-random features)) ; make an arbitrary choice (least-values nil) ; choose the one with the least possible values - TO BE WRITTEN (most-values nil) ; choose the feature with the most - TO BE WRITTEN (max-gain nil) ; use Quinlan's gain measure (pg. 90) to choose - TO BE WRITTEN (max-gain-ratio nil) ; use Quinlan's gain ratio measure (pg. 102) to choose - TO BE WRITTEN (otherwise (error "ERROR - unknown splitting function.")) )) (defun make-decision (example &optional (decision-tree *current-tree*) ) "Use this decision tree to classify this unclassified instance." ;; - TO BE WRITTEN (warning 'make-decision) ;Delete this line when you write this function. '-) ;;; --------------------------------------------------------------------------- ;;; Some useful utility functions. ;;; --------------------------------------------------------------------------- (defun run-id3 (&optional (examples *train-examples*) (splitting-function 'random) (examples-file "CATEGORIZED.DATA") (report-tree? *trace-id3*) &aux start-time) "Check these examples for correctness, build a decision tree, then draw the tree (if requested) and, finally, report some statistics about it." (when (null examples) (format t "~%~%Constructing the test set ... ") (build-example-lists examples-file) (format t " ~D training examples produced." (length *train-examples*)) (setf examples *train-examples*)) (format t "~%~%Building Decision Tree ...") (if (validate-examples examples) (progn (setf start-time (get-internal-run-time)) (setf *current-tree* (id3 examples *all-features* splitting-function)) (format t " finished in ~,3F sec.~%" (convert-to-sec (- (get-internal-run-time) start-time))) (if report-tree? (print-decision-tree)) (let ( (interior-nodes (count-interior-nodes *current-tree*)) (leaf-nodes (count-leaf-nodes *current-tree*))) (format t "~%~%Tree size=~A interior nodes=~A leaf-nodes=~A~%" (+ interior-nodes leaf-nodes) interior-nodes leaf-nodes)) (format t " positive leaves=~A negative leaves=~A undecided leaves=~A ~%~%" (count-matching-leaves *current-tree* '+) (count-matching-leaves *current-tree* '-) (count-matching-leaves *current-tree* '?)) (measure-correctness 'make-decision *trace-ID3* *test-examples*)) (format t "~%~% RUN ABORTED DUE TO ERRONEOUS TRAINING DATA.~%~%"))) (defun print-decision-tree (&optional (tree *current-tree*) (indent 0)) "Draw this decision tree, using indentation to indicate levels." (cond ((leaf-node? tree) (format t " ~A (~A)" (first tree) (second tree))) (t (mapc #'(lambda (sub-tree) (format t "~%") (dotimes (i (floor (/ indent 3))) (format t "| ")) (format t "~A=~A:" (first tree) (first sub-tree)) (print-decision-tree (second sub-tree) (+ indent 3))) (rest tree)))) nil) (defun count-interior-nodes (tree) "Count the interior (non-leaf) nodes in this tree." (if (leaf-node? tree) 0 (1+ (reduce #'+ (mapcar #'(lambda (arc) (count-interior-nodes (second arc))) (rest tree)))))) (defun count-leaf-nodes (tree) "Count the leaf nodes in this tree." (if (leaf-node? tree) 1 (reduce #'+ (mapcar #'(lambda (arc) (count-leaf-nodes (second arc))) (rest tree))))) (defun count-matching-leaves (tree match) "Count the number of leaf nodes in this tree that match the second argument." (cond ((leaf-node? tree) (if (eq (first tree) match) 1 0)) ((atom tree) 0) (t (+ (count-matching-leaves (first tree) match) (count-matching-leaves (rest tree) match))))) (defun leaf-node? (x) "Determine if this is a leaf node." (and (consp x) (member (first x) '(+ - ?))))