;;;; BACKPROP.LISP - A parallel distributed processing (PDP)
;;;; feed-forward system with one layer of hidden units.
;;;; The generalized delta rule is used for training
;;;; and the logistic function is used for activation.
;;;; (It is assumed that each node's output function
;;;; is the identity operator.)  For more details,
;;;; see chapter 8 of PDP1, especially pp. 322-335.

;;;; Copyright (C) 1988 by Jude William Shavlik and Raymond Joseph Mooney
;;;; This program may be freely copied, used, or
;;;; modified provided that this copyright notice
;;;; is included in each copy of this code and parts thereof.

;;;; NOTE(review): this file references several names defined elsewhere in the
;;;; system (e.g. *DOMAINS*, *INSTANCE-TESTER*, CONVERT-TO-BITS, ENCODE-INSTANCE,
;;;; SHORT-ENCODE-DOMAIN, LONG-ENCODE-DOMAIN, MAKE-GRAPH-WINDOW, PLOT-GRAPH-LINE,
;;;; TEST-CATEGORIES, and the FOR loop macro / GET-RANDOM used by MIX-UP).
;;;; They must be loaded before the corresponding entry points are used.

;;;; --------------------------------------------------------------------------------------
;;;;                                  Global Variables
;;;; --------------------------------------------------------------------------------------

(defvar *eta* 0.25 "Parameter that determines the learning rate.")
(defvar *alpha* 0.90 "Scale factor for the momentum term.")
(defvar *initial-wt-limit* 0.3 "The maximum magnitude of the initial wts.")
(defvar *epsilon* 0.1 "Acceptable error for correct output vector.")
(defvar *output-epsilon* 0.2 "Binary output is 1 if real output is this close to 1")
(defvar *trace-backprop* nil "Report i/o behavior periodically?")
(defvar *network* nil "The saved learned network")
(defvar *graph-on* nil "Produces graph of the output")
(defvar *graph-window* nil "The current graph window")

(defvar *most-positive-exponent* (log most-positive-long-float)
  "Used to prevent exponentiation overflow.")
(defvar *most-negative-exponent* (log long-float-epsilon)
  "Used to prevent exponentiation underflow.")

(defmacro trace-print (test-var &rest format-form)
  ;; Print using the format string only if test-var is nonNIL.
  `(if ,test-var (format t ,@format-form)))

(defstruct (network (:print-function network-printer))
;;; A PDP network, with one layer of hidden units.
;;; Input units fully connect to all of the hidden units and the hidden
;;; units fully connect to all of the output units.
;;; There are no direct connections between input and output units.
;;; Besides recording the network weights for inter-node links,
;;; the biases, activations, and error signals of nodes are recorded.
;;; In addition, the most recent changes (deltas) in the weights and
;;; biases are recorded.  These are used to compute a "momentum" term
;;; when adjusting weights and biases.  Empirically, networks often
;;; converge faster when the momentum term is used to "smooth" weight changes.
  input-to-hidden-wts      hidden-to-output-wts
  input-to-hidden-deltas   hidden-to-output-deltas
  hidden-unit-biases       output-unit-biases
  hidden-bias-deltas       output-bias-deltas
  hidden-activations       output-activations
  hidden-unit-errors       output-unit-errors)

;;;; --------------------------------------------------------------------------------------
;;;;                                 THE MAIN FUNCTIONS
;;;; --------------------------------------------------------------------------------------

(defun backprop-m (examples &optional short-encode-flag (hidden-units 4)
                            randomize-examples? (report-freq 10)
                            (max-cycles 1000000))
;;; Backprop for multi-valued feature examples.  First encodes examples into binary
;;; strings and then calls BACKPROP.  If short-encode-flag is set uses log(n) bits
;;; feature value otherwise uses 1 bit/feature-value.  See BACKPROP for purpose
;;; of other arguments
  (format t "~%Examples: ~A" examples)
  (backprop (backprop-example-form (convert-to-bits examples short-encode-flag))
            hidden-units randomize-examples? report-freq max-cycles))

(defun backprop-example-form (examples)
;;; Converts examples with + and - to indicate class to form needed for
;;; BACKPROP.  See BACKPROP.
  (mapcar #'(lambda (example)
              (cons (if (eq (first example) '+) '(1) '(0))
                    (second example)))
          examples))

(defun backprop (examples &optional (hidden-units 4) randomize-examples?
                          (report-freq 10) (max-cycles 1000000)
                 &aux network numb-correct (last-correctness 0) (cycle 0)
                      (numb-of-examples (length examples))
                      (input-units (length (rest (first examples))))
                      (output-units (length (first (first examples)))))
;;; Train a PDP network with these example i/o pairs.  The CAR of
;;; each example is the desired output when the CDR is the input.  (The
;;; CAR must be a list, i.e. ((1) 0 1 0) is a properly formatted example.)
;;; The number of hidden units desired can be specified - the number of
;;; input and output units is determined by the form of each example.
;;; If RANDOMIZE-EXAMPLES? is set, during each training cycle the training
;;; examples are randomly mixed-up.  Training stops when the network correctly
;;; learns the training examples or the maximum number of cycles is reached.
;;; The REPORT-FREQ variable specifies how often the system reports its performance.
  (format t "~%BACKPROP program initializing ... ~%~%")
  (setf network (build-network input-units hidden-units output-units t))
  (setf *network* network)
  (if *graph-on*
      (setf *graph-window*
            (make-graph-window (format nil "% Correct vs. cycle #: eta=~,3F" *eta*))))
  (loop
    (incf cycle)
    (if (zerop (mod cycle report-freq))
        (format t "~%Cycle ~A~%" cycle)
        (format t " ~A" cycle))
    (setf numb-correct 0)
;;; In each cycle, go through all of the training examples, adjusting weights
;;; after each example is presented.  Count the number of correct classifications
;;; and report the percentage correct.  If requested (by the *trace-backprop* flag),
;;; show the results on the training examples.  To speed things up, only report
;;; results if the current cycle is a multiple of the reporting frequency.
    (dolist (example (if randomize-examples? (mix-up examples) examples))
      (let ((input-vector (rest example))
            (desired-output (first example)))
        ;; Activate the current network, given this input.
        (activate-network network input-vector input-units hidden-units output-units)
        (let ((correct? (output-correct? desired-output network output-units)))
          (when (and *trace-backprop* (zerop (mod cycle (* 2 report-freq))))
;;; Every other time performance reporting occurs, report i/o behavior.
            (report-current-results network input-vector desired-output
                                    output-units correct?))
          (if correct?
              (incf numb-correct)
              (perform-back-propagation network input-vector desired-output
                                        input-units hidden-units output-units)))))
    (when (zerop (mod cycle report-freq))
      ;; If this is a "special" cycle, report the current results of the network.
      (format t "Percentage correct = ~5,3F%~%"
              (* 100 (/ numb-correct numb-of-examples)))
      (when *graph-on*
        (plot-graph-line (* 1 (- cycle report-freq))
                         (* 400 last-correctness)
                         (* 1 cycle)
                         (* 400 (/ numb-correct numb-of-examples))
                         *graph-window*)
        (setf last-correctness (/ numb-correct numb-of-examples))))
    (if (or (= numb-correct numb-of-examples) (>= cycle max-cycles))
        (return))) ; All classifications correct or "enough" training, so stop.
  (format t "~%~%There were ~A training cycles.~%~%" cycle)
  network)

(defun activate-network (network input-vector
                         &optional (input-units (length input-vector))
                                   (hidden-units (length (network-hidden-unit-biases network)))
                                   (output-units (length (network-output-unit-biases network))))
;;; Activate this network, given this input-vector.  (For efficiency reasons,
;;; the number of each type of network node is provided.)
  (dotimes (h hidden-units) ; Set the activations of the hidden units.
    (setf (aref (network-hidden-activations network) h)
          (logistic-activation-function
            (let ((answer 0)) ; Each hidden unit gets a weighted input from each input unit.
              (dotimes (i input-units answer)
                (incf answer (* (elt input-vector i)
                                (aref (network-input-to-hidden-wts network) i h)))))
            (aref (network-hidden-unit-biases network) h))))
  (dotimes (o output-units) ; Set the activations of the output units.
    (setf (aref (network-output-activations network) o)
          (logistic-activation-function
            (let ((answer 0)) ; Each output unit gets a weighted input from each hidden unit.
              (dotimes (h hidden-units answer)
                (incf answer (* (aref (network-hidden-activations network) h)
                                (aref (network-hidden-to-output-wts network) h o)))))
            (aref (network-output-unit-biases network) o)))))

(defun perform-back-propagation (network input-vector desired-output-vector
                                 &optional (input-units (length input-vector))
                                           (hidden-units (length (network-hidden-unit-biases network)))
                                           (output-units (length (network-output-unit-biases network))))
;;; Perform back-propagation to modify the network's weights in order to improve
;;; future performance.  It is assumed that the LOGISTIC activation function
;;; is used (see page 329 of PDP1).  For "smoothing", a momentum term is included.
;;; The momentum can be cancelled by setting *alpha* to zero.
  ;; First, determine the error signals.
  (dotimes (o output-units) ; Determine the errors of the output units.
    (setf (aref (network-output-unit-errors network) o)
          (let ((actual-output (aref (network-output-activations network) o)))
            (* (- (elt desired-output-vector o) actual-output)
               actual-output
               (- 1 actual-output)))))
  (dotimes (h hidden-units) ; Determine the errors of the hidden units.
    (setf (aref (network-hidden-unit-errors network) h)
          (let ((hidden-unit-activation (aref (network-hidden-activations network) h))
                (sum 0))
            (* hidden-unit-activation
               (- 1 hidden-unit-activation)
               (dotimes (o output-units sum)
                 (incf sum (* (aref (network-output-unit-errors network) o)
                              (aref (network-hidden-to-output-wts network) h o))))))))
  ;; Reset the weights in the network.
  (dotimes (h hidden-units)
    (dotimes (o output-units) ; Update the weights from the hidden to the output units.
      (let ((delta (delta-rule (aref (network-output-unit-errors network) o)
                               (aref (network-hidden-activations network) h)
                               (aref (network-hidden-to-output-deltas network) h o))))
        ;; Remember the weight momentum and update the weight.
        (setf (aref (network-hidden-to-output-deltas network) h o) delta)
        (incf (aref (network-hidden-to-output-wts network) h o) delta)))
    (dotimes (i input-units) ; Update the weights from the input to the hidden units.
      (let ((delta (delta-rule (aref (network-hidden-unit-errors network) h)
                               (elt input-vector i)
                               (aref (network-input-to-hidden-deltas network) i h))))
        (setf (aref (network-input-to-hidden-deltas network) i h) delta)
        (incf (aref (network-input-to-hidden-wts network) i h) delta))))
  ;; Reset the biases in the network.
  (dotimes (o output-units) ; Update each output unit's bias
    (let ((delta (delta-rule (aref (network-output-unit-errors network) o)
                             1
                             (aref (network-output-bias-deltas network) o))))
      ;; Remember the bias momentum and update the bias.
      (setf (aref (network-output-bias-deltas network) o) delta)
      (incf (aref (network-output-unit-biases network) o) delta)))
  (dotimes (h hidden-units) ; Update each hidden unit's bias.
    (let ((delta (delta-rule (aref (network-hidden-unit-errors network) h)
                             1
                             (aref (network-hidden-bias-deltas network) h))))
      (setf (aref (network-hidden-bias-deltas network) h) delta)
      (incf (aref (network-hidden-unit-biases network) h) delta))))

;;;; --------------------------------------------------------------------------------------
;;;;                                  UTILITY FUNCTIONS
;;;; --------------------------------------------------------------------------------------

(defun delta-rule (error activation previous-delta)
;;; Determine the weight change specified by the delta learning rule.
;;; Include a momentum term to reduce the chance of oscillations.
  (+ (* *eta* error activation) (* *alpha* previous-delta)))

(defun logistic-activation-function (total-weighted-input bias)
;;; A continuous, non-linear activation function is needed.
;;; It must be continuous if the generalized delta rule
;;; is to be used.  For hidden units to be beneficial, the
;;; activation function must also be non-linear.
;;; See equation 15 on page 329 of PDP1.
  (/ 1 (+ 1 (guarded-exp (- (+ total-weighted-input bias))))))

(defun guarded-exp (x)
;;; Prevent overflow during exponentiation.
  (cond ((<= x *most-negative-exponent*) long-float-epsilon)
        ((>= x *most-positive-exponent*) most-positive-long-float)
        (t (exp x))))

(defun build-network (input-units hidden-units output-units &optional randomize?)
;;; Construct a "feed-forward" PDP network with this many input, hidden, and output units.
;;; Each hidden unit is connected to each of the input units and each output unit is
;;; connected to each hidden unit.  There are no direct connections between input and
;;; output units.  If randomize? is set, the weights and biases are randomized.
  (make-network
    :output-unit-biases      (build-pdp-array (list output-units) randomize?)
    :hidden-unit-biases      (build-pdp-array (list hidden-units) randomize?)
    :output-bias-deltas      (build-pdp-array (list output-units))
    :hidden-bias-deltas      (build-pdp-array (list hidden-units))
    :output-activations      (build-pdp-array (list output-units))
    :hidden-activations      (build-pdp-array (list hidden-units))
    :output-unit-errors      (build-pdp-array (list output-units))
    :hidden-unit-errors      (build-pdp-array (list hidden-units))
    :hidden-to-output-wts    (build-pdp-array (list hidden-units output-units) randomize?)
    :input-to-hidden-wts     (build-pdp-array (list input-units hidden-units) randomize?)
    :hidden-to-output-deltas (build-pdp-array (list hidden-units output-units))
    :input-to-hidden-deltas  (build-pdp-array (list input-units hidden-units))))

(defun build-pdp-array (dimensions &optional randomize?)
;;; Construct an array of the specified size.  If requested, randomize the elements.
  (if randomize?
      (make-array dimensions :element-type 'long-float
                  :initial-contents (make-random-array-contents dimensions))
      (make-array dimensions :element-type 'long-float
                  :initial-element (coerce 0 'long-float))))

(defun make-random-array-contents (dimensions &aux temp)
;;; Construct a list representing an array of the specified dimensions.
;;; The elements of the array are randomly chosen.
  (if (null dimensions)
      (coerce (random-interval (- *initial-wt-limit*) *initial-wt-limit*) 'long-float)
      (dotimes (i (first dimensions) temp)
        (push (make-random-array-contents (rest dimensions)) temp))))

(defun output-correct? (desired-output network output-units &aux (all-right t))
;;; Return T iff all of the outputs are within epsilon of their desired values
  (dotimes (o output-units all-right)
    (unless (< (abs (- (elt desired-output o)
                       (aref (network-output-activations network) o)))
               *epsilon*)
      (setf all-right nil))))

(defun report-current-results (network input-vector desired-output output-units correct?)
;;; Report how well the desired output matches the actual output.
  (format t " Input=~A~% Desired Output=~A~% Actual Output=(" input-vector desired-output)
  (dotimes (o output-units)
    (format t " ~9,8F" (aref (network-output-activations network) o)))
  (if (not correct?)
      (format t ") X~%")
      (format t ")~%"))) ; "  <- balances editor string parsing; harmless comment

(defun compute-network-values (network instance)
;;; Return the list of binary output values computed by the network given the
;;; instance as input.
  (activate-network network instance)
  (let ((output nil))
    (dotimes (o (length (network-output-unit-biases network)) (reverse output))
      (if (< (- 1 (aref (network-output-activations network) o)) *output-epsilon*)
          (push 1 output)
          (push 0 output)))))

(defun network-printer (network stream depth
                        &aux (input-units (first (array-dimensions
                                                   (network-input-to-hidden-wts network))))
                             (hidden-units (length (network-hidden-unit-biases network)))
                             (output-units (length (network-output-unit-biases network))))
  ;; Print this PDP network in an understandable format.
  (declare (ignore depth))
  (format stream "~%Network Contents~%")
  (dotimes (o output-units)
    (format stream "~% Output Unit ~A - Bias = ~9,5F~%"
            o (aref (network-output-unit-biases network) o))
    (dotimes (h hidden-units)
      (format stream " Wt from hidden unit ~A = ~9,5F~%"
              h (aref (network-hidden-to-output-wts network) h o))))
  (dotimes (h hidden-units)
    (format stream "~% Hidden Unit ~A - Bias = ~9,5F~%"
            h (aref (network-hidden-unit-biases network) h))
    (dotimes (i input-units)
      (format stream " Wt from input unit ~A = ~9,5F~%"
              i (aref (network-input-to-hidden-wts network) i h)))))

(defun test-all-instances (&optional short-encode-flag)
;;; For each instance in the feature space compute and report the
;;; classification resulting from the learned *network*
  (let ((encoding-alists (mapcar #'(lambda (domain)
                                     (if short-encode-flag
                                         (short-encode-domain domain)
                                         (long-encode-domain domain)))
                                 *domains*)))
    (dolist (instance (all-possible-instances))
      (format t "~%~A is ~A" instance
              (if (equal (compute-network-values
                           *network*
                           (encode-instance instance encoding-alists))
                         '(1))
                  '+ '-)))))

(defun all-possible-instances (&optional (domains *domains*))
;;; Generate the list of all possible instances in the space
;;; defined by *domains*.
  (if (null (rest domains))
      (mapcar #'(lambda (value) (list value)) (first domains))
      (mapcan #'(lambda (value)
                  (mapcar #'(lambda (sub-example) (cons value sub-example))
                          (all-possible-instances (rest domains))))
              (first domains))))

(defun mix-up (list)
  "Randomize the elements in this list."
  ;; NOTE(review): FOR and GET-RANDOM are non-standard utilities defined
  ;; elsewhere in this system; this function will not run without them.
  (for pair in (sort (for item in list collect (cons (get-random) item))
                     #'(lambda (a b) (> (first a) (first b))))
       collect (rest pair)))

(defun random-interval (a b)
;;; Randomly choose a value between A and B.
  (+ a (* (- b a) (/ (random 1000000) 1000000))))

(defun seconds-since (time)
;;; Return seconds elapsed since given time (initially set by get-internal-run-time)
  (/ (- (get-internal-run-time) time) internal-time-units-per-second))

;;;; ==========================================================================================
;;;;              Functions for running and testing a single concept
;;;; ==========================================================================================

(defun backprop-test (examples train# &optional short-encode-flag (hidden-units 4)
                                                randomize-examples? (report-freq 10)
                                                (max-cycles 1000000))
;;; Run and test on the examples by using the first train# examples
;;; to train and the remaining to test
  (setf examples (convert-to-bits examples short-encode-flag))
  (let ((training-examples (subseq examples 0 train#))
        (testing-examples (subseq examples train#))
        (start-time (get-internal-run-time)))
    (backprop (backprop-example-form training-examples)
              hidden-units randomize-examples? report-freq max-cycles)
    (format t "~%~%Run time: ~,2Fs" (seconds-since start-time))
    (backprop-test-examples testing-examples)))

(defun backprop-test-examples (examples)
;;; Test on the given set of examples and report results
  (let ((correct# 0))
    (dolist (example examples)
      (let ((output (equal (compute-network-values *network* (second example)) '(1))))
        (if (or (and (eq (first example) '+) output)
                (and (eq (first example) '-) (null output)))
            (incf correct#))))
    (format t "~%~%Percentage correct: ~A"
            (round (* 100 (/ correct# (length examples)))))))

;;;; ==========================================================================================
;;;; The following functions are for multiple concept (category) problems like the soybean data
;;;; ==========================================================================================

(defun make-backprop-examples (pos-instances neg-instances)
;;; Converts lists of positive and negative instances into a list of examples
;;; suitable for BACKPROP.
  (append (mapcar #'(lambda (instance) (cons '(1) instance)) pos-instances)
          (mapcar #'(lambda (instance) (cons '(0) instance)) neg-instances)))

(defun backprop-categories (category-list &optional (hidden-units 4) randomize-examples?
                                                    (report-freq 10) (max-cycles 1000000))
;;; BACKPROP for multiple concept learning problems.  The argument category-list
;;; should be a list of atoms which represent names of individual categories.  A list of
;;; instances for learning should be stored on the LEARN-INSTANCES property of each
;;; category name.  A single concept learning trial is run for each category in which the
;;; instances of that category are positive examples and instances of all other categories
;;; are negative examples.  If convergence is reached for a given category, then the
;;; learned network is stored on the NETWORK property of the category name.
  (setf *instance-tester* #'backprop-test-instance)
  (let ((start-time (get-internal-run-time)))
    (dolist (category-name category-list)
      (format t "~%~%Category: ~A" category-name)
      (backprop (make-backprop-examples
                  (get category-name 'learn-instances)
                  (mapcan #'(lambda (a) (copy-list (get a 'learn-instances)))
                          (remove category-name category-list)))
                hidden-units randomize-examples? report-freq max-cycles)
      (setf (get category-name 'NETWORK) *network*))
    (format t "~%~%Run time: ~,3Fs" (seconds-since start-time)))
  (backprop-test-categories category-list))

(defun backprop-test-instance (instance categories)
;;; Given an instance and a list of category names returns the subset of these categories
;;; which it is determined that the instance belongs to (i.e. for which the learned
;;; network for that category returns T.  For use with one network per category
;;; (BACKPROP-CATEGORIES)
  (let ((member-categories nil))
    (dolist (category categories member-categories)
      (if (equal (compute-network-values (get category 'NETWORK) instance) '(1))
          (push category member-categories)))))

(defun backprop-categories1 (category-list &optional (hidden-units 4) randomize-examples?
                                                     (report-freq 10) (max-cycles 1000000))
;;; BACKPROP for multiple concept learning problems.  The argument category-list
;;; should be a list of atoms which represent names of individual categories.  A list of
;;; instances for learning should be stored on the LEARN-INSTANCES property of each
;;; category name.  Builds one multiple-output network (1 output per disease)
;;; for classifying examples into all of the categories and formats instances
;;; of all diseases in multiple output form needed for BACKPROP.
  (setf *instance-tester* #'best-guess-test)
  (let ((start-time (get-internal-run-time))
        (output-encoding-alist (long-encode-domain category-list)))
    (backprop (mix-up (mapcan #'(lambda (pair)
                                  (mapcar #'(lambda (instance) (cons (cdr pair) instance))
                                          (get (car pair) 'learn-instances)))
                              output-encoding-alist))
              hidden-units randomize-examples? report-freq max-cycles)
    (format t "~%~%Run time: ~,3Fs"
            (/ (- (get-internal-run-time) start-time) internal-time-units-per-second)))
  (backprop-test-categories category-list))

(defun perfect-match-test (instance categories)
;;; Given an instance and a list of category names returns the subset of these categories
;;; which it is determined that the instance belongs to (i.e. for which the learned
;;; network for that category returns T.)  Assumes examples is in a category iff
;;; its activation is within *output-epsilon* of 1.
  (activate-network *network* instance)
  (let ((member-categories nil))
    (dotimes (o (length (network-output-unit-biases *network*)) member-categories)
      (if (> (aref (network-output-activations *network*) o) (- 1 *output-epsilon*))
          (push (elt categories o) member-categories)))))

(defun best-guess-test (instance categories)
;;; Given an instance and a list of category names returns the subset of these categories
;;; which it is determined that the instance belongs to (i.e. for which the learned
;;; multi-output network for that category returns T.)  Assumes in the one category
;;; with the maximum output activation.
  (activate-network *network* instance)
  (let ((max-output-unit 0))
    (dotimes (o (length (network-output-unit-biases *network*)))
      (if (> (aref (network-output-activations *network*) o)
             (aref (network-output-activations *network*) max-output-unit))
          (setf max-output-unit o)))
    (list (elt categories max-output-unit))))

(defun run-backprop-test (categories learn-nums test-num num-hidden)
;;; Run a full train/test experiment for each training-set size in learn-nums,
;;; reporting results under both the perfect-match and best-guess testing regimes.
  (dolist (learn-num learn-nums)
    (separate-instances categories learn-num test-num)
    (format t "~%~%Test with ~A training examples/category" learn-num)
    (backprop-categories1 categories num-hidden)
    (setf *instance-tester* #'perfect-match-test)
    (format t "~%~%Perfect match test")
    (test-categories categories)
    (setf *instance-tester* #'best-guess-test)
    (format t "~%~%Best guess test")
    (test-categories categories)))

(defun backprop-test-categories (categories &optional learn-instances?)
;;; To be used after *-CATEGORIES in order to test the generalizations learned
;;; on a set of new instances stored under the TEST-INSTANCES property of each category
;;; name in categories.  Reports % correct for each category and overall % correct.
  (let ((percent-sum 0) (percent 0))
    (dolist (category categories)
      (format t "~%~%Testing ~A instances" category)
      (let ((answers (mapcar #'(lambda (instance)
                                 (funcall *instance-tester* instance categories))
                             (get category (if learn-instances?
                                               'learn-instances
                                               'test-instances))))
            (count 0))
        (format t "~%~A" answers)
        (dolist (answer answers)
          (if (and (eq (first answer) category) (null (rest answer)))
              (incf count)))
        (setf percent (* 100 (/ count (length answers))))
        (incf percent-sum percent)
        (format t "~%Percent correct: ~,2F" percent)))
    (format t "~%~%Total percent correct: ~,2F" (/ percent-sum (length categories)))))

(defun separate-instances (categories num-learn-instances &optional num-test-instances)
;;; Separates a list of instances for a set of categories into learning and testing
;;; instances to facilitate experimentation in preparation for using *-categories
;;; and test-categories.  The variable categories should be bound to a list of
;;; category names whose values are a list of instances of that category.  This function
;;; stores the first num-learn-instances of these instances on the LEARN-INSTANCES
;;; property to be used for learning and stores the rest of the instances on the
;;; TEST-INSTANCES property for the purpose of testing.
;;; BUG FIX: RUN-BACKPROP-TEST calls this function with a third argument (test-num),
;;; which previously signaled a wrong-number-of-arguments error.  NUM-TEST-INSTANCES
;;; is now accepted: when supplied, at most that many instances are stored on the
;;; TEST-INSTANCES property; when omitted, behavior is exactly as before (all
;;; remaining instances are used for testing).
  (dolist (category categories)
    (setf (get category 'learn-instances)
          (subseq (eval category) 0 num-learn-instances))
    (setf (get category 'test-instances)
          (subseq (eval category) num-learn-instances
                  (if num-test-instances
                      (min (length (eval category))
                           (+ num-learn-instances num-test-instances))
                      (length (eval category)))))))