/******************************************************************/
/* IDT.PRO                                                        */
/* Torgos ID3-like system based on the gain-ratio measure         */
/******************************************************************/
/* impl. by     : Luis Torgo, Laboratorio Inteligencia Artificial */
/*                e Ciencas de Computacao,                        */
/*                Universidade do Porto,                          */
/*                Rua Campo Alegre 823,                           */
/*                4100 Porto,                                     */
/*                Portugal                                        */
/*                1989                                            */
/*                                                                */
/*                Thomas Hoppe                                    */
/*                Mommsenstr. 50                                  */
/*                1000 Berlin 12                                  */
/*                F.R.G.                                          */
/*                E-Mail: hoppet@db0tui11.pro                     */
/*                1990                                            */
/*                                                                */
/*  (c) copyright 1992  Luis Torgo, Thomas Hoppe                  */
/*                                                                */
/*                You can use this program provided that it is    */
/*                not for commercial purposes or for publication  */
/*                and that the program -including all comments-   */
/*                is not modified (except for changes due to      */
/*                different prolog dialects).                     */
/*                                                                */
/*  call        : id3                                             */
/*                                                                */
/******************************************************************/
 
/******************************************************************/
/* YAP-, C- and M-Prolog specific declaration of dynamical        */
/* clauses.                                                       */
/******************************************************************/
:- dynamic node/3.
:- dynamic decision_tree/1.
:- dynamic example/3.
:- dynamic attributes/1.
:- dynamic classes/1.
:- dynamic current_node/1.
:- dynamic table/3.
:- dynamic found/1.
 
/******************************************************************/
/*                                                                */
/*  call        : id3                                             */
/*                                                                */
/*  side effects: assertz and retracts clauses                    */
/*                                                                */
/******************************************************************/
/* id3 reads a filename from the terminal, initializes the know-  */
/* base, consults the correponding file builds a decision tree    */
/* and displays the tree.                                         */
/* The program assertz the following predicates, which must be    */
/* declared as dynamic in some Prolog dialects:                   */
/* node/3, decision_tree/1, example/3, attributes/1, classes/1,   */
/* current_node/1 and table/3.                                    */
/******************************************************************/
id3 :-
    repeat,
        nl,
        write('Which file to use ? '),
        read(FileName),nl,
        initialize_kb,
        consult(FileName),
        build_decision_tree,
        show_decision_tree,
        nl,
        write('Quit (y/n) ? '),
        read(y).
 
initialize_kb :-
    abolish(node,3),
    abolish(decision_tree,1),
    abolish(example,3),
    abolish(attributes,1),
    abolish(classes,1),
    abolish(current_node,1), !.
 
build_decision_tree :-
    generate_node_id(_),
    clause(attributes(Attributes),true),
    findbag(Ex,clause(example(Ex,_,_),true),Exs),
    id3(Exs,Attributes,Node),
    assert(decision_tree(Node)), !.
 
generate_node_id(Y) :-
    clause(current_node(X),true), !,
    retract(current_node(X)),
    Y is X + 1,
    assert(current_node(Y)).
generate_node_id(0) :-
    assert(current_node(0)).
 
/******************************************************************/
/*                                                                */
/*  call        : id3(+Examples,+Attributes,-Class)               */
/*                                                                */
/*  arguments   : Examples   = List of Examples                   */
/*                Attributes = List of Attributes                 */
/*                Class      = Node ID of Class or leaf(Class)    */
/*                                                                */
/******************************************************************/
/* ID3 determines an attribute-value pair which best splits the   */
/* examples according to the information-theoretical 'gain-ration'*/
/* measure. The attribute-value pair is deleted from the set of   */
/* all attribute-value pairs and the process of generating a sub- */
/* decision tree is called recursively with the according to the  */
/* attribute-value pair splitted examples. The recursion          */
/* terminates either if there is no more example to process or if */
/* all examples belong to the same class. In the last case        */
/* leaf(Class) is returned insteed of the SubtreeIDs.             */
/* In the end for every generated subtree an ID is generated and  */
/* the tree structure is asserted in the database.                */
/******************************************************************/
id3([],_,[]).
id3(Exs,_,[leaf(Class)]) :-
    termination_criterion(Exs,Class).
id3(Exs,Attributes,ID) :-
    get_best_attribute(Attributes,Exs,BestAttribute),
    split_values(BestAttribute,Exs,DividedValues),
    delete(BestAttribute,Attributes,NewAttributes),
    generate_subtrees(DividedValues,NewAttributes,SubtreeIDs),
    generate_node_id(ID),
    assert(node(ID,BestAttribute,SubtreeIDs)).
 
termination_criterion([Ex|Exs],Class) :-
    clause(example(Ex,Class,_),true),
    !,
    all_in_same_class(Exs,Class).
 
all_in_same_class([],_).
all_in_same_class([Ex|Exs],C) :-
    clause(example(Ex,C,_),true),
    !,
    all_in_same_class(Exs,C).
 
get_best_attribute(Attributes,Exs,BestAttribute) :-
    construct_contingency_table(Attributes,Exs),
    common_calculations(MC,N),
    calculate_parameter_classification(Attributes,MC,N,Values),
    get_best(Attributes,Values,BestAttribute).
 
construct_contingency_table(Attributes,Exs) :-
    clause(classes(Lc),true),
    length(Lc,NroColTab),
    abolish(table,3),
    initialize_contingency_tables(Attributes,NroColTab),
    construct_contingency_tables(Attributes,Exs).
 
initialize_contingency_tables([],_).
initialize_contingency_tables([A|As],NoCol) :-
    create_list_of_zeros(NoCol,List),
    assert(table(A,[],List)),
    initialize_contingency_tables(As,NoCol).
 
create_list_of_zeros(0,[]).
create_list_of_zeros(N,[0|R]) :-
     N1 is N-1,
     create_list_of_zeros(N1,R).
 
construct_contingency_tables([],_).
construct_contingency_tables([Attribute|Attributes],ExampleList) :-
    contingency_table(Attribute,ExampleList),
    !,
    construct_contingency_tables(Attributes,ExampleList).
 
contingency_table(_,[]).
contingency_table(Attribute,[Ex|Exs]) :-
     value(Attribute,Ex,V),
     position_of_class(Ex,Pc),
     update_table(Attribute,V,Pc),
     !,
     contingency_table(Attribute,Exs).
 
value(A,[A = V|_],V) :- !.
value(A,[_|Sels],V) :- value(A,Sels,V).
value(A,No,V) :-
      example(No,_,Ex),
      value(A,Ex,V).
 
position_of_class(Ex,Pc) :-
     clause(example(Ex,C,_),true),
     clause(classes(Classes),true),
     position(C,Classes,Pc).
 
update_table(Attribute,V,Pc) :-
     retract(table(Attribute,TabLines,TotClass)),
     modify_table(TabLines,V,Pc,NewLines),
     increment_position_list(1,Pc,TotClass,NewTotal),
     assert(table(Attribute,NewLines,NewTotal)).
 
modify_table([],V,Pc,[(V,Values,1)]) :-
     clause(classes(Classes),true),
     length(Classes,NoOfColums),
     create_list_of_zeros(NoOfColums,L),
     increment_position_list(1,Pc,L,Values).
modify_table([(V,Nums,Tot)|Rest],V,Pc,[(V,NewNums,NewTot)|Rest]) :-
     NewTot is Tot+1,
     increment_position_list(1,Pc,Nums,NewNums).
modify_table([X|Rest1],V,Pc,[X|Rest2]) :-
     modify_table(Rest1,V,Pc,Rest2).
 
increment_position_list(N,N,[X|R],[Y|R]) :-
     Y is X+1.
increment_position_list(N1,N,[X|R1],[X|R2]) :-
     N2 is N1+1,
     increment_position_list(N2,N,R1,R2).
 
common_calculations(MC,N) :-
    clause(table(_,_,Xjs),true),
    common_calculations(Xjs,0,0,MC,N).
 
common_calculations([],TotalSum,N,MC,N) :-
    MC is (-1 / N) * ( TotalSum - N * log(N) ).
common_calculations([Xj|Xjs],Ac1,Ac2,MC,N) :-
    NAc1 is Ac1 + Xj * log(Xj),
    NAc2 is Ac2 + Xj,
    common_calculations(Xjs,NAc1,NAc2,MC,N).
 
calculate_parameter_classification([],_,_,[]).
calculate_parameter_classification([A|As],MC,N,[V|Vs]) :-
    gain_ratio(A,MC,N,V),
    calculate_parameter_classification(As,MC,N,Vs).
 
gain_ratio(A,MC,N,GR) :-
    clause(table(A,Lines,_),true),
    calculate_factors_B_and_IV(Lines,N,0,0,B,IV),
    IM is MC - B,
    GR is IM / IV.
 
calculate_factors_B_and_IV([],N,Sum1,Sum2,B,IV) :-
    B  is ( -1 / N ) * ( Sum1 - Sum2 ),
    IV is ( -1 / N ) * ( Sum2 - N * log(N) ).
calculate_factors_B_and_IV([(_,L,TotL)|Rest],N,Ac1,Ac2,B,IV) :-
    sum_of_lines(L,0,SL),
    NAc1 is Ac1 + SL,
    NAc2 is Ac2 + TotL * log(TotL),
    calculate_factors_B_and_IV(Rest,N,NAc1,NAc2,B,IV).
 
sum_of_lines([],X,X).
sum_of_lines([0|Ns],Ac,Tot) :-
    sum_of_lines(Ns,Ac,Tot).
sum_of_lines([N|Ns],Ac,Tot) :-
    Nac is Ac + N * log(N),
    sum_of_lines(Ns,Nac,Tot).
 
get_best([A|As],[V|Vs],Result) :-
    best_value(As,Vs,(A,V),Result).
 
best_value([],[],(A,V),A).
best_value([A|As],[V|Vs],(TA,TV),Result) :-
    V > TV,
    best_value(As,Vs,(A,V),Result).
best_value([A|As],[V|Vs],(TA,TV),Result) :-
    best_value(As,Vs,(TA,TV),Result).
 
split_values(Attribute,Exs,Result) :-
    get_values(Attribute,Exs,Values),
    split_examples(Attribute,Values,Exs,Result).
 
get_values(Attribute,Exs,Vals) :-
    findbag(V,(member(Ex,Exs),value(Attribute,Ex,V)),Vs),
    remove_duplicates(Vs,Vals).
 
split_examples(_,[V],Exs,[(V,Exs)]).
split_examples(A,[V|Vs],Exs,[(V,VExs)|Rest]) :-
    findbag(Ex,(member(Ex,Exs),value(A,Ex,V)),VExs),
    difference(VExs,Exs,RestEx),
    split_examples(A,Vs,RestEx,Rest).
 
generate_subtrees([],_,[]).
generate_subtrees([(Value,Exs)|Rest1],Attributes,[(Value,Id)|Rest2]) :-
    id3(Exs,Attributes,Id),
    !,
    generate_subtrees(Rest1,Attributes,Rest2).
 
/******************************************************************/
/*                                                                */
/*  call        : show_decision_tree                              */
/*                                                                */
/******************************************************************/
/* A simple pretty-print procedure for displaying decision trees. */
/* In steed of this procedure, we can also generate rules from the*/
/* decision tree by traversing every path in the tree until a     */
/* leaf node was reached and collecting the attribute-value pairs */
/* of that path. Then the leaf node forms the head of a Horn-     */
/* formula and the set of attribute-value pairs of the path forms */
/* the body of the clause.                                        */
/******************************************************************/
show_decision_tree :-
        nl,
        clause(decision_tree(Node),true),
        show_subtree(Node,0), !.
 
show_subtree(NodeNo,Indent) :-
        clause(node(NodeNo,Attribute,SubtreeList),true),
        show_subtrees(SubtreeList,Attribute,Indent).
 
show_subtrees([],_,_) :- nl.
show_subtrees([(Value,[leaf(X)])|Brothers],Attribute,Indent) :-
        write(Attribute=Value), write(' '),
        write(' ==> '), write(class = X), nl,
        space(Indent),
        show_subtrees(Brothers,Attribute,Indent).
show_subtrees([(Value,NodeNo)|Brothers],Attribute,Indent) :-
        name(Attribute,List1), length(List1,N1),
        name(Value,List2), length(List2,N2),
        write(Attribute=Value),
        write(' and '),
        Offset is Indent + N1 + 1 + N2 + 5,
        show_subtree(NodeNo,Offset),
        space(Indent),
        show_subtrees(Brothers,Attribute,Indent).
 
/******************************************************************/
/* Utilit predicates                                              */
/******************************************************************/
space(0).
space(N) :-
        N > 0, write(' '), N1 is N - 1, space(N1).
 
remove_duplicates([],[]).
remove_duplicates([X|Xs],Ys) :-
      member(X,Xs),
      remove_duplicates(Xs,Ys).
remove_duplicates([X|Xs],[X|Ys]) :-
      remove_duplicates(Xs,Ys).
 
%length([],0).
%length([L|Ls],N) :-
%      length(Ls,N1),
%      N is N1+1.
 
position(X,L,P) :- position(X,1,L,P).
position(X,P,[X|_],P).
position(X,N,[_|R],P) :- N1 is N+1,position(X,N1,R,P).
 
delete(X,[X|Xs],Xs).
delete(X,[Y|Ys],[Y|Zs]) :-
    delete(X,Ys,Zs).
 
difference(L1,L2,L3) :-
    findbag(N,(member(N,L2),not(member(N,L1))),L3).
 
findbag(X,G,_) :-
     asserta(found(mark)), call(G),
       asserta(found(X)), fail .
findbag(_,_,L) :-
     collect_found([],L) .
 
collect_found(L,L1) :-
     getnext(X), collect_found([X|L],L1) .
collect_found(L,L) .
 
getnext(X) :-
     retract(found(X)), !, not (X == mark) .