doxygen/contin_8c_source.html

/*************************************************************************/

/*                                                                       */

/*  Copyright 2010 Rulequest Research Pty Ltd.                           */

/*                                                                       */

/*  This file is part of C5.0 GPL Edition, a single-threaded version     */

/*  of C5.0 release 2.07.                                                */

/*                                                                       */

/*  C5.0 GPL Edition is free software: you can redistribute it and/or    */

/*  modify it under the terms of the GNU General Public License as       */

/*  published by the Free Software Foundation, either version 3 of the   */

/*  License, or (at your option) any later version.                      */

/*                                                                       */

/*  C5.0 GPL Edition is distributed in the hope that it will be useful,  */

/*  but WITHOUT ANY WARRANTY; without even the implied warranty of       */

/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    */

/*  General Public License for more details.                             */

/*                                                                       */

/*  You should have received a copy of the GNU General Public License    */

/*  (gpl.txt) along with C5.0 GPL Edition.  If not, see                  */

/*                                                                       */

/*      <http://www.gnu.org/licenses/>.                                  */

/*                                                                       */

/*************************************************************************/


/*************************************************************************/

/*                                                                       */

/*      Evaluation of a test on a continuous valued attribute            */

/*      -----------------------------------------------------            */

/*                                                                       */

/*************************************************************************/


#include "defns.i"

#include "extern.i"


#define PartInfo(n) (-(n)*Log((n)/GEnv.Cases))


/*************************************************************************/

/*                                                                       */

/*      Continuous attributes are treated as if they have possible       */

/*      values 0 (unknown), 1 (not applicable), 2 (less than cut) and    */

/*      3 (greater than cut).                                            */

/*      This routine finds the best cut for cases Fp through Lp and      */

/*      sets Info[], Gain[] and Bar[]                                    */

/*                                                                       */

/*************************************************************************/


void EvalContinuousAtt(Attribute Att, CaseNo Fp, CaseNo Lp)

/*   -----------------  */

{

    CaseNo      i, j, BestI, Tries=0;

    double      LowInfo, LHInfo, LeastInfo=1E38,

                w, BestGain, BestInfo, ThreshCost=1;

    ClassNo     c;

    ContValue   Interval;


    Verbosity(3, fprintf(Of, "\tAtt %s\n", AttName[Att]))


    Gain[Att] = None;

    PrepareForContin(Att, Fp, Lp);


    /*  Special case when very few known values  */


    if ( GEnv.ApplicCases < 2 * MINITEMS )

    {

        Verbosity(2,

            fprintf(Of, "\tAtt %s\tinsufficient cases with known values\n",

                        AttName[Att]))

        return;

    }


    /*  Try possible cuts between cases i and i+1, and determine the

        information and gain of the split in each case  */


    /*  We have to be wary of splitting a small number of cases off one end,

        as this has little predictive power.  The minimum split GEnv.MinSplit is

        the maximum of MINITEMS or (the minimum of 25 and 10% of the cases

        per class)  */


    GEnv.MinSplit = 0.10 * GEnv.KnownCases / MaxClass;

    if ( GEnv.MinSplit > 25 ) GEnv.MinSplit = 25;

    if ( GEnv.MinSplit < MINITEMS ) GEnv.MinSplit = MINITEMS;


    /*  Find first possible cut point and initialise scan parameters  */


    i = PrepareForScan(Lp);


    /*  Repeatedly check next possible cut  */


    for ( ; i <= GEnv.Ep ; i++ )

    {

        c = GEnv.SRec[i].C;

        w = GEnv.SRec[i].W;

        assert(c >= 1 && c <= MaxClass);


        GEnv.LowCases   += w;

        GEnv.Freq[2][c] += w;

        GEnv.Freq[3][c] -= w;


        GEnv.HighVal = GEnv.SRec[i+1].V;

        if ( GEnv.HighVal > GEnv.LowVal )

        {

            Tries++;


            GEnv.LowClass  = GEnv.HighClass;

            GEnv.HighClass = GEnv.SRec[i+1].C;

            for ( j = i+2 ;

                  GEnv.HighClass && j <= GEnv.Ep && GEnv.SRec[j].V == GEnv.HighVal ;

                  j++ )

            {

                if ( GEnv.SRec[j].C != GEnv.HighClass ) GEnv.HighClass = 0;

            }


            if ( ! GEnv.LowClass || GEnv.LowClass != GEnv.HighClass || j > GEnv.Ep )

            {

                LowInfo = TotalInfo(GEnv.Freq[2], 1, MaxClass);


                /*  If cannot improve on best so far, count remaining

                    possible cuts and break  */


                if ( LowInfo >= LeastInfo )

                {

                    for ( i++ ; i <= GEnv.Ep ; i++ )

                    {

                        if ( GEnv.SRec[i+1].V > GEnv.SRec[i].V )

                        {

                            Tries++;

                        }

                    }

                    break;

                }


                LHInfo = LowInfo + TotalInfo(GEnv.Freq[3], 1, MaxClass);

                if ( LHInfo < LeastInfo )

                {

                    LeastInfo = LHInfo;

                    BestI     = i;


                    BestInfo = (GEnv.FixedSplitInfo

                                + PartInfo(GEnv.LowCases)

                                + PartInfo(GEnv.ApplicCases - GEnv.LowCases))

                               / GEnv.Cases;

                }


                Verbosity(3,

                {

                    fprintf(Of, "\t\tCut at %.3f  (gain %.3f):",

                           (GEnv.LowVal + GEnv.HighVal) / 2,

                           (1 - GEnv.UnknownRate) *

                           (GEnv.BaseInfo - (GEnv.NAInfo + LHInfo) / GEnv.KnownCases));

                    PrintDistribution(Att, 2, 3, GEnv.Freq, GEnv.ValFreq, true);

                })

            }


            GEnv.LowVal = GEnv.HighVal;

        }

    }


    BestGain = (1 - GEnv.UnknownRate) *

               (GEnv.BaseInfo - (GEnv.NAInfo + LeastInfo) / GEnv.KnownCases);


    /*  The threshold cost is the lesser of the cost of indicating the

        cases to split between or the interval containing the split  */


    if ( BestGain > 0 )

    {

        Interval = (GEnv.SRec[Lp].V - GEnv.SRec[GEnv.Xp].V) /

                   (GEnv.SRec[BestI+1].V - GEnv.SRec[BestI].V);

        ThreshCost = ( Interval < Tries ? Log(Interval) : Log(Tries) )

                     / GEnv.Cases;

    }


    BestGain -= ThreshCost;


    /*  If a test on the attribute is able to make a gain,

        set the best break point, gain and information  */


    if ( BestGain <= 0 )

    {

        Verbosity(2, fprintf(Of, "\tAtt %s\tno gain\n", AttName[Att]))

    }

    else

    {

        Gain[Att] = BestGain;

        Info[Att] = BestInfo;


        GEnv.LowVal  = GEnv.SRec[BestI].V;

        GEnv.HighVal = GEnv.SRec[BestI+1].V;


        /*  Set threshold, making sure that rounding problems do not

            cause it to reach upper value  */


        if ( (Bar[Att] = (ContValue) (0.5 * (GEnv.LowVal + GEnv.HighVal)))

             >= GEnv.HighVal )

        {

            Bar[Att] = GEnv.LowVal;

        }


        Verbosity(2,

            fprintf(Of, "\tAtt %s\tcut=%.3f, inf %.3f, gain %.3f\n",

                   AttName[Att], Bar[Att], Info[Att], Gain[Att]))

    }

}


/*************************************************************************/

/*                                                                       */

/*      Estimate max gain ratio available from any cut, using sample     */

/*      of SampleFrac of all cases                                       */

/*                                                                       */

/*************************************************************************/


void EstimateMaxGR(Attribute Att, CaseNo Fp, CaseNo Lp)

/*   -------------  */

{

    CaseNo      i, j;

    double      LHInfo, w, SplitInfo, ThisGain, GR;

    ClassNo     c;


    EstMaxGR[Att] = 0;


    if ( Skip(Att) || Att == ClassAtt ) return;


    PrepareForContin(Att, Fp, Lp);


    /*  Special case when very few known values  */


    if ( GEnv.ApplicCases < 2 * MINITEMS * SampleFrac )

    {

        return;

    }


    /*  Try possible cuts between cases i and i+1.  Use conservative

        value of GEnv.MinSplit to allow for sampling  */


    GEnv.MinSplit = 0.10 * GEnv.KnownCases / MaxClass;

    if ( GEnv.MinSplit > 25 ) GEnv.MinSplit = 25;

    if ( GEnv.MinSplit < MINITEMS ) GEnv.MinSplit = MINITEMS;


    GEnv.MinSplit *= SampleFrac * 0.33;


    i = PrepareForScan(Lp);


    /*  Repeatedly check next possible cut  */


    for ( ; i <= GEnv.Ep ; i++ )

    {

        c = GEnv.SRec[i].C;

        w = GEnv.SRec[i].W;

        assert(c >= 1 && c <= MaxClass);


        GEnv.LowCases   += w;

        GEnv.Freq[2][c] += w;

        GEnv.Freq[3][c] -= w;


        GEnv.HighVal = GEnv.SRec[i+1].V;

        if ( GEnv.HighVal > GEnv.LowVal )

        {

            GEnv.LowClass  = GEnv.HighClass;

            GEnv.HighClass = GEnv.SRec[i+1].C;

            for ( j = i+2 ;

                  GEnv.HighClass && j <= GEnv.Ep && GEnv.SRec[j].V == GEnv.HighVal ;

                  j++ )

            {

                if ( GEnv.SRec[j].C != GEnv.HighClass ) GEnv.HighClass = 0;

            }


            if ( ! GEnv.LowClass || GEnv.LowClass != GEnv.HighClass || j > GEnv.Ep )

            {

                LHInfo = TotalInfo(GEnv.Freq[2], 1, MaxClass)

                         + TotalInfo(GEnv.Freq[3], 1, MaxClass);


                SplitInfo = (GEnv.FixedSplitInfo

                            + PartInfo(GEnv.LowCases)

                            + PartInfo(GEnv.ApplicCases - GEnv.LowCases)) / GEnv.Cases;


                ThisGain = (1 - GEnv.UnknownRate) *

                           (GEnv.BaseInfo - (GEnv.NAInfo + LHInfo) / GEnv.KnownCases);

                if ( ThisGain > Gain[Att] ) Gain[Att] = ThisGain;


                /*  Adjust GR to make it more conservative upper bound  */


                GR = (ThisGain + 1E-5) / SplitInfo;

                if ( GR > EstMaxGR[Att] )

                {

                    EstMaxGR[Att] = GR;

                }


                Verbosity(3,

                {

                    fprintf(Of, "\t\tCut at %.3f  (gain %.3f):",

                           (GEnv.LowVal + GEnv.HighVal) / 2, ThisGain);

                    PrintDistribution(Att, 2, 3, GEnv.Freq, GEnv.ValFreq, true);

                })

            }


            GEnv.LowVal = GEnv.HighVal;

        }

    }


    Verbosity(2,

        fprintf(Of, "\tAtt %s: max GR estimate %.3f\n",

                    AttName[Att], EstMaxGR[Att]))

}


/*************************************************************************/

/*                                                                       */

/*      Routine to set some preparatory values used by both              */

/*      EvalContinuousAtt and EstimateMaxGR                              */

/*                                                                       */

/*************************************************************************/


void PrepareForContin(Attribute Att, CaseNo Fp, CaseNo Lp)

/*   ----------------  */

{

    CaseNo      i;

    ClassNo     c;

    DiscrValue  v;


    /*  Reset frequency tables  */


    ForEach(v, 0, 3)

    {

        ForEach(c, 1, MaxClass)

        {

            GEnv.Freq[v][c] = 0;

        }

        GEnv.ValFreq[v] = 0;

    }


    /*  Omit and count unknown and N/A values */


    GEnv.Cases = 0;


    if ( SomeMiss[Att] || SomeNA[Att] )

    {

        GEnv.Xp = Lp+1;


        ForEach(i, Fp, Lp)

        {

            assert(Class(Case[i]) >= 1 && Class(Case[i]) <= MaxClass);


            GEnv.Cases += Weight(Case[i]);


            if ( Unknown(Case[i], Att) )

            {

                GEnv.Freq[ 0 ][ Class(Case[i]) ] += Weight(Case[i]);

            }

            else

            if ( NotApplic(Case[i], Att) )

            {

                GEnv.Freq[ 1 ][ Class(Case[i]) ] += Weight(Case[i]);

            }

            else

            {

                GEnv.Freq[ 3 ][ Class(Case[i]) ] += Weight(Case[i]);

                GEnv.Xp--;

                GEnv.SRec[GEnv.Xp].V = CVal(Case[i], Att);

                GEnv.SRec[GEnv.Xp].W = Weight(Case[i]);

                GEnv.SRec[GEnv.Xp].C = Class(Case[i]);

            }

        }


        ForEach(c, 1, MaxClass)

        {

            GEnv.ValFreq[0] += GEnv.Freq[0][c];

            GEnv.ValFreq[1] += GEnv.Freq[1][c];

        }


        GEnv.NAInfo = TotalInfo(GEnv.Freq[1], 1, MaxClass);

        GEnv.FixedSplitInfo = PartInfo(GEnv.ValFreq[0]) + PartInfo(GEnv.ValFreq[1]);


        Verbosity(3, PrintDistribution(Att, 0, 1, GEnv.Freq, GEnv.ValFreq, true))

    }

    else

    {

        GEnv.Xp = Fp;


        ForEach(i, Fp, Lp)

        {

            GEnv.SRec[i].V = CVal(Case[i], Att);

            GEnv.SRec[i].W = Weight(Case[i]);

            GEnv.SRec[i].C = Class(Case[i]);


            GEnv.Freq[3][Class(Case[i])] += Weight(Case[i]);

        }


        ForEach(c, 1, MaxClass)

        {

            GEnv.Cases += GEnv.Freq[3][c];

        }


        GEnv.NAInfo = GEnv.FixedSplitInfo = 0;

    }


    GEnv.KnownCases  = GEnv.Cases - GEnv.ValFreq[0];

    GEnv.ApplicCases = GEnv.KnownCases - GEnv.ValFreq[1];


    GEnv.UnknownRate = 1.0 - GEnv.KnownCases / GEnv.Cases;


    Cachesort(GEnv.Xp, Lp, GEnv.SRec);


    /*  If unknowns or using sampling, must recompute base information  */


    if ( GEnv.ValFreq[0] > 0 || SampleFrac < 1 )

    {

        /*  Determine base information using GEnv.Freq[0] as temp buffer  */


        ForEach(c, 1, MaxClass)

        {

            GEnv.Freq[0][c] = GEnv.Freq[1][c] + GEnv.Freq[3][c];

        }


        GEnv.BaseInfo = TotalInfo(GEnv.Freq[0], 1, MaxClass) / GEnv.KnownCases;

    }

    else

    {

        GEnv.BaseInfo = GlobalBaseInfo;

    }

}


/*************************************************************************/

/*                                                                       */

/*      Set low and high bounds for scan and initial class               */

/*      (used by EvalContinuousAtt and EstimateMaxGR)                    */

/*                                                                       */

/*************************************************************************/


CaseNo PrepareForScan(CaseNo Lp)

/*     --------------  */

{

    CaseNo      i, j;

    ClassNo     c;

    double      w;


    /*  Find last possible split  */


    GEnv.HighCases = GEnv.LowCases = 0;


    for ( GEnv.Ep = Lp ; GEnv.Ep >= GEnv.Xp && GEnv.HighCases < GEnv.MinSplit ; GEnv.Ep-- )

    {

        GEnv.HighCases += GEnv.SRec[GEnv.Ep].W;

    }


    /*  Skip cases before first possible cut  */


    for ( i = GEnv.Xp ;

          i <= GEnv.Ep &&

          ( GEnv.LowCases + GEnv.SRec[i].W < GEnv.MinSplit - 1E-5 ||

            GEnv.SRec[i].V == GEnv.SRec[i+1].V ) ;

          i++ )

    {

        c = GEnv.SRec[i].C;

        w = GEnv.SRec[i].W;

        assert(c >= 1 && c <= MaxClass);


        GEnv.LowCases   += w;

        GEnv.Freq[2][c] += w;

        GEnv.Freq[3][c] -= w;

    }


    /*  Find the class key for the first interval  */


    GEnv.HighClass = GEnv.SRec[i].C;

    for ( j = i-1; GEnv.HighClass && j >= GEnv.Xp ; j-- )

    {

        if ( GEnv.SRec[j].C != GEnv.HighClass ) GEnv.HighClass = 0;

    }

    assert(GEnv.HighClass <= MaxClass);

    assert(j+1 >= GEnv.Xp);


    GEnv.LowVal = GEnv.SRec[i].V;


    return i;

}


/*************************************************************************/

/*                                                                       */

/*      Change a leaf into a test on a continuous attribute              */

/*                                                                       */

/*************************************************************************/


void ContinTest(Tree Node, Attribute Att)

/*   ----------  */

{

    Sprout(Node, 3);


    Node->NodeType = BrThresh;

    Node->Tested   = Att;

    Node->Cut      =

    Node->Lower    =

    Node->Upper    = Bar[Att];

}


/*************************************************************************/

/*                                                                       */

/*      Adjust thresholds of all continuous attributes so that cuts      */

/*      are values that appear in the data                               */

/*                                                                       */

/*************************************************************************/


void AdjustAllThresholds(Tree T)

/*   -------------------  */

{

    Attribute   Att;

    CaseNo      Ep;


    ForEach(Att, 1, MaxAtt)

    {

        if ( Continuous(Att) )

        {

            Ep = -1;

            AdjustThresholds(T, Att, &Ep);

        }

    }

}


void AdjustThresholds(Tree T, Attribute Att, CaseNo *Ep)

/*   ----------------  */

{

    DiscrValue  v;

    CaseNo      i;


    if ( T->NodeType == BrThresh && T->Tested == Att )

    {

        if ( *Ep == -1 )

        {

            ForEach(i, 0, MaxCase)

            {

                if ( ! Unknown(Case[i], Att) && ! NotApplic(Case[i], Att) )

                {

                    (&GEnv)->SRec[++(*Ep)].V = CVal(Case[i], Att);

                }

            }

            Cachesort(0, *Ep, (&GEnv)->SRec);


            if ( PossibleCuts && Trial == 0 )

            {

                int Cuts=0;


                ForEach(i, 1, *Ep)

                {

                    if ( (&GEnv)->SRec[i].V != (&GEnv)->SRec[i-1].V ) Cuts++;

                }

                PossibleCuts[Att] = Cuts;

            }

        }


        T->Cut = T->Lower = T->Upper = GreatestValueBelow(T->Cut, Ep);

    }


    if ( T->NodeType )

    {

        ForEach(v, 1, T->Forks)

        {

            AdjustThresholds(T->Branch[v], Att, Ep);

        }

    }

}


/*************************************************************************/

/*                                                                       */

/*      Return the greatest value of attribute Att below threshold Th    */

/*      (Assumes values of Att have been sorted.)                        */

/*                                                                       */

/*************************************************************************/


ContValue GreatestValueBelow(ContValue Th, CaseNo *Ep)

/*        ------------------  */

{

    CaseNo      Low, Mid, High;


    Low  = 0;

    High = *Ep;


    while ( Low < High )

    {

        Mid = (Low + High + 1) / 2;


        if ( (&GEnv)->SRec[Mid].V > Th )

        {

            High = Mid - 1;

        }

        else

        {

            Low = Mid;

        }

    }


    return (&GEnv)->SRec[Low].V;

}