mageec  0.1.0
MAchine Guided Energy Efficient Compilation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
p-thresh.c
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Copyright 2010 Rulequest Research Pty Ltd. */
4 /* */
5 /* This file is part of C5.0 GPL Edition, a single-threaded version */
6 /* of C5.0 release 2.07. */
7 /* */
8 /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9 /* modify it under the terms of the GNU General Public License as */
10 /* published by the Free Software Foundation, either version 3 of the */
11 /* License, or (at your option) any later version. */
12 /* */
13 /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16 /* General Public License for more details. */
17 /* */
18 /* You should have received a copy of the GNU General Public License */
19 /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20 /* */
21 /* <http://www.gnu.org/licenses/>. */
22 /* */
23 /*************************************************************************/
24 
25 
26 
27 /*************************************************************************/
28 /* */
29 /* Soften thresholds for continuous attributes */
30 /* ------------------------------------------- */
31 /* */
32 /*************************************************************************/
33 
34 
35 #include "defns.i"
36 #include "extern.i"
37 
38 
39 /*************************************************************************/
40 /* */
41 /* Soften all thresholds for continuous attributes in tree T */
42 /* */
43 /*************************************************************************/
44 
45 
47 /* ------------ */
48 {
49  ResubErrs(T, 0, MaxCase);
50 
51  FindBounds(T, 0, MaxCase);
52 }
53 
54 
55 
56 /*************************************************************************/
57 /* */
58 /* Find resubstitution errors for tree T */
59 /* */
60 /*************************************************************************/
61 
62 
63 void ResubErrs(Tree T, CaseNo Fp, CaseNo Lp)
64 /* --------- */
65 {
66  CaseNo i, Bp, Ep, Missing;
67  CaseCount Cases=0, KnownCases, BranchCases, MissingCases;
68  double Factor;
69  DiscrValue v;
70  Boolean PrevUnitWeights;
71  Attribute Att;
72 
73  if ( ! T->NodeType )
74  {
75  T->Errors = T->Cases - T->ClassDist[T->Leaf];
76  return;
77  }
78 
79  /* Estimate errors for each branch */
80 
81  Att = T->Tested;
82  Missing = (Ep = Group(0, Fp, Lp, T)) - Fp + 1;
83 
84  if ( CostWeights )
85  {
86  MissingCases = SumNocostWeights(Fp, Ep);
87  KnownCases = SumNocostWeights(Ep+1, Lp);
88  }
89  else
90  {
91  MissingCases = CountCases(Fp, Ep);
92  KnownCases = Cases - MissingCases;
93  }
94 
95  PrevUnitWeights = UnitWeights;
96  if ( Missing ) UnitWeights = false;
97 
98  T->Errors = 0;
99  Bp = Fp;
100 
101  ForEach(v, 1, T->Forks)
102  {
103  Ep = Group(v, Bp + Missing, Lp, T);
104 
105  /* Bp -> first value in missing + remaining values
106  Ep -> last value in missing + current group */
107 
108  BranchCases = CountCases(Bp + Missing, Ep);
109 
110  Factor = ( ! Missing ? 0 :
111  ! CostWeights ? BranchCases / KnownCases :
112  SumNocostWeights(Bp + Missing, Ep) / KnownCases );
113 
114  if ( BranchCases + Factor * MissingCases >= MinLeaf )
115  {
116  if ( Missing )
117  {
118  ForEach(i, Bp, Bp + Missing - 1)
119  {
120  Weight(Case[i]) *= Factor;
121  }
122  }
123 
124  ResubErrs(T->Branch[v], Bp, Ep);
125 
126  T->Errors += T->Branch[v]->Errors;
127 
128  /* Restore weights if changed */
129 
130  if ( Missing )
131  {
132  for ( i = Ep ; i >= Bp ; i-- )
133  {
134  if ( Unknown(Case[i], Att) )
135  {
136  Weight(Case[i]) /= Factor;
137  Swap(i, Ep);
138  Ep--;
139  }
140  }
141  }
142 
143  Bp = Ep+1;
144  }
145  }
146 
147  UnitWeights = PrevUnitWeights;
148 }
149 
150 
151 
152 /*************************************************************************/
153 /* */
154 /* Calculate upper and lower bounds for each test on a continuous */
155 /* attribute in tree T, using cases from Fp to Lp. */
156 /* */
157 /* The lower bound is set so that the error rate of the GT branch */
158 /* on the cases between the bound and the threshold is double that */
159 /* of the correct (LE) branch; the upper bound is set similarly. */
160 /* */
161 /*************************************************************************/
162 
163 
164 void FindBounds(Tree T, CaseNo Fp, CaseNo Lp)
165 /* -------- */
166 {
167  int v;
168  CaseNo i, j, Kp, Bp, Ap, Missing, SplitI;
169  CaseCount w, LEErrs, GTErrs, KnownCases, SE;
170  ClassNo RealClass;
171  Attribute Att;
172  Boolean PrevUnitWeights;
173  double Factor;
174 
175  /* Stop when get to a leaf */
176 
177  if ( ! T->NodeType ) return;
178 
179  Kp = Group(0, Fp, Lp, T) + 1;
180  Missing = Kp - Fp;
181 
182  Att = T->Tested;
183  KnownCases = CountCases(Kp, Lp);
184 
185  /* Soften a threshold for a continuous attribute */
186 
187  if ( T->NodeType == BrThresh )
188  {
189  Verbosity(1, fprintf(Of, "\nTest %s <> %g\n", AttName[Att], T->Cut))
190 
191  /* Skip N/A values */
192 
193  Ap = Group(1, Kp, Lp, T) + 1;
194 
195  Quicksort(Ap, Lp, Att);
196 
197  /* Locate cut point and overall errors of the LE and GT branches */
198 
199  SplitI = Ap;
200  LEErrs = GTErrs = 0;
201  ForEach(i, Ap, Lp)
202  {
203  if ( CVal(Case[i], Att) <= T->Cut ) SplitI = i;
204  }
205 
206  T->Mid = (CVal(Case[SplitI], Att) + CVal(Case[SplitI+1], Att)) / 2;
207 
208  /* Consider cutoff points below and above the threshold.
209  The errors on the cases between the cutoff and the threshold
210  are computed for both the LE and GT branches. The additional
211  errors must be less than 0.5SE and, further, the errors
212  on the "other" branch must not exceed twice the errors
213  on the "real" branch, both after Laplace adjustment */
214 
215  SE = sqrt(T->Errors * (KnownCases - T->Errors) / (KnownCases + 1E-3))
216  * 2;
217 
218  LEErrs = GTErrs = 0;
219  j = SplitI;
220  for ( i = SplitI ; i > Ap ; i-- )
221  {
222  RealClass = Class(Case[i]);
223 
224  w = Weight(Case[i]);
225  GTErrs += w * ( TreeClassify(Case[i], T->Branch[3]) != RealClass );
226  LEErrs += w * ( TreeClassify(Case[i], T->Branch[2]) != RealClass );
227 
228  if ( CVal(Case[i-1], Att) < CVal(Case[i], Att) )
229  {
230  if ( GTErrs > 2 * LEErrs + 1 || GTErrs - LEErrs > 0.5 * SE )
231  {
232  break;
233  }
234 
235  j = i-1;
236  }
237  }
238  T->Lower = Min(T->Mid, CVal(Case[j], Att));
239 
240  LEErrs = GTErrs = 0;
241  j = SplitI+1;
242  for ( i = SplitI+1 ; i < Lp ; i++ )
243  {
244  RealClass = Class(Case[i]);
245 
246  w = Weight(Case[i]);
247  LEErrs += w * ( TreeClassify(Case[i], T->Branch[2]) != RealClass );
248  GTErrs += w * ( TreeClassify(Case[i], T->Branch[3]) != RealClass );
249 
250  if ( CVal(Case[i], Att) < CVal(Case[i+1], Att) )
251  {
252  if ( LEErrs > 2 * GTErrs + 1 || LEErrs - GTErrs > 0.5 * SE )
253  {
254  break;
255  }
256 
257  j = i+1;
258  }
259  }
260  T->Upper = Max(T->Mid, CVal(Case[j], Att));
261 
262  Verbosity(1,
263  fprintf(Of, "\tLower = %g, Upper = %g\n", T->Lower, T->Upper))
264  }
265 
266  /* Recursively scan each branch */
267 
268  PrevUnitWeights = UnitWeights;
269  if ( Missing > 0 ) UnitWeights = false;
270 
271  Bp = Fp;
272 
273  ForEach(v, 1, T->Forks)
274  {
275  Kp = Group(v, Bp + Missing, Lp, T);
276 
277  /* Bp -> first value in missing + remaining values
278  Kp -> last value in missing + current group */
279 
280  if ( Bp + Missing <= Kp &&
281  (Factor = CountCases(Bp + Missing, Kp) / KnownCases) > 1E-6 )
282  {
283  if ( Missing )
284  {
285  ForEach(i, Bp, Bp + Missing - 1)
286  {
287  Weight(Case[i]) *= Factor;
288  }
289  }
290 
291  FindBounds(T->Branch[v], Bp, Kp);
292 
293  /* Restore weights if changed */
294 
295  if ( Missing )
296  {
297  for ( i = Kp ; i >= Bp ; i-- )
298  {
299  if ( Unknown(Case[i], Att) )
300  {
301  Weight(Case[i]) /= Factor;
302  Swap(i, Kp);
303  Kp--;
304  }
305  }
306  }
307 
308  Bp = Kp+1;
309  }
310  }
311 
312  UnitWeights = PrevUnitWeights;
313 }