mageec  0.1.0
MAchine Guided Energy Efficient Compilation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
contin.c
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Copyright 2010 Rulequest Research Pty Ltd. */
4 /* */
5 /* This file is part of C5.0 GPL Edition, a single-threaded version */
6 /* of C5.0 release 2.07. */
7 /* */
8 /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9 /* modify it under the terms of the GNU General Public License as */
10 /* published by the Free Software Foundation, either version 3 of the */
11 /* License, or (at your option) any later version. */
12 /* */
13 /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16 /* General Public License for more details. */
17 /* */
18 /* You should have received a copy of the GNU General Public License */
19 /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20 /* */
21 /* <http://www.gnu.org/licenses/>. */
22 /* */
23 /*************************************************************************/
24 
25 
26 
27 /*************************************************************************/
28 /* */
29 /* Evaluation of a test on a continuous valued attribute */
30 /* ----------------------------------------------------- */
31 /* */
32 /*************************************************************************/
33 
34 #include "defns.i"
35 #include "extern.i"
36 
37 #define PartInfo(n) (-(n)*Log((n)/GEnv.Cases))
38 
39 
40 /*************************************************************************/
41 /* */
42 /* Continuous attributes are treated as if they have possible */
43 /* values 0 (unknown), 1 (not applicable), 2 (less than cut) and */
44 /* 3 (greater than cut). */
45 /* This routine finds the best cut for cases Fp through Lp and */
46 /* sets Info[], Gain[] and Bar[] */
47 /* */
48 /*************************************************************************/
49 
50 
52 /* ----------------- */
53 {
54  CaseNo i, j, BestI, Tries=0;
55  double LowInfo, LHInfo, LeastInfo=1E38,
56  w, BestGain, BestInfo, ThreshCost=1;
57  ClassNo c;
58  ContValue Interval;
59 
60  Verbosity(3, fprintf(Of, "\tAtt %s\n", AttName[Att]))
61 
62  Gain[Att] = None;
63  PrepareForContin(Att, Fp, Lp);
64 
65  /* Special case when very few known values */
66 
67  if ( GEnv.ApplicCases < 2 * MINITEMS )
68  {
69  Verbosity(2,
70  fprintf(Of, "\tAtt %s\tinsufficient cases with known values\n",
71  AttName[Att]))
72  return;
73  }
74 
75  /* Try possible cuts between cases i and i+1, and determine the
76  information and gain of the split in each case */
77 
78  /* We have to be wary of splitting a small number of cases off one end,
79  as this has little predictive power. The minimum split GEnv.MinSplit is
80  the maximum of MINITEMS or (the minimum of 25 and 10% of the cases
81  per class) */
82 
83  GEnv.MinSplit = 0.10 * GEnv.KnownCases / MaxClass;
84  if ( GEnv.MinSplit > 25 ) GEnv.MinSplit = 25;
85  if ( GEnv.MinSplit < MINITEMS ) GEnv.MinSplit = MINITEMS;
86 
87  /* Find first possible cut point and initialise scan parameters */
88 
89  i = PrepareForScan(Lp);
90 
91  /* Repeatedly check next possible cut */
92 
93  for ( ; i <= GEnv.Ep ; i++ )
94  {
95  c = GEnv.SRec[i].C;
96  w = GEnv.SRec[i].W;
97  assert(c >= 1 && c <= MaxClass);
98 
99  GEnv.LowCases += w;
100  GEnv.Freq[2][c] += w;
101  GEnv.Freq[3][c] -= w;
102 
103  GEnv.HighVal = GEnv.SRec[i+1].V;
104  if ( GEnv.HighVal > GEnv.LowVal )
105  {
106  Tries++;
107 
108  GEnv.LowClass = GEnv.HighClass;
109  GEnv.HighClass = GEnv.SRec[i+1].C;
110  for ( j = i+2 ;
111  GEnv.HighClass && j <= GEnv.Ep && GEnv.SRec[j].V == GEnv.HighVal ;
112  j++ )
113  {
114  if ( GEnv.SRec[j].C != GEnv.HighClass ) GEnv.HighClass = 0;
115  }
116 
117  if ( ! GEnv.LowClass || GEnv.LowClass != GEnv.HighClass || j > GEnv.Ep )
118  {
119  LowInfo = TotalInfo(GEnv.Freq[2], 1, MaxClass);
120 
121  /* If cannot improve on best so far, count remaining
122  possible cuts and break */
123 
124  if ( LowInfo >= LeastInfo )
125  {
126  for ( i++ ; i <= GEnv.Ep ; i++ )
127  {
128  if ( GEnv.SRec[i+1].V > GEnv.SRec[i].V )
129  {
130  Tries++;
131  }
132  }
133  break;
134  }
135 
136  LHInfo = LowInfo + TotalInfo(GEnv.Freq[3], 1, MaxClass);
137  if ( LHInfo < LeastInfo )
138  {
139  LeastInfo = LHInfo;
140  BestI = i;
141 
142  BestInfo = (GEnv.FixedSplitInfo
143  + PartInfo(GEnv.LowCases)
144  + PartInfo(GEnv.ApplicCases - GEnv.LowCases))
145  / GEnv.Cases;
146  }
147 
148  Verbosity(3,
149  {
150  fprintf(Of, "\t\tCut at %.3f (gain %.3f):",
151  (GEnv.LowVal + GEnv.HighVal) / 2,
152  (1 - GEnv.UnknownRate) *
153  (GEnv.BaseInfo - (GEnv.NAInfo + LHInfo) / GEnv.KnownCases));
154  PrintDistribution(Att, 2, 3, GEnv.Freq, GEnv.ValFreq, true);
155  })
156  }
157 
158  GEnv.LowVal = GEnv.HighVal;
159  }
160  }
161 
162  BestGain = (1 - GEnv.UnknownRate) *
163  (GEnv.BaseInfo - (GEnv.NAInfo + LeastInfo) / GEnv.KnownCases);
164 
165  /* The threshold cost is the lesser of the cost of indicating the
166  cases to split between or the interval containing the split */
167 
168  if ( BestGain > 0 )
169  {
170  Interval = (GEnv.SRec[Lp].V - GEnv.SRec[GEnv.Xp].V) /
171  (GEnv.SRec[BestI+1].V - GEnv.SRec[BestI].V);
172  ThreshCost = ( Interval < Tries ? Log(Interval) : Log(Tries) )
173  / GEnv.Cases;
174  }
175 
176  BestGain -= ThreshCost;
177 
178  /* If a test on the attribute is able to make a gain,
179  set the best break point, gain and information */
180 
181  if ( BestGain <= 0 )
182  {
183  Verbosity(2, fprintf(Of, "\tAtt %s\tno gain\n", AttName[Att]))
184  }
185  else
186  {
187  Gain[Att] = BestGain;
188  Info[Att] = BestInfo;
189 
190  GEnv.LowVal = GEnv.SRec[BestI].V;
191  GEnv.HighVal = GEnv.SRec[BestI+1].V;
192 
193  /* Set threshold, making sure that rounding problems do not
194  cause it to reach upper value */
195 
196  if ( (Bar[Att] = (ContValue) (0.5 * (GEnv.LowVal + GEnv.HighVal)))
197  >= GEnv.HighVal )
198  {
199  Bar[Att] = GEnv.LowVal;
200  }
201 
202  Verbosity(2,
203  fprintf(Of, "\tAtt %s\tcut=%.3f, inf %.3f, gain %.3f\n",
204  AttName[Att], Bar[Att], Info[Att], Gain[Att]))
205  }
206 }
207 
208 
209 
210 /*************************************************************************/
211 /* */
212 /* Estimate max gain ratio available from any cut, using sample */
213 /* of SampleFrac of all cases */
214 /* */
215 /*************************************************************************/
216 
217 
219 /* ------------- */
220 {
221  CaseNo i, j;
222  double LHInfo, w, SplitInfo, ThisGain, GR;
223  ClassNo c;
224 
225  EstMaxGR[Att] = 0;
226 
227  if ( Skip(Att) || Att == ClassAtt ) return;
228 
229  PrepareForContin(Att, Fp, Lp);
230 
231  /* Special case when very few known values */
232 
233  if ( GEnv.ApplicCases < 2 * MINITEMS * SampleFrac )
234  {
235  return;
236  }
237 
238  /* Try possible cuts between cases i and i+1. Use conservative
239  value of GEnv.MinSplit to allow for sampling */
240 
241  GEnv.MinSplit = 0.10 * GEnv.KnownCases / MaxClass;
242  if ( GEnv.MinSplit > 25 ) GEnv.MinSplit = 25;
243  if ( GEnv.MinSplit < MINITEMS ) GEnv.MinSplit = MINITEMS;
244 
245  GEnv.MinSplit *= SampleFrac * 0.33;
246 
247  i = PrepareForScan(Lp);
248 
249  /* Repeatedly check next possible cut */
250 
251  for ( ; i <= GEnv.Ep ; i++ )
252  {
253  c = GEnv.SRec[i].C;
254  w = GEnv.SRec[i].W;
255  assert(c >= 1 && c <= MaxClass);
256 
257  GEnv.LowCases += w;
258  GEnv.Freq[2][c] += w;
259  GEnv.Freq[3][c] -= w;
260 
261  GEnv.HighVal = GEnv.SRec[i+1].V;
262  if ( GEnv.HighVal > GEnv.LowVal )
263  {
264  GEnv.LowClass = GEnv.HighClass;
265  GEnv.HighClass = GEnv.SRec[i+1].C;
266  for ( j = i+2 ;
267  GEnv.HighClass && j <= GEnv.Ep && GEnv.SRec[j].V == GEnv.HighVal ;
268  j++ )
269  {
270  if ( GEnv.SRec[j].C != GEnv.HighClass ) GEnv.HighClass = 0;
271  }
272 
273  if ( ! GEnv.LowClass || GEnv.LowClass != GEnv.HighClass || j > GEnv.Ep )
274  {
275  LHInfo = TotalInfo(GEnv.Freq[2], 1, MaxClass)
276  + TotalInfo(GEnv.Freq[3], 1, MaxClass);
277 
278  SplitInfo = (GEnv.FixedSplitInfo
279  + PartInfo(GEnv.LowCases)
280  + PartInfo(GEnv.ApplicCases - GEnv.LowCases)) / GEnv.Cases;
281 
282  ThisGain = (1 - GEnv.UnknownRate) *
283  (GEnv.BaseInfo - (GEnv.NAInfo + LHInfo) / GEnv.KnownCases);
284  if ( ThisGain > Gain[Att] ) Gain[Att] = ThisGain;
285 
286  /* Adjust GR to make it more conservative upper bound */
287 
288  GR = (ThisGain + 1E-5) / SplitInfo;
289  if ( GR > EstMaxGR[Att] )
290  {
291  EstMaxGR[Att] = GR;
292  }
293 
294  Verbosity(3,
295  {
296  fprintf(Of, "\t\tCut at %.3f (gain %.3f):",
297  (GEnv.LowVal + GEnv.HighVal) / 2, ThisGain);
298  PrintDistribution(Att, 2, 3, GEnv.Freq, GEnv.ValFreq, true);
299  })
300  }
301 
302  GEnv.LowVal = GEnv.HighVal;
303  }
304  }
305 
306  Verbosity(2,
307  fprintf(Of, "\tAtt %s: max GR estimate %.3f\n",
308  AttName[Att], EstMaxGR[Att]))
309 }
310 
311 
312 
313 /*************************************************************************/
314 /* */
315 /* Routine to set some preparatory values used by both */
316 /* EvalContinuousAtt and EstimateMaxGR */
317 /* */
318 /*************************************************************************/
319 
320 
322 /* ---------------- */
323 {
324  CaseNo i;
325  ClassNo c;
326  DiscrValue v;
327 
328  /* Reset frequency tables */
329 
330  ForEach(v, 0, 3)
331  {
332  ForEach(c, 1, MaxClass)
333  {
334  GEnv.Freq[v][c] = 0;
335  }
336  GEnv.ValFreq[v] = 0;
337  }
338 
339  /* Omit and count unknown and N/A values */
340 
341  GEnv.Cases = 0;
342 
343  if ( SomeMiss[Att] || SomeNA[Att] )
344  {
345  GEnv.Xp = Lp+1;
346 
347  ForEach(i, Fp, Lp)
348  {
349  assert(Class(Case[i]) >= 1 && Class(Case[i]) <= MaxClass);
350 
351  GEnv.Cases += Weight(Case[i]);
352 
353  if ( Unknown(Case[i], Att) )
354  {
355  GEnv.Freq[ 0 ][ Class(Case[i]) ] += Weight(Case[i]);
356  }
357  else
358  if ( NotApplic(Case[i], Att) )
359  {
360  GEnv.Freq[ 1 ][ Class(Case[i]) ] += Weight(Case[i]);
361  }
362  else
363  {
364  GEnv.Freq[ 3 ][ Class(Case[i]) ] += Weight(Case[i]);
365  GEnv.Xp--;
366  GEnv.SRec[GEnv.Xp].V = CVal(Case[i], Att);
367  GEnv.SRec[GEnv.Xp].W = Weight(Case[i]);
368  GEnv.SRec[GEnv.Xp].C = Class(Case[i]);
369  }
370  }
371 
372  ForEach(c, 1, MaxClass)
373  {
374  GEnv.ValFreq[0] += GEnv.Freq[0][c];
375  GEnv.ValFreq[1] += GEnv.Freq[1][c];
376  }
377 
378  GEnv.NAInfo = TotalInfo(GEnv.Freq[1], 1, MaxClass);
379  GEnv.FixedSplitInfo = PartInfo(GEnv.ValFreq[0]) + PartInfo(GEnv.ValFreq[1]);
380 
381  Verbosity(3, PrintDistribution(Att, 0, 1, GEnv.Freq, GEnv.ValFreq, true))
382  }
383  else
384  {
385  GEnv.Xp = Fp;
386 
387  ForEach(i, Fp, Lp)
388  {
389  GEnv.SRec[i].V = CVal(Case[i], Att);
390  GEnv.SRec[i].W = Weight(Case[i]);
391  GEnv.SRec[i].C = Class(Case[i]);
392 
393  GEnv.Freq[3][Class(Case[i])] += Weight(Case[i]);
394  }
395 
396  ForEach(c, 1, MaxClass)
397  {
398  GEnv.Cases += GEnv.Freq[3][c];
399  }
400 
401  GEnv.NAInfo = GEnv.FixedSplitInfo = 0;
402  }
403 
404  GEnv.KnownCases = GEnv.Cases - GEnv.ValFreq[0];
405  GEnv.ApplicCases = GEnv.KnownCases - GEnv.ValFreq[1];
406 
407  GEnv.UnknownRate = 1.0 - GEnv.KnownCases / GEnv.Cases;
408 
409  Cachesort(GEnv.Xp, Lp, GEnv.SRec);
410 
411  /* If unknowns or using sampling, must recompute base information */
412 
413  if ( GEnv.ValFreq[0] > 0 || SampleFrac < 1 )
414  {
415  /* Determine base information using GEnv.Freq[0] as temp buffer */
416 
417  ForEach(c, 1, MaxClass)
418  {
419  GEnv.Freq[0][c] = GEnv.Freq[1][c] + GEnv.Freq[3][c];
420  }
421 
422  GEnv.BaseInfo = TotalInfo(GEnv.Freq[0], 1, MaxClass) / GEnv.KnownCases;
423  }
424  else
425  {
426  GEnv.BaseInfo = GlobalBaseInfo;
427  }
428 }
429 
430 
431 
432 /*************************************************************************/
433 /* */
434 /* Set low and high bounds for scan and initial class */
435 /* (used by EvalContinuousAtt and EstimateMaxGR) */
436 /* */
437 /*************************************************************************/
438 
439 
441 /* -------------- */
442 {
443  CaseNo i, j;
444  ClassNo c;
445  double w;
446 
447  /* Find last possible split */
448 
449  GEnv.HighCases = GEnv.LowCases = 0;
450 
451  for ( GEnv.Ep = Lp ; GEnv.Ep >= GEnv.Xp && GEnv.HighCases < GEnv.MinSplit ; GEnv.Ep-- )
452  {
453  GEnv.HighCases += GEnv.SRec[GEnv.Ep].W;
454  }
455 
456  /* Skip cases before first possible cut */
457 
458  for ( i = GEnv.Xp ;
459  i <= GEnv.Ep &&
460  ( GEnv.LowCases + GEnv.SRec[i].W < GEnv.MinSplit - 1E-5 ||
461  GEnv.SRec[i].V == GEnv.SRec[i+1].V ) ;
462  i++ )
463  {
464  c = GEnv.SRec[i].C;
465  w = GEnv.SRec[i].W;
466  assert(c >= 1 && c <= MaxClass);
467 
468  GEnv.LowCases += w;
469  GEnv.Freq[2][c] += w;
470  GEnv.Freq[3][c] -= w;
471  }
472 
473  /* Find the class key for the first interval */
474 
475  GEnv.HighClass = GEnv.SRec[i].C;
476  for ( j = i-1; GEnv.HighClass && j >= GEnv.Xp ; j-- )
477  {
478  if ( GEnv.SRec[j].C != GEnv.HighClass ) GEnv.HighClass = 0;
479  }
480  assert(GEnv.HighClass <= MaxClass);
481  assert(j+1 >= GEnv.Xp);
482 
483  GEnv.LowVal = GEnv.SRec[i].V;
484 
485  return i;
486 }
487 
488 
489 
490 /*************************************************************************/
491 /* */
492 /* Change a leaf into a test on a continuous attribute */
493 /* */
494 /*************************************************************************/
495 
496 
497 void ContinTest(Tree Node, Attribute Att)
498 /* ---------- */
499 {
500  Sprout(Node, 3);
501 
502  Node->NodeType = BrThresh;
503  Node->Tested = Att;
504  Node->Cut =
505  Node->Lower =
506  Node->Upper = Bar[Att];
507 }
508 
509 
510 
511 /*************************************************************************/
512 /* */
513 /* Adjust thresholds of all continuous attributes so that cuts */
514 /* are values that appear in the data */
515 /* */
516 /*************************************************************************/
517 
518 
520 /* ------------------- */
521 {
522  Attribute Att;
523  CaseNo Ep;
524 
525  ForEach(Att, 1, MaxAtt)
526  {
527  if ( Continuous(Att) )
528  {
529  Ep = -1;
530  AdjustThresholds(T, Att, &Ep);
531  }
532  }
533 }
534 
535 
536 
538 /* ---------------- */
539 {
540  DiscrValue v;
541  CaseNo i;
542 
543  if ( T->NodeType == BrThresh && T->Tested == Att )
544  {
545  if ( *Ep == -1 )
546  {
547  ForEach(i, 0, MaxCase)
548  {
549  if ( ! Unknown(Case[i], Att) && ! NotApplic(Case[i], Att) )
550  {
551  (&GEnv)->SRec[++(*Ep)].V = CVal(Case[i], Att);
552  }
553  }
554  Cachesort(0, *Ep, (&GEnv)->SRec);
555 
556  if ( PossibleCuts && Trial == 0 )
557  {
558  int Cuts=0;
559 
560  ForEach(i, 1, *Ep)
561  {
562  if ( (&GEnv)->SRec[i].V != (&GEnv)->SRec[i-1].V ) Cuts++;
563  }
564  PossibleCuts[Att] = Cuts;
565  }
566  }
567 
568  T->Cut = T->Lower = T->Upper = GreatestValueBelow(T->Cut, Ep);
569  }
570 
571  if ( T->NodeType )
572  {
573  ForEach(v, 1, T->Forks)
574  {
575  AdjustThresholds(T->Branch[v], Att, Ep);
576  }
577  }
578 }
579 
580 
581 
582 /*************************************************************************/
583 /* */
584 /* Return the greatest value of attribute Att below threshold Th */
585 /* (Assumes values of Att have been sorted.) */
586 /* */
587 /*************************************************************************/
588 
589 
591 /* ------------------ */
592 {
593  CaseNo Low, Mid, High;
594 
595  Low = 0;
596  High = *Ep;
597 
598  while ( Low < High )
599  {
600  Mid = (Low + High + 1) / 2;
601 
602  if ( (&GEnv)->SRec[Mid].V > Th )
603  {
604  High = Mid - 1;
605  }
606  else
607  {
608  Low = Mid;
609  }
610  }
611 
612  return (&GEnv)->SRec[Low].V;
613 }