mageec  0.1.0
MAchine Guided Energy Efficient Compilation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
prune.c
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Copyright 2010 Rulequest Research Pty Ltd. */
4 /* */
5 /* This file is part of C5.0 GPL Edition, a single-threaded version */
6 /* of C5.0 release 2.07. */
7 /* */
8 /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9 /* modify it under the terms of the GNU General Public License as */
10 /* published by the Free Software Foundation, either version 3 of the */
11 /* License, or (at your option) any later version. */
12 /* */
13 /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16 /* General Public License for more details. */
17 /* */
18 /* You should have received a copy of the GNU General Public License */
19 /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20 /* */
21 /* <http://www.gnu.org/licenses/>. */
22 /* */
23 /*************************************************************************/
24 
25 
26 
27 /*************************************************************************/
28 /* */
29 /* Prune a decision tree and predict its error rate */
30 /* ------------------------------------------------ */
31 /* */
32 /*************************************************************************/
33 
34 
35 #include "defns.i"
36 #include "extern.i"
37 
38 
39 #define LocalVerbosity(x,s) if (Sh >= 0) {Verbosity(x,s)}
40 #define Intab(x) Indent(x, 0)
41 
42 #define UPDATE 1 /* flag: change tree */
43 #define REGROW 2 /* regrow branches */
44 #define REPORTPROGRESS 4 /* original tree */
45 #define UNITWEIGHTS 8 /* UnitWeights is true*/
46 
48 
49 double MaxExtraErrs, /* limit for global prune */
50  TotalExtraErrs; /* extra errors from ties */
51 Tree *XT; /* subtrees with lowest cost comp */
52 int NXT; /* number ditto */
53 float MinCC; /* cost compexity for XT */
54 Boolean RecalculateErrs; /* if missing values */
55 
56 
57 
58 
59 /*************************************************************************/
60 /* */
61 /* Prune tree T */
62 /* */
63 /*************************************************************************/
64 
65 
66 void Prune(Tree T)
67 /* ----- */
68 {
69  Attribute Att;
70  int i, Options;
71  Boolean Regrow;
72 
73  Verbosity(2, fprintf(Of, "\n"))
74 
75  Regrow = ( Trial == 0 || Now == WINNOWATTS );
76 
77  /* Local pruning phase */
78 
79 
80  Options = ( Now == WINNOWATTS ? (UPDATE|REGROW) :
81  Regrow ? (UPDATE|REGROW|REPORTPROGRESS) :
83  if ( UnitWeights ) Options |= UNITWEIGHTS;
84 
85  EstimateErrs(T, 0, MaxCase, 0, Options);
86 
87  if ( MCost )
88  {
89  /* Remove any effects of WeightMul and reset leaf classes */
90 
91  RestoreDistribs(T);
92  }
93  else
94  {
95  /* Insert information on parents and recalculate errors, noting
96  whether fractional cases might have appeared (for GlobalPrune) */
97 
98  RecalculateErrs = false;
99  InsertParents(T, Nil);
100 
101  /* Possible global pruning phase */
102 
103  if ( GLOBAL && Now != WINNOWATTS )
104  {
105  GlobalPrune(T);
106  }
107  }
108 
109  /* Remove impossible values from subsets and ordered splits.
110  First record possible values for discrete attributes */
111 
113  ForEach(Att, 1, MaxAtt)
114  {
115  if ( Ordered(Att) || Discrete(Att) && SUBSET )
116  {
117  PossibleValues[Att] = AllocZero((MaxAttVal[Att]>>3)+1, Byte);
118  ForEach(i, 1, MaxAttVal[Att])
119  {
120  SetBit(i, PossibleValues[Att]);
121  }
122  }
123  }
124 
125  CheckSubsets(T, true);
126 
128 
129  /* For multibranch splits, merge non-occurring values. For trees
130  (first boosting trial only), also merge leaves of same class */
131 
132  if ( ! SUBSET )
133  {
134  CompressBranches(T);
135  }
136 }
137 
138 
139 
140 /*************************************************************************/
141 /* */
142 /* Estimate the errors in a given subtree */
143 /* */
144 /*************************************************************************/
145 
146 
147 void EstimateErrs(Tree T, CaseNo Fp, CaseNo Lp, int Sh, int Flags)
148 /* ------------ */
149 {
150  CaseNo i, Bp, Ep, Missing;
151  CaseCount Cases=0, KnownCases, *BranchCases, MissingCases,
152  *SmallBranches, SmallBranchCases=0,
153  Errs, SaveErrs, TreeErrs, LeafErrs, ExtraLeafErrs=0, BestBrErrs;
154  double Factor, *LocalClassDist;
155  DiscrValue v, BestBr=0;
156  ClassNo c, BestClass=1;
157  int UnitWeights; /* local value */
158  Tree Br;
159  Attribute Att;
160 
161 
162  if ( Fp > Lp ) return;
163 
164  UnitWeights = (Flags & UNITWEIGHTS);
165 
166  LocalClassDist = Alloc(MaxClass+1, double);
167 
168  FindClassFreq(LocalClassDist, Fp, Lp);
169 
170  /* Record new class distribution if updating the tree */
171 
172  if ( (Flags & UPDATE) )
173  {
174  ForEach(c, 1, MaxClass)
175  {
176  T->ClassDist[c] = LocalClassDist[c];
177  Cases += LocalClassDist[c];
178 
179  if ( LocalClassDist[c] > LocalClassDist[BestClass] ) BestClass = c;
180  }
181  }
182  else
183  {
184  ForEach(c, 1, MaxClass)
185  {
186  Cases += LocalClassDist[c];
187 
188  if ( LocalClassDist[c] > LocalClassDist[BestClass] ) BestClass = c;
189  }
190  }
191 
192  LeafErrs = Cases - LocalClassDist[BestClass];
193  ExtraLeafErrs = ExtraErrs(Cases, LeafErrs, BestClass);
194 
195  Free(LocalClassDist);
196 
197  if ( (Flags & UPDATE) )
198  {
199  T->Cases = Cases;
200  T->Leaf = BestClass;
201  }
202 
203  if ( ! T->NodeType ) /* leaf */
204  {
205  if ( (Flags & UPDATE) && (Flags & REPORTPROGRESS) &&
206  Now == SIMPLIFYTREE &&
207  T->Cases > 0 )
208  {
209  Progress(T->Cases);
210  }
211 
212  T->Errors = LeafErrs + ExtraLeafErrs;
213 
214  if ( (Flags & UPDATE) )
215  {
216  if ( Sh >= 0 )
217  {
218  LocalVerbosity(3,
219  Intab(Sh);
220  fprintf(Of, "%s (%.2f:%.2f/%.2f)\n", ClassName[T->Leaf],
221  T->Cases, LeafErrs, T->Errors))
222  }
223  }
224 
225  return;
226  }
227 
228  /* Estimate errors for each branch */
229 
230  Att = T->Tested;
231  Missing = (Ep = Group(0, Fp, Lp, T)) - Fp + 1;
232 
233  if ( CostWeights )
234  {
235  MissingCases = SumNocostWeights(Fp, Ep);
236  KnownCases = SumNocostWeights(Ep+1, Lp);
237  }
238  else
239  {
240  MissingCases = CountCases(Fp, Ep);
241  KnownCases = Cases - MissingCases;
242  }
243 
244  SmallBranches = AllocZero(MaxClass+1, CaseCount);
245  BranchCases = Alloc(T->Forks+1, CaseCount);
246 
247  if ( Missing ) UnitWeights = 0;
248 
249  TreeErrs = 0;
250  Bp = Fp;
251 
252  ForEach(v, 1, T->Forks)
253  {
254  Ep = Group(v, Bp + Missing, Lp, T);
255 
256  /* Bp -> first value in missing + remaining values
257  Ep -> last value in missing + current group */
258 
259  BranchCases[v] = CountCases(Bp + Missing, Ep);
260 
261  Factor = ( ! Missing ? 0 :
262  ! CostWeights ? BranchCases[v] / KnownCases :
263  SumNocostWeights(Bp + Missing, Ep) / KnownCases );
264 
265  if ( (BranchCases[v] += Factor * MissingCases) >= MinLeaf )
266  {
267  if ( Missing )
268  {
269  ForEach(i, Bp, Bp + Missing - 1)
270  {
271  Weight(Case[i]) *= Factor;
272  }
273  }
274 
275  EstimateErrs(T->Branch[v], Bp, Ep, Sh+1, ((Flags&7) | UnitWeights));
276 
277  /* Group small branches together for error estimation */
278 
279  if ( BranchCases[v] < MINITEMS )
280  {
281  ForEach(i, Bp, Ep)
282  {
283  SmallBranches[ Class(Case[i]) ] += Weight(Case[i]);
284  }
285 
286  SmallBranchCases += BranchCases[v];
287  }
288  else
289  {
290  TreeErrs += T->Branch[v]->Errors;
291  }
292 
293  /* Restore weights if changed */
294 
295  if ( Missing )
296  {
297  for ( i = Ep ; i >= Bp ; i-- )
298  {
299  if ( Unknown(Case[i], Att) )
300  {
301  Weight(Case[i]) /= Factor;
302  Swap(i, Ep);
303  Ep--;
304  }
305  }
306  }
307 
308  Bp = Ep+1;
309  }
310  }
311 
312  /* Add error estimate from small branches, if any */
313 
314  if ( SmallBranchCases )
315  {
316  BestClass = 1;
317  ForEach(c, 2, MaxClass)
318  {
319  if ( SmallBranches[c] > SmallBranches[BestClass] ) BestClass = c;
320  }
321 
322  Errs = SmallBranchCases - SmallBranches[BestClass];
323  TreeErrs += Errs + ExtraErrs(SmallBranchCases, Errs, BestClass);
324  }
325  Free(SmallBranches);
326  Free(BranchCases);
327 
328  if ( ! (Flags & UPDATE) )
329  {
330  T->Errors = TreeErrs;
331  return;
332  }
333 
334  /* See how the largest candidate branch would perform. A branch
335  is a candidate if it is not a leaf, contains at least 10% of
336  the cases, and does not test the same continuous attribute.
337  This test is skipped for boosted trees */
338 
339  ForEach(v, 1, T->Forks)
340  {
341  if ( ! T->Branch[v]->NodeType ||
342  T->Branch[v]->Cases < 0.1 * T->Cases ||
343  T->Branch[v]->Tested == Att && Continuous(Att) )
344  {
345  continue;
346  }
347 
348  if ( ! BestBr || T->Branch[v]->Cases > T->Branch[BestBr]->Cases )
349  {
350  BestBr = v;
351  }
352  }
353 
354  if ( BestBr )
355  {
356  SaveErrs = T->Branch[BestBr]->Errors;
357  EstimateErrs(T->Branch[BestBr], Fp, Lp, -1, 0);
358  BestBrErrs = T->Branch[BestBr]->Errors;
359  T->Branch[BestBr]->Errors = SaveErrs;
360  }
361  else
362  {
363  BestBrErrs = MaxCase+1;
364  }
365 
366  LocalVerbosity(2,
367  Intab(Sh);
368  fprintf(Of, "%s: [%d%% N=%.2f tree=%.2f leaf=%.2f+%.2f",
369  AttName[T->Tested],
370  (int) ((TreeErrs * 100) / (T->Cases + 0.001)),
371  T->Cases, TreeErrs, LeafErrs, ExtraLeafErrs);
372  if ( BestBr )
373  {
374  fprintf(Of, " br[%d]=%.2f", BestBr, BestBrErrs);
375  }
376  fprintf(Of, "]\n"))
377 
378  /* See whether tree should be replaced with leaf or best branch */
379 
380  if ( LeafErrs + ExtraLeafErrs <= BestBrErrs + 0.1 &&
381  LeafErrs + ExtraLeafErrs <= TreeErrs + 0.1 )
382  {
383  LocalVerbosity(2,
384  Intab(Sh);
385  fprintf(Of, "Replaced with leaf %s\n", ClassName[T->Leaf]))
386 
387  UnSprout(T);
388  T->Errors = LeafErrs + ExtraLeafErrs;
389  }
390  else
391  if ( BestBrErrs <= TreeErrs + 0.1 )
392  {
393  LocalVerbosity(2,
394  Intab(Sh);
395  fprintf(Of, "Replaced with branch %d\n", BestBr))
396 
397  /* Free unused bits of tree */
398 
399  ForEach(v, 1, T->Forks)
400  {
401  if ( v != BestBr ) FreeTree(T->Branch[v]);
402  }
403  Br = T->Branch[BestBr];
404  Free(T->Branch);
405  Free(T->ClassDist);
406  if ( T->NodeType == BrSubset )
407  {
408  FreeVector((void **) T->Subset, 1, T->Forks);
409  }
410 
411  /* Copy the branch up */
412 
413  memcpy((char *) T, (char *) Br, sizeof(TreeRec));
414  Free(Br);
415 
416  Factor = T->Cases / Cases;
417  T->Cases = Cases;
418 
419  /* If not within a rebuilt tree, not a cascaded test, and
420  sufficient new cases to justify the effort, rebuild the branch */
421 
422  if ( T->NodeType && (Flags & REGROW) && Factor < 0.95 )
423  {
424  ForEach(v, 1, T->Forks)
425  {
426  FreeTree(T->Branch[v]); T->Branch[v] = Nil;
427  }
428 
430 
431  Divide(T, Fp, Lp, 0);
432  }
433 
434  EstimateErrs(T, Fp, Lp, Sh, UPDATE);
435  }
436  else
437  {
438  T->Errors = TreeErrs;
439  }
440 }
441 
442 
443 
444 /*************************************************************************/
445 /* */
446 /* Phase 2 (global) pruning. */
447 /* Prune minimum cost complexity subtrees until "error" */
448 /* (measured by sum of branch errors) increases by 1SE */
449 /* */
450 /*************************************************************************/
451 
452 
454 /* ----------- */
455 {
456  int DeltaLeaves, x;
457  double BaseErrs, DeltaErrs;
458  CaseNo i;
459  Tree ST;
460 
461  /* If fractional cases may have been used, calculate errors
462  directly by checking training data */
463 
464  if ( RecalculateErrs )
465  {
466  BaseErrs = 0;
467  ForEach(i, 0, MaxCase)
468  {
469  if ( TreeClassify(Case[i], T) != Class(Case[i]) )
470  {
471  BaseErrs += Weight(Case[i]);
472  }
473  }
474  }
475  else
476  {
477  BaseErrs = T->Errors;
478  }
479 
480  XT = Alloc(T->Leaves, Tree);
481 
482  /* Additional error limit set at 1SE */
483 
484  MaxExtraErrs = sqrt(BaseErrs * (1 - BaseErrs / (MaxCase + 1)));
485 
486  while ( MaxExtraErrs > 0 )
487  {
488  TotalExtraErrs = 0;
489 
490  MinCC = 1E38;
491  NXT = 0;
492 
493  /* Find all subtrees with lowest cost complexity */
494 
495  FindMinCC(T);
496 
497  Verbosity(2,
498  if ( NXT > 0 && TotalExtraErrs > MaxExtraErrs )
499  fprintf(Of, "%d tied with MinCC=%.3f; total extra errs %.1f\n",
501 
502  if ( ! NXT || TotalExtraErrs > MaxExtraErrs ) break;
503 
504  /* Make subtree into a leaf */
505 
506  ForEach(x, 0, NXT-1)
507  {
508  ST = XT[x];
509 
510  UnSprout(ST);
511 
512  /* Update errors and leaves for ST and ancestors */
513 
514  DeltaErrs = (ST->Cases - ST->ClassDist[ST->Leaf]) - ST->Errors;
515  DeltaLeaves = 1 - ST->Leaves;
516  while ( ST )
517  {
518  ST->Errors += DeltaErrs;
519  ST->Leaves += DeltaLeaves;
520  ST = ST->Parent;
521  }
522 
524 
525  Verbosity(2,
526  fprintf(Of, "global: %d leaves, %.1f errs\n",
527  DeltaLeaves, DeltaErrs))
528  }
529  Verbosity(2, fprintf(Of, "\tremaining=%.1f\n", MaxExtraErrs))
530  }
531 
532  Free(XT);
533 }
534 
535 
536 
537 /*************************************************************************/
538 /* */
539 /* Scan tree computing cost complexity of each subtree and */
540 /* record lowest in global variable XT */
541 /* */
542 /*************************************************************************/
543 
544 
546 /* --------- */
547 {
548  DiscrValue v;
549  double ExtraErrs, CC, SaveMinCC, SaveTotalExtraErrs;
550  int SaveNXT;
551 
552  if ( T->NodeType )
553  {
554  /* Save current situation */
555 
556  SaveTotalExtraErrs = TotalExtraErrs;
557  SaveMinCC = MinCC;
558  SaveNXT = NXT;
559 
560  /* Scan subtrees */
561 
562  ForEach(v, 1, T->Forks)
563  {
564  if ( T->Branch[v]->Cases > 0.1 )
565  {
566  FindMinCC(T->Branch[v]);
567  }
568  }
569 
570  /* Compute CC for this subtree and check whether minimum */
571 
572  ExtraErrs = (T->Cases - T->ClassDist[T->Leaf]) - T->Errors;
573 
574  CC = ExtraErrs / (T->Leaves - 1);
575 
576  if ( ExtraErrs <= MaxExtraErrs )
577  {
578  /* Have to be careful of ties in descendants, because
579  they would inflate TotalExtraErrs. Any such ties
580  should be discarded */
581 
582  if ( CC < MinCC ||
583  CC <= MinCC && CC < SaveMinCC /* changed by descendants */ )
584  {
585  /* This is the first of a possible group of ties */
586 
587  MinCC = CC;
588  NXT = 1;
589  XT[0] = T;
591  }
592  else
593  if ( CC <= MinCC )
594  {
595  /* This is a tie. Discard any ties among descendants */
596 
597  if ( NXT > SaveNXT )
598  {
599  TotalExtraErrs = SaveTotalExtraErrs;
600  NXT = SaveNXT;
601  }
602 
603  XT[NXT++] = T;
605  }
606  }
607  }
608 }
609 
610 
611 
612 /*************************************************************************/
613 /* */
614 /* Annotate tree with information on parent and leaves */
615 /* */
616 /*************************************************************************/
617 
618 
620 /* ------------- */
621 {
622  DiscrValue v;
623 
624  T->Parent = P;
625  T->Errors = T->Leaves = 0;
626 
627  if ( T->NodeType )
628  {
629  ForEach(v, 1, T->Forks)
630  {
631  InsertParents(T->Branch[v], T);
632  T->Errors += T->Branch[v]->Errors;
633  T->Leaves += T->Branch[v]->Leaves;
634  }
635 
636  if ( SomeMiss[T->Tested] ) RecalculateErrs = true;
637  }
638  else
639  if ( T->Cases > 1E-3 )
640  {
641  T->Errors = T->Cases - T->ClassDist[T->Leaf];
642  T->Leaves = 1;
643  }
644 }
645 
646 
647 
648 /*************************************************************************/
649 /* */
650 /* Remove unnecessary subset tests on missing values */
651 /* */
652 /*************************************************************************/
653 
654 
655 void CheckSubsets(Tree T, Boolean PruneDefaults)
656 /* ------------ */
657 {
658  Set HoldValues;
659  int v, vv, x, Bytes, b, First, Any=0;
660  Attribute A;
661  Tree LeafBr;
662  ClassNo c;
663 
664  if ( T->NodeType == BrSubset )
665  {
666  A = T->Tested;
667 
668  Bytes = (MaxAttVal[A]>>3) + 1;
669  HoldValues = Alloc(Bytes, Byte);
670 
671  /* For non-ordered attributes the last (default) branch contains
672  all values that do not appear in the data. See whether this
673  branch can be simplified or omitted */
674 
675  if ( ! Ordered(A) && PruneDefaults )
676  {
677  ForEach(b, 0, Bytes-1)
678  {
679  T->Subset[T->Forks][b] &= PossibleValues[A][b];
680  Any |= T->Subset[T->Forks][b];
681  }
682 
683  if ( ! Any )
684  {
685  FreeTree(T->Branch[T->Forks]);
686  Free(T->Subset[T->Forks]);
687  T->Forks--;
688  }
689  }
690 
691  /* Process each subtree, leaving only values in branch subset */
692 
693  CopyBits(Bytes, PossibleValues[A], HoldValues);
694 
695  ForEach(v, 1, T->Forks)
696  {
697  /* Remove any impossible values from ordered subsets */
698 
699  if ( Ordered(A) )
700  {
701  ForEach(vv, 1, MaxAttVal[A])
702  {
703  if ( In(vv, T->Subset[v]) && ! In(vv, HoldValues) )
704  {
705  ResetBit(vv, T->Subset[v]);
706  }
707  }
708  }
709 
710  CopyBits(Bytes, T->Subset[v], PossibleValues[A]);
711 
712  CheckSubsets(T->Branch[v], PruneDefaults);
713  }
714 
715  CopyBits(Bytes, HoldValues, PossibleValues[A]);
716 
717  Free(HoldValues);
718 
719  /* See whether branches other than N/A can be merged.
720  This cannot be done for ordered attributes since the
721  values in the subset represent an interval */
722 
723  if ( ! Ordered(A) )
724  {
725  First = ( In(1, T->Subset[1]) ? 2 : 1 );
726  for ( v = First ; v < T->Forks ; v++ )
727  {
728  if ( T->Branch[v]->NodeType ) continue;
729  LeafBr = T->Branch[v];
730 
731  /* Consider branches vv that could be merged with branch v */
732 
733  for ( vv = v+1 ; vv <= T->Forks ; )
734  {
735  if ( ! T->Branch[vv]->NodeType &&
736  T->Branch[vv]->Leaf == LeafBr->Leaf &&
737  ( PruneDefaults || T->Branch[vv]->Cases > 0 ) )
738  {
739  /* Branch vv can be merged with branch v */
740 
741  if ( T->Branch[vv]->Cases )
742  {
743  /* Add class distribution from branch vv,
744  or replace if branch v has no cases */
745 
746  ForEach(c, 1, MaxClass)
747  {
748  if ( ! LeafBr->Cases )
749  {
750  LeafBr->ClassDist[c] =
751  T->Branch[vv]->ClassDist[c];
752  }
753  else
754  {
755  LeafBr->ClassDist[c] +=
756  T->Branch[vv]->ClassDist[c];
757  }
758  }
759  LeafBr->Cases += T->Branch[vv]->Cases;
760  LeafBr->Errors += T->Branch[vv]->Errors;
761  }
762 
763  /* Merge values and free branch vv */
764 
765  ForEach(b, 0, Bytes-1)
766  {
767  T->Subset[v][b] |= T->Subset[vv][b];
768  }
769  FreeTree(T->Branch[vv]);
770  Free(T->Subset[vv]);
771 
772  T->Forks--;
773  ForEach(x, vv, T->Forks)
774  {
775  T->Branch[x] = T->Branch[x+1];
776  T->Subset[x] = T->Subset[x+1];
777  }
778  }
779  else
780  {
781  vv++;
782  }
783  }
784  }
785  }
786  }
787  else
788  if ( T->NodeType )
789  {
790  ForEach(v, 1, T->Forks)
791  {
792  CheckSubsets(T->Branch[v], PruneDefaults);
793  }
794  }
795 }
796 
797 
798 
799 /*************************************************************************/
800 /* */
801 /* Compute Coeff, used by RawExtraErrs() to adjust resubstitution */
802 /* error rate to upper limit of the confidence level. Coeff is */
803 /* the square of the number of standard deviations corresponding */
804 /* to the selected confidence level. (Taken from Documenta Geigy */
805 /* Scientific Tables (Sixth Edition), p185 (with modifications).) */
806 /* */
807 /*************************************************************************/
808 
809 
810 float Val[] = { 0, 0.001, 0.005, 0.01, 0.05, 0.10, 0.20, 0.40, 1.00},
811  Dev[] = {4.0, 3.09, 2.58, 2.33, 1.65, 1.28, 0.84, 0.25, 0.00},
812  Coeff;
813 
814 
816 /* ------------------- */
817 {
818  int i=1;
819 
820  /* Compute and retain the coefficient value, interpolating from
821  the values in Val and Dev */
822 
823  while ( CF > Val[i] ) i++;
824 
825  Coeff = Dev[i-1] +
826  (Dev[i] - Dev[i-1]) * (CF - Val[i-1]) /(Val[i] - Val[i-1]);
827  Coeff = Coeff * Coeff;
828  CF = Max(CF, 1E-6);
829 }
830 
831 
832 /*************************************************************************/
833 /* */
834 /* Calculate extra errors to correct the resubstitution error */
835 /* rate at a leaf with N cases, E errors, predicted class C. */
836 /* If CostWeights are used, N and E are normalised by removing */
837 /* the effects of cost weighting and then reapplying weights to */
838 /* the result. */
839 /* */
840 /*************************************************************************/
841 
842 
844 /* --------- */
845 {
846  ClassNo EC;
847  CaseCount NormC, NormEC;
848 
849  if ( ! CostWeights )
850  {
851  return RawExtraErrs(N, E);
852  }
853 
854  EC = 3 - C; /* the other class */
855  NormC = (N - E) / WeightMul[C]; /* normalised cases of class C */
856  NormEC = E / WeightMul[EC]; /* ditto the other class */
857 
858  return WeightMul[EC] * RawExtraErrs(NormC + NormEC, NormEC);
859 }
860 
861 
862 
864 /* ------------ */
865 {
866  float Val0, Pr;
867 
868  if ( E < 1E-6 )
869  {
870  return N * (1 - exp(log(CF) / N));
871  }
872  else
873  if ( N > 1 && E < 0.9999 )
874  {
875  Val0 = N * (1 - exp(log(CF) / N));
876  return Val0 + E * (RawExtraErrs(N, 1.0) - Val0);
877  }
878  else
879  if ( E + 0.5 >= N )
880  {
881  return 0.67 * (N - E);
882  }
883  else
884  {
885  Pr = (E + 0.5 + Coeff/2
886  + sqrt(Coeff * ((E + 0.5) * (1 - (E + 0.5)/N) + Coeff/4)) )
887  / (N + Coeff);
888  return (N * Pr - E);
889  }
890 }
891 
892 
893 
894 /*************************************************************************/
895 /* */
896 /* If there are differential misclassification costs, the weights */
897 /* may have been artificially adjusted. Fix the distributions so */
898 /* that they represent the "true" (possibly boosted) weights */
899 /* */
900 /*************************************************************************/
901 
902 
904 /* --------------- */
905 {
906  DiscrValue v;
907  ClassNo c;
908 
909  if ( T->NodeType )
910  {
911  ForEach(v, 1, T->Forks)
912  {
913  RestoreDistribs(T->Branch[v]);
914  }
915  }
916 
917  if ( T->Cases >= MinLeaf )
918  {
919  if ( CostWeights )
920  {
921  T->Cases = 0;
922  ForEach(c, 1, MaxClass)
923  {
924  ClassSum[c] = (T->ClassDist[c] /= WeightMul[c]);
925  T->Cases += T->ClassDist[c];
926  }
927  }
928  else
929  {
930  ForEach(c, 1, MaxClass)
931  {
932  ClassSum[c] = T->ClassDist[c];
933  }
934  }
935 
936  T->Leaf = SelectClass(1, true);
937  T->Errors = T->Cases - T->ClassDist[T->Leaf];
938  }
939 }
940 
941 
942 
943 /*************************************************************************/
944 /* */
945 /* See whether empty branches can be formed into subsets. */
946 /* For the first trial only, and when not generating rulesets, */
947 /* combine leaves with the same class. */
948 /* */
949 /*************************************************************************/
950 
951 
953 /* ---------------- */
954 {
955  DiscrValue v, vv, S=0, *LocalSet;
956  int Bytes;
957  Tree Br, *OldBranch;
958  ClassNo c;
959  Boolean EmptyOnly;
960 
961  EmptyOnly = Trial || RULES;
962 
963  if ( T->NodeType )
964  {
965  /* LocalSet[v] is the new branch number to which branch v belongs */
966 
967  LocalSet = AllocZero(T->Forks+1, DiscrValue);
968 
969  ForEach(v, 1, T->Forks)
970  {
971  Br = T->Branch[v];
972  CompressBranches(Br);
973 
974  /* Don't check if compression impossible */
975 
976  if ( v == 1 || T->Forks < 4 || Br->NodeType ||
977  EmptyOnly && Br->Cases >= MinLeaf )
978  {
979  vv = v + 1;
980  }
981  else
982  {
983  /* Check whether some previous branch is mergeable.
984  For Trial 0, leaves are mergeable if they are
985  both empty or both non-empty and have the same class;
986  for later trials, they must both be empty */
987 
988  for ( vv = 2 ; vv < v ; vv++ )
989  {
990  if ( ! T->Branch[vv]->NodeType &&
991  ( EmptyOnly ? T->Branch[vv]->Cases < MinLeaf :
992  ( T->Branch[vv]->Cases < MinLeaf ) ==
993  ( Br->Cases < MinLeaf ) &&
994  T->Branch[vv]->Leaf == Br->Leaf ) )
995  {
996  break;
997  }
998  }
999  }
1000 
1001  /* If no merge was found, this becomes a new branch */
1002 
1003  LocalSet[v] = ( vv < v ? LocalSet[vv] : ++S );
1004  }
1005 
1006  if ( S < T->Forks )
1007  {
1008  /* Compress! */
1009 
1010  T->Subset = Alloc(S+1, Set);
1011  OldBranch = T->Branch;
1012  T->Branch = Alloc(S+1, Tree);
1013 
1014  Bytes = (MaxAttVal[T->Tested]>>3) + 1;
1015  S = 0;
1016 
1017  ForEach(v, 1, T->Forks)
1018  {
1019  if ( LocalSet[v] > S )
1020  {
1021  S++;
1022  Br = T->Branch[S] = OldBranch[v];
1023  if ( ! Br->ClassDist )
1024  {
1026  }
1027  T->Subset[S] = AllocZero(Bytes, Byte);
1028 
1029  /* Must include N/A even when no cases -- otherwise
1030  reader gets the branches muddled */
1031 
1032  SetBit(v, T->Subset[S]);
1033 
1034  ForEach(vv, v+1, T->Forks)
1035  {
1036  if ( LocalSet[vv] == S )
1037  {
1038  SetBit(vv, T->Subset[S]);
1039 
1040  Br->Cases += OldBranch[vv]->Cases;
1041  Br->Errors += OldBranch[vv]->Errors;
1042  ForEach(c, 1, MaxClass)
1043  {
1044  Br->ClassDist[c] += OldBranch[vv]->ClassDist[c];
1045  }
1046  }
1047  }
1048  }
1049  else
1050  {
1051  FreeTree(OldBranch[v]);
1052  }
1053  }
1054 
1055  T->NodeType = BrSubset;
1056  T->Forks = S;
1057  Free(OldBranch);
1058  }
1059  Free(LocalSet);
1060  }
1061 }
1062 
1063 
1064 
1065 void SetGlobalUnitWeights(int LocalFlag)
1066 /* -------------------- */
1067 {
1068  UnitWeights = ( LocalFlag != 0 );
1069 }