mageec  0.1.0
MAchine Guided Energy Efficient Compilation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
formrules.c
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Copyright 2010 Rulequest Research Pty Ltd. */
4 /* */
5 /* This file is part of C5.0 GPL Edition, a single-threaded version */
6 /* of C5.0 release 2.07. */
7 /* */
8 /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9 /* modify it under the terms of the GNU General Public License as */
10 /* published by the Free Software Foundation, either version 3 of the */
11 /* License, or (at your option) any later version. */
12 /* */
13 /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16 /* General Public License for more details. */
17 /* */
18 /* You should have received a copy of the GNU General Public License */
19 /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20 /* */
21 /* <http://www.gnu.org/licenses/>. */
22 /* */
23 /*************************************************************************/
24 
25 
26 
27 /*************************************************************************/
28 /* */
29 /* Form a set of rules from a decision tree */
30 /* ---------------------------------------- */
31 /* */
32 /* The cases are partitioned into sublists: */
33 /* * Fail0: those cases that satisfy all undeleted conditions */
34 /* * Fail1: those that satisfy all but one of the above */
35 /* * FailMany: the remaining cases */
36 /* Lists are implemented via Succ; Succ[i] is the number of the */
37 /* case that follows case i. */
38 /* */
39 /*************************************************************************/
40 
41 
42 #include "defns.i"
43 #include "extern.i"
44 
45 double *Errors=Nil, /* [Condition] */
46  *Total=Nil; /* [Condition] */
47 
48 float *Pessimistic=Nil, /* [Condition] */
49  *CondCost=Nil; /* [Condition] */
50 
51 Boolean **CondFailedBy=Nil, /* [Condition][CaseNo] */
52  *Deleted=Nil; /* [Condition] */
53 
55 
56 int MaxDepth=0, /* depth of tree */
57  NCond,
58  Bestd;
59 
61 
62 short *NFail=Nil, /* NFail[i] = conditions failed by i */
63  *LocalNFail=Nil; /* copy used during rule pruning */
64 
66  Fail1,
67  FailMany,
68  *Succ=Nil; /* case following case i */
69 
70 
71 
72 /*************************************************************************/
73 /* */
74 /* Process a tree to extract a ruleset */
75 /* */
76 /*************************************************************************/
77 
78 
80  /* --------- */
81 {
82  int i;
83  CRuleSet RS;
84 
85  NotifyStage(FORMRULES);
86  Progress(-(MaxCase+1.0));
87 
88  Verbosity(2, PrintTree(T, "Pruned tree:"))
89 
90  /* Find essential parameters and allocate storage */
91 
92  MaxDepth = TreeDepth(T);
93 
94  Errors = AllocZero(MaxDepth+2, double);
95  Total = AllocZero(MaxDepth+2, double);
96 
97  Pessimistic = AllocZero(MaxDepth+2, float);
98  CondCost = AllocZero(MaxDepth+2, float);
99 
102 
103  Stack = AllocZero(MaxDepth+2, Condition);
104 
105  ForEach(i, 0, MaxDepth+1)
106  {
107  Stack[i] = Alloc(1, CondRec);
109  }
110 
111  NFail = AllocZero(MaxCase+1, short);
112  LocalNFail = AllocZero(MaxCase+1, short);
113 
114  CovBy = AllocZero(MaxCase+2, int);
115 
116  List = Alloc(MaxCase+2, CaseNo);
117  Succ = Alloc(MaxCase+1, CaseNo);
118 
119  CBuffer = Alloc(4 + (MaxCase+1) + (MaxCase+1)/128, Byte);
120 
121  NRules = RuleSpace = 0;
123 
124  if ( ! BranchBits )
125  {
127  FindTestCodes();
128  }
129 
130  SetupNCost();
131 
132  /* Extract and prune paths from root to leaves */
133 
134  NCond = 0;
135  Scan(T);
136 
137  Default = T->Leaf;
138 
139  /* Deallocate storage */
140 
142 
143  /* Select final rules */
144 
145  SiftRules((T->Errors + MaxClass-1) / (MaxCase+1 + MaxClass));
146 
147  FreeVector((void **) NCost, 0, MaxClass); NCost = Nil;
148 
150 
151  RS = Alloc(1, RuleSetRec);
152 
153  RS->SNRules = NRules;
154  RS->SRule = Rule; Rule = Nil;
155  RS->SDefault = Default;
156 
157  ConstructRuleTree(RS);
158 
159  return RS;
160 }
161 
162 
163 
164 /*************************************************************************/
165 /* */
166 /* Set up normalised costs. These are all 0/1 if MCost is not */
167 /* defined or if cost weighting is used. Otherwise, MCost is */
168 /* divided by an estimated average error cost, determined as */
169 /* follows: */
170 /* */
171 /* Assume E errors. The expected number of cases misclassified */
172 /* as class C is E * P(C), with the real classes distributed */
173 /* in accordance with their priors. This gives an expected */
174 /* total error cost of */
175 /* E * sum/C { P(C) * sum/D!=C { P(D)/(1-P(C)) * M[C][D] } } */
176 /* and dividing by E gives an expected average cost. */
177 /* */
178 /* The above tends to be pessimistic, so we reduce it somewhat. */
179 /* */
180 /* Siftrules requires a row of NCost corresponding to predicted */
181 /* class 0 (case not covered by any rule). All costs in this row */
182 /* are set to 1. */
183 /* */
184 /*************************************************************************/
185 
186 
188 /* ---------- */
189 {
190  ClassNo Real, Pred;
191  double AvErrCost=0, ProbPred, ProbReal;
192 
193  NCost = Alloc(MaxClass+1, float *);
194 
195  ForEach(Pred, 0, MaxClass)
196  {
197  NCost[Pred] = Alloc(MaxClass+1, float);
198 
199  if ( ! MCost || CostWeights || Pred == 0 )
200  {
201  ForEach(Real, 1, MaxClass)
202  {
203  NCost[Pred][Real] = ( Pred != Real );
204  }
205  }
206  else
207  {
208  ProbPred = ClassFreq[Pred] / (MaxCase+1);
209  ForEach(Real, 1, MaxClass)
210  {
211  NCost[Pred][Real] = MCost[Pred][Real];
212  if ( Real == Pred ) continue;
213 
214  ProbReal = ClassFreq[Real] / (MaxCase+1);
215  AvErrCost +=
216  ProbPred * (ProbReal / (1 - ProbPred)) * MCost[Pred][Real];
217  }
218  }
219  }
220 
221  if ( MCost && ! CostWeights )
222  {
223  AvErrCost = (AvErrCost + 1) / 2; /* reduced average cost */
224  ForEach(Real, 1, MaxClass)
225  {
226  ForEach(Pred, 1, MaxClass)
227  {
228  NCost[Pred][Real] /= AvErrCost;
229  }
230  }
231  }
232 }
233 
234 
235 
236 /*************************************************************************/
237 /* */
238 /* Extract paths from tree T and prune them to form rules */
239 /* */
240 /*************************************************************************/
241 
242 
243 void Scan(Tree T)
244 /* ---- */
245 {
246  DiscrValue v, Last;
247  Condition Term;
248 
249  if ( T->NodeType )
250  {
251  NCond++;
252  Term = Stack[NCond];
253 
254  Term->NodeType = T->NodeType;
255  Term->Tested = T->Tested;
256  Term->Cut = T->Cut;
257 
258  ForEach(v, 1, T->Forks)
259  {
260  /* Skip branches with empty leaves */
261 
262  if ( T->Branch[v]->Cases < MinLeaf ) continue;
263 
264  Term->TestValue = v;
265 
266  if ( T->NodeType == BrSubset )
267  {
268  if ( Elements(T->Tested, T->Subset[v], &Last) == 1 )
269  {
270  /* Subset contains a single element */
271 
272  Term->NodeType = BrDiscr;
273  Term->TestValue = Last;
274  }
275  else
276  {
277  Term->NodeType = BrSubset;
278  Term->Subset = T->Subset[v];
279  Term->TestValue = 1;
280  }
281  }
282 
283  CondCost[NCond] = CondBits(Term);
284 
285  /* Adjust number of failed conditions */
286 
287  PushCondition();
288 
289  Scan(T->Branch[v]);
290 
291  /* Reset number of failed conditions */
292 
293  PopCondition();
294  }
295 
296  NCond--;
297  }
298 
299  /* Make a rule from every node of the tree other than the root */
300 
301  if ( NCond > 0 && T->Cases >= 1 )
302  {
303 
304  memcpy(LocalNFail, NFail, (MaxCase + 1) * sizeof(short));
305 
306  TargetClass = T->Leaf;
307  PruneRule(Stack, T->Leaf);
308 
309  if ( ! T->NodeType ) Progress(T->Cases);
310  }
311 }
312 
313 
314 
315 /*************************************************************************/
316 /* */
317 /* Update NFail when a condition is added to/removed from Stack */
318 /* */
319 /*************************************************************************/
320 
321 
323 /* ------------- */
324 {
325  int i;
326 
327  ForEach(i, 0, MaxCase)
328  {
329  if ( (CondFailedBy[NCond][i] = ! Satisfies(Case[i], Stack[NCond])) )
330  {
331  NFail[i]++;
332  }
333  }
334 }
335 
336 
337 
339 /* ------------- */
340 {
341  int i;
342 
343  ForEach(i, 0, MaxCase)
344  {
345  if ( CondFailedBy[NCond][i] )
346  {
347  NFail[i]--;
348  }
349  }
350 }
351 
352 
353 
354 /*************************************************************************/
355 /* */
356 /* Prune the rule given by the conditions Cond, and the number of */
357 /* conditions NCond, and add the resulting rule to the current */
358 /* ruleset if it is sufficiently accurate */
359 /* */
360 /*************************************************************************/
361 
362 #define TI(a,b) (((a)+(b)) * Log((a)+(b)) - (a) * Log(a) - (b) * Log(b))
363 
364 
366 /* --------- */
367 {
368  int d, id, Bestid, Remaining=NCond;
369  double RealTotal, RealCorrect;
370  CaseNo i, LL=0;
371  float Prior;
372  double Base, Gain, Cost=0;
373 
374  ForEach(d, 0, NCond)
375  {
376  Deleted[d] = false;
377  Total[d] =
378  Errors[d] = 0;
379 
380  if ( d ) Cost += CondCost[d];
381  }
382  Cost -= LogFact[NCond];
383 
384  Base = TI(ClassFreq[TargetClass], MaxCase+1 - ClassFreq[TargetClass]);
385 
386  /* Initialise all fail lists */
387 
388  Bestd = 0;
389  ProcessLists();
390 
391  ForEach(d, 1, NCond)
392  {
393  Total[d] += Total[0];
394  Errors[d] += Errors[0];
395  }
396 
397  /* Find conditions to delete */
398 
399  Verbosity(1, fprintf(Of, "\n Pruning rule for %s", ClassName[TargetClass]))
400 
401  while (true )
402  {
403  /* Find the condition, deleting which would most improve
404  the pessimistic accuracy of the rule.
405  Note: d = 0 means all conditions are satisfied */
406 
407  Bestd = id = 0;
408 
409  Gain = Base - TI(Total[0]-Errors[0], Errors[0])
410  - TI(ClassFreq[TargetClass]-Total[0]+Errors[0],
411  MaxCase+1-ClassFreq[TargetClass]-Errors[0]);
412 
413  Verbosity(1,
414  fprintf(Of, "\n Err Used Pess\tAbsent condition\n"))
415 
416  ForEach(d, 0, NCond)
417  {
418  if ( Deleted[d] ) continue;
419 
420  if ( Errors[d] > Total[d] ) Errors[d] = Total[d];
421 
422  Pessimistic[d] = ( Total[d] < Epsilon ? 0.5 :
423  (Errors[d] + 1) / (Total[d] + 2.0) );
424 
425  Verbosity(1,
426  fprintf(Of, " %7.1f%7.1f %4.1f%%",
427  Errors[d], Total[d], 100 * Pessimistic[d]))
428 
429  if ( ! d )
430  {
431  Verbosity(1,
432  fprintf(Of, "\t<base> %.1f/%.1f bits\n", Gain, Cost))
433  }
434  else
435  {
436  id++;
437 
438  Verbosity(1, PrintCondition(Cond[d]))
439 
440  /* Bestd identifies the condition with lowest pessimistic
441  error estimate */
442 
443  if ( ! Bestd || Pessimistic[d] <= Pessimistic[Bestd] )
444  {
445  Bestd = d;
446  Bestid = id;
447  }
448  }
449  }
450 
451  if ( Remaining == 1 || ! Bestd ||
452  ( THEORYFRAC * Cost <= Gain &&
453  Pessimistic[Bestd] > Pessimistic[0] ) )
454  {
455  break;
456  }
457 
458  Verbosity(1, fprintf(Of, "\teliminate test %d\n", Bestid))
459 
460  Deleted[Bestd] = true;
461  Remaining--;
462  Cost -= CondCost[Bestd] - LogFact[Remaining+1] + LogFact[Remaining];
463 
464  ForEach(d, 1, NCond)
465  {
466  if ( d != Bestd )
467  {
468  Total[d] += Total[Bestd] - Total[0];
469  Errors[d] += Errors[Bestd] - Errors[0];
470  }
471  }
472  Total[0] = Total[Bestd];
473  Errors[0] = Errors[Bestd];
474 
475  ProcessLists();
476  }
477 
478  if ( Remaining && Total[0] > 0.99 && THEORYFRAC * Cost <= Gain )
479  {
480  Prior = ClassFreq[TargetClass] / (MaxCase+1.0);
481 
482  /* Find list of cases covered by this rule and adjust coverage
483  if using costs */
484 
485  if ( ! MCost )
486  {
487  RealTotal = Total[0];
488  RealCorrect = Total[0] - Errors[0];
489 
490  for ( i = Fail0 ; i >= 0 ; i = Succ[i] )
491  {
492  List[++LL] = i;
493  }
494  }
495  else
496  if ( CostWeights )
497  {
498  /* Adjust distributions to reverse case weighting */
499 
500  Prior /= WeightMul[TargetClass];
501 
502  RealTotal = 0;
503  for ( i = Fail0 ; i >= 0 ; i = Succ[i] )
504  {
505  RealTotal += Weight(Case[i]) / WeightMul[Class(Case[i])];
506  List[++LL] = i;
507  }
508  RealCorrect = (Total[0] - Errors[0]) / WeightMul[TargetClass];
509  }
510  else
511  {
512  /* Errors have been weighted by NCost -- undo */
513 
514  RealTotal = Total[0];
515  RealCorrect = 0;
516  for ( i = Fail0 ; i >= 0 ; i = Succ[i] )
517  {
518  RealCorrect += Weight(Case[i]) *
519  (Class(Case[i]) == TargetClass);
520  List[++LL] = i;
521  }
522  }
523  List[0] = LL;
524 
525  if ( (RealCorrect + 1) / ((RealTotal + 2) * Prior) >= 0.95 )
526  {
527  NewRule(Cond, NCond, TargetClass, Deleted, Nil,
528  RealTotal, RealCorrect, Prior);
529  }
530  }
531 }
532 
533 
534 
535 /*************************************************************************/
536 /* */
537 /* Change Fail0, Fail1, and FailMany. */
538 /* If Bestd has not been set, initialise the lists; otherwise */
539 /* record the changes for deleting condition Bestd and reduce */
540 /* LocalNFail for cases that do not satisfy condition Bestd */
541 /* */
542 /*************************************************************************/
543 
544 
546 /* ------------ */
547 {
548  CaseNo i, iNext, *Prev;
549  int d;
550 
551  if ( ! Bestd )
552  {
553  /* Initialise the fail list */
554 
555  Fail0 = Fail1 = FailMany = -1;
556 
557  ForEach(i, 0, MaxCase)
558  {
559  if ( ! LocalNFail[i] )
560  {
561  Increment(0, i, Total, Errors);
562  AddToList(&Fail0, i);
563  }
564  else
565  if ( LocalNFail[i] == 1 )
566  {
567  d = SingleFail(i);
568  Increment(d, i, Total, Errors);
569  AddToList(&Fail1, i);
570  }
571  else
572  {
573  AddToList(&FailMany, i);
574  }
575  }
576  }
577  else
578  {
579  /* Change the fail list to remove condition Bestd */
580 
581  /* Promote cases from Fail1 to Fail0 */
582 
583  Prev = &Fail1;
584 
585  for ( i = Fail1 ; i >= 0 ; )
586  {
587  iNext = Succ[i];
588  if ( CondFailedBy[Bestd][i] )
589  {
590  DeleteFromList(Prev, i);
591  AddToList(&Fail0, i);
592  }
593  else
594  {
595  Prev = &Succ[i];
596  }
597  i = iNext;
598  }
599 
600  /* Check cases in FailMany */
601 
602  Prev = &FailMany;
603 
604  for ( i = FailMany ; i >= 0 ; )
605  {
606  iNext = Succ[i];
607  if ( CondFailedBy[Bestd][i] && --LocalNFail[i] == 1 )
608  {
609  d = SingleFail(i);
610  Increment(d, i, Total, Errors);
611 
612  DeleteFromList(Prev, i);
613  AddToList(&Fail1, i);
614  }
615  else
616  {
617  Prev = &Succ[i];
618  }
619  i = iNext;
620  }
621  }
622 }
623 
624 
625 
626 /*************************************************************************/
627 /* */
628 /* Add case to list whose first case is *List */
629 /* */
630 /*************************************************************************/
631 
632 
634 /* --------- */
635 {
636  Succ[N] = *List;
637  *List = N;
638 }
639 
640 
641 
642 /*************************************************************************/
643 /* */
644 /* Delete case from list where previous case is *Before */
645 /* */
646 /*************************************************************************/
647 
648 
649 void DeleteFromList(CaseNo *Before, CaseNo N)
650 /* -------------- */
651 {
652  *Before = Succ[N];
653 }
654 
655 
656 
657 /*************************************************************************/
658 /* */
659 /* Find single condition failed by a case */
660 /* */
661 /*************************************************************************/
662 
663 
665 /* ---------- */
666 {
667  int d;
668 
669  ForEach(d, 1, NCond)
670  {
671  if ( ! Deleted[d] && CondFailedBy[d][i] ) return d;
672  }
673 
674  return 0;
675 }
676 
677 
678 
679 /*************************************************************************/
680 /* */
681 /* Case i covers all conditions except d; update Total and Errors */
682 /* */
683 /*************************************************************************/
684 
685 
686 void Increment(int d, CaseNo i, double *Total, double *Errors)
687 /* --------- */
688 {
689  Total[d] += Weight(Case[i]);
690  Errors[d]+= Weight(Case[i]) * NCost[TargetClass][Class(Case[i])];
691 }
692 
693 
694 
695 
696 
697 
698 /*************************************************************************/
699 /* */
700 /* Free all data allocated for forming rules */
701 /* */
702 /*************************************************************************/
703 
704 
706 /* ---------------- */
707 {
708  if ( ! CondFailedBy ) return;
709 
710  FreeVector((void **) CondFailedBy, 0, MaxDepth+1); CondFailedBy = Nil;
711  FreeVector((void **) Stack, 0, MaxDepth+1); Stack = Nil;
712  Free(Deleted); Deleted = Nil;
715  Free(Total); Total = Nil;
716  Free(Errors); Errors = Nil;
717  Free(NFail); NFail = Nil;
719  Free(Succ); Succ = Nil;
720 }