mageec  0.1.0
MAchine Guided Energy Efficient Compilation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
classify.c
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Copyright 2010 Rulequest Research Pty Ltd. */
4 /* */
5 /* This file is part of C5.0 GPL Edition, a single-threaded version */
6 /* of C5.0 release 2.07. */
7 /* */
8 /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9 /* modify it under the terms of the GNU General Public License as */
10 /* published by the Free Software Foundation, either version 3 of the */
11 /* License, or (at your option) any later version. */
12 /* */
13 /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16 /* General Public License for more details. */
17 /* */
18 /* You should have received a copy of the GNU General Public License */
19 /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20 /* */
21 /* <http://www.gnu.org/licenses/>. */
22 /* */
23 /*************************************************************************/
24 
25 
26 
27 /*************************************************************************/
28 /* */
29 /* Determine the class of a case from a decision tree or ruleset */
30 /* */
31 /*************************************************************************/
32 
33 
34 #include "defns.i"
35 #include "extern.i"
36 
37 
	/*  Local data used by MarkActive and RuleClassify.
	    Note: Active is never deallocated, just grows as required
	    (see CheckActiveSpace below).  NActive and ActiveSpace are
	    scalar RuleNo counts; only Active itself is a pointer  */

RuleNo	*Active=Nil,	/* rules that fire while classifying case */
	NActive,	/* number ditto */
	ActiveSpace=0;	/* space allocated */
44 
45 
46 
47 /*************************************************************************/
48 /* */
49 /* Classify a case using a decision tree */
50 /* */
51 /*************************************************************************/
52 
53 
55 /* ------------ */
56 {
57  ClassNo c;
58 
59  ForEach(c, 0, MaxClass)
60  {
61  ClassSum[c] = 0;
62  }
63 
64  FindLeaf(Case, DecisionTree, Nil, 1.0);
65 
66  return SelectClass(1, (Boolean)(MCost != Nil));
67 }
68 
69 
70 
71 /*************************************************************************/
72 /* */
73 /* Classify a case using the given subtree. */
74 /* Adjust the value ClassSum for each class */
75 /* */
76 /*************************************************************************/
77 
78 
/*  Classify Case with the subtree rooted at T, adding Fraction of this
    case's weight to ClassSum[] at each leaf reached.  PT is T's parent,
    used as a fallback when T effectively contains no training cases.
    Cases with unknown tested values are split across all branches in
    proportion to branch sizes (FollowAllBranches).  */

void FindLeaf(DataRec Case, Tree T, Tree PT, float Fraction)
/* -------- */
{
    DiscrValue v, Dv;
    ClassNo c;
    float NewFrac, BrWt[4];	/* BrWt[2]/BrWt[3] = weights of <= / > branches */

    /* Special case for winnowing cycles */

    if ( T->NodeType && Skip(T->Tested) )
    {
	FollowAllBranches(Case, T, Fraction);
	return;
    }

    /* Record that this attribute was consulted (attribute-usage stats) */

    if ( T->NodeType && Tested )
    {
	Tested[T->Tested] = true;	/* for usage */
    }

    switch ( T->NodeType )
    {
	case 0:		/* leaf */

	LeafUpdate:	/* also reached from BrSubset when no subset matches */

	    /* Use parent node if effectively no cases at this node */

	    if ( T->Cases < Epsilon )
	    {
		T = PT;
	    }

	    /* Update from all classes */

	    ForEach(c, 1, MaxClass)
	    {
		ClassSum[c] += Fraction * T->ClassDist[c] / T->Cases;
	    }

	    return;

	case BrDiscr:	/* test of discrete attribute */

	    Dv = DVal(Case, T->Tested);	/* > MaxAttVal if unknown */

	    if ( Dv <= T->Forks )	/* Make sure not new discrete value */
	    {
		FindLeaf(Case, T->Branch[Dv], T, Fraction);
	    }
	    else
	    {
		FollowAllBranches(Case, T, Fraction);
	    }

	    return;

	case BrThresh:	/* test of continuous attribute */

	    if ( Unknown(Case, T->Tested) )
	    {
		FollowAllBranches(Case, T, Fraction);
	    }
	    else
	    if ( NotApplic(Case, T->Tested) )
	    {
		/* branch 1 is the not-applicable branch */

		FindLeaf(Case, T->Branch[1], T, Fraction);
	    }
	    else
	    {
		/* Find weights for <= and > branches, interpolating if
		   probabilistic thresholds are used */

		BrWt[2] = Interpolate(T, CVal(Case, T->Tested));
		BrWt[3] = 1 - BrWt[2];

		ForEach(v, 2, 3)
		{
		    /* Ignore branches receiving less than 1% of the case */

		    if ( (NewFrac = Fraction * BrWt[v]) >= 0.01 )
		    {
			FindLeaf(Case, T->Branch[v], T, NewFrac);
		    }
		}
	    }

	    return;

	case BrSubset:	/* subset test on discrete attribute */

	    Dv = DVal(Case, T->Tested);	/* > MaxAttVal if unknown */

	    if ( Dv <= MaxAttVal[T->Tested] )
	    {
		ForEach(v, 1, T->Forks)
		{
		    if ( In(Dv, T->Subset[v]) )
		    {
			FindLeaf(Case, T->Branch[v], T, Fraction);

			return;
		    }
		}

		/* Value not found in any subset -- treat as leaf */

		goto LeafUpdate;
	    }
	    else
	    {
		FollowAllBranches(Case, T, Fraction);
	    }
    }
}
192 
193 
194 
195 /*************************************************************************/
196 /* */
197 /* Follow all branches from a node, weighting them in proportion */
198 /* to the number of training cases they contain */
199 /* */
200 /*************************************************************************/
201 
202 
203 void FollowAllBranches(DataRec Case, Tree T, float Fraction)
204 /* ----------------- */
205 {
206  DiscrValue v;
207 
208  ForEach(v, 1, T->Forks)
209  {
210  if ( T->Branch[v]->Cases > Epsilon )
211  {
212  FindLeaf(Case, T->Branch[v], T,
213  (Fraction * T->Branch[v]->Cases) / T->Cases);
214  }
215  }
216 }
217 
218 
219 
220 /*************************************************************************/
221 /* */
222 /* Classify a case using a ruleset */
223 /* */
224 /*************************************************************************/
225 
226 
228 /* ------------ */
229 {
230  ClassNo c, Best;
231  float TotWeight=0;
232  int a, u=1, d;
233  CRule R;
234  RuleNo r;
235 
236  ForEach(c, 0, MaxClass)
237  {
238  ClassSum[c] = 0;
239  MostSpec[c] = Nil;
240  }
241 
242  /* Find active rules */
243 
244  NActive = 0;
245 
246  if ( RS->RT )
247  {
248  MarkActive(RS->RT, Case);
249  }
250  else
251  {
252  ForEach(r, 1, RS->SNRules)
253  {
254  R = RS->SRule[r];
255 
256  if ( Matches(R, Case) )
257  {
258  Active[NActive++] = r;
259  }
260  }
261  }
262 
263  /* Must sort rules if using utility bands */
264 
265  if ( UtilBand )
266  {
267  SortActive();
268  }
269 
270  /* Vote active rules */
271 
272  ForEach(a, 0, NActive-1)
273  {
274  r = Active[a];
275  R = RS->SRule[r];
276 
277  if ( Tested )
278  {
279  ForEach(d, 1, R->Size)
280  {
281  Tested[R->Lhs[d]->Tested] = true; /* for usage */
282  }
283  }
284  if ( UtilBand ) CheckUtilityBand(&u, r, Class(Case), RS->SDefault);
285  ClassSum[R->Rhs] += R->Vote;
286  TotWeight += 1000.0;
287 
288  /* Check whether this is the most specific rule for this class;
289  resolve ties in favor of rule with higher vote */
290 
291  if ( ! MostSpec[R->Rhs] ||
292  R->Cover < MostSpec[R->Rhs]->Cover ||
293  ( R->Cover == MostSpec[R->Rhs]->Cover &&
294  R->Vote > MostSpec[R->Rhs]->Vote ) )
295  {
296  MostSpec[R->Rhs] = R;
297  }
298  }
299 
300  /* Flush any remaining utility bands */
301 
302  if ( UtilBand )
303  {
304  CheckUtilityBand(&u, RS->SNRules+1, Class(Case), RS->SDefault);
305  }
306 
307  /* Check for default and normalise ClassSum */
308 
309  if ( ! TotWeight )
310  {
311  Confidence = 0.5;
312  return RS->SDefault;
313  }
314 
315  ForEach(c, 1, MaxClass)
316  {
317  ClassSum[c] /= TotWeight;
318  }
319 
320  Best = SelectClass(RS->SDefault, false);
321 
322  /* Set Confidence to the vote for the most specific rule of class Best */
323 
324  Confidence = MostSpec[Best]->Vote / 1000.0;
325 
326  return Best;
327 }
328 
329 
330 
331 /*************************************************************************/
332 /* */
333 /* Determine outcome of a test on a case. */
334 /* Return -1 if value of tested attribute is unknown */
335 /* */
336 /*************************************************************************/
337 
338 
340 /* ----------- */
341 {
342  DiscrValue v, Outcome;
343  Attribute Att;
344 
345  Att = OneCond->Tested;
346 
347  /* Determine the outcome of this test on this case */
348 
349  switch ( OneCond->NodeType )
350  {
351  case BrDiscr: /* test of discrete attribute */
352 
353  v = XDVal(Case, Att);
354  Outcome = ( v == 0 ? -1 : v );
355  break;
356 
357  case BrThresh: /* test of continuous attribute */
358 
359  Outcome = ( Unknown(Case, Att) ? -1 :
360  NotApplic(Case, Att) ? 1 :
361  CVal(Case, Att) <= OneCond->Cut ? 2 : 3 );
362  break;
363 
364  case BrSubset: /* subset test on discrete attribute */
365 
366  v = XDVal(Case, Att);
367  Outcome = ( v <= MaxAttVal[Att] && In(v, OneCond->Subset) ?
368  OneCond->TestValue : 0 );
369  }
370 
371  return Outcome;
372 }
373 
374 
375 
376 /*************************************************************************/
377 /* */
378 /* Determine whether a case satisfies a condition */
379 /* */
380 /*************************************************************************/
381 
382 
384 /* --------- */
385 {
386  return ( FindOutcome(Case, OneCond) == OneCond->TestValue );
387 }
388 
389 
390 
391 /*************************************************************************/
392 /* */
393 /* Determine whether a case satisfies all conditions of a rule */
394 /* */
395 /*************************************************************************/
396 
397 
399 /* ------- */
400 {
401  int d;
402 
403  ForEach(d, 1, R->Size)
404  {
405  if ( ! Satisfies(Case, R->Lhs[d]) )
406  {
407  return false;
408  }
409  }
410 
411  return true;
412 }
413 
414 
415 
416 /*************************************************************************/
417 /* */
418 /* Make sure that Active[] has space for at least N rules */
419 /* */
420 /*************************************************************************/
421 
422 
423 void CheckActiveSpace(int N)
424 /* ---------------- */
425 {
426  if ( ActiveSpace <= N )
427  {
428  Realloc(Active, (ActiveSpace=N+1), RuleNo);
429  }
430 }
431 
432 
433 
434 /*************************************************************************/
435 /* */
436 /* Use RT to enter active rules in Active[] */
437 /* */
438 /*************************************************************************/
439 
440 
442 /* ---------- */
443 {
444  DiscrValue v;
445  int ri;
446  RuleNo r;
447 
448  if ( ! RT ) return;
449 
450  /* Enter any rules satisfied at this node */
451 
452  if ( RT->Fire )
453  {
454  for ( ri = 0 ; (r = RT->Fire[ri]) ; ri++ )
455  {
456  Active[NActive++] = r;
457  }
458  }
459 
460  if ( ! RT->Branch ) return;
461 
462  /* Explore subtree for rules that include condition at this node */
463 
464  if ( (v = FindOutcome(Case, RT->CondTest)) > 0 && v <= RT->Forks )
465  {
466  MarkActive(RT->Branch[v], Case);
467  }
468 
469  /* Explore default subtree for rules that do not include condition */
470 
471  MarkActive(RT->Branch[0], Case);
472 }
473 
474 
475 
476 /*************************************************************************/
477 /* */
478 /* Sort active rules for utility band error rates */
479 /* */
480 /*************************************************************************/
481 
482 
484 /* ---------- */
485 {
486  RuleNo r;
487  int a, aa, aLow;
488 
489  ForEach(a, 0, NActive-1)
490  {
491  aLow = a;
492 
493  ForEach(aa, a+1, NActive-1)
494  {
495  if ( Active[aa] < Active[aLow] ) aLow = aa;
496  }
497 
498  r = Active[a];
499  Active[a] = Active[aLow];
500  Active[aLow] = r;
501  }
502 }
503 
504 
505 
506 /*************************************************************************/
507 /* */
508 /* Update utility band error rates for all bands before rule r */
509 /* that have not been competed yet. Update current band. */
510 /* */
511 /*************************************************************************/
512 
513 
515 /* ---------------- */
516 {
517  ClassNo c;
518 
519  while ( *u < UTILITY && r > UtilBand[*u] )
520  {
521  c = SelectClass(Default, false);
522  if ( c != Actual )
523  {
524  UtilErr[*u]++;
525  if ( MCost ) UtilCost[*u] += MCost[c][Actual];
526  }
527 
528  (*u)++;
529  }
530 }
531 
532 
533 
534 /*************************************************************************/
535 /* */
536 /* Classify a case using boosted tree or rule sequence. */
537 /* Global variable Default must have been set prior to call */
538 /* */
539 /* Note: boosting with costs is complicated. With trees, */
540 /* complete class distributions are accumulated and then a class */
541 /* selected to minimize expected cost. This cannot be done with */
542 /* rulesets since a single ruleset does not give a reliable */
543 /* class distribution; instead, the votes from all cost-adjusted */
544 /* rulesets are combined without reference to costs. */
545 /* */
546 /*************************************************************************/
547 
548 
550 /* ------------- */
551 {
552  ClassNo c, Best;
553  int t;
554  float Total=0;
555 
556  ForEach(c, 1, MaxClass)
557  {
558  Vote[c] = 0;
559  }
560 
561  ForEach(t, 0, MaxTrial)
562  {
563  Best = ( RULES ? RuleClassify(Case, RuleSet[t]) :
564  TreeClassify(Case, Pruned[t]) );
565 
566  Vote[Best] += Confidence;
567  Total += Confidence;
568 
569  TrialPred[t] = Best;
570  }
571 
572  /* Copy votes into ClassSum */
573 
574  ForEach(c, 1, MaxClass)
575  {
576  ClassSum[c] = Vote[c] / Total;
577  }
578 
579  return SelectClass(Default, false);
580 }
581 
582 
583 
584 /*************************************************************************/
585 /* */
586 /* Select the best class to return. Take misclassification costs */
587 /* into account if they are defined. */
588 /* */
589 /*************************************************************************/
590 
591 
593 /* ----------- */
594 {
595  ClassNo c, cc, BestClass;
596  float ExpCost, BestCost=1E38, TotCost=0;
597 
598  BestClass = Default;
599 
600  if ( UseCosts )
601  {
602  ForEach(c, 1, MaxClass)
603  {
604  ExpCost = 0;
605  ForEach(cc, 1, MaxClass)
606  {
607  if ( cc == c ) continue;
608  ExpCost += ClassSum[cc] * MCost[c][cc];
609  }
610 
611  TotCost += ExpCost;
612 
613  if ( ExpCost < BestCost )
614  {
615  BestClass = c;
616  BestCost = ExpCost;
617  }
618  }
619 
620  Confidence = 1 - BestCost / TotCost;
621  }
622  else
623  {
624  ForEach(c, 1, MaxClass)
625  {
626  if ( ClassSum[c] > ClassSum[BestClass] ) BestClass = c;
627  }
628 
629  Confidence = ClassSum[BestClass];
630  }
631 
632  return BestClass;
633 }
634 
635 
636 
637 /*************************************************************************/
638 /* */
639 /* General classification routine */
640 /* */
641 /*************************************************************************/
642 
643 
645 /* -------- */
646 {
647 
648  return ( TRIALS > 1 ? BoostClassify(Case, TRIALS-1) :
649  RULES ? RuleClassify(Case, RuleSet[0]) :
650  TreeClassify(Case, Pruned[0]) );
651 }
652 
653 
654 
655 /*************************************************************************/
656 /* */
657 /* Interpolate a single value between Lower, Mid and Upper */
658 /* (All these have the same value unless using probabilistic */
659 /* thresholds.) */
660 /* */
661 /*************************************************************************/
662 
663 
665 /* ----------- */
666 {
667  return ( Val <= T->Lower ? 1.0 :
668  Val >= T->Upper ? 0.0 :
669  Val <= T->Mid ?
670  1 - 0.5 * (Val - T->Lower) / (T->Mid - T->Lower + 1E-6) :
671  0.5 - 0.5 * (Val - T->Mid) / (T->Upper - T->Mid + 1E-6) );
672 }
673 
674 
675 
676 /*************************************************************************/
677 /* */
678 /* Free data structures for one classifier */
679 /* */
680 /*************************************************************************/
681 
682 
684 /* -------------- */
685 {
686  if ( Raw )
687  {
688  FreeTree(Raw[Trial]); Raw[Trial] = Nil;
689  }
690 
691  if ( Pruned )
692  {
693  FreeTree(Pruned[Trial]); Pruned[Trial] = Nil;
694  }
695 
696  if ( RULES && RuleSet && RuleSet[Trial] )
697  {
698  FreeRules(RuleSet[Trial]); RuleSet[Trial] = Nil;
699  }
700 }