mageec  0.1.0
MAchine Guided Energy Efficient Compilation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
siftrules.c
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Copyright 2010 Rulequest Research Pty Ltd. */
4 /* */
5 /* This file is part of C5.0 GPL Edition, a single-threaded version */
6 /* of C5.0 release 2.07. */
7 /* */
8 /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9 /* modify it under the terms of the GNU General Public License as */
10 /* published by the Free Software Foundation, either version 3 of the */
11 /* License, or (at your option) any later version. */
12 /* */
13 /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16 /* General Public License for more details. */
17 /* */
18 /* You should have received a copy of the GNU General Public License */
19 /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20 /* */
21 /* <http://www.gnu.org/licenses/>. */
22 /* */
23 /*************************************************************************/
24 
25 
26 
27 /*************************************************************************/
28 /* */
29 /* Find a good subset of a set of rules */
30 /* ------------------------------------ */
31 /* */
32 /*************************************************************************/
33 
34 
35 #include "defns.i"
36 #include "extern.i"
37 
38 
39 float *DeltaErrs=Nil, /* DeltaErrs[r] = change attributable to rule r or
40  realisable if rule r included */
41  *Bits=Nil, /* Bits[r] = bits to encode rule r */
42  BitsErr, /* BitsErr = bits to label prediction as error */
43  BitsOK; /* BitsOK = bits to label prediction as ok */
44 
45 int **TotVote=Nil; /* TotVote[i][c] = case i's votes for class c */
46 
47 ClassNo *TopClass=Nil, /* TopClass[i] = class with highest vote */
48  *AltClass=Nil; /* AltClass[i] = class with second highest vote */
49 
50 Boolean *RuleIn=Nil, /* RuleIn[r] = rule r included */
51  *Covered=Nil; /* Covered[i] = case i covered by rule(s) */
52 
53 Byte *CovByBlock=Nil,/* holds entries for inverse of Fires */
54  **CovByPtr=Nil; /* next entry for CovBy[i] */
55 
56 RuleNo *LastCovBy=Nil; /* Last rule covering case i */
57 
58 
59 /*************************************************************************/
60 /* */
61 /* Main rule selection routine. */
62 /* 1. Form initial theory */
63 /* 2. Hillclimb in MDL space */
64 /* */
65 /*************************************************************************/
66 
67 
68 void SiftRules(float EstErrRate)
69 /* --------- */
70 {
71  RuleNo r;
72  int d, *bp;
73  CRule R;
74  float CodeLength;
75  CaseNo i;
76 
77  NotifyStage(SIFTRULES);
78  Progress(-(float) NRules);
79 
80  /* Determine inverse of Fires in CovBy, CovByPtr, CovByBlock */
81 
82  InvertFires();
83 
84  /* Clean up any subsets in conditions by removing values that do
85  not appear in the covered cases */
86 
87  if ( SUBSET )
88  {
89  PruneSubsets();
90  }
91 
93  RuleIn = AllocZero(NRules+1, Boolean);
94 
95  /* Set initial theory */
96 
98 
99  Bits = Alloc(NRules+1, float);
100 
101  /* Calculate the number of bits associated with attribute tests;
102  this is not repeated in boosting, composite rulesets etc */
103 
104  if ( ! BranchBits || NRules > MaxCase )
105  {
107  Max(MaxDiscrVal, NRules)))));
108  }
109 
110  if ( ! BranchBits )
111  {
112  FindTestCodes();
113  }
114 
115  /* Determine rule codelengths */
116 
117  if ( NRules >= MaxCase+1 )
118  {
119  Realloc(List, NRules+1, CaseNo);
120  }
121 
122  ForEach(r, 1, NRules)
123  {
124  R = Rule[r];
125 
126  CodeLength = 0;
127  ForEach(d, 1, R->Size)
128  {
129  CodeLength += CondBits(R->Lhs[d]);
130  }
131  Bits[r] = CodeLength + LogCaseNo[R->Size] - LogFact[R->Size];
132  }
133 
134  /* Use estimated error rate to determine the bits required to
135  label a theory's prediction for a case as an error or correct */
136 
137  if ( EstErrRate > 0.5 ) EstErrRate = 0.45;
138 
139  BitsErr = - Log(EstErrRate);
140  BitsOK = - Log(1.0 - EstErrRate);
141 
142 
143  /* Allocate tables used in hillclimbing */
144 
145  DeltaErrs = Alloc(NRules+1, float);
147 
149  TotVote = Alloc(MaxCase+1, int *);
150 
151  bp = AllocZero((MaxCase+1) * (MaxClass+1), int);
152  ForEach(i, 0, MaxCase)
153  {
154  TotVote[i] = bp;
155  bp += MaxClass + 1;
156  }
157 
158  /* Now find best subset of rules */
159 
160  HillClimb();
161 
162  /* Determine default class and reorder rules */
163 
164  SetDefaultClass();
165  OrderRules();
166 
167  /* Deallocate storage */
168 
170 }
171 
172 
173 
174 /*************************************************************************/
175 /* */
176 /* Find inverse of Fires[][] in CovBy, CovByPtr, and CovByBlock. */
177 /* */
178 /* CovBy[i] = number of rules covering case i (set by NewRule) */
179 /* */
180 /* Set up CovByPtr as pointers into CovByBlock so that */
181 /* CovByPtr[i] is the start of the compressed entry for case i */
182 /* */
183 /*************************************************************************/
184 
185 
187 /* ----------- */
188 {
189  RuleNo r, Entry;
190  int j, Blocks, Extra;
191  CaseNo i;
192  Byte *p, *From, *To, *Next;
193 
194  CovByPtr = Alloc(MaxCase+2, Byte *);
195  Extra = NRules / 128; /* max number of filler entries */
196  CovByPtr[0] = 0;
197  ForEach(i, 1, MaxCase+1)
198  {
199  CovByPtr[i] = CovByPtr[i-1] + CovBy[i-1] + Extra;
200  }
201 
202  CovByBlock = Alloc((size_t) CovByPtr[MaxCase+1], Byte);
203  ForEach(i, 0, MaxCase)
204  {
205  CovByPtr[i] += (size_t) CovByBlock;
206  }
207 
209 
210  /* Add entries for each rule */
211 
212  ForEach(r, 1, NRules)
213  {
214  Uncompress(Fires[r], List);
215  ForEach(j, 1, List[0])
216  {
217  i = List[j];
218 
219  /* Add compressed entry for this rule */
220 
221  p = CovByPtr[i];
222  Entry = r - LastCovBy[i];
223  LastCovBy[i] = r;
224 
225  while ( Entry > 127 )
226  {
227  Blocks = (Entry >> 7);
228  if ( Blocks > 127 ) Blocks = 127;
229  Entry -= Blocks * 128;
230  *p++ = Blocks + 128;
231  }
232 
233  *p++ = Entry;
234  CovByPtr[i] = p;
235  }
236  }
237 
239 
240  /* Reset CovByPtr entries and compact */
241 
242  To = CovByPtr[0];
243  From = CovByPtr[0] = CovByBlock;
244 
245  ForEach(i, 1, MaxCase)
246  {
247  From += CovBy[i-1] + Extra;
248  Next = CovByPtr[i];
249  CovByPtr[i] = To;
250 
251  for ( p = From ; p < Next ; )
252  {
253  *To++ = *p++;
254  }
255  }
256 
257  /* Reduce CovByBlock to size actually used */
258 
259  From = CovByBlock; /* current address */
260 
262 
263  if ( CovByBlock != From )
264  {
265  /* CovByBlock has been moved */
266 
267  ForEach(i, 0, MaxCase)
268  {
269  CovByPtr[i] += CovByBlock - From;
270  }
271  }
272 }
273 
274 
275 
276 /*************************************************************************/
277 /* */
278 /* Determine code lengths for attributes and branches */
279 /* */
280 /*************************************************************************/
281 
282 
284 /* ------------- */
285 {
286  Attribute Att;
287  DiscrValue v, V;
288  CaseNo i, *ValFreq;
289  int PossibleAtts=0;
290  float Sum;
291 
292  BranchBits = AllocZero(MaxAtt+1, float);
293  AttValues = AllocZero(MaxAtt+1, int);
294 
295  ForEach(Att, 1, MaxAtt)
296  {
297  if ( Skip(Att) || Att == ClassAtt ) continue;
298 
299  PossibleAtts++;
300 
301  if ( Ordered(Att) )
302  {
303  BranchBits[Att] = 1 + 0.5 * LogCaseNo[MaxAttVal[Att] - 1];
304  }
305  else
306  if ( (V = MaxAttVal[Att]) )
307  {
308  /* Discrete attribute */
309 
310  ValFreq = AllocZero(V+1, CaseNo);
311 
312  ForEach(i, 0, MaxCase)
313  {
314  assert(XDVal(Case[i],Att) >= 0 && XDVal(Case[i],Att) <= V);
315  ValFreq[ XDVal(Case[i],Att) ]++;
316  }
317 
318  Sum = 0;
319  ForEach(v, 1, V)
320  {
321  if ( ValFreq[v] )
322  {
323  Sum += (ValFreq[v] / (MaxCase+1.0)) *
324  (LogCaseNo[MaxCase+1] - LogCaseNo[ValFreq[v]]);
325  AttValues[Att]++;
326  }
327  }
328  Free(ValFreq);
329 
330  BranchBits[Att] = Sum;
331  }
332  else
333  {
334  /* Continuous attribute */
335 
336  BranchBits[Att] = PossibleCuts[Att] > 1 ?
337  1 + 0.5 * LogCaseNo[PossibleCuts[Att]] : 0 ;
338  }
339  }
340 
341  AttTestBits = LogCaseNo[PossibleAtts];
342 }
343 
344 
345 
346 /*************************************************************************/
347 /* */
348 /* Determine the number of bits required to encode a condition */
349 /* */
350 /*************************************************************************/
351 
352 
354 /* -------- */
355 {
356  Attribute Att;
357  float Code=0;
358  int Elts=0;
359  DiscrValue v;
360 
361  Att = C->Tested;
362  switch ( C->NodeType )
363  {
364  case BrDiscr: /* test of discrete attribute */
365  case BrThresh: /* test of continuous attribute */
366 
367  return AttTestBits + BranchBits[Att];
368 
369  case BrSubset: /* subset test on discrete attribute */
370 
371  /* Ignore subset test form for ordered attributes */
372 
373  if ( Ordered(Att) )
374  {
375  return AttTestBits + BranchBits[Att];
376  }
377 
378  ForEach(v, 1, MaxAttVal[Att])
379  {
380  if ( In(v, C->Subset) )
381  {
382  Elts++;
383  }
384  }
385  Elts = Min(Elts, AttValues[Att] - 1); /* if values not present */
386  Code = LogFact[AttValues[Att]] -
387  (LogFact[Elts] + LogFact[AttValues[Att] - Elts]);
388 
389  return AttTestBits + Code;
390  }
391 }
392 
393 
394 
395 /*************************************************************************/
396 /* */
397 /* Select initial theory. This is important, since the greedy */
398 /* optimization procedure is very sensitive to starting with */
399 /* a reasonable theory. */
400 /* */
401 /* The theory is constructed class by class. For each class, */
402 /* rules are added in confidence order until all of the cases of */
403 /* that class are covered. Rules that do not improve coverage */
404 /* are skipped. */
405 /* */
406 /*************************************************************************/
407 
408 
410 /* ---------------- */
411 {
412  ClassNo c;
413  RuleNo r, Active=0;
414 
415  ForEach(c, 1, MaxClass)
416  {
417  CoverClass(c);
418  }
419 
420  /* Remove rules that don't help coverage */
421 
422  ForEach(r, 1, NRules)
423  {
424  if ( (RuleIn[r] &= 1) ) Active++;
425  }
426 }
427 
428 
429 
430 void CoverClass(ClassNo Target)
431 /* ---------- */
432 {
433  CaseNo i;
434  double Remaining, FalsePos=0, NewFalsePos, NewTruePos;
435  RuleNo r, Best;
436  int j;
437 
438  memset(Covered, false, MaxCase+1);
439 
440  Remaining = ClassFreq[Target];
441 
442  while ( Remaining > FalsePos )
443  {
444  /* Find most accurate unused rule from a leaf */
445 
446  Best = 0;
447  ForEach(r, 1, NRules)
448  {
449  if ( Rule[r]->Rhs == Target && ! RuleIn[r] &&
450  Rule[r]->Correct >= MINITEMS )
451  {
452  if ( ! Best || Rule[r]->Vote > Rule[Best]->Vote ) Best = r;
453  }
454  }
455 
456  if ( ! Best ) return;
457 
458  /* Check increased coverage */
459 
460  NewFalsePos = NewTruePos = 0;
461 
462  Uncompress(Fires[Best], List);
463  for( j = List[0] ; j ; j-- )
464  {
465  i = List[j];
466  if ( ! Covered[i] )
467  {
468  if ( Class(Case[i]) == Target )
469  {
470  NewTruePos += Weight(Case[i]);
471  }
472  else
473  {
474  NewFalsePos += Weight(Case[i]);
475  }
476  }
477  }
478 
479  /* If coverage is not increased, set RuleIn to 2 so that
480  the rule can be removed later */
481 
482  if ( NewTruePos - NewFalsePos <= MINITEMS + Epsilon )
483  {
484  RuleIn[Best] = 2;
485  }
486  else
487  {
488  Remaining -= NewTruePos;
489  FalsePos += NewFalsePos;
490 
491  RuleIn[Best] = true;
492 
493  Uncompress(Fires[Best], List);
494  for( j = List[0] ; j ; j-- )
495  {
496  i = List[j];
497  if ( ! Covered[i] )
498  {
499  Covered[i] = true;
500  }
501  }
502  }
503  }
504 }
505 
506 
507 
508 /*************************************************************************/
509 /* */
510 /* Calculate total message length as */
511 /* THEORYFRAC * cost of transmitting theory */
512 /* + cost of identifying and correcting errors */
513 /* */
514 /* The cost of identifying errors assumes that the final theory */
515 /* will have about the same error rate as the pruned tree, so */
516 /* is approx. the sum of the corresponding messages. */
517 /* */
518 /* Message lengths are returned in units of 0.01 */
519 /* */
520 /*************************************************************************/
521 
522 
523 int MessageLength(RuleNo NR, double RuleBits, float Errs)
524 /* ------------- */
525 {
526  return rint(100.0 *
527  (THEORYFRAC * Max(0, RuleBits - LogFact[NR]) +
528  Errs * BitsErr + (MaxCase+1 - Errs) * BitsOK +
529  Errs * LogCaseNo[MaxClass-1]));
530 }
531 
532 
533 
534 /*************************************************************************/
535 /* */
536 /* Improve a subset of rules by adding and deleting rules. */
537 /* MDL costs are rounded to nearest 0.01 bit */
538 /* */
539 /*************************************************************************/
540 
541 
542 void HillClimb()
543 /* --------- */
544 {
545  RuleNo r, RuleCount=0, OriginalCount, Toggle, LastToggle=0;
546  int OutCount;
547  CaseNo i;
548  int j;
549  CaseCount Errs;
550  double RuleBits=0;
551  int LastCost=1E9, CurrentCost, AltCost, NewCost;
552  Boolean DeleteOnly=false;
553 
554  ForEach(r, 1, NRules)
555  {
556  if ( RuleIn[r] )
557  {
558  RuleBits += Bits[r];
559  RuleCount++;
560  }
561  }
562  OriginalCount = RuleCount;
563 
564  InitialiseVotes();
565  Verbosity(1, fprintf(Of, "\n"))
566 
567  /* Initialise DeltaErrs[] */
568 
569  Errs = CalculateDeltaErrs();
570 
571  /* Add or drop rule with greatest reduction in coding cost */
572 
573  while ( true )
574  {
575  CurrentCost = NewCost = MessageLength(RuleCount, RuleBits, Errs);
576 
577  Verbosity(1,
578  fprintf(Of, "\t%d rules, %.1f errs, cost=%.1f bits\n",
579  RuleCount, Errs, CurrentCost/100.0);
580 
581  if ( ! DeleteOnly && CurrentCost > LastCost )
582  {
583  fprintf(Of, "ERROR %g %g\n",
584  CurrentCost/1000.0, LastCost/100.0);
585  break;
586  })
587 
588  Toggle = OutCount = 0;
589 
590  ForEach(r, 1, NRules)
591  {
592  if ( r == LastToggle ) continue;
593 
594  if ( RuleIn[r] )
595  {
596  AltCost = MessageLength(RuleCount - 1,
597  RuleBits - Bits[r],
598  Errs + DeltaErrs[r]);
599  }
600  else
601  {
602  if ( Errs < 1E-3 || DeleteOnly ) continue;
603 
604  AltCost = MessageLength(RuleCount + 1,
605  RuleBits + Bits[r],
606  Errs + DeltaErrs[r]);
607  }
608 
609  Verbosity(2,
610  if ( ! (OutCount++ % 5) ) fprintf(Of, "\n\t\t");
611  fprintf(Of, "%d<%g=%.1f> ",
612  r, DeltaErrs[r], (AltCost - CurrentCost)/100.0))
613 
614  if ( AltCost < NewCost ||
615  AltCost == NewCost && RuleIn[r] )
616  {
617  Toggle = r;
618  NewCost = AltCost;
619  }
620  }
621 
622  if ( ! DeleteOnly && NewCost > CurrentCost )
623  {
624  DeleteOnly = true;
625  Verbosity(1, fprintf(Of, "(start delete mode)\n"))
626  }
627 
628  Verbosity(2, fprintf(Of, "\n"))
629 
630  if ( ! Toggle || DeleteOnly && RuleCount <= OriginalCount ) break;
631 
632  Verbosity(1,
633  fprintf(Of, "\t%s rule %d/%d (errs=%.1f, cost=%.1f bits)\n",
634  ( RuleIn[Toggle] ? "Delete" : "Add" ),
635  Rule[Toggle]->TNo, Rule[Toggle]->RNo,
636  Errs + DeltaErrs[Toggle], NewCost/100.0))
637 
638  /* Adjust vote information */
639 
640  Uncompress(Fires[Toggle], List);
641  for ( j = List[0] ; j ; j-- )
642  {
643  i = List[j];
644 
645  /* Downdate DeltaErrs for all rules except Toggle that cover i */
646 
647  UpdateDeltaErrs(i, -Weight(Case[i]), Toggle);
648 
649  if ( RuleIn[Toggle] )
650  {
651  TotVote[i][Rule[Toggle]->Rhs] -= Rule[Toggle]->Vote;
652  }
653  else
654  {
655  TotVote[i][Rule[Toggle]->Rhs] += Rule[Toggle]->Vote;
656  }
657 
658  CountVotes(i);
659 
660  /* Update DeltaErrs for all rules except Toggle that cover i */
661 
662  UpdateDeltaErrs(i, Weight(Case[i]), Toggle);
663  }
664 
665  /* Update information about rules selected and current errors */
666 
667  if ( RuleIn[Toggle] )
668  {
669  RuleIn[Toggle] = false;
670  RuleBits -= Bits[Toggle];
671  RuleCount--;
672  }
673  else
674  {
675  RuleIn[Toggle] = true;
676  RuleBits += Bits[Toggle];
677  RuleCount++;
678  }
679 
680  Errs += DeltaErrs[Toggle];
681  DeltaErrs[Toggle] = - DeltaErrs[Toggle];
682 
683  LastToggle = Toggle;
684  LastCost = CurrentCost;
685 
686  Progress(1.0);
687  }
688 }
689 
690 
691 
692 /*************************************************************************/
693 /* */
694 /* Determine votes for each case from initial rules */
695 /* Note: no vote for default class */
696 /* */
697 /*************************************************************************/
698 
699 
701 /* --------------- */
702 {
703  CaseNo i;
704  int j, Vote;
705  ClassNo Rhs;
706  RuleNo r;
707 
708  /* Adjust vote for each case covered by rule */
709 
710  ForEach(r, 1, NRules)
711  {
712  if ( ! RuleIn[r] ) continue;
713 
714  Rhs = Rule[r]->Rhs;
715  Vote = Rule[r]->Vote;
716 
717  Uncompress(Fires[r], List);
718  for ( j = List[0] ; j ; j-- )
719  {
720  TotVote[List[j]][Rhs] += Vote;
721  }
722  }
723 
724  /* Find the best and alternate class for each case */
725 
726  ForEach(i, 0, MaxCase)
727  {
728  CountVotes(i);
729  }
730 }
731 
732 
733 
734 /*************************************************************************/
735 /* */
736 /* Find the best and second-best class for each case using the */
737 /* current values of TotVote */
738 /* */
739 /*************************************************************************/
740 
741 
743 /* ---------- */
744 {
745  ClassNo c, First=0, Second=0;
746  int V;
747 
748  ForEach(c, 1, MaxClass)
749  {
750  if ( (V = TotVote[i][c]) )
751  {
752  if ( ! First || V > TotVote[i][First] )
753  {
754  Second = First;
755  First = c;
756  }
757  else
758  if ( ! Second || V > TotVote[i][Second] )
759  {
760  Second = c;
761  }
762  }
763  }
764 
765  TopClass[i] = First;
766  AltClass[i] = Second;
767 }
768 
769 
770 
771 /*************************************************************************/
772 /* */
773 /* Adjust DeltaErrors for all rules except Toggle that cover case i */
774 /* */
775 /*************************************************************************/
776 
777 
778 #define Prefer(d,c1,c2) ((d) > 0 || (d) == 0 && c1 < c2)
779 
780 void UpdateDeltaErrs(CaseNo i, double Delta, RuleNo Toggle)
781 /* --------------- */
782 {
783  ClassNo RealClass, Top, Alt, Rhs;
784  RuleNo r;
785  Byte *p;
786  int k;
787 
788  RealClass = Class(Case[i]);
789  Top = TopClass[i];
790  Alt = AltClass[i];
791 
792  r = 0;
793  p = CovByPtr[i];
794  ForEach(k, 1, CovBy[i])
795  {
796  /* Update r to next rule covering case i */
797 
798  while ( (*p) & 128 )
799  {
800  r += ((*p++) & 127) * 128;
801  }
802  r += *p++;
803 
804  if ( r != Toggle )
805  {
806  /* Examine effect of adding or deleting rule */
807 
808  Rhs = Rule[r]->Rhs;
809 
810  if ( RuleIn[r] )
811  {
812  if ( Rhs == Top &&
813  Prefer(TotVote[i][Alt] - (TotVote[i][Top] - Rule[r]->Vote),
814  Alt, Top) )
815  {
816  DeltaErrs[r] +=
817  (NCost[Alt][RealClass] - NCost[Top][RealClass]) * Delta;
818  }
819  }
820  else
821  {
822  if ( Rhs != Top &&
823  Prefer(TotVote[i][Rhs] + Rule[r]->Vote - TotVote[i][Top],
824  Rhs, Top) )
825  {
826  DeltaErrs[r] +=
827  (NCost[Rhs][RealClass] - NCost[Top][RealClass]) * Delta;
828  }
829  }
830  }
831  }
832 }
833 
834 
835 
836 /*************************************************************************/
837 /* */
838 /* Calculate initial value of DeltaErrs and total errors */
839 /* */
840 /*************************************************************************/
841 
842 
844 /* ------------------ */
845 {
846  RuleNo r;
847  CaseNo i;
848  double Errs=0;
849 
850  ForEach(i, 0, MaxCase)
851  {
852  Errs += Weight(Case[i]) * NCost[TopClass[i]][Class(Case[i])];
853  }
854 
855  ForEach(r, 1, NRules)
856  {
857  DeltaErrs[r] = 0;
858  }
859 
860  ForEach(i, 0, MaxCase)
861  {
862  UpdateDeltaErrs(i, Weight(Case[i]), 0);
863  }
864 
865  return Errs;
866 }
867 
868 
869 
870 /*************************************************************************/
871 /* */
872 /* Remove unrepresented values from subsets */
873 /* */
874 /*************************************************************************/
875 
876 
878 /* ------------ */
879 {
881  Attribute Att, *Atts, Last;
882  int *Bytes, d, NAtts, j, b;
883  CaseNo i;
884  CRule R;
885  RuleNo r;
886 
887  /* Allocate subsets for possible values */
888 
889  Atts = Alloc(MaxAtt+1, Attribute);
890  Bytes = Alloc(MaxAtt+1, int);
891 
892  PossibleValues = AllocZero(MaxAtt+1, Set);
893  ForEach(Att, 1, MaxAtt)
894  {
895  if ( MaxAttVal[Att] > 3 )
896  {
897  Bytes[Att] = (MaxAttVal[Att]>>3)+1;
898  PossibleValues[Att] = AllocZero(Bytes[Att], Byte);
899  }
900  }
901 
902  /* Check each rule in turn */
903 
904  ForEach(r, 1, NRules)
905  {
906  R = Rule[r];
907  NAtts = 0;
908 
909  /* Find all subset conditions */
910 
911  ForEach(d, 1, R->Size)
912  {
913  if ( R->Lhs[d]->NodeType != BrSubset ) continue;
914 
915  Atts[++NAtts] = Att = R->Lhs[d]->Tested;
916  ClearBits(Bytes[Att], PossibleValues[Att]);
917  }
918 
919  if ( ! NAtts ) continue; /* no subset conditions */
920 
921  /* Scan cases covered by this rule */
922 
923  Uncompress(Fires[r], List);
924  for ( j = List[0] ; j ; j-- )
925  {
926  i = List[j];
927 
928  /* Record values of listed attributes */
929 
930  ForEach(d, 1, NAtts)
931  {
932  Att = Atts[d];
933  SetBit(DVal(Case[i], Att), PossibleValues[Att]);
934  }
935  }
936 
937  /* Delete unrepresented values */
938 
939  ForEach(d, 1, R->Size)
940  {
941  if ( R->Lhs[d]->NodeType != BrSubset ) continue;
942 
943  Att = R->Lhs[d]->Tested;
944  ForEach(b, 0, Bytes[Att]-1)
945  {
946  R->Lhs[d]->Subset[b] &= PossibleValues[Att][b];
947  }
948 
949  if ( Elements(Att, R->Lhs[d]->Subset, &Last) == 1 )
950  {
951  R->Lhs[d]->NodeType = BrDiscr;
952  R->Lhs[d]->TestValue = Last;
953  Free(R->Lhs[d]->Subset);
954  }
955  }
956  }
957 
958  FreeVector((void **) PossibleValues, 1, MaxAtt);
959  Free(Bytes);
960  Free(Atts);
961 }
962 
963 
964 
965 /*************************************************************************/
966 /* */
967 /* Choose the default class as the one with the maximum */
968 /* weight of uncovered cases */
969 /* */
970 /*************************************************************************/
971 
972 
974 /* --------------- */
975 {
976  RuleNo r;
977  ClassNo c;
978  double *UncoveredWeight, TotUncovered=1E-3;
979  CaseNo i, j;
980 
981  memset(Covered, false, MaxCase+1);
982  UncoveredWeight = AllocZero(MaxClass+1, double);
983 
984  /* Check which cases are covered by at least one rule */
985 
986  ForEach(r, 1, NRules)
987  {
988  if ( ! RuleIn[r] ) continue;
989 
990  Uncompress(Fires[r], List);
991  for ( j = List[0] ; j ; j-- )
992  {
993  Covered[List[j]] = true;
994  }
995  }
996 
997  /* Find weights by class of uncovered cases */
998 
999  ForEach(i, 0, MaxCase)
1000  {
1001  if ( ! Covered[i] )
1002  {
1003  UncoveredWeight[ Class(Case[i]) ] += Weight(Case[i]);
1004  TotUncovered += Weight(Case[i]);
1005  }
1006  }
1007 
1008  /* Choose new default class using rel freq and rel uncovered */
1009 
1010  Verbosity(1, fprintf(Of, "\n Weights of uncovered cases:\n"));
1011 
1012  ForEach(c, 1, MaxClass)
1013  {
1014  Verbosity(1, fprintf(Of, "\t%s (%.2f): %.1f\n",
1015  ClassName[c], ClassFreq[c] / (MaxCase + 1.0),
1016  UncoveredWeight[c]));
1017 
1018  ClassSum[c] = (UncoveredWeight[c] + 1) / (TotUncovered + 2.0) +
1019  ClassFreq[c] / (MaxCase + 1.0);
1020  }
1021 
1022  Default = SelectClass(1, (Boolean) (MCost && ! CostWeights));
1023 
1024  Free(UncoveredWeight);
1025 }
1026 
1027 
1028 
1029 /*************************************************************************/
1030 /* */
1031 /* Swap two rules */
1032 /* */
1033 /*************************************************************************/
1034 
1035 
1037 /* -------- */
1038 {
1039  CRule Hold;
1040  Boolean HoldIn;
1041 
1042  Hold = Rule[A];
1043  Rule[A] = Rule[B];
1044  Rule[B] = Hold;
1045 
1046  HoldIn = RuleIn[A];
1047  RuleIn[A] = RuleIn[B];
1048  RuleIn[B] = HoldIn;
1049 }
1050 
1051 
1052 
1053 /*************************************************************************/
1054 /* */
1055 /* Order rules by utility, least important first */
1056 /* (Called after HilClimb(), so RuleIn etc already known.) */
1057 /* */
1058 /*************************************************************************/
1059 
1060 
1062 /* -------------- */
1063 {
1064  RuleNo r, *Drop, NDrop=0, NewNRules=0, Toggle;
1065  CaseNo i;
1066  int j, OutCount;
1067  double Errs=0;
1068 
1069  Verbosity(1, fprintf(Of, "\n Determining rule utility\n"))
1070 
1071  Drop = Alloc(NRules, RuleNo);
1072 
1073  /* Find the rule that has the least beneficial effect on accuracy */
1074 
1075  while ( true )
1076  {
1077  Toggle = OutCount = 0;
1078 
1079  ForEach(r, 1, NRules)
1080  {
1081  if ( ! RuleIn[r] ) continue;
1082 
1083  Verbosity(2,
1084  if ( ! (OutCount++ %10 ) ) fprintf(Of, "\n\t\t");
1085  fprintf(Of, "%d<%g> ", r, DeltaErrs[r]))
1086 
1087  if ( ! Toggle ||
1088  DeltaErrs[r] < DeltaErrs[Toggle] - 1E-3 ||
1089  ( DeltaErrs[r] < DeltaErrs[Toggle] + 1E-3 &&
1090  Bits[r] > Bits[Toggle] ) )
1091  {
1092  Toggle = r;
1093  }
1094  }
1095  Verbosity(2, fprintf(Of, "\n"))
1096 
1097  if ( ! Toggle ) break;
1098 
1099  Verbosity(1,
1100  fprintf(Of, "\tDelete rule %d/%d (errs up %.1f)\n",
1101  Rule[Toggle]->TNo, Rule[Toggle]->RNo,
1102  Errs + DeltaErrs[Toggle]))
1103 
1104  /* Adjust vote information */
1105 
1106  Uncompress(Fires[Toggle], List);
1107  for ( j = List[0] ; j ; j-- )
1108  {
1109  i = List[j];
1110 
1111  /* Downdate DeltaErrs for all rules except Toggle that cover i */
1112 
1113  UpdateDeltaErrs(i, -Weight(Case[i]), Toggle);
1114 
1115  TotVote[i][Rule[Toggle]->Rhs] -= Rule[Toggle]->Vote;
1116 
1117  CountVotes(i);
1118 
1119  /* Update DeltaErrs for all rules except Toggle that cover i */
1120 
1121  UpdateDeltaErrs(i, Weight(Case[i]), Toggle);
1122  }
1123 
1124  Drop[NDrop++] = Toggle;
1125  RuleIn[Toggle] = false;
1126 
1127  Errs += DeltaErrs[Toggle];
1128  }
1129 
1130  /* Now reverse the order */
1131 
1132  while ( --NDrop >= 0 )
1133  {
1134  NewNRules++;
1135  RuleIn[Drop[NDrop]] = true;
1136  SwapRule(Drop[NDrop], NewNRules);
1137 
1138  /* Have to alter rule number in Drop */
1139  ForEach(r, 0, NDrop-1)
1140  {
1141  if ( Drop[r] == NewNRules ) Drop[r] = Drop[NDrop];
1142  }
1143  }
1144  Free(Drop);
1145 
1146  return NewNRules;
1147 }
1148 
1149 
1150 
1151 
1152 /*************************************************************************/
1153 /* */
1154 /* Order rules by class and then by rule CF */
1155 /* */
1156 /*************************************************************************/
1157 
1158 
1160 /* ------------ */
1161 {
1162  RuleNo r, nr, NewNRules=0;
1163  ClassNo c;
1164 
1165  ForEach(c, 1, MaxClass)
1166  {
1167  while ( true )
1168  {
1169  nr = 0;
1170  ForEach(r, NewNRules+1, NRules)
1171  {
1172  if ( RuleIn[r] && Rule[r]->Rhs == c &&
1173  ( ! nr || Rule[r]->Vote > Rule[nr]->Vote ) )
1174  {
1175  nr = r;
1176  }
1177  }
1178 
1179  if ( ! nr ) break;
1180 
1181  NewNRules++;
1182  if ( nr != NewNRules )
1183  {
1184  SwapRule(NewNRules, nr);
1185  }
1186  }
1187  }
1188 
1189  return NewNRules;
1190 }
1191 
1192 
1193 
1194 /*************************************************************************/
1195 /* */
1196 /* Discard deleted rules and sequence and renumber those remaining. */
1197 /* Sort by class and then by rule CF or by utility */
1198 /* */
1199 /*************************************************************************/
1200 
1201 
1203 /* ---------- */
1204 {
1205  RuleNo r, NewNRules;
1206 
1207  NewNRules = ( UTILITY ? OrderByUtility() : OrderByClass() );
1208 
1209  ForEach(r, 1, NewNRules)
1210  {
1211  Rule[r]->RNo = r;
1212  }
1213 
1214  /* Free discarded rules */
1215 
1216  ForEach(r, NewNRules+1, NRules)
1217  {
1218  FreeRule(Rule[r]);
1219  }
1220 
1221  NRules = NewNRules;
1222 }
1223 
1224 
1225 
1226 /*************************************************************************/
1227 /* */
1228 /* Tabluate logs and log factorials (to improve speed) */
1229 /* */
1230 /*************************************************************************/
1231 
1232 
1233 void GenerateLogs(int MaxN)
1234 /* ------------ */
1235 {
1236  CaseNo i;
1237 
1238  if ( LogCaseNo )
1239  {
1240  Realloc(LogCaseNo, MaxN+2, double);
1241  Realloc(LogFact, MaxN+2, double);
1242  }
1243  else
1244  {
1245  LogCaseNo = Alloc(MaxN+2, double);
1246  LogFact = Alloc(MaxN+2, double);
1247  }
1248 
1249  LogCaseNo[0] = -1E38;
1250  LogCaseNo[1] = 0;
1251 
1252  LogFact[0] = LogFact[1] = 0;
1253 
1254  ForEach(i, 2, MaxN+1)
1255  {
1256  LogCaseNo[i] = Log((double) i);
1257  LogFact[i] = LogFact[i-1] + LogCaseNo[i];
1258  }
1259 }
1260 
1261 
1262 
1264 /* ---------------- */
1265 {
1266  FreeUnlessNil(List); List = Nil;
1267  FreeVector((void **) Fires, 1, RuleSpace-1); Fires = Nil;
1275 
1278  FreeUnlessNil(Bits); Bits = Nil;
1281  if ( TotVote )
1282  {
1283  FreeUnlessNil(TotVote[0]);
1285  }
1286 }