mageec 0.1.0
MAchine Guided Energy Efficient Compilation

formtree.c
/*************************************************************************/
/*                                                                       */
/*  Copyright 2010 Rulequest Research Pty Ltd.                           */
/*                                                                       */
/*  This file is part of C5.0 GPL Edition, a single-threaded version    */
/*  of C5.0 release 2.07.                                                */
/*                                                                       */
/*  C5.0 GPL Edition is free software: you can redistribute it and/or   */
/*  modify it under the terms of the GNU General Public License as      */
/*  published by the Free Software Foundation, either version 3 of the  */
/*  License, or (at your option) any later version.                      */
/*                                                                       */
/*  C5.0 GPL Edition is distributed in the hope that it will be useful, */
/*  but WITHOUT ANY WARRANTY; without even the implied warranty of      */
/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU   */
/*  General Public License for more details.                             */
/*                                                                       */
/*  You should have received a copy of the GNU General Public License   */
/*  (gpl.txt) along with C5.0 GPL Edition.  If not, see                  */
/*                                                                       */
/*      <http://www.gnu.org/licenses/>.                                  */
/*                                                                       */
/*************************************************************************/


/*************************************************************************/
/*                                                                       */
/*      Central tree-forming algorithm                                   */
/*      ------------------------------                                   */
/*                                                                       */
/*************************************************************************/


#include "defns.i"
#include "extern.i"

Boolean     MultiVal,       /* all atts have many values */
            Subsample;      /* use subsampling */
float       AvGainWt,       /* weight of average gain in gain threshold */
            MDLWt;          /* weight of MDL threshold ditto */

Attribute   *DList=Nil;     /* list of discrete atts */
int         NDList;         /* number in list */

DiscrValue  MaxLeaves;      /* target maximum tree size */

#define     SAMPLEUNIT      2000

float       ValThresh;      /* minimum GR when evaluating sampled atts */
Boolean     Sampled;        /* true if sampling used */

Attribute   *Waiting=Nil,   /* attribute wait list */
            NWaiting=0;     /* number waiting */



/*************************************************************************/
/*                                                                       */
/*      Allocate space for tree tables                                   */
/*                                                                       */
/*************************************************************************/


void InitialiseTreeData()
/*   ------------------  */
{
    DiscrValue v;
    Attribute Att;
    DiscrValue vMax;

    Raw    = AllocZero(TRIALS+1, Tree);
    Pruned = AllocZero(TRIALS+1, Tree);

    Tested = AllocZero(MaxAtt+1, Byte);

    Gain = AllocZero(MaxAtt+1, float);
    Info = AllocZero(MaxAtt+1, float);
    Bar  = AllocZero(MaxAtt+1, ContValue);

    EstMaxGR = AllocZero(MaxAtt+1, float);

    /* Data for subsets */

    if ( SUBSET )
    {
        InitialiseBellNumbers();
        Subset = Alloc(MaxAtt+1, Set *);

        ForEach(Att, 1, MaxAtt)
        {
            if ( Discrete(Att) && Att != ClassAtt && ! Skip(Att) )
            {
                Subset[Att] = AllocZero(MaxAttVal[Att]+1, Set);
                ForEach(v, 0, MaxAttVal[Att])
                {
                    Subset[Att][v] = Alloc((MaxAttVal[Att]>>3)+1, Byte);
                }
            }
        }
        Subsets = AllocZero(MaxAtt+1, int);
    }

    DList  = Alloc(MaxAtt+1, Attribute);
    NDList = 0;

    DFreq = AllocZero(MaxAtt+1, double *);
    ForEach(Att, 1, MaxAtt)
    {
        if ( Att == ClassAtt || Skip(Att) || ! Discrete(Att) ) continue;

        DList[NDList++] = Att;

        DFreq[Att] = Alloc(MaxClass * (MaxAttVal[Att]+1), double);
    }

    ClassFreq = AllocZero(MaxClass+1, double);
    ClassSum  = Alloc(MaxClass+1, float);

    if ( BOOST )
    {
        Vote = Alloc(MaxClass+1, float);
    }

    if ( RULES )
    {
        PossibleCuts = Alloc(MaxAtt+1, int);
    }

    /* Check whether all attributes have many discrete values */

    MultiVal = true;
    if ( ! SUBSET )
    {
        for ( Att = 1 ; MultiVal && Att <= MaxAtt ; Att++ )
        {
            if ( ! Skip(Att) && Att != ClassAtt )
            {
                MultiVal = MaxAttVal[Att] >= 0.3 * (MaxCase + 1);
            }
        }
    }

    /* See whether there are continuous attributes for subsampling */

    Subsample = false;

    /* Set parameters for RawExtraErrs() */

    /* Set up environment */

    Waiting = Alloc(MaxAtt+1, Attribute);

    vMax = Max(3, MaxDiscrVal+1);

    GEnv.Freq = Alloc(vMax+1, double *);
    ForEach(v, 0, vMax)
    {
        GEnv.Freq[v] = Alloc(MaxClass+1, double);
    }

    GEnv.ValFreq = Alloc(vMax, double);

    GEnv.ClassFreq = Alloc(MaxClass+1, double);

    GEnv.SRec = Alloc(MaxCase+1, SortRec);

    if ( SUBSET )
    {
        GEnv.SubsetInfo = Alloc(MaxDiscrVal+1, double);
        GEnv.SubsetEntr = Alloc(MaxDiscrVal+1, double);

        GEnv.MergeInfo = Alloc(MaxDiscrVal+1, double *);
        GEnv.MergeEntr = Alloc(MaxDiscrVal+1, double *);
        GEnv.WSubset   = Alloc(MaxDiscrVal+1, Set);
        ForEach(v, 1, MaxDiscrVal)
        {
            GEnv.MergeInfo[v] = Alloc(MaxDiscrVal+1, double);
            GEnv.MergeEntr[v] = Alloc(MaxDiscrVal+1, double);
            GEnv.WSubset[v]   = Alloc((MaxDiscrVal>>3)+1, Byte);
        }
    }
}
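
/* Added commentary (not part of the original C5.0 source): DFreq[Att] is a
   single flat block of MaxClass * (MaxAttVal[Att]+1) doubles, treated as a
   two-dimensional value-by-class table.  FindAllFreq() at the end of this
   file indexes it as DFreq[Att][ MaxClass * value + (class-1) ], so the
   counts for attribute value v start at offset MaxClass * v.  */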


void FreeTreeData()
/*   ------------  */
{
    Attribute Att;
    DiscrValue vMax;

    FreeUnlessNil(Raw);                         Raw = Nil;
    FreeUnlessNil(Pruned);                      Pruned = Nil;

    FreeUnlessNil(Tested);                      Tested = Nil;

    FreeUnlessNil(Gain);                        Gain = Nil;
    FreeUnlessNil(Info);                        Info = Nil;
    FreeUnlessNil(Bar);                         Bar = Nil;

    FreeUnlessNil(EstMaxGR);                    EstMaxGR = Nil;

    if ( SUBSET )
    {
        FreeVector((void **) Bell, 1, MaxDiscrVal);  Bell = Nil;

        if ( Subset )
        {
            ForEach(Att, 1, MaxAtt)
            {
                if ( Subset[Att] )
                {
                    FreeVector((void **) Subset[Att], 0, MaxAttVal[Att]);
                }
            }
            Free(Subset);                       Subset = Nil;
            Free(Subsets);                      Subsets = Nil;
        }
    }

    FreeUnlessNil(DList);                       DList = Nil;

    if ( DFreq )
    {
        ForEach(Att, 1, MaxAtt)
        {
            FreeUnlessNil(DFreq[Att]);
        }

        Free(DFreq);                            DFreq = Nil;
    }

    FreeUnlessNil(ClassFreq);                   ClassFreq = Nil;
    FreeUnlessNil(ClassSum);                    ClassSum = Nil;

    FreeUnlessNil(Vote);                        Vote = Nil;

    FreeUnlessNil(PossibleCuts);                PossibleCuts = Nil;

    vMax = Max(3, MaxDiscrVal+1);
    FreeVector((void **) GEnv.Freq, 0, vMax);
    Free(GEnv.ValFreq);
    Free(GEnv.ClassFreq);
    FreeUnlessNil(GEnv.SRec);

    if ( GEnv.SubsetInfo )
    {
        Free(GEnv.SubsetInfo);
        Free(GEnv.SubsetEntr);
        FreeVector((void **) GEnv.MergeInfo, 1, MaxDiscrVal);
        FreeVector((void **) GEnv.MergeEntr, 1, MaxDiscrVal);
        FreeVector((void **) GEnv.WSubset, 1, MaxDiscrVal);
    }

    FreeUnlessNil(Waiting);                     Waiting = Nil;
}



/*************************************************************************/
/*                                                                       */
/*      Set threshold on minimum gain as follows:                        */
/*        * when forming winnowing tree: no minimum                      */
/*        * for small problems, AvGain (usual Gain Ratio)                */
/*        * for large problems, discounted MDL                           */
/*        * for intermediate problems, interpolated                      */
/*                                                                       */
/*************************************************************************/


void SetMinGainThresh()
/*   ----------------  */
{
    float Frac;

    /* Set AvGainWt and MDLWt */

    if ( Now == WINNOWATTS )
    {
        AvGainWt = MDLWt = 0.0;
    }
    else
    if ( (MaxCase+1) / MaxClass <= 500 )
    {
        AvGainWt = 1.0;
        MDLWt    = 0.0;
    }
    else
    if ( (MaxCase+1) / MaxClass >= 1000 )
    {
        AvGainWt = 0.0;
        MDLWt    = 0.9;
    }
    else
    {
        Frac = ((MaxCase+1) / MaxClass - 500) / 500.0;

        AvGainWt = 1 - Frac;
        MDLWt    = 0.9 * Frac;
    }
}
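
/* Added worked example (not part of the original C5.0 source): with
   MaxCase+1 = 7500 cases and MaxClass = 10, the integer ratio is
   7500 / 10 = 750 cases per class, so

       Frac     = (750 - 500) / 500.0 = 0.5
       AvGainWt = 1 - 0.5             = 0.5
       MDLWt    = 0.9 * 0.5           = 0.45

   i.e. the gain threshold is interpolated halfway between the
   small-problem (pure average gain) and large-problem (discounted MDL)
   regimes.  */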


/*************************************************************************/
/*                                                                       */
/*      Build a decision tree for the cases Fp through Lp                */
/*                                                                       */
/*        - if all cases are of the same class, the tree is a leaf       */
/*          labelled with this class                                     */
/*                                                                       */
/*        - for each attribute, calculate the potential information      */
/*          provided by a test on the attribute (based on the            */
/*          probabilities of each case having a particular value for     */
/*          the attribute), and the gain in information that would       */
/*          result from a test on the attribute (based on the            */
/*          probabilities of each case with a particular value for       */
/*          the attribute being of a particular class)                   */
/*                                                                       */
/*        - on the basis of these figures, and depending on the current  */
/*          selection criterion, find the best attribute to branch on.   */
/*          Note: this version will not allow a split on an attribute    */
/*          unless two or more subsets have at least MINITEMS cases      */
/*                                                                       */
/*        - try branching and test whether the resulting tree is better  */
/*          than forming a leaf                                          */
/*                                                                       */
/*************************************************************************/


void FormTree(CaseNo Fp, CaseNo Lp, int Level, Tree *Result)
/*   --------  */
{
    CaseCount Cases=0, TreeErrs=0;
    Attribute BestAtt;
    ClassNo c, BestLeaf=1, Least=1;
    Tree Node;
    DiscrValue v;

    assert(Fp >= 0 && Lp >= Fp && Lp <= MaxCase);

    /* Make a single pass through the cases to determine class frequencies
       and value/class frequencies for all discrete attributes */

    FindAllFreq(Fp, Lp);

    /* Choose the best leaf and the least prevalent class */

    ForEach(c, 2, MaxClass)
    {
        if ( ClassFreq[c] > ClassFreq[BestLeaf] )
        {
            BestLeaf = c;
        }
        else
        if ( ClassFreq[c] > 0.1 && ClassFreq[c] < ClassFreq[Least] )
        {
            Least = c;
        }
    }

    ForEach(c, 1, MaxClass)
    {
        Cases += ClassFreq[c];
    }

    MaxLeaves = ( LEAFRATIO > 0 ? rint(LEAFRATIO * Cases) : 1E6 );

    *Result = Node =
        Leaf(ClassFreq, BestLeaf, Cases, Cases - ClassFreq[BestLeaf]);

    Verbosity(1,
        fprintf(Of, "\n<%d> %d cases", Level, No(Fp,Lp));
        if ( fabs(No(Fp,Lp) - Cases) >= 0.1 )
        {
            fprintf(Of, ", total weight %.1f", Cases);
        }
        fprintf(Of, "\n"))

    /* Do not try to split if:
       - all cases are of the same class
       - there are not enough cases to split */

    if ( ClassFreq[BestLeaf] >= 0.999 * Cases ||
         Cases < 2 * MINITEMS ||
         MaxLeaves < 2 )
    {
        if ( Now == FORMTREE ) Progress(Cases);
        return;
    }

    /* Calculate base information */

    GlobalBaseInfo = TotalInfo(ClassFreq, 1, MaxClass) / Cases;

    /* Perform preliminary evaluation if using subsampling.
       Must expect at least 10 of least prevalent class */

    ValThresh = 0;
    if ( Subsample && No(Fp, Lp) > 5 * MaxClass * SAMPLEUNIT &&
         (ClassFreq[Least] * MaxClass * SAMPLEUNIT) / No(Fp, Lp) >= 10 )
    {
        SampleEstimate(Fp, Lp, Cases);
        Sampled = true;
    }
    else
    {
        Sampled = false;
    }

    BestAtt = ChooseSplit(Fp, Lp, Cases, Sampled);

    /* Decide whether to branch or not */

    if ( BestAtt == None )
    {
        Verbosity(1, fprintf(Of, "\tno sensible splits\n"))
        if ( Now == FORMTREE ) Progress(Cases);
    }
    else
    {
        Verbosity(1,
            fprintf(Of, "\tbest attribute %s", AttName[BestAtt]);
            if ( Continuous(BestAtt) )
            {
                fprintf(Of, " cut %.3f", Bar[BestAtt]);
            }
            fprintf(Of, " inf %.3f gain %.3f val %.3f\n",
                    Info[BestAtt], Gain[BestAtt], Gain[BestAtt] / Info[BestAtt]))

        /* Build a node of the selected test */

        if ( Discrete(BestAtt) )
        {
            if ( SUBSET && MaxAttVal[BestAtt] > 3 && ! Ordered(BestAtt) )
            {
                SubsetTest(Node, BestAtt);
            }
            else
            {
                DiscreteTest(Node, BestAtt);
            }
        }
        else
        {
            ContinTest(Node, BestAtt);
        }

        /* Carry out the recursive divide-and-conquer */

        ++Tested[BestAtt];

        Divide(Node, Fp, Lp, Level);

        --Tested[BestAtt];

        /* See whether we would have been no worse off with a leaf */

        ForEach(v, 1, Node->Forks)
        {
            TreeErrs += Node->Branch[v]->Errors;
        }

        if ( TreeErrs >= 0.999 * Node->Errors )
        {
            Verbosity(1,
                fprintf(Of, "<%d> Collapse tree for %d cases to leaf %s\n",
                        Level, No(Fp,Lp), ClassName[BestLeaf]))

            UnSprout(Node);
        }
        else
        {
            Node->Errors = TreeErrs;
        }
    }
}
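
/* Illustrative usage sketch (an assumption, not code from this file):
   tree construction starts from the whole case window, e.g.

       Tree T;

       InitialiseTreeData();
       FormTree(0, MaxCase, 0, &T);    <- T now roots the unpruned tree

   after which FormTree() recurses via Divide() on each branch's group of
   cases.  The actual call site lives elsewhere in C5.0.  */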


/*************************************************************************/
/*                                                                       */
/*      Estimate Gain[] and Info[] using sample                          */
/*                                                                       */
/*************************************************************************/


void SampleEstimate(CaseNo Fp, CaseNo Lp, CaseCount Cases)
/*   --------------  */
{
    CaseNo SLp, SampleSize;
    CaseCount NewCases;
    Attribute Att;
    float GR;

    /* Phase 1: evaluate all discrete attributes and record best GR */

    ForEach(Att, 1, MaxAtt)
    {
        Gain[Att] = None;

        if ( Discrete(Att) )
        {
            EvalDiscrSplit(Att, Cases);

            if ( Info[Att] > Epsilon &&
                 (GR = Gain[Att] / Info[Att]) > ValThresh )
            {
                ValThresh = GR;
            }
        }
    }

    /* Phase 2: generate sample */

    SampleSize = MaxClass * SAMPLEUNIT;
    Sample(Fp, Lp, SampleSize);
    SLp = Fp + SampleSize - 1;

    /* Phase 3: evaluate continuous attributes using sample */

    NewCases   = CountCases(Fp, SLp);
    SampleFrac = NewCases / Cases;
    NWaiting   = 0;

    ForEach(Att, 1, MaxAtt)
    {
        if ( Continuous(Att) )
        {
            Waiting[NWaiting++] = Att;
        }
    }

    ProcessQueue(Fp, SLp, NewCases);

    SampleFrac = 1.0;
}


/*************************************************************************/
/*                                                                       */
/*      Sample N cases from cases Fp through Lp                          */
/*                                                                       */
/*************************************************************************/


void Sample(CaseNo Fp, CaseNo Lp, CaseNo N)
/*   ------  */
{
    CaseNo i, j;
    double Interval;

    Interval = No(Fp, Lp) / (double) N;

    ForEach(i, 0, N-1)
    {
        j = (i + 0.5) * Interval;

        assert(j >= 0 && Fp + j <= Lp);

        Swap(Fp + i, Fp + j);
    }
}
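
/* Added worked example (not part of the original C5.0 source): sampling
   N = 2000 cases from a window of No(Fp,Lp) = 10000 gives Interval = 5.0,
   so the chosen offsets are j = (int)(0.5*5) = 2, (int)(1.5*5) = 7, 12,
   17, ... -- one case from each run of five, swapped into the first N
   positions of the window.  */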


/*************************************************************************/
/*                                                                       */
/*      Evaluate splits and choose best attribute to split on.           */
/*      If Sampled, Gain[] and Info[] have been estimated on the         */
/*      sample, and unlikely candidates are not evaluated on all cases   */
/*                                                                       */
/*************************************************************************/


Attribute ChooseSplit(CaseNo Fp, CaseNo Lp, CaseCount Cases, Boolean Sampled)
/*        -----------  */
{
    Attribute Att;
    int i, j;

    /* For each available attribute, find the information and gain */

    NWaiting = 0;

    if ( Sampled )
    {
        /* If samples have been used, do not re-evaluate discrete atts
           or atts that have low GR */

        for ( Att = MaxAtt ; Att > 0 ; Att-- )
        {
            if ( ! Continuous(Att) ) continue;

            if ( EstMaxGR[Att] >= ValThresh )
            {
                /* Add attributes in reverse order of estimated max GR */

                for ( i = 0 ;
                      i < NWaiting && EstMaxGR[Waiting[i]] < EstMaxGR[Att] ;
                      i++ )
                    ;

                for ( j = NWaiting-1 ; j >= i ; j-- )
                {
                    Waiting[j+1] = Waiting[j];
                }
                NWaiting++;

                Waiting[i] = Att;
            }
            else
            {
                /* Don't use -- attribute hasn't been fully evaluated.
                   Leave Gain unchanged to get correct count for Possible */

                Info[Att] = -1E6;   /* negative so GR also negative */
            }
        }
    }
    else
    {
        for ( Att = MaxAtt ; Att > 0 ; Att-- )
        {
            Gain[Att] = None;

            if ( Skip(Att) || Att == ClassAtt )
            {
                continue;
            }

            Waiting[NWaiting++] = Att;
        }
    }

    ProcessQueue(Fp, Lp, Cases);

    return FindBestAtt(Cases);
}



void ProcessQueue(CaseNo WFp, CaseNo WLp, CaseCount WCases)
/*   ------------  */
{
    Attribute Att;
    float GR;

    for ( ; NWaiting > 0 ; )
    {
        Att = Waiting[--NWaiting];

        if ( Discrete(Att) )
        {
            EvalDiscrSplit(Att, WCases);
        }
        else
        if ( SampleFrac < 1 )
        {
            EstimateMaxGR(Att, WFp, WLp);
        }
        else
        if ( Sampled )
        {
            Info[Att] = -1E16;

            if ( EstMaxGR[Att] > ValThresh )
            {
                EvalContinuousAtt(Att, WFp, WLp);

                if ( Info[Att] > Epsilon &&
                     (GR = Gain[Att] / Info[Att]) > ValThresh )
                {
                    ValThresh = GR;
                }
            }
        }
        else
        {
            EvalContinuousAtt(Att, WFp, WLp);
        }
    }
}


/*************************************************************************/
/*                                                                       */
/*      Adjust each attribute's gain to reflect choice and               */
/*      select att with maximum GR                                       */
/*                                                                       */
/*************************************************************************/


Attribute FindBestAtt(CaseCount Cases)
/*        -----------  */
{
    double BestVal, Val, MinGain=1E6, AvGain=0, MDL;
    Attribute Att, BestAtt, Possible=0;
    DiscrValue NBr, BestNBr=MaxDiscrVal+1;

    ForEach(Att, 1, MaxAtt)
    {
        /* Update the number of possible attributes for splitting and
           average gain (unless very many values) */

        if ( Gain[Att] >= Epsilon &&
             ( MultiVal || MaxAttVal[Att] < 0.3 * (MaxCase + 1) ) )
        {
            Possible++;
            AvGain += Gain[Att];
        }
        else
        {
            Gain[Att] = None;
        }
    }

    /* Set threshold on minimum gain */

    if ( ! Possible ) return None;

    AvGain /= Possible;
    MDL     = Log(Possible) / Cases;
    MinGain = AvGain * AvGainWt + MDL * MDLWt;

    Verbosity(2,
        fprintf(Of, "\tav gain=%.3f, MDL (%d) = %.3f, min=%.3f\n",
                AvGain, Possible, MDL, MinGain))

    /* Find best attribute according to Gain Ratio criterion subject
       to threshold on minimum gain */

    BestVal = -Epsilon;
    BestAtt = None;

    ForEach(Att, 1, MaxAtt)
    {
        if ( Gain[Att] >= 0.999 * MinGain && Info[Att] > 0 )
        {
            Val = Gain[Att] / Info[Att];
            NBr = ( MaxAttVal[Att] <= 3 || Ordered(Att) ? 3 :
                    SUBSET ? Subsets[Att] : MaxAttVal[Att] );

            if ( Val > BestVal ||
                 Val > 0.999 * BestVal &&
                 ( NBr < BestNBr ||
                   NBr == BestNBr && Gain[Att] > Gain[BestAtt] ) )
            {
                BestAtt = Att;
                BestVal = Val;
                BestNBr = NBr;
            }
        }
    }

    return BestAtt;
}
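
/* Added worked example (not part of the original C5.0 source, and assuming
   Log() is C5.0's base-2 logarithm): with Possible = 8 candidate
   attributes, AvGain = 0.12, Cases = 800, and the interpolated weights
   AvGainWt = 0.5, MDLWt = 0.45 from SetMinGainThresh():

       MDL     = Log(8) / 800 = 3 / 800       = 0.00375
       MinGain = 0.12 * 0.5 + 0.00375 * 0.45 ~= 0.0617

   so only attributes whose Gain is at least 0.999 * MinGain compete on
   Gain Ratio in the loop above.  */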


/*************************************************************************/
/*                                                                       */
/*      Evaluate split on Att                                            */
/*                                                                       */
/*************************************************************************/


void EvalDiscrSplit(Attribute Att, CaseCount Cases)
/*   --------------  */
{
    DiscrValue v, NBr;

    Gain[Att] = None;

    if ( Skip(Att) || Att == ClassAtt ) return;

    if ( Ordered(Att) )
    {
        EvalOrderedAtt(Att, Cases);
        NBr = ( GEnv.ValFreq[1] > 0.5 ? 3 : 2 );
    }
    else
    if ( SUBSET && MaxAttVal[Att] > 3 )
    {
        EvalSubset(Att, Cases);
        NBr = Subsets[Att];
    }
    else
    if ( ! Tested[Att] )
    {
        EvalDiscreteAtt(Att, Cases);

        NBr = 0;
        ForEach(v, 1, MaxAttVal[Att])
        {
            if ( GEnv.ValFreq[v] > 0.5 ) NBr++;
        }
    }
    else
    {
        NBr = 0;
    }

    /* Check that this test will not give too many leaves */

    if ( NBr > MaxLeaves + 1 )
    {
        Verbosity(2,
            fprintf(Of, "\t(cancelled -- %d leaves, max %d)\n", NBr, MaxLeaves))

        Gain[Att] = None;
    }
}


/*************************************************************************/
/*                                                                       */
/*      Form the subtrees for the given node                             */
/*                                                                       */
/*************************************************************************/


void Divide(Tree T, CaseNo Fp, CaseNo Lp, int Level)
/*   ------  */
{
    CaseNo Bp, Ep, Missing, Cases, i;
    CaseCount KnownCases, MissingCases, BranchCases;
    Attribute Att;
    double Factor;
    DiscrValue v;
    Boolean PrevUnitWeights;

    PrevUnitWeights = UnitWeights;

    Att = T->Tested;
    Missing = (Ep = Group(0, Fp, Lp, T)) - Fp + 1;

    KnownCases = T->Cases - (MissingCases = CountCases(Fp, Ep));

    if ( Missing )
    {
        UnitWeights = false;

        /* If using costs, must adjust branch factors to undo effects of
           reweighting cases */

        if ( CostWeights )
        {
            KnownCases = SumNocostWeights(Ep+1, Lp);
        }

        /* If there are many cases with missing values and many branches,
           skip cases whose weight < 0.1 */

        if ( (Cases = No(Fp,Lp)) > 1000 &&
             Missing > 0.5 * Cases &&
             T->Forks >= 10 )
        {
            ForEach(i, Fp, Ep)
            {
                if ( Weight(Case[i]) < 0.1 )
                {
                    Missing--;
                    MissingCases -= Weight(Case[i]);
                    Swap(Fp, i);
                    Fp++;
                }
            }

            assert(Missing >= 0);
        }
    }

    Bp = Fp;
    ForEach(v, 1, T->Forks)
    {
        Ep = Group(v, Bp + Missing, Lp, T);

        assert(Bp + Missing <= Lp+1 && Ep <= Lp);

        /* Bp -> first value in missing + remaining values
           Ep -> last value in missing + current group */

        BranchCases = CountCases(Bp + Missing, Ep);

        Factor = ( ! Missing ? 0 :
                   ! CostWeights ? BranchCases / KnownCases :
                   SumNocostWeights(Bp + Missing, Ep) / KnownCases );

        if ( BranchCases + Factor * MissingCases >= MinLeaf )
        {
            if ( Missing )
            {
                /* Adjust weights of cases with missing values */

                ForEach(i, Bp, Bp + Missing - 1)
                {
                    Weight(Case[i]) *= Factor;
                }
            }

            FormTree(Bp, Ep, Level+1, &T->Branch[v]);

            /* Restore weights if changed */

            if ( Missing )
            {
                for ( i = Ep ; i >= Bp ; i-- )
                {
                    if ( Unknown(Case[i], Att) )
                    {
                        Weight(Case[i]) /= Factor;
                        Swap(i, Ep);
                        Ep--;
                    }
                }
            }

            Bp = Ep+1;
        }
        else
        {
            T->Branch[v] = Leaf(Nil, T->Leaf, 0.0, 0.0);
        }
    }

    UnitWeights = PrevUnitWeights;
}
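
/* Added worked example (not part of the original C5.0 source): suppose 10
   of 100 cases have an unknown value of the tested attribute, so
   KnownCases = 90, and one branch receives BranchCases = 30 of the known
   cases.  Without costs, Factor = 30 / 90 = 1/3, so every unknown-value
   case follows this branch with its weight scaled by 1/3, and the scaling
   is undone (weight divided by Factor) after the branch's subtree has
   been built.  */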


/*************************************************************************/
/*                                                                       */
/*      Group together the cases corresponding to branch V of a test     */
/*      and return the index of the last such case                       */
/*                                                                       */
/*      Note: if V equals zero, group the unknown values                 */
/*                                                                       */
/*************************************************************************/


CaseNo Group(DiscrValue V, CaseNo Bp, CaseNo Ep, Tree TestNode)
/*     -----  */
{
    CaseNo i;
    Attribute Att;
    ContValue Thresh;
    Set SS;

    Att = TestNode->Tested;

    if ( ! V )
    {
        /* Group together unknown values (if any) */

        if ( SomeMiss[Att] )
        {
            ForEach(i, Bp, Ep)
            {
                if ( Unknown(Case[i], Att) )
                {
                    Swap(Bp, i);
                    Bp++;
                }
            }
        }
    }
    else /* skip non-existent N/A values */
    if ( V != 1 || TestNode->NodeType == BrSubset || SomeNA[Att] )
    {
        /* Group cases on the value of attribute Att, and depending
           on the type of branch */

        switch ( TestNode->NodeType )
        {
            case BrDiscr:

                ForEach(i, Bp, Ep)
                {
                    if ( DVal(Case[i], Att) == V )
                    {
                        Swap(Bp, i);
                        Bp++;
                    }
                }
                break;

            case BrThresh:

                Thresh = TestNode->Cut;
                ForEach(i, Bp, Ep)
                {
                    if ( V == 1 ? NotApplic(Case[i], Att) :
                         (CVal(Case[i], Att) <= Thresh) == (V == 2) )
                    {
                        Swap(Bp, i);
                        Bp++;
                    }
                }
                break;

            case BrSubset:

                SS = TestNode->Subset[V];
                ForEach(i, Bp, Ep)
                {
                    if ( In(XDVal(Case[i], Att), SS) )
                    {
                        Swap(Bp, i);
                        Bp++;
                    }
                }
                break;
        }
    }

    return Bp - 1;
}
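
/* Added illustrative trace (not part of the original C5.0 source): for a
   BrDiscr test with V = 2 and a window holding attribute values [2,1,2,3],
   the loop swaps matching cases to the front, leaving [2,2,1,3]; Bp has
   advanced past two cases, so Group() returns the index of the second
   one -- the last case taking branch 2.  */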


/*************************************************************************/
/*                                                                       */
/*      Return the total weight of cases from Fp to Lp                   */
/*                                                                       */
/*************************************************************************/


CaseCount CountCases(CaseNo Fp, CaseNo Lp)
/*        ----------  */
{
    double Sum=0.0;
    CaseNo i;

    assert(Fp >= 0 && Lp >= Fp-1 && Lp <= MaxCase);

    ForEach(i, Fp, Lp)
    {
        Sum += Weight(Case[i]);
    }

    return Sum;
}


/*************************************************************************/
/*                                                                       */
/*      Special version to undo the weightings associated with costs     */
/*                                                                       */
/*************************************************************************/


double SumNocostWeights(CaseNo Fp, CaseNo Lp)
/*     ----------------  */
{
    double Sum=0.0;
    CaseNo i;

    assert(Fp >= 0 && Lp >= Fp-1 && Lp <= MaxCase);

    ForEach(i, Fp, Lp)
    {
        Sum += Weight(Case[i]) / WeightMul[Class(Case[i])];
    }

    return Sum;
}
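
/* Added example (not part of the original C5.0 source): if cost weighting
   multiplied every class-2 case's weight by WeightMul[2] = 1.5, a class-2
   case carrying Weight = 1.5 contributes 1.5 / 1.5 = 1.0 here, recovering
   its original unit weight.  */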


/*************************************************************************/
/*                                                                       */
/*      Generate class frequency distribution                            */
/*                                                                       */
/*************************************************************************/

1090 
1091 void FindClassFreq(double *CF, CaseNo Fp, CaseNo Lp)
1092 /* ------------- */
1093 {
1094  ClassNo c;
1095  CaseNo i;
1096 
1097  assert(Fp >= 0 && Lp >= Fp && Lp <= MaxCase);
1098 
1099  ForEach(c, 0, MaxClass)
1100  {
1101  CF[c] = 0;
1102  }
1103 
1104  ForEach(i, Fp, Lp)
1105  {
1106  assert(Class(Case[i]) >= 1 && Class(Case[i]) <= MaxClass);
1107 
1108  CF[ Class(Case[i]) ] += Weight(Case[i]);
1109  }
1110 }


/*************************************************************************/
/*                                                                       */
/*      Find all discrete frequencies                                    */
/*                                                                       */
/*************************************************************************/


void FindAllFreq(CaseNo Fp, CaseNo Lp)
/*   -----------  */
{
    ClassNo c;
    CaseNo i;
    Attribute Att, a;
    CaseCount w;
    int x;

    /* Zero all values */

    ForEach(c, 0, MaxClass)
    {
        ClassFreq[c] = 0;
    }

    for ( a = 0 ; a < NDList ; a++ )
    {
        Att = DList[a];
        for ( x = MaxClass * (MaxAttVal[Att]+1) - 1 ; x >= 0 ; x-- )
        {
            DFreq[Att][x] = 0;
        }
    }

    /* Scan cases */

    ForEach(i, Fp, Lp)
    {
        ClassFreq[ (c=Class(Case[i])) ] += (w=Weight(Case[i]));

        for ( a = 0 ; a < NDList ; a++ )
        {
            Att = DList[a];
            DFreq[Att][ MaxClass * XDVal(Case[i], Att) + (c-1) ] += w;
        }
    }
}