mageec  0.1.0
MAchine Guided Energy Efficient Compilation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
subset.c
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Copyright 2010 Rulequest Research Pty Ltd. */
4 /* */
5 /* This file is part of C5.0 GPL Edition, a single-threaded version */
6 /* of C5.0 release 2.07. */
7 /* */
8 /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9 /* modify it under the terms of the GNU General Public License as */
10 /* published by the Free Software Foundation, either version 3 of the */
11 /* License, or (at your option) any later version. */
12 /* */
13 /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16 /* General Public License for more details. */
17 /* */
18 /* You should have received a copy of the GNU General Public License */
19 /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20 /* */
21 /* <http://www.gnu.org/licenses/>. */
22 /* */
23 /*************************************************************************/
24 
25 
26 
27 /*************************************************************************/
28 /* */
29 /* Evaluation of discrete attribute subsets */
30 /* ---------------------------------------- */
31 /* */
32 /*************************************************************************/
33 
34 
35 #include "defns.i"
36 #include "extern.i"
37 
38 
39 
40 /*************************************************************************/
41 /* */
42 /* Set up tables for subsets */
43 /* */
44 /*************************************************************************/
45 
46 
48 /* --------------------- */
49 {
50  DiscrValue n, k;
51 
52  /* Table of Bell numbers (used for subset test penalties) */
53 
54  Bell = AllocZero(MaxDiscrVal+1, double *);
55  ForEach(n, 1, MaxDiscrVal)
56  {
57  Bell[n] = AllocZero(n+1, double);
58  ForEach(k, 1, n)
59  {
60  Bell[n][k] = ( k == 1 || k == n ? 1 :
61  Bell[n-1][k-1] + k * Bell[n-1][k] );
62  }
63  }
64 }
65 
66 
67 
68 /*************************************************************************/
69 /* */
70 /* Evaluate subsetting a discrete attribute and form the chosen */
71 /* subsets Subset[Att][], setting Subsets[Att] to the number of */
72 /* subsets, and the Info[] and Gain[] of a test on the attribute */
73 /* */
74 /*************************************************************************/
75 
76 
77 void EvalSubset(Attribute Att, CaseCount Cases)
78 /* ---------- */
79 {
80  DiscrValue V1, V2, V3, BestV1, BestV2, InitialBlocks, First=1, Prelim=0;
81  ClassNo c;
82  double BaseInfo, ThisGain, ThisInfo, Penalty, UnknownRate,
83  Val, BestVal, BestGain, BestInfo, PrevGain, PrevInfo;
84  int MissingValues=0;
85  CaseCount KnownCases;
86  Boolean Better;
87 
88  /* First compute Freq[][], ValFreq[], base info, and the gain
89  and total info of a split on discrete attribute Att */
90 
91  SetDiscrFreq(Att);
92 
93  GEnv.ReasonableSubsets = 0;
94  ForEach(c, 1, MaxAttVal[Att])
95  {
96  if ( GEnv.ValFreq[c] >= MINITEMS ) GEnv.ReasonableSubsets++;
97  }
98 
99  if ( ! GEnv.ReasonableSubsets )
100  {
101  Verbosity(2,
102  fprintf(Of, "\tAtt %s: poor initial split\n", AttName[Att]))
103 
104  return;
105  }
106 
107  KnownCases = Cases - GEnv.ValFreq[0];
108  UnknownRate = GEnv.ValFreq[0] / Cases;
109 
110  BaseInfo = ( ! GEnv.ValFreq[0] ? GlobalBaseInfo :
111  DiscrKnownBaseInfo(KnownCases, MaxAttVal[Att]) );
112 
113  PrevGain = ComputeGain(BaseInfo, UnknownRate, MaxAttVal[Att], KnownCases);
114  PrevInfo = TotalInfo(GEnv.ValFreq, 0, MaxAttVal[Att]) / Cases;
115  BestVal = PrevGain / PrevInfo;
116 
117  Verbosity(2, fprintf(Of, "\tAtt %s", AttName[Att]))
118  Verbosity(3, PrintDistribution(Att, 0, MaxAttVal[Att], GEnv.Freq,
119  GEnv.ValFreq, true))
120  Verbosity(2,
121  fprintf(Of, "\tinitial inf %.3f, gain %.3f, val=%.3f\n",
122  PrevInfo, PrevGain, BestVal))
123 
124  /* Eliminate unrepresented attribute values from Freq[] and ValFreq[]
125  and form a separate subset for each represented attribute value.
126  Unrepresented N/A values are ignored */
127 
128  GEnv.Bytes = (MaxAttVal[Att]>>3) + 1;
129  ClearBits(GEnv.Bytes, Subset[Att][0]);
130 
131  GEnv.Blocks = 0;
132  ForEach(V1, 1, MaxAttVal[Att])
133  {
134  if ( GEnv.ValFreq[V1] > Epsilon || V1 == 1 && SomeNA[Att] )
135  {
136  if ( ++GEnv.Blocks < V1 )
137  {
138  GEnv.ValFreq[GEnv.Blocks] = GEnv.ValFreq[V1];
139  ForEach(c, 1, MaxClass)
140  {
141  GEnv.Freq[GEnv.Blocks][c] = GEnv.Freq[V1][c];
142  }
143  }
144  ClearBits(GEnv.Bytes, GEnv.WSubset[GEnv.Blocks]);
145  SetBit(V1, GEnv.WSubset[GEnv.Blocks]);
146  CopyBits(GEnv.Bytes, GEnv.WSubset[GEnv.Blocks],
147  Subset[Att][GEnv.Blocks]);
148 
149  /* Cannot merge N/A values with other blocks */
150 
151  if ( V1 == 1 ) First = 2;
152  }
153  else
154  if ( V1 != 1 )
155  {
156  SetBit(V1, Subset[Att][0]);
157  MissingValues++;
158  }
159  }
160 
161  /* Set n-way branch as initial test */
162 
163  Gain[Att] = PrevGain;
164  Info[Att] = PrevInfo;
165  Subsets[Att] = InitialBlocks = GEnv.Blocks;
166 
167  /* As a preliminary step, merge values with identical distributions */
168 
169  ForEach(V1, First, GEnv.Blocks-1)
170  {
171  ForEach(V2, V1+1, GEnv.Blocks)
172  {
173  if ( SameDistribution(V1, V2) )
174  {
175  Prelim = V1;
176  AddBlock(V1, V2);
177  }
178  }
179 
180  /* Eliminate any merged values */
181 
182  if ( Prelim == V1 )
183  {
184  V3 = V1;
185 
186  ForEach(V2, V1+1, GEnv.Blocks)
187  {
188  if ( GEnv.ValFreq[V2] && ++V3 != V2 )
189  {
190  MoveBlock(V3, V2);
191  }
192  }
193 
194  GEnv.Blocks = V3;
195  }
196  }
197 
198  if ( Prelim )
199  {
200  PrevInfo = TotalInfo(GEnv.ValFreq, 0, GEnv.Blocks) / Cases;
201 
202  Penalty = ( finite(Bell[InitialBlocks][GEnv.Blocks]) ?
203  Log(Bell[InitialBlocks][GEnv.Blocks]) :
204  (InitialBlocks-GEnv.Blocks+1) * Log(GEnv.Blocks) );
205 
206  Val = (PrevGain - Penalty / Cases) / PrevInfo;
207  Better = ( GEnv.Blocks >= 2 && GEnv.ReasonableSubsets >= 2 &&
208  Val >= BestVal );
209 
210  Verbosity(2,
211  {
212  fprintf(Of, "\tprelim merges -> inf %.3f, gain %.3f, val %.3f%s%s",
213  PrevInfo, PrevGain, Val,
214  ( Better ? " **" : "" ),
215  (VERBOSITY > 2 ? "" : "\n" ));
216  Verbosity(3, PrintDistribution(Att, 0, GEnv.Blocks, GEnv.Freq,
217  GEnv.ValFreq, false))
218  })
219 
220  if ( Better )
221  {
222  Subsets[Att] = GEnv.Blocks;
223 
224  ForEach(V1, 1, GEnv.Blocks)
225  {
226  CopyBits(GEnv.Bytes, GEnv.WSubset[V1], Subset[Att][V1]);
227  }
228 
229  Info[Att] = PrevInfo;
230  Gain[Att] = PrevGain - Penalty / KnownCases;
231  BestVal = Val;
232  }
233  }
234 
235  /* Determine initial information and entropy values */
236 
237  ForEach(V1, 1, GEnv.Blocks)
238  {
239  GEnv.SubsetInfo[V1] = -GEnv.ValFreq[V1] * Log(GEnv.ValFreq[V1] / Cases);
240  GEnv.SubsetEntr[V1] = TotalInfo(GEnv.Freq[V1], 1, MaxClass);
241  }
242 
243  ForEach(V1, First, GEnv.Blocks-1)
244  {
245  ForEach(V2, V1+1, GEnv.Blocks)
246  {
247  EvaluatePair(V1, V2, Cases);
248  }
249  }
250 
251  /* Examine possible pair mergers and hill-climb */
252 
253  while ( GEnv.Blocks > 2 )
254  {
255  BestV1 = 0;
256  BestGain = -Epsilon;
257 
258  /* For each possible pair of values, calculate the gain and
259  total info of a split in which they are treated as one.
260  Keep track of the pair with the best gain. */
261 
262  ForEach(V1, First, GEnv.Blocks-1)
263  {
264  ForEach(V2, V1+1, GEnv.Blocks)
265  {
266  if ( GEnv.ReasonableSubsets == 2 &&
267  GEnv.ValFreq[V1] >= MINITEMS-Epsilon &&
268  GEnv.ValFreq[V2] >= MINITEMS-Epsilon )
269  {
270  continue;
271  }
272 
273  ThisGain = PrevGain -
274  ((1-UnknownRate) / KnownCases) *
275  (GEnv.MergeEntr[V1][V2] -
276  (GEnv.SubsetEntr[V1] + GEnv.SubsetEntr[V2]));
277  ThisInfo = PrevInfo + (GEnv.MergeInfo[V1][V2] -
278  (GEnv.SubsetInfo[V1] + GEnv.SubsetInfo[V2])) / Cases;
279  Verbosity(3,
280  fprintf(Of, "\t combine %d %d info %.3f gain %.3f\n",
281  V1, V2, ThisInfo, ThisGain))
282 
283  /* See whether this merge has the best gain so far */
284 
285  if ( ThisGain > BestGain+Epsilon )
286  {
287  BestGain = ThisGain;
288  BestInfo = ThisInfo;
289  BestV1 = V1;
290  BestV2 = V2;
291  }
292  }
293  }
294 
295  if ( ! BestV1 ) break;
296 
297  PrevGain = BestGain;
298  PrevInfo = BestInfo;
299 
300  /* Determine penalty as log of Bell number. If number is too
301  large, use an approximation of log */
302 
303  Penalty = ( finite(Bell[InitialBlocks][GEnv.Blocks-1]) ?
304  Log(Bell[InitialBlocks][GEnv.Blocks-1]) :
305  (InitialBlocks-GEnv.Blocks+1) * Log(GEnv.Blocks-1) );
306 
307  Val = (BestGain - Penalty / Cases) / BestInfo;
308 
309  Merge(BestV1, BestV2, Cases);
310 
311  Verbosity(2,
312  fprintf(Of, "\tform subset ");
313  PrintSubset(Att, GEnv.WSubset[BestV1]);
314  fprintf(Of, ": %d subsets, inf %.3f, gain %.3f, val %.3f%s\n",
315  GEnv.Blocks, BestInfo, BestGain, Val,
316  ( Val > BestVal ? " **" : "" ));
317  Verbosity(3,
318  PrintDistribution(Att, 0, GEnv.Blocks, GEnv.Freq, GEnv.ValFreq,
319  false))
320  )
321 
322  if ( Val >= BestVal )
323  {
324  Subsets[Att] = GEnv.Blocks;
325 
326  ForEach(V1, 1, GEnv.Blocks)
327  {
328  CopyBits(GEnv.Bytes, GEnv.WSubset[V1], Subset[Att][V1]);
329  }
330 
331  Info[Att] = BestInfo;
332  Gain[Att] = BestGain - Penalty / KnownCases;
333  BestVal = Val;
334  }
335  }
336 
337  /* Add missing values as another branch */
338 
339  if ( MissingValues )
340  {
341  Subsets[Att]++;
342  CopyBits(GEnv.Bytes, Subset[Att][0], Subset[Att][Subsets[Att]]);
343  }
344 
345  Verbosity(2,
346  fprintf(Of, "\tfinal inf %.3f, gain %.3f, val=%.3f\n",
347  Info[Att], Gain[Att], Gain[Att] / (Info[Att] + 1E-3)))
348 }
349 
350 
351 
352 /*************************************************************************/
353 /* */
354 /* Combine the distribution figures of subsets x and y. */
355 /* Update Freq, ValFreq, SubsetInfo, SubsetEntr, MergeInfo, and */
356 /* MergeEntr. */
357 /* */
358 /*************************************************************************/
359 
360 
362 /* ----- */
363 {
364  ClassNo c;
365  double Entr=0;
366  CaseCount KnownCases=0;
367  int R, C;
368 
369  AddBlock(x, y);
370 
371  ForEach(c, 1, MaxClass)
372  {
373  Entr -= GEnv.Freq[x][c] * Log(GEnv.Freq[x][c]);
374  KnownCases += GEnv.Freq[x][c];
375  }
376 
377  GEnv.SubsetInfo[x] = - GEnv.ValFreq[x] * Log(GEnv.ValFreq[x] / Cases);
378  GEnv.SubsetEntr[x] = Entr + KnownCases * Log(KnownCases);
379 
380  /* Eliminate y from working blocks */
381 
382  ForEach(R, y, GEnv.Blocks-1)
383  {
384  MoveBlock(R, R+1);
385 
386  GEnv.SubsetInfo[R] = GEnv.SubsetInfo[R+1];
387  GEnv.SubsetEntr[R] = GEnv.SubsetEntr[R+1];
388 
389  ForEach(C, 1, GEnv.Blocks)
390  {
391  GEnv.MergeInfo[R][C] = GEnv.MergeInfo[R+1][C];
392  GEnv.MergeEntr[R][C] = GEnv.MergeEntr[R+1][C];
393  }
394  }
395 
396  ForEach(C, y, GEnv.Blocks-1)
397  {
398  ForEach(R, 1, GEnv.Blocks-1)
399  {
400  GEnv.MergeInfo[R][C] = GEnv.MergeInfo[R][C+1];
401  GEnv.MergeEntr[R][C] = GEnv.MergeEntr[R][C+1];
402  }
403  }
404  GEnv.Blocks--;
405 
406  /* Update information for newly-merged block */
407 
408  ForEach(C, 1, GEnv.Blocks)
409  {
410  if ( C != x ) EvaluatePair(x, C, Cases);
411  }
412 }
413 
414 
415 
416 /*************************************************************************/
417 /* */
418 /* Calculate the effect of merging subsets x and y */
419 /* */
420 /*************************************************************************/
421 
422 
424 /* ------------ */
425 {
426  ClassNo c;
427  double Entr=0;
428  CaseCount KnownCases=0, F;
429 
430  if ( y < x )
431  {
432  c = y;
433  y = x;
434  x = c;
435  }
436 
437  F = GEnv.ValFreq[x] + GEnv.ValFreq[y];
438  GEnv.MergeInfo[x][y] = - F * Log(F / Cases);
439 
440  ForEach(c, 1, MaxClass)
441  {
442  F = GEnv.Freq[x][c] + GEnv.Freq[y][c];
443  Entr -= F * Log(F);
444  KnownCases += F;
445  }
446  GEnv.MergeEntr[x][y] = Entr + KnownCases * Log(KnownCases);
447 }
448 
449 
450 
451 /*************************************************************************/
452 /* */
453 /* Check whether two values have same class distribution */
454 /* */
455 /*************************************************************************/
456 
457 
459 /* ---------------- */
460 {
461  ClassNo c;
462  CaseCount D1, D2;
463 
464  D1 = GEnv.ValFreq[V1];
465  D2 = GEnv.ValFreq[V2];
466 
467  ForEach(c, 1, MaxClass)
468  {
469  if ( fabs(GEnv.Freq[V1][c] / D1 - GEnv.Freq[V2][c] / D2) > 0.001 )
470  {
471  return false;
472  }
473  }
474 
475  return true;
476 }
477 
478 
479 
480 /*************************************************************************/
481 /* */
482 /* Add frequency and subset information from block V2 to V1 */
483 /* */
484 /*************************************************************************/
485 
486 
488 /* -------- */
489 {
490  ClassNo c;
491  int b;
492 
493  if ( GEnv.ValFreq[V1] >= MINITEMS-Epsilon &&
494  GEnv.ValFreq[V2] >= MINITEMS-Epsilon )
495  {
496  GEnv.ReasonableSubsets--;
497  }
498  else
499  if ( GEnv.ValFreq[V1] < MINITEMS-Epsilon &&
500  GEnv.ValFreq[V2] < MINITEMS-Epsilon &&
501  GEnv.ValFreq[V1] + GEnv.ValFreq[V2] >= MINITEMS-Epsilon )
502  {
503  GEnv.ReasonableSubsets++;
504  }
505 
506  ForEach(c, 1, MaxClass)
507  {
508  GEnv.Freq[V1][c] += GEnv.Freq[V2][c];
509  }
510  GEnv.ValFreq[V1] += GEnv.ValFreq[V2];
511  GEnv.ValFreq[V2] = 0;
512  ForEach(b, 0, GEnv.Bytes-1)
513  {
514  GEnv.WSubset[V1][b] |= GEnv.WSubset[V2][b];
515  }
516 }
517 
518 
519 
520 /*************************************************************************/
521 /* */
522 /* Move frequency and subset information from block V2 to V1 */
523 /* */
524 /*************************************************************************/
525 
526 
528 /* --------- */
529 {
530  ClassNo c;
531 
532  ForEach(c, 1, MaxClass)
533  {
534  GEnv.Freq[V1][c] = GEnv.Freq[V2][c];
535  }
536  GEnv.ValFreq[V1] = GEnv.ValFreq[V2];
537  CopyBits(GEnv.Bytes, GEnv.WSubset[V2], GEnv.WSubset[V1]);
538 }
539 
540 
541 
542 /*************************************************************************/
543 /* */
544 /* Print the values of attribute Att which are in the subset Ss */
545 /* */
546 /*************************************************************************/
547 
548 
549 void PrintSubset(Attribute Att, Set Ss)
550 /* ----------- */
551 {
552  DiscrValue V1;
553  Boolean First=true;
554 
555  ForEach(V1, 1, MaxAttVal[Att])
556  {
557  if ( In(V1, Ss) )
558  {
559  if ( First )
560  {
561  First = false;
562  }
563  else
564  {
565  fprintf(Of, ", ");
566  }
567 
568  fprintf(Of, "%s", AttValName[Att][V1]);
569  }
570  }
571 }
572 
573 
574 
575 /*************************************************************************/
576 /* */
577 /* Construct and return a node for a test on a subset of values */
578 /* */
579 /*************************************************************************/
580 
581 
582 void SubsetTest(Tree Node, Attribute Att)
583 /* ----------- */
584 {
585  int S, Bytes;
586 
587  Sprout(Node, Subsets[Att]);
588 
589  Node->NodeType = BrSubset;
590  Node->Tested = Att;
591 
592  Bytes = (MaxAttVal[Att]>>3) + 1;
593  Node->Subset = AllocZero(Subsets[Att]+1, Set);
594  ForEach(S, 1, Node->Forks)
595  {
596  Node->Subset[S] = Alloc(Bytes, Byte);
597  CopyBits(Bytes, Subset[Att][S], Node->Subset[S]);
598  }
599 }