mageec  0.1.0
MAchine Guided Energy Efficient Compilation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
discr.c
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Copyright 2010 Rulequest Research Pty Ltd. */
4 /* */
5 /* This file is part of C5.0 GPL Edition, a single-threaded version */
6 /* of C5.0 release 2.07. */
7 /* */
8 /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9 /* modify it under the terms of the GNU General Public License as */
10 /* published by the Free Software Foundation, either version 3 of the */
11 /* License, or (at your option) any later version. */
12 /* */
13 /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16 /* General Public License for more details. */
17 /* */
18 /* You should have received a copy of the GNU General Public License */
19 /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20 /* */
21 /* <http://www.gnu.org/licenses/>. */
22 /* */
23 /*************************************************************************/
24 
25 
26 
27 /*************************************************************************/
28 /* */
29 /* Evaluation of a test on a discrete valued attribute */
30 /* --------------------------------------------------- */
31 /* */
32 /*************************************************************************/
33 
34 #include "defns.i"
35 #include "extern.i"
36 
37 
38 /*************************************************************************/
39 /* */
40 /* Set Info[] and Gain[] for discrete partition of cases */
41 /* */
42 /*************************************************************************/
43 
44 
46 /* --------------- */
47 {
48  CaseCount KnownCases;
49  int ReasonableSubsets=0;
50  DiscrValue v;
51  double BaseInfo;
52 
53  SetDiscrFreq(Att);
54  KnownCases = Cases - GEnv.ValFreq[0];
55 
56  /* Check reasonable subsets */
57 
58  ForEach(v, 1, MaxAttVal[Att])
59  {
60  if ( GEnv.ValFreq[v] >= MINITEMS ) ReasonableSubsets++;
61  }
62 
63  if ( ReasonableSubsets < 2 )
64  {
65  Verbosity(2, fprintf(Of, "\tAtt %s: poor split\n", AttName[Att]))
66  return;
67  }
68 
69  BaseInfo = ( ! GEnv.ValFreq[0] ? GlobalBaseInfo :
70  DiscrKnownBaseInfo(KnownCases, MaxAttVal[Att]) );
71 
72  Gain[Att] = ComputeGain(BaseInfo, GEnv.ValFreq[0] / Cases, MaxAttVal[Att],
73  KnownCases);
74  Info[Att] = TotalInfo(GEnv.ValFreq, 0, MaxAttVal[Att]) / Cases;
75 
76  Verbosity(2,
77  {
78  fprintf(Of, "\tAtt %s", AttName[Att]);
79  Verbosity(3,
80  PrintDistribution(Att, 0, MaxAttVal[Att], GEnv.Freq, GEnv.ValFreq,
81  true))
82  fprintf(Of, "\tinf %.3f, gain %.3f\n", Info[Att], Gain[Att]);
83  })
84 }
85 
86 
87 
88 /*************************************************************************/
89 /* */
90 /* Set Info[] and Gain[] for ordered split on cases */
91 /* */
92 /*************************************************************************/
93 
94 
96 /* -------------- */
97 {
98  CaseCount KnownCases;
99  double *HoldFreqRow, SplitFreq[4];
100  ClassNo c;
101  int Tries=0;
102  DiscrValue v, BestV;
103  double BaseInfo, ThisGain, BestInfo, BestGain=None;
104 
105  SetDiscrFreq(Att);
106  KnownCases = Cases - GEnv.ValFreq[0];
107 
108  BaseInfo = ( ! GEnv.ValFreq[0] ? GlobalBaseInfo :
109  DiscrKnownBaseInfo(KnownCases, MaxAttVal[Att]) );
110 
111  Verbosity(2, fprintf(Of, "\tAtt %s", AttName[Att]))
112  Verbosity(3, PrintDistribution(Att, 0, MaxAttVal[Att], GEnv.Freq,
113  GEnv.ValFreq, true))
114 
115  /* Move elts of Freq[] starting with the third up one place
116  and aggregate class frequencies */
117 
118  HoldFreqRow = GEnv.Freq[MaxAttVal[Att]+1];
119  ForEach(c, 1, MaxClass)
120  {
121  HoldFreqRow[c] = 0;
122  }
123  SplitFreq[0] = GEnv.ValFreq[0];
124  SplitFreq[1] = GEnv.ValFreq[1];
125  SplitFreq[2] = GEnv.ValFreq[2];
126  SplitFreq[3] = 0;
127 
128  for ( v = MaxAttVal[Att] ; v > 2 ; v-- )
129  {
130  GEnv.Freq[v+1] = GEnv.Freq[v];
131  ForEach(c, 1, MaxClass)
132  {
133  HoldFreqRow[c] += GEnv.Freq[v][c];
134  }
135  SplitFreq[3] += GEnv.ValFreq[v];
136  }
137 
138  GEnv.Freq[3] = HoldFreqRow;
139 
140  /* Try various cuts, saving the one with maximum gain */
141 
142  ForEach(v, 3, MaxAttVal[Att])
143  {
144  if ( GEnv.ValFreq[v] > 0 &&
145  SplitFreq[2] >= MINITEMS && SplitFreq[3] >= MINITEMS )
146  {
147  Tries++;
148  ThisGain =
149  ComputeGain(BaseInfo, GEnv.ValFreq[0] / Cases, 3, KnownCases);
150 
151  if ( ThisGain > BestGain )
152  {
153  BestGain = ThisGain;
154  BestInfo = TotalInfo(SplitFreq, 0, 3) / Cases;
155  BestV = v-1;
156  }
157 
158  Verbosity(3,
159  { fprintf(Of, "\t\tFrom %s (gain %.3f)",
160  AttValName[Att][v], ThisGain);
161  PrintDistribution(Att, 0, 3, GEnv.Freq, GEnv.ValFreq, false);
162  })
163  }
164 
165  /* Move val v from right branch to left branch */
166 
167  ForEach(c, 1, MaxClass)
168  {
169  GEnv.Freq[2][c] += GEnv.Freq[v+1][c];
170  GEnv.Freq[3][c] -= GEnv.Freq[v+1][c];
171  }
172  SplitFreq[2] += GEnv.ValFreq[v];
173  SplitFreq[3] -= GEnv.ValFreq[v];
174  }
175 
176  if ( Tries > 1 ) BestGain -= Log(Tries) / Cases;
177 
178  /* If a test on the attribute is able to make a gain,
179  set the best break point, gain and information */
180 
181  if ( BestGain <= 0 )
182  {
183  Verbosity(2, fprintf(Of, "\tno gain\n"))
184  }
185  else
186  {
187  Gain[Att] = BestGain;
188  Info[Att] = BestInfo;
189  Bar[Att] = BestV;
190 
191  Verbosity(2,
192  fprintf(Of, "\tcut=%g, inf %.3f, gain %.3f\n",
193  Bar[Att], Info[Att], Gain[Att]))
194  }
195 }
196 
197 
198 
199 /*************************************************************************/
200 /* */
201 /* Compute frequency tables Freq[][] and ValFreq[] for attribute */
202 /* Att for current cases */
203 /* */
204 /*************************************************************************/
205 
206 
208 /* ------------ */
209 {
210  ClassNo c;
211  DiscrValue v;
212  int x;
213 
214  /* Determine the frequency of each possible value for the
215  given attribute */
216 
217  ForEach(v, 0, MaxAttVal[Att])
218  {
219  GEnv.ValFreq[v] = 0;
220 
221  x = v * MaxClass;
222  ForEach(c, 1, MaxClass)
223  {
224  GEnv.ValFreq[v] += (GEnv.Freq[v][c] = DFreq[Att][x + (c-1)]);
225  }
226  }
227 }
228 
229 
230 
231 /*************************************************************************/
232 /* */
233 /* Return the base info for cases with known values of a discrete */
234 /* attribute, using the frequency table Freq[][] */
235 /* */
236 /*************************************************************************/
237 
238 
239 double DiscrKnownBaseInfo(CaseCount KnownCases, DiscrValue MaxVal)
240 /* ------------------ */
241 {
242  ClassNo c;
243  CaseCount ClassCount;
244  DiscrValue v;
245 
246  if ( KnownCases < 1E-5 ) return 0.0;
247 
248  ForEach(c, 1, MaxClass)
249  {
250  ClassCount = 0;
251  ForEach(v, 1, MaxVal)
252  {
253  ClassCount += GEnv.Freq[v][c];
254  }
255  GEnv.ClassFreq[c] = ClassCount;
256  }
257 
258  return TotalInfo(GEnv.ClassFreq, 1, MaxClass) / KnownCases;
259 }
260 
261 
262 
263 /*************************************************************************/
264 /* */
265 /* Construct and return a node for a test on a discrete attribute */
266 /* */
267 /*************************************************************************/
268 
269 
270 void DiscreteTest(Tree Node, Attribute Att)
271 /* ------------ */
272 {
273  int S, Bytes;
274  DiscrValue v, CutV;
275 
276  if ( Ordered(Att) )
277  {
278  Sprout(Node, 3);
279 
280  Node->NodeType = BrSubset;
281  Node->Tested = Att;
282 
283  Bytes = (MaxAttVal[Att]>>3) + 1;
284  Node->Subset = AllocZero(4, Set);
285 
286  ForEach(S, 1, 3)
287  {
288  Node->Subset[S] = AllocZero(Bytes, Byte);
289  }
290 
291  Node->Cut = CutV = Bar[Att] + 0.1;
292 
293  SetBit(1, Node->Subset[1]);
294  ForEach(v, 2, MaxAttVal[Att])
295  {
296  S = ( v <= CutV ? 2 : 3 );
297  SetBit(v, Node->Subset[S]);
298  }
299  }
300  else
301  {
302  Sprout(Node, MaxAttVal[Att]);
303 
304  Node->NodeType = BrDiscr;
305  Node->Tested = Att;
306  }
307 }