mageec  0.1.0
MAchine Guided Energy Efficient Compilation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
attwinnow.c
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Copyright 2010 Rulequest Research Pty Ltd. */
4 /* */
5 /* This file is part of C5.0 GPL Edition, a single-threaded version */
6 /* of C5.0 release 2.07. */
7 /* */
8 /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9 /* modify it under the terms of the GNU General Public License as */
10 /* published by the Free Software Foundation, either version 3 of the */
11 /* License, or (at your option) any later version. */
12 /* */
13 /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16 /* General Public License for more details. */
17 /* */
18 /* You should have received a copy of the GNU General Public License */
19 /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20 /* */
21 /* <http://www.gnu.org/licenses/>. */
22 /* */
23 /*************************************************************************/
24 
25 
26 
27 /*************************************************************************/
28 /* */
29 /* Routines for winnowing attributes */
30 /* --------------------------------- */
31 /* */
32 /*************************************************************************/
33 
34 
35 #include "defns.i"
36 #include "extern.i"
37 
38 float *AttImp=Nil; /* att importance */
39 Boolean *Split=Nil, /* atts used in unpruned tree */
40  *Used=Nil; /* atts used in pruned tree */
41 
42 
43 /*************************************************************************/
44 /* */
45 /* Winnow attributes by constructing a tree from half the data. */
46 /* Remove those that are never used as splits and those that */
47 /* increase error on the remaining data, and check that the new */
48 /* error cost does not increase */
49 /* */
50 /*************************************************************************/
51 
52 
53 void WinnowAtts()
54 /* ---------- */
55 {
56  Attribute Att, Removed=0, Best;
57  CaseNo i, Bp, Ep;
58  float Base;
59  Boolean First=true, *Upper;
60  ClassNo c;
61  extern Attribute *DList;
62  extern int NDList;
63 
64  /* Save original case order */
65 
67  ForEach(i, 0, MaxCase)
68  {
69  SaveCase[i] = Case[i];
70  }
71 
72  /* Split data into two halves with equal class frequencies */
73 
74  Upper = AllocZero(MaxClass+1, Boolean);
75 
76  Bp = 0;
77  Ep = MaxCase;
78  ForEach(i, 0, MaxCase)
79  {
80  c = Class(SaveCase[i]);
81 
82  if ( Upper[c] )
83  {
84  Case[Ep--] = SaveCase[i];
85  }
86  else
87  {
88  Case[Bp++] = SaveCase[i];
89  }
90 
91  Upper[c] = ! Upper[c];
92  }
93 
94  Free(Upper);
95 
96  /* Use first 50% of the cases for building a winnowing tree
97  and remaining 50% for measuring attribute importance */
98 
99  AttImp = AllocZero(MaxAtt+1, float);
102 
103  Base = TrialTreeCost(true);
104 
105  /* Remove attributes when doing so would reduce error cost */
106 
107  ForEach(Att, 1, MaxAtt)
108  {
109  if ( AttImp[Att] < 0 )
110  {
111  SpecialStatus[Att] ^= SKIP;
112  Removed++;
113  }
114  }
115 
116  /* If any removed, rebuild tree and reinstate if error increases */
117 
118  if ( Removed && TrialTreeCost(false) > Base )
119  {
120  ForEach(Att, 1, MaxAtt)
121  {
122  if ( AttImp[Att] < 0 )
123  {
124  AttImp[Att] = 1;
125  SpecialStatus[Att] ^= SKIP;
126  Verbosity(1, fprintf(Of, " re-including %s\n", AttName[Att]))
127  }
128  }
129 
130  Removed=0;
131  }
132 
133  /* Discard unused attributes */
134 
135  ForEach(Att, 1, MaxAtt)
136  {
137  if ( Att != ClassAtt && ! Skip(Att) && ! Split[Att] )
138  {
139  SpecialStatus[Att] ^= SKIP;
140  Removed++;
141  }
142  }
143 
144  /* Print summary of winnowing */
145 
146  if ( ! Removed )
147  {
148  fprintf(Of, T_NoWinnow);
149  }
150  else
151  {
152  fprintf(Of, T_AttributesWinnowed, Removed, Plural(Removed));
153 
154  /* Print remaining attributes ordered by importance */
155 
156  while ( true )
157  {
158  Best = 0;
159  ForEach(Att, 1, MaxAtt)
160  {
161  if ( AttImp[Att] >= 1 &&
162  ( ! Best || AttImp[Att] > AttImp[Best] ) )
163  {
164  Best = Att;
165  }
166  }
167  if ( ! Best ) break;
168 
169  if ( First )
170  {
171  fprintf(Of, T_EstImportance);
172  First = false;
173  }
174  if ( AttImp[Best] >= 1.005 )
175  {
176  fprintf(Of, "%7d%% %s\n",
177  (int) ((AttImp[Best] - 1) * 100 + 0.5),
178  AttName[Best]);
179  }
180  else
181  {
182  fprintf(Of, " <1%% %s\n", AttName[Best]);
183  }
184  AttImp[Best] = 0;
185  }
186  }
187 
188  if ( Removed )
189  {
190  Winnowed = true;
191 
192  /* Reset DList */
193 
194  NDList = 0;
195  ForEach(Att, 1, MaxAtt)
196  {
197  if ( DFreq[Att] && ! Skip(Att) )
198  {
199  DList[NDList++] = Att;
200  }
201  }
202  }
203 
204  /* Restore case order and clean up */
205 
206  ForEach(i, 0, MaxCase)
207  {
208  Case[i] = SaveCase[i];
209  }
210 
215 
216  Now = 0;
217 }
218 
219 
220 
221 /*************************************************************************/
222 /* */
223 /* Build trial tree and check error cost on remaining data. */
224 /* If first time, note split attributes and check effect of */
225 /* removing every attribute */
226 /* */
227 /*************************************************************************/
228 
229 
230 float TrialTreeCost(Boolean FirstTime)
231 /* ------------- */
232 {
233  Attribute Att;
234  float Base, Cost, SaveMINITEMS;
235  CaseNo SaveMaxCase, Cut;
236  int SaveVERBOSITY;
237 
238  Verbosity(1,
239  fprintf(Of, ( FirstTime ? "\nWinnow cycle:\n" : "\nCheck:\n" )))
240 
241  /* Build and prune trial tree */
242 
243  SaveMaxCase = MaxCase;
244  SaveVERBOSITY = VERBOSITY;
245  SaveMINITEMS = MINITEMS;
246  MINITEMS = Max(MINITEMS / 2, 2.0);
247 
248  Cut = (MaxCase+1) / 2 - 1;
249 
251  LEAFRATIO = 0;
252  VERBOSITY = 0;
253  MaxCase = Cut;
254 
255  memset(Tested, 0, MaxAtt+1); /* reset tested attributes */
256 
258  FormTree(0, Cut, 0, &WTree);
259 
260  if ( FirstTime )
261  {
262  /* Find attributes used in unpruned tree */
263 
264  ScanTree(WTree, Split);
265  }
266 
267  Prune(WTree);
268 
269  VERBOSITY = SaveVERBOSITY;
270  MaxCase = SaveMaxCase;
271  MINITEMS = SaveMINITEMS;
272 
273  Verbosity(2,
274  PrintTree(WTree, "Winnowing tree:");
275  fprintf(Of, "\n training error cost %g\n", ErrCost(WTree, 0, Cut)))
276 
277  Base = ErrCost(WTree, Cut+1, MaxCase);
278 
279  Verbosity(1,
280  fprintf(Of, " initial error cost %g\n", Base))
281 
282  if ( FirstTime )
283  {
284  /* Check each attribute used in pruned tree */
285 
286  ScanTree(WTree, Used);
287 
288  ForEach(Att, 1, MaxAtt)
289  {
290 
291  if ( ! Used[Att] )
292  {
293  Verbosity(1,
294  if ( Att != ClassAtt && ! Skip(Att) )
295  {
296  fprintf(Of, " %s not used\n", AttName[Att]);
297  })
298 
299  if ( Split[Att] )
300  {
301  AttImp[Att] = 1;
302  }
303 
304  continue;
305  }
306 
307  /* Determine error cost if this attribute omitted */
308 
309  SpecialStatus[Att] ^= SKIP;
310 
311  Cost = ErrCost(WTree, Cut+1, MaxCase);
312 
313  AttImp[Att] = ( Cost < Base ? -1 : Cost / Base );
314  Verbosity(1,
315  fprintf(Of, " error cost without %s = %g%s\n",
316  AttName[Att], Cost,
317  ( Cost < Base ? " - excluded" : "" )))
318 
319  SpecialStatus[Att] ^= SKIP;
320  }
321  }
322 
323  if ( WTree )
324  {
325  FreeTree(WTree); WTree = Nil;
326  }
327 
328  return Base;
329 }
330 
331 
332 
333 /*************************************************************************/
334 /* */
335 /* Determine the error rate or cost of T on cases Fp through Lp */
336 /* */
337 /*************************************************************************/
338 
339 
340 float ErrCost(Tree T, CaseNo Fp, CaseNo Lp)
341 /* ------- */
342 {
343  CaseNo i;
344  float ErrCost=0;
345  ClassNo Pred;
346 
347  if ( MCost )
348  {
349  ForEach(i, Fp, Lp)
350  {
351  if ( (Pred = TreeClassify(Case[i], T)) != Class(Case[i]) )
352  {
353  ErrCost += MCost[Pred][Class(Case[i])];
354  }
355  }
356  }
357  else
358  {
359  ForEach(i, Fp, Lp)
360  {
361  if ( TreeClassify(Case[i], T) != Class(Case[i]) )
362  {
363  ErrCost += 1.0;
364  }
365  }
366  }
367 
368  return ErrCost;
369 }
370 
371 
372 
373 /*************************************************************************/
374 /* */
375 /* Find attributes used in tree T */
376 /* */
377 /*************************************************************************/
378 
379 
381 /* -------- */
382 {
383  DiscrValue v;
384 
385  if ( T->NodeType )
386  {
387  Used[T->Tested] = true;
388 
389  ForEach(v, 1, T->Forks)
390  {
391  ScanTree(T->Branch[v], Used);
392  }
393  }
394 }