mageec  0.1.0
MAchine Guided Energy Efficient Compilation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
xval.c
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Copyright 2010 Rulequest Research Pty Ltd. */
4 /* */
5 /* This file is part of C5.0 GPL Edition, a single-threaded version */
6 /* of C5.0 release 2.07. */
7 /* */
8 /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9 /* modify it under the terms of the GNU General Public License as */
10 /* published by the Free Software Foundation, either version 3 of the */
11 /* License, or (at your option) any later version. */
12 /* */
13 /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16 /* General Public License for more details. */
17 /* */
18 /* You should have received a copy of the GNU General Public License */
19 /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20 /* */
21 /* <http://www.gnu.org/licenses/>. */
22 /* */
23 /*************************************************************************/
24 
25 
26 
27 /*************************************************************************/
28 /* */
29 /* Carry out crossvalidation trials */
30 /* -------------------------------- */
31 /* */
32 /*************************************************************************/
33 
34 #include "defns.i"
35 #include "extern.i"
36 
37 
39 float **Result=Nil; /* Result[f][0] = tree/ruleset size
40  [1] = tree/ruleset errors
41  [2] = tree/ruleset cost */
42 
43 
44 
45 /*************************************************************************/
46 /* */
47 /* Outer function (differs from xval script) */
48 /* */
49 /*************************************************************************/
50 
51 
52 void CrossVal()
53 /* -------- */
54 {
55  CaseNo i, Size, Start=0, Next, SaveMaxCase;
56  int f, SmallTestBlocks, t, SaveTRIALS;
57  ClassNo c;
58  static CaseNo *ConfusionMat=Nil;
59  static int SaveFOLDS=0;
60 
61  /* Check for left-overs after interrupt */
62 
63  if ( Result )
64  {
65  FreeVector((void **) Result, 0, SaveFOLDS-1);
66  Free(ConfusionMat);
67  }
68 
69  if ( FOLDS > MaxCase+1 )
70  {
71  fprintf(Of, T_FoldsReduced);
72  FOLDS = MaxCase+1;
73  }
74 
75  Result = AllocZero((SaveFOLDS = FOLDS), float *);
76  Blocked = Alloc(MaxCase+1, DataRec);
77  ConfusionMat = AllocZero((MaxClass+1)*(MaxClass+1), CaseNo);
78 
79  Prepare();
80 
81  SaveMaxCase = MaxCase;
82  SaveTRIALS = TRIALS;
83 
84  /* First test blocks may be smaller than the others */
85 
86  SmallTestBlocks = FOLDS - ((MaxCase+1) % FOLDS);
87  Size = (MaxCase + 1) / FOLDS;
88 
89  ForEach(f, 0, FOLDS-1)
90  {
91  fprintf(Of, "\n\n[ " T_Fold " %d ]\n", f+1);
92  Result[f] = AllocZero(3, float);
93 
94  if ( f == SmallTestBlocks ) Size++;
95  MaxCase = SaveMaxCase - Size;
96 
97  ForEach(i, 0, MaxCase)
98  {
99  Case[i] = Blocked[Start];
100  Start = (Start + 1) % (SaveMaxCase + 1);
101  }
102 
104 
105  /* Check size (if appropriate) and errors */
106 
107  if ( TRIALS == 1 )
108  {
109  Result[f][0] = ( RULES ? RuleSet[0]->SNRules :
110  TreeSize(Pruned[0]) );
111  Next = Start;
112  ForEach(i, 0, Size-1)
113  {
114  Case[i] = Blocked[Next];
115  c = ( RULES ? RuleClassify(Blocked[Next], RuleSet[0]) :
116  TreeClassify(Blocked[Next], Pruned[0]) );
117  if ( c != Class(Blocked[Next]) )
118  {
119  Result[f][1] += 1.0;
120  if ( MCost )
121  {
122  Result[f][2] += MCost[c][Class(Blocked[Next])];
123  }
124  }
125 
126  /* Add to confusion matrix for target classifier */
127 
128  ConfusionMat[ Class(Blocked[Next])*(MaxClass+1)+c ]++;
129 
130  Next = (Next + 1) % (SaveMaxCase + 1);
131  }
132  }
133  else
134  {
135  Result[f][0] = -1;
136  Next = Start;
137  Default = ( RULES ? RuleSet[0]->SDefault : Pruned[0]->Leaf );
138  ForEach(i, 0, Size-1)
139  {
140  Case[i] = Blocked[Next];
141  c = BoostClassify(Blocked[Next], TRIALS-1);
142  if ( c != Class(Blocked[Next]) )
143  {
144  Result[f][1] += 1.0;
145  if ( MCost )
146  {
147  Result[f][2] += MCost[c][Class(Blocked[Next])];
148  }
149  }
150 
151  /* Add to confusion matrix for target classifier */
152 
153  ConfusionMat[ Class(Blocked[Next])*(MaxClass+1)+c ]++;
154 
155  Next = (Next + 1) % (SaveMaxCase + 1);
156  }
157  }
158 
159  Result[f][1] = (100.0 * Result[f][1]) / Size;
160  Result[f][2] /= Size;
161 
162  fprintf(Of, T_EvalHoldOut, Size);
163  MaxCase = Size-1;
164  Evaluate(0);
165 
166  /* Free space used by classifiers */
167 
168  ForEach(t, 0, MaxTree)
169  {
170  FreeClassifier(t);
171  }
172  MaxTree = -1;
173 
174  TRIALS = SaveTRIALS;
175  }
176 
177  /* Print summary of crossvalidation */
178 
179  MaxCase = SaveMaxCase;
180 
181  Summary();
182  PrintConfusionMatrix(ConfusionMat);
183 
184  /* Free local storage */
185 
186  ForEach(i, 0, MaxCase)
187  {
188  Case[i] = Blocked[i];
189  }
190 
191  FreeVector((void **) Result, 0, FOLDS-1); Result = Nil;
192  Free(Blocked); Blocked = Nil;
193  Free(ConfusionMat); ConfusionMat = Nil;
194 }
195 
196 
197 
198 /*************************************************************************/
199 /* */
200 /* Prepare data for crossvalidation (similar to xval-prep.c) */
201 /* */
202 /*************************************************************************/
203 
204 
205 void Prepare()
206 /* ------- */
207 {
208  CaseNo i, First=0, Last, *Temp, Hold, Next=0;
209  ClassNo Group;
210 
211  Temp = Alloc(MaxCase+1, CaseNo);
212  ForEach(i, 0, MaxCase)
213  {
214  Temp[i] = i;
215  }
216 
217  Shuffle(Temp);
218 
219  /* Sort into class groups */
220 
221  while ( First <= MaxCase )
222  {
223  Last = First;
224  Group = Class(Case[Temp[First]]);
225 
226  ForEach(i, First+1, MaxCase)
227  {
228  if ( Class(Case[Temp[i]]) == Group )
229  {
230  Last++;
231  Hold = Temp[Last];
232  Temp[Last] = Temp[i];
233  Temp[i] = Hold;
234  }
235  }
236 
237  First = Last+1;
238  }
239 
240  /* Organize into stratified blocks */
241 
242  ForEach(First, 0, FOLDS-1)
243  {
244  for ( i = First ; i <= MaxCase ; i += FOLDS )
245  {
246  Blocked[Next++] = Case[Temp[i]];
247  }
248  }
249 
250  Free(Temp);
251 }
252 
253 
254 
255 /*************************************************************************/
256 /* */
257 /* Shuffle the data cases */
258 /* */
259 /*************************************************************************/
260 
261 
262 void Shuffle(int *Vec)
263 /* ------- */
264 {
265  int This=0, Alt, Left=MaxCase+1, Hold;
266 
267  ResetKR(KRInit);
268 
269  while ( Left )
270  {
271  Alt = This + (Left--) * KRandom();
272 
273  Hold = Vec[This];
274  Vec[This++] = Vec[Alt];
275  Vec[Alt] = Hold;
276  }
277 }
278 
279 
280 
281 /*************************************************************************/
282 /* */
283 /* Summarise a crossvalidation */
284 /* */
285 /*************************************************************************/
286 
287 
288 char
289  *FoldHead[] = { F_Fold, F_UFold, "" };
290 
291 void Summary()
292 /* ------- */
293 {
294  int i, f, t;
295  Boolean PrintSize=true;
296  float Sum[3], SumSq[3];
297  extern char *StdP[], *StdPC[], *Extra[], *ExtraC[];
298 
299  for ( i = 0 ; i < 3 ; i++ )
300  {
301  Sum[i] = SumSq[i] = 0;
302  }
303 
304  ForEach(f, 0, FOLDS-1)
305  {
306  if ( Result[f][0] < 1 ) PrintSize = false;
307  }
308 
309  fprintf(Of, "\n\n[ " T_Summary " ]\n\n");
310 
311  ForEach(t, 0, 2)
312  {
313  fprintf(Of, "%s", FoldHead[t]);
314  putc('\t', Of);
315  if ( RULES )
316  {
317  fprintf(Of, "%s", ( MCost ? ExtraC[t] : Extra[t] ));
318  }
319  else
320  {
321  fprintf(Of, "%s", ( MCost ? StdPC[t] : StdP[t] ));
322  }
323  putc('\n', Of);
324  }
325  putc('\n', Of);
326 
327  ForEach(f, 0, FOLDS-1)
328  {
329  fprintf(Of, "%4d\t", f+1);
330 
331  if ( PrintSize )
332  {
333  fprintf(Of, " %5g", Result[f][0]);
334  }
335  else
336  {
337  fprintf(Of, " *");
338  }
339  fprintf(Of, " %10.1f%%", Result[f][1]);
340 
341  if ( MCost )
342  {
343  fprintf(Of, "%7.2f", Result[f][2]);
344  }
345  fprintf(Of, "\n");
346 
347  for ( i = 0 ; i < 3 ; i++ )
348  {
349  Sum[i] += Result[f][i];
350  SumSq[i] += Result[f][i] * Result[f][i];
351  }
352  }
353 
354  fprintf(Of, "\n " T_Mean "\t");
355 
356  if ( ! PrintSize )
357  {
358  fprintf(Of, " ");
359  }
360  else
361  {
362  fprintf(Of, "%6.1f", Sum[0] / FOLDS);
363  }
364 
365  fprintf(Of, " %10.1f%%", Sum[1] / FOLDS);
366 
367  if ( MCost )
368  {
369  fprintf(Of, "%7.2f", Sum[2] / FOLDS);
370  }
371 
372  fprintf(Of, "\n " T_SE "\t");
373 
374  if ( ! PrintSize )
375  {
376  fprintf(Of, " ");
377  }
378  else
379  {
380  fprintf(Of, "%6.1f", SE(Sum[0], SumSq[0], FOLDS));
381  }
382 
383  fprintf(Of, " %10.1f%%", SE(Sum[1], SumSq[1], FOLDS));
384 
385  if ( MCost )
386  {
387  fprintf(Of, "%7.2f", SE(Sum[2], SumSq[2], FOLDS));
388  }
389  fprintf(Of, "\n");
390 }
391 
392 
393 
394 float SE(float sum, float sumsq, int no)
395 /* -- */
396 {
397  float mean;
398 
399  mean = sum / no;
400 
401  return sqrt( ((sumsq - no * mean * mean) / (no - 1)) / no );
402 }