mageec  0.1.0
MAchine Guided Energy Efficient Compilation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
getdata.c
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Copyright 2010 Rulequest Research Pty Ltd. */
4 /* */
5 /* This file is part of C5.0 GPL Edition, a single-threaded version */
6 /* of C5.0 release 2.07. */
7 /* */
8 /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9 /* modify it under the terms of the GNU General Public License as */
10 /* published by the Free Software Foundation, either version 3 of the */
11 /* License, or (at your option) any later version. */
12 /* */
13 /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16 /* General Public License for more details. */
17 /* */
18 /* You should have received a copy of the GNU General Public License */
19 /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20 /* */
21 /* <http://www.gnu.org/licenses/>. */
22 /* */
23 /*************************************************************************/
24 
25 
26 
27 /*************************************************************************/
28 /* */
29 /* Get cases from data file */
30 /* ------------------------ */
31 /* */
32 /*************************************************************************/
33 
34 
35 #include "defns.i"
36 #include "extern.i"
37 
38 
/*  Number of extra case slots added each time the Case array is grown  */

#define Inc 2048

/*  Report an error through Error() unless error messages are currently
    suppressed.  The body is wrapped in do { } while (0) so that the macro
    behaves as a single statement: with a bare `if` expansion, a following
    `else` in the caller would silently attach to the macro's own `if`  */

#define XError(a,b,c) do { if ( ! SuppressErrorMessages ) Error(a,b,c); } while (0)
43 
44 CaseNo SampleFrom; /* file count for sampling */
45 
46 
47 /*************************************************************************/
48 /* */
49 /* Read raw cases from file with given extension. */
50 /* */
51 /* On completion, cases are stored in array Case in the form */
52 /* of vectors of attribute values, and MaxCase is set to the */
53 /* number of data cases. */
54 /* */
55 /*************************************************************************/
56 
57 
58 void GetData(FILE *Df, Boolean Train, Boolean AllowUnknownClass)
59 /* ------- */
60 {
61  DataRec DVec;
62  CaseNo CaseSpace, WantTrain, LeftTrain, WantTest, LeftTest;
63  Boolean FirstIgnore=true, SelectTrain;
64 
65  LineNo = 0;
66  SuppressErrorMessages = SAMPLE && ! Train;
67 
68  /* Don't reset case count if appending data for xval */
69 
70  if ( Train || ! Case )
71  {
72  MaxCase = MaxLabel = CaseSpace = 0;
73  Case = Alloc(1, DataRec); /* for error reporting */
74  }
75  else
76  {
77  CaseSpace = MaxCase + 1;
78  MaxCase++;
79  }
80 
81  if ( SAMPLE )
82  {
83  if ( Train )
84  {
85  SampleFrom = CountData(Df);
86  ResetKR(KRInit); /* initialise KRandom() */
87  }
88  else
89  {
90  ResetKR(KRInit); /* restore KRandom() */
91  }
92 
93  WantTrain = SampleFrom * SAMPLE + 0.5;
94  LeftTrain = SampleFrom;
95 
96  WantTest = ( SAMPLE < 0.5 ? WantTrain : SampleFrom - WantTrain );
97  LeftTest = SampleFrom - WantTrain;
98  }
99 
100  while ( (DVec = GetDataRec(Df, Train)) )
101  {
102  /* Check whether to include if we are sampling */
103 
104  if ( SAMPLE )
105  {
106  SelectTrain = KRandom() < WantTrain / (float) LeftTrain--;
107 
108  /* Include if
109  * Select and this is the training set
110  * ! Select and this is the test set and sub-select
111  NB: Must use different random number generator for
112  sub-selection since cannot disturb random number sequence */
113 
114  if ( SelectTrain )
115  {
116  WantTrain--;
117  }
118 
119  if ( SelectTrain != Train ||
120  ( ! Train && AltRandom >= WantTest / (float) LeftTest-- ) )
121  {
122  FreeLastCase(DVec);
123  continue;
124  }
125 
126  if ( ! Train )
127  {
128  WantTest--;
129  }
130  }
131 
132  /* Make sure there is room for another case */
133 
134  if ( MaxCase >= CaseSpace )
135  {
136  CaseSpace += Inc;
137  Realloc(Case, CaseSpace+1, DataRec);
138  }
139 
140  /* Ignore cases with unknown class */
141 
142  if ( AllowUnknownClass || (Class(DVec) & 077777777) > 0 )
143  {
144  Case[MaxCase] = DVec;
145  MaxCase++;
146  }
147  else
148  {
149  if ( FirstIgnore && Of )
150  {
151  fprintf(Of, T_IgnoreBadClass);
152  FirstIgnore = false;
153  }
154 
155  FreeLastCase(DVec);
156  }
157  }
158 
159  fclose(Df);
160  MaxCase--;
161 
162 }
163 
164 
165 
166 /*************************************************************************/
167 /* */
168 /* Read a raw case from file Df. */
169 /* */
170 /* For each attribute, read the attribute value from the file. */
171 /* If it is a discrete valued attribute, find the associated no. */
172 /* of this attribute value (if the value is unknown this is 0). */
173 /* */
174 /* Returns the DataRec of the case (i.e. the array of attribute */
175 /* values). */
176 /* */
177 /*************************************************************************/
178 
179 
180 DataRec GetDataRec(FILE *Df, Boolean Train)
181 /* ---------- */
182 {
183  Attribute Att;
184  char Name[1000], *EndName;
185  int Dv, Chars;
186  DataRec DVec;
187  ContValue Cv;
188  Boolean FirstValue=true;
189 
190 
191  if ( ReadName(Df, Name, 1000, '\00') )
192  {
193  Case[MaxCase] = DVec = NewCase();
194  ForEach(Att, 1, MaxAtt)
195  {
196  if ( AttDef[Att] )
197  {
198  DVec[Att] = EvaluateDef(AttDef[Att], DVec);
199 
200  if ( Continuous(Att) )
201  {
202  CheckValue(DVec, Att);
203  }
204 
205  if ( SomeMiss )
206  {
207  SomeMiss[Att] |= Unknown(DVec, Att);
208  SomeNA[Att] |= NotApplic(DVec, Att);
209  }
210 
211  continue;
212  }
213 
214  /* Get the attribute value if don't already have it */
215 
216  if ( ! FirstValue && ! ReadName(Df, Name, 1000, '\00') )
217  {
218  XError(HITEOF, AttName[Att], "");
219  FreeLastCase(DVec);
220  return Nil;
221  }
222  FirstValue = false;
223 
224  if ( Exclude(Att) )
225  {
226  if ( Att == LabelAtt )
227  {
228  /* Record the value as a string */
229 
230  SVal(DVec,Att) = StoreIVal(Name);
231  }
232  }
233  else
234  if ( ! strcmp(Name, "?") )
235  {
236  /* Set marker to indicate missing value */
237 
238  DVal(DVec, Att) = UNKNOWN;
239  if ( SomeMiss ) SomeMiss[Att] = true;
240  }
241  else
242  if ( Att != ClassAtt && ! strcmp(Name, "N/A") )
243  {
244  /* Set marker to indicate not applicable */
245 
246  DVal(DVec, Att) = NA;
247  if ( SomeNA ) SomeNA[Att] = true;
248  }
249  else
250  if ( Discrete(Att) )
251  {
252  /* Discrete attribute */
253 
254  Dv = Which(Name, AttValName[Att], 1, MaxAttVal[Att]);
255  if ( ! Dv )
256  {
257  if ( StatBit(Att, DISCRETE) )
258  {
259  if ( Train || XVAL )
260  {
261  /* Add value to list */
262 
263  if ( MaxAttVal[Att] >= (long) AttValName[Att][0] )
264  {
265  XError(TOOMANYVALS, AttName[Att],
266  (char *) AttValName[Att][0] - 1);
267  Dv = MaxAttVal[Att];
268  }
269  else
270  {
271  Dv = ++MaxAttVal[Att];
272  AttValName[Att][Dv] = strdup(Name);
273  AttValName[Att][Dv+1] = "<other>"; /* no free */
274  }
275  if ( Dv > MaxDiscrVal )
276  {
277  MaxDiscrVal = Dv;
278  }
279  }
280  else
281  {
282  /* Set value to "<other>" */
283 
284  Dv = MaxAttVal[Att] + 1;
285  }
286  }
287  else
288  {
289  XError(BADATTVAL, AttName[Att], Name);
290  Dv = UNKNOWN;
291  }
292  }
293  DVal(DVec, Att) = Dv;
294  }
295  else
296  {
297  /* Continuous value */
298 
299  if ( TStampVal(Att) )
300  {
301  CVal(DVec, Att) = Cv = TStampToMins(Name);
302  if ( Cv >= 1E9 ) /* long time in future */
303  {
304  XError(BADTSTMP, AttName[Att], Name);
305  DVal(DVec, Att) = UNKNOWN;
306  }
307  }
308  else
309  if ( DateVal(Att) )
310  {
311  CVal(DVec, Att) = Cv = DateToDay(Name);
312  if ( Cv < 1 )
313  {
314  XError(BADDATE, AttName[Att], Name);
315  DVal(DVec, Att) = UNKNOWN;
316  }
317  }
318  else
319  if ( TimeVal(Att) )
320  {
321  CVal(DVec, Att) = Cv = TimeToSecs(Name);
322  if ( Cv < 0 )
323  {
324  XError(BADTIME, AttName[Att], Name);
325  DVal(DVec, Att) = UNKNOWN;
326  }
327  }
328  else
329  {
330  CVal(DVec, Att) = strtod(Name, &EndName);
331  if ( EndName == Name || *EndName != '\0' )
332  {
333  XError(BADATTVAL, AttName[Att], Name);
334  DVal(DVec, Att) = UNKNOWN;
335  }
336  }
337 
338  CheckValue(DVec, Att);
339  }
340  }
341 
342  if ( ClassAtt )
343  {
344  if ( Discrete(ClassAtt) )
345  {
346  Class(DVec) = XDVal(DVec, ClassAtt);
347  }
348  else
349  if ( Unknown(DVec, ClassAtt) || NotApplic(DVec, ClassAtt) )
350  {
351  Class(DVec) = 0;
352  }
353  else
354  {
355  /* Find appropriate segment using class thresholds */
356 
357  Cv = CVal(DVec, ClassAtt);
358 
359  for ( Dv = 1 ; Dv < MaxClass && Cv > ClassThresh[Dv] ; Dv++ )
360  ;
361 
362  Class(DVec) = Dv;
363  }
364  }
365  else
366  {
367  if ( ! ReadName(Df, Name, 1000, '\00') )
368  {
369  XError(HITEOF, Fn, "");
370  FreeLastCase(DVec);
371  return Nil;
372  }
373 
374  if ( (Class(DVec) = Dv = Which(Name, ClassName, 1, MaxClass)) == 0 )
375  {
376  if ( strcmp(Name, "?") ) XError(BADCLASS, "", Name);
377  }
378  }
379 
380  if ( LabelAtt &&
381  (Chars = strlen(IgnoredVals + SVal(DVec, LabelAtt))) > MaxLabel )
382  {
383  MaxLabel = Chars;
384  }
385  return DVec;
386  }
387  else
388  {
389  return Nil;
390  }
391 }
392 
393 
394 
395 /*************************************************************************/
396 /* */
397 /* Count cases in data file */
398 /* */
399 /*************************************************************************/
400 
401 
402 CaseNo CountData(FILE *Df)
403 /* --------- */
404 {
405  char Last=',';
406  int Count=0, Next;
407 
408  while ( true )
409  {
410  if ( (Next = getc(Df)) == EOF )
411  {
412  if ( Last != ',' ) Count++;
413  rewind(Df);
414  return Count;
415  }
416 
417  if ( Next == '|' )
418  {
419  while ( (Next = getc(Df)) != '\n' )
420  ;
421  }
422 
423  if ( Next == '\n' )
424  {
425  if ( Last != ',' ) Count++;
426  Last = ',';
427  }
428  else
429  if ( Next == '\\' )
430  {
431  /* Skip escaped character */
432 
433  getc(Df);
434  }
435  else
436  if ( Next != '\t' && Next != ' ' )
437  {
438  Last = Next;
439  }
440  }
441 }
442 
443 
444 
445 /*************************************************************************/
446 /* */
447 /* Store a label or ignored value in IValStore */
448 /* */
449 /*************************************************************************/
450 
451 
453 /* --------- */
454 {
455  int StartIx, Length;
456 
457  if ( (Length=strlen(S) + 1) + IValsOffset > IValsSize )
458  {
459  if ( IgnoredVals )
460  {
461  Realloc(IgnoredVals, IValsSize += 32768, char);
462  }
463  else
464  {
465  IValsSize = 32768;
466  IValsOffset = 0;
467  IgnoredVals = Alloc(IValsSize, char);
468  }
469  }
470 
471  StartIx = IValsOffset;
472  strcpy(IgnoredVals + StartIx, S);
473  IValsOffset += Length;
474 
475  return StartIx;
476 }
477 
478 
479 
480 /*************************************************************************/
481 /* */
482 /* Free case space */
483 /* */
484 /*************************************************************************/
485 
486 
487 void FreeData()
488 /* -------- */
489 {
490  FreeCases();
491 
493  IValsSize = 0;
494 
495  Free(Case); Case = Nil;
496 
497  MaxCase = -1;
498 }
499 
500 
501 
502 /*************************************************************************/
503 /* */
504 /* Check for bad continuous value */
505 /* */
506 /*************************************************************************/
507 
508 
509 void CheckValue(DataRec DVec, Attribute Att)
510 /* ---------- */
511 {
512  ContValue Cv;
513 
514  Cv = CVal(DVec, Att);
515  if ( ! finite(Cv) )
516  {
517  Error(BADNUMBER, AttName[Att], "");
518 
519  CVal(DVec, Att) = UNKNOWN;
520  }
521 }