mageec  0.1.0
MAchine Guided Energy Efficient Compilation
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
modelfiles.c
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Copyright 2010 Rulequest Research Pty Ltd. */
4 /* */
5 /* This file is part of C5.0 GPL Edition, a single-threaded version */
6 /* of C5.0 release 2.07. */
7 /* */
8 /* C5.0 GPL Edition is free software: you can redistribute it and/or */
9 /* modify it under the terms of the GNU General Public License as */
10 /* published by the Free Software Foundation, either version 3 of the */
11 /* License, or (at your option) any later version. */
12 /* */
13 /* C5.0 GPL Edition is distributed in the hope that it will be useful, */
14 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
15 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */
16 /* General Public License for more details. */
17 /* */
18 /* You should have received a copy of the GNU General Public License */
19 /* (gpl.txt) along with C5.0 GPL Edition. If not, see */
20 /* */
21 /* <http://www.gnu.org/licenses/>. */
22 /* */
23 /*************************************************************************/
24 
25 
26 
27 /*************************************************************************/
28 /* */
29 /* Routines for saving and reading model files */
30 /* ------------------------------------------- */
31 /* */
32 /*************************************************************************/
33 
34 
35 #include "defns.i"
36 #include "extern.i"
37 
/*  Counter for the model entry currently being read.  Elsewhere in this
    file it is set to 0 when the "entries" header property is parsed,
    copied into each rule's TNo, and incremented after every ruleset.  */

int	Entry;

/*  Property names that may appear in a model file.  The position of a
    name in this table is its property code (see the ...P constants);
    ReadProp() looks names up in positions 1..PROPS.  Position 0 is a
    placeholder that coincides with the error code ERRORP.  */

char	*Prop[] =
	{
	    "null",	/*  0  ERRORP    */
	    "att",	/*  1  ATTP      */
	    "class",	/*  2  CLASSP    */
	    "cut",	/*  3  CUTP      */
	    "conds",	/*  4  CONDSP    */
	    "elts",	/*  5  ELTSP     */
	    "entries",	/*  6  ENTRIESP  */
	    "forks",	/*  7  FORKSP    */
	    "freq",	/*  8  FREQP     */
	    "id",	/*  9  IDP       */
	    "type",	/* 10  TYPEP     */
	    "low",	/* 11  LOWP      */
	    "mid",	/* 12  MIDP      */
	    "high",	/* 13  HIGHP     */
	    "result",	/* 14  RESULTP   */
	    "rules",	/* 15  RULESP    */
	    "val",	/* 16  VALP      */
	    "lift",	/* 17  LIFTP     */
	    "cover",	/* 18  COVERP    */
	    "ok",	/* 19  OKP       */
	    "default",	/* 20  DEFAULTP  */
	    "costs",	/* 21  COSTSP    */
	    "sample",	/* 22  SAMPLEP   */
	    "init"	/* 23  INITP     */
	};
65 
66 char PropName[20],
68  *Unquoted;
70 
/*  Highest property code; ReadProp() searches Prop[1..PROPS]  */

#define	PROPS		23

/*  Property codes.  Each value is the index of the corresponding name
    in Prop[]; ERRORP (0) is what ReadProp() returns on failure.  */

#define	ERRORP		 0
#define	ATTP		 1
#define	CLASSP		 2
#define	CUTP		 3
#define	CONDSP		 4
#define	ELTSP		 5
#define	ENTRIESP	 6
#define	FORKSP		 7
#define	FREQP		 8
#define	IDP		 9
#define	TYPEP		10
#define	LOWP		11
#define	MIDP		12
#define	HIGHP		13
#define	RESULTP		14
#define	RULESP		15
#define	VALP		16
#define	LIFTP		17
#define	COVERP		18
#define	OKP		19
#define	DEFAULTP	20
#define	COSTSP		21
#define	SAMPLEP		22
#define	INITP		23
97 
98 
99 /*************************************************************************/
100 /* */
101 /* Check whether file is open. If it is not, open it and */
102 /* read/write sampling information and discrete names */
103 /* */
104 /*************************************************************************/
105 
106 
107 void CheckFile(String Extension, Boolean Write)
108 /* --------- */
109 {
110  static char *LastExt="";
111 
112  if ( ! TRf || strcmp(LastExt, Extension) )
113  {
114  LastExt = Extension;
115 
116  if ( TRf )
117  {
118  fprintf(TRf, "\n");
119  fclose(TRf);
120  }
121 
122  if ( Write )
123  {
124  WriteFilePrefix(Extension);
125  }
126  else
127  {
128  ReadFilePrefix(Extension);
129  }
130  }
131 }
132 
133 
134 
135 /*************************************************************************/
136 /* */
137 /* Write information on system, sampling */
138 /* */
139 /*************************************************************************/
140 
141 
142 void WriteFilePrefix(String Extension)
143 /* --------------- */
144 {
145  time_t clock;
146  struct tm *now;
147 
148  if ( ! (TRf = GetFile(Extension, "w")) )
149  {
150  Error(NOFILE, Fn, E_ForWrite);
151  }
152 
153  clock = time(0);
154  now = localtime(&clock);
155  now->tm_mon++;
156  fprintf(TRf, "id=\"See5/C5.0 %s %d-%d%d-%d%d\"\n",
157  RELEASE,
158  now->tm_year + 1900,
159  now->tm_mon / 10, now->tm_mon % 10,
160  now->tm_mday / 10, now->tm_mday % 10);
161 
162  if ( MCost )
163  {
164  fprintf(TRf, "costs=\"1\"\n");
165  }
166 
167  if ( SAMPLE > 0 )
168  {
169  fprintf(TRf, "sample=\"%g\" init=\"%d\"\n", SAMPLE, KRInit);
170  }
171 
173 
174  fprintf(TRf, "entries=\"%d\"\n", TRIALS);
175 }
176 
177 
178 
179 /*************************************************************************/
180 /* */
181 /* Read header information */
182 /* */
183 /*************************************************************************/
184 
185 
186 void ReadFilePrefix(String Extension)
187 /* -------------- */
188 {
189  if ( ! (TRf = GetFile(Extension, "r")) ) Error(NOFILE, Fn, "");
190 
191  StreamIn((char *) &TRIALS, sizeof(int));
192  if ( memcmp((char *) &TRIALS, "id=", 3) != 0 )
193  {
194  printf("\nCannot read old format classifiers\n");
195  exit(1);
196  }
197  else
198  {
199  rewind(TRf);
200  ReadHeader();
201  }
202 }
203 
204 
205 
206 /*************************************************************************/
207 /* */
208 /* Save attribute values read with "discrete N" */
209 /* */
210 /*************************************************************************/
211 
212 
214 /* ----------------- */
215 {
216  Attribute Att;
217  DiscrValue v;
218 
219  ForEach(Att, 1, MaxAtt)
220  {
221  if ( ! StatBit(Att, DISCRETE) || MaxAttVal[Att] < 2 ) continue;
222 
223  AsciiOut("att=", AttName[Att]);
224  AsciiOut(" elts=", AttValName[Att][2]); /* skip N/A */
225 
226  ForEach(v, 3, MaxAttVal[Att])
227  {
228  AsciiOut(",", AttValName[Att][v]);
229  }
230  fprintf(TRf, "\n");
231  }
232 }
233 
234 
235 
236 /*************************************************************************/
237 /* */
238 /* Save entire decision tree T in file with extension Extension */
239 /* */
240 /*************************************************************************/
241 
242 
243 void SaveTree(Tree T, String Extension)
244 /* -------- */
245 {
246  CheckFile(Extension, true);
247 
248  OutTree(T);
249 }
250 
251 
252 
253 void OutTree(Tree T)
254 /* ------- */
255 {
256  DiscrValue v, vv;
257  ClassNo c;
258  Boolean First;
259 
260  fprintf(TRf, "type=\"%d\"", T->NodeType);
261  AsciiOut(" class=", ClassName[T->Leaf]);
262  if ( T->Cases > 0 )
263  {
264  fprintf(TRf, " freq=\"%g", T->ClassDist[1]);
265  ForEach(c, 2, MaxClass)
266  {
267  fprintf(TRf, ",%g", T->ClassDist[c]);
268  }
269  fprintf(TRf, "\"");
270  }
271 
272  if ( T->NodeType )
273  {
274  AsciiOut(" att=", AttName[T->Tested]);
275  fprintf(TRf, " forks=\"%d\"", T->Forks);
276 
277  switch ( T->NodeType )
278  {
279  case BrDiscr:
280  break;
281 
282  case BrThresh:
283  fprintf(TRf, " cut=\"%.*g\"", PREC+1, T->Cut);
284  if ( T->Upper > T->Cut )
285  {
286  fprintf(TRf, " low=\"%.*g\" mid=\"%.*g\" high=\"%.*g\"",
287  PREC, T->Lower, PREC, T->Mid, PREC, T->Upper);
288  }
289  break;
290 
291  case BrSubset:
292  ForEach(v, 1, T->Forks)
293  {
294  First=true;
295  ForEach(vv, 1, MaxAttVal[T->Tested])
296  {
297  if ( In(vv, T->Subset[v]) )
298  {
299  if ( First )
300  {
301  AsciiOut(" elts=", AttValName[T->Tested][vv]);
302  First = false;
303  }
304  else
305  {
306  AsciiOut(",", AttValName[T->Tested][vv]);
307  }
308  }
309  }
310  /* Make sure have printed at least one element */
311 
312  if ( First ) AsciiOut(" elts=", "N/A");
313  }
314  break;
315  }
316  fprintf(TRf, "\n");
317 
318  ForEach(v, 1, T->Forks)
319  {
320  OutTree(T->Branch[v]);
321  }
322  }
323  else
324  {
325  fprintf(TRf, "\n");
326  }
327 }
328 
329 
330 
331 /*************************************************************************/
332 /* */
333 /* Save the current ruleset in rules file */
334 /* */
335 /*************************************************************************/
336 
337 
338 void SaveRules(CRuleSet RS, String Extension)
339 /* --------- */
340 {
341  int ri, d;
342  CRule R;
343  Condition C;
344  DiscrValue v;
345  Boolean First;
346 
347  CheckFile(Extension, true);
348 
349  fprintf(TRf, "rules=\"%d\"", RS->SNRules);
350  AsciiOut(" default=", ClassName[RS->SDefault]);
351  fprintf(TRf, "\n");
352 
353  ForEach(ri, 1, RS->SNRules)
354  {
355  R = RS->SRule[ri];
356  fprintf(TRf, "conds=\"%d\" cover=\"%g\" ok=\"%g\" lift=\"%g\"",
357  R->Size, R->Cover, R->Correct,
358  (R->Correct + 1) / ((R->Cover + 2) * R->Prior));
359  AsciiOut(" class=", ClassName[R->Rhs]);
360  fprintf(TRf, "\n");
361 
362  ForEach(d, 1, R->Size)
363  {
364  C = R->Lhs[d];
365 
366  fprintf(TRf, "type=\"%d\"", C->NodeType);
367  AsciiOut(" att=", AttName[C->Tested]);
368 
369  switch ( C->NodeType )
370  {
371  case BrDiscr:
372  AsciiOut(" val=", AttValName[C->Tested][C->TestValue]);
373  break;
374 
375  case BrThresh:
376  if ( C->TestValue == 1 ) /* N/A */
377  {
378  fprintf(TRf, " val=\"N/A\"");
379  }
380  else
381  {
382  fprintf(TRf, " cut=\"%.*g\" result=\"%c\"",
383  PREC+1, C->Cut,
384  ( C->TestValue == 2 ? '<' : '>' ));
385  }
386  break;
387 
388  case BrSubset:
389  First=true;
390  ForEach(v, 1, MaxAttVal[C->Tested])
391  {
392  if ( In(v, C->Subset) )
393  {
394  if ( First )
395  {
396  AsciiOut(" elts=", AttValName[C->Tested][v]);
397  First = false;
398  }
399  else
400  {
401  AsciiOut(",", AttValName[C->Tested][v]);
402  }
403  }
404  }
405  break;
406  }
407 
408  fprintf(TRf, "\n");
409  }
410  }
411 }
412 
413 
414 
415 /*************************************************************************/
416 /* */
417 /* Write ASCII string with prefix, escaping any quotes */
418 /* */
419 /*************************************************************************/
420 
421 
422 void AsciiOut(String Pre, String S)
423 /* -------- */
424 {
425  fprintf(TRf, "%s\"", Pre);
426  while ( *S )
427  {
428  if ( *S == '"' || *S == '\\' ) fputc('\\', TRf);
429  fputc(*S++, TRf);
430  }
431  fputc('"', TRf);
432 }
433 
434 
435 
436 /*************************************************************************/
437 /* */
438 /* Read the header information (id, saved names, models) */
439 /* */
440 /*************************************************************************/
441 
442 
444 /* --------- */
445 {
446  Attribute Att;
447  DiscrValue v;
448  char *p, Dummy;
449  int Year, Month, Day;
450  FILE *F;
451 
452  while ( true )
453  {
454  switch ( ReadProp(&Dummy) )
455  {
456  case ERRORP:
457  return;
458 
459  case IDP:
460  /* Recover year run and set base date for timestamps */
461 
462  if ( sscanf(PropVal + strlen(PropVal) - 11,
463  "%d-%d-%d\"", &Year, &Month, &Day) == 3 )
464  {
465  SetTSBase(Year);
466  }
467  break;
468 
469  case COSTSP:
470  /* Recover costs file used to generate model */
471 
472  if ( (F = GetFile(".costs", "r")) )
473  {
474  GetMCosts(F);
475  }
476  break;
477  case SAMPLEP:
478  sscanf(PropVal, "\"%f\"", &SAMPLE);
479  break;
480 
481  case INITP:
482  sscanf(PropVal, "\"%d\"", &KRInit);
483  break;
484 
485  case ATTP:
487  Att = Which(Unquoted, AttName, 1, MaxAtt);
488  if ( ! Att || Exclude(Att) )
489  {
491  }
492  break;
493 
494  case ELTSP:
495  MaxAttVal[Att] = 1;
496  AttValName[Att][1] = strdup("N/A");
497 
498  for ( p = PropVal ; *p ; )
499  {
500  p = RemoveQuotes(p);
501  v = ++MaxAttVal[Att];
502  AttValName[Att][v] = strdup(p);
503 
504  for ( p += strlen(p) ; *p != '"' ; p++ )
505  ;
506  p++;
507  if ( *p == ',' ) p++;
508  }
509  AttValName[Att][MaxAttVal[Att]+1] = "<other>";
510  MaxDiscrVal = Max(MaxDiscrVal, MaxAttVal[Att]+1);
511  break;
512 
513  case ENTRIESP:
514  sscanf(PropVal, "\"%d\"", &TRIALS);
515  Entry = 0;
516  return;
517  }
518  }
519 }
520 
521 
522 
523 /*************************************************************************/
524 /* */
525 /* Retrieve decision tree with extension Extension */
526 /* */
527 /*************************************************************************/
528 
529 
530 Tree GetTree(String Extension)
531 /* ------- */
532 {
533  CheckFile(Extension, false);
534 
535  return InTree();
536 }
537 
538 
539 
541 /* ------ */
542 {
543  Tree T;
544  DiscrValue v, Subset=0;
545  char Delim, *p;
546  ClassNo c;
547  int X;
548  double XD;
549 
550  T = (Tree) AllocZero(1, TreeRec);
551 
552  do
553  {
554  switch ( ReadProp(&Delim) )
555  {
556  case ERRORP:
557  return Nil;
558 
559  case TYPEP:
560  sscanf(PropVal, "\"%d\"", &X); T->NodeType = X;
561  break;
562 
563  case CLASSP:
565  T->Leaf = Which(Unquoted, ClassName, 1, MaxClass);
566  if ( ! T->Leaf ) Error(MODELFILE, E_MFCLASS, Unquoted);
567  break;
568 
569  case ATTP:
571  T->Tested = Which(Unquoted, AttName, 1, MaxAtt);
572  if ( ! T->Tested || Exclude(T->Tested) )
573  {
575  }
576  break;
577 
578  case CUTP:
579  sscanf(PropVal, "\"%lf\"", &XD); T->Cut = XD;
580  T->Lower = T->Mid = T->Upper = T->Cut;
581  break;
582 
583  case LOWP:
584  sscanf(PropVal, "\"%lf\"", &XD); T->Lower = XD;
585  break;
586 
587  case MIDP:
588  sscanf(PropVal, "\"%lf\"", &XD); T->Mid = XD;
589  break;
590 
591  case HIGHP:
592  sscanf(PropVal, "\"%lf\"", &XD); T->Upper = XD;
593  break;
594 
595  case FORKSP:
596  sscanf(PropVal, "\"%d\"", &T->Forks);
597  break;
598 
599  case FREQP:
601  p = PropVal+1;
602 
603  ForEach(c, 1, MaxClass)
604  {
605  T->ClassDist[c] = strtod(p, &p);
606  T->Cases += T->ClassDist[c];
607  p++;
608  }
609  break;
610 
611  case ELTSP:
612  if ( ! Subset++ )
613  {
614  T->Subset = AllocZero(T->Forks+1, Set);
615  }
616 
617  T->Subset[Subset] = MakeSubset(T->Tested);
618  break;
619  }
620  }
621  while ( Delim == ' ' );
622 
623  if ( T->ClassDist )
624  {
625  T->Errors = T->Cases - T->ClassDist[T->Leaf];
626  }
627  else
628  {
629  T->ClassDist = Alloc(1, CaseCount);
630  }
631 
632  if ( T->NodeType )
633  {
634  T->Branch = AllocZero(T->Forks+1, Tree);
635  ForEach(v, 1, T->Forks)
636  {
637  T->Branch[v] = InTree();
638  }
639  }
640 
641  return T;
642 }
643 
644 
645 
646 /*************************************************************************/
647 /* */
648 /* Retrieve ruleset with extension Extension */
649 /* (Separate functions for ruleset, single rule, single condition) */
650 /* */
651 /*************************************************************************/
652 
653 
655 /* -------- */
656 {
657  CheckFile(Extension, false);
658 
659  return InRules();
660 }
661 
662 
663 
665 /* ------- */
666 {
667  CRuleSet RS;
668  RuleNo r;
669  char Delim;
670 
671  RS = Alloc(1, RuleSetRec);
672 
673  do
674  {
675  switch ( ReadProp(&Delim) )
676  {
677  case ERRORP:
678  return Nil;
679 
680  case RULESP:
681  sscanf(PropVal, "\"%d\"", &RS->SNRules);
683  break;
684 
685  case DEFAULTP:
688  if ( ! RS->SDefault ) Error(MODELFILE, E_MFCLASS, Unquoted);
689  break;
690  }
691  }
692  while ( Delim == ' ' );
693 
694  /* Read each rule */
695 
696  RS->SRule = Alloc(RS->SNRules+1, CRule);
697  ForEach(r, 1, RS->SNRules)
698  {
699  if ( (RS->SRule[r] = InRule()) )
700  {
701  RS->SRule[r]->RNo = r;
702  RS->SRule[r]->TNo = Entry;
703  }
704  }
705  ConstructRuleTree(RS);
706  Entry++;
707  return RS;
708 }
709 
710 
711 
713 /* ------ */
714 {
715  CRule R;
716  int d;
717  char Delim;
718  float Lift;
719 
720  R = Alloc(1, RuleRec);
721 
722  do
723  {
724  switch ( ReadProp(&Delim) )
725  {
726  case ERRORP:
727  return Nil;
728 
729  case CONDSP:
730  sscanf(PropVal, "\"%d\"", &R->Size);
731  break;
732 
733  case COVERP:
734  sscanf(PropVal, "\"%f\"", &R->Cover);
735  break;
736 
737  case OKP:
738  sscanf(PropVal, "\"%f\"", &R->Correct);
739  break;
740 
741  case LIFTP:
742  sscanf(PropVal, "\"%f\"", &Lift);
743  R->Prior = (R->Correct + 1) / ((R->Cover + 2) * Lift);
744  break;
745 
746  case CLASSP:
748  R->Rhs = Which(Unquoted, ClassName, 1, MaxClass);
749  if ( ! R->Rhs ) Error(MODELFILE, E_MFCLASS, Unquoted);
750  break;
751  }
752  }
753  while ( Delim == ' ' );
754 
755  R->Lhs = Alloc(R->Size+1, Condition);
756  ForEach(d, 1, R->Size)
757  {
758  R->Lhs[d] = InCondition();
759  }
760 
761  R->Vote = 1000 * (R->Correct + 1.0) / (R->Cover + 2.0) + 0.5;
762 
763  return R;
764 }
765 
766 
767 
769 /* ----------- */
770 {
771  Condition C;
772  char Delim;
773  int X;
774  double XD;
775 
776  C = Alloc(1, CondRec);
777 
778  do
779  {
780  switch ( ReadProp(&Delim) )
781  {
782  case ERRORP:
783  return Nil;
784 
785  case TYPEP:
786  sscanf(PropVal, "\"%d\"", &X); C->NodeType = X;
787  break;
788 
789  case ATTP:
791  C->Tested = Which(Unquoted, AttName, 1, MaxAtt);
792  if ( ! C->Tested || Exclude(C->Tested) )
793  {
795  }
796  break;
797 
798  case CUTP:
799  sscanf(PropVal, "\"%lf\"", &XD); C->Cut = XD;
800  break;
801 
802  case RESULTP:
803  C->TestValue = ( PropVal[1] == '<' ? 2 : 3 );
804  break;
805 
806  case VALP:
807  if ( Continuous(C->Tested) )
808  {
809  C->TestValue = 1;
810  }
811  else
812  {
814  C->TestValue = Which(Unquoted,
815  AttValName[C->Tested],
816  1, MaxAttVal[C->Tested]);
818  }
819  break;
820 
821  case ELTSP:
822  C->Subset = MakeSubset(C->Tested);
823  C->TestValue = 1;
824  break;
825  }
826  }
827  while ( Delim == ' ' );
828 
829  return C;
830 }
831 
832 
833 
834 /*************************************************************************/
835 /* */
836 /* ASCII reading utilities */
837 /* */
838 /*************************************************************************/
839 
840 
841 int ReadProp(char *Delim)
842 /* -------- */
843 {
844  int c, i;
845  char *p;
846  Boolean Quote=false;
847 
848  for ( p = PropName ; (c = fgetc(TRf)) != '=' ; )
849  {
850  if ( p - PropName >= 19 || c == EOF )
851  {
852  Error(MODELFILE, E_MFEOF, "");
853  PropName[0] = PropVal[0] = *Delim = '\00';
854  return 0;
855  }
856  *p++ = c;
857  }
858  *p = '\00';
859 
860  for ( p = PropVal ; ((c = fgetc(TRf)) != ' ' && c != '\n') || Quote ; )
861  {
862  if ( c == EOF )
863  {
864  Error(MODELFILE, E_MFEOF, "");
865  PropName[0] = PropVal[0] = '\00';
866  return 0;
867  }
868 
869  if ( (i = p - PropVal) >= PropValSize )
870  {
871  Realloc(PropVal, (PropValSize += 10000) + 3, char);
872  p = PropVal + i;
873  }
874 
875  *p++ = c;
876  if ( c == '\\' )
877  {
878  *p++ = fgetc(TRf);
879  }
880  else
881  if ( c == '"' )
882  {
883  Quote = ! Quote;
884  }
885  }
886  *p = '\00';
887  *Delim = c;
888 
889  return Which(PropName, Prop, 1, PROPS);
890 }
891 
892 
894 /* ------------ */
895 {
896  char *p, *Start;
897 
898  p = Start = S;
899 
900  for ( S++ ; *S != '"' ; S++ )
901  {
902  if ( *S == '\\' ) S++;
903  *p++ = *S;
904  *S = '-';
905  }
906  *p = '\00';
907 
908  return Start;
909 }
910 
911 
912 
914 /* ---------- */
915 {
916  int Bytes, b;
917  char *p;
918  Set S;
919 
920  Bytes = (MaxAttVal[Att]>>3) + 1;
921  S = AllocZero(Bytes, Byte);
922 
923  for ( p = PropVal ; *p ; )
924  {
925  p = RemoveQuotes(p);
926  b = Which(p, AttValName[Att], 1, MaxAttVal[Att]);
927  if ( ! b ) Error(MODELFILE, E_MFATTVAL, p);
928  SetBit(b, S);
929 
930  for ( p += strlen(p) ; *p != '"' ; p++ )
931  ;
932  p++;
933  if ( *p == ',' ) p++;
934  }
935 
936  return S;
937 }
938 
939 
940 
941 /*************************************************************************/
942 /* */
943 /* Character stream read for binary routines */
944 /* */
945 /*************************************************************************/
946 
947 
948 void StreamIn(String S, int n)
949 /* -------- */
950 {
951  while ( n-- ) *S++ = getc(TRf);
952 }