/* classify-defns.h -- part of mageec 0.1.0
   (MAchine Guided Energy Efficient Compilation) */
/*************************************************************************/
/*                                                                       */
/*  Source code for use with See5/C5.0 Release 2.10                      */
/*  -----------------------------------------------                      */
/*  Copyright RuleQuest Research 2013                                    */
/*                                                                       */
/*  This code is provided "as is" without warranty of any kind,          */
/*  either express or implied.  All use is at your own risk.             */
/*                                                                       */
/*************************************************************************/


#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <time.h>
#ifdef WIN32
#include <windows.h>
#endif


/*************************************************************************/
/*                                                                       */
/*  Constants, macros etc.                                               */
/*                                                                       */
/*************************************************************************/


/* Valueless build tag.
   NOTE(review): presumably tested with #ifdef elsewhere -- confirm. */
#define  SEE5

#define  Nil        0           /* null pointer */
#define  false      0
#define  true       1
#define  None       (-1)        /* parenthesized so the sign cannot fuse
                                   with neighbouring tokens */
#define  Epsilon    1E-4

/* Special attribute status bits.  Each is a distinct power of two so that
   several can be OR-ed together and tested with a single mask. */
#define  EXCLUDE    1   /* special attribute status: do not use */
#define  SKIP       2   /* do not use in classifiers */
#define  DISCRETE   4   /* special status: collect values as data read */
#define  ORDERED    8   /* special status: ordered discrete values */
#define  DATEVAL    16  /* special status: YYYY/MM/DD or YYYY-MM-DD */
#define  STIMEVAL   32  /* special status: HH:MM:SS */
#define  TSTMPVAL   64  /* date time */

/* Unknown and N/A values are represented by unlikely floating-point
   numbers, written here as the octal integers 01600000000 and 01.
   The decimal figures beside each constant are the values those bit
   patterns take when read as a single-precision float. */
#define  UNKNOWN    01600000000 /* 1.5777218104420236e-30 */
#define  NA         01          /* 1.4012984643248171e-45 */

/* Branch / test type codes.
   NOTE(review): presumably the values stored in NodeType fields -- confirm. */
#define  BrDiscr    1
#define  BrThresh   2
#define  BrSubset   3

/* Typed allocation wrappers around Pmalloc / Pcalloc / Prealloc.
   N is an element count and T an element type.  Each expansion is fully
   parenthesized so the result can be indexed or dereferenced directly.
   Alloc and Realloc compute (N)*sizeof(T) with no overflow check.
   Realloc assigns the new block back to V, so V is evaluated twice and
   must be a side-effect-free lvalue. */
#define  Alloc(N,T)             ((T *) Pmalloc((N)*sizeof(T)))
#define  AllocZero(N,T)         ((T *) Pcalloc((N), sizeof(T)))
#define  Realloc(V,N,T)         ((V) = (T *) Prealloc((V), (N)*sizeof(T)))

/* Larger / smaller of two values.  Each argument may be evaluated twice,
   so do not pass expressions with side effects. */
#define  Max(a,b)               ((a)>(b) ? (a) : (b))
#define  Min(a,b)               ((a)<(b) ? (a) : (b))

/* Bitmap helpers: bit b lives in byte b>>3 at position b&7 of array s.
   Bit(b)      mask with only bit b set
   In(b,s)     non-zero iff bit b is set in s
   SetBit(b,s) sets bit b in s
   s is parenthesized so pointer expressions such as p + 1 index correctly. */
#define  Bit(b)                 (1 << (b))
#define  In(b,s)                ((s)[(b) >> 3] & Bit((b) & 07))
#define  SetBit(b,s)            ((s)[(b) >> 3] |= Bit((b) & 07))

/* Loop header running v from f up to and including l.
   The body is skipped entirely when f > l. */
#define  ForEach(v,f,l)         for ( (v) = (f) ; (v) <= (l) ; ++(v) )

/* Attribute status tests.  All of these index SpecialStatus[] (and, for
   Discrete / Continuous, MaxAttVal[]) by attribute number a; both arrays
   must be visible where the macros are used.
   StatBit(a,b) yields the bits of mask b that are set for attribute a. */
#define  StatBit(a,b)           (SpecialStatus[(a)]&(b))
#define  Exclude(a)             StatBit((a),EXCLUDE)
#define  Skip(a)                StatBit((a),EXCLUDE|SKIP)
#define  Discrete(a)            (MaxAttVal[(a)] || StatBit((a),DISCRETE))
#define  Continuous(a)          (! MaxAttVal[(a)] && ! StatBit((a),DISCRETE))
#define  Ordered(a)             StatBit((a),ORDERED)
#define  DateVal(a)             StatBit((a),DATEVAL)
#define  TimeVal(a)             StatBit((a),STIMEVAL)
#define  TStampVal(a)           StatBit((a),TSTMPVAL)

/* Space(s): non-zero iff s is a blank, newline, carriage return or tab.
   s is evaluated up to four times, so pass a side-effect-free value. */
#define  Space(s)       ((s)==' ' || (s)=='\n' || (s)=='\r' || (s)=='\t')

/* SkipComment: loop header that reads characters with InChar(f) into c
   until a newline or EOF is seen.  It relies on variables named c and f
   being in scope and is meant to be followed by a ';' (empty body). */
#define  SkipComment    while ( ( c = InChar(f) ) != '\n' && c != EOF )

/* Release heap memory.  free() already ignores a null pointer, so
   FreeUnlessNil needs no explicit test.  Dropping the bare `if' also
   removes the dangling-else trap when the macro is used inside an
   if/else, and p is now evaluated only once. */
#define  FreeUnlessNil(p)       free(p)
#define  Free(x)                free(x)

86 #define assert(x)
87 
#ifdef WIN32
/* WIN32 builds substitute these definitions.  This rint() rounds halves
   away from zero and passes through int, so it is only meaningful for
   values within int range. */
#define  rint(x)        ((x)<0 ? (double)((int)((x)-0.5)) :\
                                 (double)((int)((x)+0.5)))
#define  finite(x)      _finite(x)
#define  strdup(x)      _strdup(x)
#endif

#define  P1(x)          (rint((x)*10) / 10)     /* round to one decimal */
#define  Of             stdout                  /* output stream */
#define  Goodbye(x)     exit(x)                 /* terminate with status x */
#define  CharWidth(s)   ((int) strlen(s))       /* length of s as an int */


/* Error numbers, consecutive from 0.
   NOTE(review): presumably the ErrNo argument of Error() -- confirm. */
#define  NOFILE          0
#define  BADCLASSTHRESH  1
#define  LEQCLASSTHRESH  2
#define  BADATTNAME      3
#define  EOFINATT        4
#define  SINGLEATTVAL    5
#define  BADATTVAL       6
#define  BADNUMBER       7
#define  BADCLASS        8
#define  BADCOSTCLASS    9
#define  BADCOST         10
#define  NOMEM           11
#define  TOOMANYVALS     12
#define  BADDISCRETE     13
#define  NOTARGET        14
#define  BADCTARGET      15
#define  BADDTARGET      16
#define  LONGNAME        17
#define  HITEOF          18
#define  MISSNAME        19
#define  BADDATE         20
#define  BADTIME         21
#define  BADTSTMP        22
#define  DUPATTNAME      23
#define  UNKNOWNATT      24
#define  BADDEF1         25
#define  BADDEF2         26
#define  BADDEF3         27
#define  BADDEF4         28
#define  SAMEATT         29
#define  MODELFILE       30
#define  CWTATTERR       31


/*************************************************************************/
/*                                                                       */
/*  Type definitions                                                     */
/*                                                                       */
/*************************************************************************/


/* Byte-sized scalar types.  Set is declared on its own line so that the
   pointer declarator is not hidden among the value types. */
typedef  unsigned char  Boolean,
                        BranchType,
                        Byte;
typedef  unsigned char  *Set;
typedef  char           *String;

typedef  int    CaseNo;         /* data item number */
typedef  float  CaseCount;      /* count of (partial) items */

typedef  int    ClassNo,        /* class number, 1..MaxClass */
                DiscrValue,     /* discrete attribute value (0 = ?) */
                Attribute;      /* attribute number, 1..MaxAtt */

typedef  float  ContValue;      /* continuous attribute value */
#define  PREC   7               /* precision */


156 typedef union _def_val
157  {
158  String _s_val; /* att val for comparison */
159  ContValue _n_val; /* number for arith */
160  }
161  DefVal;
162 
163 
164 typedef struct _def_elt
165  {
166  short _op_code; /* type of element */
167  DefVal _operand; /* string or numeric value */
168  }
169  DefElt, *Definition;
170 
171 
/* Span and kind of one parsed element. */
typedef  struct _elt_rec
         {
            int         Fi,             /* index of first char of element */
                        Li;             /* index of last char of element */
            char        Type;           /* 'B', 'S', or 'N' */
         }
         EltRec;


/* Field accessors for a DefElt value DE (usable as lvalues).  DE is
   parenthesized so that expressions such as *p select the right object. */
#define  DefOp(DE)      (DE)._op_code
#define  DefSVal(DE)    (DE)._operand._s_val
#define  DefNVal(DE)    (DE)._operand._n_val

/* Opcodes stored in a definition element's _op_code field */
#define  OP_ATT          0      /* opcodes */
#define  OP_NUM          1
#define  OP_STR          2
#define  OP_MISS         3
#define  OP_AND         10
#define  OP_OR          11
#define  OP_EQ          20
#define  OP_NE          21
#define  OP_GT          22
#define  OP_GE          23
#define  OP_LT          24
#define  OP_LE          25
#define  OP_SEQ         26
#define  OP_SNE         27
#define  OP_PLUS        30
#define  OP_MINUS       31
#define  OP_UMINUS      32
#define  OP_MULT        33
#define  OP_DIV         34
#define  OP_MOD         35
#define  OP_POW         36
#define  OP_SIN         40
#define  OP_COS         41
#define  OP_TAN         42
#define  OP_LOG         43
#define  OP_EXP         44
#define  OP_INT         45
#define  OP_END         99


215 typedef union _attribute_value
216  {
219  }
220  AttValue, *DataRec;
221 
/* Accessors for a case (an array of attribute values).  Case is
   parenthesized throughout so pointer expressions such as p + 1 work.
   Class reads element 0 and Weight reads the element just before it.
   XDVal keeps only the low 24 bits of the discrete value. */
#define  CVal(Case,Att)         (Case)[Att]._cont_val
#define  DVal(Case,Att)         (Case)[Att]._discr_val
#define  XDVal(Case,Att)        ((Case)[Att]._discr_val & 077777777)
#define  SVal(Case,Att)         (Case)[Att]._discr_val
#define  Class(Case)            (*(Case))._discr_val
#define  Weight(Case)           (*((Case)-1))._cont_val

/* Tests for the special UNKNOWN and NA markers, on a case/attribute pair
   or directly on an attribute value AV */
#define  Unknown(Case,Att)      (DVal(Case,Att)==UNKNOWN)
#define  UnknownVal(AV)         ((AV)._discr_val==UNKNOWN)
#define  NotApplic(Case,Att)    (DVal(Case,Att)==NA)
#define  NotApplicVal(AV)       ((AV)._discr_val==NA)


235 typedef struct _treerec *Tree;
236 typedef struct _treerec
237  {
239  ClassNo Leaf; /* best class at this node */
240  CaseCount Cases, /* no of items at this node */
241  *ClassDist, /* class distribution of items */
242  Errors; /* no of errors at this node */
243  Attribute Tested; /* attribute referenced in test */
244  int Forks; /* number of branches at this node */
245  ContValue Cut, /* threshold for continuous attribute */
246  Lower, /* lower limit of soft threshold */
247  Upper, /* upper limit ditto */
248  Mid; /* 50% point */
249  Set *Subset; /* subsets of discrete values */
250  Tree *Branch; /* Branch[x] = subtree for outcome x */
251  }
252  TreeRec;
253 
254 
typedef  int    RuleNo;         /* rule number */

257 typedef struct _condrec
258  {
259  BranchType NodeType; /* test type (see tree nodes) */
260  Attribute Tested; /* attribute tested */
261  int Forks; /* possible branches */
262  ContValue Cut; /* threshold (if relevant) */
263  Set Subset; /* subset (if relevant) */
264  int TestValue, /* specified outcome of test */
265  TestI; /* rule tree index of this test */
266  }
267  CondRec, *Condition;
268 
269 
270 typedef struct _rulerec
271  {
272  RuleNo RNo; /* rule number */
273  int TNo, /* trial number */
274  Size; /* number of conditions */
275  Condition *Lhs; /* conditions themselves */
276  ClassNo Rhs; /* class given by rule */
277  CaseCount Cover, /* number of cases covered by rule */
278  Correct; /* number on which correct */
279  float Prior; /* prior probability of RHS */
280  int Vote; /* unit = 0.001 */
281  }
282  RuleRec, *CRule;
283 
284 
285 typedef struct _ruletreerec *RuleTree;
286 typedef struct _ruletreerec
287  {
288  RuleNo *Fire; /* rules matched at this node */
289  Condition CondTest; /* new test */
290  int Forks; /* number of branches */
291  RuleTree *Branch; /* subtrees */
292  }
293  RuleTreeRec;
294 
295 
296 typedef struct _rulesetrec
297  {
298  RuleNo SNRules; /* number of rules */
299  CRule *SRule; /* rules */
300  ClassNo SDefault; /* default class for this ruleset */
301  RuleTree RT; /* rule tree for this ruleset */
302  }
304 
305 
306 typedef struct _classify_environment
307  {
308  CaseNo Fp; /* for SMP */
309  double *ClassWt; /* total class votes */
310  float *Vote, /* class boost votes */
311  Confidence; /* prediction CF */
312  RuleNo *Active, /* active rules */
313  ActiveSpace, /* space for same */
314  NActive; /* number of same */
315  CRule *MostSpec; /* most specific active rules */
316  Boolean *AttUsed; /* reserved for possible later use */
317  RuleNo *RulesUsed, /* all applicable rules */
318  NRulesUsed; /* number of same */
319  }
320  CEnvRec, *CEnv;
321 
322 
323 /*************************************************************************/
324 /* */
325 /* Function prototypes */
326 /* */
327 /*************************************************************************/
328 
329 Boolean ReadName(FILE *f, String s, int n, char ColonOpt);
330 void GetNames(FILE *Nf);
331 void ExplicitAtt(FILE *Nf);
332 int Which(String Val, String *List, int First, int Last);
333 int InChar(FILE *f);
334 
335 DataRec GetDataRec(FILE *Df, Boolean Train);
336 int StoreIVal(String S);
337 void CheckValue(DataRec DVec, Attribute Att);
338 
339 void ImplicitAtt(FILE *Nf);
340 void ReadDefinition(FILE *f);
341 void Append(char c);
343 Boolean Conjunct();
346 Boolean Term();
347 Boolean Factor();
348 Boolean Primary();
349 Boolean Atom();
350 Boolean Find(String S);
351 int FindOne(String *Alt);
353 void DefSyntaxError(String Msg);
354 void DefSemanticsError(int Fi, String Msg, int OpCode);
355 void Dump(char OpCode, ContValue F, String S, int Fi);
356 void DumpOp(char OpCode, int Fi);
357 Boolean UpdateTStack(char OpCode, ContValue F, String S, int Fi);
358 AttValue EvaluateDef(Definition D, DataRec Case);
359 
360 void ReadFilePrefix(String Extension);
361 void ReadHeader();
362 Tree GetTree(String Extension);
363 Tree InTree();
364 CRuleSet GetRules(String Extension);
365 CRuleSet InRules();
366 CRule InRule();
367 Condition InCondition();
368 void ConstructRuleTree(CRuleSet RS);
369 void SetTestIndex(Condition C);
370 RuleTree GrowRT(RuleNo *RR, int RRN, CRule *Rule);
371 int DesiredOutcome(CRule R, int TI);
372 int SelectTest(RuleNo *RR, int RRN, CRule *Rule);
373 int ReadProp(char *Delim);
376 Tree Leaf(double *Freq, ClassNo NodeClass, CaseCount Cases,
377  CaseCount Errors);
378 
379 void GetMCosts(FILE *f);
380 
381 ClassNo TreeClassify(DataRec CaseDesc, Tree DecisionTree, CEnv E);
382 void FollowAllBranches(DataRec CaseDesc, Tree T, float Fraction,
383  double *Prob, Boolean *AttUsed);
384 void FindLeaf(DataRec CaseDesc, Tree T, Tree PT, float Wt, double *Prob,
385  Boolean *AttUsed);
386 ClassNo RuleClassify(DataRec CaseDesc, CRuleSet RS, CEnv E);
387 int FindOutcome(DataRec Case, Condition OneCond);
388 Boolean Satisfies(DataRec CaseDesc, Condition OneCond);
389 Boolean Matches(CRule R, DataRec Case);
390 void CheckActiveSpace(int N, CEnv E);
391 void MarkActive(RuleTree RT, DataRec Case, CEnv E);
392 ClassNo BoostClassify(DataRec CaseDesc, int MaxTrial, CEnv E);
393 ClassNo SelectClass(ClassNo Default, Boolean UseCosts, double *Prob);
394 double MisclassCost(double *LocalFreq, ClassNo C);
395 ClassNo Classify(DataRec CaseDesc, CEnv E);
396 float Interpolate(Tree T, ContValue Val);
397 
398 FILE * GetFile(String Extension, String RW);
399 void CheckFile(String Extension, Boolean Write);
400 
401 char ProcessOption(int Argc, char *Argv[], char *Options);
402 void *Pmalloc(size_t Bytes);
403 void *Prealloc(void *Present, size_t Bytes);
404 void *Pcalloc(size_t Number, unsigned Size);
405 void Error(int ErrNo, String S1, String S2);
407 int GetInt(String S, int N);
408 int DateToDay(String DS);
409 int TimeToSecs(String TS);
410 void SetTSBase(int y);
411 int TStampToMins(String TS);
412 
413 void FreeGlobals();
414 void FreeCosts();
415 void FreeNames();
416 void FreeTree(Tree T);
417 void FreeRule(CRule R);
418 void FreeRuleTree(RuleTree RT);
419 void FreeRules(CRuleSet RS);
420 void FreeLastCase(DataRec DVec);
421 void FreeVector(void **V, int First, int Last);
422 
423 
/*************************************************************************/
/*                                                                       */
/*  Text strings                                                         */
/*                                                                       */
/*************************************************************************/


/* Message macros.  Those that take parameters expand to a format string
   followed by its arguments, so they must be used as the trailing
   arguments of a printf-style call. */
#define  TX_Line(l,f)           "\n*** line %d of `%s': ", l, f
#define  E_NOFILE(f,e)          "cannot open file %s%s\n", f, e
#define  E_BADATTNAME           "`:' or `:=' expected after attribute name"\
                                        " `%s'\n"
#define  E_EOFINATT             "unexpected eof while reading attribute `%s'\n"
#define  E_SINGLEATTVAL(a,v)    "attribute `%s' has only one value `%s'\n",\
                                        a, v
#define  E_DUPATTNAME           "multiple attributes with name `%s'\n"
#define  E_CWTATTERR            "case weight attribute must be continuous\n"
#define  E_BADATTVAL(v,a)       "bad value of `%s' for attribute `%s'\n", v, a
#define  E_BADNUMBER(a)         "value of `%s' changed to `?'\n", a
/* a stray `l' used to follow the closing quote in this message */
#define  E_BADCLASS             "bad class value `%s'\n"
/* More messages.  Those that take parameters expand to a format string
   followed by its arguments. */
#define  E_BADCLASSTHRESH       "bad class threshold `%s'\n"
#define  E_LEQCLASSTHRESH       "class threshold `%s' <= previous threshold\n"
#define  E_BADCOSTCLASS         "bad class `%s'\n"
#define  E_BADCOST              "bad cost value `%s'\n"
#define  E_NOMEM                "unable to allocate sufficient memory\n"
#define  E_TOOMANYVALS(a,n)     "too many values for attribute `%s'"\
                                        " (max %d)\n", a, n
#define  E_BADDISCRETE          "bad number of discrete values for attribute"\
                                        " `%s'\n"
#define  E_NOTARGET             "target attribute `%s' not found\n"
#define  E_BADCTARGET           "target attribute `%s' must be"\
                                        " type `continuous'\n"
#define  E_BADDTARGET           "target attribute `%s' must be specified by"\
                                        " a list of discrete values\n"
#define  E_LONGNAME             "overlength name: check data file formats\n"
#define  E_HITEOF               "unexpected end of file\n"
#define  E_MISSNAME             "missing name or value before `%s'\n"
#define  E_BADTSTMP(d,a)        "bad timestamp `%s' for attribute `%s'\n", d, a
#define  E_BADDATE(d,a)         "bad date `%s' for attribute `%s'\n", d, a
#define  E_BADTIME(d,a)         "bad time `%s' for attribute `%s'\n", d, a
#define  E_UNKNOWNATT           "unknown attribute name `%s'\n"
464 #define E_BADDEF1(a,s,x) "in definition of attribute `%s':\n"\
465  "\tat `%.12s': expect %s\n", a, s, x
466 #define E_BADDEF2(a,s,x) "in definition of attribute `%s':\n"\
467  "\t`%s': %s\n", a, s, x
468 #define E_BADDEF3 "cannot define target attribute `%s'\n"
469 #define E_BADDEF4 "[warning] target attribute appears in"\
470  " definition of attribute `%s'\n"
471 #define E_SAMEATT(a,b) "[warning] attribute `%s' is identical to"\
472  " attribute `%s'\n", a, b
473 #define EX_MODELFILE(f) "file %s incompatible with .names file\n", f
474 #define E_MFATT "undefined or excluded attribute"
475 #define E_MFATTVAL "undefined attribute value"
476 #define E_MFCLASS "undefined class"
477 #define E_MFEOF "unexpected eof"
478 #define T_ErrorLimit "Error limit exceeded\n"