60 Bell[n][k] = ( k == 1 || k == n ? 1 :
80 DiscrValue V1, V2, V3, BestV1, BestV2, InitialBlocks, First=1, Prelim=0;
82 double BaseInfo, ThisGain, ThisInfo, Penalty, UnknownRate,
83 Val, BestVal, BestGain, BestInfo, PrevGain, PrevInfo;
93 GEnv.ReasonableSubsets = 0;
99 if ( !
GEnv.ReasonableSubsets )
102 fprintf(
Of,
"\tAtt %s: poor initial split\n",
AttName[Att]))
107 KnownCases = Cases -
GEnv.ValFreq[0];
108 UnknownRate =
GEnv.ValFreq[0] / Cases;
115 BestVal = PrevGain / PrevInfo;
117 Verbosity(2, fprintf(
Of,
"\tAtt %s",
AttName[Att]))
121 fprintf(
Of,
"\tinitial inf %.3f, gain %.3f, val=%.3f\n",
122 PrevInfo, PrevGain, BestVal))
136 if ( ++
GEnv.Blocks < V1 )
151 if ( V1 == 1 ) First = 2;
163 Gain[Att] = PrevGain;
164 Info[Att] = PrevInfo;
188 if (
GEnv.ValFreq[V2] && ++V3 != V2 )
202 Penalty = ( finite(
Bell[InitialBlocks][
GEnv.Blocks]) ?
203 Log(
Bell[InitialBlocks][
GEnv.Blocks]) :
204 (InitialBlocks-
GEnv.Blocks+1) * Log(
GEnv.Blocks) );
206 Val = (PrevGain - Penalty / Cases) / PrevInfo;
207 Better = (
GEnv.Blocks >= 2 &&
GEnv.ReasonableSubsets >= 2 &&
212 fprintf(
Of,
"\tprelim merges -> inf %.3f, gain %.3f, val %.3f%s%s",
213 PrevInfo, PrevGain, Val,
214 ( Better ?
" **" :
"" ),
217 GEnv.ValFreq,
false))
229 Info[Att] = PrevInfo;
230 Gain[Att] = PrevGain - Penalty / KnownCases;
239 GEnv.SubsetInfo[V1] = -
GEnv.ValFreq[V1] * Log(
GEnv.ValFreq[V1] / Cases);
253 while (
GEnv.Blocks > 2 )
266 if (
GEnv.ReasonableSubsets == 2 &&
273 ThisGain = PrevGain -
274 ((1-UnknownRate) / KnownCases) *
275 (
GEnv.MergeEntr[V1][V2] -
276 (
GEnv.SubsetEntr[V1] +
GEnv.SubsetEntr[V2]));
277 ThisInfo = PrevInfo + (
GEnv.MergeInfo[V1][V2] -
278 (
GEnv.SubsetInfo[V1] +
GEnv.SubsetInfo[V2])) / Cases;
280 fprintf(
Of,
"\t combine %d %d info %.3f gain %.3f\n",
281 V1, V2, ThisInfo, ThisGain))
285 if ( ThisGain > BestGain+
Epsilon )
295 if ( ! BestV1 )
break;
303 Penalty = ( finite(
Bell[InitialBlocks][
GEnv.Blocks-1]) ?
304 Log(
Bell[InitialBlocks][
GEnv.Blocks-1]) :
305 (InitialBlocks-
GEnv.Blocks+1) * Log(
GEnv.Blocks-1) );
307 Val = (BestGain - Penalty / Cases) / BestInfo;
309 Merge(BestV1, BestV2, Cases);
312 fprintf(
Of,
"\tform subset ");
314 fprintf(
Of,
": %d subsets, inf %.3f, gain %.3f, val %.3f%s\n",
315 GEnv.Blocks, BestInfo, BestGain, Val,
316 ( Val > BestVal ?
" **" :
"" ));
322 if ( Val >= BestVal )
331 Info[Att] = BestInfo;
332 Gain[Att] = BestGain - Penalty / KnownCases;
346 fprintf(
Of,
"\tfinal inf %.3f, gain %.3f, val=%.3f\n",
373 Entr -=
GEnv.Freq[x][c] * Log(
GEnv.Freq[x][c]);
374 KnownCases +=
GEnv.Freq[x][c];
377 GEnv.SubsetInfo[x] = -
GEnv.ValFreq[x] * Log(
GEnv.ValFreq[x] / Cases);
378 GEnv.SubsetEntr[x] = Entr + KnownCases * Log(KnownCases);
386 GEnv.SubsetInfo[R] =
GEnv.SubsetInfo[R+1];
387 GEnv.SubsetEntr[R] =
GEnv.SubsetEntr[R+1];
391 GEnv.MergeInfo[R][C] =
GEnv.MergeInfo[R+1][C];
392 GEnv.MergeEntr[R][C] =
GEnv.MergeEntr[R+1][C];
400 GEnv.MergeInfo[R][C] =
GEnv.MergeInfo[R][C+1];
401 GEnv.MergeEntr[R][C] =
GEnv.MergeEntr[R][C+1];
437 F =
GEnv.ValFreq[x] +
GEnv.ValFreq[y];
438 GEnv.MergeInfo[x][y] = - F * Log(F / Cases);
442 F =
GEnv.Freq[x][c] +
GEnv.Freq[y][c];
446 GEnv.MergeEntr[x][y] = Entr + KnownCases * Log(KnownCases);
464 D1 =
GEnv.ValFreq[V1];
465 D2 =
GEnv.ValFreq[V2];
469 if ( fabs(
GEnv.Freq[V1][c] / D1 -
GEnv.Freq[V2][c] / D2) > 0.001 )
496 GEnv.ReasonableSubsets--;
503 GEnv.ReasonableSubsets++;
508 GEnv.Freq[V1][c] +=
GEnv.Freq[V2][c];
510 GEnv.ValFreq[V1] +=
GEnv.ValFreq[V2];
511 GEnv.ValFreq[V2] = 0;
514 GEnv.WSubset[V1][b] |=
GEnv.WSubset[V2][b];
534 GEnv.Freq[V1][c] =
GEnv.Freq[V2][c];
536 GEnv.ValFreq[V1] =
GEnv.ValFreq[V2];
537 CopyBits(
GEnv.Bytes,
GEnv.WSubset[V2],
GEnv.WSubset[V1]);