35 #define LR_CLASS2LABEL(class) \ 36 ((class) == LR_CLASS_HAM ? "ham" : \ 37 (class) == LR_CLASS_SPAM ? "spam" : "unknown") 39 #define LR_LABEL2CLASS(label) \ 40 (STRCASEEQUAL((label), "spam") ? LR_CLASS_SPAM : \ 41 STRCASEEQUAL((label), "ham") ? LR_CLASS_HAM : \ 58 lr_evt_fill(evt, cmd, date,
class, fname)
67 memset(evt, 0,
sizeof (*evt));
70 strlcpy(evt->fname, fname, sizeof (evt->fname));
76 #define LR_EVT_FILL(evt, cmd, date, class, fname) \ 78 if ((evt) != NULL) { \ 79 memset((evt), 0, sizeof(*(evt))); \ 80 (evt)->date = (date); \ 81 (evt)->class = (class); \ 82 strlcpy((evt)->fname, (fname), sizeof((evt)->fname)); \ 98 #define PILE_INIT(p) \ 101 memset(p, 0, sizeof(p)); \ 112 static bool pile_sort(
pile_T * pile);
121 #define CLI_CLASSIFY 1 125 #define CLI_DELAY_FN_CTE 0 126 #define CLI_DELAY_FN_EXP 1 127 #define CLI_DELAY_FN_FIX 2 153 #define CLI_OPT_INIT {FALSE, 7200, CLI_DELAY_FN_CTE, FALSE, -1., FALSE, -1., FALSE, 0, 0} 160 static int cli_lr_classify(
char *fileIn,
char *dataFile);
162 static int cli_lr_learn(
char *fileIn,
char *dataFile,
cli_opts_T * opt,
165 static int cli_lr_simul(
char *fileIn,
char *dataFile,
cli_opts_T * opt,
168 static int cli_lr_extract(
char *fileIn,
char *dataFile,
cli_opts_T * opt,
172 static double lrate = 0.004;
180 char *fileData =
"/tmp/lr.txt";
188 char *xmode =
"simul";
191 const char *args =
"hi:d:lR:L:x:o:";
195 while ((c = getopt(argc, argv, args)) != -1)
214 lrate = atof(optarg);
221 if (!decode_lr_options(&lrOpts, &cliOpt, optarg))
232 while (io < argc && *argv[io] ==
'-')
255 (void) cli_lr_learn(fileIn, fileData, &cliOpt, &lrOpts);
262 (void) cli_lr_classify(fileIn, fileData);
269 (void) cli_lr_simul(fileIn, fileData, &cliOpt, &lrOpts);
276 (void) cli_lr_extract(fileIn, fileData, &cliOpt, &lrOpts);
291 cli_lr_extract(fileIn, dataFile, cliopt, lropts)
298 FILE *fdata = NULL, *fin = NULL;
308 fin = fopen(fileIn,
"r");
324 int nham = 0, nspam = 0;
332 memset(&cargs, 0,
sizeof (cargs));
333 memset(&margs, 0,
sizeof (margs));
335 memset(stime, 0,
sizeof (stime));
336 memset(sclass, 0,
sizeof (sclass));
337 memset(sfile, 0,
sizeof (sfile));
339 while (fin != NULL && fscanf(fin,
"%s %s %s", stime, sclass, sfile) == 3)
346 snprintf(
id,
sizeof(
id),
"%08d", nid++);
367 cli_lr_classify(fileIn, dataFile)
371 char *
id =
"000000.000";
373 FILE *fdata = NULL, *fin = NULL;
383 fin = fopen(fileIn,
"r");
399 int nham = 0, nspam = 0;
406 memset(&cargs, 0,
sizeof (cargs));
407 memset(&margs, 0,
sizeof (margs));
408 memset(&mscore, 0,
sizeof (mscore));
410 memset(stime, 0,
sizeof (stime));
411 memset(sclass, 0,
sizeof (sclass));
412 memset(sfile, 0,
sizeof (sfile));
414 while (fin != NULL && fscanf(fin,
"%s %s %s", stime, sclass, sfile) == 3)
424 ZE_MessageInfo(10,
"%s %s op=%s judge=%-4s class=%-4s score=%.4f prob=%.4f",
427 spam ?
"spam" :
"ham",
428 mscore.
odds > 0.0 ?
"spam" :
"ham",
446 cli_lr_learn(fileIn, dataFile, cliopt, lropts)
452 char *
id =
"000000.000";
454 FILE *fdata = NULL, *fin = NULL;
468 fin = fopen(fileIn,
"r");
492 int nham = 0, nspam = 0;
493 int pham = 0, pspam = 0;
500 time_t t_last, t_now;
508 if ((env = getenv(
"LRRESAMPLEWIN")) != NULL)
518 memset(&mscore, 0,
sizeof (mscore));
519 memset(&cargs, 0,
sizeof (cargs));
520 memset(&margs, 0,
sizeof (margs));
522 memset(stime, 0,
sizeof (stime));
523 memset(sclass, 0,
sizeof (sclass));
524 memset(sfile, 0,
sizeof (sfile));
534 if (fscanf(fin,
"%s %s %s", stime, sclass, sfile) != 3)
550 ok =
lr_learn(
id, fname, &cargs, &margs, &mscore, spam);
554 "%s classification : %8.4f %.8f judge=%-4s class=%-4s" 555 " learn=%s query=%s features=%d" 556 " score=%g prob=%.6f",
567 memset(&evt, 0,
sizeof (evt));
573 pspam = nspam % nbMax;
588 if (nspam == 0 || nham == 0)
595 i = random() % nbMax;
597 i = random() % nspam;
598 fname = eSpam[i].
fname;
602 i = random() % nbMax;
605 fname = eHam[i].
fname;
609 lr_learn(
id, fname, &cargs, &margs, &mscore, spam);
636 cli_simul_handle(cargs, margs, pile, evt)
646 char *
id =
"000000.000";
648 memset(&mscore, 0,
sizeof (mscore));
650 memset(margs, 0,
sizeof (*margs));
665 ok =
lr_classify(
id, fname, cargs, margs, &mscore);
683 margs->
score = mscore;
701 #define ST_DIM 0x10000 704 cli_lr_simul(fileIn, dataFile, cliopt, lropts)
710 char *
id =
"000000.000";
712 FILE *fdata = NULL, *fin = NULL;
717 double pmiss = cliopt->
pmiss;
728 memset(stcl, 0,
sizeof (stcl));
732 if ((env = getenv(
"STDIM")) != NULL)
749 fin = fopen(fileIn,
"r");
773 int nham = 0, nspam = 0;
774 int pham = 0, pspam = 0;
791 if ((env = getenv(
"LRRESAMPLEWIN")) != NULL)
801 memset(&cargs, 0,
sizeof (cargs));
802 memset(&margs, 0,
sizeof (margs));
804 memset(stime, 0,
sizeof (stime));
805 memset(sclass, 0,
sizeof (sclass));
806 memset(sfile, 0,
sizeof (sfile));
814 memset(&fevt, 0,
sizeof (fevt));
815 if (fscanf(fin,
"%s %s %s", stime, sclass, sfile) == 3)
823 class = LR_LABEL2CLASS(sclass);
827 if (cliopt->
tend > 0 && date > cliopt->
tend)
831 if (cliopt->
tstart > 0 && date < cliopt->tstart)
836 (void) lr_evt_fill(&fevt,
LR_CMD_CLASS, date,
class, fname);
840 pspam = nspam % nbMax;
855 while (pile_check_bottom(&pile, &pevt))
862 (void) pile_shift(&pile, &pevt);
874 cli_simul_handle(&cargs, &margs, &pile, &pevt);
877 "%ld %s op=%s judge=%-4s class=%-4s" 878 " learn=%-5s query=%-5s miss=%-5s noisy=%-5s features=%d" 879 " score=%g prob=%.6f",
911 cli_simul_handle(&cargs, &margs, &pile, &fevt);
919 loss = fabs((spam ? 1. : 0.) - margs.
score.
value);
921 stcl[stp].
loss = loss;
944 if ((r = drand48()) < cliopt->
pmiss)
957 stmax =
MIN(stn, st_dim);
959 for (i = nq = ne = 0, risk = 0.; i < stmax; i++)
965 risk += stcl[i].
loss;
967 erate = ((double) ne) / stmax;
968 qrate = ((double) nq) / stmax;
970 ZE_MessageInfo(10,
"STATS: %7d QRATE %8.5f ERATE: %8.5f RISK: %8.5f",
971 stn, qrate, erate, risk);
987 if ((r = drand48()) < cliopt->
pnoise)
995 if (cargs.
nmsg >= 2000)
1006 x = -cliopt->
delay * log(1 - drand48() + 1.e-10);
1008 x =
MIN(x, (30 * 86400));
1013 evt.
date += 86400 - (evt.
date % 86400);
1020 (void) pile_push(&pile, &evt);
1027 "%ld %s op=%s judge=%-4s class=%-4s" 1028 " learn=%-5s query=%-5s miss=%-5s noisy=%-5s features=%d" 1029 " score=%g prob=%.6f",
1036 STRBOOL(query,
"true",
"false"),
1037 STRBOOL(miss,
"true",
"false"),
1038 STRBOOL(noisy,
"true",
"false"),
1043 if (resample && nspam > 1 && nham > 1)
1051 m = (nspam > nbMax) ? nbMax : nspam;
1053 fname = eSpam[i].
fname;
1056 m = (nham > nbMax) ? nbMax : nham;
1058 fname = eHam[i].
fname;
1061 (void) lr_evt_fill(&fevt, LR_CMD_LEARN_RESAMPLE, date, class, fname);
1062 cli_simul_handle(&cargs, &margs, &pile, &fevt);
1091 learn_callback(i, cargs, margs)
1098 static bool ok =
FALSE;
1099 static int srate = 2;
1100 static double ri = 0.5, rf = 0.004, teta = -0.040;
1106 if ((s = getenv(
"LRATEDEFS")) != NULL)
1112 memset(argv, 0,
sizeof (argv));
1113 strlcpy(ebuf, s,
sizeof (ebuf));
1115 if (argv[0] != NULL)
1116 srate = atof(argv[0]);
1117 if (argv[1] != NULL)
1119 if (argv[2] != NULL)
1124 if (argv[3] != NULL)
1125 teta = atof(argv[3]);
1136 r = ri / sqrt(i + 1);
1141 r = (ri - rf) * exp(teta * i) + rf;
1158 pile_push(pile, evt)
1165 if (pile->n >=
DPILE - 1)
1169 evt->
serial = pile->serial;
1170 pile->p[pile->n] = *evt;
1187 *evt = pile->p[pile->n - 1];
1194 pile_shift(pile, evt)
1208 for (i = 0; i < pile->n - 1; i++)
1209 pile->p[i] = pile->p[i + 1];
1217 pile_check_top(pile, evt)
1227 *evt = pile->p[pile->n - 1];
1233 pile_check_bottom(pile, evt)
1249 lrevtcmp(
const void *ea,
const void *eb)
1270 qsort(pile->p, pile->n, sizeof (
learn_evt_T), lrevtcmp);
1279 #define OPTION_BOOL(args, v) \ 1281 if (zeStrRegex((args), "^yes|true|ok$", NULL, NULL, TRUE)) \ 1283 if (zeStrRegex((args), "^no|false|ko$", NULL, NULL, TRUE)) \ 1287 #define OPTION_INT(args, v) \ 1289 (v) = zeStr2long(args, NULL, (v)); \ 1292 #define OPTION_DOUBLE(args, v) \ 1294 (v) = zeStr2double(args, NULL, (v)); \ 1298 decode_lr_options(lrOpts, cliOpt, optarg)
1310 strlcpy(buf, optarg,
sizeof buf);
1456 printf(
"Usage : %s options\n" 1457 " -h : this message\n" 1458 " -l : toggle learn option\n" 1459 " -i : input file with commands\n" 1461 " -a : active learning\n" 1462 " -t : active learning threshold (0.5 - margin)\n" 1464 " -m : feedback miss probability ([0.0,1.0])\n" 1465 " -R : asymptotic learning rate\n" 1466 " -o : option=value \n" 1467 " -x mode : where mode in learn, class, simulate\n", arg);
#define LR_CMD_LEARN_RESAMPLE
#define OPTION_DOUBLE(args, v)
bool lr_data_open(char *fname)
int main(int argc, char **argv)
#define LR_OPTS_INITIALIZER
bool lr_data_dump(char *fname)
#define LR_CMD_LEARN_FEEDBACK
bool lr_set_learn_callback(lr_callback_F)
#define STRNCASEEQUAL(a, b, n)
bool lr_learn_options(bool active, double threshold)
bool lr_classify(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs, test_score_T *score)
void zeLog_SetOutput(bool, bool)
int zeStr2Tokens(char *, int, char **, char *)
#define OPTION_BOOL(args, v)
long zeStr2long(char *s, int *error, long dval)
bool lr_learn(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs, test_score_T *score, bool spam)
#define ZE_MessageInfo(level,...)
#define ZE_LogSysError(...)
int configure(char *, char *, bool)
#define OPTION_INT(args, v)
#define STRCASEEQUAL(a, b)
bool lr_extract(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs)
time_t zeStr2time(char *s, int *error, time_t dval)