36 # define LR_BODY_LENGTH 256 37 # define LR_RAW_LENGTH 2500 47 #define DATA_INIT {FALSE, JBT_INITIALIZER} 49 static bool lr_task(
char *
id,
73 lrtokcmp(
void *a,
void *b)
87 static double lrate =
LRATE;
110 #define LR_INITIALIZER \ 112 FALSE, PTHREAD_MUTEX_INITIALIZER, JBT_INITIALIZER, \ 115 NULL, LR_OPTS_INITIALIZER} 125 lr_data_read(bt, fname)
137 fin = fopen(fname,
"r");
143 while (fgets(buf,
sizeof (buf), fin) != NULL)
167 lr_data.
ns = atoi(argv[1]);
172 lr_data.
nh = atoi(argv[1]);
215 memset(&tok, 0,
sizeof (tok));
216 if (sscanf(buf,
"%x %lg %d %d %d", &tok.
tok.
utok, &tok.
weight,
263 if ((lr_data.
fname = strdup(fname)) == NULL)
268 if (lr_data_read(&lr_data.
lrbt, fname) < 0)
283 if ((env = getenv(
"LR_LRATE")) != NULL)
295 if ((env = getenv(
"LR_USE_RAW_MSG")) != NULL)
301 if ((env = getenv(
"LR_RAW_LENGTH")) != NULL)
310 if ((env = getenv(
"LR_BODY_LENGTH")) != NULL)
319 if ((env = getenv(
"LR_USE_BODY")) != NULL)
328 if ((env = getenv(
"LR_USE_HEADERS")) != NULL)
404 lr_lrate_F lrate_function;
406 bool active_learning;
407 double active_threshold;
419 lr_browse_dump(vtok, varg)
428 fprintf(fout,
"%08lx %lg %5d %5d %5d\n", (
long unsigned int) tok->
tok.
utok, tok->
weight,
449 fout = fopen(fname,
"w");
455 fprintf(fout,
"<HEAD>\n");
456 fprintf(fout,
"date=%ld\n", now);
457 fprintf(fout,
"toktype=%d\n", 1);
458 fprintf(fout,
"toklength=%d\n", 4);
460 fprintf(fout,
"spams=%d\n", lr_data.
ns);
461 fprintf(fout,
"hams=%d\n", lr_data.
nh);
462 fprintf(fout,
"spamsu=%d\n", lr_data.
nsu);
463 fprintf(fout,
"hamsu=%d\n", lr_data.
nhu);
465 fprintf(fout,
"useheaders=%s\n",
468 fprintf(fout,
"</HEAD>\n");
469 fprintf(fout,
"<DATA>\n");
471 fprintf(fout,
"</DATA>\n");
485 static char *uheaders[] = {
490 "X-ze-filter-status",
495 "DomainKey-Signature",
496 "Authentication-Results",
506 static char *uheaders[] = {
511 "X-ze-filter-status",
512 "X-ze-filter-Enveloppe",
518 "DomainKey-Signature",
519 "Authentication-Results",
526 "X-Antivirus-Status",
529 "X-DSPAM-Confidence",
531 "X-DSPAM-Improbability",
532 "X-DSPAM-Probability",
540 static char *mymtas[] = {
542 "by .*.mines-paristech.fr",
544 "from .*.mines-paristech.fr",
547 "from .*.renater.fr",
552 #define DATE_EXPR "(Sun|Mon|Tue|Wed|Thu|Fri|Sat)?,? +[0-9]+ " \ 553 "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) +20[0-9]{2}" \ 554 " +([0-9]{1,2}:[0-9]{2}:[0-9]{2} +([-+]?[0-9]{2}00)?)?" 562 int i, slen, tlen = 4;
567 for (i = 0; i <= slen - tlen; i++)
578 memset(&token, 0,
sizeof (token));
596 bt_browse_classify(
void *vtok,
void *varg)
598 double *score = (
double *) varg;
613 bt_browse_dump_tokens(
void *vtok,
void *varg)
616 char *
id = (
char *) varg;
618 id = (
id != NULL ? id :
"000");
620 printf(
"%-14s %08lx\n",
id, (
long unsigned int) tok->
tok.
utok);
626 bt_browse_adjust(
void *vtok,
void *varg)
629 double *delta = (
double *) varg;
663 tokens_mime_part(buf, size,
id, level, type, arg, mpart)
674 char *mtype =
"TEXT";
679 bool clHeaders =
TRUE;
688 if ((env = getenv(
"LR_CLEANUP_HEADERS")) != NULL)
692 if ((env = getenv(
"LR_CLEANUP_DATES")) != NULL)
700 for (h = mpart->
hdrs; h != NULL; h = h->
next)
705 if (h->
value == NULL || strlen(h->
value) == 0)
708 snprintf(vbuf,
sizeof (vbuf),
"%s: %s", h->
key, h->
value);
711 for (s = uheaders; s != NULL && *s != NULL; s++)
721 for (s = mymtas; s != NULL && *s != NULL; s++)
726 if (strstr(h->
value, *s))
736 for (s = mymtas; s != NULL && *s != NULL; s++)
741 if (strstr(h->
value, *s))
749 snprintf(vbuf,
sizeof (vbuf),
"%s: %s", h->
key, h->
value);
759 for (i = 0, p = vbuf; i < lm; i++)
761 if (i < pi || i > pf)
768 scan_msg_part(&data->
bt, vbuf);
782 scan_msg_part(&data->
bt, vbuf);
795 lr_task(
id, fname, cargs, margs, mscore, task, learn, spam)
806 double score = 0.0, prob = 0.5;
809 double ptarget, delta;
830 fd = open(fname, O_RDONLY);
836 memset(buf, 0,
sizeof (buf));
837 nc = read(fd, buf, 2500);
845 scan_msg_part(&data.
bt, buf);
860 prob = 1. / (1. + exp(-score));
865 mscore->
value = prob;
866 mscore->
odds = score;
892 ptarget = spam ? 1.0 : 0.0;
1018 lr_data.
opts = *opts;
1033 *opts = lr_data.
opts;
1047 popts = (opts != NULL ? opts : &lr_data.
opts);
1049 printf(
"# lrate %7.3f\n" 1053 "# bodyLength %ld\n" 1056 "# cleanUpHeaders %d\n" 1057 "# cleanUpDates %d\n" 1060 "# active_learning %d\n" 1061 "# active_margin %.3f\n"
double(* lr_callback_F)(int, lr_cargs_T *, lr_margs_T *)
bool lr_data_open(char *fname)
void * zeBTree_Get(ZEBT_T *, void *)
bool lr_data_dump(char *fname)
bool zeBTree_Init(ZEBT_T *, size_t, ZEBT_CMP_F)
#define MUTEX_UNLOCK(mutex)
lr_callback_F learn_callback
bool lr_set_learn_callback(lr_callback_F funct)
bool lr_learn_options(bool active, double margin)
#define MUTEX_LOCK(mutex)
bool lr_get_options(lr_opts_T *opts)
bool lr_classify(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs, test_score_T *mscore)
bool zeBTree_Add(ZEBT_T *, void *)
bool lr_set_options(lr_opts_T *opts)
bool zeStrRegex(char *, char *, long *, long *, bool)
bool zeBTree_Destroy(ZEBT_T *)
char * zeStrChomp(char *)
int zeStr2Tokens(char *, int, char **, char *)
void lr_print_options(lr_opts_T *opts)
#define ZE_MessageNotice(level,...)
size_t zeStr2size(char *s, int *error, size_t dval)
double zeStr2double(char *s, int *error, double dval)
int zeBTree_Count(ZEBT_T *)
bool lr_learn(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs, test_score_T *mscore, bool spam)
#define ZE_MessageInfo(level,...)
#define ZE_MessageWarning(level,...)
#define ZE_LogSysError(...)
#define STRCASEEQUAL(a, b)
bool lr_extract(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs)
int zeBTree_Browse(ZEBT_T *, ZEBT_BROWSE_F, void *)