ze-filter  (ze-filter-0.8.0-develop-180218)
ze-lr-funcs.c
Go to the documentation of this file.
1 /*
2  *
3  * ze-filter - Mail Server Filter for sendmail
4  *
5  * Copyright (c) 2001-2018 - Jose-Marcio Martins da Cruz
6  *
7  * Auteur : Jose Marcio Martins da Cruz
8  * jose.marcio.mc@gmail.org
9  *
10  * Historique :
11  * Creation : Thu May 28 17:51:54 CEST 2009
12  *
13  * This program is free software, but with restricted license :
14  *
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
19  *
20  * More details about ze-filter license can be found at ze-filter
21  * web site : http://foss.jose-marcio.org
22  */
23 
24 
25 #include <ze-sys.h>
26 #include <libze.h>
27 #include <ze-filter.h>
28 #include <ze-lr-funcs.h>
29 
30 
31 /* ****************************************************************************
32  * *
33  * *
34  **************************************************************************** */
35 
36 # define LR_BODY_LENGTH 256
37 # define LR_RAW_LENGTH 2500
38 
39 typedef struct
40 {
41  bool ok;
43  int dummy;
44 }
45 DATA_T;
46 
47 #define DATA_INIT {FALSE, JBT_INITIALIZER}
48 
49 static bool lr_task(char *id,
50  char *fname, lr_cargs_T * cargs, lr_margs_T * margs,
51  test_score_T * mscore, int task, bool learn, bool spam);
52 
53 /* ****************************************************************************
54  * *
55  * *
56  **************************************************************************** */
57 
/*
 * One token (feature) together with its learning statistics.
 * Records are ordered and looked up by `tok.utok` (see lrtokcmp()).
 */
typedef struct
{
  union
  {
    uint32_t utok;              /* token packed into an integer (the key) */
    char xtok[8];
  } tok;

  int nb;                       /* number of times the token was seen */
  int nbs;                      /* updates done with a positive delta */
  int nbh;                      /* updates done with a non-positive delta */
  double weight;                /* weight summed into the message score */
} lrtok_T;
71 
72 static int
73 lrtokcmp(void *a, void *b)
74 {
75  lrtok_T *ta = (lrtok_T *) a;
76  lrtok_T *tb = (lrtok_T *) b;
77 
78  if (ta->tok.utok > tb->tok.utok)
79  return 1;
80  if (ta->tok.utok < tb->tok.utok)
81  return -1;
82  return 0;
83 }
84 
85 #define LRATE 0.004
86 
87 static double lrate = LRATE;
88 
89 typedef struct
90 {
91  bool ok;
92  pthread_mutex_t mutex;
94 
95  char *fname; /* data file name */
96 
97  int ns; /* count of spams */
98  int nh; /* count of hams */
99 
100  int nsu; /* count of (unbalanced) spams */
101  int nhu; /* count of (unbalanced) hams */
102 
104 
105  /* options */
107 
108 } LR_T;
109 
110 #define LR_INITIALIZER \
111  { \
112  FALSE, PTHREAD_MUTEX_INITIALIZER, JBT_INITIALIZER, \
113  NULL, \
114  0, 0, 0, 0, \
115  NULL, LR_OPTS_INITIALIZER}
116 
117 static LR_T lr_data = LR_INITIALIZER;
118 
119 
120 /* ****************************************************************************
121  * *
122  * *
123  **************************************************************************** */
124 static int
125 lr_data_read(bt, fname)
126  ZEBT_T *bt;
127  char *fname;
128 {
129  FILE *fin = NULL;
130  bool inHead = FALSE, inData = FALSE;
131  int nl = 0;
132  char buf[1024];
133 
134  if (fname == NULL)
135  return -1;
136 
137  fin = fopen(fname, "r");
138  if (fin == NULL)
139  {
140  ZE_LogSysError("fopen %s", fname);
141  return -1;
142  }
143  while (fgets(buf, sizeof (buf), fin) != NULL)
144  {
145  nl++;
146  (void) zeStrChomp(buf);
147 
148  if (inHead)
149  {
150  char *argv[8];
151  int argc;
152 
153  if (zeStrRegex(buf, "</head>", NULL, NULL, TRUE))
154  {
155  inHead = FALSE;
156  continue;
157  }
158  argc = zeStr2Tokens(buf, 8, argv, "=: ");
159  if (argc >= 2)
160  {
161  if (STRCASEEQUAL(argv[0], "count"))
162  {
163  continue;
164  }
165  if (STRCASEEQUAL(argv[0], "spams"))
166  {
167  lr_data.ns = atoi(argv[1]);
168  continue;
169  }
170  if (STRCASEEQUAL(argv[0], "hams"))
171  {
172  lr_data.nh = atoi(argv[1]);
173  continue;
174  }
175  if (STRCASEEQUAL(argv[0], "usebody"))
176  {
177  lr_data.opts.useBody = TRUE;
178  if (STRCASEEQUAL(argv[1], "NO"))
179  lr_data.opts.useBody = FALSE;
180  if (STRCASEEQUAL(argv[1], "FALSE"))
181  lr_data.opts.useBody = FALSE;
182  continue;
183  }
184  if (STRCASEEQUAL(argv[0], "useheaders"))
185  {
186  lr_data.opts.useHeaders = TRUE;
187  if (STRCASEEQUAL(argv[1], "NO"))
188  lr_data.opts.useHeaders = FALSE;
189  if (STRCASEEQUAL(argv[1], "FALSE"))
190  lr_data.opts.useHeaders = FALSE;
191  continue;
192  }
193  if (STRCASEEQUAL(argv[0], "bodylength"))
194  {
195  size_t len;
196 
197  len = zeStr2size(argv[0], NULL, LR_BODY_LENGTH);
198  if (len > 0)
199  lr_data.opts.bodyLength = len;
200  continue;
201  }
202  }
203  }
204 
205  if (inData)
206  {
207  lrtok_T tok;
208 
209  if (zeStrRegex(buf, "</data>", NULL, NULL, TRUE))
210  {
211  inData = FALSE;
212  continue;
213  }
214 
215  memset(&tok, 0, sizeof (tok));
216  if (sscanf(buf, "%x %lg %d %d %d", &tok.tok.utok, &tok.weight,
217  &tok.nb, &tok.nbs, &tok.nbh) == 5)
218  {
219  if (!zeBTree_Add(bt, &tok))
220  {
221 
222  }
223  continue;
224  }
225  ZE_MessageNotice(10, "# error %d : %s", nl, buf);
226  continue;
227  }
228 
229  if (zeStrRegex(buf, "<head>", NULL, NULL, TRUE))
230  {
231  inHead = TRUE;
232  continue;
233  }
234  if (zeStrRegex(buf, "<data>", NULL, NULL, TRUE))
235  {
236  inData = TRUE;
237  continue;
238  }
239 
240  }
241  fclose(fin);
242 
243  return nl;
244 }
245 
246 bool
248  char *fname;
249 {
250  bool result = FALSE;
251 
252  if (lr_data.ok)
253  return TRUE;
254 
255  MUTEX_LOCK(&lr_data.mutex);
256  if (!lr_data.ok)
257  {
258  char *env = NULL;
259 
260  (void) zeBTree_Init(&lr_data.lrbt, sizeof (lrtok_T), lrtokcmp);
261 
262  /* JOE XXX hmm ... and if fname is NULL ??? */
263  if ((lr_data.fname = strdup(fname)) == NULL)
264  {
265  goto fin;
266  }
267 
268  if (lr_data_read(&lr_data.lrbt, fname) < 0)
269 #if 0
270  goto fin;
271 #else
272  ;
273 #endif
274 
275  lr_data.ok = TRUE;
276 
277  lr_data.opts.tok_type = 0;
278  lr_data.opts.tok_len = 4;
279 
280  lr_data.opts.lrate = LRATE;
281  lr_data.learn_callback = NULL;
282 
283  if ((env = getenv("LR_LRATE")) != NULL)
284  {
285  double rate;
286 
287  rate = zeStr2double(env, NULL, lrate);
288  if (rate > 0.)
289  {
290  lrate = rate;
291  lr_data.opts.lrate = rate;
292  }
293  }
294 
295  if ((env = getenv("LR_USE_RAW_MSG")) != NULL)
296  {
297  if (zeStrRegex(env, "yes|true|oui", NULL, NULL, TRUE))
298  lr_data.opts.useRawMsg = TRUE;
299  }
300 
301  if ((env = getenv("LR_RAW_LENGTH")) != NULL)
302  {
303  size_t len;
304 
305  len = zeStr2size(env, NULL, LR_RAW_LENGTH);
306  if (len > 0)
307  lr_data.opts.rawLength = len;
308  }
309 
310  if ((env = getenv("LR_BODY_LENGTH")) != NULL)
311  {
312  size_t len;
313 
314  len = zeStr2size(env, NULL, LR_BODY_LENGTH);
315  if (len > 0)
316  lr_data.opts.bodyLength = len;
317  }
318 
319  if ((env = getenv("LR_USE_BODY")) != NULL)
320  {
321  lr_data.opts.useBody = TRUE;
322  if (STRCASEEQUAL(env, "NO"))
323  lr_data.opts.useBody = FALSE;
324  if (STRCASEEQUAL(env, "FALSE"))
325  lr_data.opts.useBody = FALSE;
326  }
327 
328  if ((env = getenv("LR_USE_HEADERS")) != NULL)
329  {
330  lr_data.opts.useHeaders = TRUE;
331  if (STRCASEEQUAL(env, "NO"))
332  lr_data.opts.useHeaders = FALSE;
333  if (STRCASEEQUAL(env, "FALSE"))
334  lr_data.opts.useHeaders = FALSE;
335  }
336 
337  result = TRUE;
338  }
339 fin:
340  MUTEX_UNLOCK(&lr_data.mutex);
341 
342  return result;
343 }
344 
345 /* ****************************************************************************
346  * *
347  * *
348  **************************************************************************** */
349 bool
351 {
352  if (!lr_data.ok)
353  return TRUE;
354 
355  MUTEX_LOCK(&lr_data.mutex);
356  if (lr_data.ok)
357  {
358  (void) zeBTree_Destroy(&lr_data.lrbt);
359 
360  FREE(lr_data.fname);
361  lr_data.ok = FALSE;
362  }
363  MUTEX_UNLOCK(&lr_data.mutex);
364  return TRUE;
365 }
366 
367 /* ****************************************************************************
368  * *
369  * *
370  **************************************************************************** */
371 bool
373 {
374  if (!lr_data.ok)
375  return TRUE;
376 
377  MUTEX_LOCK(&lr_data.mutex);
378  if (lr_data.ok)
379  {
380  ZE_MessageInfo(10, "Modele (fname) %s", lr_data.fname);
381 #if 0
382 
383  ZE_MessageInfo("Token length %7d", lr_data.tlen);
384  ZE_MessageInfo("Token type %7d", lr_data.ttype);
385  int tlen; /* token len */
386  int ttype; /* token type */
387 
388  int ns; /* count of spams */
389  int nh; /* count of hams */
390 
391  int nsu; /* count of (unbalanced) spams */
392  int nhu; /* count of (unbalanced) hams */
393 
394  bool useRawMsg;
395  size_t rawLength;
396 
397  bool useHeaders;
398  bool useBody;
399  size_t bodyLength;
400  bool cleanUpHeaders;
401  bool cleanUpDates;
402 
403  double lrate;
404  lr_lrate_F lrate_function;
405 
406  bool active_learning;
407  double active_threshold;
408 #endif
409  }
410  MUTEX_UNLOCK(&lr_data.mutex);
411  return TRUE;
412 }
413 
414 /* ****************************************************************************
415  * *
416  * *
417  **************************************************************************** */
418 static int
419 lr_browse_dump(vtok, varg)
420  void *vtok;
421  void *varg;
422 {
423  lrtok_T *tok = (lrtok_T *) vtok;
424  FILE *fout = varg;
425 #if 0
426  int tlen = lr_data.opts.tok_len;
427 #endif
428  fprintf(fout, "%08lx %lg %5d %5d %5d\n", (long unsigned int) tok->tok.utok, tok->weight,
429  tok->nb, tok->nbs, tok->nbh);
430 
431  return 1;
432 }
433 
434 bool
436  char *fname;
437 {
438  if (!lr_data.ok)
439  return FALSE;
440 
441  MUTEX_LOCK(&lr_data.mutex);
442  if (lr_data.ok)
443  {
444  FILE *fout = NULL;
445  int nb;
446  ZEBT_T *bt = &lr_data.lrbt;
447  time_t now;
448 
449  fout = fopen(fname, "w");
450  if (fout == NULL)
451  goto fin;
452 
453  now = time(NULL);
454 
455  fprintf(fout, "<HEAD>\n");
456  fprintf(fout, "date=%ld\n", now);
457  fprintf(fout, "toktype=%d\n", 1);
458  fprintf(fout, "toklength=%d\n", 4);
459  fprintf(fout, "count=%d\n", zeBTree_Count(bt));
460  fprintf(fout, "spams=%d\n", lr_data.ns);
461  fprintf(fout, "hams=%d\n", lr_data.nh);
462  fprintf(fout, "spamsu=%d\n", lr_data.nsu);
463  fprintf(fout, "hamsu=%d\n", lr_data.nhu);
464  fprintf(fout, "usebody=%s\n", STRBOOL(lr_data.opts.useBody, "YES", "NO"));
465  fprintf(fout, "useheaders=%s\n",
466  STRBOOL(lr_data.opts.useHeaders, "YES", "NO"));
467  fprintf(fout, "bodylength=%ld\n", lr_data.opts.bodyLength);
468  fprintf(fout, "</HEAD>\n");
469  fprintf(fout, "<DATA>\n");
470  nb = zeBTree_Browse(bt, lr_browse_dump, fout);
471  fprintf(fout, "</DATA>\n");
472  fclose(fout);
473  fin:
474  ;
475  }
476  MUTEX_UNLOCK(&lr_data.mutex);
477  return TRUE;
478 }
479 
480 /* ****************************************************************************
481  * *
482  * *
483  **************************************************************************** */
/*
 * Names of the headers which tokens_mime_part() skips when header
 * clean-up is enabled. NULL terminated.
 */
static char *uheaders[] = {
  "Message-ID",
  "Date",
  "Toxxxxxxx",
  "X-ze-filter-score",
  "X-ze-filter-status",
  "X-ze-filter-Enveloppe",
  "X-ze-filter-file",
  "X-Miltered",
  "X-Sieve",
  "X-DKIM",
  "DKIM-Signature",
  "DomainKey-Signature",
  "Authentication-Results",
  "MIME-Version",
  "Received-SPF",
  "X-Greylist",
  "X-Virus-Scanned",
  "X-Virus-Status",
  "X-Antivirus",
  "X-Antivirus-Status",
  "X-Spam-Status",
  "X-DSPAM-Check",
  "X-DSPAM-Confidence",
  "X-DSPAM-Factors",
  "X-DSPAM-Improbability",
  "X-DSPAM-Probability",
  "X-DSPAM-Processed",
  "X-DSPAM-Result",
  "X-DSPAM-Signature",
  NULL
};
539 
/*
 * Regular expressions tested by tokens_mime_part() against the value
 * of "Received" and "X-ze-filter-Enveloppe" headers; a header matching
 * one of them is skipped when header clean-up is enabled.
 * NULL terminated.
 */
static char *mymtas[] = {
  "by .*.ensmp.fr",
  "by .*.mines-paristech.fr",
  "from .*.ensmp.fr",
  "from .*.mines-paristech.fr",
  "from .*.cru.fr",
  "by .*.cru.fr",
  "from .*.renater.fr",
  "by .*.renater.fr",
  NULL
};

/*
 * Regular expression matching a date inside a header line :
 * optional day name, day, month name, year 20xx, optional time and
 * optional time zone offset. Matches are cut out of the header line
 * when date clean-up is enabled.
 */
#define DATE_EXPR "(Sun|Mon|Tue|Wed|Thu|Fri|Sat)?,? +[0-9]+ " \
  "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) +20[0-9]{2}" \
  " +([0-9]{1,2}:[0-9]{2}:[0-9]{2} +([-+]?[0-9]{2}00)?)?"
555 
556 
557 static bool
558 scan_msg_part(bt, s)
559  ZEBT_T *bt;
560  char *s;
561 {
562  int i, slen, tlen = 4;
563 
564  slen = strlen(s);
565 
566  strlcat(s, " ", 4);
567  for (i = 0; i <= slen - tlen; i++)
568  {
569  lrtok_T token, *t;
570 
571  uint32_t tok;
572 
573  tok = 0;
574  tok += ((uint32_t) s[i]) << 24;
575  tok += ((uint32_t) s[i + 1]) << 16;
576  tok += ((uint32_t) s[i + 2]) << 8;
577  tok += ((uint32_t) s[i + 3]);
578  memset(&token, 0, sizeof (token));
579  token.tok.utok = tok;
580 
581  token.nb = 1;
582  if ((t = zeBTree_Get(bt, &token)) != NULL)
583  t->nb++;
584  else
585  (void) zeBTree_Add(bt, &token);
586  }
587 
588  return TRUE;
589 }
590 
591 /* ****************************************************************************
592  * *
593  * *
594  **************************************************************************** */
595 static int
596 bt_browse_classify(void *vtok, void *varg)
597 {
598  double *score = (double *) varg;
599  lrtok_T *ptok = vtok;
600 #if 0
601  lrtok_T tok;
602 
603  tok = *((lrtok_T *) vtok);
604 #endif
605  if ((ptok = zeBTree_Get(&lr_data.lrbt, vtok)) != NULL)
606  *score += ptok->weight;
607 
608  return 1;
609 }
610 
611 
612 static int
613 bt_browse_dump_tokens(void *vtok, void *varg)
614 {
615  lrtok_T *tok = (lrtok_T *) vtok;
616  char *id = (char *) varg;
617 
618  id = (id != NULL ? id : "000");
619 
620  printf("%-14s %08lx\n", id, (long unsigned int) tok->tok.utok);
621  return 1;
622 }
623 
624 
625 static int
626 bt_browse_adjust(void *vtok, void *varg)
627 {
628  lrtok_T *ptok = vtok, tok;
629  double *delta = (double *) varg;
630 
631  tok = *((lrtok_T *) vtok);
632  if ((ptok = zeBTree_Get(&lr_data.lrbt, &tok)) == NULL)
633  {
634  tok.weight += *delta;
635  tok.nb = 1;
636  if (*delta > 0)
637  tok.nbs++;
638  else
639  tok.nbh++;
640  if (!zeBTree_Add(&lr_data.lrbt, &tok))
641  {
642  }
643  } else
644  {
645  ptok->weight += *delta;
646  ptok->nb++;
647  if (*delta > 0)
648  ptok->nbs++;
649  else
650  ptok->nbh++;
651  }
652 
653  return 1;
654 }
655 
656 
657 /* ****************************************************************************
658  * *
659  * *
660  **************************************************************************** */
661 
662 static bool
663 tokens_mime_part(buf, size, id, level, type, arg, mpart)
664  char *buf;
665  size_t size;
666  char *id;
667  int level;
668  int type;
669  void *arg;
670  mime_part_T *mpart;
671 {
672  DATA_T *data = (DATA_T *) arg;
673 #if 0
674  char *mtype = "TEXT";
675 #endif
676  rfc2822_hdr_T *h;
677  char vbuf[1024];
678 
679  bool clHeaders = TRUE;
680  bool clDates = TRUE;
681 
682  if (data == NULL)
683  return FALSE;
684 
685  {
686  char *env = NULL;
687 
688  if ((env = getenv("LR_CLEANUP_HEADERS")) != NULL)
689  {
690  clHeaders = zeStrRegex(env, "yes|true", NULL, NULL, TRUE);
691  }
692  if ((env = getenv("LR_CLEANUP_DATES")) != NULL)
693  {
694  clDates = zeStrRegex(env, "yes|true", NULL, NULL, TRUE);
695  }
696  }
697 
698  if (lr_data.opts.useHeaders)
699  {
700  for (h = mpart->hdrs; h != NULL; h = h->next)
701  {
702  char **s = NULL;
703  long pi, pf;
704 
705  if (h->value == NULL || strlen(h->value) == 0)
706  continue;
707 
708  snprintf(vbuf, sizeof (vbuf), "%s: %s", h->key, h->value);
709  if (clHeaders)
710  {
711  for (s = uheaders; s != NULL && *s != NULL; s++)
712  {
713  if (STRCASEEQUAL(*s, h->key))
714  break;
715  }
716  if (*s != NULL)
717  continue;
718 
719  if (STRCASEEQUAL("Received", h->key))
720  {
721  for (s = mymtas; s != NULL && *s != NULL; s++)
722  {
723 #if 1
724  if (zeStrRegex(h->value, *s, NULL, NULL, TRUE))
725 #else
726  if (strstr(h->value, *s))
727 #endif
728  break;
729  }
730  if (*s != NULL)
731  continue;
732  }
733 
734  if (STRCASEEQUAL("X-ze-filter-Enveloppe", h->key))
735  {
736  for (s = mymtas; s != NULL && *s != NULL; s++)
737  {
738 #if 1
739  if (zeStrRegex(h->value, *s, NULL, NULL, TRUE))
740 #else
741  if (strstr(h->value, *s))
742 #endif
743  break;
744  }
745  if (*s != NULL)
746  continue;
747  }
748  }
749  snprintf(vbuf, sizeof (vbuf), "%s: %s", h->key, h->value);
750 
751  if (clDates)
752  {
753  while (zeStrRegex(vbuf, DATE_EXPR, &pi, &pf, TRUE))
754  {
755  int i, lm;
756  char *p;
757 
758  lm = strlen(vbuf);
759  for (i = 0, p = vbuf; i < lm; i++)
760  {
761  if (i < pi || i > pf)
762  *p++ = vbuf[i];
763  }
764  *p = '\0';
765  }
766  }
767 
768  scan_msg_part(&data->bt, vbuf);
769  }
770  }
771 
772  if (lr_data.opts.useBody)
773  {
774  if (type != MIME_TYPE_TEXT)
775  return TRUE;
776 
777  if (lr_data.opts.bodyLength > 0)
778  {
779  char *pb = buf;
780 
781  strlcpy(vbuf, pb, lr_data.opts.bodyLength);
782  scan_msg_part(&data->bt, vbuf);
783  }
784  }
785 
786  return TRUE;
787 }
788 
789 
790 /* ****************************************************************************
791  * *
792  * *
793  **************************************************************************** */
794 static bool
795 lr_task(id, fname, cargs, margs, mscore, task, learn, spam)
796  char *id;
797  char *fname;
798  lr_cargs_T *cargs;
799  lr_margs_T *margs;
800  test_score_T *mscore;
801  int task;
802  bool learn;
803  bool spam;
804 {
805  DATA_T data = DATA_INIT;
806  double score = 0.0, prob = 0.5;
807  int nb;
808  bool result = FALSE;
809  double ptarget, delta;
810 
811  if (!lr_data.ok)
812  {
813  static int nbw = 0;
814 
815  if (nbw++ < 5)
816  ZE_MessageWarning(10, "lr data not yet initialized");
817  return FALSE;
818  }
819 
820  id = STREMPTY(id, "NOID");
821  if (!zeBTree_Init(&data.bt, sizeof (lrtok_T), lrtokcmp))
822  goto fin;
823 
824  if (lr_data.opts.useRawMsg)
825  {
826  int fd = -1;
827  char buf[8192];
828  size_t nc;
829 
830  fd = open(fname, O_RDONLY);
831  if (fd < 0)
832  {
833  ZE_LogSysError("Error opening %s", fname);
834  goto fin;
835  }
836  memset(buf, 0, sizeof (buf));
837  nc = read(fd, buf, 2500);
838  if (nc < 0)
839  {
840  ZE_LogSysError("Error reading %s", fname);
841  close(fd);
842  goto fin;
843  }
844  close(fd);
845  scan_msg_part(&data.bt, buf);
846  } else
847  {
848  if (!decode_mime_file(id, fname, NULL, tokens_mime_part, &data))
849  goto fin;
850  }
851 
852  if (task == LR_TASK_EXTRACT) {
853  nb = zeBTree_Browse(&data.bt, bt_browse_dump_tokens, id);
854  goto fin;
855  }
856 
857  nb = zeBTree_Browse(&data.bt, bt_browse_classify, &score);
858  if (nb <= 0)
859  goto fin;
860  prob = 1. / (1. + exp(-score));
861 
862  if (mscore != NULL)
863  {
864  mscore->actif = TRUE;
865  mscore->value = prob;
866  mscore->odds = score;
867  }
868  if (margs != NULL)
869  {
870  margs->score.actif = TRUE;
871  margs->score.value = prob;
872  margs->score.odds = score;
873  }
874 
875  result = TRUE;
876 
877 #if 1
878  if (task == LR_TASK_LEARN)
879 #else
880  if (learn)
881 #endif
882  {
883  double dp;
884 
885 #if 0
886  if (lr_data.learn_callback != NULL)
887  {
888  (void) lr_data.learn_callback(lr_data.nsu + lr_data.nhu, cargs, margs);
889  }
890 #endif
891 
892  ptarget = spam ? 1.0 : 0.0;
893  dp = ptarget - prob;
894  if (lr_data.opts.active_learning) {
895  if ((fabs(prob - 0.5) > lr_data.opts.active_margin) && (fabs(dp) < 0.5))
896  goto fin;
897  }
898 
899  if (margs != NULL)
900  margs->query = TRUE;
901 
902  {
903  double lrate;
904 
905  lrate = lr_data.opts.lrate;
906  if (lr_data.learn_callback != NULL)
907  {
908  lrate = lr_data.learn_callback(lr_data.nsu + lr_data.nhu, cargs, margs);
909  lr_data.opts.lrate = lrate;
910  }
911  delta = dp * lr_data.opts.lrate;
912  /* si apprentissage, correct weights */
913  nb = zeBTree_Browse(&data.bt, bt_browse_adjust, &delta);
914  }
915 
916  if (spam)
917  lr_data.ns++;
918  else
919  lr_data.nh++;
920 
921  if (margs != NULL)
922  {
923  if (!margs->resample)
924  {
925  if (spam)
926  lr_data.nsu++;
927  else
928  lr_data.nhu++;
929  }
930  margs->learnt = TRUE;
931  if (cargs != NULL)
932  cargs->nFeatures = zeBTree_Count(&lr_data.lrbt);
933  }
934  if (nb <= 0)
935  {
936  result = FALSE;
937  goto fin;
938  }
939  }
940 
941 fin:
942  (void) zeBTree_Destroy(&data.bt);
943  return result;
944 }
945 
946 /* ****************************************************************************
947  * *
948  * *
949  **************************************************************************** */
950 bool
951 lr_extract(id, fname, cargs, margs)
952  char *id;
953  char *fname;
954  lr_cargs_T *cargs;
955  lr_margs_T *margs;
956 {
957  return lr_task(id, fname, cargs, margs, NULL, LR_TASK_EXTRACT, FALSE, FALSE);
958 }
959 
960 /* ****************************************************************************
961  * *
962  * *
963  **************************************************************************** */
964 bool
965 lr_classify(id, fname, cargs, margs, mscore)
966  char *id;
967  char *fname;
968  lr_cargs_T *cargs;
969  lr_margs_T *margs;
970  test_score_T *mscore;
971 {
972  return lr_task(id, fname, cargs, margs, mscore, LR_TASK_CLASSIFY, FALSE, FALSE);
973 }
974 
975 /* ****************************************************************************
976  * *
977  * *
978  **************************************************************************** */
979 bool
980 lr_learn(id, fname, cargs, margs, mscore, spam)
981  char *id;
982  char *fname;
983  lr_cargs_T *cargs;
984  lr_margs_T *margs;
985  test_score_T *mscore;
986  bool spam;
987 {
988  return lr_task(id, fname, cargs, margs, mscore, LR_TASK_LEARN, TRUE, spam);
989 }
990 
991 
992 /* ****************************************************************************
993  * *
994  * *
995  **************************************************************************** */
996 bool
997 lr_learn_options(active, margin)
998  bool active;
999  double margin;
1000 {
1001  lr_data.opts.active_learning = active;
1002  lr_data.opts.active_margin = margin;
1003 
1004  return TRUE;
1005 }
1006 
1007 /* ****************************************************************************
1008  * *
1009  * *
1010  **************************************************************************** */
1011 bool
1013  lr_opts_T *opts;
1014 {
1015  if (opts == NULL)
1016  return FALSE;
1017 
1018  lr_data.opts = *opts;
1019  return TRUE;
1020 }
1021 
1022 /* ****************************************************************************
1023  * *
1024  * *
1025  **************************************************************************** */
1026 bool
1028  lr_opts_T *opts;
1029 {
1030  if (opts == NULL)
1031  return FALSE;
1032 
1033  *opts = lr_data.opts;
1034  return TRUE;
1035 }
1036 
1037 /* ****************************************************************************
1038  * *
1039  * *
1040  **************************************************************************** */
1041 void
1043  lr_opts_T *opts;
1044 {
1045  lr_opts_T *popts = opts;
1046 
1047  popts = (opts != NULL ? opts : &lr_data.opts);
1048 
1049  printf("# lrate %7.3f\n"
1050  "# resample %d\n"
1051  "# useRawMsg %d\n"
1052  "# rawLength %ld\n"
1053  "# bodyLength %ld\n"
1054  "# useBody %d\n"
1055  "# useHeaders %d\n"
1056  "# cleanUpHeaders %d\n"
1057  "# cleanUpDates %d\n"
1058  "# tok_type %d\n"
1059  "# tok_len %d\n"
1060  "# active_learning %d\n"
1061  "# active_margin %.3f\n"
1062  "#\n",
1063  popts->lrate, popts->resample, popts->useRawMsg, (long) popts->rawLength,
1064  (long) popts->bodyLength, popts->useBody, popts->useHeaders,
1065  popts->cleanUpHeaders, popts->cleanUpDates, popts->tok_type,
1066  popts->tok_len, popts->active_learning, popts->active_margin);
1067 }
1068 
1069 /* ****************************************************************************
1070  * *
1071  * *
1072  **************************************************************************** */
1073 bool
1075  lr_callback_F funct;
1076 {
1077  lr_data.learn_callback = funct;
1078 
1079  return TRUE;
1080 }
#define LR_RAW_LENGTH
Definition: ze-lr-funcs.c:37
#define STRBOOL(x, t, f)
Definition: macros.h:87
double(* lr_callback_F)(int, lr_cargs_T *, lr_margs_T *)
Definition: ze-lr-funcs.h:155
long nFeatures
Definition: ze-lr-funcs.h:95
bool lr_data_open(char *fname)
Definition: ze-lr-funcs.c:247
int nsu
Definition: ze-lr-funcs.c:100
void * zeBTree_Get(ZEBT_T *, void *)
Definition: zeBTree.c:281
bool lr_data_dump(char *fname)
Definition: ze-lr-funcs.c:435
#define FREE(x)
Definition: macros.h:37
bool lr_data_close()
Definition: ze-lr-funcs.c:350
char * key
Definition: ze-rfc2822.h:37
bool zeBTree_Init(ZEBT_T *, size_t, ZEBT_CMP_F)
Definition: zeBTree.c:96
#define MUTEX_UNLOCK(mutex)
Definition: macros.h:101
lr_callback_F learn_callback
Definition: ze-lr-funcs.c:103
bool lr_set_learn_callback(lr_callback_F funct)
Definition: ze-lr-funcs.c:1074
rfc2822_hdr_T * next
Definition: ze-rfc2822.h:39
bool cleanUpHeaders
Definition: ze-lr-funcs.h:63
bool lr_learn_options(bool active, double margin)
Definition: ze-lr-funcs.c:997
#define MUTEX_LOCK(mutex)
Definition: macros.h:93
bool lr_get_options(lr_opts_T *opts)
Definition: ze-lr-funcs.c:1027
int nh
Definition: ze-lr-funcs.c:98
bool lr_classify(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs, test_score_T *mscore)
Definition: ze-lr-funcs.c:965
char * value
Definition: ze-rfc2822.h:38
bool zeBTree_Add(ZEBT_T *, void *)
Definition: zeBTree.c:309
bool ok
Definition: ze-lr-funcs.c:41
bool lr_set_options(lr_opts_T *opts)
Definition: ze-lr-funcs.c:1012
#define FALSE
Definition: macros.h:160
#define strlcpy
Definition: zeString.h:32
size_t rawLength
Definition: ze-lr-funcs.h:57
bool zeStrRegex(char *, char *, long *, long *, bool)
Definition: zeStrings.c:544
#define DATE_EXPR
Definition: ze-lr-funcs.c:552
#define LR_BODY_LENGTH
Definition: ze-lr-funcs.c:36
char * fname
Definition: ze-lr-funcs.c:95
bool zeBTree_Destroy(ZEBT_T *)
Definition: zeBTree.c:192
int dummy
Definition: ze-lr-funcs.c:43
char * zeStrChomp(char *)
Definition: zeStrings.c:501
int tok_type
Definition: ze-lr-funcs.h:66
int zeStr2Tokens(char *, int, char **, char *)
Definition: zeStrings.c:610
#define LRATE
Definition: ze-lr-funcs.c:85
double weight
Definition: ze-lr-funcs.c:69
#define strlcat
Definition: zeString.h:28
bool decode_mime_file(char *, char *, uint32_t *, demime_F, void *)
Definition: ze-demime.c:584
bool useHeaders
Definition: ze-lr-funcs.h:61
lr_opts_T opts
Definition: ze-lr-funcs.c:106
void lr_print_options(lr_opts_T *opts)
Definition: ze-lr-funcs.c:1042
#define ZE_MessageNotice(level,...)
Definition: zeSyslog.h:91
size_t zeStr2size(char *s, int *error, size_t dval)
Definition: zeStrConvert.c:237
double zeStr2double(char *s, int *error, double dval)
Definition: zeStrConvert.c:202
bool useRawMsg
Definition: ze-lr-funcs.h:56
int zeBTree_Count(ZEBT_T *)
Definition: zeBTree.c:245
bool lr_learn(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs, test_score_T *mscore, bool spam)
Definition: ze-lr-funcs.c:980
#define LR_INITIALIZER
Definition: ze-lr-funcs.c:110
#define ZE_MessageInfo(level,...)
Definition: zeSyslog.h:90
int nb
Definition: ze-connopen.c:61
size_t bodyLength
Definition: ze-lr-funcs.h:59
#define LR_TASK_CLASSIFY
Definition: ze-lr-funcs.h:43
#define MIME_TYPE_TEXT
Definition: ze-demime.h:34
#define DATA_INIT
Definition: ze-lr-funcs.c:47
#define TRUE
Definition: macros.h:157
#define LR_TASK_EXTRACT
Definition: ze-lr-funcs.h:45
#define ZE_MessageWarning(level,...)
Definition: zeSyslog.h:92
double value
Definition: ze-msg-score.h:67
int nhu
Definition: ze-lr-funcs.c:101
int tok_len
Definition: ze-lr-funcs.h:67
#define ZE_LogSysError(...)
Definition: zeSyslog.h:129
ZEBT_T lrbt
Definition: ze-lr-funcs.c:93
uint32_t utok
Definition: ze-lr-funcs.c:62
#define STREMPTY(x, r)
Definition: macros.h:82
bool useBody
Definition: ze-lr-funcs.h:60
test_score_T score
Definition: ze-lr-funcs.h:116
double lrate
Definition: ze-lr-funcs.h:50
double active_margin
Definition: ze-lr-funcs.h:71
int nbs
Definition: ze-lr-funcs.c:67
pthread_mutex_t mutex
Definition: ze-lr-funcs.c:92
bool ok
Definition: ze-lr-funcs.c:91
#define STRCASEEQUAL(a, b)
Definition: macros.h:72
bool lr_data_show_conf()
Definition: ze-lr-funcs.c:372
int nb
Definition: ze-lr-funcs.c:66
#define LR_TASK_LEARN
Definition: ze-lr-funcs.h:44
bool cleanUpDates
Definition: ze-lr-funcs.h:64
int ns
Definition: ze-lr-funcs.c:97
ZEBT_T bt
Definition: ze-lr-funcs.c:42
bool lr_extract(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs)
Definition: ze-lr-funcs.c:951
bool active_learning
Definition: ze-lr-funcs.h:70
long uint32_t
Definition: ze-sys.h:489
bool resample
Definition: ze-lr-funcs.h:109
bool resample
Definition: ze-lr-funcs.h:53
int nbh
Definition: ze-lr-funcs.c:68
union lrtok_T::@13 tok
int zeBTree_Browse(ZEBT_T *, ZEBT_BROWSE_F, void *)
Definition: zeBTree.c:262
rfc2822_hdr_T * hdrs
Definition: ze-demime.h:74
Definition: zeBTree.h:73