ze-filter  (ze-filter-0.8.0-develop-180218)
ze-lr.c
Go to the documentation of this file.
1 /*
2  *
3  * ze-filter - Mail Server Filter for sendmail
4  *
5  * Copyright (c) 2001-2018 - Jose-Marcio Martins da Cruz
6  *
7  * Auteur : Jose Marcio Martins da Cruz
8  * jose.marcio.mc@gmail.org
9  *
10  * Historique :
11  * Creation : janvier 2002
12  *
13  * This program is free software, but with restricted license :
14  *
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
19  *
20  * More details about ze-filter license can be found at ze-filter
21  * web site : http://foss.jose-marcio.org
22  */
23 
24 #include <ze-sys.h>
25 #include <libze.h>
26 #include <libml.h>
27 #include "ze-filter.h"
28 
29 
30 
31 /* ****************************************************************************
32  * *
33  * *
34  **************************************************************************** */
/*
 * Textual label of a class code: "ham" for LR_CLASS_HAM, "spam" for
 * LR_CLASS_SPAM and "unknown" for any other value.  The argument may be
 * evaluated twice.
 */
#define LR_CLASS2LABEL(cls)                     \
  (((cls) == LR_CLASS_HAM) ? "ham"              \
   : (((cls) == LR_CLASS_SPAM) ? "spam"         \
      : "unknown"))

/*
 * Class code of a textual label, compared case-insensitively: "spam" gives
 * LR_CLASS_SPAM, "ham" gives LR_CLASS_HAM, anything else LR_CLASS_UNKNOWN.
 * The argument may be evaluated twice.
 */
#define LR_LABEL2CLASS(lbl)                             \
  (STRCASEEQUAL((lbl), "spam") ? LR_CLASS_SPAM          \
   : (STRCASEEQUAL((lbl), "ham") ? LR_CLASS_HAM         \
      : LR_CLASS_UNKNOWN))
43 
/* Number of records in each per-class history array of cli_lr_learn(). */
#define DHIST 100000

/*
 * One learning event: a message file together with its class and date.
 * Records are cleared with memset() before being filled, so every field
 * that is not set explicitly is zero.
 */
typedef struct
{
  int    cmd;          /* command code supplied by the caller */
  time_t date;         /* timestamp of the event */
  long   serial;       /* sequence number assigned by pile_push() */
  char   fname[128];   /* message file name, truncated to fit */
  int    class;        /* class of the message */
  bool   ok;           /* TRUE once the record has been filled */
} learn_evt_T;
55 
56 
57 static void
58 lr_evt_fill(evt, cmd, date, class, fname)
59  learn_evt_T *evt;
60  int cmd;
61  time_t date;
62  int class;
63  char *fname;
64 {
65  if (evt != NULL)
66  {
67  memset(evt, 0, sizeof (*evt));
68  evt->date = date;
69  evt->class = class;
70  strlcpy(evt->fname, fname, sizeof (evt->fname));
71  evt->ok = TRUE;
72  evt->cmd = cmd;
73  }
74 }
75 
/*
 * Macro counterpart of lr_evt_fill(): zero the record pointed to by the
 * first argument, then store the command, date, class and file name in it
 * and set its ok flag.  Nothing is done when the record pointer is NULL.
 *
 * The parameter names carry a trailing underscore on purpose: a parameter
 * called "date", "class", "fname" or "cmd" would also replace the member
 * names in the body (the preprocessor substitutes every matching token),
 * which breaks the expansion unless the caller's arguments happen to be
 * spelled exactly like the members.
 *
 * The record pointer is evaluated several times; do not pass an expression
 * with side effects.
 */
#define LR_EVT_FILL(evt_, cmd_, date_, class_, fname_)                \
  do {                                                                \
    if ((evt_) != NULL) {                                             \
      memset((evt_), 0, sizeof (*(evt_)));                            \
      (evt_)->date = (date_);                                         \
      (evt_)->class = (class_);                                       \
      strlcpy((evt_)->fname, (fname_), sizeof ((evt_)->fname));       \
      (evt_)->ok = TRUE;                                              \
      (evt_)->cmd = (cmd_);                                           \
    }                                                                 \
  } while (0)
87 
88 
89 #define DPILE 20000
90 
91 typedef struct
92 {
93  int n;
94  learn_evt_T p[DPILE];
95  long serial;
96 } pile_T;
97 
/*
 * Clear the whole object that p points to (no-op when p is NULL).
 *
 * The size has to be that of the pointed-to object, sizeof(*(p)): sizeof(p)
 * is only the size of the pointer and would leave almost all of the pile
 * untouched.  The argument is evaluated more than once.
 */
#define PILE_INIT(p)                            \
  do {                                          \
    if ((p) != NULL)                            \
      memset((p), 0, sizeof (*(p)));            \
  } while (0)
103 
104 
105 static bool pile_push(pile_T * pile, learn_evt_T * evt);
106 static bool pile_pop(pile_T * pile, learn_evt_T * evt);
107 static bool pile_shift(pile_T * pile, learn_evt_T * evt);
108 
109 static bool pile_check_top(pile_T * pile, learn_evt_T * evt);
110 static bool pile_check_bottom(pile_T * pile, learn_evt_T * evt);
111 
112 static bool pile_sort(pile_T * pile);
113 
114 /* ****************************************************************************
115  * *
116  * *
117  **************************************************************************** */
118 
119 
120 #define CLI_NONE 0
121 #define CLI_CLASSIFY 1
122 #define CLI_LEARN 2
123 #define CLI_ONLINE 3
124 
/*
 * Options gathered from the command line.  The letter in parentheses is the
 * option letter noted by the original author; main() currently parses only
 * -r, -a, -t and -m/-M into this structure.
 *
 * The member order must stay in step with CLI_OPT_INIT below, which is a
 * positional initializer.
 */
typedef struct
{
  /* resampling to equilibrate classes */
  bool resample;
  /* (a) active learning */
  bool al_enable;
  /* (A) active learning margin */
  double al_margin;
  /* (t) active learning threshold */
  double al_threshold;
  /* (D) active learning parameters */
  char *al_defs;
  /* (M/m) probability of active learning feedback miss */
  double pmiss;
  /* (N) feedback error rate */
  double noise;
  /* (e) ask for error feedback */
  bool error_feedback;
} cli_opt_T;

/* Defaults, one value per member of cli_opt_T, in declaration order. */
#define CLI_OPT_INIT {FALSE, FALSE, 0.2, 0.05, NULL, -1., -1., FALSE}
146 
147 void usage(char *);
148 
149 static bool decode_lr_options(lr_opts_T * lrOpts, char *optarg);
150 
151 static int cli_lr_learn(char *fileIn, char *dataFile, cli_opt_T * opt);
152 static int cli_lr_classify(char *fileIn, char *dataFile);
153 
154 static int cli_lr_simul(char *fileIn, char *dataFile, cli_opt_T * opt);
155 
156 static double learn_callback(int i, lr_cargs_T * carg, lr_margs_T * marg);
157 static double lrate = 0.004;
158 
159 int
160 main(argc, argv)
161  int argc;
162  char **argv;
163 {
164  char *fileIn = NULL;
165  char *fileData = "/tmp/lr.txt";
166 
167  bool learn = FALSE;
168  int mode = CLI_NONE;
169 
170  cli_opt_T cliopt = CLI_OPT_INIT;
171 
173 
174  char *xmode = NULL;
175 
176  {
177  const char *args = "hi:d:lat:rR:m:x:o:";
178  int c;
179  int io;
180 
181  while ((c = getopt(argc, argv, args)) != -1)
182  {
183  switch (c)
184  {
185  case 'h':
186  usage(argv[0]);
187  exit(0);
188  break;
189 
190  case 'l':
191  learn = !learn;
192  break;
193  case 'i':
194  fileIn = optarg;
195  break;
196  case 'd':
197  fileData = optarg;
198  break;
199 
200  case 'a':
201  cliopt.al_enable = TRUE;
202  break;
203  case 't':
204  cliopt.al_threshold = atof(optarg);
205  break;
206  case 'r':
207  cliopt.resample = TRUE;
208  break;
209  case 'm':
210  case 'M':
211  cliopt.pmiss = atof(optarg);
212  break;
213  case 'R':
214  case 'L':
215  lrate = atof(optarg);
216  break;
217  case 'x':
218  xmode = optarg;
219  break;
220 
221  case 'o':
222  if (!decode_lr_options(&lrOpts, optarg))
223  ;
224  break;
225 
226  default:
227  break;
228  }
229  }
230 
231  io = optind;
232 
233  while (io < argc && *argv[io] == '-')
234  io++;
235 
236  if (io < argc)
237  {
238  /* fname = argv[io++]; */
239  }
240  }
241 
243  ze_logLevel = 10;
244 
245  if (0)
246  configure("ze-lr", conf_file, FALSE);
248 
249  if (xmode != NULL)
250  {
251  char *tag = NULL;
252 
253  tag = "learn";
254  if (STRNCASEEQUAL(xmode, tag, strlen(tag)))
255  {
256  return cli_lr_learn(fileIn, fileData, &cliopt);
257  goto fin;
258  }
259 
260  tag = "class";
261  if (STRNCASEEQUAL(xmode, tag, strlen(tag)))
262  {
263  return cli_lr_classify(fileIn, fileData);
264  goto fin;
265  }
266 
267  }
268 
269  if (learn)
270  {
271  return cli_lr_learn(fileIn, fileData, &cliopt);
272  } else
273  return cli_lr_classify(fileIn, fileData);
274 
275 fin:
276  exit(0);
277 }
278 
279 
280 /* ****************************************************************************
281  * *
282  * *
283  **************************************************************************** */
284 static int
285 cli_lr_classify(fileIn, dataFile)
286  char *fileIn;
287  char *dataFile;
288 {
289  char *id = "000000.000";
290  char *fname = NULL;
291  FILE *fdata = NULL, *fin = NULL;
292 
293  fdata = stdout;
294  fin = stdin;
295 
296  /*
297  ** Handle messages
298  */
299  if (fileIn != NULL)
300  {
301  fin = fopen(fileIn, "r");
302  if (fin == NULL)
303  {
304  ZE_LogSysError("Error opening %s", fileIn);;
305  goto fin;
306  }
307  }
308 
309  lr_data_open(dataFile);
310 
311  if (fin != NULL)
312  {
313  char stime[32];
314  char sclass[32];
315  char sfile[512];
316 
317  int nham = 0, nspam = 0;
318  bool spam = FALSE;
319 
320  test_score_T mscore;
321  lr_cargs_T cargs;
322  lr_margs_T margs;
323 
324  memset(&cargs, 0, sizeof (cargs));
325  memset(&margs, 0, sizeof (margs));
326  memset(&mscore, 0, sizeof (mscore));
327 
328  memset(stime, 0, sizeof (stime));
329  memset(sclass, 0, sizeof (sclass));
330  memset(sfile, 0, sizeof (sfile));
331 
332  while (fin != NULL && fscanf(fin, "%s %s %s", stime, sclass, sfile) == 3)
333  {
334  spam = STRCASEEQUAL(sclass, "spam");
335  fname = sfile;
336  ZE_MessageInfo(13, "%-4s : %s", sclass, sfile);
337 
338  margs.cmd = LR_CMD_CLASS;
339  margs.class = LR_CLASS_UNKNOWN;
340  lr_classify(id, fname, &cargs, &margs, &mscore);
341 
342  ZE_MessageInfo(10, "%s judge=%-4s class=%-4s score=%.4f prob=%.4f",
343  fname,
344  spam ? "spam" : "ham",
345  mscore.odds > 0.0 ? "spam" : "ham",
346  mscore.odds, mscore.value);
347  }
348  }
349 
350  lr_data_close();
351 
352 fin:
353  return 0;
354 }
355 
356 
357 /* ****************************************************************************
358  * *
359  * *
360  **************************************************************************** */
361 
362 static int
363 cli_lr_learn(fileIn, dataFile, cliopt)
364  char *fileIn;
365  char *dataFile;
366  cli_opt_T *cliopt;
367 {
368  char *id = "000000.000";
369  char *fname = NULL;
370  FILE *fdata = NULL, *fin = NULL;
371 
372  bool resample = cliopt->resample;
373  double pmiss = cliopt->pmiss;
374 
375  double marge;
376 
377  fdata = stdout;
378  fin = stdin;
379 
380  marge = 0.5 - cliopt->al_threshold;
381  if (marge <= 0)
382  marge = 0.0001;
383  lr_learn_options(cliopt->al_enable, marge);
384 
385  /*
386  ** Handle messages
387  */
388  if (fileIn != NULL)
389  {
390  fin = fopen(fileIn, "r");
391  if (fin == NULL)
392  {
393  ZE_LogSysError("Error opening %s", fileIn);
394  goto fin;
395  }
396  }
397 
398  lr_data_open(dataFile);
399 
400  if (!lr_set_learn_callback(learn_callback))
401  {
402 
403  }
404 
405  if (fin != NULL)
406  {
407  char stime[32];
408  char sclass[32];
409  char sfile[512];
410 
411  static learn_evt_T eSpam[DHIST];
412  static learn_evt_T eHam[DHIST];
413 
414  int nham = 0, nspam = 0;
415  int pham = 0, pspam = 0;
416  int nbMax = DHIST;
417 
418  bool spam = FALSE;
419  test_score_T mscore;
420  lr_cargs_T cargs;
421  lr_margs_T margs;
422  time_t t_last, t_now;
423 
424  long nbl = 0;
425 
426  nbMax = 32768;
427  {
428  char *env = NULL;
429 
430  if ((env = getenv("LRRESAMPLEWIN")) != NULL)
431  {
432  int n;
433 
434  n = atof(env);
435  if (n >= 0)
436  nbMax = n;
437  }
438  }
439 
440  memset(&mscore, 0, sizeof (mscore));
441  memset(&cargs, 0, sizeof (cargs));
442  memset(&margs, 0, sizeof (margs));
443 
444  memset(stime, 0, sizeof (stime));
445  memset(sclass, 0, sizeof (sclass));
446  memset(sfile, 0, sizeof (sfile));
447 
448  while (TRUE)
449  {
450  int i;
451  bool ok;
452  time_t date;
453 
454  learn_evt_T evt;
455 
456  if (fscanf(fin, "%s %s %s", stime, sclass, sfile) != 3)
457  break;
458 
459  date = zeStr2time(stime, NULL, (time_t) 0);
460  spam = spam = STRCASEEQUAL(sclass, "spam");
461  fname = sfile;
462 
463  ZE_MessageInfo(10, "%-4s : %s", sclass, sfile);
464 
465  margs.query = FALSE;
466  margs.learnt = FALSE;
467  margs.resample = FALSE;
468 
469  cargs.nmsg++;
470 
471  cargs.nbml++;
472  ok = lr_learn(id, fname, &cargs, &margs, &mscore, spam);
473  ok = TRUE;
474 
475  ZE_MessageInfo(10,
476  "%s classification : %8.4f %.8f judge=%-4s class=%-4s"
477  " learn=%s query=%s features=%d"
478  " score=%g prob=%.6f",
479  sfile,
480  mscore.odds, mscore.value, STRBOOL(spam, "spam", "ham"),
481  STRBOOL(mscore.odds > 0.0, "spam", "ham"),
482  STRBOOL(margs.learnt, "true", "false"),
483  STRBOOL(margs.query, "true", "false"),
484  cargs.nFeatures, mscore.odds, mscore.value);
485 
486  if (!ok)
487  continue;
488 
489  memset(&evt, 0, sizeof (evt));
490  strlcpy(evt.fname, fname, sizeof (evt.fname));
491  evt.class = spam;
492  evt.date = date;
493  evt.ok = TRUE;
494 
495  pspam = nspam % nbMax;
496  pham = nham % nbMax;
497  if (spam)
498  {
499  eSpam[pspam] = evt;
500  nspam++;
501  } else
502  {
503  eHam[pham] = evt;
504  nham++;
505  }
506 
507  if (!resample)
508  continue;
509 
510  if (nspam == 0 || nham == 0)
511  continue;
512 
513  spam = !spam;
514  if (spam)
515  {
516  if (nspam > nbMax)
517  i = random() % nbMax;
518  else
519  i = random() % nspam;
520  fname = eSpam[i].fname;
521  } else
522  {
523  if (nham > nbMax)
524  i = random() % nbMax;
525  else
526  i = random() % nham;
527  fname = eHam[i].fname;
528  }
529 
530  margs.resample = TRUE;
531  lr_learn(id, fname, &cargs, &margs, &mscore, spam);
532  }
533 
534  fclose(fin);
535 
536  /*
537  ** save learned data...
538  */
539  lr_data_dump(dataFile);
540  }
541 
542  /*
543  ** close...
544  */
545  lr_data_close();
546 
547 fin:
548  return 0;
549 }
550 
551 
552 /* ****************************************************************************
553  * *
554  * *
555  **************************************************************************** */
556 
557 static double
558 learn_callback(i, cargs, margs)
559  int i;
560  lr_cargs_T *cargs;
561  lr_margs_T *margs;
562 {
563  double r = lrate;
564 
565  static bool ok = FALSE;
566  static int srate = 2;
567  static double ri = 0.5, rf = 0.004, teta = -0.005;
568 
569  if (!ok)
570  {
571  char *s = NULL;
572 
573  if ((s = getenv("LRATEDEFS")) != NULL)
574  {
575  char ebuf[256];
576  char *argv[8];
577  int argc;
578 
579  memset(argv, 0, sizeof (argv));
580  strlcpy(ebuf, s, sizeof (ebuf));
581  argc = zeStr2Tokens(ebuf, 8, argv, ",; ");
582  if (argv[0] != NULL)
583  srate = atof(argv[0]);
584  if (argv[1] != NULL)
585  ri = atoi(argv[1]);
586  if (argv[2] != NULL)
587  {
588  rf = atof(argv[2]);
589  lrate = rf;
590  }
591  if (argv[3] != NULL)
592  teta = atof(argv[3]);
593  }
594  ok = TRUE;
595  }
596 
597  switch (srate)
598  {
599  case 0:
600  r = lrate;
601  break;
602  case 1:
603  r = ri / sqrt(i + 1);
604  if (r < rf)
605  r = rf;
606  break;
607  case 2:
608  r = (ri - rf) * exp(teta * i) + rf;
609  break;
610  }
611 
612  ZE_MessageInfo(10, "* learning rate : %7d %8.5f", i, r);
613 
614  return r;
615 }
616 
617 /* ****************************************************************************
618  * *
619  * *
620  **************************************************************************** */
621 #define DPILE 20000
622 
623 
624 static bool
625 pile_push(pile, evt)
626  pile_T *pile;
627  learn_evt_T *evt;
628 {
629  if (pile == NULL)
630  return FALSE;
631 
632  if (pile->n >= DPILE - 1)
633  return FALSE;
634 
635  pile->serial++;
636  evt->serial = pile->serial;
637  pile->p[pile->n] = *evt;
638  pile->n++;
639 
640  return TRUE;
641 }
642 
643 static bool
644 pile_pop(pile, evt)
645  pile_T *pile;
646  learn_evt_T *evt;
647 {
648  if (pile == NULL)
649  return FALSE;
650 
651  if (pile->n <= 0)
652  return FALSE;
653 
654  *evt = pile->p[pile->n - 1];
655  pile->n--;
656 
657  return TRUE;
658 }
659 
660 static bool
661 pile_shift(pile, evt)
662  pile_T *pile;
663  learn_evt_T *evt;
664 {
665  if (pile == NULL)
666  return FALSE;
667 
668  if (pile->n <= 0)
669  return FALSE;
670 
671  *evt = pile->p[0];
672  {
673  int i;
674 
675  for (i = 0; i < pile->n - 1; i++)
676  pile->p[i] = pile->p[i + 1];
677  }
678  pile->n--;
679 
680  return TRUE;
681 }
682 
683 static bool
684 pile_check_top(pile, evt)
685  pile_T *pile;
686  learn_evt_T *evt;
687 {
688  if (pile == NULL)
689  return FALSE;
690 
691  if (pile->n <= 0)
692  return FALSE;
693 
694  *evt = pile->p[pile->n - 1];
695 
696  return TRUE;
697 }
698 
699 static bool
700 pile_check_bottom(pile, evt)
701  pile_T *pile;
702  learn_evt_T *evt;
703 {
704  if (pile == NULL)
705  return FALSE;
706 
707  if (pile->n <= 0)
708  return FALSE;
709 
710  *evt = pile->p[0];
711 
712  return TRUE;
713 }
714 
715 static int
716 lrevtcmp(const void *ea, const void *eb)
717 {
718  learn_evt_T *pea = (learn_evt_T *) ea;
719  learn_evt_T *peb = (learn_evt_T *) eb;
720 
721  if (pea->date > peb->date)
722  return 1;
723  if (pea->date < peb->date)
724  return -1;
725 
726  return (pea->serial - peb->serial);
727 }
728 
729 static bool
730 pile_sort(pile)
731  pile_T *pile;
732 {
733  if (pile == NULL)
734  return FALSE;
735 
736  if (pile->n > 1)
737  qsort(pile->p, pile->n, sizeof (learn_evt_T), lrevtcmp);
738 
739  return TRUE;
740 }
741 
742 /* ****************************************************************************
743  * *
744  * *
745  **************************************************************************** */
746 static bool
747 decode_lr_options(lrOpts, optarg)
748  lr_opts_T *lrOpts;
749  char *optarg;
750 {
751  char buf[1024];
752  char *argv[4];
753  int argc;
754 
755  if (optarg == NULL)
756  return FALSE;
757 
758  strlcpy(buf, optarg, sizeof buf);
759  argc = zeStr2Tokens(buf, 4, argv, "=");
760  if (argc < 2)
761  return FALSE;
762 
763  if (STRCASEEQUAL(argv[0], "LR_LRATE"))
764  {
765 
766  return TRUE;
767  }
768  if (STRCASEEQUAL(argv[0], "LR_USE_RAW_MSG"))
769  {
770 
771  return TRUE;
772  }
773  if (STRCASEEQUAL(argv[0], "LR_RAW_LENGTH"))
774  {
775 
776  return TRUE;
777  }
778  if (STRCASEEQUAL(argv[0], "LR_BODY_LENGTH"))
779  {
780 
781  return TRUE;
782  }
783  if (STRCASEEQUAL(argv[0], "LR_USE_BODY"))
784  {
785 
786  return TRUE;
787  }
788  if (STRCASEEQUAL(argv[0], "LR_USE_HEADERS"))
789  {
790 
791  return TRUE;
792  }
793  if (STRCASEEQUAL(argv[0], "LR_CLEAN_UP_HEADERS"))
794  {
795 
796  return TRUE;
797  }
798  if (STRCASEEQUAL(argv[0], "LR_CLEANUP_DATES"))
799  {
800 
801  return TRUE;
802  }
803 
804  return TRUE;
805 }
806 
807 /* ****************************************************************************
808  * *
809  * *
810  **************************************************************************** */
811 void
812 usage(arg)
813  char *arg;
814 {
815  printf("Usage : %s options\n"
816  " -h : this message\n"
817  " -l : toggle learn option\n"
818  " -i : input file with commands\n"
819  " -d : data file\n"
820  " -a : active learning\n"
821  " -t : active learning threshold (0.5 - margin)\n"
822  " -r : resample\n"
823  " -m : feedback miss probability ([0.0, 1.0])\n"
824  " -R : asymptotic learning rate\n"
825  " -x mode : where mode in learn, class, simulate\n", arg);
826  printf("\n %s\n %s\n\n", PACKAGE, COPYRIGHT);
827 }
double al_margin
Definition: ze-lr.c:132
#define STRBOOL(x, t, f)
Definition: macros.h:87
long nFeatures
Definition: ze-lr-funcs.h:95
bool lr_data_open(char *fname)
Definition: ze-lr-funcs.c:247
#define LR_OPTS_INITIALIZER
Definition: ze-lr-funcs.h:74
bool lr_data_dump(char *fname)
Definition: ze-lr-funcs.c:435
#define COPYRIGHT
Definition: version.h:31
bool lr_data_close()
Definition: ze-lr-funcs.c:350
void set_mime_debug(bool)
Definition: ze-demime.c:69
bool lr_set_learn_callback(lr_callback_F)
Definition: ze-lr-funcs.c:1074
int ze_logLevel
Definition: zeSyslog.c:34
int main(int argc, char **argv)
Definition: ze-lr.c:160
#define STRNCASEEQUAL(a, b, n)
Definition: macros.h:75
bool ok
Definition: ze-connopen.c:59
bool lr_learn_options(bool active, double threshold)
Definition: ze-lr-funcs.c:997
bool lr_classify(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs, test_score_T *score)
Definition: ze-lr-funcs.c:965
double al_threshold
Definition: ze-lr.c:134
void zeLog_SetOutput(bool, bool)
Definition: zeSyslog.c:490
#define FALSE
Definition: macros.h:160
#define strlcpy
Definition: zeString.h:32
char * al_defs
Definition: ze-lr.c:136
#define DPILE
Definition: ze-lr.c:621
double noise
Definition: ze-lr.c:140
int zeStr2Tokens(char *, int, char **, char *)
Definition: zeStrings.c:610
double pmiss
Definition: ze-lr.c:138
char fname[128]
Definition: ze-lr-sim.c:51
#define LR_CMD_CLASS
Definition: ze-lr-funcs.h:36
bool lr_learn(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs, test_score_T *score, bool spam)
Definition: ze-lr-funcs.c:980
time_t date
Definition: ze-lr-sim.c:49
#define ZE_MessageInfo(level,...)
Definition: zeSyslog.h:90
bool error_feedback
Definition: ze-lr.c:142
bool resample
Definition: ze-lr.c:128
#define TRUE
Definition: macros.h:157
double value
Definition: ze-msg-score.h:67
#define DHIST
Definition: ze-lr.c:44
void usage(char *)
Definition: ze-lr.c:812
#define ZE_LogSysError(...)
Definition: zeSyslog.h:129
#define LR_CLASS_UNKNOWN
Definition: ze-lr-funcs.h:32
char * conf_file
Definition: ze-cf.c:38
#define CLI_OPT_INIT
Definition: ze-lr.c:145
int configure(char *, char *, bool)
Definition: ze-cf.c:1203
#define PACKAGE
Definition: version.h:28
long serial
Definition: ze-lr-sim.c:50
#define STRCASEEQUAL(a, b)
Definition: macros.h:72
bool al_enable
Definition: ze-lr.c:130
bool resample
Definition: ze-lr-funcs.h:109
time_t zeStr2time(char *s, int *error, time_t dval)
Definition: zeStrConvert.c:291
#define CLI_NONE
Definition: ze-lr.c:120