ze-filter  (ze-filter-0.8.0-develop-180218)
ze-lr-sim.c
Go to the documentation of this file.
1 /*
2  *
3  * ze-filter - Mail Server Filter for sendmail
4  *
5  * Copyright (c) 2001-2018 - Jose-Marcio Martins da Cruz
6  *
7  * Auteur : Jose Marcio Martins da Cruz
8  * jose.marcio.mc@gmail.org
9  *
10  * Historique :
11  * Creation : janvier 2002
12  *
13  * This program is free software, but with restricted license :
14  *
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
19  *
20  * More details about ze-filter license can be found at ze-filter
21  * web site : http://foss.jose-marcio.org
22  */
23 
24 #include <ze-sys.h>
25 #include <libze.h>
26 #include <libml.h>
27 #include "ze-filter.h"
28 
29 
30 
31 /* ****************************************************************************
32  * *
33  * *
34  **************************************************************************** */
/*
 * LR_CLASS2LABEL - printable label for an LR_CLASS_* value.
 *
 * Yields "ham" for LR_CLASS_HAM, "spam" for LR_CLASS_SPAM and "unknown"
 * for anything else.  The argument may be evaluated twice.
 */
#define LR_CLASS2LABEL(class)                 \
  ((class) == LR_CLASS_HAM ? "ham"            \
   : (class) == LR_CLASS_SPAM ? "spam"        \
   : "unknown")
38 
/*
 * LR_LABEL2CLASS - map a textual label onto an LR_CLASS_* value.
 *
 * The label is compared with STRCASEEQUAL against "spam" and then "ham";
 * any other label yields LR_CLASS_UNKNOWN.  The argument may be evaluated
 * twice.
 */
#define LR_LABEL2CLASS(label)                           \
  (STRCASEEQUAL((label), "spam") ? LR_CLASS_SPAM        \
   : STRCASEEQUAL((label), "ham") ? LR_CLASS_HAM        \
   : LR_CLASS_UNKNOWN)
43 
44 #define DHIST 100000
45 
46 typedef struct
47 {
48  int cmd;
49  time_t date;
50  long serial;
51  char fname[128];
52  int class;
53  bool ok;
54 } learn_evt_T;
55 
56 
57 static void
58 lr_evt_fill(evt, cmd, date, class, fname)
59  learn_evt_T *evt;
60  int cmd;
61  time_t date;
62  int class;
63  char *fname;
64 {
65  if (evt != NULL)
66  {
67  memset(evt, 0, sizeof (*evt));
68  evt->date = date;
69  evt->class = class;
70  strlcpy(evt->fname, fname, sizeof (evt->fname));
71  evt->ok = TRUE;
72  evt->cmd = cmd;
73  }
74 }
75 
/*
 * LR_EVT_FILL - macro twin of lr_evt_fill() : zero the record pointed to
 * by the first argument, then store date, class, file name, ok = TRUE and
 * command, in that order.  Nothing happens when the pointer is NULL.
 *
 * The macro parameters carry a trailing underscore on purpose : a
 * parameter called "date" (or "cmd", "class", "fname") would also replace
 * the member name in "->date", so the expansion only compiled when the
 * caller's arguments happened to be spelled exactly like the members.
 *
 * The pointer argument is evaluated several times.
 */
#define LR_EVT_FILL(evt_, cmd_, date_, class_, fname_)                  \
  do {                                                                  \
    if ((evt_) != NULL) {                                               \
      memset((evt_), 0, sizeof(*(evt_)));                               \
      (evt_)->date = (date_);                                           \
      (evt_)->class = (class_);                                         \
      strlcpy((evt_)->fname, (fname_), sizeof((evt_)->fname));          \
      (evt_)->ok = TRUE;                                                \
      (evt_)->cmd = (cmd_);                                             \
    }                                                                   \
  } while (0)
87 
88 
89 #define DPILE 20000
90 
91 typedef struct
92 {
93  int n;
95  long serial;
96 } pile_T;
97 
/*
 * PILE_INIT - zero the whole object p points to (no-op when p is NULL).
 *
 * The size is taken from *p : sizeof(p) would be the size of the pointer
 * and would leave nearly all of the pile uninitialised.  The argument is
 * evaluated more than once.
 */
#define PILE_INIT(p)                            \
  do {                                          \
    if ((p) != NULL)                            \
      memset((p), 0, sizeof(*(p)));             \
  } while (0)
103 
104 
105 static bool pile_push(pile_T * pile, learn_evt_T * evt);
106 static bool pile_pop(pile_T * pile, learn_evt_T * evt);
107 static bool pile_shift(pile_T * pile, learn_evt_T * evt);
108 
109 static bool pile_check_top(pile_T * pile, learn_evt_T * evt);
110 static bool pile_check_bottom(pile_T * pile, learn_evt_T * evt);
111 
112 static bool pile_sort(pile_T * pile);
113 
114 /* ****************************************************************************
115  * *
116  * *
117  **************************************************************************** */
118 
119 
/* Operating modes. */
#define CLI_NONE 0
#define CLI_CLASSIFY 1
#define CLI_LEARN 2
#define CLI_ONLINE 3

/*
 * How cli_lr_simul() postpones a feedback event (cli_opts_T.delay_fn) :
 *   CTE : add cli_opts_T.delay
 *   EXP : add a random delay drawn with -delay * log(u)
 *   FIX : move the event to the next multiple of 86400
 */
#define CLI_DELAY_FN_CTE 0
#define CLI_DELAY_FN_EXP 1
#define CLI_DELAY_FN_FIX 2
128 
129 typedef struct
130 {
131  /* resampling to equilibrate classes */
132  bool resample;
133 
134  /* */
135  time_t delay;
136  int delay_fn;
137 
138  /* (M/m) probability of active learning feedback miss */
140  double pmiss;
141 
142  /* (N) feedback error rate */
144  double pnoise;
145 
146  /* (e) ask for error feedback */
148 
149  time_t tstart;
150  time_t tend;
151 } cli_opts_T;
152 
153 #define CLI_OPT_INIT {FALSE, 7200, CLI_DELAY_FN_CTE, FALSE, -1., FALSE, -1., FALSE, 0, 0}
154 
155 void usage(char *);
156 
157 static bool decode_lr_options(lr_opts_T * lrOpt, cli_opts_T * cliOpt,
158  char *optarg);
159 
160 static int cli_lr_classify(char *fileIn, char *dataFile);
161 
162 static int cli_lr_learn(char *fileIn, char *dataFile, cli_opts_T * opt,
163  lr_opts_T * lrOpts);
164 
165 static int cli_lr_simul(char *fileIn, char *dataFile, cli_opts_T * opt,
166  lr_opts_T * lrOpts);
167 
168 static int cli_lr_extract(char *fileIn, char *dataFile, cli_opts_T * opt,
169  lr_opts_T * lrOpts);
170 
171 static double learn_callback(int i, lr_cargs_T * carg, lr_margs_T * marg);
172 static double lrate = 0.004;
173 
174 int
175 main(argc, argv)
176  int argc;
177  char **argv;
178 {
179  char *fileIn = NULL;
180  char *fileData = "/tmp/lr.txt";
181 
182  int mode = CLI_NONE;
183 
184  cli_opts_T cliOpt = CLI_OPT_INIT;
185 
187 
188  char *xmode = "simul";
189 
190  {
191  const char *args = "hi:d:lR:L:x:o:";
192  int c;
193  int io;
194 
195  while ((c = getopt(argc, argv, args)) != -1)
196  {
197  switch (c)
198  {
199  case 'h':
200  usage(argv[0]);
201  exit(0);
202  break;
203 
204  case 'i':
205  fileIn = optarg;
206  break;
207  case 'd':
208  fileData = optarg;
209  break;
210 
211 
212  case 'R':
213  case 'L':
214  lrate = atof(optarg);
215  break;
216  case 'x':
217  xmode = optarg;
218  break;
219 
220  case 'o':
221  if (!decode_lr_options(&lrOpts, &cliOpt, optarg))
222  ;
223  break;
224 
225  default:
226  break;
227  }
228  }
229 
230  io = optind;
231 
232  while (io < argc && *argv[io] == '-')
233  io++;
234 
235  if (io < argc)
236  {
237  /* fname = argv[io++]; */
238  }
239  }
240 
242  ze_logLevel = 10;
243 
244  if (0)
245  configure("ze-lr", conf_file, FALSE);
247 
248  if (xmode != NULL)
249  {
250  char *tag = NULL;
251 
252  tag = "learn";
253  if (STRNCASEEQUAL(xmode, tag, strlen(tag)))
254  {
255  (void) cli_lr_learn(fileIn, fileData, &cliOpt, &lrOpts);
256  goto fin;
257  }
258 
259  tag = "class";
260  if (STRNCASEEQUAL(xmode, tag, strlen(tag)))
261  {
262  (void) cli_lr_classify(fileIn, fileData);
263  goto fin;
264  }
265 
266  tag = "simul";
267  if (STRNCASEEQUAL(xmode, tag, strlen(tag)))
268  {
269  (void) cli_lr_simul(fileIn, fileData, &cliOpt, &lrOpts);
270  goto fin;
271  }
272 
273  tag = "extract";
274  if (STRNCASEEQUAL(xmode, tag, strlen(tag)))
275  {
276  (void) cli_lr_extract(fileIn, fileData, &cliOpt, &lrOpts);
277  goto fin;
278  }
279  }
280 
281 
282 fin:
283  exit(0);
284 }
285 
286 /* ****************************************************************************
287  * *
288  * *
289  **************************************************************************** */
290 static int
291 cli_lr_extract(fileIn, dataFile, cliopt, lropts)
292  char *fileIn;
293  char *dataFile;
294  cli_opts_T *cliopt;
295  lr_opts_T *lropts;
296 {
297  char *fname = NULL;
298  FILE *fdata = NULL, *fin = NULL;
299 
300  fdata = stdout;
301  fin = stdin;
302 
303  /*
304  ** Handle messages
305  */
306  if (fileIn != NULL)
307  {
308  fin = fopen(fileIn, "r");
309  if (fin == NULL)
310  {
311  ZE_LogSysError("Error opening %s", fileIn);;
312  goto fin;
313  }
314  }
315 
316  lr_data_open(dataFile);
317 
318  if (fin != NULL)
319  {
320  char stime[32];
321  char sclass[32];
322  char sfile[512];
323 
324  int nham = 0, nspam = 0;
325  bool spam = FALSE;
326 
327  lr_cargs_T cargs;
328  lr_margs_T margs;
329 
330  int nid = 0;
331 
332  memset(&cargs, 0, sizeof (cargs));
333  memset(&margs, 0, sizeof (margs));
334 
335  memset(stime, 0, sizeof (stime));
336  memset(sclass, 0, sizeof (sclass));
337  memset(sfile, 0, sizeof (sfile));
338 
339  while (fin != NULL && fscanf(fin, "%s %s %s", stime, sclass, sfile) == 3)
340  {
341  char id[256];
342 
343  spam = STRCASEEQUAL(sclass, "spam");
344  fname = sfile;
345 
346  snprintf(id, sizeof(id), "%08d", nid++);
347  ZE_MessageInfo(13, "# %-4s : %s", sclass, sfile);
348 
349  margs.cmd = LR_CMD_EXTRACT;
350  margs.class = LR_CLASS_UNKNOWN;
351  lr_extract(id, fname, &cargs, &margs);
352  }
353  }
354 
355  lr_data_close();
356 
357 fin:
358  return 0;
359 }
360 
361 
362 /* ****************************************************************************
363  * *
364  * *
365  **************************************************************************** */
366 static int
367 cli_lr_classify(fileIn, dataFile)
368  char *fileIn;
369  char *dataFile;
370 {
371  char *id = "000000.000";
372  char *fname = NULL;
373  FILE *fdata = NULL, *fin = NULL;
374 
375  fdata = stdout;
376  fin = stdin;
377 
378  /*
379  ** Handle messages
380  */
381  if (fileIn != NULL)
382  {
383  fin = fopen(fileIn, "r");
384  if (fin == NULL)
385  {
386  ZE_LogSysError("Error opening %s", fileIn);;
387  goto fin;
388  }
389  }
390 
391  lr_data_open(dataFile);
392 
393  if (fin != NULL)
394  {
395  char stime[32];
396  char sclass[32];
397  char sfile[512];
398 
399  int nham = 0, nspam = 0;
400  bool spam = FALSE;
401 
402  test_score_T mscore;
403  lr_cargs_T cargs;
404  lr_margs_T margs;
405 
406  memset(&cargs, 0, sizeof (cargs));
407  memset(&margs, 0, sizeof (margs));
408  memset(&mscore, 0, sizeof (mscore));
409 
410  memset(stime, 0, sizeof (stime));
411  memset(sclass, 0, sizeof (sclass));
412  memset(sfile, 0, sizeof (sfile));
413 
414  while (fin != NULL && fscanf(fin, "%s %s %s", stime, sclass, sfile) == 3)
415  {
416  spam = STRCASEEQUAL(sclass, "spam");
417  fname = sfile;
418  ZE_MessageInfo(13, "%-4s : %s", sclass, sfile);
419 
420  margs.cmd = LR_CMD_CLASS;
421  margs.class = LR_CLASS_UNKNOWN;
422  lr_classify(id, fname, &cargs, &margs, &mscore);
423 
424  ZE_MessageInfo(10, "%s %s op=%s judge=%-4s class=%-4s score=%.4f prob=%.4f",
425  stime, fname,
426  "class",
427  spam ? "spam" : "ham",
428  mscore.odds > 0.0 ? "spam" : "ham",
429  mscore.odds, mscore.value);
430  }
431  }
432 
433  lr_data_close();
434 
435 fin:
436  return 0;
437 }
438 
439 
440 /* ****************************************************************************
441  * *
442  * *
443  **************************************************************************** */
444 
445 static int
446 cli_lr_learn(fileIn, dataFile, cliopt, lropts)
447  char *fileIn;
448  char *dataFile;
449  cli_opts_T *cliopt;
450  lr_opts_T *lropts;
451 {
452  char *id = "000000.000";
453  char *fname = NULL;
454  FILE *fdata = NULL, *fin = NULL;
455 
456  bool resample = cliopt->resample;
457 
458  fdata = stdout;
459  fin = stdin;
460 
462 
463  /*
464  ** Handle messages
465  */
466  if (fileIn != NULL)
467  {
468  fin = fopen(fileIn, "r");
469  if (fin == NULL)
470  {
471  ZE_LogSysError("Error opening %s", fileIn);
472  goto fin;
473  }
474  }
475 
476  lr_data_open(dataFile);
477 
478  if (!lr_set_learn_callback(learn_callback))
479  {
480 
481  }
482 
483  if (fin != NULL)
484  {
485  char stime[32];
486  char sclass[32];
487  char sfile[512];
488 
489  static learn_evt_T eSpam[DHIST];
490  static learn_evt_T eHam[DHIST];
491 
492  int nham = 0, nspam = 0;
493  int pham = 0, pspam = 0;
494  int nbMax = DHIST;
495 
496  bool spam = FALSE;
497  test_score_T mscore;
498  lr_cargs_T cargs;
499  lr_margs_T margs;
500  time_t t_last, t_now;
501 
502  long nbl = 0;
503 
504  nbMax = 32768;
505  {
506  char *env = NULL;
507 
508  if ((env = getenv("LRRESAMPLEWIN")) != NULL)
509  {
510  int n;
511 
512  n = atof(env);
513  if (n >= 0)
514  nbMax = n;
515  }
516  }
517 
518  memset(&mscore, 0, sizeof (mscore));
519  memset(&cargs, 0, sizeof (cargs));
520  memset(&margs, 0, sizeof (margs));
521 
522  memset(stime, 0, sizeof (stime));
523  memset(sclass, 0, sizeof (sclass));
524  memset(sfile, 0, sizeof (sfile));
525 
526  while (TRUE)
527  {
528  int i;
529  bool ok;
530  time_t date;
531 
532  learn_evt_T evt;
533 
534  if (fscanf(fin, "%s %s %s", stime, sclass, sfile) != 3)
535  break;
536 
537  date = zeStr2time(stime, NULL, (time_t) 0);
538  spam = spam = STRCASEEQUAL(sclass, "spam");
539  fname = sfile;
540 
541  ZE_MessageInfo(10, "%-4s : %s", sclass, sfile);
542 
543  margs.query = FALSE;
544  margs.learnt = FALSE;
545  margs.resample = FALSE;
546 
547  cargs.nmsg++;
548 
549  cargs.nbml++;
550  ok = lr_learn(id, fname, &cargs, &margs, &mscore, spam);
551  ok = TRUE;
552 
553  ZE_MessageInfo(10,
554  "%s classification : %8.4f %.8f judge=%-4s class=%-4s"
555  " learn=%s query=%s features=%d"
556  " score=%g prob=%.6f",
557  sfile,
558  mscore.odds, mscore.value, STRBOOL(spam, "spam", "ham"),
559  STRBOOL(mscore.odds > 0.0, "spam", "ham"),
560  STRBOOL(margs.learnt, "true", "false"),
561  STRBOOL(margs.query, "true", "false"),
562  cargs.nFeatures, mscore.odds, mscore.value);
563 
564  if (!ok)
565  continue;
566 
567  memset(&evt, 0, sizeof (evt));
568  strlcpy(evt.fname, fname, sizeof (evt.fname));
569  evt.class = spam;
570  evt.date = date;
571  evt.ok = TRUE;
572 
573  pspam = nspam % nbMax;
574  pham = nham % nbMax;
575  if (spam)
576  {
577  eSpam[pspam] = evt;
578  nspam++;
579  } else
580  {
581  eHam[pham] = evt;
582  nham++;
583  }
584 
585  if (!resample)
586  continue;
587 
588  if (nspam == 0 || nham == 0)
589  continue;
590 
591  spam = !spam;
592  if (spam)
593  {
594  if (nspam > nbMax)
595  i = random() % nbMax;
596  else
597  i = random() % nspam;
598  fname = eSpam[i].fname;
599  } else
600  {
601  if (nham > nbMax)
602  i = random() % nbMax;
603  else
604  i = random() % nham;
605  fname = eHam[i].fname;
606  }
607 
608  margs.resample = TRUE;
609  lr_learn(id, fname, &cargs, &margs, &mscore, spam);
610  }
611 
612  fclose(fin);
613 
614  /*
615  ** save learned data...
616  */
617  lr_data_dump(dataFile);
618  }
619 
620  /*
621  ** close...
622  */
623  lr_data_close();
624 
625 fin:
626  return 0;
627 }
628 
629 
630 /* ****************************************************************************
631  * *
632  * *
633  **************************************************************************** */
634 
635 static bool
636 cli_simul_handle(cargs, margs, pile, evt)
637  lr_cargs_T *cargs;
638  lr_margs_T *margs;
639  pile_T *pile;
640  learn_evt_T *evt;
641 {
642  test_score_T mscore;
643  char *fname = NULL;
644  bool ok;
645 
646  char *id = "000000.000";
647 
648  memset(&mscore, 0, sizeof (mscore));
649 
650  memset(margs, 0, sizeof (*margs));
651  /*
652  ** margs->miss = FALSE;
653  ** margs->query = FALSE;
654  ** margs->learnt = FALSE;
655  ** margs->resample = FALSE;
656  */
657  margs->cmd = evt->cmd;
658  margs->class = evt->class;
659  fname = evt->fname;
660 
661  ZE_MessageInfo(17, "* Handle %3d %s", evt->cmd, fname);
662  switch (evt->cmd)
663  {
664  case LR_CMD_CLASS:
665  ok = lr_classify(id, fname, cargs, margs, &mscore);
666  cargs->nbmc++;
667  break;
668  case LR_CMD_LEARN:
669  ok =
670  lr_learn(id, fname, cargs, margs, &mscore, evt->class == LR_CLASS_SPAM);
671  cargs->nbml++;
672  break;
674  ok =
675  lr_learn(id, fname, cargs, margs, &mscore, evt->class == LR_CLASS_SPAM);
676  break;
678  ok =
679  lr_learn(id, fname, cargs, margs, &mscore, evt->class == LR_CLASS_SPAM);
680  cargs->nbml++;
681  break;
682  }
683  margs->score = mscore;
684 
685  return TRUE;
686 }
687 
688 
689 /* ****************************************************************************
690  * *
691  * *
692  **************************************************************************** */
693 
694 typedef struct
695 {
696  bool query;
697  bool error;
698  float loss;
699 } stcl_T;
700 
701 #define ST_DIM 0x10000
702 
703 static int
704 cli_lr_simul(fileIn, dataFile, cliopt, lropts)
705  char *fileIn;
706  char *dataFile;
707  cli_opts_T *cliopt;
708  lr_opts_T *lropts;
709 {
710  char *id = "000000.000";
711  char *fname = NULL;
712  FILE *fdata = NULL, *fin = NULL;
713 
714  bool resample = cliopt->resample;
715 
716  bool miss_enable = cliopt->miss_enable;
717  double pmiss = cliopt->pmiss;
718 
719  double risk = 0.;
720  double qrate = 0.;
721  double erate = 0.;
722  int stn = 0;
723  int st_dim = ST_DIM;
724 
725  stcl_T stcl[ST_DIM];
726 
727  st_dim = 1000;
728  memset(stcl, 0, sizeof (stcl));
729  {
730  char *env;
731 
732  if ((env = getenv("STDIM")) != NULL)
733  {
734  st_dim = zeStr2long(env, NULL, st_dim);
735  st_dim = MIN(st_dim, ST_DIM);
736  }
737  }
738 
739  fdata = stdout;
740  fin = stdin;
741 
743 
744  /*
745  ** Handle messages
746  */
747  if (fileIn != NULL)
748  {
749  fin = fopen(fileIn, "r");
750  if (fin == NULL)
751  {
752  ZE_LogSysError("Error opening %s", fileIn);
753  goto fin;
754  }
755  }
756 
757  lr_data_open(dataFile);
758 
759  if (!lr_set_learn_callback(learn_callback))
760  {
761 
762  }
763 
764  if (fin != NULL)
765  {
766  char stime[32];
767  char sclass[32];
768  char sfile[512];
769 
770  static learn_evt_T eSpam[DHIST];
771  static learn_evt_T eHam[DHIST];
772 
773  int nham = 0, nspam = 0;
774  int pham = 0, pspam = 0;
775  int nbMax = DHIST;
776 
777  lr_cargs_T cargs;
778  lr_margs_T margs;
779 
780  long nbl = 0;
781 
782  pile_T pile;
783  learn_evt_T pevt, fevt;
784 
785  PILE_INIT(&pile);
786 
787  nbMax = 32768;
788  {
789  char *env = NULL;
790 
791  if ((env = getenv("LRRESAMPLEWIN")) != NULL)
792  {
793  int n;
794 
795  n = atof(env);
796  if (n >= 0)
797  nbMax = n;
798  }
799  }
800 
801  memset(&cargs, 0, sizeof (cargs));
802  memset(&margs, 0, sizeof (margs));
803 
804  memset(stime, 0, sizeof (stime));
805  memset(sclass, 0, sizeof (sclass));
806  memset(sfile, 0, sizeof (sfile));
807 
808  while (TRUE)
809  {
810  int i;
811  bool ok;
812  time_t date;
813 
814  memset(&fevt, 0, sizeof (fevt));
815  if (fscanf(fin, "%s %s %s", stime, sclass, sfile) == 3)
816  {
817  int class;
818  bool spam = FALSE;
819 
820  cargs.nmsg++;
821 
822  date = zeStr2time(stime, NULL, (time_t) 0);
823  class = LR_LABEL2CLASS(sclass);
824  spam = (class == LR_CLASS_SPAM);
825  fname = sfile;
826 
827  if (cliopt->tend > 0 && date > cliopt->tend)
828  break;
829 
830 #if 0
831  if (cliopt->tstart > 0 && date < cliopt->tstart)
832  continue;
833 #endif
834 
835  ZE_MessageInfo(17, "* Read task %s", sfile);
836  (void) lr_evt_fill(&fevt, LR_CMD_CLASS, date, class, fname);
837 
838  if (spam)
839  {
840  pspam = nspam % nbMax;
841  eSpam[pspam] = fevt;
842  nspam++;
843  } else
844  {
845  pham = nham % nbMax;
846  eHam[pham] = fevt;
847  nham++;
848  }
849  } else
850  break;
851 
852  /*
853  ** Handle feedback events
854  */
855  while (pile_check_bottom(&pile, &pevt))
856  {
857  bool spam = FALSE;
858 
859  if (fevt.date < pevt.date)
860  break;
861 
862  (void) pile_shift(&pile, &pevt);
863 
864  spam = (pevt.class == LR_CLASS_SPAM);
865 
866  ZE_MessageInfo(10, "* %ld : Handle pile event %s", pevt.date, pevt.fname);
867  /*
868  ** Should I miss it ???
869  */
870 
871  /*
872  ** handle pevt
873  */
874  cli_simul_handle(&cargs, &margs, &pile, &pevt);
875 
876  ZE_MessageInfo(10,
877  "%ld %s op=%s judge=%-4s class=%-4s"
878  " learn=%-5s query=%-5s miss=%-5s noisy=%-5s features=%d"
879  " score=%g prob=%.6f",
880  pevt.date,
881  pevt.fname,
882  "feedback",
883  STRBOOL(spam, "spam", "ham"),
884  STRBOOL(margs.score.odds > 0.0, "spam", "ham"),
885  STRBOOL(margs.learnt, "true", "false"),
886  STRBOOL(margs.query, "true", "false"),
887  STRBOOL(FALSE, "true", "false"),
888  STRBOOL(FALSE, "true", "false"),
889  cargs.nFeatures, margs.score.odds, margs.score.value);
890  }
891 
892  /*
893  ** Handle file events
894  */
895  if (fevt.ok)
896  {
897  bool spam = FALSE;
898  bool query = FALSE;
899  bool noisy = FALSE;
900  bool miss = FALSE;
901 
902  int stp;
903 
904  stp = stn % st_dim;
905  /* memset(&stcl[stp], 0, sizeof(stcl_T)); */
906  stcl[stp].error = FALSE;
907  stcl[stp].query = FALSE;
908  stcl[stp].loss = 0.;
909 
910  ZE_MessageInfo(10, "* %ld : Handle file event %s", fevt.date, fevt.fname);
911  cli_simul_handle(&cargs, &margs, &pile, &fevt);
912 
913  spam = (fevt.class == LR_CLASS_SPAM);
914 
915  /* decide whether add it to the pile */
916  {
917  double loss;
918 
919  loss = fabs((spam ? 1. : 0.) - margs.score.value);
920 
921  stcl[stp].loss = loss;
922 
923  /* query golden class ? add it to the pile */
924  if (spam != (margs.score.odds > 0.0))
925  {
926  query = TRUE;
927  stcl[stp].error = TRUE;
928  }
929 
930  /* query as it's inside margin */
931  if (fabs(margs.score.value - 0.5) < lropts->active_margin)
932  {
933  query = TRUE;
934  stcl[stp].query = TRUE;
935  }
936 
937  if (query && cliopt->miss_enable && cliopt->pmiss > 0.0)
938  {
939  double r;
940 
941  margs.miss = FALSE;
942 
943  r = drand48();
944  if ((r = drand48()) < cliopt->pmiss)
945  {
946  miss = margs.miss = TRUE;
947  query = FALSE;
948  }
949  }
950 
951  /* update error and query rates */
952  {
953  int i, nq, ne;
954  int stmax;
955 
956  stn++;
957  stmax = MIN(stn, st_dim);
958 
959  for (i = nq = ne = 0, risk = 0.; i < stmax; i++)
960  {
961  if (stcl[i].query)
962  nq++;
963  if (stcl[i].error)
964  ne++;
965  risk += stcl[i].loss;
966  }
967  erate = ((double) ne) / stmax;
968  qrate = ((double) nq) / stmax;
969  risk = risk / stmax;
970  ZE_MessageInfo(10, "STATS: %7d QRATE %8.5f ERATE: %8.5f RISK: %8.5f",
971  stn, qrate, erate, risk);
972  }
973 
974  if (query)
975  {
976  learn_evt_T evt;
977 
978  ZE_MessageInfo(17, "* Add to pile %s spam : %d %8.3f ",
979  fevt.fname, spam, margs.score.odds);
980  evt = fevt;
982 
983  if (cliopt->noise_enable && cliopt->pnoise > 0)
984  {
985  double r;
986 
987  if ((r = drand48()) < cliopt->pnoise)
988  {
989  evt.class = (1 - evt.class);
990  noisy = TRUE;
991  }
992  }
993 
994  /* add one hour delay */
995  if (cargs.nmsg >= 2000)
996  {
997  switch (cliopt->delay_fn)
998  {
999  case CLI_DELAY_FN_CTE:
1000  evt.date += cliopt->delay;
1001  break;
1002  case CLI_DELAY_FN_EXP:
1003  {
1004  double x;
1005 
1006  x = -cliopt->delay * log(1 - drand48() + 1.e-10);
1007  x = MAX(x, 600);
1008  x = MIN(x, (30 * 86400));
1009  evt.date += x;
1010  }
1011  break;
1012  case CLI_DELAY_FN_FIX:
1013  evt.date += 86400 - (evt.date % 86400);
1014  break;
1015  default:
1016  evt.date += cliopt->delay;
1017  break;
1018  }
1019  }
1020  (void) pile_push(&pile, &evt);
1021  }
1022  }
1023 
1024  /* print results */
1025 
1026  ZE_MessageInfo(10,
1027  "%ld %s op=%s judge=%-4s class=%-4s"
1028  " learn=%-5s query=%-5s miss=%-5s noisy=%-5s features=%d"
1029  " score=%g prob=%.6f",
1030  fevt.date,
1031  fevt.fname,
1032  "class",
1033  STRBOOL(spam, "spam", "ham"),
1034  STRBOOL(margs.score.odds > 0.0, "spam", "ham"),
1035  STRBOOL(margs.learnt, "true", "false"),
1036  STRBOOL(query, "true", "false"),
1037  STRBOOL(miss, "true", "false"),
1038  STRBOOL(noisy, "true", "false"),
1039  cargs.nFeatures, margs.score.odds, margs.score.value);
1040 
1041  /* resample ? randomly choose old message and learn it */
1042 #if 0
1043  if (resample && nspam > 1 && nham > 1)
1044  {
1045  int m;
1046  int class;
1047 
1048  spam = !spam;
1049  if (spam)
1050  {
1051  m = (nspam > nbMax) ? nbMax : nspam;
1052  i = random() % m;
1053  fname = eSpam[i].fname;
1054  } else
1055  {
1056  m = (nham > nbMax) ? nbMax : nham;
1057  i = random() % m;
1058  fname = eHam[i].fname;
1059  }
1060  class = (spam ? LR_CLASS_SPAM : LR_CLASS_HAM);
1061  (void) lr_evt_fill(&fevt, LR_CMD_LEARN_RESAMPLE, date, class, fname);
1062  cli_simul_handle(&cargs, &margs, &pile, &fevt);
1063  }
1064 #endif
1065  }
1066 
1067  }
1068  fclose(fin);
1069  /*
1070  ** save learned data...
1071  */
1072  lr_data_dump(dataFile);
1073  }
1074 
1075  /*
1076  ** close...
1077  */
1078  lr_data_close();
1079 
1080 fin:
1081  return 0;
1082 }
1083 
1084 
1085 /* ****************************************************************************
1086  * *
1087  * *
1088  **************************************************************************** */
1089 
1090 static double
1091 learn_callback(i, cargs, margs)
1092  int i;
1093  lr_cargs_T *cargs;
1094  lr_margs_T *margs;
1095 {
1096  double r = lrate;
1097 
1098  static bool ok = FALSE;
1099  static int srate = 2;
1100  static double ri = 0.5, rf = 0.004, teta = -0.040;
1101 
1102  if (!ok)
1103  {
1104  char *s = NULL;
1105 
1106  if ((s = getenv("LRATEDEFS")) != NULL)
1107  {
1108  char ebuf[256];
1109  char *argv[8];
1110  int argc;
1111 
1112  memset(argv, 0, sizeof (argv));
1113  strlcpy(ebuf, s, sizeof (ebuf));
1114  argc = zeStr2Tokens(ebuf, 8, argv, ",; ");
1115  if (argv[0] != NULL)
1116  srate = atof(argv[0]);
1117  if (argv[1] != NULL)
1118  ri = atoi(argv[1]);
1119  if (argv[2] != NULL)
1120  {
1121  rf = atof(argv[2]);
1122  lrate = rf;
1123  }
1124  if (argv[3] != NULL)
1125  teta = atof(argv[3]);
1126  }
1127  ok = TRUE;
1128  }
1129 
1130  switch (srate)
1131  {
1132  case 0:
1133  r = lrate;
1134  break;
1135  case 1:
1136  r = ri / sqrt(i + 1);
1137  if (r < rf)
1138  r = rf;
1139  break;
1140  case 2:
1141  r = (ri - rf) * exp(teta * i) + rf;
1142  break;
1143  }
1144 
1145  ZE_MessageInfo(10, "* learning rate : %7d %8.5f", i, r);
1146 
1147  return r;
1148 }
1149 
1150 /* ****************************************************************************
1151  * *
1152  * *
1153  **************************************************************************** */
1154 #define DPILE 20000
1155 
1156 
1157 static bool
1158 pile_push(pile, evt)
1159  pile_T *pile;
1160  learn_evt_T *evt;
1161 {
1162  if (pile == NULL)
1163  return FALSE;
1164 
1165  if (pile->n >= DPILE - 1)
1166  return FALSE;
1167 
1168  pile->serial++;
1169  evt->serial = pile->serial;
1170  pile->p[pile->n] = *evt;
1171  pile->n++;
1172 
1173  return TRUE;
1174 }
1175 
1176 static bool
1177 pile_pop(pile, evt)
1178  pile_T *pile;
1179  learn_evt_T *evt;
1180 {
1181  if (pile == NULL)
1182  return FALSE;
1183 
1184  if (pile->n <= 0)
1185  return FALSE;
1186 
1187  *evt = pile->p[pile->n - 1];
1188  pile->n--;
1189 
1190  return TRUE;
1191 }
1192 
1193 static bool
1194 pile_shift(pile, evt)
1195  pile_T *pile;
1196  learn_evt_T *evt;
1197 {
1198  if (pile == NULL)
1199  return FALSE;
1200 
1201  if (pile->n <= 0)
1202  return FALSE;
1203 
1204  *evt = pile->p[0];
1205  {
1206  int i;
1207 
1208  for (i = 0; i < pile->n - 1; i++)
1209  pile->p[i] = pile->p[i + 1];
1210  }
1211  pile->n--;
1212 
1213  return TRUE;
1214 }
1215 
1216 static bool
1217 pile_check_top(pile, evt)
1218  pile_T *pile;
1219  learn_evt_T *evt;
1220 {
1221  if (pile == NULL)
1222  return FALSE;
1223 
1224  if (pile->n <= 0)
1225  return FALSE;
1226 
1227  *evt = pile->p[pile->n - 1];
1228 
1229  return TRUE;
1230 }
1231 
1232 static bool
1233 pile_check_bottom(pile, evt)
1234  pile_T *pile;
1235  learn_evt_T *evt;
1236 {
1237  if (pile == NULL)
1238  return FALSE;
1239 
1240  if (pile->n <= 0)
1241  return FALSE;
1242 
1243  *evt = pile->p[0];
1244 
1245  return TRUE;
1246 }
1247 
1248 static int
1249 lrevtcmp(const void *ea, const void *eb)
1250 {
1251  learn_evt_T *pea = (learn_evt_T *) ea;
1252  learn_evt_T *peb = (learn_evt_T *) eb;
1253 
1254  if (pea->date > peb->date)
1255  return 1;
1256  if (pea->date < peb->date)
1257  return -1;
1258 
1259  return (pea->serial - peb->serial);
1260 }
1261 
1262 static bool
1263 pile_sort(pile)
1264  pile_T *pile;
1265 {
1266  if (pile == NULL)
1267  return FALSE;
1268 
1269  if (pile->n > 1)
1270  qsort(pile->p, pile->n, sizeof (learn_evt_T), lrevtcmp);
1271 
1272  return TRUE;
1273 }
1274 
1275 /* ****************************************************************************
1276  * *
1277  * *
1278  **************************************************************************** */
/*
 * Helpers used by decode_lr_options() to store an option value.  Each is
 * a single statement, to be followed by a semicolon.
 */

/*
 * OPTION_BOOL - set v to TRUE for yes / true / ok and to FALSE for
 * no / false / ko; any other text leaves v unchanged.
 *
 * The alternatives are grouped so that ^ and $ apply to all of them :
 * without the parentheses "^yes|true|ok$" means "starts with yes, or
 * contains true, or ends with ok", which accepted values like "book".
 */
#define OPTION_BOOL(args, v)                                            \
  do {                                                                  \
    if (zeStrRegex((args), "^(yes|true|ok)$", NULL, NULL, TRUE))        \
      (v) = TRUE;                                                       \
    if (zeStrRegex((args), "^(no|false|ko)$", NULL, NULL, TRUE))        \
      (v) = FALSE;                                                      \
  } while (0)

/* OPTION_INT - convert with zeStr2long(); the current v is the default. */
#define OPTION_INT(args, v)                     \
  do {                                          \
    (v) = zeStr2long((args), NULL, (v));        \
  } while (0)

/* OPTION_DOUBLE - convert with zeStr2double(); the current v is the default. */
#define OPTION_DOUBLE(args, v)                  \
  do {                                          \
    (v) = zeStr2double((args), NULL, (v));      \
  } while (0)
1296 
1297 static bool
1298 decode_lr_options(lrOpts, cliOpt, optarg)
1299  lr_opts_T *lrOpts;
1300  cli_opts_T *cliOpt;
1301  char *optarg;
1302 {
1303  char buf[1024];
1304  char *argv[4];
1305  int argc;
1306 
1307  if (optarg == NULL)
1308  return FALSE;
1309 
1310  strlcpy(buf, optarg, sizeof buf);
1311  argc = zeStr2Tokens(buf, 4, argv, "=");
1312  if (argc < 2)
1313  return FALSE;
1314 
1315  if (STRCASEEQUAL(argv[0], "LR_LRATE"))
1316  {
1317  OPTION_DOUBLE(argv[1], lrOpts->lrate);
1318  return TRUE;
1319  }
1320 
1321  if (STRCASEEQUAL(argv[0], "LR_RESAMPLE"))
1322  {
1323  OPTION_BOOL(argv[1], lrOpts->resample);
1324  return TRUE;
1325  }
1326 
1327  if (STRCASEEQUAL(argv[0], "LR_USE_RAW_MSG"))
1328  {
1329  OPTION_BOOL(argv[1], lrOpts->useRawMsg);
1330  return TRUE;
1331  }
1332 
1333  if (STRCASEEQUAL(argv[0], "LR_RAW_LENGTH"))
1334  {
1335  OPTION_INT(argv[1], lrOpts->rawLength);
1336  return TRUE;
1337  }
1338 
1339  if (STRCASEEQUAL(argv[0], "LR_BODY_LENGTH"))
1340  {
1341  OPTION_INT(argv[1], lrOpts->bodyLength);
1342  return TRUE;
1343  }
1344 
1345  if (STRCASEEQUAL(argv[0], "LR_USE_BODY"))
1346  {
1347  OPTION_BOOL(argv[1], lrOpts->useBody);
1348  return TRUE;
1349  }
1350 
1351  if (STRCASEEQUAL(argv[0], "LR_USE_HEADERS"))
1352  {
1353  OPTION_BOOL(argv[1], lrOpts->useHeaders);
1354  return TRUE;
1355  }
1356 
1357  if (STRCASEEQUAL(argv[0], "LR_CLEAN_UP_HEADERS"))
1358  {
1359  OPTION_BOOL(argv[1], lrOpts->cleanUpHeaders);
1360  return TRUE;
1361  }
1362 
1363  if (STRCASEEQUAL(argv[0], "LR_CLEANUP_DATES"))
1364  {
1365  OPTION_BOOL(argv[1], lrOpts->cleanUpDates);
1366  return TRUE;
1367  }
1368 
1369  if (STRCASEEQUAL(argv[0], "LR_ACTIVE_LEARNING"))
1370  {
1371  OPTION_BOOL(argv[1], lrOpts->active_learning);
1372  return TRUE;
1373  }
1374 
1375  if (STRCASEEQUAL(argv[0], "LR_ACTIVE_MARGIN"))
1376  {
1377  OPTION_DOUBLE(argv[1], lrOpts->active_margin);
1378  return TRUE;
1379  }
1380 
1381  if (STRCASEEQUAL(argv[0], "OPT_DELAY"))
1382  {
1383  OPTION_INT(argv[1], cliOpt->delay);
1384  return TRUE;
1385  }
1386 
1387  if (STRCASEEQUAL(argv[0], "OPT_DELAY_FUNCTION"))
1388  {
1389  cliOpt->delay_fn = CLI_DELAY_FN_CTE;
1390 
1391  if (STRCASEEQUAL(argv[1], "cte"))
1392  {
1393  cliOpt->delay_fn = CLI_DELAY_FN_CTE;
1394  return TRUE;
1395  }
1396  if (STRCASEEQUAL(argv[1], "exp"))
1397  {
1398  cliOpt->delay_fn = CLI_DELAY_FN_EXP;
1399  return TRUE;
1400  }
1401  if (STRCASEEQUAL(argv[1], "fix"))
1402  {
1403  cliOpt->delay_fn = CLI_DELAY_FN_FIX;
1404  return TRUE;
1405  }
1406  return TRUE;
1407  }
1408 
1409  if (STRCASEEQUAL(argv[0], "OPT_NOISE"))
1410  {
1411  OPTION_BOOL(argv[1], cliOpt->noise_enable);
1412  return TRUE;
1413  }
1414 
1415  if (STRCASEEQUAL(argv[0], "OPT_PROB_NOISE"))
1416  {
1417  OPTION_DOUBLE(argv[1], cliOpt->pnoise);
1418  return TRUE;
1419  }
1420 
1421  if (STRCASEEQUAL(argv[0], "OPT_MISS"))
1422  {
1423  OPTION_BOOL(argv[1], cliOpt->miss_enable);
1424  return TRUE;
1425  }
1426 
1427  if (STRCASEEQUAL(argv[0], "OPT_PROB_MISS"))
1428  {
1429  OPTION_DOUBLE(argv[1], cliOpt->pmiss);
1430  return TRUE;
1431  }
1432 
1433  if (STRCASEEQUAL(argv[0], "OPT_TSTART"))
1434  {
1435  OPTION_INT(argv[1], cliOpt->tstart);
1436  return TRUE;
1437  }
1438 
1439  if (STRCASEEQUAL(argv[0], "OPT_TEND"))
1440  {
1441  OPTION_INT(argv[1], cliOpt->tend);
1442  return TRUE;
1443  }
1444 
1445  return FALSE;
1446 }
1447 
1448 /* ****************************************************************************
1449  * *
1450  * *
1451  **************************************************************************** */
1452 void
1453 usage(arg)
1454  char *arg;
1455 {
1456  printf("Usage : %s options\n"
1457  " -h : this message\n"
1458  " -l : toggle learn option\n"
1459  " -i : input file with commands\n"
1460  " -d : data file\n"
1461  " -a : active learning\n"
1462  " -t : active learning threshold (0.5 - margin)\n"
1463  " -r : resample\n"
1464  " -m : feedback miss probability ([0.0,1.0])\n"
1465  " -R : asymptotic learning rate\n"
1466  " -o : option=value \n"
1467  " -x mode : where mode in learn, class, simulate\n", arg);
1468  printf("\n %s\n %s\n\n", PACKAGE, COPYRIGHT);
1469 }
#define CLI_NONE
Definition: ze-lr-sim.c:120
bool query
Definition: ze-lr-sim.c:696
#define MAX(a, b)
Definition: macros.h:139
#define LR_CMD_LEARN_RESAMPLE
Definition: ze-lr-funcs.h:38
#define STRBOOL(x, t, f)
Definition: macros.h:87
#define OPTION_DOUBLE(args, v)
Definition: ze-lr-sim.c:1292
long nFeatures
Definition: ze-lr-funcs.h:95
bool lr_data_open(char *fname)
Definition: ze-lr-funcs.c:247
#define LR_CLASS_SPAM
Definition: ze-lr-funcs.h:34
#define CLI_DELAY_FN_CTE
Definition: ze-lr-sim.c:125
#define DHIST
Definition: ze-lr-sim.c:44
int main(int argc, char **argv)
Definition: ze-lr-sim.c:175
#define LR_OPTS_INITIALIZER
Definition: ze-lr-funcs.h:74
bool lr_data_dump(char *fname)
Definition: ze-lr-funcs.c:435
#define ST_DIM
Definition: ze-lr-sim.c:701
#define LR_CMD_EXTRACT
Definition: ze-lr-funcs.h:40
#define COPYRIGHT
Definition: version.h:31
bool lr_data_close()
Definition: ze-lr-funcs.c:350
#define LR_CMD_LEARN_FEEDBACK
Definition: ze-lr-funcs.h:39
void set_mime_debug(bool)
Definition: ze-demime.c:69
bool lr_set_learn_callback(lr_callback_F)
Definition: ze-lr-funcs.c:1074
int ze_logLevel
Definition: zeSyslog.c:34
bool resample
Definition: ze-lr-sim.c:132
#define STRNCASEEQUAL(a, b, n)
Definition: macros.h:75
bool ok
Definition: ze-connopen.c:59
bool lr_learn_options(bool active, double threshold)
Definition: ze-lr-funcs.c:997
bool lr_classify(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs, test_score_T *score)
Definition: ze-lr-funcs.c:965
void zeLog_SetOutput(bool, bool)
Definition: zeSyslog.c:490
#define FALSE
Definition: macros.h:160
#define strlcpy
Definition: zeString.h:32
long serial
Definition: ze-lr-sim.c:95
#define CLI_DELAY_FN_FIX
Definition: ze-lr-sim.c:127
#define LR_CMD_LEARN
Definition: ze-lr-funcs.h:37
int zeStr2Tokens(char *, int, char **, char *)
Definition: zeStrings.c:610
#define MIN(a, b)
Definition: macros.h:140
#define OPTION_BOOL(args, v)
Definition: ze-lr-sim.c:1279
time_t tstart
Definition: ze-lr-sim.c:149
time_t delay
Definition: ze-lr-sim.c:135
long zeStr2long(char *s, int *error, long dval)
Definition: zeStrConvert.c:35
bool error_feedback
Definition: ze-lr-sim.c:147
char fname[128]
Definition: ze-lr-sim.c:51
#define CLI_DELAY_FN_EXP
Definition: ze-lr-sim.c:126
#define LR_CMD_CLASS
Definition: ze-lr-funcs.h:36
time_t tend
Definition: ze-lr-sim.c:150
bool lr_learn(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs, test_score_T *score, bool spam)
Definition: ze-lr-funcs.c:980
time_t date
Definition: ze-lr-sim.c:49
#define ZE_MessageInfo(level,...)
Definition: zeSyslog.h:90
bool miss_enable
Definition: ze-lr-sim.c:139
bool noise_enable
Definition: ze-lr-sim.c:143
#define TRUE
Definition: macros.h:157
double value
Definition: ze-msg-score.h:67
#define PILE_INIT(p)
Definition: ze-lr-sim.c:98
int n
Definition: ze-lr-sim.c:93
#define ZE_LogSysError(...)
Definition: zeSyslog.h:129
#define LR_CLASS_UNKNOWN
Definition: ze-lr-funcs.h:32
char * conf_file
Definition: ze-cf.c:38
double pmiss
Definition: ze-lr-sim.c:140
int configure(char *, char *, bool)
Definition: ze-cf.c:1203
#define CLI_OPT_INIT
Definition: ze-lr-sim.c:153
int delay_fn
Definition: ze-lr-sim.c:136
double pnoise
Definition: ze-lr-sim.c:144
#define LR_CLASS_HAM
Definition: ze-lr-funcs.h:33
test_score_T score
Definition: ze-lr-funcs.h:116
#define OPTION_INT(args, v)
Definition: ze-lr-sim.c:1287
#define PACKAGE
Definition: version.h:28
bool error
Definition: ze-lr-sim.c:697
double active_margin
Definition: ze-lr-funcs.h:71
float loss
Definition: ze-lr-sim.c:698
long serial
Definition: ze-lr-sim.c:50
#define STRCASEEQUAL(a, b)
Definition: macros.h:72
bool lr_extract(char *id, char *fname, lr_cargs_T *cargs, lr_margs_T *margs)
Definition: ze-lr-funcs.c:951
void usage(char *)
Definition: ze-lr-sim.c:1453
bool active_learning
Definition: ze-lr-funcs.h:70
bool resample
Definition: ze-lr-funcs.h:109
#define DPILE
Definition: ze-lr-sim.c:1154
time_t zeStr2time(char *s, int *error, time_t dval)
Definition: zeStrConvert.c:291