ze-filter  (ze-filter-0.8.0-develop-180218)
ze-bsegmentation.c
Go to the documentation of this file.
1 /*
2  *
3  * ze-filter - Mail Server Filter for sendmail
4  *
5  * Copyright (c) 2001-2018 - Jose-Marcio Martins da Cruz
6  *
7  * Auteur : Jose Marcio Martins da Cruz
8  * jose.marcio.mc@gmail.org
9  *
10  * Historique :
11  * Creation : Thu Jun 15 13:41:01 CEST 2006
12  *
13  * This program is free software, but with restricted license :
14  *
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
19  *
20  * More details about ze-filter license can be found at ze-filter
21  * web site : http://foss.jose-marcio.org
22  */
23 
24 
25 #include <ze-sys.h>
26 #include <ze-filter.h>
27 #include <ze-bfilter.h>
28 
29 
34 /* ****************************************************************************
35  * *
36  * *
37  **************************************************************************** */
38 
/*
 * Per-message token store passed (as `void *arg`) to the MIME part
 * callbacks and to the msg_btsm_*() helpers.
 *
 * NOTE(review): only `priv` is visible in this listing, but the initializer
 * below supplies three values and the code in this file accesses `bm->bt`
 * and `bm->bt.signature`; the remaining members appear to have been lost
 * from the listing - confirm against the real source.
 */
39 typedef struct msg_btsm_T
40 {
43  void *priv;
44 } msg_btsm_T;
45 
/*
 * Static initializer for a msg_btsm_T; msg_btsm_init() asserts on
 * `bt.signature`, so an object is expected to be initialized with this
 * before msg_btsm_init() is called.
 */
46 #define MSG_BTSM_INITIALIZER \
47  { \
48  JBT_INITIALIZER, \
49  NULL, \
50  NULL \
51  }
52 
/* Feature kinds: whole words versus character n-grams.
 * NOTE(review): neither constant is referenced in the visible part of this
 * file. */
53 #define FEATURE_WORD 0
54 #define FEATURE_NGRAM 1
55 
/* NOTE(review): the members of feature_T are not visible in this listing
 * and the type is not used by the visible code. */
56 typedef struct {
59 } feature_T;
60 
61 
/*
 * MIME part callbacks handed to decode_mime_file() by
 * bfilter_handle_message().  Arguments, in order: part buffer, part size,
 * xid, level, MIME type code, user argument (a msg_btsm_T *), MIME part.
 */
62 static bool mimepart2wordTokens(char *, size_t, char *, int, int,
63  void *, mime_part_T *);
64 
/* Same contract as above, but the text is cut into character n-grams. */
65 static bool mimepart2ngramTokens(char *, size_t, char *, int, int,
66  void *, mime_part_T *);
67 
68 /* ****************************************************************************
69 ** ##### ##### #### # # ###### # # ####
70 ** # # # # # # # # ## # #
71 ** ##### ##### # # # #### ##### # # # ####
72 ** # # # # # # # # # # # #
73 ** # # # # # # # # # ## # #
74 ** ##### # #### # # ###### # # ####
75  **************************************************************************** */
76 
/* Token store helpers, defined near the end of this file. */
77 static bool msg_btsm_add_token(msg_btsm_T * bm, char *token);
78 static bool msg_btsm_init(msg_btsm_T * bm);
79 static bool msg_btsm_end(msg_btsm_T * bm);
80 
81 
/* Identifiers for the kind of text being tokenized.
 * NOTE(review): none of these is referenced in the visible code; every
 * extractor call passes 0 as `kind`. */
82 #define _BODY 0
83 #define _HTML 1
84 #define _HEADER 2
85 #define _CTYPE 3
86 #define _CDISP 4
87 #define _RCVD 5
88 #define _FROM 6
89 #define _MAILER 7
90 
91 
92 /* ****************************************************************************
93  * *
94  * *
95  **************************************************************************** */
96 
97 /*
98 ** TODO list
99 **
100 ** * Recursive tokenization
101 ** 1st - use character separators
102 ** 2nd - use multichar separators (__, --)
103 **
104 ** * Special meaning for some repeated characters at the end of a token :
105 ** !! ... ???
106 **
107 ** * Morphological analysis of a token
108 **
109 ** * Multibyte characters
110 **
111 ** * Which headers ?
112 ** - XMailer OK
113 ** - User-Agent OK
114 ** - Subject OK
115 ** - Received No good 8-(
116 ** - Content-Type OK
117 ** - Content-Disposition OK
118 ** - From. OK
119 ** - Content-ID OK
120 ** - Message-ID OK
121 ** Good ? What else ?
122 */
123 
/*
 * Default separator set used by extract_word_tokens() when the caller
 * passes no separator.  The disabled variant lacks ';' and '~'.
 */
124 #if 0
125 #define SEP_TOK " \t\n\r,/=&?\"()[]{}<>"
126 #else
127 #define SEP_TOK " \t\n\r,=&?\"()[]{}<>;~/"
128 #endif
129 
/* Two-level separator sets.  In the visible code this table is only
 * referenced from a disabled (#if 0) branch of extract_word_tokens(). */
130 static char *TOK_SEPARATOR[] = {
131  " \t\n\r\"()[]{}<>/",
132  ",;=&?~",
133  NULL
134 };
135 
/* Whitespace-only separator set. */
136 #define SEP_WS " \t\n\r"
137 
/*
 * Tokenizer configuration for one header or body element.
 *   tag       - name matched by get_tokconf_headers() / get_tokconf_body()
 *   prefix    - prefix prepended to every token produced for this element
 *   recurse   - not read by the visible code
 *   separator - separator set handed to the extractor (NULL selects the
 *               extractor's default)
 *   func      - not called by the visible code
 *   active    - tokens are produced only when TRUE; see set_tokconf_active()
 */
138 typedef struct
139 {
140  char *tag;
141  char *prefix;
142  bool recurse;
143  char *separator;
144  void (*func) (char *);
145  bool active;
146 } tokconf_T;
147 
148 /* NOTE: Content-ID, formerly listed here as "to add", is now covered by
149 ** the "content-id" entry of hdrs_tokconf below.
150 */
151 
152 
/* Default configuration: tag "body", prefix "body", recursion enabled,
 * default separator, no hook function, active. */
153 #define TOKCONF_INITIALIZER {"body", "body", TRUE, NULL, NULL, TRUE}
154 
/* Word tokenizer; defined further down in this file. */
155 static bool extract_word_tokens(tokconf_T * cf, char *prefix,
156  char *separator,
157  char *buf,
158  int kind, msg_btsm_T * data, int level);
159 
/*
 * Headers whose values are tokenized, with the prefix given to their
 * tokens.  "received" is listed but inactive by default.  The table ends
 * with an entry whose tag is NULL; "boundary" is looked up explicitly when
 * a multipart Content-Type is handled.
 */
160 static tokconf_T hdrs_tokconf[] = {
161  {"x-mailer", "xmailer", FALSE, NULL, NULL, TRUE},
162  {"user-agent", "uagent", FALSE, NULL, NULL, TRUE},
163  {"from", "from", FALSE, NULL, NULL, TRUE},
164  {"subject", "subject", FALSE, NULL, NULL, TRUE},
165  {"received", "rcvd", FALSE, NULL, NULL, FALSE},
166  {"content-type", "ctype", FALSE, NULL, NULL, TRUE},
167  {"content-disposition", "cdisp", FALSE, NULL, NULL, TRUE},
168  {"content-description", "cdesc", FALSE, NULL, NULL, TRUE},
169  {"content-transfer-encoding", "ctencode", FALSE, NULL, NULL, TRUE},
170  {"content-id", "cid", FALSE, NULL, NULL, TRUE},
171  {"message-id", "msgid", FALSE, NULL, NULL, TRUE},
172 
173  {"boundary", "bound", FALSE, NULL, NULL, TRUE},
174  {NULL, NULL, FALSE, NULL, NULL, FALSE}
175 };
176 
177 static tokconf_T *
178 get_tokconf_headers(tag)
179  char *tag;
180 {
181  tokconf_T *p = NULL;
182 
183  for (p = hdrs_tokconf; p->tag != NULL; p++)
184  {
185  if (STRCASEEQUAL(p->tag, tag))
186  return p;
187  }
188 
189  return NULL;
190 }
191 
/*
 * Configuration entries for body parts and for attributes of the
 * Content-Type / Content-Disposition headers.  In the visible code only
 * "ctmain", "ctname", "cdmain" and "cdname" are looked up through
 * get_tokconf_body(); the "text/html" and "html/tags" lookups sit in
 * disabled (#if 0) code.  The table ends with an entry whose tag is NULL.
 */
192 static tokconf_T body_tokconf[] = {
193  {"body", "body", FALSE, NULL, NULL, TRUE},
194  {"text/plain", "body", FALSE, NULL, NULL, TRUE},
195  {"text/html", "body", FALSE, NULL, NULL, TRUE},
196  {"html/tags", "htmltags", FALSE, " \t\n\r,=&?\"()[]{}<>;~/", NULL, TRUE},
197  {"simple", "body", FALSE, NULL, NULL, TRUE},
198  {"boundary", "bound", FALSE, NULL, NULL, TRUE},
199  {"name", "name", FALSE, NULL, NULL, TRUE},
200  {"cdmain", "cdmain", FALSE, SEP_WS, NULL, TRUE},
201  {"cdname", "cdname", FALSE, NULL, NULL, TRUE},
202  {"ctmain", "ctmain", FALSE, SEP_WS, NULL, TRUE},
203  {"ctname", "ctname", FALSE, NULL, NULL, TRUE},
204  {NULL, NULL, FALSE, NULL, NULL, FALSE}
205 };
206 
207 static tokconf_T *
208 get_tokconf_body(tag)
209  char *tag;
210 {
211  tokconf_T *p = NULL;
212 
213  for (p = body_tokconf; p->tag != NULL; p++)
214  {
215  if (STRCASEEQUAL(p->tag, tag))
216  return p;
217  }
218 
219  return NULL;
220 }
221 
222 void
223 set_tokconf_active(tag, active)
224  char *tag;
225  bool active;
226 {
227  tokconf_T *p = NULL;
228 
229  for (p = body_tokconf; p->tag != NULL; p++)
230  {
231  if (STRCASEEQUAL(p->tag, tag))
232  {
233  p->active = active;
234  break;
235  }
236  }
237  for (p = hdrs_tokconf; p->tag != NULL; p++)
238  {
239  if (STRCASEEQUAL(p->tag, tag))
240  {
241  p->active = active;
242  break;
243  }
244  }
245 }
246 
247 /* ****************************************************************************
248  * *
249  * *
250  **************************************************************************** */
/*
 * Strip noise characters from both ends of a token, in place.
 *
 * Three passes, applied in this order:
 *   1. trailing characters out of  . - / * ' ` :  are removed;
 *   2. leading characters out of   ( ) > < + - . * ! ' ` /  are removed;
 *   3. leading '$' characters that are not immediately followed by a digit
 *      are removed (so "$5" is kept but "$abc" becomes "abc").
 *
 * s  token to clean; NULL is accepted and ignored.  The string is only
 *    written to when something is actually removed.
 */
static void
token_trim_bounds(char *s)
{
  size_t len, kept, skip;

  if (s == NULL)
    return;

  /* end of the string */
  len = strlen(s);
  kept = len;
  while (kept > 0 && strchr(".-/*'`/:", s[kept - 1]) != NULL)
    kept--;
  if (kept != len)
    s[kept] = '\0';

  /* the beginning: punctuation first, then '$' not introducing an amount */
  skip = strspn(s, "()><+-.*!'`/");
  /* ctype functions need a value representable as unsigned char */
  while (s[skip] == '$' && !isdigit((unsigned char) s[skip + 1]))
    skip++;

  if (skip > 0)
    memmove(s, s + skip, kept - skip + 1);
}
291 
292 /* ****************************************************************************
293  * *
294  * *
295  **************************************************************************** */
296 static bool
297 check_token(s)
298  char *s;
299 {
300 #if 1
301  if (strlen(s) == 0)
302  return FALSE;
303 #else
304  if (strlen(s) <= 2)
305  return FALSE;
306 #endif
307  if (strlen(s) > 40)
308  return FALSE;
309 #if 0
310  if (strspn(s, "0123456789") == strlen(s))
311  return FALSE;
312 #endif
313 
314  /* date */
315  if (zeStrRegex(s, "^[0-9]{2,2}/[0-9]{1,2}/[0-9]{2,4}$", NULL, NULL, TRUE))
316  return FALSE;
317 
318  /* time */
319  if (zeStrRegex
320  (s, "^[0-9]{2,2}:[0-9]{2,2}:[0-9]{2,2}(pm|am)?$", NULL, NULL, TRUE))
321  return FALSE;
322  if (zeStrRegex(s, "^[0-9]{2,2}:[0-9]{2,2}(pm|am)?$", NULL, NULL, TRUE))
323  return FALSE;
324 
325  return TRUE;
326 }
327 
328 /* ****************************************************************************
329  * *
330  * *
331  **************************************************************************** */
/*
 * Store "<prefix>--<token>" in the token store `bm`.
 *
 * Tokens of 3 characters or fewer are ignored.  When `prefix` is NULL the
 * prefix "GLOB" is used.  Each argument is evaluated exactly once.  The
 * text is formatted into a 512 byte buffer; msg_btsm_add_token() then
 * copies it into the fixed-size `token` field of sfilter_token_T, so
 * over-long tokens are truncated.  A failed insertion is only logged.
 */
#define ADD_TOKEN(bm, prefix, token)                                     \
  do {                                                                   \
    const char *at_prefix_ = (prefix);                                   \
    const char *at_token_ = (token);                                     \
                                                                         \
    if (strlen(at_token_) > 3) {                                         \
      char at_buf_[512];                                                 \
                                                                         \
      snprintf(at_buf_, sizeof(at_buf_), "%s--%s",                       \
               at_prefix_ != NULL ? at_prefix_ : "GLOB", at_token_);     \
      if (!msg_btsm_add_token((bm), at_buf_))                            \
        ZE_LogMsgError(0, "ERROR inserting new token");                  \
    }                                                                    \
  } while (0)
345 
346 
347 
348 static bool
349 extract_word_tokens(cf, prefix, separator, buf, kind, bm, level)
350  tokconf_T *cf;
351  char *prefix;
352  char *separator;
353  char *buf;
354  int kind;
355  msg_btsm_T *bm;
356  int level;
357 {
358  char *stok, *ptr;
359  char *prev = NULL;
360  bfilter_T *bf = NULL;
361 
363 
364  bf = bfilter_ptr();
365 
366  ASSERT(bf != NULL);
367  ASSERT(bf->signature == SIGNATURE);
368 
369  if (bm == NULL || buf == NULL || strlen(buf) == 0)
370  return FALSE;
371 
372  if (cf == NULL)
373  cf = &tcf;
374 
375  /* separator = cf->separator; */
376  if (separator == NULL)
377  separator = SEP_TOK;
378 
379 #if 0
380  if (level > 1)
381  separator = TOK_SEPARATOR[1];
382 #endif
383 
384  level++;
385 
386  zeStr2Lower(buf);
387 
388  for (stok = strtok_r(buf, separator, &ptr); stok != NULL;
389  stok = strtok_r(NULL, separator, &ptr))
390  {
391  if (ptr != NULL && *ptr != '\0')
392  {
393  char *p = ptr;
394 
395  if (p[0] == '$' && isspace(p[1]) && isdigit(p[2]))
396  p[1] = '0';
397  }
398 
399  token_trim_bounds(stok);
400 
401  if (!check_token(stok))
402  {
403  prev = NULL;
404  continue;
405  }
406 
407  /* dollars... */
408  if (*stok == '$')
409  {
410  char *q = stok + 1;
411 
412  while (isdigit(*q) || *q == '.')
413  {
414  if (*q != '.')
415  *q = '0';
416  q++;
417  }
418  }
419 
420  /* colors */
421  if (*stok == '#')
422  {
423  int i;
424 
425  for (i = 1; i <= 6 && isxdigit(stok[i]); i++)
426  stok[i] = '0';
427  }
428 
429  ADD_TOKEN(bm, prefix, stok);
430 
431  /* verifie la forme */
432  {
433  }
434 
435  {
436  char *ts = NULL;
437 
438  if ((ts = strdup(stok)) != NULL)
439  {
440  bool first = TRUE;
441  char *p, *q;
442 
443  if ((bf->flags & BFLAG_TRFTOK) != 0)
444  {
445  for (p = q = ts; *p != '\0'; p++)
446  {
447  switch (*p)
448  {
449  case '\'':
450  case '^':
451  break;
452  case ':':
453  if (!first && isxdigit(*(p - 1)) && isxdigit(*(p + 1)))
454  *q++ = *p;
455  break;
456  case '.':
457  if (!first && (isdigit(*(p - 1)) || isdigit(*(p + 1))))
458  *q++ = *p;
459  break;
460  default:
461  *q++ = *p;
462  break;
463  }
464  if (first)
465  first = FALSE;
466  }
467  *q = '\0';
468 
469  ADD_TOKEN(bm, prefix, ts);
470  }
471 
472  if (bf->segRecurse)
473  {
474 #define SECSEP ".^\':@|+_-%#!$"
475 
476  if (strpbrk(stok, SECSEP) != NULL)
477  {
478  strlcpy(ts, stok, strlen(stok) + 1);
479  extract_word_tokens(NULL, prefix, SECSEP, ts, kind, bm, level);
480  }
481  }
482 
483  FREE(ts);
484  } else
485  /* XXXX JOE */
486  ZE_LogSysError("strdup(%s) error", stok);
487  }
488 
489  if (prev != NULL && bf->segDouble)
490  {
491  char t[256];
492 
493  if (strlen(prev) > 0 && strlen(stok) > 0)
494  {
495  snprintf(t, sizeof (t), "%s-dbl-%s", prev, stok);
496  ADD_TOKEN(bm, prefix, t);
497  }
498  }
499 
500  prev = stok;
501  }
502 
503  return TRUE;
504 }
505 
506 /* ****************************************************************************
507 ** ##### # # ###### #### #### ## #### ######
508 ** # # ## ## # # # # # # # #
509 ** ##### ##### # ## # ##### #### #### # # # #####
510 ** # # # # # # # ###### # ### #
511 ** # # # # # # # # # # # # # #
512 ** ##### # # ###### #### #### # # #### ######
513  **************************************************************************** */
514 
515 /* ****************************************************************************
516  * *
517  * *
518  **************************************************************************** */
/* Text appended after the content of every HTML tag. */
#define X_HTML_SEP " -x- "

/*
 * Build a string made of the content of every "<...>" tag found in `buf`,
 * each followed by X_HTML_SEP.  Scanning stops at the first '<' that has
 * no matching '>'.
 *
 * Sizing: a tag uses at least two input characters ('<' and '>') and adds
 * its content plus the 5 characters of X_HTML_SEP, so the result can be up
 * to 2.5 times the input ("<><><>...").  The buffer is therefore three
 * times `size`; all appends are bounded by the buffer size anyway.
 *
 * buf     NUL terminated HTML text (not modified)
 * size    size of the text, terminator included (callers pass
 *         strlen(buf) + 1)
 * return  newly allocated string, to be released by the caller, or NULL
 *         when buf is NULL or empty, or on allocation failure
 */
static char *
extract_html_tags(char *buf, size_t size)
{
  char *tags = NULL;
  char *p;
  size_t msz;

  if (buf == NULL || *buf == '\0')
    return NULL;

  /* refuse sizes for which 3 * (size + 1) + 8 would wrap around */
  if (size > ((size_t) -1 - 16) / 3)
  {
    ZE_LogSysError("malloc error");
    return NULL;
  }

  msz = 3 * (size + 1);
  msz += (8 - msz % 8);
  tags = malloc(msz);
  if (tags == NULL)
  {
    ZE_LogSysError("malloc error");
    return NULL;
  }
  memset(tags, 0, msz);

  for (p = buf + strcspn(buf, "<"); *p != '\0'; p += strcspn(p, "<"))
  {
    size_t n;

    /* skip the '<' */
    p++;
    if (*p == '\0')
      break;

    n = strcspn(p, ">");
    if (p[n] == '\0')
      break;

    (void) zeSafeStrnCat(tags, msz, p, n);
    p += n;

    strlcat(tags, X_HTML_SEP, msz);
  }

  return tags;
}
564 
565 /* ****************************************************************************
566  * *
567  * *
568  **************************************************************************** */
/*
 * Normalize a file name in place: every digit becomes '0' and every
 * white space character becomes '_'.  A NULL argument is ignored.  The
 * argument is evaluated exactly once.
 */
#define NORM_FILENAME(fname)                                  \
  do {                                                        \
    char *nf_p_ = (fname);                                    \
                                                              \
    if (nf_p_ != NULL)                                        \
    {                                                         \
      for (; *nf_p_ != '\0'; nf_p_++)                         \
      {                                                       \
        /* ctype functions need an unsigned char value */     \
        if (isdigit((unsigned char) *nf_p_))                  \
          *nf_p_ = '0';                                       \
        else if (isspace((unsigned char) *nf_p_))             \
          *nf_p_ = '_';                                       \
      }                                                       \
    }                                                         \
  } while (0)
584 
585 
586 
587 /* ****************************************************************************
588 
589  # # ##### #### # # ###### # # ####
590  # # # # # # # # ## # #
591  # # ##### # # # #### ##### # # # ####
592  # ## # # # # # # # # # # #
593  ## ## # # # # # # # ## # #
594  # # # #### # # ###### # # ####
595 
596  **************************************************************************** */
597 
598 static bool
599 mimepart2wordTokens(buf, size, xid, level, type, arg, mime_part)
600  char *buf;
601  size_t size;
602  char *xid;
603  int level;
604  int type;
605  void *arg;
606  mime_part_T *mime_part;
607 {
608  bfilter_T *bf = NULL;
609  msg_btsm_T *bm = (msg_btsm_T *) arg;
610 
611  if (bm == NULL)
612  return FALSE;
613 
614  bf = bfilter_ptr();
615 
616  ASSERT(bf != NULL);
617 
618  {
619  rfc2822_hdr_T *h = NULL;
620  tokconf_T *x = NULL;
621 
622  ZE_MessageInfo(19, "TYPE : %d", type);
623 
624  /*
625  ** Content-Type
626  */
627  h = rfc2822_lookup_header(mime_part->hdrs, "Content-Type");
628  if (h != NULL)
629  {
630  char *r = NULL;
631  int s_type = 0;
632 
633  ZE_MessageInfo(19, "HDR -> Content-Type... %s", h->value);
634  r = rfc2822_get_main_attr(h);
635  if (r != NULL)
636  {
637  s_type = which_mime_type(r);
638 
639  ZE_MessageInfo(19, " Type : %s", r);
640  if ((x = get_tokconf_body("ctmain")) != NULL && x->active)
641  {
642  convert_8to7(r, TRUE);
643  extract_word_tokens(x, x->prefix, x->separator, r, 0, bm, 0);
644  }
645  }
646  FREE(r);
647 
648  r = rfc2822_get_attr(h, "name=");
649  if (r != NULL)
650  {
651  ZE_MessageInfo(19, " Disposition : %s", r);
652  if ((x = get_tokconf_body("ctname")) != NULL && x->active)
653  {
654  convert_8to7(r, TRUE);
655  NORM_FILENAME(r);
656  extract_word_tokens(x, x->prefix, x->separator, r, 0, bm, 0);
657  }
658  }
659  FREE(r);
660 
661  r = rfc2822_get_attr(h, "filename=");
662  if (r != NULL)
663  {
664  ZE_MessageInfo(19, " Disposition : %s", r);
665  if ((x = get_tokconf_body("ctname")) != NULL && x->active)
666  {
667  convert_8to7(r, TRUE);
668  NORM_FILENAME(r);
669  extract_word_tokens(x, x->prefix, x->separator, r, 0, bm, 0);
670  }
671  }
672  FREE(r);
673 
674  if (s_type == MIME_TYPE_MULTIPART)
675  {
676  char *bound = NULL;
677 
678  ZE_MessageInfo(19, " HDR TYPE : %s", h->value);
679  bound = rfc2822_get_attr(h, "boundary=");
680  if (bound != NULL)
681  {
682  ZE_MessageInfo(19, " BOUNDARY : %s", bound);
683  if ((x = get_tokconf_headers("boundary")) != NULL && x->active)
684  {
685  char *sep = " \t\n\r";
686  char *q = bound;
687 
688  convert_8to7(bound, TRUE);
689  for (q = bound; *q != '\0'; q++)
690  {
691  if (isdigit(*q))
692  *q = '0';
693 #if 0
694  if (isalpha(*q))
695  *q = 'A';
696 #endif
697  if (isxdigit(*q))
698  *q = 'F';
699  if (strchr("=-_", *q) != NULL)
700  *q = 'C';
701  }
702  extract_word_tokens(x, x->prefix, sep, bound, 0, bm, 0);
703  }
704 
705  }
706  FREE(bound);
707  }
708  }
709 
710  /*
711  ** Content-Disposition
712  */
713  h = rfc2822_lookup_header(mime_part->hdrs, "Content-Disposition");
714  if (h != NULL)
715  {
716  char *r = NULL;
717 
718  ZE_MessageInfo(19, "HDR -> Content-Disposition... %s", h->value);
719  r = rfc2822_get_main_attr(h);
720  if (r != NULL)
721  {
722  ZE_MessageInfo(19, " Disposition : %s", r);
723  if ((x = get_tokconf_body("cdmain")) != NULL && x->active)
724  {
725  convert_8to7(r, TRUE);
726  extract_word_tokens(x, x->prefix, x->separator, r, 0, bm, 0);
727  }
728  }
729  FREE(r);
730 
731  r = rfc2822_get_attr(h, "name=");
732  if (r != NULL)
733  {
734  ZE_MessageInfo(19, " Disposition : %s", r);
735  if ((x = get_tokconf_body("cdname")) != NULL && x->active)
736  {
737  convert_8to7(r, TRUE);
738  NORM_FILENAME(r);
739  extract_word_tokens(x, x->prefix, x->separator, r, 0, bm, 0);
740  }
741  }
742  FREE(r);
743 
744  r = rfc2822_get_attr(h, "filename=");
745  if (r != NULL)
746  {
747  ZE_MessageInfo(19, " Disposition : %s", r);
748  if ((x = get_tokconf_body("cdname")) != NULL && x->active)
749  {
750  convert_8to7(r, TRUE);
751  NORM_FILENAME(r);
752  extract_word_tokens(x, x->prefix, x->separator, r, 0, bm, 0);
753  }
754  }
755  FREE(r);
756  }
757 
758  /*
759  ** All Headers
760  */
761  for (h = mime_part->hdrs; h != NULL; h = h->next)
762  {
763  ZE_MessageInfo(19, "H : %-20s - V : %s", h->key, h->value);
764 
765  if ((x = get_tokconf_headers(h->key)) != NULL && x->active)
766  {
767  convert_8to7(h->value, TRUE);
768 
769  if (STRCASEEQUAL(x->prefix, "msgid"))
770  {
771  char *px;
772 
773  for (px = h->value; *px != '\0'; px++)
774  if (isdigit(*px))
775  *px = '0';
776  }
777 
778  extract_word_tokens(x, x->prefix, x->separator, h->value, 0, bm, 0);
779  }
780  }
781  }
782 
783  if (type != MIME_TYPE_TEXT)
784  return TRUE;
785 
786  ZE_MessageInfo(11, "MIME PART SIZE : %d", size);
787 
788  if (bf->maxPartSize > 0 && size > bf->maxPartSize)
789  {
790  /* Shall log something ??? */
791  return TRUE;
792  }
793 
794  convert_8to7(buf, TRUE);
795 #if 0
796  if (0)
797  {
798  char *p, *q;
799 
800  for (p = q = buf; *p != '\0'; p++)
801  {
802  if (*p != *q || isalpha(*p))
803  *q++ = *p;
804  }
805  *q = '\0';
806  }
807 #endif
808 
809  if (abs(strspn(buf, " \t\r\n") - size) < 4)
810  return TRUE;
811  if (size < 6)
812  return TRUE;
813 
814  if (STRCASEEQUAL("text/html", mime_part->mime))
815  {
816  char *cleanbuf = NULL;
817 #if 0
818  tokconf_T *x = NULL;
819 #endif
820 
821  cleanbuf = cleanup_html_buffer(buf, strlen(buf) + 1);
822  convert_8to7(cleanbuf, TRUE);
823 #if 0
824  if ((x = get_tokconf_body("text/html")) != NULL)
825  extract_word_tokens(x, x->prefix, cleanbuf, 0, bm, 0);
826  else
827 #endif
828  extract_word_tokens(NULL, "body", NULL, cleanbuf, 0, bm, 0);
829  FREE(cleanbuf);
830 
831  cleanbuf = extract_html_tags(buf, strlen(buf) + 1);
832  ZE_MessageInfo(11, "MIME PART SIZE : %d 5", size);
833  ZE_MessageInfo(11, "MIME PART SIZE : %d \n%s", size, buf);
834  ZE_MessageInfo(11, "MIME PART SIZE : %d \n%s", size, cleanbuf);
835 #if 0
836  if ((x = get_tokconf_body("html/tags")) != NULL)
837  extract_word_tokens(x, x->prefix, cleanbuf, 0, bm, 0);
838  else
839 #endif
840  extract_word_tokens(NULL, "html", NULL, cleanbuf, 0, bm, 0);
841  FREE(cleanbuf);
842 
843  ZE_MessageInfo(11, "MIME PART SIZE : %d 6", size);
844  return TRUE;
845  }
846 
847  if (STRCASEEQUAL("text/plain", mime_part->mime))
848  {
849  extract_word_tokens(NULL, "body", NULL, buf, 0, bm, 0);
850 
851  return TRUE;
852  }
853 
854  extract_word_tokens(NULL, "body", NULL, buf, 0, bm, 0);
855 
856  return TRUE;
857 }
858 
859 
860 /* ****************************************************************************
861 
862  #### ##### #### # # ###### # # ####
863  # # # # # # # # ## # #
864  # ##### # # # #### ##### # # # ####
865  # # # # # # # # # # #
866  # # # # # # # # # ## # #
867  #### # #### # # ###### # # ####
868 
869  **************************************************************************** */
870 static int C_NGRAM = 5;
871 
872 static bool
873 extract_char_tokens(cf, prefix, separator, buf, kind, bm, level)
874  tokconf_T *cf;
875  char *prefix;
876  char *separator;
877  char *buf;
878  int kind;
879  msg_btsm_T *bm;
880  int level;
881 {
882  bfilter_T *bf = NULL;
883 
885 
886  bf = bfilter_ptr();
887 
888  ASSERT(bf != NULL);
889  ASSERT(bf->signature == SIGNATURE);
890 
891  if (bm == NULL || buf == NULL || strlen(buf) == 0)
892  return FALSE;
893 
894  if (cf == NULL)
895  cf = &tcf;
896 
897  level++;
898 
899  zeStr2Lower(buf);
900  {
901  char *p, *q, cp, cc;
902  bool blank = FALSE;
903 
904  cp = ' ';
905  cc = 0;
906  for (p = q = buf; *p != '\0'; cp = *p++)
907  {
908  if (*p == '\r')
909  continue;
910 
911  if (isblank(*p))
912  {
913  if (blank)
914  {
915  cc++;
916  continue;
917  } else
918  {
919  cc = 1;
920  blank = TRUE;
921  }
922 
923  *q++ = '_';
924  continue;
925  }
926  blank = FALSE;
927 
928  if (cp == *p)
929  cc++;
930  else
931  cc = 1;
932 
933  *q++ = *p;
934  }
935  *q = '\0';
936  }
937 
938  {
939  char *p;
940 
941  for (p = buf; strlen(p) >= C_NGRAM; p++)
942  {
943  char tok[64];
944 
945  zeSafeStrnCpy(tok, sizeof (tok), p, C_NGRAM);
946  ADD_TOKEN(bm, prefix, tok);
947  }
948  }
949 
950  return TRUE;
951 }
952 
953 /* ****************************************************************************
954  * *
955  * *
956  **************************************************************************** */
957 
958 static bool
959 mimepart2ngramTokens(buf, size, xid, level, type, arg, mime_part)
960  char *buf;
961  size_t size;
962  char *xid;
963  int level;
964  int type;
965  void *arg;
966  mime_part_T *mime_part;
967 {
968  bfilter_T *bf = NULL;
969  msg_btsm_T *bm = (msg_btsm_T *) arg;
970 
971  if (bm == NULL)
972  return FALSE;
973 
974  bf = bfilter_ptr();
975 
976  ASSERT(bf != NULL);
977 
978  {
979  rfc2822_hdr_T *h = NULL;
980  tokconf_T *x = NULL;
981 
982  ZE_MessageInfo(19, "TYPE : %d", type);
983 
984  /*
985  ** Content-Type
986  */
987  h = rfc2822_lookup_header(mime_part->hdrs, "Content-Type");
988  if (h != NULL)
989  {
990  char *r = NULL;
991  int s_type = 0;
992 
993  ZE_MessageInfo(19, "HDR -> Content-Type... %s", h->value);
994  r = rfc2822_get_main_attr(h);
995  if (r != NULL)
996  {
997  s_type = which_mime_type(r);
998 
999  ZE_MessageInfo(19, " Type : %s", r);
1000  if ((x = get_tokconf_body("ctmain")) != NULL && x->active)
1001  {
1002  convert_8to7(r, TRUE);
1003  extract_char_tokens(x, x->prefix, x->separator, r, 0, bm, 0);
1004  }
1005  }
1006  FREE(r);
1007 
1008  r = rfc2822_get_attr(h, "name=");
1009  if (r != NULL)
1010  {
1011  ZE_MessageInfo(19, " Disposition : %s", r);
1012  if ((x = get_tokconf_body("ctname")) != NULL && x->active)
1013  {
1014  convert_8to7(r, TRUE);
1015  NORM_FILENAME(r);
1016  extract_char_tokens(x, x->prefix, x->separator, r, 0, bm, 0);
1017  }
1018  }
1019  FREE(r);
1020 
1021  r = rfc2822_get_attr(h, "filename=");
1022  if (r != NULL)
1023  {
1024  ZE_MessageInfo(19, " Disposition : %s", r);
1025  if ((x = get_tokconf_body("ctname")) != NULL && x->active)
1026  {
1027  convert_8to7(r, TRUE);
1028  NORM_FILENAME(r);
1029  extract_char_tokens(x, x->prefix, x->separator, r, 0, bm, 0);
1030  }
1031  }
1032  FREE(r);
1033 
1034  if (s_type == MIME_TYPE_MULTIPART)
1035  {
1036  char *bound = NULL;
1037 
1038  ZE_MessageInfo(19, " HDR TYPE : %s", h->value);
1039  bound = rfc2822_get_attr(h, "boundary=");
1040  if (bound != NULL)
1041  {
1042  ZE_MessageInfo(19, " BOUNDARY : %s", bound);
1043  if ((x = get_tokconf_headers("boundary")) != NULL && x->active)
1044  {
1045  char *sep = " \t\n\r";
1046  char *q = bound;
1047 
1048  convert_8to7(bound, TRUE);
1049  for (q = bound; *q != '\0'; q++)
1050  {
1051  if (isdigit(*q))
1052  *q = '0';
1053 #if 0
1054  if (isalpha(*q))
1055  *q = 'A';
1056 #endif
1057  if (isxdigit(*q))
1058  *q = 'F';
1059  if (strchr("=-_", *q) != NULL)
1060  *q = 'C';
1061  }
1062  extract_char_tokens(x, x->prefix, sep, bound, 0, bm, 0);
1063  }
1064 
1065  }
1066  FREE(bound);
1067  }
1068  }
1069 
1070  /*
1071  ** Content-Disposition
1072  */
1073  h = rfc2822_lookup_header(mime_part->hdrs, "Content-Disposition");
1074  if (h != NULL)
1075  {
1076  char *r = NULL;
1077 
1078  ZE_MessageInfo(19, "HDR -> Content-Disposition... %s", h->value);
1079  r = rfc2822_get_main_attr(h);
1080  if (r != NULL)
1081  {
1082  ZE_MessageInfo(19, " Disposition : %s", r);
1083  if ((x = get_tokconf_body("cdmain")) != NULL && x->active)
1084  {
1085  convert_8to7(r, TRUE);
1086  extract_char_tokens(x, x->prefix, x->separator, r, 0, bm, 0);
1087  }
1088  }
1089  FREE(r);
1090 
1091  r = rfc2822_get_attr(h, "name=");
1092  if (r != NULL)
1093  {
1094  ZE_MessageInfo(19, " Disposition : %s", r);
1095  if ((x = get_tokconf_body("cdname")) != NULL && x->active)
1096  {
1097  convert_8to7(r, TRUE);
1098  NORM_FILENAME(r);
1099  extract_char_tokens(x, x->prefix, x->separator, r, 0, bm, 0);
1100  }
1101  }
1102  FREE(r);
1103 
1104  r = rfc2822_get_attr(h, "filename=");
1105  if (r != NULL)
1106  {
1107  ZE_MessageInfo(19, " Disposition : %s", r);
1108  if ((x = get_tokconf_body("cdname")) != NULL && x->active)
1109  {
1110  convert_8to7(r, TRUE);
1111  NORM_FILENAME(r);
1112  extract_char_tokens(x, x->prefix, x->separator, r, 0, bm, 0);
1113  }
1114  }
1115  FREE(r);
1116  }
1117 
1118  /*
1119  ** All Headers
1120  */
1121  for (h = mime_part->hdrs; h != NULL; h = h->next)
1122  {
1123  ZE_MessageInfo(19, "H : %-20s - V : %s", h->key, h->value);
1124 
1125  if ((x = get_tokconf_headers(h->key)) != NULL && x->active)
1126  {
1127  convert_8to7(h->value, TRUE);
1128 
1129  if (STRCASEEQUAL(x->prefix, "msgid"))
1130  {
1131  char *px;
1132 
1133  for (px = h->value; *px != '\0'; px++)
1134  if (isdigit(*px))
1135  *px = '0';
1136  }
1137 
1138  extract_char_tokens(x, x->prefix, x->separator, h->value, 0, bm, 0);
1139  }
1140  }
1141  }
1142 
1143  if (type != MIME_TYPE_TEXT)
1144  return TRUE;
1145 
1146  ZE_MessageInfo(11, "MIME PART SIZE : %d", size);
1147 
1148  if (bf->maxPartSize > 0 && size > bf->maxPartSize)
1149  {
1150  /* Shall log something ??? */
1151  return TRUE;
1152  }
1153 
1154  convert_8to7(buf, TRUE);
1155 #if 0
1156  if (0)
1157  {
1158  char *p, *q;
1159 
1160  for (p = q = buf; *p != '\0'; p++)
1161  {
1162  if (*p != *q || isalpha(*p))
1163  *q++ = *p;
1164  }
1165  *q = '\0';
1166  }
1167 #endif
1168 
1169  if (abs(strspn(buf, " \t\r\n") - size) < 4)
1170  return TRUE;
1171  if (size < 6)
1172  return TRUE;
1173 
1174  if (STRCASEEQUAL("text/html", mime_part->mime))
1175  {
1176  char *cleanbuf = NULL;
1177 #if 0
1178  tokconf_T *x = NULL;
1179 #endif
1180 
1181  cleanbuf = cleanup_html_buffer(buf, strlen(buf) + 1);
1182  convert_8to7(cleanbuf, TRUE);
1183 #if 0
1184  if ((x = get_tokconf_body("text/html")) != NULL)
1185  extract_char_tokens(x, x->prefix, cleanbuf, 0, bm, 0);
1186  else
1187 #endif
1188  extract_char_tokens(NULL, "body", NULL, cleanbuf, 0, bm, 0);
1189  FREE(cleanbuf);
1190 
1191  cleanbuf = extract_html_tags(buf, strlen(buf) + 1);
1192  ZE_MessageInfo(11, "MIME PART SIZE : %d 5", size);
1193  ZE_MessageInfo(11, "MIME PART SIZE : %d \n%s", size, buf);
1194  ZE_MessageInfo(11, "MIME PART SIZE : %d \n%s", size, cleanbuf);
1195 #if 0
1196  if ((x = get_tokconf_body("html/tags")) != NULL)
1197  extract_char_tokens(x, x->prefix, cleanbuf, 0, bm, 0);
1198  else
1199 #endif
1200  extract_char_tokens(NULL, "html", NULL, cleanbuf, 0, bm, 0);
1201  FREE(cleanbuf);
1202 
1203  ZE_MessageInfo(11, "MIME PART SIZE : %d 6", size);
1204  return TRUE;
1205  }
1206 
1207  if (STRCASEEQUAL("text/plain", mime_part->mime))
1208  {
1209  extract_char_tokens(NULL, "body", NULL, buf, 0, bm, 0);
1210 
1211  return TRUE;
1212  }
1213 
1214  extract_char_tokens(NULL, "body", NULL, buf, 0, bm, 0);
1215 
1216  return TRUE;
1217 }
1218 
1219 /* ****************************************************************************
1220 
1221  # # ###### #### #### ## #### ######
1222  ## ## # # # # # # # #
1223  # ## # ##### #### #### # # # #####
1224  # # # # # ###### # ### #
1225  # # # # # # # # # # # #
1226  # # ###### #### #### # # #### ######
1227 
1228  **************************************************************************** */
1229 
1230 
1231 static int token_cmp(void *, void *);
1232 
1233 static bool
1234 msg_btsm_add_token(bm, token)
1235  msg_btsm_T *bm;
1236  char *token;
1237 {
1238  bool res = TRUE;
1239  sfilter_token_T tok, *t;
1240 
1241  memset(&tok, 0, sizeof (tok));
1242 
1243  strlcpy(tok.token, token, sizeof (tok.token));
1244  tok.prob = UT_PROB;
1245  if ((t = zeBTree_Get(&bm->bt, &tok)) == NULL)
1246  {
1247  tok.prob = UT_PROB;
1248  tok.nb = 1;
1249 
1250  if (!zeBTree_Add(&bm->bt, &tok))
1251  {
1252  ZE_LogMsgError(0, "ERROR inserting new token");
1253  res = FALSE;
1254  }
1255  } else
1256  t->nb++;
1257 
1258 fin:
1259  return res;
1260 }
1261 
1262 static bool
1263 msg_btsm_init(bm)
1264  msg_btsm_T *bm;
1265 {
1266  ASSERT(bm != NULL);
1267  ASSERT(bm->bt.signature == SIGNATURE);
1268 
1269  if (!zeBTree_Init(&bm->bt, sizeof (sfilter_token_T), token_cmp))
1270  {
1271  return FALSE;
1272  }
1273 
1274  return TRUE;
1275 }
1276 
1277 static bool
1278 msg_btsm_end(bm)
1279  msg_btsm_T *bm;
1280 {
1281  ASSERT(bm != NULL);
1282  ASSERT(bm->bt.signature == SIGNATURE);
1283 
1284  (void) zeBTree_Destroy(&bm->bt);
1285 
1286  return TRUE;
1287 }
1288 
1289 
1290 static int
1291 token_cmp(a, b)
1292  void *a;
1293  void *b;
1294 {
1295  sfilter_token_T *ta = a;
1296  sfilter_token_T *tb = b;
1297 
1298  return strcmp(ta->token, tb->token);
1299 }
1300 
1301 /* ****************************************************************************
1302  * *
1303  * *
1304  **************************************************************************** */
/*
 * Tokenize the message stored in file `fname` and hand every distinct
 * token to `func`.
 *
 * id     message identifier passed to decode_mime_file(); run through
 *        STRNULL(id, "NOID") first (presumably substituting "NOID" for a
 *        NULL id - confirm in macros.h)
 * fname  message file; FALSE is returned when NULL
 * func   callback given to zeBTree_Browse() together with `arg`; may be
 *        NULL, in which case the tokens are computed but not reported
 * arg    opaque argument for `func`
 *
 * Returns TRUE when decode_mime_file() succeeded.  Returns FALSE when
 * fname is NULL, when bf->maxMsgSize is set and the file is larger, or
 * when decoding failed.
 *
 * Environment: TEXTUNIT=NGRAM selects character n-grams instead of words;
 * NGRAMLEN (1..9) sets the n-gram length (C_NGRAM).
 */
1305 bool
1306 bfilter_handle_message(id, fname, func, arg)
1307  char *id;
1308  char *fname;
1309  btsm_browse_F func;
1310  void *arg;
1311 {
1312  bool res = FALSE;
1313  bfilter_T *bf = NULL;
/* NOTE(review): the declaration of `bm` (used below as a msg_btsm_T) is
 * not visible in this listing - confirm against the real source. */
1315 
1316  bool decode;
1317  bool TextUnitWord = TRUE;
1318 
1319  bf = bfilter_ptr();
1320 
1321  ASSERT(bf != NULL);
1322  ASSERT(bf->signature == SIGNATURE);
1323 
1324  if (fname == NULL)
1325  return FALSE;
1326 
1327  id = STRNULL(id, "NOID");
1328  {
1329  size_t fsize;
1330 
1331  fsize = zeGetFileSize(fname);
1332 
/* messages above the configured size limit are not analysed */
1333  if (bf->maxMsgSize > 0 && fsize > bf->maxMsgSize)
1334  return FALSE;
1335  }
1336 
/* NOTE(review): a failure of msg_btsm_init() is ignored here */
1337  (void) msg_btsm_init(&bm);
1338 
1339  {
/* NOTE(review): `env` is a pointer initialised with FALSE rather than
 * NULL; `ok` makes the informational message below appear only once */
1340  char *env = FALSE;
1341  static bool ok = FALSE;
1342 
1343  if ((env = getenv("TEXTUNIT")) != NULL)
1344  {
1345  if (STRCASEEQUAL(env, "NGRAM"))
1346  TextUnitWord = FALSE;
1347  }
1348 
1349  if ((env = getenv("NGRAMLEN")) != NULL && strlen(env) > 0)
1350  {
1351  int n;
1352 
/* atoi() yields 0 for non-numeric text, which the range check rejects */
1353  n = atoi(env);
1354  if (n > 0 && n < 10)
1355  C_NGRAM = n;
1356  }
1357 
1358  if (!ok)
1359  {
1360  ZE_MessageInfo(10, "Setting tokenizer to %s (unit length = %d)",
1361  TextUnitWord ? "WORD" : "NGRAM", C_NGRAM);
1362  ok = TRUE;
1363  }
1364  }
1365 
/* decode the message; the selected callback fills `bm` part by part */
1366  if (TextUnitWord)
1367  decode = decode_mime_file(id, fname, NULL, mimepart2wordTokens, &bm);
1368  else
1369  decode = decode_mime_file(id, fname, NULL, mimepart2ngramTokens, &bm);
1370 
1371  if (decode)
1372  {
/* NOTE(review): `n` receives the result of zeBTree_Browse() but is never
 * read afterwards */
1373  int n;
1374 
1375  if (func != NULL)
1376  n = zeBTree_Browse(&bm.bt, func, arg);
1377 
1378  res = TRUE;
1379  }
/* the token tree is released whether or not decoding succeeded */
1380  (void) msg_btsm_end(&bm);
1381 
1382  return res;
1383 }
1384 
#define SECSEP
#define MSG_BTSM_INITIALIZER
char * rfc2822_get_main_attr(rfc2822_hdr_T *header)
Definition: ze-rfc2822.c:178
void * zeBTree_Get(ZEBT_T *, void *)
Definition: zeBTree.c:281
#define ASSERT(a)
Definition: macros.h:27
bfilter_T * bfilter_ptr()
Definition: ze-bfilter.c:60
#define FREE(x)
Definition: macros.h:37
char * key
Definition: ze-rfc2822.h:37
bool segDouble
Definition: ze-bfilter.h:83
bool zeBTree_Init(ZEBT_T *, size_t, ZEBT_CMP_F)
Definition: zeBTree.c:96
rfc2822_hdr_T * next
Definition: ze-rfc2822.h:39
#define STRNULL(x, r)
Definition: macros.h:81
bool ok
Definition: ze-connopen.c:59
char * separator
void convert_8to7(char *buf, bool convert_spaces)
char * value
Definition: ze-rfc2822.h:38
bool zeBTree_Add(ZEBT_T *, void *)
Definition: zeBTree.c:309
size_t maxPartSize
Definition: ze-bfilter.h:78
#define TOKCONF_INITIALIZER
void set_tokconf_active(char *tag, bool active)
#define FALSE
Definition: macros.h:160
size_t zeGetFileSize(char *)
Definition: zeFileTools.c:132
#define strlcpy
Definition: zeString.h:32
#define ZE_LogMsgError(level,...)
Definition: zeSyslog.h:113
bool zeStrRegex(char *, char *, long *, long *, bool)
Definition: zeStrings.c:544
#define SEP_TOK
uint32_t flags
Definition: ze-bfilter.h:75
bool zeBTree_Destroy(ZEBT_T *)
Definition: zeBTree.c:192
struct msg_btsm_T msg_btsm_T
#define UT_PROB
Definition: ze-bfilter.h:190
#define SEP_WS
#define strlcat
Definition: zeString.h:28
#define MIME_TYPE_MULTIPART
Definition: ze-demime.h:41
#define strchr
Definition: ze-sys.h:218
bool decode_mime_file(char *, char *, uint32_t *, demime_F, void *)
Definition: ze-demime.c:584
char * mime
Definition: ze-demime.h:65
uint32_t signature
Definition: ze-bfilter.h:40
bfilter_T * bf
#define NORM_FILENAME(fname)
#define ZE_MessageInfo(level,...)
Definition: zeSyslog.h:90
#define ADD_TOKEN(bm, prefix, token)
int zeSafeStrnCpy(char *, size_t, char *, size_t)
Definition: zeStrings.c:136
#define MIME_TYPE_TEXT
Definition: ze-demime.h:34
#define TRUE
Definition: macros.h:157
#define X_HTML_SEP
int zeSafeStrnCat(char *, size_t, char *, size_t)
Definition: zeStrings.c:107
#define ZE_LogSysError(...)
Definition: zeSyslog.h:129
#define isblank(c)
Definition: ze-sys.h:454
char * zeStr2Lower(char *)
Definition: zeStrings.c:295
#define STRCASEEQUAL(a, b)
Definition: macros.h:72
int(* btsm_browse_F)(void *, void *)
Definition: ze-bfilter.h:172
char * cleanup_html_buffer(char *, size_t)
Definition: ze-html.c:163
bool segRecurse
Definition: ze-bfilter.h:82
int which_mime_type(char *s)
Definition: ze-demime.c:662
#define SIGNATURE
Definition: ze-libjc.h:75
char * rfc2822_get_attr(rfc2822_hdr_T *header, char *attr)
Definition: ze-rfc2822.c:201
#define BFLAG_TRFTOK
Definition: ze-bfilter.h:32
size_t maxMsgSize
Definition: ze-bfilter.h:77
char token[128]
Definition: ze-bfilter.h:156
int zeBTree_Browse(ZEBT_T *, ZEBT_BROWSE_F, void *)
Definition: zeBTree.c:262
bool bfilter_handle_message(char *id, char *fname, btsm_browse_F func, void *arg)
rfc2822_hdr_T * hdrs
Definition: ze-demime.h:74
Definition: zeBTree.h:73
rfc2822_hdr_T * rfc2822_lookup_header(rfc2822_hdr_T *head, char *key)
Definition: ze-rfc2822.c:145