ze-filter  (ze-filter-0.8.0-develop-180218)
ze-chkcontent.c
Go to the documentation of this file.
1 
2 /*
3  *
4  * ze-filter - Mail Server Filter for sendmail
5  *
6  * Copyright (c) 2001-2018 - Jose-Marcio Martins da Cruz
7  *
8  * Author : Jose Marcio Martins da Cruz
9  * jose.marcio.mc@gmail.org
10  *
11  * History :
12  * Created : January 2002
13  *
14  * This program is free software, but with restricted license :
15  *
16  *
17  * This program is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
20  *
21  * More details about ze-filter license can be found at ze-filter
22  * web site : http://foss.jose-marcio.org
23  */
24 
25 #include <ze-sys.h>
26 
27 #include "ze-filter.h"
28 
29 
/* JDEBUG is not referenced in the visible part of this file. */
30 #define JDEBUG 0
31 
/* LOG_LEVEL is the level passed to the ZE_MessageInfo() "SPAM CHECK" messages
 * below; H0_LOG_LEVEL is not referenced in the visible part of this file. */
32 #define LOG_LEVEL 10
33 #define H0_LOG_LEVEL 12
34 
/* Compile-time switch: the text_buffer_entropy() calls are only built when
 * this is 1. Defaults to 0 unless the build defines it. */
35 #ifndef _FFR_MSG_ENTROPY
36 #define _FFR_MSG_ENTROPY 0
37 #endif
38 
39 
/* Entropy is only computed for buffers strictly smaller than this size. */
40 #ifndef ENTROPY_BUF_SIZE
41 #define ENTROPY_BUF_SIZE 16000
42 #endif
43 
44 
47 
/*
 * Forward declarations of file-local helpers. Their definitions are not in
 * the visible part of this file; the notes below only describe how they are
 * called from check_mime_part() and scan_body_contents().
 */
/* Result is OR-ed into data->flags.msg when non-zero. */
48 static uint32_t check_header_date(char *id, header_T * headers);
49 
/* Called as (id, text, &data->best). */
50 static int check_unwanted_html_tags(char *, char *, bestof_T *);
51 static bool check_unwanted_boundary(char *, char *, bestof_T *);
52 static bool check_unwanted_mailer(char *, char *, bestof_T *);
53 static bool check_unwanted_charset(char *, char *, bestof_T *);
54 static int check_unwanted_expressions(char *, char *, bestof_T *);
55 
56 static int count_html_tags(char *);
/* Not called in the visible part of this file. */
57 static int count_html_comments(char *);
58 
/* Applied to the MIME decoder flag word to obtain data->mime_errors. */
59 static int count_uint32bits(uint32_t);
60 
61 /* ****************************************************************************
62  * *
63  * *
64  **************************************************************************** */
65 
/*
 * Score helpers for check_mime_part(). They are not self-contained: they
 * expand to expressions using the caller's locals `urlbl_score`,
 * `clean_score`, `raw_score` and the pointer `data`, so they may only be used
 * where those names are in scope (and `data` is non-NULL).
 *
 * Active variant (#if 1): each score is the maximum of the value computed for
 * the current part and the value already stored in data->scores.
 * Disabled variant (#else): the scores would be summed instead.
 */
66 #if 1
67 #define URLBL_SCORE() (MAX(urlbl_score, data->scores.urlbl))
68 #define REGEX_SCORE() (MAX(MAX(clean_score, raw_score), data->scores.body))
69 #else
70 #define URLBL_SCORE() (urlbl_score + data->scores.urlbl)
71 #define REGEX_SCORE() (MAX(clean_score, raw_score) + data->scores.body)
72 #endif
/* Combined URL blacklist + regex score, compared against
 * data->content_max_score to decide whether more regex checks are run. */
73 #define CURRENT_SCORE() (URLBL_SCORE() + REGEX_SCORE())
74 
/*
 * check_mime_part - callback given to decode_mime_file() by
 * scan_body_contents(); examines one MIME part and accumulates counters,
 * statistics and scores into the spamchk_T received through `arg`.
 *
 * buf       : part content as text; handed to convert_8to7() and to the
 *             regex / URL blacklist / HTML checks
 * size      : size of the part content
 * id        : log prefix; replaced by "00000000.000" when NULL
 * level     : not referenced in the visible code
 * type      : MIME_TYPE_* value of the part
 * arg       : the spamchk_T accumulator (cast to `data`)
 * mime_part : part description (hdrs, mime, charset, encode, size, type)
 *
 * Returns FALSE when mime_part or arg is NULL, TRUE otherwise. Only parts of
 * type MIME_TYPE_TEXT go through the content checks; other types return
 * after the counters and size statistics are updated.
 *
 * NOTE(review): this listing has gaps (some statements and `case` labels are
 * missing), so a few calls below appear without their opening line.
 */
75 static bool
76 check_mime_part(buf, size, id, level, type, arg, mime_part)
77  char *buf;
78  size_t size;
79  char *id;
80  int level;
81  int type;
82  void *arg;
83  mime_part_T *mime_part;
84 {
85  spamchk_T *data = (spamchk_T *) arg;
86 
 /* Scores computed for this part only; merged into data->scores at the end */
87  int clean_score = 0, raw_score = 0, urlbl_score = 0;
88 
 /* Stays TRUE only when the part is neither text/html nor text/plain */
89  bool simple_text = TRUE;
90 
 /* NOTE(review): CURRENT_SCORE() reads data->scores.* and is evaluated here,
  * before the `data == NULL` test below; `x` is not read in the visible code
  * and is shadowed by the inner `char *x` declarations. */
91  int x = CURRENT_SCORE();
92 
93  id = STRNULL(id, "00000000.000");
94  if (mime_part == NULL) {
95  ZE_LogMsgWarning(0, "mime_part NULL");
96  return FALSE;
97  }
98 
99  if (data == NULL) {
100  ZE_LogMsgWarning(0, "data NULL");
101  return FALSE;
102  }
103 
 /* NOTE(review): `data` is `arg` cast, so this test cannot succeed once the
  * previous one has passed. */
104  if (arg == NULL) {
105  ZE_LogMsgWarning(0, "arg NULL");
106  return FALSE;
107  }
108 
 /* Global counters: total size and number of parts, then one counter per type */
109  data->size += mime_part->size;
110  data->nb_part++;
111 
112  switch (type) {
113  case MIME_TYPE_TEXT:
114  data->nb_text++;
115  break;
116  case MIME_TYPE_IMAGE:
117  data->nb_image++;
118  break;
119  case MIME_TYPE_AUDIO:
120  data->nb_audio++;
121  break;
122  case MIME_TYPE_VIDEO:
123  data->nb_video++;
124  break;
 /* NOTE(review): the `case` labels selecting the next two counters are
  * missing from this listing. */
126  data->nb_application++;
127  break;
129  data->nb_token++;
130  break;
131  case MIME_TYPE_MESSAGE:
132  data->nb_message++;
133  break;
134  case MIME_TYPE_MULTIPART:
135  data->nb_multipart++;
136  break;
137  }
138 
 /*
  * Content-ID check: when the part has a Content-ID header and the main
  * attribute of its Content-Type starts with "image" (case-insensitive),
  * an "unwanted Content-ID" message is logged.
  */
139  if (mime_part->hdrs != NULL) {
140  rfc2822_hdr_T *h;
141 
 /* NOTE(review): empty statement - this test has no effect */
142  if (mime_part->type == MIME_TYPE_IMAGE);
143 
144  if ((h = rfc2822_lookup_header(mime_part->hdrs, "content-id")) != NULL) {
145  char *name = NULL;
146  rfc2822_hdr_T *hx = NULL;
147 
148  hx = rfc2822_lookup_header(mime_part->hdrs, "content-type");
149  if (hx != NULL)
150  name = rfc2822_get_main_attr(hx);
151 
152  if (name != NULL && strncasecmp(name, "image", strlen("image")) == 0) {
 /* NOTE(review): the statements opening this log call are missing from
  * the listing */
156  "%s SPAM CHECK - M%02d unwanted Content-ID for %s", id,
157  SPAM_MSG_CONTENT_ID, STRNULL(name, "???"));
158  }
 /* name is released with FREE() - presumably rfc2822_get_main_attr()
  * returns an allocated string; TODO confirm */
159  FREE(name);
160  }
161  }
162 #if 1
163  /*
164  ** let's check empty attachments... usually appearing inside HTML
165  ** messages pointing to an inline attached empty image
166  */
167  if (mime_part->hdrs != NULL) {
168  rfc2822_hdr_T *hdr = NULL;
169  char *name = NULL;
170  char *mime = NULL;
171 
 /* First pass: every Content-Type header carrying a "name=" attribute */
172  hdr = mime_part->hdrs;
173  while ((hdr = rfc2822_lookup_header(hdr, "content-type")) != NULL) {
174  mime = rfc2822_get_main_attr(hdr);
175  if (mime != NULL) {
176  name = rfc2822_get_attr(hdr, "name=");
177  if (name != NULL) {
178  char *p = NULL;
179 
 /* p points at the last '.' of the name (file extension), or is NULL */
180  p = strrchr(name, '.');
181 
 /* NOTE(review): `size` is a size_t but is formatted with %6d */
182  ZE_MessageInfo(11, " CTYPE : %-8s %4d %6d %s", STRNULL(p, ".---"),
183  type, size, name);
 /* A named attachment of one of the listed types with no content */
184  if (size == 0) {
185  switch (type) {
186  case MIME_TYPE_IMAGE:
187  case MIME_TYPE_AUDIO:
188  case MIME_TYPE_VIDEO:
191  case MIME_TYPE_MESSAGE:
195  "%s SPAM CHECK - M%02d empty attachment %s",
197  "???"));
198  break;
199  default:
200  break;
201  }
202  }
203 
204  }
205  }
206  FREE(name);
207  FREE(mime);
208 
 /* Resume the lookup after the header just handled */
209  hdr = hdr->next;
210  }
211 
 /* Second pass: same check on Content-Disposition "filename=" attributes */
212  hdr = mime_part->hdrs;
213  while ((hdr = rfc2822_lookup_header(hdr, "content-disposition")) != NULL) {
214  mime = rfc2822_get_main_attr(hdr);
215  if (mime != NULL) {
216  name = rfc2822_get_attr(hdr, "filename=");
217  if (name != NULL) {
218  char *p = NULL;
219 
220  p = strrchr(name, '.');
221 
 /* NOTE(review): `size` is a size_t but is formatted with %6d */
222  ZE_MessageInfo(11, " CDISP : %-8s %4d %6d %s", STRNULL(p, ".---"),
223  type, size, name);
224  if (size == 0) {
225  switch (type) {
226  case MIME_TYPE_IMAGE:
227  case MIME_TYPE_AUDIO:
228  case MIME_TYPE_VIDEO:
231  case MIME_TYPE_MESSAGE:
235  "%s SPAM CHECK - M%02d empty attachment %s",
237  "???"));
238  break;
239  default:
240  break;
241  }
242  }
243  }
244  }
245  FREE(name);
246  FREE(mime);
247 
248  hdr = hdr->next;
249  }
250  }
251 #endif
252 
253  /*
254  * spam_oracle
255  */
 /* Non-text parts only feed the `other` size statistics; none of the
  * content checks below run for them. */
256  if (type != MIME_TYPE_TEXT) {
257  zeKStatsUpdate(&data->mksize.other, (double) size);
258  return TRUE;
259  }
260 #if 1
 /* Presumably folds 8-bit characters of buf to 7-bit in place - TODO
  * confirm against convert_8to7() */
261  convert_8to7(buf, FALSE);
262 #else
263  /*
264  * remove unprintable or 8 bits characters before checking text
265  * (disabled alternative to convert_8to7())
266  */
266  {
267  char *p, *q;
268  size_t sz;
269 
270  p = q = buf;
271  sz = size;
272 
273  for (sz = size; *p != '\0' && sz > 0; p++, sz--) {
274  int c = *((unsigned char *) p);
275 
276  if (c == 0x0A || c == 0x0D) {
277  *q++ = *p;
278  continue;
279  }
280 
281  if (*p == 0x1B) {
282  size--;
283  continue;
284  }
285 
286  if (c < 0x20) {
287  if (1)
288  *q++ = ' ';
289  else
290  size--;
291  continue;
292  }
293 
294  if (c > 0x7F) {
295  char s[] = " "
296  " "
297  "AAAAAAACEEEEIIIIGNOOOOOxOUUUUYPB" "aaaaaaaceeeeiiiionooooo-ouuuuyby";
298 
299  c -= 0x80;
 /* NOTE(review): indexes s[] with the constant 0x80, not with c */
300  if (c > 0 && c < sizeof (s))
301  *q++ = s[0x80];
302  continue;
303  }
304 
305  *q++ = *p;
306  }
307  *q = '\0';
308  }
309 #endif
310 
311  ZE_MessageInfo(19, "TYPE (%d) (%ld) (%s)", type, (long) size,
312  mime_part->mime);
313 
314  /*
315  **
316  ** text/html
317  **
318  */
 /* NOTE(review): mime_part->mime is passed to strcasecmp() without a NULL
  * check */
319  if (strcasecmp("text/html", mime_part->mime) == 0) {
 /* Length of the cleaned HTML text; stays 0 when cleanup returns NULL */
320  size_t real_size = 0;
321  int nb_tags;
322 
323  zeKStatsUpdate(&data->mksize.html, (double) size);
324 
325  data->nb_text_html++;
326 
327  simple_text = FALSE;
328 
329  if (mime_part->encode == MIME_ENCODE_BASE64)
330  data->nb_text_html_base64++;
331 
332  data->sz_text_html += mime_part->size;
333 
 /* Defaults; len_clean is replaced further down by the cleaned length
  * when the oracle is enabled and text_word_length() succeeds */
334  data->html.len_clean = mime_part->size;
335  data->html.len_raw = mime_part->size;
336 
337  /*
338  * URL BL
339  */
340  if (data->scores.do_urlbl)
341  urlbl_score = check_rurlbl(id, data->ip, buf);
342 
343  /*
344  * clean HTML code and handle it
345  */
346  {
347  char *html_clean = NULL;
348 
349  html_clean = cleanup_html_buffer(buf, size + 1);
 /* NOTE(review): html_clean is handed to convert_8to7() before it is
  * tested against NULL on the next line */
350  convert_8to7(html_clean, FALSE);
351  if (html_clean != NULL) {
352  real_size = strlen(html_clean);
353 
354  ZE_MessageInfo(19, "Checking HTML clean");
 /* Regex checks are skipped once the current score has reached
  * data->content_max_score */
355  if (data->scores.do_regex
356  && (CURRENT_SCORE() < data->content_max_score)) {
357  clean_score = check_regex(id, data->ip, html_clean, MAIL_BODY);
358 
 /* Second attempt on a further cleaned copy; the higher of the two
  * scores is kept */
359  if (CURRENT_SCORE() < data->content_max_score) {
360  char *x = NULL;
361 
362  x = realcleanup_text_buf(html_clean, strlen(html_clean));
363  if (x != NULL) {
364  int sc = 0;
365 
366  sc = check_regex(id, data->ip, x, MAIL_BODY);
367  if (sc > clean_score)
368  clean_score = sc;
369  }
370  FREE(x);
371  }
372  }
373 
 /* Word-length statistics of the cleaned HTML, for the oracle.
  * NOTE(review): the declaration of `st` is missing from this listing */
374  if (data->scores.do_oracle) {
376 
377  if (text_word_length(html_clean, &st, strlen(html_clean))) {
378  data->html.st_wlen = st;
379  data->html.len_clean = strlen(html_clean);
380  }
381  }
382  }
383 #if _FFR_MSG_ENTROPY == 1
 /* Entropy of the cleaned HTML, or of the raw buffer when cleanup failed */
384  if (data->spam_oracle) {
385  if (html_clean != NULL) {
386  if (strlen(html_clean) < ENTROPY_BUF_SIZE)
387  text_buffer_entropy(html_clean, strlen(html_clean) + 1,
388  &data->html.h0, &data->html.h1, &data->html.h2);
389  } else {
390  if (size < ENTROPY_BUF_SIZE)
391  text_buffer_entropy(buf, size,
392  &data->html.h0, &data->html.h1, &data->html.h2);
393  }
394  }
395 #endif
396  FREE(html_clean);
397  }
398 
399 
400  if (data->scores.do_oracle) {
401  int n;
402 
403  /*
404  * XXX 100000 ??? JOE
405  */
 /* Tag validity is only checked on parts of at most 100000 bytes */
406  if (size <= 100000)
407  data->html_invalid_tags += check_valid_html_tags(id, buf);
408 
409  if ((n = check_unwanted_html_tags(id, buf, &data->best)) > 0)
410  data->html_unwanted_tags = n;
411 
412  nb_tags = count_html_tags(buf);
413 
 /* High tag ratio: more than one tag per 5 characters of cleaned text.
  * NOTE(review): int compared with size_t; real_size is 0 when the
  * cleanup above returned NULL, so any tag then counts as a high ratio */
414  if (nb_tags * 5 > real_size)
415  data->html_high_tag_ratio++;
416 
417  {
418  /*
419  ** CLEAN buf from HTML codings
420  */
421  }
422  }
423  }
424 
425  /*
426  **
427  ** text/plain
428  **
429  */
430  if (strcasecmp("text/plain", mime_part->mime) == 0) {
431  zeKStatsUpdate(&data->mksize.plain, (double) size);
432 
433  data->nb_text_plain++;
434 
435  simple_text = FALSE;
436 
437  if (data->scores.do_oracle) {
438  {
439  char *pc = buf;
 /* Number of alphanumeric characters in the part */
440  int sz = 0;
441 
 /* NOTE(review): isalnum() receives a plain char */
442  for (pc = buf; pc != NULL && *pc != '\0'; pc++)
443  if (isalnum(*pc))
444  sz++;
445 
446  /*
447  * report text/plain parts with fewer than 80 alphanumeric characters
448  */
 /* NOTE(review): `size` is a size_t but is formatted with %d */
449  ZE_MessageInfo(19, "PLAIN TEXT size %d", size);
450  if (sz >= 0 && sz < 80) {
454  "%s SPAM CHECK - P%02d text/plain part too short : %d",
455  id, SPAM_PLAIN_TOO_SHORT, sz);
456  }
457  }
458 
 /* text/plain without charset. NOTE(review): mime_part was already
  * dereferenced above, and mime_part->charset is assumed non-NULL */
459  if ((mime_part != NULL) && (strlen(mime_part->charset) == 0)) {
463  "%s SPAM CHECK - P%02d text/plain w/o charset", id,
465  }
466 
 /* Keep the statistics of the largest plain part seen so far.
  * NOTE(review): the declaration of `st` is missing from this listing */
467  if (size > data->plain.len_raw) {
469 
470  data->plain.len_raw = size;
471  data->plain.len_clean = size;
472 
473  if (text_word_length(buf, &st, strlen(buf))) {
474  data->plain.st_wlen = st;
475  data->plain.len_clean = strlen(buf);
476  }
477 #if _FFR_MSG_ENTROPY == 1
478  if (size < ENTROPY_BUF_SIZE)
479  text_buffer_entropy(buf, size,
480  &data->plain.h0, &data->plain.h1,
481  &data->plain.h2);
482 #endif
483  }
484 
485  if (mime_part->encode == MIME_ENCODE_BASE64)
486  data->nb_text_plain_base64++;
487  data->sz_text_plain += mime_part->size;
488 
 /* Part made only of blanks, tabs and line ends */
489  if (strspn(buf, " \n\r\t") == size)
490  data->nb_text_plain_empty++;
491  }
492 
493  /*
494  * URL BL
495  */
496  if (data->scores.do_urlbl)
497  urlbl_score = check_rurlbl(id, data->ip, buf);
498 
 /* Regex check on a cleaned-up copy of the text */
499  if (data->scores.do_regex && CURRENT_SCORE() < data->content_max_score) {
500  char *x = NULL;
501 
502  x = realcleanup_text_buf(buf, strlen(buf));
503 
504  if (x != NULL) {
505  ZE_MessageInfo(30, "BUF ...%s", x);
506  clean_score = check_regex(id, data->ip, x, MAIL_BODY);
507  FREE(x);
508  }
509  }
510  }
511 
512  /*
513  **
514  ** Text message without mime definition
515  **
516  */
 /* Reached for text parts whose mime type is neither text/html nor
  * text/plain; handled like text/plain, minus the short-part, charset and
  * base64 checks */
517  if (simple_text) {
518  zeKStatsUpdate(&data->mksize.simple, (double) size);
519 
520  if (data->scores.do_oracle) {
 /* NOTE(review): the declaration of `st` is missing from this listing */
521  if (size > data->plain.len_raw) {
523 
524  data->plain.len_raw = size;
525  data->plain.len_clean = size;
526 
527  if (text_word_length(buf, &st, strlen(buf))) {
528  data->plain.st_wlen = st;
529  data->plain.len_clean = strlen(buf);
530  }
531 #if _FFR_MSG_ENTROPY == 1
532  if ((size < ENTROPY_BUF_SIZE) && (size > 16))
533  text_buffer_entropy(buf, size,
534  &data->plain.h0, &data->plain.h1,
535  &data->plain.h2);
536 #endif
537  }
538 
539  if (strspn(buf, " \n\r\t") == size)
540  data->nb_text_simple_empty++;
541  }
542 
543  /*
544  * URL BL
545  */
546  if (data->scores.do_urlbl)
547  urlbl_score = check_rurlbl(id, data->ip, buf);
548 
549  if (data->scores.do_regex && CURRENT_SCORE() < data->content_max_score) {
550  char *x = NULL;
551 
552  x = realcleanup_text_buf(buf, strlen(buf));
553 
554  if (x != NULL) {
555  ZE_MessageInfo(30, "BUF ...%s", x);
556  clean_score = check_regex(id, data->ip, x, MAIL_BODY);
557  FREE(x);
558  }
559  }
560  }
561 
562  /*
563  * Checking special usual spam expressions
564  */
565  data->msg_bad_expressions += check_unwanted_expressions(id, buf, &data->best);
566 
 /* Regex check on the raw (not cleaned) text of the part */
567  ZE_MessageInfo(19, "Checking HTML raw and TEXT");
568  if (data->scores.do_regex && (CURRENT_SCORE() < data->content_max_score)) {
569  raw_score = check_regex(id, data->ip, buf, MAIL_BODY);
570  }
571 
 /* Merge the scores of this part into the running message scores */
572  data->scores.body = REGEX_SCORE();
573  data->scores.urlbl = URLBL_SCORE();
574 
575  return TRUE;
576 }
577 
578 /* ****************************************************************************
579  * *
580  * *
581  **************************************************************************** */
/*
 * scan_body_contents - run the header checks and the per-MIME-part body
 * checks on a spooled message, and fill `data` with the resulting flags and
 * scores.
 *
 * id      : log prefix for all messages
 * ip      : stored in data->ip and passed to oracle_compute_score()
 * fname   : spool file name; the function returns 0 when it is NULL
 * maxsize : regex checks are turned off when the file is larger than
 *           maxsize, and MIME decoding is skipped unless the file is
 *           smaller than 8 * maxsize
 * data    : accumulator; dereferenced without a NULL check
 * flags   : not referenced in the visible code
 * scores  : when non-NULL, copied into data->scores on entry and updated
 *           from data->scores before returning
 *
 * Returns data->scores.body + data->scores.urlbl.
 *
 * NOTE(review): this listing has gaps (SET_BIT / ZE_MessageInfo openers and a
 * few other statements are missing), so some calls below appear without
 * their opening line.
 */
582 int
583 scan_body_contents(id, ip, fname, maxsize, data, flags, scores)
584  char *id;
585  char *ip;
586  char *fname;
587  size_t maxsize;
588  spamchk_T *data;
589  msg_flags_T *flags;
590  msg_scores_T *scores;
591 {
592  size_t size;
593  uint32_t mime_flags = 0;
594 
595  if (fname == NULL)
596  return 0;
597 
598  size = zeGetFileSize(fname);
 /* NOTE(review): `size` is a size_t, so this comparison is never true; a
  * negative error value from zeGetFileSize() would not be caught here */
599  if (size < 0)
600  return 0;
601 
602  data->spool_size = size;
603  data->max_spool_size = maxsize;
604 
605  if (scores != NULL)
606  data->scores = *scores;
607 
 /* Regex checks stay enabled only for files not larger than maxsize */
608  if (data->scores.do_regex)
609  data->scores.do_regex = (size <= maxsize);
610 
612  data->ip = ip;
613 
614  bestof_init(&data->best, 4, NULL);
615 
 /* Reset the per-category part size statistics */
616  zeKStatsReset(&data->mksize.plain);
618  zeKStatsReset(&data->mksize.html);
620  zeKStatsReset(&data->mksize.simple);
621  zeKStatsReset(&data->mksize.other);
622  zeKStatsReset(&data->mksize.attach);
623 
624  /*
625  **
626  ** ORACLE - pre processing
627  **
628  */
 /* Date header check; runs even when the oracle is disabled.
  * NOTE(review): this local `flags` shadows the function parameter */
629  {
630  uint32_t flags = 0;
631 
632  if ((flags = check_header_date(id, data->hdrs)) != 0)
633  data->flags.msg |= flags;
634  }
635 
636  if (data->scores.do_oracle) {
637  header_T *h;
638 
 /* Content-Type: unwanted charset, then boundary checks for
  * multipart/alternative and multipart/mixed */
639  if ((h = get_msgheader(data->hdrs, "Content-type")) != NULL) {
640  char value[256];
641 
642  ZE_MessageInfo(15, "%s : got content-type header", id);
643 
644  memset(value, 0, sizeof (value));
645  if (get_msgheader_attribute(h, "charset", value, sizeof (value))
646  && (strlen(value) > 0)) {
647  if (check_unwanted_charset(id, value, &data->best)) {
651  "%s SPAM CHECK - M%02d unwanted charset : %s", id,
653  }
654  }
655 
656  if ((h->value != NULL)
657  && zeStrRegex(h->value, "multipart/(alternative|mixed)", NULL, NULL,
658  TRUE)) {
659  if (get_msgheader_attribute(h, "boundary", value, sizeof (value))) {
660  ZE_MessageInfo(19, "Boundary : %s", value);
661  if (check_unwanted_boundary(id, value, &data->best)) {
663 
666  "%s SPAM CHECK - M%02d unwanted boundary (%s)",
667  id, SPAM_MSG_UNWANTED_BOUNDARY, value);
668  }
669  } else {
 /* Multipart message for which no boundary attribute was obtained */
673  "%s SPAM CHECK - M%02d short boundary boundary=(%s) len=(%d)",
674  id, SPAM_MSG_UNWANTED_BOUNDARY, "(NULL)", 0);
675  }
676  }
677 
678  }
679 
 /* X-Mailer and User-Agent go through the same unwanted mailer check */
680  if ((h = get_msgheader(data->hdrs, "X-Mailer")) != NULL) {
681  char *mailer = STRNULL(h->value, "");
682 
683  if (check_unwanted_mailer(id, mailer, &data->best)) {
687  "%s SPAM CHECK - M%02d unwanted mailer (%s)",
688  id, SPAM_MSG_UNWANTED_MAILER, mailer);
689  }
690  }
691  if ((h = get_msgheader(data->hdrs, "User-Agent")) != NULL) {
692  char *mailer = STRNULL(h->value, "");
693 
694  if (check_unwanted_mailer(id, mailer, &data->best)) {
698  "%s SPAM CHECK - M%02d unwanted mailer (%s)",
699  id, SPAM_MSG_UNWANTED_MAILER, mailer);
700  }
701  }
702 
703  /*
704  * check if Subject is all HI CAPS
705  */
 /* Walks every Subject header: nm = a lower case letter was seen,
  * na = an alphabetic character was seen. */
706  for (h = data->hdrs; h != NULL && (h = get_msgheader(h, "Subject")) != NULL;
707  h = h->next) {
708  bool nm = FALSE;
709  bool na = FALSE;
710  char *p = h->value;
711 
 /* NOTE(review): h->value is passed to strlen() without a NULL check,
  * unlike the other header tests in this function */
712  if (strlen(h->value) == 0)
713  continue;
714 
 /* NOTE(review): islower()/isalpha() receive a plain char */
715  for (p = h->value; p != NULL && *p != '\0'; p++) {
716  if (islower(*p))
717  nm = TRUE;
718  if (isalpha(*p))
719  na = TRUE;
720  }
721  if (!nm) {
725  "%s SPAM CHECK - M%02d Subject doesn't contains lower case chars : %s",
727  }
728  if (!na) {
732  "%s SPAM CHECK - M%02d Subject doesn't contains alpha chars : %s",
734  }
735  }
736 
737  /*
738  ** RFC 2822 headers compliance
739  */
 /* NOTE(review): the statement that assigns data->nb_rfc2822_hdrs_errors
  * is missing from this listing */
741  if (data->nb_rfc2822_hdrs_errors > 0) {
744  ZE_MessageInfo(LOG_LEVEL, "%s SPAM CHECK - M%02d RFC2822 headers", id,
746  }
747 
748  if (TRUE) {
749  int n;
750 
751  n = check_rfc2822_headers_syntax(id, data->hdrs);
752  if (n > 0) {
755  ZE_MessageInfo(LOG_LEVEL, "%s SPAM CHECK - M%02d Headers syntax :",
757  }
758  data->headers_syntax_errors = n;
759  }
760 
761  /*
762  ** Base 64 encoded message
763  */
764  if ((h = get_msgheader(data->hdrs, "Content-Transfer-Encoding")) != NULL) {
765  if ((h->value != NULL)
766  && zeStrRegex(h->value, "base64", NULL, NULL, TRUE)) {
769  ZE_MessageInfo(LOG_LEVEL, "%s SPAM CHECK - M%02d B64 encoded message",
770  id, SPAM_MSG_BASE64);
771  }
772  }
773 
774  /*
775  ** Base 64 encoded Subject (RFC 2047 "B" encoded-word at the start)
776  */
777  if ((h = get_msgheader(data->hdrs, "Subject")) != NULL) {
778  if ((h->value != NULL) &&
779  zeStrRegex(h->value, "^=[?].*[?][bB][?].*[?]=", NULL, NULL, TRUE)) {
782  ZE_MessageInfo(LOG_LEVEL, "%s SPAM CHECK - M%02d B64 encoded Subject",
784  }
785  }
786  }
787 
788  /*
789  **
790  ** Message body content checking
791  **
792  */
793 
794  /*
795  * N = ??? XXX
796  */
 /* The file is MIME-decoded only when smaller than 8 * maxsize;
  * check_mime_part() is called for each part with `data` as its argument,
  * and decoder flags are collected in mime_flags */
797  if (size < 8 * maxsize)
798  decode_mime_file(id, fname, &mime_flags, check_mime_part, (void *) data);
799 
800  /*
801  **
802  ** ORACLE - post processing
803  **
804  */
 /* Turns the counters gathered by check_mime_part() into flag bits */
805  if (data->scores.do_oracle) {
806 
807  /*
808  ** message checking
809  */
810  if ((data->nb_part > 1) && (data->nb_text == 0)) {
811  SET_BIT(data->flags.msg, SPAM_MSG_NO_TEXT_PART);
814  "%s SPAM CHECK - M%02d No HTML nor TEXT parts : Total = %d",
815  id, SPAM_MSG_NO_TEXT_PART, data->nb_part);
816  }
817 
818  /*
819  * compare sizes
820  */
821 #if 1
822  {
 /* NOTE(review): empty statement - this test has no effect */
823  if (zeKMean(&data->mksize.plain) < 1.);
824  }
825 #endif
826 
827  if (data->msg_bad_expressions > 0) {
828  SET_BIT(data->flags.msg, SPAM_MSG_BAD_EXPRESSIONS);
831  "%s SPAM CHECK - M%02d BAD EXPRESSIONS : %d", id,
832  SPAM_MSG_BAD_EXPRESSIONS, data->msg_bad_expressions);
833  }
834 
835  if ((data->nb_text_html > 0) && (data->nb_text_html > data->nb_text_plain)) {
836  SET_BIT(data->flags.msg, SPAM_MSG_TOO_MUCH_HTML);
839  "%s SPAM CHECK - M%02d NB HTML > PLAIN : %d %d", id,
840  SPAM_MSG_TOO_MUCH_HTML, data->nb_text_html,
841  data->nb_text_plain);
842  }
843 
844  /*
845  * correlate text/plain vs text/html parts
846  */
847  if (TRUE) {
 /* w*: word-length statistics vectors; h*: entropy vectors */
848  double whtml[5], wplain[5];
849  double hhtml[5], hplain[5];
850  double vcoef, lcoef;
851 
852  memset(whtml, 0, sizeof (whtml));
853  memset(hhtml, 0, sizeof (hhtml));
854  memset(wplain, 0, sizeof (wplain));
855  memset(hplain, 0, sizeof (hplain));
856  if ((data->nb_text_html > 0) && (data->nb_text_plain > 0)) {
857  bool ldiff = FALSE;
 /* ko: a mismatch was already reported, so it is reported at most once */
858  bool ko = FALSE;
859 
 /* Vector = (min, max, mean, std dev of word length, cleaned length) */
860  whtml[0] = zeKMin(&data->html.st_wlen);
861  whtml[1] = zeKMax(&data->html.st_wlen);
862  whtml[2] = zeKMean(&data->html.st_wlen);
863  whtml[3] = zeKStdDev(&data->html.st_wlen);
864  whtml[4] = (double) data->html.len_clean;
865 
866  wplain[0] = zeKMin(&data->plain.st_wlen);
867  wplain[1] = zeKMax(&data->plain.st_wlen);
868  wplain[2] = zeKMean(&data->plain.st_wlen);
869  wplain[3] = zeKStdDev(&data->plain.st_wlen);
870  wplain[4] = (double) data->plain.len_clean;
871 
872  vcoef = vector_compare(whtml, wplain, 5);
873 
 /* lcoef: html / plain cleaned length ratio; 1000 when the plain
  * length is below 1 */
874  if (wplain[4] >= 1.)
875  lcoef = whtml[4] / wplain[4];
876  else
877  lcoef = 1000.;
878 
879  if (data->html.len_clean > 2500) {
880  if (abs(data->html.len_clean - data->plain.len_clean) > 500)
881  ldiff = TRUE;
882  } else {
883  if (abs(data->html.len_clean - data->plain.len_clean) > 1000)
884  ldiff = TRUE;
885  }
 /* NOTE(review): ldiff is forced back to FALSE here, so the length
  * test above has no effect and the `!ko && ldiff` branch below is
  * never taken */
886  ldiff = FALSE;
887 
 /* Mismatch when vcoef lies strictly between 0.1 and 0.9 */
888  if (!ko && (vcoef < 0.9) && (vcoef > 0.1)) {
889  ko = TRUE;
890 
891  SET_BIT(data->flags.msg, SPAM_MSG_MATCH_MIME_PARTS);
894  "%s SPAM CHECK - M%02d HTML/PLAIN parts don't match vcoef=(%7.3f) lcoef=(%7.3f) (vcoef)",
895  id, SPAM_MSG_MATCH_MIME_PARTS, vcoef, lcoef);
896  }
897 
 /* Mismatch when the length ratio is above 5 or below 0.85 */
898  if (!ko && ((lcoef > 5) || (lcoef < 0.85))) {
899  ko = TRUE;
900 
901  SET_BIT(data->flags.msg, SPAM_MSG_MATCH_MIME_PARTS);
904  "%s SPAM CHECK - M%02d HTML/PLAIN parts don't match vcoef=(%7.3f) lcoef=(%7.3f) (lcoef)",
905  id, SPAM_MSG_MATCH_MIME_PARTS, vcoef, lcoef);
906  }
907 
908  if (!ko && ldiff) {
909  ko = TRUE;
910  /*
911  * JOE 2007 Aug 20
912  */
913 #if 1
914  SET_BIT(data->flags.msg, SPAM_MSG_MATCH_MIME_PARTS);
915 #endif
918  "%s SPAM CHECK - M%02d HTML/PLAIN parts don't match HTML(%6d)/PLAIN(%6d) (ldiff)",
920  data->html.len_clean, data->plain.len_clean);
921  }
922 #if _FFR_MSG_ENTROPY == 1
 /* Entropy comparison: vcoef is recomputed but not used afterwards in
  * the visible code */
923  hhtml[0] = data->html.h0;
924  hhtml[1] = data->html.h1;
925  hhtml[2] = data->html.h2;
926  hhtml[3] = (double) data->html.len_clean;
927 
928  hplain[0] = data->plain.h0;
929  hplain[1] = data->plain.h1;
930  hplain[2] = data->plain.h2;
931  hplain[3] = (double) data->plain.len_clean;
932 
933  vcoef = vector_compare(hhtml, hplain, 4);
934 #if 0
935  if (wplain[4] >= 1.0)
936  lcoef = whtml[3] / wplain[3];
937  else
938  lcoef = 1000.;
939 #endif
940  /*
941  * compare entropies... XXX JOE
942  */
943 #endif
944  }
945  }
946 
947  /*
948  ** MIME decode errors
949  */
 /* Number of bits set in the flag word filled by decode_mime_file() */
950  data->mime_errors = count_uint32bits(mime_flags);
951  if (data->mime_errors > 0) {
952  int i;
953  char sout[64];
954 
955  SET_BIT(data->flags.msg, SPAM_MSG_MIME_ERRORS);
958  "%s SPAM CHECK - M%02d MIME decode errors : %d",
959  id, SPAM_MSG_MIME_ERRORS, data->mime_errors);
960  memset(sout, 0, sizeof (sout));
961 
 /* 32-character map of the flag word: last digit of the bit index for
  * a set bit, '.' otherwise; the memset above keeps it terminated */
962  for (i = 0; i < 32; i++)
963  sout[i] = GET_BIT(mime_flags, i) ? ('0' + (i % 10)) : '.';
964 
967  "%s SPAM CHECK - M%02d MIME errors %s",
968  id, SPAM_MSG_MIME_ERRORS, sout);
969  }
970 
971  /*
972  ** text/plain checking
973  */
974  if (data->nb_text_plain_base64 > 0) {
975  SET_BIT(data->flags.plain, SPAM_PLAIN_BASE64);
978  "%s SPAM CHECK - P%02d text/plain encoded base64 : %d",
979  id, SPAM_PLAIN_BASE64, data->nb_text_plain_base64);
980  }
981 
 /* Flagged only when every text/plain part of the message is empty */
982  if ((data->nb_text_plain_empty > 0)
983  && (data->nb_text_plain == data->nb_text_plain_empty)) {
984  SET_BIT(data->flags.plain, SPAM_PLAIN_EMPTY);
987  "%s SPAM CHECK - P%02d text/plain empty : %d", id,
988  SPAM_PLAIN_EMPTY, data->nb_text_plain_empty);
989  }
990 
991  /*
992  ** text/html checking
993  */
994  if (data->nb_text_html_base64 > 0) {
995  SET_BIT(data->flags.html, SPAM_HTML_BASE64);
998  "%s SPAM CHECK - H%02d text/html encoded base64 : %d",
999  id, SPAM_HTML_BASE64, data->nb_text_html_base64);
1000  }
1001 
1002  /*
1003  * check cleaned up html part size
1004  */
1005  if (data->html.len_clean > 0) {
1006  bool chk = FALSE;
 /* raw / cleaned HTML length ratio */
1007  double r2c = 1.;
1008 
1009  if (data->html.len_clean > 0)
1010  r2c = ((double) data->html.len_raw) / ((double) data->html.len_clean);
1011 
 /* Too short: cleaned HTML under 100 characters and either no plain
  * text at all or a raw/clean ratio of at least 2 */
1012  if (data->html.len_clean > 0 && data->html.len_clean < 100) {
1013  if ((data->plain.len_clean == 0) || (r2c >= 2.)) {
1014  chk = TRUE;
1015  }
1016  }
1017 
1018  if (chk) {
1019  SET_BIT(data->flags.html, SPAM_HTML_CLEAN_TOO_SHORT);
1020  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1022  "%s SPAM CHECK - H%02d HTML cleaned up too short : %d",
1023  id, SPAM_HTML_CLEAN_TOO_SHORT, data->html.len_clean);
1024  }
1025  }
1026 
1027  if (data->html_high_tag_ratio > 0) {
1028  SET_BIT(data->flags.html, SPAM_HTML_TAGS_RATIO);
1029  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1031  "%s SPAM CHECK - H%02d HTML tag/text ratio : %d", id,
1032  SPAM_HTML_TAGS_RATIO, data->html_high_tag_ratio);
1033  }
1034 
1035  if (data->html_unwanted_tags > 0) {
1036  SET_BIT(data->flags.html, SPAM_HTML_UNWANTED_TAGS);
1037  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1039  "%s SPAM CHECK - H%02d HTML with unwanted tags : %d", id,
1040  SPAM_HTML_UNWANTED_TAGS, data->html_unwanted_tags);
1041  }
1042 
1043  if (data->html_invalid_tags > 0) {
1044  SET_BIT(data->flags.html, SPAM_HTML_INVALID_TAGS);
1045  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1047  "%s SPAM CHECK - H%02d HTML with invalid tags : %d", id,
1048  SPAM_HTML_INVALID_TAGS, data->html_invalid_tags);
1049  }
1050 
1051  }
1052 
1053  /*
1054  ** Evaluate Oracle Score
1055  */
1056  if (data->scores.do_oracle) {
1057  ZE_MessageInfo(12, "Avant oracle_compute_score");
1058  data->scores.oracle = oracle_compute_score(id, ip, data);
1059  ZE_MessageInfo(12, "Apres oracle_compute_score");
1060  }
1061 
 /* Hand the updated scores back to the caller */
1062  if (scores != NULL)
1063  *scores = data->scores;
1064 
1065  return data->scores.body + data->scores.urlbl;
1066 }
1067 
1068 /* ****************************************************************************
1069  * *
1070  * *
1071  **************************************************************************** */
1072 int
1074  char *id;
1075  header_T *head;
1076 {
1077  int nerr = 0;
1078  int r;
1079  char *s;
1080  int nb = 0;
1081 
1082  bool mime_ct, mime_cd, mime_cte, mime_vers;
1083 
1084  mime_ct = mime_cd = mime_cte = mime_vers = FALSE;
1085 
1086  s = "Date";
1087  if ((r = count_msgheader_attr(head, s)) != 1) {
1088  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1089  ZE_MessageInfo(10,
1090  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (1,1)",
1091  id, s, r);
1092  nerr++;
1093  }
1094 
1095  s = "From";
1096  if ((r = count_msgheader_attr(head, s)) != 1) {
1097  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1098  ZE_MessageInfo(10,
1099  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (1,1)",
1100  id, s, r);
1101  nerr++;
1102  }
1103 
1104  s = "Sender";
1105  if ((r = count_msgheader_attr(head, s)) > 1) {
1106  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1107  ZE_MessageInfo(10,
1108  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1109  id, s, r);
1110  nerr++;
1111  }
1112 
1113  s = "Reply-To";
1114  if ((r = count_msgheader_attr(head, s)) > 1) {
1115  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1116  ZE_MessageInfo(10,
1117  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1118  id, s, r);
1119  nerr++;
1120  }
1121 
1122  s = "To";
1123  if ((r = count_msgheader_attr(head, s)) > 1) {
1124  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1125  ZE_MessageInfo(10,
1126  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1127  id, s, r);
1128  nerr++;
1129  }
1130  nb = r;
1131 
1132  s = "Cc";
1133  if ((r = count_msgheader_attr(head, s)) > 1) {
1134  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1135  ZE_MessageInfo(10,
1136  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1137  id, s, r);
1138  nerr++;
1139  }
1140  nb += r;
1141 
1142  s = "Bcc";
1143  if ((r = count_msgheader_attr(head, s)) > 1) {
1144  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1145  ZE_MessageInfo(10,
1146  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1147  id, s, r);
1148  nerr++;
1149  }
1150  nb += r;
1151 
1152  if (nb == 0) {
1153  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1154  ZE_MessageInfo(10,
1155  "%s SPAM CHECK - MSG RFC2822 HDRS count : No To nor Cc nor Bcc",
1156  id);
1157  nerr++;
1158  }
1159 
1160  s = "Message-ID";
1161  if ((r = count_msgheader_attr(head, s)) > 1) {
1162  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1163  ZE_MessageInfo(10,
1164  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1165  id, s, r);
1166  nerr++;
1167  }
1168  if (r == 0) {
1169  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1170  ZE_MessageInfo(10,
1171  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1172  id, s, r);
1173  nerr++;
1174  }
1175 
1176  s = "In-Reply-To";
1177  if ((r = count_msgheader_attr(head, s)) > 1) {
1178  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1179  ZE_MessageInfo(10,
1180  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1181  id, s, r);
1182  nerr++;
1183  }
1184 
1185  s = "References";
1186  if ((r = count_msgheader_attr(head, s)) > 1) {
1187  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1188  ZE_MessageInfo(10,
1189  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1190  id, s, r);
1191  nerr++;
1192  }
1193 
1194  s = "Subject";
1195  if ((r = count_msgheader_attr(head, s)) > 1) {
1196  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1197  ZE_MessageInfo(10,
1198  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1199  id, s, r);
1200  nerr++;
1201  }
1202  nb = r;
1203  if (nb == 0) {
1204  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1205  ZE_MessageInfo(10, "%s SPAM CHECK - MSG RFC2822 HDRS count : No Subject",
1206  id);
1207  nerr++;
1208  }
1209 
1210  s = "MIME-Version";
1211  if ((r = count_msgheader_attr(head, s)) > 1) {
1212  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1213  ZE_MessageInfo(10,
1214  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1215  id, s, r);
1216  nerr++;
1217  }
1218  mime_vers = (r > 0);
1219 
1220  s = "Content-Type";
1221  if ((r = count_msgheader_attr(head, s)) > 1) {
1222  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1223  ZE_MessageInfo(10,
1224  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1225  id, s, r);
1226  nerr++;
1227  }
1228  mime_ct = (r > 0);
1229 
1230  s = "Content-Disposition";
1231  if ((r = count_msgheader_attr(head, s)) > 1) {
1232  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1233  ZE_MessageInfo(10,
1234  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1235  id, s, r);
1236  nerr++;
1237  }
1238  mime_cd = (r > 0);
1239 
1240  s = "Content-Transfer-Encoding";
1241  if ((r = count_msgheader_attr(head, s)) > 1) {
1242 #if 0
1243  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1244  ZE_MessageInfo(10,
1245  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1246  id, s, r);
1247  nerr++;
1248 #endif
1249  }
1250  mime_cte = (r > 0);
1251 
1252  /*
1253  * XXX - check mime coherence
1254  */
1255  if ((mime_ct && !mime_vers) || (mime_cd && !mime_vers)
1256  || (mime_cte && !mime_vers)) {
1257  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1258  ZE_MessageInfo(10,
1259  "%s SPAM CHECK - MSG RFC2045 HDRS : MIME=(%d) CT=(%d) CD=(%d) CTE=(%d)",
1260  id, mime_vers, mime_ct, mime_cd, mime_cte, r);
1261  nerr++;
1262  }
1263 
1264  s = "X-Mailer";
1265  if ((r = count_msgheader_attr(head, s)) > 1) {
1266  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1267  ZE_MessageInfo(10,
1268  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1269  id, s, r);
1270  nerr++;
1271  }
1272 
1273  s = "User-Agent";
1274  if ((r = count_msgheader_attr(head, s)) > 1) {
1275  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1276  ZE_MessageInfo(10,
1277  "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1278  id, s, r);
1279  nerr++;
1280  }
1281 
1282  if (nerr > 5)
1283  nerr = 5;
1284 
1285  return nerr;
1286 }
1287 
1288 /* ****************************************************************************
1289  * *
1290  * *
1291  **************************************************************************** */
1292 int
1294  char *id;
1295  header_T *head;
1296 {
1297  int nerr = 0;
1298  char *s;
1299  header_T *h = NULL;
1300 
1301  s = "Date";
1302  h = head;
1303  while ((h = get_msgheader_next(h, s)) != NULL) {
1304  char *p;
1305  long pi, pf;
1306  bool ok = TRUE;
1307 
1308  if (h->value == NULL)
1309  continue;
1310 
1311  if ((strlen(h->value) == 0)
1312  || (strspn(h->value, " \t") == strlen(h->value))) {
1313  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1314  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1315  nerr++;
1316  continue;
1317  }
1318 
1319  p = h->value;
1320  s = "(Mon|Tue|Wed|Thu|Fri|Sat|Sun)";
1321  if (zeStrRegex(p, s, &pi, &pf, TRUE))
1322  p += pf;
1323 
1324  s =
1325  "[0-9]{1,2} (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec){1,1} [0-9]{4,4}";
1326  if (!zeStrRegex(p, s, &pi, &pf, TRUE)) {
1327  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1328  ZE_MessageInfo(10,
1329  "%s SPAM CHECK - MSG HDRS SYNTAX : Date : dd Mmm Yyyy : %s",
1330  id, h->value);
1331  ok = FALSE;
1332  } else
1333  p += pf;
1334 
1335  s = "[0-9]{2,2}:[0-9]{2,2}:[0-9]{2,2}";
1336  if (!zeStrRegex(p, s, &pi, &pf, TRUE)) {
1337  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1338  ZE_MessageInfo(10,
1339  "%s SPAM CHECK - MSG HDRS SYNTAX : Date : HH:MM:SS : %s",
1340  id, h->value);
1341  ok = FALSE;
1342  } else
1343  p += pf;
1344 
1345 #if 0
1346  if (!ok)
1347  nerr++;
1348 #endif
1349  }
1350 
1351  s = "From";
1352  h = head;
1353  while ((h = get_msgheader_next(h, s)) != NULL) {
1354  if (h->value == NULL)
1355  continue;
1356  if ((strlen(h->value) == 0)
1357  || (strspn(h->value, " \t") == strlen(h->value))) {
1358  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1359  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1360  nerr++;
1361  }
1362  }
1363 
1364  s = "Sender";
1365  h = head;
1366  while ((h = get_msgheader_next(h, s)) != NULL) {
1367  if (h->value == NULL)
1368  continue;
1369  if ((strlen(h->value) == 0)
1370  || (strspn(h->value, " \t") == strlen(h->value))) {
1371  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1372  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1373 #if 0
1374  nerr++;
1375 #endif
1376  }
1377  }
1378 
1379  s = "Reply-To";
1380  h = head;
1381  while ((h = get_msgheader_next(h, s)) != NULL) {
1382  if (h->value == NULL)
1383  continue;
1384  if ((strlen(h->value) == 0)
1385  || (strspn(h->value, " \t") == strlen(h->value))) {
1386  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1387  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1388 #if 0
1389  nerr++;
1390 #endif
1391  }
1392  }
1393 
1394  s = "To";
1395  h = head;
1396  while ((h = get_msgheader_next(h, s)) != NULL) {
1397  if (h->value == NULL)
1398  continue;
1399  if ((strlen(h->value) == 0)
1400  || (strspn(h->value, " \t") == strlen(h->value))) {
1401  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1402  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1403  nerr++;
1404  }
1405  }
1406 
1407  s = "Cc";
1408  h = head;
1409  while ((h = get_msgheader_next(h, s)) != NULL) {
1410  if (h->value == NULL)
1411  continue;
1412  if ((strlen(h->value) == 0)
1413  || (strspn(h->value, " \t") == strlen(h->value))) {
1414  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1415  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1416 #if 0
1417  nerr++;
1418 #endif
1419  }
1420  }
1421 
1422  s = "Bcc";
1423  h = head;
1424  while ((h = get_msgheader_next(h, s)) != NULL) {
1425  if (h->value == NULL)
1426  continue;
1427  if ((strlen(h->value) == 0)
1428  || (strspn(h->value, " \t") == strlen(h->value))) {
1429  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1430  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1431  nerr++;
1432  }
1433  }
1434 
1435  s = "Message-ID";
1436  h = head;
1437  while ((h = get_msgheader_next(h, s)) != NULL) {
1438  if (h->value == NULL)
1439  continue;
1440  if (!zeStrRegex(h->value, "<[^>@]+@[^>]+>", NULL, NULL, TRUE)) {
1441  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1442  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %-12s", id, s);
1443  nerr++;
1444  }
1445  }
1446 
1447  s = "In-Reply-To";
1448  h = head;
1449  while ((h = get_msgheader_next(h, s)) != NULL) {
1450  if (h->value == NULL)
1451  continue;
1452  if (!zeStrRegex(h->value, "<[^>@]+@[^>]+>", NULL, NULL, TRUE)) {
1453  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1454  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1455 #if 0
1456  nerr++;
1457 #endif
1458  }
1459  }
1460 
1461  s = "References";
1462  h = head;
1463  while ((h = get_msgheader_next(h, s)) != NULL) {
1464  if (h->value == NULL)
1465  continue;
1466  if (!zeStrRegex(h->value, "<[^>@]+@[^>]+>", NULL, NULL, TRUE)) {
1467  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1468  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1469  nerr++;
1470  }
1471  }
1472 
1473  s = "Subject";
1474  h = head;
1475  while ((h = get_msgheader_next(h, s)) != NULL) {
1476  if (h->value == NULL)
1477  continue;
1478  if ((strlen(h->value) == 0)
1479  || (strspn(h->value, " \t") == strlen(h->value))) {
1480  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1481  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1482 #if 0
1483  nerr++;
1484 #endif
1485  }
1486  }
1487 
1488  s = "MIME-Version";
1489  h = head;
1490  while ((h = get_msgheader_next(h, s)) != NULL) {
1491  if (h->value == NULL)
1492  continue;
1493  if (strstr(h->value, "1.0") == NULL) {
1494  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1495  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %-12s : %s", id,
1496  s, h->value);
1497  nerr++;
1498  }
1499  }
1500 
1501  s = "Content-Type";
1502  h = head;
1503  while ((h = get_msgheader_next(h, s)) != NULL) {
1504  if (h->value == NULL)
1505  continue;
1506  if ((strlen(h->value) == 0)
1507  || (strspn(h->value, " \t") == strlen(h->value))) {
1508  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1509  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1510  nerr++;
1511  }
1512  }
1513 
1514  s = "Content-Disposition";
1515  h = head;
1516  while ((h = get_msgheader_next(h, s)) != NULL) {
1517  if (h->value == NULL)
1518  continue;
1519  if ((strlen(h->value) == 0)
1520  || (strspn(h->value, " \t") == strlen(h->value))) {
1521  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1522  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1523  nerr++;
1524  }
1525  }
1526 
1527  s = "Content-Transfer-Encoding";
1528  h = head;
1529  while ((h = get_msgheader_next(h, s)) != NULL) {
1530  if (h->value == NULL)
1531  continue;
1532  if ((strlen(h->value) == 0)
1533  || (strspn(h->value, " \t") == strlen(h->value))) {
1534  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1535  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1536  nerr++;
1537  }
1538  }
1539 
1540  s = "X-Mailer";
1541  h = head;
1542  while ((h = get_msgheader_next(h, s)) != NULL) {
1543  if (h->value == NULL)
1544  continue;
1545  if ((strlen(h->value) == 0)
1546  || (strspn(h->value, " \t") == strlen(h->value))) {
1547  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1548  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1549  nerr++;
1550  }
1551  }
1552 
1553  s = "User-Agent";
1554  h = head;
1555  while ((h = get_msgheader_next(h, s)) != NULL) {
1556  if (h->value == NULL)
1557  continue;
1558  if ((strlen(h->value) == 0)
1559  || (strspn(h->value, " \t") == strlen(h->value))) {
1560  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1561  ZE_MessageInfo(10, "%s SPAM CHECK - MSG HDRS SYNTAX : %s empty", id, s);
1562  nerr++;
1563  }
1564  }
1565 
1566  if (nerr > 5)
1567  nerr = 5;
1568 
1569  return nerr;
1570 }
1571 
1572 /* ****************************************************************************
1573  * *
1574  * *
1575  **************************************************************************** */
1576 static uint32_t
1577 check_header_date(id, headers)
1578  char *id;
1579  header_T *headers;
1580 {
1581  int nerr = 0;
1582  header_T *h = headers;
1583  time_t now = time(NULL);
1584  uint32_t flags = 0;
1585 
1586  ZE_MessageInfo(11, "Checking date : %s", id);
1587 
1588  if (headers == NULL)
1589  return 0;
1590 
1591  while ((h = get_msgheader_next(h, "Date")) != NULL) {
1592  time_t date_secs;
1593 
1594  if (h->value == NULL)
1595  continue;
1596 
1597  date_secs = header_date2secs(h->value);
1598 
1599  ZE_MessageInfo(11, "%s : Checking date : %ld %s", id, date_secs, h->value);
1600  if (date_secs == 0) {
1601  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1602  ZE_MessageInfo(10, "%s SPAM CHECK - INVALID DATE : %s", id, h->value);
1603  SET_BIT(flags, SPAM_MSG_BAD_DATE);
1604  nerr++;
1605  continue;
1606  }
1607 
1608  if (date_secs > now && date_secs - now > 48 HOURS) {
1609  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1610  ZE_MessageInfo(10, "%s SPAM CHECK - DATE IN THE FUTUR : %s", id,
1611  h->value);
1612  nerr++;
1613  SET_BIT(flags, SPAM_MSG_FUTURE_DATE);
1614  continue;
1615  }
1616 #if 0
1617  if (date_secs + 1 YEARS < now) {
1618  if (cf_get_int(CF_LOG_LEVEL_ORACLE) >= 2)
1619  ZE_MessageInfo(10, "%s SPAM CHECK - DATE IN THE PASR : %s", id,
1620  h->value);
1621  nerr++;
1623  continue;
1624  }
1625 #endif
1626  }
1627 
1628  return flags;
1629 }
1630 
1631 /* ****************************************************************************
1632  * *
1633  * *
1634  **************************************************************************** */
1635 static int
1636 count_html_tags(buf)
1637  char *buf;
1638 {
1639  char *p;
1640  int res = 0;
1641  long pos = 0;
1642 
1643  if (buf == NULL)
1644  return 0;
1645 
1646  p = buf;
1647  while ((strlen(p) > 0) && zeStrRegex(p, "<[^>]{1,40}>", NULL, &pos, TRUE)) {
1648  p += pos;
1649  res++;
1650  }
1651  return res;
1652 }
1653 
1654 /* ****************************************************************************
1655  * *
1656  * *
1657  **************************************************************************** */
1658 static int
1659 count_html_comments(buf)
1660  char *buf;
1661 {
1662  char *p;
1663  int res = 0;
1664  long pos = 0;
1665 
1666  if (buf == NULL)
1667  return 0;
1668 
1669  p = buf;
1670  while ((strlen(p) > 0)
1671  && zeStrRegex(p, "<!--[^>]{0,40}-->", NULL, &pos, TRUE)) {
1672  p += pos;
1673  res++;
1674  }
1675  return res;
1676 }
1677 
1678 
1679 /* ****************************************************************************
1680  * *
1681  * *
1682  **************************************************************************** */
1683 static int
1684 check_unwanted_html_tags(id, buf, best)
1685  char *id;
1686  char *buf;
1687  bestof_T *best;
1688 {
1689  int n = 0;
1690  double odds = 0.;
1691 
1692  n = count_oradata(id, "HTML-TAG", buf, FALSE, &odds);
1693  if (n > 0)
1694  bestof_add(best, odds);
1695 
1696  ZE_MessageInfo(12, " BAD %s %7.3f", "HTML-TAG", odds);
1697  return MIN(n, 3);
1698 }
1699 
1700 /* ****************************************************************************
1701  * *
1702  * *
1703  **************************************************************************** */
1704 static bool
1705 check_unwanted_boundary(id, boundary, best)
1706  char *id;
1707  char *boundary;
1708  bestof_T *best;
1709 {
1710  int n = 0;
1711  double odds = 0.;
1712 
1713  n = count_oradata(id, "BOUNDARY", boundary, TRUE, &odds);
1714  if (n > 0)
1715  bestof_add(best, odds);
1716 
1717  ZE_MessageInfo(12, " BAD %s %7.3f", "BOUNDARY", odds);
1718  return (n > 0);
1719 }
1720 
1721 /* ****************************************************************************
1722  * *
1723  * *
1724  **************************************************************************** */
1725 static bool
1726 check_unwanted_mailer(id, mailer, best)
1727  char *id;
1728  char *mailer;
1729  bestof_T *best;
1730 {
1731  int n = 0;
1732  double odds = 0.;
1733 
1734  if (mailer == NULL)
1735  return FALSE;
1736 
1737  if (strlen(mailer) == 0)
1738  return TRUE;
1739 
1740  n = count_oradata(id, "MAILER", mailer, TRUE, &odds);
1741  if (n > 0)
1742  bestof_add(best, odds);
1743 
1744  ZE_MessageInfo(12, " BAD %s %7.3f", "MAILER ", odds);
1745  return (n > 0);
1746 }
1747 
1748 /* ****************************************************************************
1749  * *
1750  * *
1751  **************************************************************************** */
1752 #define DCSET 256
1753 
1754 static bool
1755 check_unwanted_charset(id, charset, best)
1756  char *id;
1757  char *charset;
1758  bestof_T *best;
1759 {
1760  int n = 0;
1761  double odds = 0.;
1762 
1763  n = count_oradata(id, "CHARSET", charset, TRUE, &odds);
1764  if (n > 0)
1765  bestof_add(best, odds);
1766 
1767  ZE_MessageInfo(12, " BAD %s %7.3f", "CHARSET ", odds);
1768  return (n > 0);
1769 }
1770 
1771 /* ****************************************************************************
1772  * *
1773  * *
1774  **************************************************************************** */
1775 static int
1776 check_unwanted_expressions(id, buf, best)
1777  char *id;
1778  char *buf;
1779  bestof_T *best;
1780 {
1781  int n = 0;
1782  double odds = 0.;
1783 
1784  n = count_oradata(id, "BAD-EXPR", buf, FALSE, &odds);
1785  if (n > 0)
1786  bestof_add(best, odds);
1787 
1788  ZE_MessageInfo(12, " BAD %s %7.3f", "BAD-EXPR", odds);
1789 
1790  return n;
1791 }
1792 
1793 /* ****************************************************************************
1794  * *
1795  * *
1796  ******************************************************************************/
/*
 * Returns how many bit positions of VAL test as set through GET_BIT().
 * Every position from 0 up to the width of VAL's type is examined.
 */
static int
count_uint32bits(val)
     uint32_t            val;
{
  int                 nset = 0;
  int                 bit = 0;

  while (bit < 8 * sizeof (val)) {
    if (GET_BIT(val, bit))
      nset++;
    bit++;
  }

  return nset;
}
1809 
int msg_bad_expressions
int check_regex(char *, char *, char *, int)
Definition: ze-mailregex.c:295
kstats_T plain
Definition: ze-chkcontent.h:46
char * ip
Definition: ze-chkcontent.h:57
kstats_T html_clean
Definition: ze-chkcontent.h:49
msgpart_T plain
Definition: ze-chkcontent.h:99
#define MIME_TYPE_IMAGE
Definition: ze-demime.h:35
int encode
Definition: ze-demime.h:64
#define CURRENT_SCORE()
Definition: ze-chkcontent.c:73
#define REGEX_SCORE()
Definition: ze-chkcontent.c:68
char * rfc2822_get_main_attr(rfc2822_hdr_T *header)
Definition: ze-rfc2822.c:178
int html_unwanted_tags
#define MIME_TYPE_EXTENSION_TOKEN
Definition: ze-demime.h:39
char * realcleanup_text_buf(char *, size_t)
Definition: ze-oracle.c:408
#define MIME_TYPE_AUDIO
Definition: ze-demime.h:36
#define strrchr
Definition: ze-sys.h:219
#define MIME_TYPE_MESSAGE
Definition: ze-demime.h:40
#define SPAM_MSG_BASE64
header_T * get_msgheader(header_T *, char *)
Definition: ze-headers.c:144
bool bestof_init(bestof_T *b, int dim, bestcomp_F bcmp)
Definition: ze-bestof-n.c:51
#define FREE(x)
Definition: macros.h:37
#define SPAM_MSG_TOO_OLD_DATE
int nb_text_html_base64
Definition: ze-chkcontent.h:91
bool text_word_length(char *, kstats_T *, size_t)
Definition: ze-buffer.c:266
rfc2822_hdr_T * next
Definition: ze-rfc2822.h:39
int check_rfc2822_headers_count(char *, header_T *)
#define STRNULL(x, r)
Definition: macros.h:81
#define SPAM_HTML_UNWANTED_TAGS
#define SPAM_MSG_TOO_MUCH_HTML
int nb_text_plain_empty
bool ok
Definition: ze-connopen.c:59
void zeKStatsReset(kstats_T *)
Definition: zeKStats.c:92
void convert_8to7(char *buf, bool convert_spaces)
#define ENTROPY_BUF_SIZE
Definition: ze-chkcontent.c:41
double zeKMin(kstats_T *s)
Definition: zeKStats.c:62
bool text_buffer_entropy(char *, size_t, double *, double *, double *)
Definition: ze-entropy.c:210
int oracle_compute_score(char *, char *, spamchk_T *)
int count_oradata(char *, char *, char *, bool, double *)
Definition: ze-oracle.c:253
size_t max_spool_size
Definition: ze-chkcontent.h:61
int nb_application
Definition: ze-chkcontent.h:81
#define FALSE
Definition: macros.h:160
#define SPAM_PLAIN_TOO_SHORT
size_t zeGetFileSize(char *)
Definition: zeFileTools.c:132
int check_rurlbl(char *, char *, char *)
Definition: ze-mailregex.c:443
size_t size
Definition: ze-demime.h:62
bool zeStrRegex(char *, char *, long *, long *, bool)
Definition: zeStrings.c:544
int check_rfc2822_headers_syntax(char *, header_T *)
int content_max_score
Definition: ze-chkcontent.h:65
int html_high_tag_ratio
#define CF_LOG_LEVEL_ORACLE
Definition: cfh-defs.h:126
int headers_syntax_errors
#define GET_BIT(p, i)
Definition: macros.h:168
#define SPAM_MSG_SUBJECT_NO_ALPHA
size_t sz_text_html
Definition: ze-chkcontent.h:95
int cf_get_int(int id)
Definition: ze-cf.c:803
#define SET_BIT(p, i)
Definition: macros.h:166
#define MIN(a, b)
Definition: macros.h:140
#define SPAM_MSG_UNWANTED_CHARSET
bool get_msgheader_attribute(header_T *, char *, char *, size_t)
Definition: ze-headers.c:260
#define SPAM_MSG_FUTURE_DATE
#define MIME_TYPE_MULTIPART
Definition: ze-demime.h:41
kstats_T plain_clean
Definition: ze-chkcontent.h:47
bool decode_mime_file(char *, char *, uint32_t *, demime_F, void *)
Definition: ze-demime.c:584
#define LOG_LEVEL
Definition: ze-chkcontent.c:32
#define SPAM_PLAIN_NO_CHARSET
char * mime
Definition: ze-demime.h:65
#define SPAM_MSG_EMPTY_ATTACHMENT
#define SPAM_MSG_BAD_DATE
kstats_T other
Definition: ze-chkcontent.h:51
bool bestof_add(bestof_T *b, double v)
Definition: ze-bestof-n.c:75
double vector_compare(double *, double *, int)
Definition: ze-oracle.c:326
kstats_T simple
Definition: ze-chkcontent.h:50
int nb_text_plain
Definition: ze-chkcontent.h:86
int scan_body_contents(char *id, char *ip, char *fname, size_t maxsize, spamchk_T *data, msg_flags_T *flags, msg_scores_T *scores)
msg_flags_T flags
Definition: ze-chkcontent.h:68
#define SPAM_HTML_BASE64
#define SPAM_MSG_HEADERS_SYNTAX
void zeKStatsUpdate(kstats_T *, double)
Definition: zeKStats.c:101
size_t sz_text_plain
Definition: ze-chkcontent.h:94
char * value
Definition: ze-headers.h:36
size_t len_raw
Definition: ze-chkcontent.h:29
#define MAIL_BODY
Definition: ze-mailregex.h:37
int nb_text_html
Definition: ze-chkcontent.h:87
#define ZE_MessageInfo(level,...)
Definition: zeSyslog.h:90
int nb
Definition: ze-connopen.c:61
size_t size
Definition: ze-chkcontent.h:93
#define MIME_TYPE_TEXT
Definition: ze-demime.h:34
#define SPAM_MSG_UNWANTED_BOUNDARY
int nb_multipart
Definition: ze-chkcontent.h:84
#define TRUE
Definition: macros.h:157
int count_uint32bits(uint32_t val)
Definition: ze-divers.c:33
time_t header_date2secs(char *date)
#define SPAM_MSG_BAD_EXPRESSIONS
size_t len_clean
Definition: ze-chkcontent.h:30
#define KSTATS_INITIALIZER
Definition: zeKStats.h:36
kstats_T attach
Definition: ze-chkcontent.h:52
int nb_text_plain_base64
Definition: ze-chkcontent.h:90
#define SPAM_MSG_SUBJECT_HI_CAPS
header_T * hdrs
#define MIME_TYPE_VIDEO
Definition: ze-demime.h:37
msg_ksizes_T mksize
int check_valid_html_tags(char *, char *)
Definition: ze-html.c:334
kstats_T st_wlen
Definition: ze-chkcontent.h:31
#define ZE_LogMsgWarning(level,...)
Definition: zeSyslog.h:112
uint32_t msg
Definition: ze-chkcontent.h:41
header_T * next
Definition: ze-headers.h:37
#define SPAM_MSG_RFC2822_HEADERS
int count_msgheader_attr(header_T *, char *)
Definition: ze-headers.c:116
bestof_T best
Definition: ze-chkcontent.h:69
#define CF_REGEX_MAX_SCORE
Definition: cfh-defs.h:118
#define HOURS
Definition: macros.h:144
#define SPAM_MSG_CONTENT_ID
#define SPAM_MSG_UNWANTED_MAILER
#define SPAM_PLAIN_EMPTY
kstats_T html
Definition: ze-chkcontent.h:48
int html_invalid_tags
#define SPAM_MSG_MATCH_MIME_PARTS
int nb_message
Definition: ze-chkcontent.h:83
double zeKMean(kstats_T *s)
Definition: zeKStats.c:43
#define SPAM_PLAIN_BASE64
char * cleanup_html_buffer(char *, size_t)
Definition: ze-html.c:163
double zeKMax(kstats_T *s)
Definition: zeKStats.c:72
#define SPAM_HTML_TAGS_RATIO
double zeKStdDev(kstats_T *s)
Definition: zeKStats.c:53
msg_scores_T scores
Definition: ze-chkcontent.h:72
char * charset
Definition: ze-demime.h:70
#define SPAM_MSG_MIME_ERRORS
char * rfc2822_get_attr(rfc2822_hdr_T *header, char *attr)
Definition: ze-rfc2822.c:201
int nb_text_simple_empty
#define SPAM_HTML_INVALID_TAGS
long uint32_t
Definition: ze-sys.h:489
int nb_rfc2822_hdrs_errors
#define MIME_TYPE_APPLICATION
Definition: ze-demime.h:38
#define URLBL_SCORE()
Definition: ze-chkcontent.c:67
header_T * get_msgheader_next(header_T *, char *)
Definition: ze-headers.c:175
#define MIME_ENCODE_BASE64
Definition: ze-demime.h:48
#define SPAM_HTML_CLEAN_TOO_SHORT
msgpart_T html
Definition: ze-chkcontent.h:98
uint32_t plain
Definition: ze-chkcontent.h:39
#define YEARS
Definition: macros.h:148
rfc2822_hdr_T * hdrs
Definition: ze-demime.h:74
#define SPAM_MSG_BASE64_SUBJECT
#define SPAM_MSG_NO_TEXT_PART
size_t spool_size
Definition: ze-chkcontent.h:60
rfc2822_hdr_T * rfc2822_lookup_header(rfc2822_hdr_T *head, char *key)
Definition: ze-rfc2822.c:145