33 #define H0_LOG_LEVEL 12 35 #ifndef _FFR_MSG_ENTROPY 36 #define _FFR_MSG_ENTROPY 0 40 #ifndef ENTROPY_BUF_SIZE 41 #define ENTROPY_BUF_SIZE 16000 50 static int check_unwanted_html_tags(
char *,
char *,
bestof_T *);
51 static bool check_unwanted_boundary(
char *,
char *,
bestof_T *);
52 static bool check_unwanted_mailer(
char *,
char *,
bestof_T *);
53 static bool check_unwanted_charset(
char *,
char *,
bestof_T *);
54 static int check_unwanted_expressions(
char *,
char *,
bestof_T *);
56 static int count_html_tags(
char *);
57 static int count_html_comments(
char *);
67 #define URLBL_SCORE() (MAX(urlbl_score, data->scores.urlbl)) 68 #define REGEX_SCORE() (MAX(MAX(clean_score, raw_score), data->scores.body)) 70 #define URLBL_SCORE() (urlbl_score + data->scores.urlbl) 71 #define REGEX_SCORE() (MAX(clean_score, raw_score) + data->scores.body) 73 #define CURRENT_SCORE() (URLBL_SCORE() + REGEX_SCORE()) 76 check_mime_part(buf, size,
id, level, type, arg, mime_part)
87 int clean_score = 0, raw_score = 0, urlbl_score = 0;
89 bool simple_text =
TRUE;
93 id =
STRNULL(
id,
"00000000.000");
94 if (mime_part == NULL) {
139 if (mime_part->
hdrs != NULL) {
152 if (name != NULL && strncasecmp(name,
"image", strlen(
"image")) == 0) {
156 "%s SPAM CHECK - M%02d unwanted Content-ID for %s",
id,
167 if (mime_part->
hdrs != NULL) {
172 hdr = mime_part->
hdrs;
195 "%s SPAM CHECK - M%02d empty attachment %s",
212 hdr = mime_part->
hdrs;
235 "%s SPAM CHECK - M%02d empty attachment %s",
273 for (sz = size; *p !=
'\0' && sz > 0; p++, sz--) {
274 int c = *((
unsigned char *) p);
276 if (c == 0x0A || c == 0x0D) {
297 "AAAAAAACEEEEIIIIGNOOOOOxOUUUUYPB" "aaaaaaaceeeeiiiionooooo-ouuuuyby";
300 if (c > 0 && c <
sizeof (s))
319 if (strcasecmp(
"text/html", mime_part->
mime) == 0) {
320 size_t real_size = 0;
347 char *html_clean = NULL;
351 if (html_clean != NULL) {
352 real_size = strlen(html_clean);
367 if (sc > clean_score)
383 #if _FFR_MSG_ENTROPY == 1 384 if (data->spam_oracle) {
385 if (html_clean != NULL) {
409 if ((n = check_unwanted_html_tags(
id, buf, &data->
best)) > 0)
412 nb_tags = count_html_tags(buf);
414 if (nb_tags * 5 > real_size)
430 if (strcasecmp(
"text/plain", mime_part->
mime) == 0) {
442 for (pc = buf; pc != NULL && *pc !=
'\0'; pc++)
450 if (sz >= 0 && sz < 80) {
454 "%s SPAM CHECK - P%02d text/plain part too short : %d",
459 if ((mime_part != NULL) && (strlen(mime_part->
charset) == 0)) {
463 "%s SPAM CHECK - P%02d text/plain w/o charset",
id,
477 #if _FFR_MSG_ENTROPY == 1 489 if (strspn(buf,
" \n\r\t") == size)
531 #if _FFR_MSG_ENTROPY == 1 539 if (strspn(buf,
" \n\r\t") == size)
632 if ((flags = check_header_date(
id, data->
hdrs)) != 0)
644 memset(value, 0,
sizeof (value));
646 && (strlen(value) > 0)) {
647 if (check_unwanted_charset(
id, value, &data->
best)) {
651 "%s SPAM CHECK - M%02d unwanted charset : %s",
id,
656 if ((h->
value != NULL)
661 if (check_unwanted_boundary(
id, value, &data->
best)) {
666 "%s SPAM CHECK - M%02d unwanted boundary (%s)",
673 "%s SPAM CHECK - M%02d short boundary boundary=(%s) len=(%d)",
683 if (check_unwanted_mailer(
id, mailer, &data->
best)) {
687 "%s SPAM CHECK - M%02d unwanted mailer (%s)",
694 if (check_unwanted_mailer(
id, mailer, &data->
best)) {
698 "%s SPAM CHECK - M%02d unwanted mailer (%s)",
712 if (strlen(h->
value) == 0)
715 for (p = h->
value; p != NULL && *p !=
'\0'; p++) {
725 "%s SPAM CHECK - M%02d Subject doesn't contains lower case chars : %s",
732 "%s SPAM CHECK - M%02d Subject doesn't contains alpha chars : %s",
765 if ((h->
value != NULL)
778 if ((h->
value != NULL) &&
797 if (size < 8 * maxsize)
805 if (data->scores.do_oracle) {
810 if ((data->nb_part > 1) && (data->nb_text == 0)) {
814 "%s SPAM CHECK - M%02d No HTML nor TEXT parts : Total = %d",
823 if (
zeKMean(&data->mksize.plain) < 1.);
827 if (data->msg_bad_expressions > 0) {
831 "%s SPAM CHECK - M%02d BAD EXPRESSIONS : %d",
id,
835 if ((data->nb_text_html > 0) && (data->nb_text_html > data->nb_text_plain)) {
839 "%s SPAM CHECK - M%02d NB HTML > PLAIN : %d %d",
id,
841 data->nb_text_plain);
848 double whtml[5], wplain[5];
849 double hhtml[5], hplain[5];
852 memset(whtml, 0,
sizeof (whtml));
853 memset(hhtml, 0,
sizeof (hhtml));
854 memset(wplain, 0,
sizeof (wplain));
855 memset(hplain, 0,
sizeof (hplain));
856 if ((data->nb_text_html > 0) && (data->nb_text_plain > 0)) {
860 whtml[0] =
zeKMin(&data->html.st_wlen);
861 whtml[1] =
zeKMax(&data->html.st_wlen);
862 whtml[2] =
zeKMean(&data->html.st_wlen);
863 whtml[3] =
zeKStdDev(&data->html.st_wlen);
864 whtml[4] = (double) data->html.len_clean;
866 wplain[0] =
zeKMin(&data->plain.st_wlen);
867 wplain[1] =
zeKMax(&data->plain.st_wlen);
868 wplain[2] =
zeKMean(&data->plain.st_wlen);
869 wplain[3] =
zeKStdDev(&data->plain.st_wlen);
870 wplain[4] = (double) data->plain.len_clean;
875 lcoef = whtml[4] / wplain[4];
879 if (data->html.len_clean > 2500) {
880 if (abs(data->html.len_clean - data->plain.len_clean) > 500)
883 if (abs(data->html.len_clean - data->plain.len_clean) > 1000)
888 if (!ko && (vcoef < 0.9) && (vcoef > 0.1)) {
894 "%s SPAM CHECK - M%02d HTML/PLAIN parts don't match vcoef=(%7.3f) lcoef=(%7.3f) (vcoef)",
898 if (!ko && ((lcoef > 5) || (lcoef < 0.85))) {
904 "%s SPAM CHECK - M%02d HTML/PLAIN parts don't match vcoef=(%7.3f) lcoef=(%7.3f) (lcoef)",
918 "%s SPAM CHECK - M%02d HTML/PLAIN parts don't match HTML(%6d)/PLAIN(%6d) (ldiff)",
920 data->html.len_clean, data->plain.len_clean);
922 #if _FFR_MSG_ENTROPY == 1 923 hhtml[0] = data->html.h0;
924 hhtml[1] = data->html.h1;
925 hhtml[2] = data->html.h2;
926 hhtml[3] = (double) data->html.len_clean;
928 hplain[0] = data->plain.h0;
929 hplain[1] = data->plain.h1;
930 hplain[2] = data->plain.h2;
931 hplain[3] = (
double) data->plain.len_clean;
935 if (wplain[4] >= 1.0)
936 lcoef = whtml[3] / wplain[3];
951 if (data->mime_errors > 0) {
958 "%s SPAM CHECK - M%02d MIME decode errors : %d",
960 memset(sout, 0,
sizeof (sout));
962 for (i = 0; i < 32; i++)
963 sout[i] =
GET_BIT(mime_flags, i) ? (
'0' + (i % 10)) :
'.';
967 "%s SPAM CHECK - M%02d MIME errors %s",
974 if (data->nb_text_plain_base64 > 0) {
978 "%s SPAM CHECK - P%02d text/plain encoded base64 : %d",
982 if ((data->nb_text_plain_empty > 0)
983 && (data->nb_text_plain == data->nb_text_plain_empty)) {
987 "%s SPAM CHECK - P%02d text/plain empty : %d",
id,
994 if (data->nb_text_html_base64 > 0) {
998 "%s SPAM CHECK - H%02d text/html encoded base64 : %d",
1005 if (data->html.len_clean > 0) {
1009 if (data->html.len_clean > 0)
1010 r2c = ((double) data->html.len_raw) / ((double) data->html.len_clean);
1012 if (data->html.len_clean > 0 && data->html.len_clean < 100) {
1013 if ((data->plain.len_clean == 0) || (r2c >= 2.)) {
1022 "%s SPAM CHECK - H%02d HTML cleaned up too short : %d",
1027 if (data->html_high_tag_ratio > 0) {
1031 "%s SPAM CHECK - H%02d HTML tag/text ratio : %d",
id,
1035 if (data->html_unwanted_tags > 0) {
1039 "%s SPAM CHECK - H%02d HTML with unwanted tags : %d",
id,
1043 if (data->html_invalid_tags > 0) {
1047 "%s SPAM CHECK - H%02d HTML with invalid tags : %d",
id,
1056 if (data->scores.do_oracle) {
1063 *scores = data->scores;
1065 return data->scores.
body + data->scores.urlbl;
1082 bool mime_ct, mime_cd, mime_cte, mime_vers;
1084 mime_ct = mime_cd = mime_cte = mime_vers =
FALSE;
1090 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (1,1)",
1099 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (1,1)",
1108 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1117 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1126 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1136 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1146 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1155 "%s SPAM CHECK - MSG RFC2822 HDRS count : No To nor Cc nor Bcc",
1164 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1171 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1180 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1189 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1198 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1205 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG RFC2822 HDRS count : No Subject",
1214 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1218 mime_vers = (r > 0);
1224 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1230 s =
"Content-Disposition";
1234 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1240 s =
"Content-Transfer-Encoding";
1245 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1255 if ((mime_ct && !mime_vers) || (mime_cd && !mime_vers)
1256 || (mime_cte && !mime_vers)) {
1259 "%s SPAM CHECK - MSG RFC2045 HDRS : MIME=(%d) CT=(%d) CD=(%d) CTE=(%d)",
1260 id, mime_vers, mime_ct, mime_cd, mime_cte, r);
1268 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1277 "%s SPAM CHECK - MSG RFC2822 HDRS count : %-12s : %d (0,1)",
1308 if (h->
value == NULL)
1311 if ((strlen(h->
value) == 0)
1312 || (strspn(h->
value,
" \t") == strlen(h->
value))) {
1314 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1320 s =
"(Mon|Tue|Wed|Thu|Fri|Sat|Sun)";
1325 "[0-9]{1,2} (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec){1,1} [0-9]{4,4}";
1329 "%s SPAM CHECK - MSG HDRS SYNTAX : Date : dd Mmm Yyyy : %s",
1335 s =
"[0-9]{2,2}:[0-9]{2,2}:[0-9]{2,2}";
1339 "%s SPAM CHECK - MSG HDRS SYNTAX : Date : HH:MM:SS : %s",
1354 if (h->
value == NULL)
1356 if ((strlen(h->
value) == 0)
1357 || (strspn(h->
value,
" \t") == strlen(h->
value))) {
1359 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1367 if (h->
value == NULL)
1369 if ((strlen(h->
value) == 0)
1370 || (strspn(h->
value,
" \t") == strlen(h->
value))) {
1372 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1382 if (h->
value == NULL)
1384 if ((strlen(h->
value) == 0)
1385 || (strspn(h->
value,
" \t") == strlen(h->
value))) {
1387 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1397 if (h->
value == NULL)
1399 if ((strlen(h->
value) == 0)
1400 || (strspn(h->
value,
" \t") == strlen(h->
value))) {
1402 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1410 if (h->
value == NULL)
1412 if ((strlen(h->
value) == 0)
1413 || (strspn(h->
value,
" \t") == strlen(h->
value))) {
1415 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1425 if (h->
value == NULL)
1427 if ((strlen(h->
value) == 0)
1428 || (strspn(h->
value,
" \t") == strlen(h->
value))) {
1430 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1438 if (h->
value == NULL)
1442 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %-12s",
id, s);
1450 if (h->
value == NULL)
1454 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1464 if (h->
value == NULL)
1468 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1476 if (h->
value == NULL)
1478 if ((strlen(h->
value) == 0)
1479 || (strspn(h->
value,
" \t") == strlen(h->
value))) {
1481 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1491 if (h->
value == NULL)
1493 if (strstr(h->
value,
"1.0") == NULL) {
1495 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %-12s : %s",
id,
1504 if (h->
value == NULL)
1506 if ((strlen(h->
value) == 0)
1507 || (strspn(h->
value,
" \t") == strlen(h->
value))) {
1509 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1514 s =
"Content-Disposition";
1517 if (h->
value == NULL)
1519 if ((strlen(h->
value) == 0)
1520 || (strspn(h->
value,
" \t") == strlen(h->
value))) {
1522 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1527 s =
"Content-Transfer-Encoding";
1530 if (h->
value == NULL)
1532 if ((strlen(h->
value) == 0)
1533 || (strspn(h->
value,
" \t") == strlen(h->
value))) {
1535 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1543 if (h->
value == NULL)
1545 if ((strlen(h->
value) == 0)
1546 || (strspn(h->
value,
" \t") == strlen(h->
value))) {
1548 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1556 if (h->
value == NULL)
1558 if ((strlen(h->
value) == 0)
1559 || (strspn(h->
value,
" \t") == strlen(h->
value))) {
1561 ZE_MessageInfo(10,
"%s SPAM CHECK - MSG HDRS SYNTAX : %s empty",
id, s);
1577 check_header_date(
id, headers)
1583 time_t now = time(NULL);
1588 if (headers == NULL)
1594 if (h->
value == NULL)
1600 if (date_secs == 0) {
1608 if (date_secs > now && date_secs - now > 48
HOURS) {
1617 if (date_secs + 1
YEARS < now) {
1636 count_html_tags(buf)
1647 while ((strlen(p) > 0) &&
zeStrRegex(p,
"<[^>]{1,40}>", NULL, &pos,
TRUE)) {
1659 count_html_comments(buf)
1670 while ((strlen(p) > 0)
1684 check_unwanted_html_tags(
id, buf, best)
1705 check_unwanted_boundary(
id, boundary, best)
1726 check_unwanted_mailer(
id, mailer, best)
1737 if (strlen(mailer) == 0)
1755 check_unwanted_charset(
id, charset, best)
1776 check_unwanted_expressions(
id, buf, best)
1804 for (i = 0; i < 8 *
sizeof (val); i++)
int check_regex(char *, char *, char *, int)
char * realcleanup_text_buf(char *, size_t)
bool bestof_init(bestof_T *b, int dim, bestcomp_F bcmp)
#define SPAM_MSG_TOO_OLD_DATE
bool text_word_length(char *, kstats_T *, size_t)
int check_rfc2822_headers_count(char *, header_T *)
#define SPAM_HTML_UNWANTED_TAGS
#define SPAM_MSG_TOO_MUCH_HTML
void zeKStatsReset(kstats_T *)
double zeKMin(kstats_T *s)
bool text_buffer_entropy(char *, size_t, double *, double *, double *)
int oracle_compute_score(char *, char *, spamchk_T *)
int count_oradata(char *, char *, char *, bool, double *)
#define SPAM_PLAIN_TOO_SHORT
int check_rurlbl(char *, char *, char *)
bool zeStrRegex(char *, char *, long *, long *, bool)
int check_rfc2822_headers_syntax(char *, header_T *)
#define CF_LOG_LEVEL_ORACLE
int headers_syntax_errors
#define SPAM_MSG_SUBJECT_NO_ALPHA
#define SPAM_MSG_UNWANTED_CHARSET
#define SPAM_MSG_FUTURE_DATE
#define SPAM_PLAIN_NO_CHARSET
#define SPAM_MSG_EMPTY_ATTACHMENT
#define SPAM_MSG_BAD_DATE
bool bestof_add(bestof_T *b, double v)
double vector_compare(double *, double *, int)
int scan_body_contents(char *id, char *ip, char *fname, size_t maxsize, spamchk_T *data, msg_flags_T *flags, msg_scores_T *scores)
#define SPAM_MSG_HEADERS_SYNTAX
void zeKStatsUpdate(kstats_T *, double)
#define ZE_MessageInfo(level,...)
#define SPAM_MSG_UNWANTED_BOUNDARY
int count_uint32bits(uint32_t val)
time_t header_date2secs(char *date)
#define SPAM_MSG_BAD_EXPRESSIONS
#define KSTATS_INITIALIZER
#define SPAM_MSG_SUBJECT_HI_CAPS
#define ZE_LogMsgWarning(level,...)
#define SPAM_MSG_RFC2822_HEADERS
#define CF_REGEX_MAX_SCORE
#define SPAM_MSG_CONTENT_ID
#define SPAM_MSG_UNWANTED_MAILER
#define SPAM_MSG_MATCH_MIME_PARTS
double zeKMean(kstats_T *s)
#define SPAM_PLAIN_BASE64
double zeKMax(kstats_T *s)
#define SPAM_HTML_TAGS_RATIO
double zeKStdDev(kstats_T *s)
#define SPAM_MSG_MIME_ERRORS
#define SPAM_HTML_INVALID_TAGS
int nb_rfc2822_hdrs_errors
#define SPAM_HTML_CLEAN_TOO_SHORT
#define SPAM_MSG_BASE64_SUBJECT
#define SPAM_MSG_NO_TEXT_PART