ze-filter  (ze-filter-0.8.0-develop-180218)
ze-entropy.c
Go to the documentation of this file.
1 
2 /*
3  *
4  * ze-filter - Mail Server Filter for sendmail
5  *
6  * Copyright (c) 2001-2018 - Jose-Marcio Martins da Cruz
7  *
8  * Auteur : Jose Marcio Martins da Cruz
9  * jose.marcio.mc@gmail.org
10  *
11  * Historique :
12  * Creation : janvier 2002
13  *
14  * This program is free software, but with restricted license :
15  *
16  *
17  * This program is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
20  *
21  * More details about ze-filter license can be found at ze-filter
22  * web site : http://foss.jose-marcio.org
23  */
24 
25 #include <ze-sys.h>
26 
27 #include "ze-filter.h"
28 
29 
30 /* ****************************************************************************
31  * *
32  * *
33  **************************************************************************** */
/*
 * conv2ascii - fold an accented lower-case letter to its unaccented ASCII
 * base letter.
 *
 * c : character value; may be a (possibly negative) plain char or a value
 *     in the range 0..255. Only the low byte is examined.
 *
 * Returns 'a', 'e', 'i', 'o', 'u' or 'c' for the accented letters listed in
 * the switch below, and c unchanged for every other value.
 *
 * The letters are matched by explicit byte value instead of strchr() on a
 * string of accented characters, because:
 *   - strchr() also matches the string terminator, which turned '\0'
 *     into 'a';
 *   - the meaning of a literal such as "àäâ" depends on the encoding the
 *     source file happens to be saved in.
 * The apostrophe is deliberately NOT mapped to 'o'.
 *
 * NOTE(review): the byte values are the ISO-8859-1 codes of the letters -
 * assumes the filtered text is single-byte Latin-1; confirm with callers.
 */
static int
conv2ascii(c)
     int c;
{
  switch ((unsigned char) c) {
    case 0xE0:                 /* a grave */
    case 0xE2:                 /* a circumflex */
    case 0xE4:                 /* a diaeresis */
      return 'a';
    case 0xE8:                 /* e grave */
    case 0xE9:                 /* e acute */
    case 0xEA:                 /* e circumflex */
    case 0xEB:                 /* e diaeresis */
      return 'e';
    case 0xEE:                 /* i circumflex */
    case 0xEF:                 /* i diaeresis */
      return 'i';
    case 0xF4:                 /* o circumflex */
    case 0xF6:                 /* o diaeresis */
      return 'o';
    case 0xF9:                 /* u grave */
    case 0xFB:                 /* u circumflex */
      return 'u';
    case 0xE7:                 /* c cedilla */
      return 'c';
    default:
      break;
  }

  return c;
}
78 
79 /* ****************************************************************************
80  * *
81  * *
82  **************************************************************************** */
/*
 * text2lowerascii - in place, strip accents (via conv2ascii) and lower-case
 * the characters of a text buffer.
 *
 * buf  : buffer to convert; a NULL pointer is ignored.
 * size : maximum number of bytes of buf that may be examined.
 *
 * Conversion stops at the first '\0' or after size bytes, whichever comes
 * first, so a buffer that is not NUL-terminated within size bytes is never
 * read or written past its end.
 */
void
text2lowerascii(buf, size)
     char *buf;
     size_t size;
{
  char *p;

  if (buf == NULL)
    return;

  /* size is the remaining byte budget: it is consumed as p advances */
  for (p = buf; (size > 0) && (*p != '\0'); p++, size--) {
    /* bring a possibly negative char value into 0..255 for tolower() */
    int c = (conv2ascii(*p) + 256) % 256;

    *p = (char) tolower(c);
  }
}
99 
100 /* ****************************************************************************
101  * *
102  * *
103  **************************************************************************** */
104 #define ONLY_LOWER 1
105 
/*
 * entropy_monogram - Shannon entropy, in bits, of the distribution of
 * single characters found in a buffer.
 *
 * buf : bytes to analyse.
 * sz  : number of bytes of buf to count; every one of them is counted,
 *       the scan does not stop at a '\0'.
 *
 * Each byte is passed through conv2ascii() and tolower() before being
 * counted, so accented and upper-case variants fall in the same bin.
 *
 * Returns 0 when buf is NULL or sz is 0.
 */
double
entropy_monogram(buf, sz)
     char *buf;
     size_t sz;
{
  int histogram[256];
  int total = 0;
  int idx;
  double acc = 0.;
  char *cur;

  if ((buf == NULL) || (sz == 0))
    return 0.;

  memset(histogram, 0, sizeof (histogram));

  /* first pass : count the occurrences of each folded character */
  cur = buf;
  for (idx = 0; idx < sz; idx++) {
    int folded = (conv2ascii(*cur) + 256) % 256;

    folded = tolower(folded);
    histogram[folded]++;
    total++;
    cur++;
  }

  /* second pass : accumulate -p * ln(p) over the bins that were hit */
  for (idx = 0; idx < 256; idx++) {
    double ratio;

    if (histogram[idx] == 0)
      continue;

    ratio = ((double) histogram[idx]) / ((double) total);
    acc -= ratio * log(ratio);
  }

  /* natural logarithm -> base 2 */
  return acc / log(2.);
}
144 
145 /* ****************************************************************************
146  * *
147  * *
148  **************************************************************************** */
/*
 * hash_rec_T - one slot of the open-addressing tables used to count
 * n-grams.
 */
typedef struct {
  int    key;    /* n-gram code stored in the slot; 0 marks a free slot */
  int    count;  /* number of times the n-gram was seen */
  double pp;     /* count divided by the number of sampled positions */
  double pc;     /* pp divided by the pp of the shorter n-gram it extends */
} hash_rec_T;
155 
156 static int
157 get_hash_index(k, h, sz)
158  uint32_t k;
159  hash_rec_T *h;
160  size_t sz;
161 {
162  int i, hv = 0;
163 
164  {
165  int c, d;
166  uint32_t tk = k;
167 
168  c = d = 0;
169  for (i = 0; i < 4; i++) {
170  d = tk & 0x000000FF;
171  tk = tk >> 8;
172  c = d;
173  c ^= c << 6;
174  hv += (c << 11) ^ (c >> 1);
175  hv ^= (d << 14) + (d << 7) + (d << 4) + d;
176  }
177  hv %= sz;
178  }
179  for (i = 0; i < sz; i++) {
180  int j = (hv + i) % sz;
181 
182  if (h[j].key == k)
183  return j;
184 
185  if (h[j].key == 0) {
186  h[j].key = k;
187  return j;
188  }
189  }
190 
191  ZE_LogMsgWarning(0, "Hash table overflow");
192  return 0;
193 }
194 
195 /* ****************************************************************************
196  * *
197  * *
198  **************************************************************************** */
199 #define SZH0 256
200 #define SZH1 4096
201 #define SZH2 16384
202 #define SZH3 16384
203 
204 
205 /* ****************************************************************************
206  * *
207  * *
208  **************************************************************************** */
209 bool
210 text_buffer_entropy(buf, sz, e0, e1, e2)
211  char *buf;
212  size_t sz;
213  double *e0;
214  double *e1;
215  double *e2;
216 {
217  hash_rec_T *h0, *h1, *h2;
218  int i;
219  double sum = 0.;
220  int nc = 0;
221  char *p;
222 
223  if ((buf == NULL) || (sz < 6))
224  return FALSE;
225 
226  text2lowerascii(buf, sz);
227 
228  h0 = (hash_rec_T *) malloc(SZH0 * sizeof (hash_rec_T));
229  h1 = (hash_rec_T *) malloc(SZH1 * sizeof (hash_rec_T));
230  h2 = (hash_rec_T *) malloc(SZH2 * sizeof (hash_rec_T));
231  if ((h0 == NULL) || (h1 == NULL) || (h2 == NULL)) {
232  FREE(h0);
233  FREE(h1);
234  FREE(h2);
235  return FALSE;
236  }
237 
238  memset(h0, 0, SZH0 * sizeof (hash_rec_T));
239  memset(h1, 0, SZH1 * sizeof (hash_rec_T));
240  memset(h2, 0, SZH2 * sizeof (hash_rec_T));
241 
242  sum = 0;
243  for (i = 0, p = buf; i < sz - 2; i++, p++) {
244  int c0, c1, c2;
245  int hi;
246  uint32_t k;
247 
248  c0 = p[0];
249 
250  k = c0;
251  hi = get_hash_index(k, h0, SZH0);
252  h0[hi].count++;
253 
254  c1 = p[1];
255 
256  k = (c1 << 8) | c0;
257  hi = get_hash_index(k, h1, SZH1);
258  h1[hi].count++;
259 
260  c2 = p[2];
261 
262  k = (c2 << 16) | (c1 << 8) | c0;
263  hi = get_hash_index(k, h2, SZH2);
264  h2[hi].count++;
265 
266  nc++;
267  }
268 
269  sum = 0.;
270  for (i = 0; i < SZH0; i++) {
271  if (h0[i].count > 0) {
272  h0[i].pp = ((double) h0[i].count) / nc;
273  sum -= h0[i].pp * log(h0[i].pp);
274  }
275  }
276  *e0 = sum / log(2.);
277 
278  sum = 0.;
279  for (i = 0; i < SZH1; i++) {
280  if (h1[i].count > 0) {
281  int hi;
282 
283  h1[i].pp = ((double) h1[i].count) / nc;
284  hi = h1[i].key & 0xFF;
285  hi = get_hash_index(hi, h0, SZH0);
286 
287  if (h0[hi].count > 0) {
288  h1[i].pc = h1[i].pp / h0[hi].pp;
289  sum -= h1[i].pp * log(h1[i].pc);
290  }
291  }
292  }
293  *e1 = sum / log(2.);
294 
295  sum = 0;
296  for (i = 0; i < SZH2; i++) {
297  if (h2[i].count > 0) {
298  int hi;
299 
300  h2[i].pp = ((double) h2[i].count) / nc;
301  hi = h2[i].key & 0xFFFF;
302  hi = get_hash_index(hi, h1, SZH1);
303 
304  if (h1[hi].pp > 0) {
305  h2[i].pc = h2[i].pp / h1[hi].pp;
306  sum -= h2[i].pp * log(h2[i].pc);
307  }
308  }
309  }
310  *e2 = sum / log(2.);
311 
312  FREE(h0);
313  FREE(h1);
314  FREE(h2);
315 
316  return TRUE;
317 }
318 
319 /* ****************************************************************************
320  * *
321  * *
322  **************************************************************************** */
323 
/*
 * entropy_token_class - Shannon entropy, in bits, of the distribution of
 * character classes found in a buffer.
 *
 * buf : bytes to analyse.
 * sz  : number of bytes of buf to classify; the scan does not stop at a
 *       '\0'.
 *
 * Each byte, after conv2ascii(), is put in the first class that matches,
 * in this order: alphabetic, digit, white space, punctuation, control,
 * other.
 *
 * Returns 0 when buf is NULL or sz is 0.
 */
double
entropy_token_class(buf, sz)
     char *buf;
     size_t sz;
{
  int freq[256];
  int total = 0;
  int idx;
  double acc = 0.;
  char *cur;

  if ((buf == NULL) || (sz == 0))
    return 0.;

  memset(freq, 0, sizeof (freq));

  cur = buf;
  for (idx = 0; idx < sz; idx++, cur++) {
    int c = (conv2ascii(*cur) + 256) % 256;
    int bin;

    if (isalpha(c))
      bin = 2;
    else if (isdigit(c))
      bin = 3;
    else if (isspace(c))
      bin = 4;
    else if (ispunct(c))
      bin = 5;
    else if (iscntrl(c))
      bin = 6;
    else
      bin = 0;

    freq[bin]++;
    total++;
  }

  /* accumulate -p * ln(p) over the classes that were seen */
  for (idx = 0; idx < 256; idx++) {
    double ratio;

    if (freq[idx] == 0)
      continue;

    ratio = ((double) freq[idx]) / ((double) total);
    acc -= ratio * log(ratio);
  }

  /* natural logarithm -> base 2 */
  return acc / log(2.);
}
382 
383 /* ****************************************************************************
384  * *
385  * *
386  **************************************************************************** */
387 
/*
 * entropy_punct_class - Shannon entropy, in bits, of the distribution of
 * the classes alphanumeric / white space / punctuation / other in a
 * buffer.
 *
 * buf : bytes to analyse.
 * sz  : number of bytes of buf to classify; the scan does not stop at a
 *       '\0'.
 *
 * Each byte, after conv2ascii(), is put in the first class that matches.
 * A byte that is in none of the three named classes is reported with
 * ZE_MessageInfo() and counted as "other".
 *
 * White space and punctuation are counted in separate bins.
 * NOTE(review): both used to be added to the same bin, which looked like a
 * copy/paste slip - confirm that separate bins are what is wanted.
 *
 * Returns 0 when buf is NULL or sz is 0.
 */
double
entropy_punct_class(buf, sz)
     char *buf;
     size_t sz;
{
  int freq[256];
  size_t n;
  int i;
  double sum = 0.;
  int nc = 0;
  char *p;

  if ((buf == NULL) || (sz == 0))
    return 0.;

  memset(freq, 0, sizeof (freq));

  for (n = 0, p = buf; n < sz; n++, p++) {
    int c;

    c = conv2ascii(*p);
    c = (c + 256) % 256;

    nc++;
    if (isalnum(c)) {
      freq[1]++;
      continue;
    }
    if (isspace(c)) {
      freq[2]++;
      continue;
    }
    if (ispunct(c)) {
      freq[3]++;
      continue;
    }

    ZE_MessageInfo(5, "Nor alnum, space, punct %c %3d ??? ", c, c);

    freq[0]++;
  }

  /* accumulate -p * ln(p) over the classes that were seen */
  for (i = 0; i < 256; i++) {
    if (freq[i] != 0) {
      double k;

      k = ((double) freq[i]) / ((double) nc);
      sum -= k * log(k);
    }
  }

  /* natural logarithm -> base 2 */
  return sum / log(2.);
}
441 
442 
443 /* ****************************************************************************
444  * *
445  * *
446  **************************************************************************** */
/*
 * buf_extract_tokens - print, one per line on stdout, the space-separated
 * tokens of a string.
 *
 * buf : NUL-terminated string to split. It is modified: strtok_r() writes
 *       terminators over the separators.
 */
static void
buf_extract_tokens(buf)
     char *buf;
{
  char *state;
  char *token;

  token = strtok_r(buf, " ", &state);
  while (token != NULL) {
    printf("--> %s\n", token);
    token = strtok_r(NULL, " ", &state);
  }
}
457 
458 
459 /* ****************************************************************************
460  * *
461  * *
462  **************************************************************************** */
463 typedef struct {
465  size_t html_sz;
467  size_t plain_sz;
468 } DATA_T;
469 
470 
471 static bool
472 entropy_mime_part(buf, size, id, level, type, arg, mime_part)
473  char *buf;
474  size_t size;
475  char *id;
476  int level;
477  int type;
478  void *arg;
479  mime_part_T *mime_part;
480 {
481  DATA_T *data = (DATA_T *) arg;
482 
483  double h0, h1, h2, h3, h4, ratio = 1.;
484  char *cleanbuf = NULL;
485  char *mtype = "PLAIN";
486  char *wbuf = buf;
487  int n;
488 
489  uint32_t dt;
491 
492  if (data == NULL)
493  return FALSE;
494 
495  if (type != MIME_TYPE_TEXT)
496  return TRUE;
497 
498 #if 1
499  if (abs(strspn(buf, " \t\r\n") - size) < 4)
500  return TRUE;
501 #endif
502 #if 1
503  if (size < 6)
504  return TRUE;
505 #endif
506 
507  if (strcasecmp("text/html", mime_part->mime) == 0) {
508  mtype = "HTML ";
509 
510  n = check_valid_html_tags(NULL, buf);
511 
512  ZE_LogMsgInfo(9, "NOT VALID TAGS = %6d", n);
513 
514  cleanbuf = cleanup_html_buffer(buf, strlen(buf));
515 
516 #if 1
517  ZE_MessageInfo(9, "\nBUF ...\n%s\n", buf);
518 
519  if (cleanbuf != NULL)
520  ZE_MessageInfo(9, "\nBUF ...\n%s\n", cleanbuf);
521 
522  {
523  char *x = NULL;
524 
525  x = realcleanup_text_buf(cleanbuf, strlen(cleanbuf));
526  if (x != NULL) {
527  ZE_MessageInfo(9, "\nBUF ...\n%s\n", x);
528  buf_extract_tokens(x);
529  FREE(x);
530  }
531  }
532 #endif
533  wbuf = cleanbuf;
534 
535  if (strlen(wbuf) > 0)
536  ratio = ((double) strlen(buf)) / strlen(wbuf);
537 
538  }
539 
540  dt = zeTime_ms();
541  text_buffer_entropy(wbuf, strlen(wbuf) + 1, &h0, &h1, &h2);
542  dt = zeTime_ms() - dt;
543  ZE_MessageInfo(9, "DT = %ld", dt);
544 
545  h3 = entropy_token_class(wbuf, strlen(wbuf) + 1);
546  h4 = entropy_punct_class(wbuf, strlen(wbuf) + 1);
547 
548  (void) text_word_length(wbuf, &st, strlen(wbuf));
549 
550  ZE_MessageInfo(9,
551  "%s ENTROPY = %7.3f %7.3f %7.3f %7.3f %7.3f - SIZE = %5d %5d %5d %7.2f",
552  mtype, h0, h1, h2, h3, h4, size, strlen(buf), strlen(wbuf),
553  ratio);
554 
555  ZE_MessageInfo(9,
556  "%s WORDS = %7.3f %7.3f %7.3f %7.3f - SIZE = %5d %5d %5d %7.2f",
557  mtype,
558  zeKMin(&st), zeKMean(&st), zeKMax(&st), zeKStdDev(&st),
559  size, strlen(buf), strlen(wbuf), ratio);
560 
561  if (0) {
562  long prob[256], nb;
563 
564  if ((nb = text_buf_histogram(wbuf, strlen(wbuf) + 1, prob)) > 0) {
565  int i;
566 
567  for (i = 0; i < 256; i++) {
568  if (prob[i] > 0)
569  ZE_MessageInfo(9, "%s HISTO = %3d %6d %c",
570  mtype, i, prob[i], (isprint(i) ? i : '.'));
571  }
572  }
573  }
574 
575  FREE(cleanbuf);
576 
577  return TRUE;
578 }
579 
580 
581 
582 /* ****************************************************************************
583  * *
584  * *
585  **************************************************************************** */
586 bool
587 message_entropy(id, fname)
588  char *id;
589  char *fname;
590 {
591  DATA_T data;
592 
593  if (fname == NULL)
594  return FALSE;
595 
596  memset(&data, 0, sizeof (data));
597 
598  return decode_mime_file(id, fname, NULL, entropy_mime_part, &data);
599 }
600 
601 
602 /* ****************************************************************************
603  * *
604  * *
605  **************************************************************************** */
606 typedef struct {
608  size_t html_sz;
610  size_t plain_sz;
611 } HTTP_T;
612 
613 #if 1
614 #define URL_DOMAIN_EXPRESSION "http[s]?://[^ /<>\\(\\)\"\'?]*"
615 #else
616 #define URL_DOMAIN_EXPRESSION "http://[^ /<>\"]*"
617 #endif
618 
619 static bool
620 mime_extract_http_urls(buf, size, id, level, type, arg, mime_part)
621  char *buf;
622  size_t size;
623  char *id;
624  int level;
625  int type;
626  void *arg;
627  mime_part_T *mime_part;
628 {
629  HTTP_T *data = (HTTP_T *) arg;
630 
631  if (data == NULL)
632  return FALSE;
633 
634  if (type != MIME_TYPE_TEXT)
635  return TRUE;
636 
637  if (abs(strspn(buf, " \t\r\n") - size) < 4)
638  return TRUE;
639  if (size < 6)
640  return TRUE;
641 
642  {
643  long pi, pf;
644  char *p = buf;
645  char sout[4096];
646 
647  memset(sout, 0, sizeof (sout));
648  while (zeStrRegex(p, "(ftp|http)[s]?://[^ /<>\"]*", &pi, &pf, TRUE)) {
649  char sout[1024];
650  char c = '.';
651  long xi, xf, xh;
652 
653  if (zeStrRegex(p + pf, "></a>", &xi, NULL, TRUE) && (xi - pf) < 30) {
654  bool okh, okf;
655 
656  okh = zeStrRegex(p + pf, ">", &xh, NULL, TRUE) && (xh < xi);
657  okf = zeStrRegex(p + pf, ">.+</a>", &xf, NULL, TRUE) && (xf < xi);
658 
659  c = (okf || okh) ? '.' : 'H';
660  }
661  strncpy(sout, p + pi, pf - pi);
662  sout[pf - pi] = 0;
663 #if 1
664  {
665  char *p, *q;
666 
667  for (p = q = sout; *p != '\0'; p++) {
668  if (strchr(" \t\r\n()", *p) == NULL)
669  *q++ = *p;
670  }
671  *q = '\0';
672  }
673 #else
674  if ((xh = strcspn(sout, " \t\r\n()")) >= 0)
675  sout[xh] = '\0';
676 #endif
677 
678  ZE_MessageInfo(0, "HTTP : %c %s", c, sout);
679 
680  p += pf;
681  }
682  }
683 
684  return TRUE;
685 }
686 
687 
688 /* ****************************************************************************
689  * *
690  * *
691  **************************************************************************** */
692 #define SZBUF 0x20000
693 
694 bool
696  char *id;
697  char *fname;
698 {
699  HTTP_T data;
700 
701  if (fname != NULL) {
702  memset(&data, 0, sizeof (data));
703 
704  return decode_mime_file(id, fname, NULL, mime_extract_http_urls, &data);
705  } else {
706  char *buf = NULL;
707  size_t sz;
708 
709  if ((buf = malloc(SZBUF)) != NULL) {
710  sz = read(STDIN_FILENO, buf, SZBUF);
711  if (sz > 0)
712  return decode_mime_buffer(id, buf, sz, 0, NULL, mime_extract_http_urls,
713  &data);
714  if (sz < 0)
715  return FALSE;
716  }
717  }
718  return FALSE;
719 }
size_t html_sz
Definition: ze-entropy.c:608
double pp
Definition: ze-entropy.c:152
kstats_T html_st
Definition: ze-entropy.c:607
char * realcleanup_text_buf(char *, size_t)
Definition: ze-oracle.c:408
#define SZH2
Definition: ze-entropy.c:201
#define FREE(x)
Definition: macros.h:37
bool text_word_length(char *, kstats_T *, size_t)
Definition: ze-buffer.c:266
uint64_t zeTime_ms()
Definition: zeTime.c:34
bool message_extract_http_urls(char *id, char *fname)
Definition: ze-entropy.c:695
#define SZBUF
Definition: ze-entropy.c:692
#define ZE_LogMsgInfo(level,...)
Definition: zeSyslog.h:110
double zeKMin(kstats_T *s)
Definition: zeKStats.c:62
#define FALSE
Definition: macros.h:160
bool zeStrRegex(char *, char *, long *, long *, bool)
Definition: zeStrings.c:544
#define SZH1
Definition: ze-entropy.c:200
double entropy_punct_class(char *buf, size_t sz)
Definition: ze-entropy.c:389
size_t plain_sz
Definition: ze-entropy.c:610
kstats_T html_st
Definition: ze-entropy.c:464
#define strchr
Definition: ze-sys.h:218
bool decode_mime_file(char *, char *, uint32_t *, demime_F, void *)
Definition: ze-demime.c:584
char * mime
Definition: ze-demime.h:65
#define ZE_MessageInfo(level,...)
Definition: zeSyslog.h:90
int nb
Definition: ze-connopen.c:61
#define MIME_TYPE_TEXT
Definition: ze-demime.h:34
#define TRUE
Definition: macros.h:157
bool decode_mime_buffer(char *, char *, size_t, int, uint32_t *, demime_F, void *)
Definition: ze-demime.c:154
#define KSTATS_INITIALIZER
Definition: zeKStats.h:36
double pc
Definition: ze-entropy.c:153
int check_valid_html_tags(char *, char *)
Definition: ze-html.c:334
kstats_T plain_st
Definition: ze-entropy.c:466
#define ZE_LogMsgWarning(level,...)
Definition: zeSyslog.h:112
double entropy_monogram(char *buf, size_t sz)
Definition: ze-entropy.c:107
#define SZH0
Definition: ze-entropy.c:199
void text2lowerascii(char *buf, size_t size)
Definition: ze-entropy.c:84
double zeKMean(kstats_T *s)
Definition: zeKStats.c:43
char * cleanup_html_buffer(char *, size_t)
Definition: ze-html.c:163
double zeKMax(kstats_T *s)
Definition: zeKStats.c:72
double zeKStdDev(kstats_T *s)
Definition: zeKStats.c:53
bool text_buffer_entropy(char *buf, size_t sz, double *e0, double *e1, double *e2)
Definition: ze-entropy.c:210
size_t plain_sz
Definition: ze-entropy.c:467
long uint32_t
Definition: ze-sys.h:489
kstats_T plain_st
Definition: ze-entropy.c:609
double entropy_token_class(char *buf, size_t sz)
Definition: ze-entropy.c:325
size_t html_sz
Definition: ze-entropy.c:465
bool message_entropy(char *id, char *fname)
Definition: ze-entropy.c:587
long text_buf_histogram(char *, size_t, long *)
Definition: ze-buffer.c:310