39 if (
strchr(
"àäâ , c) != NULL)
return 'a';
if (strchr("éèêë", c) != NULL)
return 'e';
if (strchr("îï", c) != NULL)
return 'i';
if (strchr("'ôö", c) != NULL)
return 'o';
if (strchr("ùû", c) != NULL)
return 'u';
if (strchr("ç", c) != NULL)
return 'c';
#else
switch (c) {
case 'à':
case 'ä':
case 'â':
return 'a';
case 'é':
case 'è':
case 'ê':
case 'ë':
return 'e';
case 'î':
case 'ï':
return 'i';
case 'ô':
case 'ö':
return 'o';
case 'ù':
case 'û':
return 'u';
case 'ç':
return 'c';
}
#endif
return c;
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
void
text2lowerascii(buf, size)
char *buf;
size_t size;
{
char *p = buf;
int c;
if (buf == NULL)
return;
for (p = buf; (size > 0) && (*p != '\0'); p++) {
c = (conv2ascii(*p) + 256) % 256;
*p = tolower(c);
}
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
#define ONLY_LOWER 1
double
entropy_monogram(buf, sz)
char *buf;
size_t sz;
{
int freq[256];
int i;
double sum = 0.;
int nc = 0;
char *p;
if ((buf == NULL) || (sz == 0))
return 0.;
memset(freq, 0, sizeof (freq));
sum = 0;
for (i = 0, p = buf; i < sz; i++, p++) {
int c;
c = conv2ascii(*p);
c = (c + 256) % 256;
c = tolower(c);
freq[c]++;
nc++;
}
for (i = 0; i < 256; i++) {
if (freq[i] != 0) {
double k;
k = ((double) freq[i]) / ((double) nc);
sum -= k * log(k);
}
}
return sum / log(2.);
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
typedef struct {
int key;
int count;
double pp;
double pc;
} hash_rec_T;
static int
get_hash_index(k, h, sz)
uint32_t k;
hash_rec_T *h;
size_t sz;
{
int i, hv = 0;
{
int c, d;
uint32_t tk = k;
c = d = 0;
for (i = 0; i < 4; i++) {
d = tk & 0x000000FF;
tk = tk >> 8;
c = d;
c ^= c << 6;
hv += (c << 11) ^ (c >> 1);
hv ^= (d << 14) + (d << 7) + (d << 4) + d;
}
hv %= sz;
}
for (i = 0; i < sz; i++) {
int j = (hv + i) % sz;
if (h[j].key == k)
return j;
if (h[j].key == 0) {
h[j].key = k;
return j;
}
}
ZE_LogMsgWarning(0, "Hash table overflow");
return 0;
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
#define SZH0 256
#define SZH1 4096
#define SZH2 16384
#define SZH3 16384
/* ****************************************************************************
* *
* *
**************************************************************************** */
bool
text_buffer_entropy(buf, sz, e0, e1, e2)
char *buf;
size_t sz;
double *e0;
double *e1;
double *e2;
{
hash_rec_T *h0, *h1, *h2;
int i;
double sum = 0.;
int nc = 0;
char *p;
if ((buf == NULL) || (sz < 6))
return FALSE;
text2lowerascii(buf, sz);
h0 = (hash_rec_T *) malloc(SZH0 * sizeof (hash_rec_T));
h1 = (hash_rec_T *) malloc(SZH1 * sizeof (hash_rec_T));
h2 = (hash_rec_T *) malloc(SZH2 * sizeof (hash_rec_T));
if ((h0 == NULL) || (h1 == NULL) || (h2 == NULL)) {
FREE(h0);
FREE(h1);
FREE(h2);
return FALSE;
}
memset(h0, 0, SZH0 * sizeof (hash_rec_T));
memset(h1, 0, SZH1 * sizeof (hash_rec_T));
memset(h2, 0, SZH2 * sizeof (hash_rec_T));
sum = 0;
for (i = 0, p = buf; i < sz - 2; i++, p++) {
int c0, c1, c2;
int hi;
uint32_t k;
c0 = p[0];
k = c0;
hi = get_hash_index(k, h0, SZH0);
h0[hi].count++;
c1 = p[1];
k = (c1 << 8) | c0;
hi = get_hash_index(k, h1, SZH1);
h1[hi].count++;
c2 = p[2];
k = (c2 << 16) | (c1 << 8) | c0;
hi = get_hash_index(k, h2, SZH2);
h2[hi].count++;
nc++;
}
sum = 0.;
for (i = 0; i < SZH0; i++) {
if (h0[i].count > 0) {
h0[i].pp = ((double) h0[i].count) / nc;
sum -= h0[i].pp * log(h0[i].pp);
}
}
*e0 = sum / log(2.);
sum = 0.;
for (i = 0; i < SZH1; i++) {
if (h1[i].count > 0) {
int hi;
h1[i].pp = ((double) h1[i].count) / nc;
hi = h1[i].key & 0xFF;
hi = get_hash_index(hi, h0, SZH0);
if (h0[hi].count > 0) {
h1[i].pc = h1[i].pp / h0[hi].pp;
sum -= h1[i].pp * log(h1[i].pc);
}
}
}
*e1 = sum / log(2.);
sum = 0;
for (i = 0; i < SZH2; i++) {
if (h2[i].count > 0) {
int hi;
h2[i].pp = ((double) h2[i].count) / nc;
hi = h2[i].key & 0xFFFF;
hi = get_hash_index(hi, h1, SZH1);
if (h1[hi].pp > 0) {
h2[i].pc = h2[i].pp / h1[hi].pp;
sum -= h2[i].pp * log(h2[i].pc);
}
}
}
*e2 = sum / log(2.);
FREE(h0);
FREE(h1);
FREE(h2);
return TRUE;
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
double
entropy_token_class(buf, sz)
char *buf;
size_t sz;
{
int freq[256];
int i;
double sum = 0.;
int nc = 0;
char *p;
if ((buf == NULL) || (sz == 0))
return 0.;
memset(freq, 0, sizeof (freq));
sum = 0;
for (i = 0, p = buf; i < sz; i++, p++) {
int c;
c = conv2ascii(*p);
c = (c + 256) % 256;
nc++;
if (isalpha(c)) {
freq[2]++;
continue;
}
if (isdigit(c)) {
freq[3]++;
continue;
}
if (isspace(c)) {
freq[4]++;
continue;
}
if (ispunct(c)) {
freq[5]++;
continue;
}
if (iscntrl(c)) {
freq[6]++;
continue;
}
freq[0]++;
}
for (i = 0; i < 256; i++) {
if (freq[i] != 0) {
double k;
k = ((double) freq[i]) / ((double) nc);
sum -= k * log(k);
}
}
return sum / log(2.);
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
double
entropy_punct_class(buf, sz)
char *buf;
size_t sz;
{
int freq[256];
int i;
double sum = 0.;
int nc = 0;
char *p;
if ((buf == NULL) || (sz == 0))
return 0.;
memset(freq, 0, sizeof (freq));
sum = 0;
for (i = 0, p = buf; i < sz; i++, p++) {
int c;
c = conv2ascii(*p);
c = (c + 256) % 256;
nc++;
if (isalnum(c)) {
freq[1]++;
continue;
}
if (isspace(c)) {
freq[2]++;
continue;
}
if (ispunct(c)) {
freq[2]++;
continue;
}
ZE_MessageInfo(5, "Nor alnum, space, punct %c %3d ??? ", c, c);
freq[0]++;
}
for (i = 0; i < 256; i++) {
if (freq[i] != 0) {
double k;
k = ((double) freq[i]) / ((double) nc);
sum -= k * log(k);
}
}
return sum / log(2.);
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
static void
buf_extract_tokens(buf)
char *buf;
{
char *s, *ptr;
for (s = strtok_r(buf, " ", &ptr); s != NULL; s = strtok_r(NULL, " ", &ptr)) {
printf("--> %s\n", s);
}
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
typedef struct {
kstats_T html_st;
size_t html_sz;
kstats_T plain_st;
size_t plain_sz;
} DATA_T;
static bool
entropy_mime_part(buf, size, id, level, type, arg, mime_part)
char *buf;
size_t size;
char *id;
int level;
int type;
void *arg;
mime_part_T *mime_part;
{
DATA_T *data = (DATA_T *) arg;
double h0, h1, h2, h3, h4, ratio = 1.;
char *cleanbuf = NULL;
char *mtype = "PLAIN";
char *wbuf = buf;
int n;
uint32_t dt;
kstats_T st = KSTATS_INITIALIZER;
if (data == NULL)
return FALSE;
if (type != MIME_TYPE_TEXT)
return TRUE;
#if 1
if (abs(strspn(buf, " \t\r\n") - size) < 4)
return TRUE;
#endif
#if 1
if (size < 6)
return TRUE;
#endif
if (strcasecmp("text/html", mime_part->mime) == 0) {
mtype = "HTML ";
n = check_valid_html_tags(NULL, buf);
ZE_LogMsgInfo(9, "NOT VALID TAGS = %6d", n);
cleanbuf = cleanup_html_buffer(buf, strlen(buf));
#if 1
ZE_MessageInfo(9, "\nBUF ...\n%s\n", buf);
if (cleanbuf != NULL)
ZE_MessageInfo(9, "\nBUF ...\n%s\n", cleanbuf);
{
char *x = NULL;
x = realcleanup_text_buf(cleanbuf, strlen(cleanbuf));
if (x != NULL) {
ZE_MessageInfo(9, "\nBUF ...\n%s\n", x);
buf_extract_tokens(x);
FREE(x);
}
}
#endif
wbuf = cleanbuf;
if (strlen(wbuf) > 0)
ratio = ((double) strlen(buf)) / strlen(wbuf);
}
dt = zeTime_ms();
text_buffer_entropy(wbuf, strlen(wbuf) + 1, &h0, &h1, &h2);
dt = zeTime_ms() - dt;
ZE_MessageInfo(9, "DT = %ld", dt);
h3 = entropy_token_class(wbuf, strlen(wbuf) + 1);
h4 = entropy_punct_class(wbuf, strlen(wbuf) + 1);
(void) text_word_length(wbuf, &st, strlen(wbuf));
ZE_MessageInfo(9,
"%s ENTROPY = %7.3f %7.3f %7.3f %7.3f %7.3f - SIZE = %5d %5d %5d %7.2f",
mtype, h0, h1, h2, h3, h4, size, strlen(buf), strlen(wbuf),
ratio);
ZE_MessageInfo(9,
"%s WORDS = %7.3f %7.3f %7.3f %7.3f - SIZE = %5d %5d %5d %7.2f",
mtype,
zeKMin(&st), zeKMean(&st), zeKMax(&st), zeKStdDev(&st),
size, strlen(buf), strlen(wbuf), ratio);
if (0) {
long prob[256], nb;
if ((nb = text_buf_histogram(wbuf, strlen(wbuf) + 1, prob)) > 0) {
int i;
for (i = 0; i < 256; i++) {
if (prob[i] > 0)
ZE_MessageInfo(9, "%s HISTO = %3d %6d %c",
mtype, i, prob[i], (isprint(i) ? i : '.'));
}
}
}
FREE(cleanbuf);
return TRUE;
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
bool
message_entropy(id, fname)
char *id;
char *fname;
{
DATA_T data;
if (fname == NULL)
return FALSE;
memset(&data, 0, sizeof (data));
return decode_mime_file(id, fname, NULL, entropy_mime_part, &data);
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
typedef struct {
kstats_T html_st;
size_t html_sz;
kstats_T plain_st;
size_t plain_sz;
} HTTP_T;
#if 1
#define URL_DOMAIN_EXPRESSION "http[s]?://[^ /<>\\(\\)\"\'?]*"
#else
#define URL_DOMAIN_EXPRESSION "http://[^ /<>\"]*"
#endif
static bool
mime_extract_http_urls(buf, size, id, level, type, arg, mime_part)
char *buf;
size_t size;
char *id;
int level;
int type;
void *arg;
mime_part_T *mime_part;
{
HTTP_T *data = (HTTP_T *) arg;
if (data == NULL)
return FALSE;
if (type != MIME_TYPE_TEXT)
return TRUE;
if (abs(strspn(buf, " \t\r\n") - size) < 4)
return TRUE;
if (size < 6)
return TRUE;
{
long pi, pf;
char *p = buf;
char sout[4096];
memset(sout, 0, sizeof (sout));
while (zeStrRegex(p, "(ftp|http)[s]?://[^ /<>\"]*", &pi, &pf, TRUE)) {
char sout[1024];
char c = '.';
long xi, xf, xh;
if (zeStrRegex(p + pf, "></a>", &xi, NULL, TRUE) && (xi - pf) < 30) {
bool okh, okf;
okh = zeStrRegex(p + pf, ">", &xh, NULL, TRUE) && (xh < xi);
okf = zeStrRegex(p + pf, ">.+</a>", &xf, NULL, TRUE) && (xf < xi);
c = (okf || okh) ? '.' : 'H';
}
strncpy(sout, p + pi, pf - pi);
sout[pf - pi] = 0;
#if 1
{
char *p, *q;
for (p = q = sout; *p != '\0'; p++) {
if (strchr(" \t\r\n()", *p) == NULL)
*q++ = *p;
}
*q = '\0';
}
#else
if ((xh = strcspn(sout, " \t\r\n()")) >= 0)
sout[xh] = '\0';
#endif
ZE_MessageInfo(0, "HTTP : %c %s", c, sout);
p += pf;
}
}
return TRUE;
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
#define SZBUF 0x20000
bool
message_extract_http_urls(id, fname)
char *id;
char *fname;
{
HTTP_T data;
if (fname != NULL) {
memset(&data, 0, sizeof (data));
return decode_mime_file(id, fname, NULL, mime_extract_http_urls, &data);
} else {
char *buf = NULL;
size_t sz;
if ((buf = malloc(SZBUF)) != NULL) {
sz = read(STDIN_FILENO, buf, SZBUF);
if (sz > 0)
return decode_mime_buffer(id, buf, sz, 0, NULL, mime_extract_http_urls,
&data);
if (sz < 0)
return FALSE;
}
}
return FALSE;
}
", c) != NULL)
41 if (
strchr(
"éèêë", c) != NULL)
43 if (
strchr(
"îï", c) != NULL)
45 if (
strchr(
"'ôö", c) != NULL)
47 if (
strchr(
"ùû", c) != NULL)
49 if (
strchr(
"ç , c) != NULL)
return 'c';
#else
switch (c) {
case 'à':
case 'ä':
case 'â':
return 'a';
case 'é':
case 'è':
case 'ê':
case 'ë':
return 'e';
case 'î':
case 'ï':
return 'i';
case 'ô':
case 'ö':
return 'o';
case 'ù':
case 'û':
return 'u';
case 'ç':
return 'c';
}
#endif
return c;
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
void
text2lowerascii(buf, size)
char *buf;
size_t size;
{
char *p = buf;
int c;
if (buf == NULL)
return;
for (p = buf; (size > 0) && (*p != '\0'); p++) {
c = (conv2ascii(*p) + 256) % 256;
*p = tolower(c);
}
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
#define ONLY_LOWER 1
double
entropy_monogram(buf, sz)
char *buf;
size_t sz;
{
int freq[256];
int i;
double sum = 0.;
int nc = 0;
char *p;
if ((buf == NULL) || (sz == 0))
return 0.;
memset(freq, 0, sizeof (freq));
sum = 0;
for (i = 0, p = buf; i < sz; i++, p++) {
int c;
c = conv2ascii(*p);
c = (c + 256) % 256;
c = tolower(c);
freq[c]++;
nc++;
}
for (i = 0; i < 256; i++) {
if (freq[i] != 0) {
double k;
k = ((double) freq[i]) / ((double) nc);
sum -= k * log(k);
}
}
return sum / log(2.);
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
typedef struct {
int key;
int count;
double pp;
double pc;
} hash_rec_T;
static int
get_hash_index(k, h, sz)
uint32_t k;
hash_rec_T *h;
size_t sz;
{
int i, hv = 0;
{
int c, d;
uint32_t tk = k;
c = d = 0;
for (i = 0; i < 4; i++) {
d = tk & 0x000000FF;
tk = tk >> 8;
c = d;
c ^= c << 6;
hv += (c << 11) ^ (c >> 1);
hv ^= (d << 14) + (d << 7) + (d << 4) + d;
}
hv %= sz;
}
for (i = 0; i < sz; i++) {
int j = (hv + i) % sz;
if (h[j].key == k)
return j;
if (h[j].key == 0) {
h[j].key = k;
return j;
}
}
ZE_LogMsgWarning(0, "Hash table overflow");
return 0;
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
#define SZH0 256
#define SZH1 4096
#define SZH2 16384
#define SZH3 16384
/* ****************************************************************************
* *
* *
**************************************************************************** */
bool
text_buffer_entropy(buf, sz, e0, e1, e2)
char *buf;
size_t sz;
double *e0;
double *e1;
double *e2;
{
hash_rec_T *h0, *h1, *h2;
int i;
double sum = 0.;
int nc = 0;
char *p;
if ((buf == NULL) || (sz < 6))
return FALSE;
text2lowerascii(buf, sz);
h0 = (hash_rec_T *) malloc(SZH0 * sizeof (hash_rec_T));
h1 = (hash_rec_T *) malloc(SZH1 * sizeof (hash_rec_T));
h2 = (hash_rec_T *) malloc(SZH2 * sizeof (hash_rec_T));
if ((h0 == NULL) || (h1 == NULL) || (h2 == NULL)) {
FREE(h0);
FREE(h1);
FREE(h2);
return FALSE;
}
memset(h0, 0, SZH0 * sizeof (hash_rec_T));
memset(h1, 0, SZH1 * sizeof (hash_rec_T));
memset(h2, 0, SZH2 * sizeof (hash_rec_T));
sum = 0;
for (i = 0, p = buf; i < sz - 2; i++, p++) {
int c0, c1, c2;
int hi;
uint32_t k;
c0 = p[0];
k = c0;
hi = get_hash_index(k, h0, SZH0);
h0[hi].count++;
c1 = p[1];
k = (c1 << 8) | c0;
hi = get_hash_index(k, h1, SZH1);
h1[hi].count++;
c2 = p[2];
k = (c2 << 16) | (c1 << 8) | c0;
hi = get_hash_index(k, h2, SZH2);
h2[hi].count++;
nc++;
}
sum = 0.;
for (i = 0; i < SZH0; i++) {
if (h0[i].count > 0) {
h0[i].pp = ((double) h0[i].count) / nc;
sum -= h0[i].pp * log(h0[i].pp);
}
}
*e0 = sum / log(2.);
sum = 0.;
for (i = 0; i < SZH1; i++) {
if (h1[i].count > 0) {
int hi;
h1[i].pp = ((double) h1[i].count) / nc;
hi = h1[i].key & 0xFF;
hi = get_hash_index(hi, h0, SZH0);
if (h0[hi].count > 0) {
h1[i].pc = h1[i].pp / h0[hi].pp;
sum -= h1[i].pp * log(h1[i].pc);
}
}
}
*e1 = sum / log(2.);
sum = 0;
for (i = 0; i < SZH2; i++) {
if (h2[i].count > 0) {
int hi;
h2[i].pp = ((double) h2[i].count) / nc;
hi = h2[i].key & 0xFFFF;
hi = get_hash_index(hi, h1, SZH1);
if (h1[hi].pp > 0) {
h2[i].pc = h2[i].pp / h1[hi].pp;
sum -= h2[i].pp * log(h2[i].pc);
}
}
}
*e2 = sum / log(2.);
FREE(h0);
FREE(h1);
FREE(h2);
return TRUE;
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
double
entropy_token_class(buf, sz)
char *buf;
size_t sz;
{
int freq[256];
int i;
double sum = 0.;
int nc = 0;
char *p;
if ((buf == NULL) || (sz == 0))
return 0.;
memset(freq, 0, sizeof (freq));
sum = 0;
for (i = 0, p = buf; i < sz; i++, p++) {
int c;
c = conv2ascii(*p);
c = (c + 256) % 256;
nc++;
if (isalpha(c)) {
freq[2]++;
continue;
}
if (isdigit(c)) {
freq[3]++;
continue;
}
if (isspace(c)) {
freq[4]++;
continue;
}
if (ispunct(c)) {
freq[5]++;
continue;
}
if (iscntrl(c)) {
freq[6]++;
continue;
}
freq[0]++;
}
for (i = 0; i < 256; i++) {
if (freq[i] != 0) {
double k;
k = ((double) freq[i]) / ((double) nc);
sum -= k * log(k);
}
}
return sum / log(2.);
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
double
entropy_punct_class(buf, sz)
char *buf;
size_t sz;
{
int freq[256];
int i;
double sum = 0.;
int nc = 0;
char *p;
if ((buf == NULL) || (sz == 0))
return 0.;
memset(freq, 0, sizeof (freq));
sum = 0;
for (i = 0, p = buf; i < sz; i++, p++) {
int c;
c = conv2ascii(*p);
c = (c + 256) % 256;
nc++;
if (isalnum(c)) {
freq[1]++;
continue;
}
if (isspace(c)) {
freq[2]++;
continue;
}
if (ispunct(c)) {
freq[2]++;
continue;
}
ZE_MessageInfo(5, "Nor alnum, space, punct %c %3d ??? ", c, c);
freq[0]++;
}
for (i = 0; i < 256; i++) {
if (freq[i] != 0) {
double k;
k = ((double) freq[i]) / ((double) nc);
sum -= k * log(k);
}
}
return sum / log(2.);
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
static void
buf_extract_tokens(buf)
char *buf;
{
char *s, *ptr;
for (s = strtok_r(buf, " ", &ptr); s != NULL; s = strtok_r(NULL, " ", &ptr)) {
printf("--> %s\n", s);
}
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
typedef struct {
kstats_T html_st;
size_t html_sz;
kstats_T plain_st;
size_t plain_sz;
} DATA_T;
static bool
entropy_mime_part(buf, size, id, level, type, arg, mime_part)
char *buf;
size_t size;
char *id;
int level;
int type;
void *arg;
mime_part_T *mime_part;
{
DATA_T *data = (DATA_T *) arg;
double h0, h1, h2, h3, h4, ratio = 1.;
char *cleanbuf = NULL;
char *mtype = "PLAIN";
char *wbuf = buf;
int n;
uint32_t dt;
kstats_T st = KSTATS_INITIALIZER;
if (data == NULL)
return FALSE;
if (type != MIME_TYPE_TEXT)
return TRUE;
#if 1
if (abs(strspn(buf, " \t\r\n") - size) < 4)
return TRUE;
#endif
#if 1
if (size < 6)
return TRUE;
#endif
if (strcasecmp("text/html", mime_part->mime) == 0) {
mtype = "HTML ";
n = check_valid_html_tags(NULL, buf);
ZE_LogMsgInfo(9, "NOT VALID TAGS = %6d", n);
cleanbuf = cleanup_html_buffer(buf, strlen(buf));
#if 1
ZE_MessageInfo(9, "\nBUF ...\n%s\n", buf);
if (cleanbuf != NULL)
ZE_MessageInfo(9, "\nBUF ...\n%s\n", cleanbuf);
{
char *x = NULL;
x = realcleanup_text_buf(cleanbuf, strlen(cleanbuf));
if (x != NULL) {
ZE_MessageInfo(9, "\nBUF ...\n%s\n", x);
buf_extract_tokens(x);
FREE(x);
}
}
#endif
wbuf = cleanbuf;
if (strlen(wbuf) > 0)
ratio = ((double) strlen(buf)) / strlen(wbuf);
}
dt = zeTime_ms();
text_buffer_entropy(wbuf, strlen(wbuf) + 1, &h0, &h1, &h2);
dt = zeTime_ms() - dt;
ZE_MessageInfo(9, "DT = %ld", dt);
h3 = entropy_token_class(wbuf, strlen(wbuf) + 1);
h4 = entropy_punct_class(wbuf, strlen(wbuf) + 1);
(void) text_word_length(wbuf, &st, strlen(wbuf));
ZE_MessageInfo(9,
"%s ENTROPY = %7.3f %7.3f %7.3f %7.3f %7.3f - SIZE = %5d %5d %5d %7.2f",
mtype, h0, h1, h2, h3, h4, size, strlen(buf), strlen(wbuf),
ratio);
ZE_MessageInfo(9,
"%s WORDS = %7.3f %7.3f %7.3f %7.3f - SIZE = %5d %5d %5d %7.2f",
mtype,
zeKMin(&st), zeKMean(&st), zeKMax(&st), zeKStdDev(&st),
size, strlen(buf), strlen(wbuf), ratio);
if (0) {
long prob[256], nb;
if ((nb = text_buf_histogram(wbuf, strlen(wbuf) + 1, prob)) > 0) {
int i;
for (i = 0; i < 256; i++) {
if (prob[i] > 0)
ZE_MessageInfo(9, "%s HISTO = %3d %6d %c",
mtype, i, prob[i], (isprint(i) ? i : '.'));
}
}
}
FREE(cleanbuf);
return TRUE;
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
bool
message_entropy(id, fname)
char *id;
char *fname;
{
DATA_T data;
if (fname == NULL)
return FALSE;
memset(&data, 0, sizeof (data));
return decode_mime_file(id, fname, NULL, entropy_mime_part, &data);
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
typedef struct {
kstats_T html_st;
size_t html_sz;
kstats_T plain_st;
size_t plain_sz;
} HTTP_T;
#if 1
#define URL_DOMAIN_EXPRESSION "http[s]?://[^ /<>\\(\\)\"\'?]*"
#else
#define URL_DOMAIN_EXPRESSION "http://[^ /<>\"]*"
#endif
static bool
mime_extract_http_urls(buf, size, id, level, type, arg, mime_part)
char *buf;
size_t size;
char *id;
int level;
int type;
void *arg;
mime_part_T *mime_part;
{
HTTP_T *data = (HTTP_T *) arg;
if (data == NULL)
return FALSE;
if (type != MIME_TYPE_TEXT)
return TRUE;
if (abs(strspn(buf, " \t\r\n") - size) < 4)
return TRUE;
if (size < 6)
return TRUE;
{
long pi, pf;
char *p = buf;
char sout[4096];
memset(sout, 0, sizeof (sout));
while (zeStrRegex(p, "(ftp|http)[s]?://[^ /<>\"]*", &pi, &pf, TRUE)) {
char sout[1024];
char c = '.';
long xi, xf, xh;
if (zeStrRegex(p + pf, "></a>", &xi, NULL, TRUE) && (xi - pf) < 30) {
bool okh, okf;
okh = zeStrRegex(p + pf, ">", &xh, NULL, TRUE) && (xh < xi);
okf = zeStrRegex(p + pf, ">.+</a>", &xf, NULL, TRUE) && (xf < xi);
c = (okf || okh) ? '.' : 'H';
}
strncpy(sout, p + pi, pf - pi);
sout[pf - pi] = 0;
#if 1
{
char *p, *q;
for (p = q = sout; *p != '\0'; p++) {
if (strchr(" \t\r\n()", *p) == NULL)
*q++ = *p;
}
*q = '\0';
}
#else
if ((xh = strcspn(sout, " \t\r\n()")) >= 0)
sout[xh] = '\0';
#endif
ZE_MessageInfo(0, "HTTP : %c %s", c, sout);
p += pf;
}
}
return TRUE;
}
/* ****************************************************************************
* *
* *
**************************************************************************** */
#define SZBUF 0x20000
bool
message_extract_http_urls(id, fname)
char *id;
char *fname;
{
HTTP_T data;
if (fname != NULL) {
memset(&data, 0, sizeof (data));
return decode_mime_file(id, fname, NULL, mime_extract_http_urls, &data);
} else {
char *buf = NULL;
size_t sz;
if ((buf = malloc(SZBUF)) != NULL) {
sz = read(STDIN_FILENO, buf, SZBUF);
if (sz > 0)
return decode_mime_buffer(id, buf, sz, 0, NULL, mime_extract_http_urls,
&data);
if (sz < 0)
return FALSE;
}
}
return FALSE;
}
", c) != NULL)
94 for (p = buf; (size > 0) && (*p !=
'\0'); p++) {
95 c = (conv2ascii(*p) + 256) % 256;
117 if ((buf == NULL) || (sz == 0))
120 memset(freq, 0,
sizeof (freq));
123 for (i = 0, p = buf; i < sz; i++, p++) {
134 for (i = 0; i < 256; i++) {
138 k = ((double) freq[i]) / ((double) nc);
142 return sum / log(2.);
157 get_hash_index(k, h, sz)
169 for (i = 0; i < 4; i++) {
174 hv += (c << 11) ^ (c >> 1);
175 hv ^= (d << 14) + (d << 7) + (d << 4) + d;
179 for (i = 0; i < sz; i++) {
180 int j = (hv + i) % sz;
223 if ((buf == NULL) || (sz < 6))
231 if ((h0 == NULL) || (h1 == NULL) || (h2 == NULL)) {
243 for (i = 0, p = buf; i < sz - 2; i++, p++) {
251 hi = get_hash_index(k, h0,
SZH0);
257 hi = get_hash_index(k, h1,
SZH1);
262 k = (c2 << 16) | (c1 << 8) | c0;
263 hi = get_hash_index(k, h2,
SZH2);
270 for (i = 0; i <
SZH0; i++) {
271 if (h0[i].count > 0) {
272 h0[i].
pp = ((double) h0[i].count) / nc;
273 sum -= h0[i].
pp * log(h0[i].pp);
279 for (i = 0; i <
SZH1; i++) {
280 if (h1[i].count > 0) {
283 h1[i].
pp = ((double) h1[i].count) / nc;
284 hi = h1[i].
key & 0xFF;
285 hi = get_hash_index(hi, h0, SZH0);
287 if (h0[hi].count > 0) {
288 h1[i].
pc = h1[i].
pp / h0[hi].
pp;
289 sum -= h1[i].
pp * log(h1[i].pc);
296 for (i = 0; i <
SZH2; i++) {
297 if (h2[i].count > 0) {
300 h2[i].
pp = ((double) h2[i].count) / nc;
301 hi = h2[i].
key & 0xFFFF;
302 hi = get_hash_index(hi, h1, SZH1);
305 h2[i].
pc = h2[i].
pp / h1[hi].
pp;
306 sum -= h2[i].
pp * log(h2[i].pc);
335 if ((buf == NULL) || (sz == 0))
338 memset(freq, 0,
sizeof (freq));
341 for (i = 0, p = buf; i < sz; i++, p++) {
372 for (i = 0; i < 256; i++) {
376 k = ((double) freq[i]) / ((double) nc);
380 return sum / log(2.);
399 if ((buf == NULL) || (sz == 0))
402 memset(freq, 0,
sizeof (freq));
405 for (i = 0, p = buf; i < sz; i++, p++) {
431 for (i = 0; i < 256; i++) {
435 k = ((double) freq[i]) / ((double) nc);
439 return sum / log(2.);
448 buf_extract_tokens(buf)
453 for (s = strtok_r(buf,
" ", &ptr); s != NULL; s = strtok_r(NULL,
" ", &ptr)) {
454 printf(
"--> %s\n", s);
472 entropy_mime_part(buf, size,
id, level, type, arg, mime_part)
483 double h0, h1, h2, h3, h4, ratio = 1.;
484 char *cleanbuf = NULL;
485 char *mtype =
"PLAIN";
499 if (abs(strspn(buf,
" \t\r\n") - size) < 4)
507 if (strcasecmp(
"text/html", mime_part->
mime) == 0) {
519 if (cleanbuf != NULL)
528 buf_extract_tokens(x);
535 if (strlen(wbuf) > 0)
536 ratio = ((double) strlen(buf)) / strlen(wbuf);
551 "%s ENTROPY = %7.3f %7.3f %7.3f %7.3f %7.3f - SIZE = %5d %5d %5d %7.2f",
552 mtype, h0, h1, h2, h3, h4, size, strlen(buf), strlen(wbuf),
556 "%s WORDS = %7.3f %7.3f %7.3f %7.3f - SIZE = %5d %5d %5d %7.2f",
559 size, strlen(buf), strlen(wbuf), ratio);
567 for (i = 0; i < 256; i++) {
570 mtype, i, prob[i], (isprint(i) ? i :
'.'));
596 memset(&data, 0,
sizeof (data));
614 #define URL_DOMAIN_EXPRESSION "http[s]?://[^ /<>\\(\\)\"\'?]*" 616 #define URL_DOMAIN_EXPRESSION "http://[^ /<>\"]*" 620 mime_extract_http_urls(buf, size,
id, level, type, arg, mime_part)
637 if (abs(strspn(buf,
" \t\r\n") - size) < 4)
647 memset(sout, 0,
sizeof (sout));
648 while (
zeStrRegex(p,
"(ftp|http)[s]?://[^ /<>\"]*", &pi, &pf,
TRUE)) {
653 if (
zeStrRegex(p + pf,
"></a>", &xi, NULL,
TRUE) && (xi - pf) < 30) {
657 okf =
zeStrRegex(p + pf,
">.+</a>", &xf, NULL,
TRUE) && (xf < xi);
659 c = (okf || okh) ?
'.' :
'H';
661 strncpy(sout, p + pi, pf - pi);
667 for (p = q = sout; *p !=
'\0'; p++) {
668 if (
strchr(
" \t\r\n()", *p) == NULL)
674 if ((xh = strcspn(sout,
" \t\r\n()")) >= 0)
692 #define SZBUF 0x20000 702 memset(&data, 0,
sizeof (data));
709 if ((buf = malloc(
SZBUF)) != NULL) {
710 sz = read(STDIN_FILENO, buf,
SZBUF);
char * realcleanup_text_buf(char *, size_t)
bool text_word_length(char *, kstats_T *, size_t)
bool message_extract_http_urls(char *id, char *fname)
#define ZE_LogMsgInfo(level,...)
double zeKMin(kstats_T *s)
bool zeStrRegex(char *, char *, long *, long *, bool)
double entropy_punct_class(char *buf, size_t sz)
#define ZE_MessageInfo(level,...)
#define KSTATS_INITIALIZER
#define ZE_LogMsgWarning(level,...)
double entropy_monogram(char *buf, size_t sz)
void text2lowerascii(char *buf, size_t size)
double zeKMean(kstats_T *s)
double zeKMax(kstats_T *s)
double zeKStdDev(kstats_T *s)
bool text_buffer_entropy(char *buf, size_t sz, double *e0, double *e1, double *e2)
double entropy_token_class(char *buf, size_t sz)
bool message_entropy(char *id, char *fname)
long text_buf_histogram(char *, size_t, long *)