46 #define MSG_BTSM_INITIALIZER \ 53 #define FEATURE_WORD 0 54 #define FEATURE_NGRAM 1 62 static bool mimepart2wordTokens(
char *,
size_t,
char *,
int,
int,
65 static bool mimepart2ngramTokens(
char *,
size_t,
char *,
int,
int,
77 static bool msg_btsm_add_token(
msg_btsm_T * bm,
char *token);
125 #define SEP_TOK " \t\n\r,/=&?\"()[]{}<>" 127 #define SEP_TOK " \t\n\r,=&?\"()[]{}<>;~/" 130 static char *TOK_SEPARATOR[] = {
131 " \t\n\r\"()[]{}<>/",
136 #define SEP_WS " \t\n\r" 144 void (*func) (
char *);
153 #define TOKCONF_INITIALIZER {"body", "body", TRUE, NULL, NULL, TRUE} 155 static bool extract_word_tokens(
tokconf_T * cf,
char *prefix,
161 {
"x-mailer",
"xmailer",
FALSE, NULL, NULL,
TRUE},
162 {
"user-agent",
"uagent",
FALSE, NULL, NULL, TRUE},
163 {
"from",
"from",
FALSE, NULL, NULL, TRUE},
164 {
"subject",
"subject",
FALSE, NULL, NULL, TRUE},
165 {
"received",
"rcvd",
FALSE, NULL, NULL,
FALSE},
166 {
"content-type",
"ctype",
FALSE, NULL, NULL, TRUE},
167 {
"content-disposition",
"cdisp",
FALSE, NULL, NULL, TRUE},
168 {
"content-description",
"cdesc",
FALSE, NULL, NULL, TRUE},
169 {
"content-transfer-encoding",
"ctencode",
FALSE, NULL, NULL, TRUE},
170 {
"content-id",
"cid",
FALSE, NULL, NULL, TRUE},
171 {
"message-id",
"msgid",
FALSE, NULL, NULL, TRUE},
173 {
"boundary",
"bound",
FALSE, NULL, NULL, TRUE},
174 {NULL, NULL,
FALSE, NULL, NULL, FALSE}
178 get_tokconf_headers(tag)
183 for (p = hdrs_tokconf; p->
tag != NULL; p++)
193 {
"body",
"body",
FALSE, NULL, NULL,
TRUE},
194 {
"text/plain",
"body",
FALSE, NULL, NULL, TRUE},
195 {
"text/html",
"body",
FALSE, NULL, NULL, TRUE},
196 {
"html/tags",
"htmltags",
FALSE,
" \t\n\r,=&?\"()[]{}<>;~/", NULL, TRUE},
197 {
"simple",
"body",
FALSE, NULL, NULL, TRUE},
198 {
"boundary",
"bound",
FALSE, NULL, NULL, TRUE},
199 {
"name",
"name",
FALSE, NULL, NULL, TRUE},
201 {
"cdname",
"cdname",
FALSE, NULL, NULL, TRUE},
203 {
"ctname",
"ctname",
FALSE, NULL, NULL, TRUE},
204 {NULL, NULL,
FALSE, NULL, NULL, FALSE}
208 get_tokconf_body(tag)
213 for (p = body_tokconf; p->
tag != NULL; p++)
229 for (p = body_tokconf; p->
tag != NULL; p++)
237 for (p = hdrs_tokconf; p->
tag != NULL; p++)
262 while ((i = strlen(s)) > 0)
264 if (
strchr(
".-/*'`/:", s[i - 1]) == NULL)
271 if ((i = strspn(p,
"()><+-.*!'`/")) > 0)
281 for (p = s; *p ==
'$' && !isdigit(*(p + 1)); p++)
310 if (strspn(s,
"0123456789") == strlen(s))
315 if (
zeStrRegex(s,
"^[0-9]{2,2}/[0-9]{1,2}/[0-9]{2,4}$", NULL, NULL,
TRUE))
320 (s,
"^[0-9]{2,2}:[0-9]{2,2}:[0-9]{2,2}(pm|am)?$", NULL, NULL,
TRUE))
322 if (
zeStrRegex(s,
"^[0-9]{2,2}:[0-9]{2,2}(pm|am)?$", NULL, NULL,
TRUE))
332 #define ADD_TOKEN(bm, prefix, token) \ 334 if (strlen(token) > 3) { \ 337 if (prefix != NULL) \ 338 snprintf(tstr, sizeof(tstr), "%s--%s", prefix, token); \ 340 snprintf(tstr, sizeof(tstr), "%s--%s", "GLOB", token); \ 341 if (!msg_btsm_add_token(bm, tstr)) \ 342 ZE_LogMsgError(0, "ERROR inserting new token"); \ 349 extract_word_tokens(cf, prefix, separator, buf, kind, bm, level)
369 if (bm == NULL || buf == NULL || strlen(buf) == 0)
376 if (separator == NULL)
381 separator = TOK_SEPARATOR[1];
388 for (stok = strtok_r(buf, separator, &ptr); stok != NULL;
389 stok = strtok_r(NULL, separator, &ptr))
391 if (ptr != NULL && *ptr !=
'\0')
395 if (p[0] ==
'$' && isspace(p[1]) && isdigit(p[2]))
399 token_trim_bounds(stok);
401 if (!check_token(stok))
412 while (isdigit(*q) || *q ==
'.')
425 for (i = 1; i <= 6 && isxdigit(stok[i]); i++)
438 if ((ts = strdup(stok)) != NULL)
445 for (p = q = ts; *p !=
'\0'; p++)
453 if (!first && isxdigit(*(p - 1)) && isxdigit(*(p + 1)))
457 if (!first && (isdigit(*(p - 1)) || isdigit(*(p + 1))))
474 #define SECSEP ".^\':@|+_-%#!$" 476 if (strpbrk(stok,
SECSEP) != NULL)
478 strlcpy(ts, stok, strlen(stok) + 1);
479 extract_word_tokens(NULL, prefix,
SECSEP, ts, kind, bm, level);
493 if (strlen(prev) > 0 && strlen(stok) > 0)
495 snprintf(t,
sizeof (t),
"%s-dbl-%s", prev, stok);
519 #define X_HTML_SEP " -x- " 522 extract_html_tags(buf, size)
530 if (buf == NULL || strlen(buf) == 0)
533 msz = 2 * (size + 1);
534 msz += (8 - msz % 8);
544 for (p = buf + strcspn(buf,
"<"); *p !=
'\0'; p += strcspn(p,
"<"))
569 #define NORM_FILENAME(fname) \ 575 for (px = fname; *px != '\0'; px++) \ 599 mimepart2wordTokens(buf, size, xid, level, type, arg, mime_part)
640 if ((x = get_tokconf_body(
"ctmain")) != NULL && x->
active)
652 if ((x = get_tokconf_body(
"ctname")) != NULL && x->
active)
665 if ((x = get_tokconf_body(
"ctname")) != NULL && x->
active)
683 if ((x = get_tokconf_headers(
"boundary")) != NULL && x->
active)
685 char *sep =
" \t\n\r";
689 for (q = bound; *q !=
'\0'; q++)
699 if (
strchr(
"=-_", *q) != NULL)
702 extract_word_tokens(x, x->
prefix, sep, bound, 0, bm, 0);
723 if ((x = get_tokconf_body(
"cdmain")) != NULL && x->
active)
735 if ((x = get_tokconf_body(
"cdname")) != NULL && x->
active)
748 if ((x = get_tokconf_body(
"cdname")) != NULL && x->
active)
761 for (h = mime_part->
hdrs; h != NULL; h = h->
next)
765 if ((x = get_tokconf_headers(h->
key)) != NULL && x->
active)
773 for (px = h->
value; *px !=
'\0'; px++)
800 for (p = q = buf; *p !=
'\0'; p++)
802 if (*p != *q || isalpha(*p))
809 if (abs(strspn(buf,
" \t\r\n") - size) < 4)
816 char *cleanbuf = NULL;
824 if ((x = get_tokconf_body(
"text/html")) != NULL)
825 extract_word_tokens(x, x->
prefix, cleanbuf, 0, bm, 0);
828 extract_word_tokens(NULL,
"body", NULL, cleanbuf, 0, bm, 0);
831 cleanbuf = extract_html_tags(buf, strlen(buf) + 1);
836 if ((x = get_tokconf_body(
"html/tags")) != NULL)
837 extract_word_tokens(x, x->
prefix, cleanbuf, 0, bm, 0);
840 extract_word_tokens(NULL,
"html", NULL, cleanbuf, 0, bm, 0);
849 extract_word_tokens(NULL,
"body", NULL, buf, 0, bm, 0);
854 extract_word_tokens(NULL,
"body", NULL, buf, 0, bm, 0);
870 static int C_NGRAM = 5;
873 extract_char_tokens(cf, prefix, separator, buf, kind, bm, level)
891 if (bm == NULL || buf == NULL || strlen(buf) == 0)
906 for (p = q = buf; *p !=
'\0'; cp = *p++)
941 for (p = buf; strlen(p) >= C_NGRAM; p++)
959 mimepart2ngramTokens(buf, size, xid, level, type, arg, mime_part)
1000 if ((x = get_tokconf_body(
"ctmain")) != NULL && x->
active)
1012 if ((x = get_tokconf_body(
"ctname")) != NULL && x->
active)
1025 if ((x = get_tokconf_body(
"ctname")) != NULL && x->
active)
1043 if ((x = get_tokconf_headers(
"boundary")) != NULL && x->
active)
1045 char *sep =
" \t\n\r";
1049 for (q = bound; *q !=
'\0'; q++)
1059 if (
strchr(
"=-_", *q) != NULL)
1062 extract_char_tokens(x, x->
prefix, sep, bound, 0, bm, 0);
1083 if ((x = get_tokconf_body(
"cdmain")) != NULL && x->
active)
1095 if ((x = get_tokconf_body(
"cdname")) != NULL && x->
active)
1108 if ((x = get_tokconf_body(
"cdname")) != NULL && x->
active)
1121 for (h = mime_part->
hdrs; h != NULL; h = h->
next)
1125 if ((x = get_tokconf_headers(h->
key)) != NULL && x->
active)
1133 for (px = h->
value; *px !=
'\0'; px++)
1160 for (p = q = buf; *p !=
'\0'; p++)
1162 if (*p != *q || isalpha(*p))
1169 if (abs(strspn(buf,
" \t\r\n") - size) < 4)
1176 char *cleanbuf = NULL;
1184 if ((x = get_tokconf_body(
"text/html")) != NULL)
1185 extract_char_tokens(x, x->
prefix, cleanbuf, 0, bm, 0);
1188 extract_char_tokens(NULL,
"body", NULL, cleanbuf, 0, bm, 0);
1191 cleanbuf = extract_html_tags(buf, strlen(buf) + 1);
1196 if ((x = get_tokconf_body(
"html/tags")) != NULL)
1197 extract_char_tokens(x, x->
prefix, cleanbuf, 0, bm, 0);
1200 extract_char_tokens(NULL,
"html", NULL, cleanbuf, 0, bm, 0);
1209 extract_char_tokens(NULL,
"body", NULL, buf, 0, bm, 0);
1214 extract_char_tokens(NULL,
"body", NULL, buf, 0, bm, 0);
1231 static int token_cmp(
void *,
void *);
1234 msg_btsm_add_token(bm, token)
1241 memset(&tok, 0,
sizeof (tok));
1317 bool TextUnitWord =
TRUE;
1337 (void) msg_btsm_init(&bm);
1343 if ((env = getenv(
"TEXTUNIT")) != NULL)
1346 TextUnitWord = FALSE;
1349 if ((env = getenv(
"NGRAMLEN")) != NULL && strlen(env) > 0)
1354 if (n > 0 && n < 10)
1361 TextUnitWord ?
"WORD" :
"NGRAM", C_NGRAM);
1380 (void) msg_btsm_end(&bm);
#define MSG_BTSM_INITIALIZER
void * zeBTree_Get(ZEBT_T *, void *)
bfilter_T * bfilter_ptr()
bool zeBTree_Init(ZEBT_T *, size_t, ZEBT_CMP_F)
bool zeBTree_Add(ZEBT_T *, void *)
#define TOKCONF_INITIALIZER
void set_tokconf_active(char *tag, bool active)
#define ZE_LogMsgError(level,...)
bool zeStrRegex(char *, char *, long *, long *, bool)
bool zeBTree_Destroy(ZEBT_T *)
struct msg_btsm_T msg_btsm_T
#define NORM_FILENAME(fname)
#define ZE_MessageInfo(level,...)
#define ADD_TOKEN(bm, prefix, token)
int zeSafeStrnCpy(char *, size_t, char *, size_t)
int zeSafeStrnCat(char *, size_t, char *, size_t)
#define ZE_LogSysError(...)
char * zeStr2Lower(char *)
#define STRCASEEQUAL(a, b)
int(* btsm_browse_F)(void *, void *)
int zeBTree_Browse(ZEBT_T *, ZEBT_BROWSE_F, void *)
bool bfilter_handle_message(char *id, char *fname, btsm_browse_F func, void *arg)