46 #define MSG_BTSM_INITIALIZER \    53 #define   FEATURE_WORD        0    54 #define   FEATURE_NGRAM       1    62 static bool         mimepart2wordTokens(
char *, 
size_t, 
char *, 
int, 
int,
    65 static bool         mimepart2ngramTokens(
char *, 
size_t, 
char *, 
int, 
int,
    77 static bool         msg_btsm_add_token(
msg_btsm_T * bm, 
char *token);
   125 #define    SEP_TOK    " \t\n\r,/=&?\"()[]{}<>"   127 #define    SEP_TOK    " \t\n\r,=&?\"()[]{}<>;~/"   130 static char        *TOK_SEPARATOR[] = {
   131   " \t\n\r\"()[]{}<>/",
   136 #define    SEP_WS    " \t\n\r"   144   void                (*func) (
char *);
   153 #define TOKCONF_INITIALIZER    {"body", "body", TRUE, NULL, NULL, TRUE}   155 static bool         extract_word_tokens(
tokconf_T * cf, 
char *prefix,
   161   {
"x-mailer", 
"xmailer", 
FALSE, NULL, NULL, 
TRUE},
   162   {
"user-agent", 
"uagent", 
FALSE, NULL, NULL, TRUE},
   163   {
"from", 
"from", 
FALSE, NULL, NULL, TRUE},
   164   {
"subject", 
"subject", 
FALSE, NULL, NULL, TRUE},
   165   {
"received", 
"rcvd", 
FALSE, NULL, NULL, 
FALSE},
   166   {
"content-type", 
"ctype", 
FALSE, NULL, NULL, TRUE},
   167   {
"content-disposition", 
"cdisp", 
FALSE, NULL, NULL, TRUE},
   168   {
"content-description", 
"cdesc", 
FALSE, NULL, NULL, TRUE},
   169   {
"content-transfer-encoding", 
"ctencode", 
FALSE, NULL, NULL, TRUE},
   170   {
"content-id", 
"cid", 
FALSE, NULL, NULL, TRUE},
   171   {
"message-id", 
"msgid", 
FALSE, NULL, NULL, TRUE},
   173   {
"boundary", 
"bound", 
FALSE, NULL, NULL, TRUE},
   174   {NULL, NULL, 
FALSE, NULL, NULL, FALSE}
   178 get_tokconf_headers(tag)
   183   for (p = hdrs_tokconf; p->
tag != NULL; p++)
   193   {
"body", 
"body", 
FALSE, NULL, NULL, 
TRUE},
   194   {
"text/plain", 
"body", 
FALSE, NULL, NULL, TRUE},
   195   {
"text/html", 
"body", 
FALSE, NULL, NULL, TRUE},
   196   {
"html/tags", 
"htmltags", 
FALSE, 
" \t\n\r,=&?\"()[]{}<>;~/", NULL, TRUE},
   197   {
"simple", 
"body", 
FALSE, NULL, NULL, TRUE},
   198   {
"boundary", 
"bound", 
FALSE, NULL, NULL, TRUE},
   199   {
"name", 
"name", 
FALSE, NULL, NULL, TRUE},
   201   {
"cdname", 
"cdname", 
FALSE, NULL, NULL, TRUE},
   203   {
"ctname", 
"ctname", 
FALSE, NULL, NULL, TRUE},
   204   {NULL, NULL, 
FALSE, NULL, NULL, FALSE}
   208 get_tokconf_body(tag)
   213   for (p = body_tokconf; p->
tag != NULL; p++)
   229   for (p = body_tokconf; p->
tag != NULL; p++)
   237   for (p = hdrs_tokconf; p->
tag != NULL; p++)
   262   while ((i = strlen(s)) > 0)
   264     if (
strchr(
".-/*'`/:", s[i - 1]) == NULL)
   271   if ((i = strspn(p, 
"()><+-.*!'`/")) > 0)
   281   for (p = s; *p == 
'$' && !isdigit(*(p + 1)); p++)
   310   if (strspn(s, 
"0123456789") == strlen(s))
   315   if (
zeStrRegex(s, 
"^[0-9]{2,2}/[0-9]{1,2}/[0-9]{2,4}$", NULL, NULL, 
TRUE))
   320       (s, 
"^[0-9]{2,2}:[0-9]{2,2}:[0-9]{2,2}(pm|am)?$", NULL, NULL, 
TRUE))
   322   if (
zeStrRegex(s, 
"^[0-9]{2,2}:[0-9]{2,2}(pm|am)?$", NULL, NULL, 
TRUE))
   332 #define ADD_TOKEN(bm, prefix, token)        \   334     if (strlen(token) > 3) {          \   337       if (prefix != NULL)         \   338   snprintf(tstr, sizeof(tstr), "%s--%s", prefix, token);  \   340   snprintf(tstr, sizeof(tstr), "%s--%s", "GLOB", token);  \   341       if (!msg_btsm_add_token(bm, tstr))      \   342   ZE_LogMsgError(0, "ERROR inserting new token");   \   349 extract_word_tokens(cf, prefix, separator, buf, kind, bm, level)
   369   if (bm == NULL || buf == NULL || strlen(buf) == 0)
   376   if (separator == NULL)
   381     separator = TOK_SEPARATOR[1];
   388   for (stok = strtok_r(buf, separator, &ptr); stok != NULL;
   389        stok = strtok_r(NULL, separator, &ptr))
   391     if (ptr != NULL && *ptr != 
'\0')
   395       if (p[0] == 
'$' && isspace(p[1]) && isdigit(p[2]))
   399     token_trim_bounds(stok);
   401     if (!check_token(stok))
   412       while (isdigit(*q) || *q == 
'.')
   425       for (i = 1; i <= 6 && isxdigit(stok[i]); i++)
   438       if ((ts = strdup(stok)) != NULL)
   445           for (p = q = ts; *p != 
'\0'; p++)
   453                 if (!first && isxdigit(*(p - 1)) && isxdigit(*(p + 1)))
   457                 if (!first && (isdigit(*(p - 1)) || isdigit(*(p + 1))))
   474 #define SECSEP ".^\':@|+_-%#!$"   476           if (strpbrk(stok, 
SECSEP) != NULL)
   478             strlcpy(ts, stok, strlen(stok) + 1);
   479             extract_word_tokens(NULL, prefix, 
SECSEP, ts, kind, bm, level);
   493       if (strlen(prev) > 0 && strlen(stok) > 0)
   495         snprintf(t, 
sizeof (t), 
"%s-dbl-%s", prev, stok);
   519 #define X_HTML_SEP     " -x- "   522 extract_html_tags(buf, size)
   530   if (buf == NULL || strlen(buf) == 0)
   533   msz = 2 * (size + 1);
   534   msz += (8 - msz % 8);
   544   for (p = buf + strcspn(buf, 
"<"); *p != 
'\0'; p += strcspn(p, 
"<"))
   569 #define  NORM_FILENAME(fname)                \   575       for (px = fname; *px != '\0'; px++)    \   599 mimepart2wordTokens(buf, size, xid, level, type, arg, mime_part)
   640         if ((x = get_tokconf_body(
"ctmain")) != NULL && x->
active)
   652         if ((x = get_tokconf_body(
"ctname")) != NULL && x->
active)
   665         if ((x = get_tokconf_body(
"ctname")) != NULL && x->
active)
   683           if ((x = get_tokconf_headers(
"boundary")) != NULL && x->
active)
   685             char               *sep = 
" \t\n\r";
   689             for (q = bound; *q != 
'\0'; q++)
   699               if (
strchr(
"=-_", *q) != NULL)
   702             extract_word_tokens(x, x->
prefix, sep, bound, 0, bm, 0);
   723         if ((x = get_tokconf_body(
"cdmain")) != NULL && x->
active)
   735         if ((x = get_tokconf_body(
"cdname")) != NULL && x->
active)
   748         if ((x = get_tokconf_body(
"cdname")) != NULL && x->
active)
   761     for (h = mime_part->
hdrs; h != NULL; h = h->
next)
   765       if ((x = get_tokconf_headers(h->
key)) != NULL && x->
active)
   773           for (px = h->
value; *px != 
'\0'; px++)
   800     for (p = q = buf; *p != 
'\0'; p++)
   802       if (*p != *q || isalpha(*p))
   809   if (abs(strspn(buf, 
" \t\r\n") - size) < 4)
   816     char               *cleanbuf = NULL;
   824     if ((x = get_tokconf_body(
"text/html")) != NULL)
   825       extract_word_tokens(x, x->
prefix, cleanbuf, 0, bm, 0);
   828       extract_word_tokens(NULL, 
"body", NULL, cleanbuf, 0, bm, 0);
   831     cleanbuf = extract_html_tags(buf, strlen(buf) + 1);
   836     if ((x = get_tokconf_body(
"html/tags")) != NULL)
   837       extract_word_tokens(x, x->
prefix, cleanbuf, 0, bm, 0);
   840       extract_word_tokens(NULL, 
"html", NULL, cleanbuf, 0, bm, 0);
   849     extract_word_tokens(NULL, 
"body", NULL, buf, 0, bm, 0);
   854   extract_word_tokens(NULL, 
"body", NULL, buf, 0, bm, 0);
   870 static int          C_NGRAM = 5;
   873 extract_char_tokens(cf, prefix, separator, buf, kind, bm, level)
   891   if (bm == NULL || buf == NULL || strlen(buf) == 0)
   906     for (p = q = buf; *p != 
'\0'; cp = *p++)
   941     for (p = buf; strlen(p) >= C_NGRAM; p++)
   959 mimepart2ngramTokens(buf, size, xid, level, type, arg, mime_part)
  1000         if ((x = get_tokconf_body(
"ctmain")) != NULL && x->
active)
  1012         if ((x = get_tokconf_body(
"ctname")) != NULL && x->
active)
  1025         if ((x = get_tokconf_body(
"ctname")) != NULL && x->
active)
  1043           if ((x = get_tokconf_headers(
"boundary")) != NULL && x->
active)
  1045             char               *sep = 
" \t\n\r";
  1049             for (q = bound; *q != 
'\0'; q++)
  1059               if (
strchr(
"=-_", *q) != NULL)
  1062             extract_char_tokens(x, x->
prefix, sep, bound, 0, bm, 0);
  1083         if ((x = get_tokconf_body(
"cdmain")) != NULL && x->
active)
  1095         if ((x = get_tokconf_body(
"cdname")) != NULL && x->
active)
  1108         if ((x = get_tokconf_body(
"cdname")) != NULL && x->
active)
  1121     for (h = mime_part->
hdrs; h != NULL; h = h->
next)
  1125       if ((x = get_tokconf_headers(h->
key)) != NULL && x->
active)
  1133           for (px = h->
value; *px != 
'\0'; px++)
  1160     for (p = q = buf; *p != 
'\0'; p++)
  1162       if (*p != *q || isalpha(*p))
  1169   if (abs(strspn(buf, 
" \t\r\n") - size) < 4)
  1176     char               *cleanbuf = NULL;
  1184     if ((x = get_tokconf_body(
"text/html")) != NULL)
  1185       extract_char_tokens(x, x->
prefix, cleanbuf, 0, bm, 0);
  1188       extract_char_tokens(NULL, 
"body", NULL, cleanbuf, 0, bm, 0);
  1191     cleanbuf = extract_html_tags(buf, strlen(buf) + 1);
  1196     if ((x = get_tokconf_body(
"html/tags")) != NULL)
  1197       extract_char_tokens(x, x->
prefix, cleanbuf, 0, bm, 0);
  1200       extract_char_tokens(NULL, 
"html", NULL, cleanbuf, 0, bm, 0);
  1209     extract_char_tokens(NULL, 
"body", NULL, buf, 0, bm, 0);
  1214   extract_char_tokens(NULL, 
"body", NULL, buf, 0, bm, 0);
  1231 static int          token_cmp(
void *, 
void *);
  1234 msg_btsm_add_token(bm, token)
  1241   memset(&tok, 0, 
sizeof (tok));
  1317   bool                TextUnitWord = 
TRUE;
  1337   (void) msg_btsm_init(&bm);
  1343     if ((env = getenv(
"TEXTUNIT")) != NULL)
  1346         TextUnitWord = FALSE;
  1349     if ((env = getenv(
"NGRAMLEN")) != NULL && strlen(env) > 0)
  1354       if (n > 0 && n < 10)
  1361        TextUnitWord ? 
"WORD" : 
"NGRAM", C_NGRAM); 
  1380   (void) msg_btsm_end(&bm);
 
#define MSG_BTSM_INITIALIZER
void * zeBTree_Get(ZEBT_T *, void *)
bfilter_T * bfilter_ptr()
bool zeBTree_Init(ZEBT_T *, size_t, ZEBT_CMP_F)
bool zeBTree_Add(ZEBT_T *, void *)
#define TOKCONF_INITIALIZER
void set_tokconf_active(char *tag, bool active)
#define ZE_LogMsgError(level,...)
bool zeStrRegex(char *, char *, long *, long *, bool)
bool zeBTree_Destroy(ZEBT_T *)
struct msg_btsm_T msg_btsm_T
#define NORM_FILENAME(fname)
#define ZE_MessageInfo(level,...)
#define ADD_TOKEN(bm, prefix, token)
int zeSafeStrnCpy(char *, size_t, char *, size_t)
int zeSafeStrnCat(char *, size_t, char *, size_t)
#define ZE_LogSysError(...)
char * zeStr2Lower(char *)
#define STRCASEEQUAL(a, b)
int(* btsm_browse_F)(void *, void *)
int zeBTree_Browse(ZEBT_T *, ZEBT_BROWSE_F, void *)
bool bfilter_handle_message(char *id, char *fname, btsm_browse_F func, void *arg)