ze-filter  (ze-filter-0.8.0-develop-180218)
ze-html.c
Go to the documentation of this file.
1 /*
2  *
3  * ze-filter - Mail Server Filter for sendmail
4  *
5  * Copyright (c) 2001-2018 - Jose-Marcio Martins da Cruz
6  *
7  * Auteur : Jose Marcio Martins da Cruz
8  * jose.marcio.mc@gmail.org
9  *
10  * Historique :
11  * Creation : janvier 2002
12  *
13  * This program is free software, but with restricted license :
14  *
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
19  *
20  * More details about ze-filter license can be found at ze-filter
21  * web site : http://foss.jose-marcio.org
22  */
23 
24 #include <ze-sys.h>
25 
26 #include "ze-libjc.h"
27 
28 
29 /* ****************************************************************************
30  * *
31  * *
32  **************************************************************************** */
#define HEXA_CHARS      "0123456789abcdef"

/*
 * Tell whether c is a hexadecimal digit (0-9, a-f, A-F).
 *
 * c is expected to be a character value; callers may hand in a plain
 * (possibly negative) char.  Anything outside 1..127 is rejected up front:
 *  - strchr() also matches the terminating NUL of HEXA_CHARS, so '\0' would
 *    otherwise be reported as a hex digit;
 *  - tolower() is only defined for values representable as unsigned char
 *    (or EOF), so negative values must not reach it.
 *
 * Returns a true value when c is a hex digit, a false value otherwise.
 */
static              bool
is_hexa_char(int c)
{
  return (c > 0) && (c <= 127) && (strchr(HEXA_CHARS, tolower(c)) != NULL);
}
43 
/*
 * Numeric value (0..15) of a single hexadecimal digit, or -1 when c is not
 * a hexadecimal digit.  Plain range checks are used so that NUL and
 * negative char values are rejected without calling any library routine.
 */
static int
hexa_digit_value(int c)
{
  if ((c >= '0') && (c <= '9'))
    return c - '0';
  if ((c >= 'a') && (c <= 'f'))
    return c - 'a' + 10;
  if ((c >= 'A') && (c <= 'F'))
    return c - 'A' + 10;
  return -1;
}

/*
 * Combine two hexadecimal digits into the byte value they encode.
 *
 * xa - most significant digit
 * xb - least significant digit
 *
 * Returns the value 0..255, or 0 when either argument is not a
 * hexadecimal digit (this includes '\0').
 */
static int
hexa2char(char xa, char xb)
{
  int                 hi = hexa_digit_value(xa);
  int                 lo = hexa_digit_value(xb);

  if ((hi < 0) || (lo < 0))
    return 0;

  return (hi << 4) + lo;
}
65 
66 
67 /* ****************************************************************************
68  * *
69  * *
70  **************************************************************************** */
/*
 * Decode "%XX" escapes (XX = two hexadecimal digits) in buf, in place.
 *
 * buf  - NUL terminated string, modified in place
 * size - capacity the caller associates with buf; the call is refused
 *        when the string is longer than that
 *
 * A '%' that is not followed by two hexadecimal digits is copied
 * unchanged.  On invalid arguments an error is logged and buf is left
 * untouched.
 *
 * The result is never longer than the input, so the write cursor never
 * overtakes the read cursor and no scratch copy is needed.
 *
 * NOTE(review): "%00" decodes to a NUL byte, which ends the string as seen
 * by later str*() calls - confirm whether callers rely on that.
 */
void
html_clean_codes(char *buf, size_t size)
{
  char               *p, *q;

  if ((buf == NULL) || (size == 0) || (strlen(buf) > size))
  {
    ZE_LogMsgError(0, "Error...");
    return;
  }

  for (p = q = buf; *p != '\0'; p++)
  {
    /*
     * Look ahead only while still inside the string: p[2] is inspected
     * only when p[1] is not the terminator.  Values are passed as
     * unsigned char so that bytes >= 0x80 never reach the helper as
     * negative numbers.
     */
    if ((*p == '%') && (p[1] != '\0') && (p[2] != '\0') &&
        is_hexa_char((unsigned char) p[1]) &&
        is_hexa_char((unsigned char) p[2]))
    {
      *q++ = hexa2char(p[1], p[2]);
      p += 2;
      continue;
    }

    *q++ = *p;
  }

  *q = '\0';
}
120 
121 /* ****************************************************************************
122  * *
123  * *
124  **************************************************************************** */
/*
 * Decode "%XX" escapes (XX = two hexadecimal digits) in buf, in place.
 *
 * buf - NUL terminated string, modified in place (may be NULL)
 *
 * A '%' that is not followed by two hexadecimal digits is copied
 * unchanged.  Numeric character references ("&#...;") are not converted
 * by this function.
 *
 * Returns buf (NULL when buf is NULL).
 */
char               *
convert_html_codes(char *buf)
{
  char               *p, *q;

  if (buf == NULL)
    return buf;

  for (p = q = buf; *p != '\0'; p++)
  {
    if (*p == '%')
    {
      unsigned char       hi = (unsigned char) p[1];
      unsigned char       lo = (unsigned char) p[2];

      /*
       * isxdigit('\0') is false, so the && stops before p[2] is trusted
       * when the string ends right after the '%'.
       */
      if (isxdigit(hi) && isxdigit(lo))
      {
        int                 vh, vl;

        vh = isdigit(hi) ? hi - '0' : tolower(hi) - 'a' + 10;
        vl = isdigit(lo) ? lo - '0' : tolower(lo) - 'a' + 10;

        *q++ = (char) ((vh << 4) | vl);
        p += 2;
        continue;
      }
    }
    *q++ = *p;
  }

  /* the decoded text is shorter than the input : cut the stale tail */
  *q = '\0';

  return buf;
}
157 
158 /* ****************************************************************************
159  * *
160  * *
161  **************************************************************************** */
162 char *
164  char *buf;
165  size_t size;
166 {
167  char *p = NULL, *s, *t;
168  int i;
169  bool state = FALSE;
170  size_t sz;
171 
172  if ((buf == NULL) || (size == 0))
173  return NULL;
174 
175  sz = size + 1;
176  sz += (8 - sz % 8);
177  if ((p = (char *) malloc(sz)) == NULL)
178  {
179  ZE_LogSysError("malloc error");
180  return NULL;
181  }
182 
183  memset(p, 0, size + 1);
184 
185  t = buf;
186  s = p;
187  for (i = 0; (i < size) && (*t != '\0'); i++, t++)
188  {
189  if (!state)
190  {
191  /* begin of HTML tag */
192  if (*t == '<')
193  {
194  state = TRUE;
195  continue;
196  }
197 
198  /* coded character */
199  if (*t == '&')
200  {
201  long pi, pf;
202 
203  if (zeStrRegex(t, "&[A-Za-z]+;", &pi, &pf, TRUE) && (pi == 0))
204  {
205  int c;
206 
207  c = get_html_entity(t + pi);
208  if (c != '\0')
209  *s++ = c;
210 
211  t += (pf - pi - 1);
212  i += (pf - pi - 1);
213  continue;
214  }
215  if (zeStrRegex(t, "&#[0-9]+;", &pi, &pf, TRUE) && (pi == 0))
216  {
217  char *q = t;
218  int c;
219 
220  q += strcspn(q, "0123456789");
221  c = atoi(q);
222  if (c != '\0')
223  *s++ = c;
224 
225  t += (pf - pi - 1);
226  i += (pf - pi - 1);
227  continue;
228  }
229  }
230 
231  /* default : simply copy it */
232  *s++ = *t;
233  continue;
234  } else
235  {
236  if (*t == '>')
237  state = FALSE;
238  }
239  }
240  *s = '\0';
241 
242  return p;
243 }
244 
245 /* ****************************************************************************
246  * *
247  * *
248  **************************************************************************** */
/*
 * Association of a tag pattern with a regex_t object and a status flag.
 *
 * NOTE(review): this type is not referenced by the code visible in this
 * file; the field comments are inferred from names and types only -
 * confirm against the users of the type.
 */
typedef struct regex_tag_T
{
  char               *tag;      /* pattern text (presumably)            */
  regex_t             re;       /* POSIX regular expression object      */
  bool                ok;       /* status flag (presumably "re usable") */
} regex_tag_T;
256 
/*
 * Regular expressions describing the tags check_valid_html_tags() accepts.
 * The list is NULL terminated; a tag is accepted as soon as one of the
 * expressions matches it.
 *
 * NOTE(review): the expressions are not anchored on a word boundary after
 * the tag name (" ?[^>]*"), so e.g. "<axyz>" is accepted through "a" -
 * confirm this leniency is intended before tightening it.
 */
static char        *VALID_HTML_TAGS[] = {
  "<!doctype html[^>]*>",

  /* elements with an optional closing tag form */
  "</?(a|abbr|acronym|address|applet|b|bdo|big|blockquote|body) ?[^>]*>",
  "</?(button|caption|center|cite|code|colgroup|dd|del|dfn|dir|div) ?[^>]*>",
  "</?(dl|dt|em|fieldset|font|form|frameset|h[1-6]|head|html|i) ?[^>]*>",
  "</?(iframe|ins|kbd|label|legend|li|map|menu|nobr|noframes|noscript) ?[^>]*>",
  "</?(object|ol|optgroup|option|p|pre|q|s|samp|script|select) ?[^>]*>",
  "</?(small|span|strike|strong|style|sub|sup|table|tbody|td) ?[^>]*>",
  "</?(textarea|tfoot|th|thead|title|tr|tt|u|ul|var) ?[^>]*>",

  "</?(area|base|basefont|br|col|frame|hr|img|input|isindex|link) ?[^>]*>",
  "<(meta|param) ?[^>]*>",

  /* comments, XML declarations and namespaced / extension tags */
  "<!--[^>]*[--]?[ ]*>",
  "<(html)?[ ]*[?]?xml.*[ :]?[^>]*>",
  "<[/?]?xml.*[ :]?[^>]*>",
  "</?(o|v|w):[^>]*>",
  "</?x-sigsep>",
  "</?x-tab>",
  "</?X-[^>]*>",
  NULL
};
327 
328 /* ****************************************************************************
329  * *
330  * *
331  **************************************************************************** */
332 
333 int
335  char *id;
336  char *buf;
337 {
338  char *p = buf;
339  int score = 0;
340  bool xmlbuf = FALSE;
341  long pi, pf;
342 
343  if ((buf == NULL) || (strlen(buf) == 0))
344  return score;
345 
346  id = STRNULL(id, "NOID");
347 
348  xmlbuf = zeStrRegex(buf, "<(html)?[ ]*[?]?xml.*[ :]?[^>]*>", NULL, NULL, TRUE);
349  if (xmlbuf)
350  return 0;
351 
352  while (strlen(p) > 0)
353  {
354  char rbuf[1024];
355 
356  pi = pf = 0;
357 
358  if (!zeStrRegex(p, "<[^>]*>", &pi, &pf, TRUE))
359  break;
360 
361  if ((pf - pi) < sizeof (rbuf))
362  {
363  bool ok = FALSE;
364  char **s;
365  size_t len = pf - pi;
366 
367  strncpy(rbuf, p + pi, len);
368  rbuf[len] = 0;
369 
370  {
371  char *u, *v;
372 
373  for (u = v = rbuf; *u != '\0'; u++)
374  {
375  if ((*u != '\n') && (*u != '\r'))
376  *v++ = *u;
377  }
378  *v = '\0';
379  }
380 
381  ZE_MessageInfo(19, "%s SPAMCHECK : Checking : %s, %ld %ld", id, rbuf, pi,
382  pf);
383 
384  for (s = VALID_HTML_TAGS; (*s != NULL) && !ok; s++)
385  ok = zeStrRegex(rbuf, *s, NULL, NULL, TRUE);
386 
387 #if 0
388  if (1 && !ok && xmlbuf)
389  {
390  for (s = VALID_XML_TAGS; (*s != NULL) && !ok; s++)
391  ok = zeStrRegex(rbuf, *s, NULL, NULL, TRUE);
392  }
393 #endif
394  if (!ok)
395  {
396  score++;
397  if (score <= 10)
398  ZE_MessageInfo(10, "%s SPAM CHECK - NOT VALID HTML TAG : %s", id, rbuf);
399  if (score == 10)
400  {
401  ZE_MessageInfo(10,
402  "%s SPAM CHECK - NOT VALID HTML TAG : more than 10 already found ! ",
403  id);
404  break;
405  }
406  }
407  }
408 
409  p += pf;
410  }
411 
412  return score;
413 }
414 
415 /* ****************************************************************************
416  * *
417  * *
418  **************************************************************************** */
/*
 * One entry of the HTML named entity table.
 */
typedef struct
{
  char               *name;     /* entity as written in the text, e.g. "&lt;" */
  int                 value;    /* numeric code listed for the entity
                                 * (0 when none is given)                     */
  int                 code1;    /* first replacement character                */
  int                 code2;    /* second replacement character : the one
                                 * get_html_entity() hands back               */
} html_entity_T;
427 
428 static html_entity_T VALID_ENTITIES[] = {
429  {"&lt;", 0, '<', '<'},
430  {"&gt;", 0, '>', '>'},
431  {"&amp;", 0, '#', '#'},
432  {"&quot;", 0, '"', '"'},
433 
434  {"&nbsp;", 160, ' ', ' '},
435  {"&iexcl;", 161, ' ', '!'},
436  {"&cent;", 162, ' ', ' '},
437  {"&pound;", 163, ' ', ' '},
438  {"&curren;", 164, ' ', ' '},
439  {"&yen;", 165, ' ', ' '},
440  {"&brvbar;", 166, '|', '|'},
441  {"&sect;", 167, ' ', ' '},
442  {"&uml;", 168, ' ', ' '},
443  {"&copy;", 169, ' ', ' '},
444  {"&ordf;", 170, ' ', ' '},
445  {"&laquo;", 171, ' ', ' '},
446  {"&not;", 172, ' ', ' '},
447  {"&shy;", 173, ' ', ' '},
448  {"&reg;", 174, ' ', ' '},
449  {"&macr;", 175, ' ', ' '},
450  {"&deg;", 176, ' ', ' '},
451  {"&plusmn;", 177, ' ', '+'},
452  {"&sup2;", 178, '²', ' '},
453  {"&sup3;", 179, ' ', ' '},
454  {"&acute;", 180, '^', '^'},
455  {"&micro;", 181, ' ', ' '},
456  {"&para;", 182, ' ', ' '},
457  {"&middot;", 183, ' ', ' '},
458  {"&cedil;", 184, 'ç', 'c'},
459  {"&sup1;", 185, ' ', ' '},
460  {"&ordm;", 186, ' ', ' '},
461  {"&raquo;", 187, ' ', ' '},
462  {"&frac14;", 188, ' ', ' '},
463  {"&frac12;", 189, ' ', ' '},
464  {"&frac34;", 190, ' ', ' '},
465  {"&iquest;", 191, ' ', ' '},
466  {"&Agrave;", 192, ' ', 'a'},
467  {"&Aacute;", 193, ' ', 'a'},
468  {"&Acirc;", 194, ' ', 'a'},
469  {"&Atilde;", 195, ' ', 'a'},
470  {"&Auml;", 196, ' ', 'a'},
471  {"&Aring;", 197, ' ', 'a'},
472  {"&AElig;", 198, ' ', 'a'},
473  {"&Ccedil;", 199, ' ', 'c'},
474  {"&Egrave;", 200, ' ', 'e'},
475  {"&Eacute;", 201, ' ', 'e'},
476  {"&Ecirc;", 202, ' ', 'e'},
477  {"&Euml;", 203, ' ', 'e'},
478  {"&Igrave;", 204, ' ', 'i'},
479  {"&Iacute;", 205, ' ', 'i'},
480  {"&Icirc;", 206, ' ', 'i'},
481  {"&Iuml;", 207, ' ', 'i'},
482  {"&ETH;", 208, ' ', ' '},
483  {"&Ntilde;", 209, ' ', 'n'},
484  {"&Ograve;", 210, ' ', 'o'},
485  {"&Oacute;", 211, ' ', 'o'},
486  {"&Ocirc;", 212, ' ', 'o'},
487  {"&Otilde;", 213, ' ', 'o'},
488  {"&Ouml;", 214, ' ', 'o'},
489  {"&times;", 215, ' ', 'x'},
490  {"&Oslash;", 216, ' ', 'o'},
491  {"&Ugrave;", 217, ' ', 'u'},
492  {"&Uacute;", 218, ' ', 'u'},
493  {"&Ucirc;", 219, ' ', 'u'},
494  {"&Uuml;", 220, ' ', 'u'},
495  {"&Yacute;", 221, ' ', 'y'},
496  {"&THORN;", 222, ' ', ' '},
497  {"&szlig;", 223, ' ', 's'},
498  {"&agrave;", 224, ' ', 'a'},
499  {"&aacute;", 225, ' ', 'a'},
500  {"&acirc;", 226, ' ', 'a'},
501  {"&atilde;", 227, ' ', 'a'},
502  {"&auml;", 228, ' ', 'a'},
503  {"&aring;", 229, ' ', 'a'},
504  {"&aelig;", 230, ' ', 'a'},
505  {"&ccedil;", 231, ' ', 'c'},
506  {"&egrave;", 232, ' ', 'e'},
507  {"&eacute;", 233, ' ', 'e'},
508  {"&ecirc;", 234, ' ', 'e'},
509  {"&euml;", 235, ' ', 'e'},
510  {"&igrave;", 236, ' ', 'i'},
511  {"&iacute;", 237, ' ', 'i'},
512  {"&icirc;", 238, ' ', 'i'},
513  {"&iuml;", 239, ' ', 'i'},
514  {"&eth;", 240, ' ', ' '},
515  {"&ntilde;", 241, ' ', 'n'},
516  {"&ograve;", 242, ' ', 'o'},
517  {"&oacute;", 243, ' ', 'o'},
518  {"&ocirc;", 244, ' ', 'o'},
519  {"&otilde;", 245, ' ', 'o'},
520  {"&ouml;", 246, ' ', 'o'},
521  {"&divide;", 247, ' ', '/'},
522  {"&oslash;", 248, ' ', 'o'},
523  {"&ugrave;", 249, ' ', 'u'},
524  {"&uacute;", 250, ' ', 'u'},
525  {"&ucirc;", 251, ' ', 'u'},
526  {"&uuml;", 252, ' ', 'u'},
527  {"&yacute;", 253, ' ', 'y'},
528  {"&thorn;", 254, ' ', ' '},
529  {"&yuml;", 255, ' ', 'y'},
530 
531  {NULL, 0, '\0', '\0'}
532 };
533 
534 int
536  char *s;
537 {
538  html_entity_T *p = VALID_ENTITIES;
539 
540  for (p = VALID_ENTITIES; p->name != NULL; p++)
541  {
542  if (strncasecmp(p->name, s, strlen(p->name)) == 0)
543  return p->code2;
544  }
545  return 0;
546 }
regex_t re
Definition: ze-html.c:252
char * convert_html_codes(char *buf)
Definition: ze-html.c:126
#define FREE(x)
Definition: macros.h:37
struct regex_tag_T regex_tag_T
#define HEXA_CHARS
Definition: ze-html.c:33
#define STRNULL(x, r)
Definition: macros.h:81
#define FALSE
Definition: macros.h:160
#define ZE_LogMsgError(level,...)
Definition: zeSyslog.h:113
bool zeStrRegex(char *, char *, long *, long *, bool)
Definition: zeStrings.c:544
void html_clean_codes(char *buf, size_t size)
Definition: ze-html.c:72
char * name
Definition: ze-html.c:421
#define strchr
Definition: ze-sys.h:218
bool ok
Definition: ze-html.c:253
#define ZE_MessageInfo(level,...)
Definition: zeSyslog.h:90
#define TRUE
Definition: macros.h:157
char * tag
Definition: ze-html.c:251
#define memcpy(d, s, n)
Definition: ze-sys.h:224
#define ZE_LogSysError(...)
Definition: zeSyslog.h:129
int check_valid_html_tags(char *id, char *buf)
Definition: ze-html.c:334
int get_html_entity(char *s)
Definition: ze-html.c:535
char * cleanup_html_buffer(char *buf, size_t size)
Definition: ze-html.c:163