]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
* Add decoding entities as it is specified in w3c recommendations
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Wed, 16 Sep 2009 12:42:36 +0000 (16:42 +0400)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Wed, 16 Sep 2009 12:42:36 +0000 (16:42 +0400)
src/html.c

index 1381fdd4fa5f5a4402b70212827665c71d6c0c2c..1377b051a1d1df60007a95bd35f8e2ae915f67b8 100644 (file)
@@ -157,6 +157,291 @@ static struct html_tag tag_defs[] =
   { Tag_WBR,        "wbr",        (CM_INLINE|CM_EMPTY)},
 };
 
+/* Non-zero once entities_defs has been sorted; checked and set in add_html_node. */
+sig_atomic_t entities_sorted = 0;
+struct _entity;
+typedef struct _entity entity;
+
+/*
+ * One named character entity: the name as written between '&' and ';'
+ * and the numeric code it stands for.
+ */
+struct _entity
+{
+    /* Points at a string literal (table rows) or into the text being decoded (lookup key); never written through. */
+    const char *name;
+    /* Plain unsigned int: the `uint` typedef is not part of ISO C. */
+    unsigned int code;
+};
+
+
+/*
+ * Named character entities and the numeric code each one stands for.
+ *
+ * Rows are grouped by origin for readability only: the array is sorted
+ * in place with entity_cmp (see the qsort call in add_html_node) and is
+ * then searched with bsearch when entities are decoded.
+ * The last row is { NULL, 0 }; it is part of the array and is therefore
+ * counted by G_N_ELEMENTS (entities_defs).
+ */
+static entity entities_defs[] =
+{
+    /* Predefined markup entities */
+    { "quot", 34 },
+    { "amp", 38 },
+    { "apos", 39 },
+    { "lt", 60 },
+    { "gt", 62 },
+
+    /* Latin-1 entities, codes 160-255 */
+    { "nbsp", 160 },
+    { "iexcl", 161 },
+    { "cent", 162 },
+    { "pound", 163 },
+    { "curren", 164 },
+    { "yen", 165 },
+    { "brvbar", 166 },
+    { "sect", 167 },
+    { "uml", 168 },
+    { "copy", 169 },
+    { "ordf", 170 },
+    { "laquo", 171 },
+    { "not", 172 },
+    { "shy", 173 },
+    { "reg", 174 },
+    { "macr", 175 },
+    { "deg", 176 },
+    { "plusmn", 177 },
+    { "sup2", 178 },
+    { "sup3", 179 },
+    { "acute", 180 },
+    { "micro", 181 },
+    { "para", 182 },
+    { "middot", 183 },
+    { "cedil", 184 },
+    { "sup1", 185 },
+    { "ordm", 186 },
+    { "raquo", 187 },
+    { "frac14", 188 },
+    { "frac12", 189 },
+    { "frac34", 190 },
+    { "iquest", 191 },
+    { "Agrave", 192 },
+    { "Aacute", 193 },
+    { "Acirc", 194 },
+    { "Atilde", 195 },
+    { "Auml", 196 },
+    { "Aring", 197 },
+    { "AElig", 198 },
+    { "Ccedil", 199 },
+    { "Egrave", 200 },
+    { "Eacute", 201 },
+    { "Ecirc", 202 },
+    { "Euml", 203 },
+    { "Igrave", 204 },
+    { "Iacute", 205 },
+    { "Icirc", 206 },
+    { "Iuml", 207 },
+    { "ETH", 208 },
+    { "Ntilde", 209 },
+    { "Ograve", 210 },
+    { "Oacute", 211 },
+    { "Ocirc", 212 },
+    { "Otilde", 213 },
+    { "Ouml", 214 },
+    { "times", 215 },
+    { "Oslash", 216 },
+    { "Ugrave", 217 },
+    { "Uacute", 218 },
+    { "Ucirc", 219 },
+    { "Uuml", 220 },
+    { "Yacute", 221 },
+    { "THORN", 222 },
+    { "szlig", 223 },
+    { "agrave", 224 },
+    { "aacute", 225 },
+    { "acirc", 226 },
+    { "atilde", 227 },
+    { "auml", 228 },
+    { "aring", 229 },
+    { "aelig", 230 },
+    { "ccedil", 231 },
+    { "egrave", 232 },
+    { "eacute", 233 },
+    { "ecirc", 234 },
+    { "euml", 235 },
+    { "igrave", 236 },
+    { "iacute", 237 },
+    { "icirc", 238 },
+    { "iuml", 239 },
+    { "eth", 240 },
+    { "ntilde", 241 },
+    { "ograve", 242 },
+    { "oacute", 243 },
+    { "ocirc", 244 },
+    { "otilde", 245 },
+    { "ouml", 246 },
+    { "divide", 247 },
+    { "oslash", 248 },
+    { "ugrave", 249 },
+    { "uacute", 250 },
+    { "ucirc", 251 },
+    { "uuml", 252 },
+    { "yacute", 253 },
+    { "thorn", 254 },
+    { "yuml", 255 },
+
+    /* HTML 4 extended entities: symbols */
+    { "fnof", 402 },
+    { "Alpha", 913 },
+    { "Beta", 914 },
+    { "Gamma", 915 },
+    { "Delta", 916 },
+    { "Epsilon", 917 },
+    { "Zeta", 918 },
+    { "Eta", 919 },
+    { "Theta", 920 },
+    { "Iota", 921 },
+    { "Kappa", 922 },
+    { "Lambda", 923 },
+    { "Mu", 924 },
+    { "Nu", 925 },
+    { "Xi", 926 },
+    { "Omicron", 927 },
+    { "Pi", 928 },
+    { "Rho", 929 },
+    { "Sigma", 931 },
+    { "Tau", 932 },
+    { "Upsilon", 933 },
+    { "Phi", 934 },
+    { "Chi", 935 },
+    { "Psi", 936 },
+    { "Omega", 937 },
+    { "alpha", 945 },
+    { "beta", 946 },
+    { "gamma", 947 },
+    { "delta", 948 },
+    { "epsilon", 949 },
+    { "zeta", 950 },
+    { "eta", 951 },
+    { "theta", 952 },
+    { "iota", 953 },
+    { "kappa", 954 },
+    { "lambda", 955 },
+    { "mu", 956 },
+    { "nu", 957 },
+    { "xi", 958 },
+    { "omicron", 959 },
+    { "pi", 960 },
+    { "rho", 961 },
+    { "sigmaf", 962 },
+    { "sigma", 963 },
+    { "tau", 964 },
+    { "upsilon", 965 },
+    { "phi", 966 },
+    { "chi", 967 },
+    { "psi", 968 },
+    { "omega", 969 },
+    { "thetasym", 977 },
+    { "upsih", 978 },
+    { "piv", 982 },
+    { "bull", 8226 },
+    { "hellip", 8230 },
+    { "prime", 8242 },
+    { "Prime", 8243 },
+    { "oline", 8254 },
+    { "frasl", 8260 },
+    { "weierp", 8472 },
+    { "image", 8465 },
+    { "real", 8476 },
+    { "trade", 8482 },
+    { "alefsym", 8501 },
+    { "larr", 8592 },
+    { "uarr", 8593 },
+    { "rarr", 8594 },
+    { "darr", 8595 },
+    { "harr", 8596 },
+    { "crarr", 8629 },
+    { "lArr", 8656 },
+    { "uArr", 8657 },
+    { "rArr", 8658 },
+    { "dArr", 8659 },
+    { "hArr", 8660 },
+    { "forall", 8704 },
+    { "part", 8706 },
+    { "exist", 8707 },
+    { "empty", 8709 },
+    { "nabla", 8711 },
+    { "isin", 8712 },
+    { "notin", 8713 },
+    { "ni", 8715 },
+    { "prod", 8719 },
+    { "sum", 8721 },
+    { "minus", 8722 },
+    { "lowast", 8727 },
+    { "radic", 8730 },
+    { "prop", 8733 },
+    { "infin", 8734 },
+    { "ang", 8736 },
+    { "and", 8743 },
+    { "or", 8744 },
+    { "cap", 8745 },
+    { "cup", 8746 },
+    { "int", 8747 },
+    { "there4", 8756 },
+    { "sim", 8764 },
+    { "cong", 8773 },
+    { "asymp", 8776 },
+    { "ne", 8800 },
+    { "equiv", 8801 },
+    { "le", 8804 },
+    { "ge", 8805 },
+    { "sub", 8834 },
+    { "sup", 8835 },
+    { "nsub", 8836 },
+    { "sube", 8838 },
+    { "supe", 8839 },
+    { "oplus", 8853 },
+    { "otimes", 8855 },
+    { "perp", 8869 },
+    { "sdot", 8901 },
+    { "lceil", 8968 },
+    { "rceil", 8969 },
+    { "lfloor", 8970 },
+    { "rfloor", 8971 },
+    { "lang", 9001 },
+    { "rang", 9002 },
+    { "loz", 9674 },
+    { "spades", 9824 },
+    { "clubs", 9827 },
+    { "hearts", 9829 },
+    { "diams", 9830 },
+
+    /* HTML 4 extended entities: special characters (markup entities are listed at the top) */
+    { "OElig", 338 },
+    { "oelig", 339 },
+    { "Scaron", 352 },
+    { "scaron", 353 },
+    { "Yuml", 376 },
+    { "circ", 710 },
+    { "tilde", 732 },
+    { "ensp", 8194 },
+    { "emsp", 8195 },
+    { "thinsp", 8201 },
+    { "zwnj", 8204 },
+    { "zwj", 8205 },
+    { "lrm", 8206 },
+    { "rlm", 8207 },
+    { "ndash", 8211 },
+    { "mdash", 8212 },
+    { "lsquo", 8216 },
+    { "rsquo", 8217 },
+    { "sbquo", 8218 },
+    { "ldquo", 8220 },
+    { "rdquo", 8221 },
+    { "bdquo", 8222 },
+    { "dagger", 8224 },
+    { "Dagger", 8225 },
+    { "permil", 8240 },
+    { "lsaquo", 8249 },
+    { "rsaquo", 8250 },
+    { "euro", 8364 },
+    { NULL, 0 }
+};
+
+
 static int
 tag_cmp (const void *m1, const void *m2)
 {
@@ -166,6 +451,15 @@ tag_cmp (const void *m1, const void *m2)
        return g_ascii_strcasecmp (p1->name, p2->name);
 }
 
+/*
+ * Ordering callback for qsort()/bsearch() over entities_defs.
+ *
+ * m1, m2: pointers to two `entity` records.
+ * Returns <0, 0 or >0 as the first name sorts before, equal to or after
+ * the second, ignoring ASCII case.
+ *
+ * A record whose name is NULL (the last row of entities_defs) sorts
+ * after every named record and equal to another NULL record, so the
+ * whole array, that row included, can be sorted and searched without
+ * handing NULL to g_ascii_strcasecmp.
+ *
+ * NOTE(review): because case is ignored, names that differ only in case
+ * (e.g. "Prime"/"prime", "Dagger"/"dagger") compare equal here.
+ */
+static int
+entity_cmp (const void *m1, const void *m2)
+{
+       const entity *p1 = m1;
+       const entity *p2 = m2;
+
+       if (p1->name == NULL || p2->name == NULL) {
+               /* NULL name is the greatest element: 1 if only p1 is NULL, -1 if only p2, 0 if both */
+               return (p1->name == NULL) - (p2->name == NULL);
+       }
+
+       return g_ascii_strcasecmp (p1->name, p2->name);
+}
+
 static GNode* 
 construct_html_node (memory_pool_t *pool, char *text)
 {
@@ -269,6 +563,7 @@ decode_entitles (char *s, guint *len)
        char *e = s;
        char *end_ptr;
        int state = 0, val, base;
+       entity *found, key;
 
        if (len == NULL || *len == 0) {
                l = strlen (s); 
@@ -281,7 +576,7 @@ decode_entitles (char *s, guint *len)
                switch (state) {
                        /* Out of entitle */
                        case 0:
-                               if (*h == '&' && *(h + 1) == '#') {
+                               if (*h == '&') {
                                        state = 1;
                                        e = h;
                                        h ++;
@@ -296,29 +591,45 @@ decode_entitles (char *s, guint *len)
                        case 1:
                                if (*h == ';') {
                                        /* Determine base */
-                                       if (*(e + 2) == 'x' || *(e + 2) == 'X') {
-                                               base = 16;
-                                       }
-                                       else if (*(e + 2) == 'o' || *(e + 2) == 'O') {
-                                               base = 8;
+                                       /* First find in entities table */
+
+                                       key.name = e + 1;
+                                       *h = '\0';
+                                       if (*(e + 1) != '#' && 
+                                                       (found = bsearch (&key, entities_defs, G_N_ELEMENTS (entities_defs), sizeof ( entity), entity_cmp)) != NULL) {
+                                               if (found->code > 0 || found->code < 127) {
+                                                       *t = (char)found->code;
+                                               } 
+                                               else {
+                                                       /* Skip undecoded */
+                                                       t = h;
+                                               }
                                        }
                                        else {
-                                               base = 10;
-                                       }
-                                       if (base == 10) {
-                                               val = strtoul ((e + 2), &end_ptr, base);
-                                       }
-                                       else {
-                                               val = strtoul ((e + 3), &end_ptr, base);
-                                       }
-                                       if ((end_ptr != NULL && *end_ptr != ';') || (val == 0 || val > 127)) {
-                                               msg_info ("decode_entitles: invalid entitle code, cannot convert, strtoul returned %d, while reading %s", val, end_ptr);
-                                               /* Skip undecoded */
-                                               t = h;
-                                       }
-                                       else {
-                                               *t = (char)val;
+                                               if (*(e + 2) == 'x' || *(e + 2) == 'X') {
+                                                       base = 16;
+                                               }
+                                               else if (*(e + 2) == 'o' || *(e + 2) == 'O') {
+                                                       base = 8;
+                                               }
+                                               else {
+                                                       base = 10;
+                                               }
+                                               if (base == 10) {
+                                                       val = strtoul ((e + 2), &end_ptr, base);
+                                               }
+                                               else {
+                                                       val = strtoul ((e + 3), &end_ptr, base);
+                                               }
+                                               if ((end_ptr != NULL && *end_ptr != '\0') || (val == 0 || val > 127)) {
+                                                       /* Skip undecoded */
+                                                       t = h;
+                                               }
+                                               else {
+                                                       *t = (char)val;
+                                               }
                                        }
+                                       *h = ';';
                                        state = 0;
                                        t ++;
                                }
@@ -433,6 +744,10 @@ add_html_node (struct worker_task *task, memory_pool_t *pool, struct mime_text_p
                qsort (tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
                tags_sorted = 1;
        }
+       if (!entities_sorted) {
+               qsort (entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp);
+               entities_sorted = 1;
+       }
 
        /* First call of this function */
        if (part->html_nodes == NULL) {