From 6564daea7412f4f3242bc67287f4762470b089a3 Mon Sep 17 00:00:00 2001 From: Mrcus Kool Date: Tue, 9 Aug 2011 01:09:03 -0600 Subject: [PATCH] Optimize regular expression ACLs This patch is inspired by the work that I did for ufdbGuard and a few emails with Amos. The new code optimises lists of regular expressions. The optimisations are: * initial .* is stripped * RE-1 RE-2 ... RE-n are joined into one large RE: (RE-1)|(RE-2)|...|(RE-n) * -i ... -i options are optimised: the second one is ignored, same for +i If compounding optimization fails it falls back to using unoptimized expressions. --- src/acl/RegexData.cc | 228 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 198 insertions(+), 30 deletions(-) diff --git a/src/acl/RegexData.cc b/src/acl/RegexData.cc index b52bbfa4ae..93c829b054 100644 --- a/src/acl/RegexData.cc +++ b/src/acl/RegexData.cc @@ -3,6 +3,7 @@ * * DEBUG: section 28 Access Control * AUTHOR: Duane Wessels + * AUTHOR: Marcus Kool * * SQUID Web Proxy Cache http://www.squid-cache.org/ * ---------------------------------------------------------- @@ -32,6 +33,7 @@ * * * Copyright (c) 2003, Robert Collins + * Copyright (c) 2011, Marcus Kool */ #include "squid.h" @@ -41,8 +43,7 @@ #include "wordlist.h" #include "ConfigParser.h" -static void aclDestroyRegexList(relist * data); -void +static void aclDestroyRegexList(relist * data) { relist *next = NULL; @@ -123,49 +124,216 @@ ACLRegexData::dump() return W; } -static void aclParseRegexList(relist **curlist); -void -aclParseRegexList(relist **curlist) +static char * +removeUnnecessaryWildcards(char * t) +{ + char * orig = t; + + if (strncmp(t, "^.*", 3) == 0) + t += 3; + + /* NOTE: an initial '.' might seem unnessary but is not; + * it can be a valid requirement that cannot be optimised + */ + while (*t == '.' && *(t+1) == '*') { + t += 2; + } + + if (*t == '\0') { + debugs(28, DBG_IMPORTANT, "" << cfg_filename << " line " << config_lineno << ": " << config_input_line); + debugs(28, DBG_IMPORTANT, "WARNING: regular expression '" << orig << "' has only wildcards and matches all strings. Using '.*' instead."); + return ".*"; + } + if (t != orig) { + debugs(28, DBG_IMPORTANT, "" << cfg_filename << " line " << config_lineno << ": " << config_input_line); + debugs(28, DBG_IMPORTANT, "WARNING: regular expression '" << orig << "' has unnecessary wildcard(s). Using '" << t << "' instead."); + } + + return t; +} + +static relist ** +compileRE(relist **Tail, char * RE, int flags) { - relist **Tail; - relist *q = NULL; - char *t = NULL; - regex_t comp; int errcode; + relist *q; + regex_t comp; + + if (RE == NULL || *RE == '\0') + return Tail; + + if ((errcode = regcomp(&comp, RE, flags)) != 0) { + char errbuf[256]; + regerror(errcode, &comp, errbuf, sizeof errbuf); + debugs(28, DBG_CRITICAL, "" << cfg_filename << " line " << config_lineno << ": " << config_input_line); + debugs(28, DBG_CRITICAL, "ERROR: invalid regular expression: '" << RE << "': " << errbuf); + return NULL; + } + debugs(28, 2, "compileRE: compiled '" << RE << "' with flags " << flags ); + + q = (relist *) memAllocate(MEM_RELIST); + q->pattern = xstrdup(RE); + q->regex = comp; + q->flags = flags; + *(Tail) = q; + Tail = &q->next; + + return Tail; +} + +/** Compose and compile one large RE from a set of (small) REs. + * The ultimate goal is to have only one RE per ACL so that regexec() is + * called only once per ACL. + */ +static int +compileOptimisedREs(relist **curlist, wordlist * wl) +{ + relist **Tail; + relist *newlist; + relist **newlistp; + int numREs = 0; int flags = REG_EXTENDED | REG_NOSUB; + int largeREindex = 0; + char largeRE[BUFSIZ]; - debugs(28,5, HERE << "Regex new line."); + newlist = NULL; + newlistp = &newlist; - for (Tail = (relist **)curlist; *Tail; Tail = &((*Tail)->next)); - while ((t = ConfigParser::strtokFile())) { + largeRE[0] = '\0'; - debugs(28,5, HERE << "Regex token: " << t); + while (wl != NULL) { + int RElen; + RElen = strlen( wl->key ); - if (strcmp(t, "-i") == 0) { - flags |= REG_ICASE; - continue; + if (strcmp(wl->key, "-i") == 0) { + if (flags & REG_ICASE) { + /* optimisation of -i ... -i */ + debugs(28, 2, "compileOptimisedREs: optimisation of -i ... -i" ); + } else { + debugs(28, 2, "compileOptimisedREs: -i" ); + newlistp = compileRE( newlistp, largeRE, flags ); + if (newlistp == NULL) { + aclDestroyRegexList( newlist ); + return 0; + } + flags |= REG_ICASE; + largeRE[largeREindex=0] = '\0'; + } + } else if (strcmp(wl->key, "+i") == 0) { + if ((flags & REG_ICASE) == 0) { + /* optimisation of +i ... +i */ + debugs(28, 2, "compileOptimisedREs: optimisation of +i ... +i"); + } else { + debugs(28, 2, "compileOptimisedREs: +i"); + newlistp = compileRE( newlistp, largeRE, flags ); + if (newlistp == NULL) { + aclDestroyRegexList( newlist ); + return 0; + } + flags &= ~REG_ICASE; + largeRE[largeREindex=0] = '\0'; + } + } else if (RElen + largeREindex + 3 < BUFSIZ-1) { + debugs(28, 2, "compileOptimisedREs: adding RE '" << wl->key << "'"); + if (largeREindex > 0) + largeRE[largeREindex++] = '|'; + largeRE[largeREindex++] = '('; + for (char * t = wl->key; *t != '\0'; t++) + largeRE[largeREindex++] = *t; + largeRE[largeREindex++] = ')'; + largeRE[largeREindex] = '\0'; + numREs++; + } else { + debugs(28, 2, "compileOptimisedREs: buffer full, generating new optimised RE..." ); + newlistp = compileRE( newlistp, largeRE, flags ); + if (newlistp == NULL) { + aclDestroyRegexList( newlist ); + return 0; + } + largeRE[largeREindex=0] = '\0'; + continue; /* do the loop again to add the RE to largeRE */ } + wl = wl->next; + } + + newlistp = compileRE( newlistp, largeRE, flags ); + if (newlistp == NULL) { + aclDestroyRegexList( newlist ); + return 0; + } + + /* all was successful, so put the new list at the tail */ + if (*curlist == NULL) { + *curlist = newlist; + } else { + for (Tail = curlist; *Tail != NULL; Tail = &((*Tail)->next)) + ; + (*Tail) = newlist; + } + + debugs(28, 2, "compileOptimisedREs: " << numREs << " REs are optimised into one RE."); + if (numREs > 100) { + debugs(28, (opt_parse_cfg_only?DBG_IMPORTANT:2), "" << cfg_filename << " line " << config_lineno << ": " << config_input_line); + debugs(28, (opt_parse_cfg_only?DBG_IMPORTANT:2), "WARNING: there are more than 100 regular expressions. " << + "Consider using less REs or use rules without expressions like 'dstdomain'."); + } + + return 1; +} - if (strcmp(t, "+i") == 0) { +static void +compileUnoptimisedREs(relist **curlist, wordlist * wl) +{ + relist **Tail; + relist **newTail; + int flags = REG_EXTENDED | REG_NOSUB; + + for (Tail = curlist; *Tail != NULL; Tail = &((*Tail)->next)) + ; + + while (wl != NULL) { + int RElen; + RElen = strlen( wl->key ); + if (strcmp(wl->key, "-i") == 0) { + flags |= REG_ICASE; + } else if (strcmp(wl->key, "+i") == 0) { flags &= ~REG_ICASE; - continue; + } else { + newTail = compileRE( Tail, wl->key , flags ); + if (newTail == NULL) + debugs(28, DBG_CRITICAL, "ERROR: Skipping regular expression. Compile failed: '" << wl->key << "'"); + else + Tail = newTail; } + wl = wl->next; + } +} + +static void +aclParseRegexList(relist **curlist) +{ + char *t; + wordlist *wl = NULL; - if ((errcode = regcomp(&comp, t, flags)) != 0) { - char errbuf[256]; - regerror(errcode, &comp, errbuf, sizeof errbuf); - debugs(28, 0, "" << cfg_filename << " line " << config_lineno << ": " << config_input_line); - debugs(28, 0, "aclParseRegexList: Invalid regular expression '" << t << "': " << errbuf); - continue; + debugs(28, 2, HERE << "aclParseRegexList: new Regex line or file"); + + while ((t = ConfigParser::strtokFile()) != NULL) { + t = removeUnnecessaryWildcards(t); + if (strlen(t) > BUFSIZ-1) { + debugs(28, DBG_CRITICAL, "" << cfg_filename << " line " << config_lineno << ": " << config_input_line); + debugs(28, DBG_CRITICAL, "ERROR: Skipping regular expression. Larger than " << BUFSIZ-1 << " characters: '" << wl->key << "'"); + } else { + debugs(28, 3, "aclParseRegexList: buffering RE '" << t << "'"); + wordlistAdd(&wl, t); } + } - q = (relist *)memAllocate(MEM_RELIST); - q->flags = flags; - q->pattern = xstrdup(t); - q->regex = comp; - *(Tail) = q; - Tail = &q->next; + if (!compileOptimisedREs(curlist, wl)) { + debugs(28, DBG_IMPORTANT, "WARNING: optimisation of regular expressions failed; using fallback method without optimisation"); + compileUnoptimisedREs(curlist, wl); } + + wordlistDestroy(&wl); } void -- 2.39.5