From: Amos Jeffries Date: Wed, 29 Jul 2015 07:11:17 +0000 (-0700) Subject: SourceLayout: refactor regex pattern objects X-Git-Tag: merge-candidate-3-v1~24^2~2 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=95b8eae2c38635e41ec0c982ee8794a2b591beba;p=thirdparty%2Fsquid.git SourceLayout: refactor regex pattern objects * moves the regex pattern state storage to class RegexPattern in base/RegexPattern.h which is MEMPROXY_CLASS pooled and constructed with flags and pattern preset. - for now the regcomp generated data is set separately. * Replaces ACL storage class RegexList with a std::list * converts refresh_pattern regex data to class RegexPattern for its pattern and -i/+i flag details. --- 95b8eae2c38635e41ec0c982ee8794a2b591beba diff --cc src/RefreshPattern.h index e835dd6be9,e835dd6be9..8812253474 --- a/src/RefreshPattern.h +++ b/src/RefreshPattern.h @@@ -9,21 -9,21 +9,48 @@@ #ifndef SQUID_REFRESHPATTERN_H_ #define SQUID_REFRESHPATTERN_H_ --#include "compat/GnuRegex.h" ++#include "base/RegexPattern.h" --/// a representation of a refresh pattern. Currently a POD. ++/// a representation of a refresh pattern. class RefreshPattern { ++ MEMPROXY_CLASS(RefreshPattern); ++ public: -- const char *pattern; -- regex_t compiled_pattern; ++ ++ /* ++ * Defaults: ++ * MIN NONE ++ * PCT 20% ++ * MAX 3 days ++ */ ++#define REFRESH_DEFAULT_MAX static_cast(259200) ++ ++ RefreshPattern(const char *aPattern, const decltype(RegexPattern::flags) &reFlags) : ++ pattern(reFlags, aPattern), ++ min(0), pct(0.20), max(REFRESH_DEFAULT_MAX), ++ next(NULL), ++ max_stale(0) ++ { ++ memset(&flags, 0, sizeof(flags)); ++ } ++ ++ ~RefreshPattern() { ++ while (RefreshPattern *t = next) { ++ next = t->next; ++ t->next = nullptr; ++ delete t; ++ } ++ } ++ // ~RefreshPattern() default destructor is fine ++ ++ RegexPattern pattern; time_t min; double pct; time_t max; RefreshPattern *next; struct { -- bool icase; bool refresh_ims; bool store_stale; #if USE_HTTP_VIOLATIONS @@@ -39,6 -39,6 +66,8 @@@ // statistics about how many matches this pattern has had mutable struct stats_ { ++ stats_() : matchTests(0), matchCount(0) {} ++ uint64_t matchTests; uint64_t matchCount; // TODO: some stats to indicate how useful/less the flags are would be nice. diff --cc src/acl/RegexData.cc index 3103b3c8e0,951da42b01..d48cf01867 --- a/src/acl/RegexData.cc +++ b/src/acl/RegexData.cc @@@ -44,37 -30,19 +30,19 @@@ ACLRegexData::~ACLRegexData( bool ACLRegexData::match(char const *word) { - if (word == NULL) + if (!word) return 0; - debugs(28, 3, "aclRegexData::match: checking '" << word << "'"); - - RegexList *first, *prev; - - first = data; - - prev = NULL; + debugs(28, 3, "checking '" << word << "'"); - RegexList *current = first; - - while (current) { - debugs(28, 3, "aclRegexData::match: looking for '" << current->pattern << "'"); - - if (regexec(¤t->regex, word, 0, 0, 0) == 0) { - if (prev != NULL) { - /* shift the element just found to the second position - * in the list */ - prev->next = current->next; - current->next = first->next; - first->next = current; - } - - debugs(28, 2, "aclRegexData::match: match '" << current->pattern << "' found in '" << word << "'"); + // walk the list of patterns to see if one matches + for (auto &i : data) { - if (regexec(&i.regex, word, 0, 0, 0) == 0) { - debugs(28, 2, "'" << i.pattern << "' found in '" << word << "'"); ++ if (i.match(word)) { ++ debugs(28, 2, "'" << i.c_str() << "' found in '" << word << "'"); + // TODO: old code also popped the pattern to second place of the list + // in order to reduce patterns search times. return 1; } - - prev = current; - current = current->next; } return 0; @@@ -94,11 -63,10 +63,10 @@@ ACLRegexData::dump() cons } else { sl.push_back(SBuf("+i")); } - flags = temp->flags; + flags = i.flags; } - sl.push_back(SBuf(temp->pattern)); - temp = temp->next; - sl.push_back(SBuf(i.pattern)); ++ sl.push_back(SBuf(i.c_str())); } return sl; @@@ -162,7 -123,7 +123,7 @@@ compileRE(std::list &curl } /** Compose and compile one large RE from a set of (small) REs. -- * The ultimate goal is to have only one RE per ACL so that regexec() is ++ * The ultimate goal is to have only one RE per ACL so that match() is * called only once per ACL. */ static int diff --cc src/base/Makefile.am index f0a3a23fe3,72643db253..dc6811c0ca --- a/src/base/Makefile.am +++ b/src/base/Makefile.am @@@ -25,9 -25,10 +25,11 @@@ libbase_la_SOURCES = CharacterSet.cc \ InstanceId.h \ Lock.h \ + LookupTable.h \ LruMap.h \ Packable.h \ + RegexPattern.cc \ + RegexPattern.h \ RunnersRegistry.cc \ RunnersRegistry.h \ Subscription.h \ diff --cc src/base/RegexPattern.cc index 71f520d322,e70879ac4d..355e1e9813 --- a/src/base/RegexPattern.cc +++ b/src/base/RegexPattern.cc @@@ -7,5 -7,10 +7,17 @@@ */ #include "squid.h" - #include "RegexList.h" + #include "base/RegexPattern.h" ++RegexPattern::RegexPattern(int aFlags, const char *aPattern) : ++ flags(aFlags), ++ pattern(xstrdup(aPattern)) ++{ ++ memset(®ex, 0, sizeof(regex)); ++} ++ + RegexPattern::~RegexPattern() + { + xfree(pattern); + regfree(®ex); + } diff --cc src/base/RegexPattern.h index 0000000000,180488b147..ad83063c88 mode 000000,100644..100644 --- a/src/base/RegexPattern.h +++ b/src/base/RegexPattern.h @@@ -1,0 -1,35 +1,42 @@@ + /* + * Copyright (C) 1996-2015 The Squid Software Foundation and contributors + * + * Squid software is distributed under GPLv2+ license and includes + * contributions from numerous individuals and organizations. + * Please see the COPYING and CONTRIBUTORS files for details. + */ + + #ifndef SQUID_SRC_BASE_REGEXPATTERN_H + #define SQUID_SRC_BASE_REGEXPATTERN_H + ++#include "compat/GnuRegex.h" + #include "mem/forward.h" + + /** + * A regular expression, + * plain text and compiled representations + */ + class RegexPattern + { + MEMPROXY_CLASS(RegexPattern); + + public: + RegexPattern() = delete; - RegexPattern(int aFlags, const char *aPattern) : flags(aFlags), pattern(xstrdup(aPattern)) {} ++ RegexPattern(int aFlags, const char *aPattern); + RegexPattern(const RegexPattern &) = delete; + RegexPattern(RegexPattern &&) = default; + ~RegexPattern(); + ++ const char * c_str() const {return pattern;} ++ bool match(const char *str) const {return regexec(®ex,str,0,NULL,0)==0;} ++ ++public: + int flags; - char *pattern; + regex_t regex; ++ ++private: ++ char *pattern; + }; + + #endif /* SQUID_SRC_BASE_REGEXPATTERN_H */ + diff --cc src/cache_cf.cc index 5eafe10450,5eafe10450..59cb9a41b9 --- a/src/cache_cf.cc +++ b/src/cache_cf.cc @@@ -2557,8 -2557,8 +2557,8 @@@ dump_refreshpattern(StoreEntry * entry while (head != NULL) { storeAppendPrintf(entry, "%s%s %s %d %d%% %d", name, -- head->flags.icase ? " -i" : null_string, -- head->pattern, ++ head->pattern.flags®_ICASE ? " -i" : null_string, ++ head->pattern.c_str(), (int) head->min / 60, (int) (100.0 * head->pct + 0.5), (int) head->max / 60); @@@ -2728,16 -2728,16 +2728,12 @@@ parse_refreshpattern(RefreshPattern ** pct = pct < 0.0 ? 0.0 : pct; max = max < 0 ? 0 : max; -- t = static_cast(xcalloc(1, sizeof(RefreshPattern))); -- t->pattern = (char *) xstrdup(pattern); -- t->compiled_pattern = comp; ++ t = new RefreshPattern(pattern, flags); ++ t->pattern.regex = comp; t->min = min; t->pct = pct; t->max = max; -- if (flags & REG_ICASE) -- t->flags.icase = true; -- if (refresh_ims) t->flags.refresh_ims = true; @@@ -2774,20 -2774,20 +2770,14 @@@ *head = t; -- safe_free(pattern); ++ xfree(pattern); } static void free_refreshpattern(RefreshPattern ** head) { -- RefreshPattern *t; -- -- while ((t = *head) != NULL) { -- *head = t->next; -- safe_free(t->pattern); -- regfree(&t->compiled_pattern); -- safe_free(t); -- } ++ delete *head; ++ *head = nullptr; #if USE_HTTP_VIOLATIONS refresh_nocache_hack = 0; diff --cc src/refresh.cc index 2dcbff8b58,2dcbff8b58..e558224160 --- a/src/refresh.cc +++ b/src/refresh.cc @@@ -79,42 -79,42 +79,28 @@@ static struct RefreshCounts int status[STALE_DEFAULT + 1]; } refreshCounts[rcCount]; --/* -- * Defaults: -- * MIN NONE -- * PCT 20% -- * MAX 3 days -- */ --#define REFRESH_DEFAULT_MIN (time_t)0 --#define REFRESH_DEFAULT_PCT 0.20 --#define REFRESH_DEFAULT_MAX (time_t)259200 -- static const RefreshPattern *refreshUncompiledPattern(const char *); static OBJH refreshStats; static int refreshStaleness(const StoreEntry * entry, time_t check_time, const time_t age, const RefreshPattern * R, stale_flags * sf); --static RefreshPattern DefaultRefresh; ++static RefreshPattern DefaultRefresh("", 0); /** Locate the first refresh_pattern rule that matches the given URL by regex. * -- * \note regexec() returns 0 if matched, and REG_NOMATCH otherwise -- * -- * \return A pointer to the refresh_pattern parameters to use, or NULL if there is no match. ++ * \return A pointer to the refresh_pattern parameters to use, or nullptr if there is no match. */ const RefreshPattern * refreshLimits(const char *url) { -- const RefreshPattern *R; -- -- for (R = Config.Refresh; R; R = R->next) { ++ for (auto R = Config.Refresh; R; R = R->next) { ++(R->stats.matchTests); -- if (!regexec(&(R->compiled_pattern), url, 0, 0, 0)) { ++ if (R->pattern.match(url)) { ++(R->stats.matchCount); return R; } } -- return NULL; ++ return nullptr; } /** Locate the first refresh_pattern rule that has the given uncompiled regex. @@@ -123,19 -123,19 +109,17 @@@ * This function is only ever called if there is no URI. Because a regex match is impossible, Squid * forces the "." rule to apply (if it exists) * -- * \return A pointer to the refresh_pattern parameters to use, or NULL if there is no match. ++ * \return A pointer to the refresh_pattern parameters to use, or nullptr if there is no match. */ static const RefreshPattern * refreshUncompiledPattern(const char *pat) { -- const RefreshPattern *R; -- -- for (R = Config.Refresh; R; R = R->next) { -- if (0 == strcmp(R->pattern, pat)) ++ for (auto R = Config.Refresh; R; R = R->next) { ++ if (0 == strcmp(R->pattern.c_str(), pat)) return R; } -- return NULL; ++ return nullptr; } /** @@@ -318,7 -318,7 +302,7 @@@ refreshCheck(const StoreEntry * entry, if (NULL == R) R = &DefaultRefresh; -- debugs(22, 3, "Matched '" << R->pattern << " " << ++ debugs(22, 3, "Matched '" << R->pattern.c_str() << " " << (int) R->min << " " << (int) (100.0 * R->pct) << "%% " << (int) R->max << "'"); @@@ -709,8 -709,8 +693,8 @@@ refreshStats(StoreEntry * sentry R->stats.matchCount, R->stats.matchTests, xpercent(R->stats.matchCount, R->stats.matchTests), -- (R->flags.icase ? "-i " : ""), -- R->pattern); ++ (R->pattern.flags®_ICASE ? "-i " : ""), ++ R->pattern.c_str()); } int i; @@@ -762,12 -762,12 +746,6 @@@ refreshInit(void refreshCounts[rcCDigest].proto = "Cache Digests"; #endif -- memset(&DefaultRefresh, '\0', sizeof(DefaultRefresh)); -- DefaultRefresh.pattern = ""; -- DefaultRefresh.min = REFRESH_DEFAULT_MIN; -- DefaultRefresh.pct = REFRESH_DEFAULT_PCT; -- DefaultRefresh.max = REFRESH_DEFAULT_MAX; -- refreshRegisterWithCacheManager(); } diff --cc test-suite/squidconf/regex index 0000000000,3eca26e204..3f85acca39 mode 000000,100644..100644 --- a/test-suite/squidconf/regex +++ b/test-suite/squidconf/regex @@@ -1,0 -1,20 +1,24 @@@ + ## Copyright (C) 1996-2015 The Squid Software Foundation and contributors + ## + ## Squid software is distributed under GPLv2+ license and includes + ## contributions from numerous individuals and organizations. + ## Please see the COPYING and CONTRIBUTORS files for details. + ## + + # + # This file contains the list of regular expression syntaxes used + # it covers various uses of: + # unoptimized multi-line patterns + # + # Some other regression related patterns are tested in regressions-3.4.0.1 + # + + acl G dstdom_regex \.g...l\.com$ + acl G dstdom_regex \.g...le\.com$ + + acl B browser ^Mozilla + acl B browser ^Java/[0-9]+(\.[0-9]+)? ++ ++# invalid pattern - this should ERROR ++acl foo browser * ++