[thirdparty/squid.git] / src / acl / RegexData.cc

/*
 * Copyright (C) 1996-2017 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */

/*
 * Portions of this code are copyrighted and released under GPLv2+ by:
 * Copyright (c) 2011, Marcus Kool
 * Please add new claims to the CONTRIBUTORS file instead.
 */

/* DEBUG: section 28    Access Control */

#include "squid.h"
#include "acl/Acl.h"
#include "acl/Checklist.h"
#include "acl/RegexData.h"
#include "base/RegexPattern.h"
#include "ConfigParser.h"
#include "Debug.h"
#include "sbuf/Algorithms.h"
#include "sbuf/List.h"

/// No explicit cleanup is performed here; only the implicit destruction of
/// the members runs.
ACLRegexData::~ACLRegexData() = default;

/**
 * Test a string against the stored patterns, in list order.
 *
 * \param word    text to test; a null pointer never matches
 * \retval true   some stored pattern matched word
 * \retval false  word is null or no stored pattern matched
 */
bool
ACLRegexData::match(char const *word)
{
    if (word == nullptr)
        return false;

    debugs(28, 3, "checking '" << word << "'");

    // the first hit decides; later patterns are not evaluated
    for (auto &pattern : data) {
        if (!pattern.match(word))
            continue;

        debugs(28, 2, '\'' << pattern.c_str() << "' found in '" << word << '\'');
        // TODO: old code also popped the pattern to second place of the list
        // in order to reduce patterns search times.
        return true;
    }

    return false;
}

/**
 * List the stored patterns as configuration words.
 *
 * Starting from REG_EXTENDED|REG_NOSUB, a "-i" or "+i" word is emitted
 * before every pattern whose flags differ from those of the previous one:
 * "-i" when REG_ICASE is set in the new flags, "+i" otherwise.
 *
 * \return the flag words and pattern texts, in list order
 */
SBufList
ACLRegexData::dump() const
{
    SBufList words;
    int activeFlags = REG_EXTENDED | REG_NOSUB;

    for (auto &pattern : data) {
        if (pattern.flags != activeFlags) {
            const bool caseInsensitive = (pattern.flags & REG_ICASE) != 0;
            words.emplace_back(caseInsensitive ? "-i" : "+i");
            // remember what was announced so it is not repeated
            activeFlags = pattern.flags;
        }

        words.emplace_back(pattern.c_str());
    }

    return words;
}

/**
 * Skip wildcard sequences at the start of a regular expression.
 *
 * An optional "^.*" prefix is skipped first, followed by any number of
 * ".*" pairs. A WARNING is logged whenever the result differs from the
 * input or nothing is left of it.
 *
 * \param t  NUL-terminated expression text; the text itself is not modified
 * \return   a pointer into t just past the skipped prefix, or the literal
 *           ".*" when only the terminator remains
 */
static const char *
removeUnnecessaryWildcards(char * t)
{
    const char *cursor = t;

    if (strncmp(cursor, "^.*", 3) == 0)
        cursor += 3;

    /* NOTE: only complete ".*" pairs are skipped. A lone leading '.'
     * might seem unnecessary but is not; it can be a valid requirement
     * that cannot be optimised
     */
    while (cursor[0] == '.' && cursor[1] == '*')
        cursor += 2;

    const bool onlyWildcards = (*cursor == '\0');
    const bool trimmed = (cursor != t);
    if (!onlyWildcards && !trimmed)
        return cursor; // nothing was changed, so nothing to report

    // both warnings below are preceded by the configuration location
    debugs(28, DBG_IMPORTANT, cfg_filename << " line " << config_lineno << ": " << config_input_line);

    if (onlyWildcards) {
        debugs(28, DBG_IMPORTANT, "WARNING: regular expression '" << t << "' has only wildcards and matches all strings. Using '.*' instead.");
        return ".*";
    }

    debugs(28, DBG_IMPORTANT, "WARNING: regular expression '" << t << "' has unnecessary wildcard(s). Using '" << cursor << "' instead.");
    return cursor;
}

static bool
compileRE(std::list<RegexPattern> &curlist, const char * RE, int flags)
{
    if (RE == NULL || *RE == '\0')
        return curlist.empty(); // XXX: old code did this. It looks wrong.

    regex_t comp;
    if (int errcode = regcomp(&comp, RE, flags)) {
        char errbuf[256];
        regerror(errcode, &comp, errbuf, sizeof errbuf);
        debugs(28, DBG_CRITICAL, cfg_filename << " line " << config_lineno << ": " << config_input_line);
        debugs(28, DBG_CRITICAL, "ERROR: invalid regular expression: '" << RE << "': " << errbuf);
        return false;
    }
    debugs(28, 2, "compiled '" << RE << "' with flags " << flags);

    curlist.emplace_back(flags, RE);
    curlist.back().regex = comp;

    return true;
}

static bool
compileRE(std::list<RegexPattern> &curlist, const SBufList &RE, int flags)
{
    if (RE.empty())
        return curlist.empty(); // XXX: old code did this. It looks wrong.
    SBuf regexp;
    static const SBuf openparen("("), closeparen(")"), separator(")|(");
    JoinContainerIntoSBuf(regexp, RE.begin(), RE.end(), separator, openparen,
                          closeparen);
    return compileRE(curlist, regexp.c_str(), flags);
}

/** Compose and compile one large RE from a set of (small) REs.
 * The ultimate goal is to have only one RE per ACL so that match() is
 * called only once per ACL.
 */
static int
compileOptimisedREs(std::list<RegexPattern> &curlist, const SBufList &sl)
{
    std::list<RegexPattern> newlist;
    SBufList accumulatedRE;
    int numREs = 0, reSize = 0;
    int flags = REG_EXTENDED | REG_NOSUB;

    for (const SBuf & configurationLineWord : sl) {
        static const SBuf minus_i("-i");
        static const SBuf plus_i("+i");
        if (configurationLineWord == minus_i) {
            if (flags & REG_ICASE) {
                /* optimisation of  -i ... -i */
                debugs(28, 2, "optimisation of -i ... -i" );
            } else {
                debugs(28, 2, "-i" );
                if (!compileRE(newlist, accumulatedRE, flags))
                    return 0;
                flags |= REG_ICASE;
                accumulatedRE.clear();
                reSize = 0;
            }
            continue;
        } else if (configurationLineWord == plus_i) {
            if ((flags & REG_ICASE) == 0) {
                /* optimisation of  +i ... +i */
                debugs(28, 2, "optimisation of +i ... +i");
            } else {
                debugs(28, 2, "+i");
                if (!compileRE(newlist, accumulatedRE, flags))
                    return 0;
                flags &= ~REG_ICASE;
                accumulatedRE.clear();
                reSize = 0;
            }
            continue;
        }

        debugs(28, 2, "adding RE '" << configurationLineWord << "'");
        accumulatedRE.push_back(configurationLineWord);
        ++numREs;
        reSize += configurationLineWord.length();

        if (reSize > 1024) { // must be < BUFSIZ everything included
            debugs(28, 2, "buffer full, generating new optimised RE..." );
            if (!compileRE(newlist, accumulatedRE, flags))
                return 0;
            accumulatedRE.clear();
            reSize = 0;
            continue;    /* do the loop again to add the RE to largeRE */
        }
    }

    if (!compileRE(newlist, accumulatedRE, flags))
        return 0;

    accumulatedRE.clear();
    reSize = 0;

    /* all was successful, so put the new list at the tail */
    curlist.splice(curlist.end(), newlist);

    debugs(28, 2, numREs << " REs are optimised into one RE.");
    if (numREs > 100) {
        debugs(28, (opt_parse_cfg_only?DBG_IMPORTANT:2), cfg_filename << " line " << config_lineno << ": " << config_input_line);
        debugs(28, (opt_parse_cfg_only?DBG_IMPORTANT:2), "WARNING: there are more than 100 regular expressions. " <<
               "Consider using less REs or use rules without expressions like 'dstdomain'.");
    }

    return 1;
}

/**
 * Compile every pattern word on its own, without combining them.
 *
 * "-i" switches REG_ICASE on and "+i" switches it off for the words that
 * follow. A word for which compileRE() returns false is reported with an
 * ERROR and the loop carries on with the next word.
 *
 * \param curlist  list that receives the compiled patterns
 * \param sl       configuration words: patterns mixed with "-i" and "+i"
 */
static void
compileUnoptimisedREs(std::list<RegexPattern> &curlist, const SBufList &sl)
{
    int flags = REG_EXTENDED | REG_NOSUB;

    static const SBuf minus_i("-i"), plus_i("+i");
    // NOTE(review): the word is taken by value; presumably because c_str()
    // needs a non-const SBuf -- confirm before switching to a reference
    for (auto word : sl) {
        if (word == minus_i) {
            flags |= REG_ICASE;
            continue;
        }

        if (word == plus_i) {
            flags &= ~REG_ICASE;
            continue;
        }

        const bool added = compileRE(curlist, word.c_str() , flags);
        if (!added)
            debugs(28, DBG_CRITICAL, "ERROR: Skipping regular expression. "
                   "Compile failed: '" << word << "'");
    }
}

void
ACLRegexData::parse()
{
    debugs(28, 2, "new Regex line or file");

    SBufList sl;
    while (char *t = ConfigParser::RegexStrtokFile()) {
        const char *clean = removeUnnecessaryWildcards(t);
        if (strlen(clean) > BUFSIZ-1) {
            debugs(28, DBG_CRITICAL, cfg_filename << " line " << config_lineno << ": " << config_input_line);
            debugs(28, DBG_CRITICAL, "ERROR: Skipping regular expression. Larger than " << BUFSIZ-1 << " characters: '" << clean << "'");
        } else {
            debugs(28, 3, "buffering RE '" << clean << "'");
            sl.emplace_back(clean);
        }
    }

    if (!compileOptimisedREs(data, sl)) {
        debugs(28, DBG_IMPORTANT, "WARNING: optimisation of regular expressions failed; using fallback method without optimisation");
        compileUnoptimisedREs(data, sl);
    }
}

/// \return true when the pattern list holds no entries
bool
ACLRegexData::empty() const
{
    const bool noPatterns = data.empty();
    return noPatterns;
}

/**
 * Create a new ACLRegexData object.
 *
 * Patterns are not copied: the assertion requires this object's pattern
 * list to be empty, and the returned object is default-constructed.
 *
 * \return a pointer to the object allocated with new
 */
ACLData<char const *> *
ACLRegexData::clone() const
{
    /* Regex's don't clone yet. */
    assert(data.empty());

    ACLData<char const *> *copy = new ACLRegexData;
    return copy;
}
Commit	Line	Data
	1	/*
	2	* Copyright (C) 1996-2017 The Squid Software Foundation and contributors
	3	*
	4	* Squid software is distributed under GPLv2+ license and includes
	5	* contributions from numerous individuals and organizations.
	6	* Please see the COPYING and CONTRIBUTORS files for details.
	7	*/
	8
	9	/*
	10	* Portions of this code are copyrighted and released under GPLv2+ by:
	11	* Copyright (c) 2011, Marcus Kool
	12	* Please add new claims to the CONTRIBUTORS file instead.
	13	*/
	14
	15	/* DEBUG: section 28 Access Control */
	16
	17	#include "squid.h"
	18	#include "acl/Acl.h"
	19	#include "acl/Checklist.h"
	20	#include "acl/RegexData.h"
	21	#include "base/RegexPattern.h"
	22	#include "ConfigParser.h"
	23	#include "Debug.h"
	24	#include "sbuf/Algorithms.h"
	25	#include "sbuf/List.h"
	26
	27	ACLRegexData::~ACLRegexData()
	28	{
	29	}
	30
	31	bool
	32	ACLRegexData::match(char const *word)
	33	{
	34	if (!word)
	35	return 0;
	36
	37	debugs(28, 3, "checking '" << word << "'");
	38
	39	// walk the list of patterns to see if one matches
	40	for (auto &i : data) {
	41	if (i.match(word)) {
	42	debugs(28, 2, '\'' << i.c_str() << "' found in '" << word << '\'');
	43	// TODO: old code also popped the pattern to second place of the list
	44	// in order to reduce patterns search times.
	45	return 1;
	46	}
	47	}
	48
	49	return 0;
	50	}
	51
	52	SBufList
	53	ACLRegexData::dump() const
	54	{
	55	SBufList sl;
	56	int flags = REG_EXTENDED \| REG_NOSUB;
	57
	58	// walk and dump the list
	59	// keeping the flags values consistent
	60	for (auto &i : data) {
	61	if (i.flags != flags) {
	62	if ((i.flags&REG_ICASE) != 0) {
	63	sl.emplace_back("-i");
	64	} else {
	65	sl.emplace_back("+i");
	66	}
	67	flags = i.flags;
	68	}
	69
	70	sl.emplace_back(i.c_str());
	71	}
	72
	73	return sl;
	74	}
	75
	76	static const char *
	77	removeUnnecessaryWildcards(char * t)
	78	{
	79	char * orig = t;
	80
	81	if (strncmp(t, "^.*", 3) == 0)
	82	t += 3;
	83
	84	/* NOTE: an initial '.' might seem unnessary but is not;
	85	* it can be a valid requirement that cannot be optimised
	86	*/
	87	while (t == '.' && (t+1) == '*') {
	88	t += 2;
	89	}
	90
	91	if (*t == '\0') {
	92	debugs(28, DBG_IMPORTANT, cfg_filename << " line " << config_lineno << ": " << config_input_line);
	93	debugs(28, DBG_IMPORTANT, "WARNING: regular expression '" << orig << "' has only wildcards and matches all strings. Using '.*' instead.");
	94	return ".*";
	95	}
	96	if (t != orig) {
	97	debugs(28, DBG_IMPORTANT, cfg_filename << " line " << config_lineno << ": " << config_input_line);
	98	debugs(28, DBG_IMPORTANT, "WARNING: regular expression '" << orig << "' has unnecessary wildcard(s). Using '" << t << "' instead.");
	99	}
	100
	101	return t;
	102	}
	103
	104	static bool
	105	compileRE(std::list<RegexPattern> &curlist, const char * RE, int flags)
	106	{
	107	if (RE == NULL \|\| *RE == '\0')
	108	return curlist.empty(); // XXX: old code did this. It looks wrong.
	109
	110	regex_t comp;
	111	if (int errcode = regcomp(&comp, RE, flags)) {
	112	char errbuf[256];
	113	regerror(errcode, &comp, errbuf, sizeof errbuf);
	114	debugs(28, DBG_CRITICAL, cfg_filename << " line " << config_lineno << ": " << config_input_line);
	115	debugs(28, DBG_CRITICAL, "ERROR: invalid regular expression: '" << RE << "': " << errbuf);
	116	return false;
	117	}
	118	debugs(28, 2, "compiled '" << RE << "' with flags " << flags);
	119
	120	curlist.emplace_back(flags, RE);
	121	curlist.back().regex = comp;
	122
	123	return true;
	124	}
	125
	126	static bool
	127	compileRE(std::list<RegexPattern> &curlist, const SBufList &RE, int flags)
	128	{
	129	if (RE.empty())
	130	return curlist.empty(); // XXX: old code did this. It looks wrong.
	131	SBuf regexp;
	132	static const SBuf openparen("("), closeparen(")"), separator(")\|(");
	133	JoinContainerIntoSBuf(regexp, RE.begin(), RE.end(), separator, openparen,
	134	closeparen);
	135	return compileRE(curlist, regexp.c_str(), flags);
	136	}
	137
	138	/** Compose and compile one large RE from a set of (small) REs.
	139	* The ultimate goal is to have only one RE per ACL so that match() is
	140	* called only once per ACL.
	141	*/
	142	static int
	143	compileOptimisedREs(std::list<RegexPattern> &curlist, const SBufList &sl)
	144	{
	145	std::list<RegexPattern> newlist;
	146	SBufList accumulatedRE;
	147	int numREs = 0, reSize = 0;
	148	int flags = REG_EXTENDED \| REG_NOSUB;
	149
	150	for (const SBuf & configurationLineWord : sl) {
	151	static const SBuf minus_i("-i");
	152	static const SBuf plus_i("+i");
	153	if (configurationLineWord == minus_i) {
	154	if (flags & REG_ICASE) {
	155	/* optimisation of -i ... -i */
	156	debugs(28, 2, "optimisation of -i ... -i" );
	157	} else {
	158	debugs(28, 2, "-i" );
	159	if (!compileRE(newlist, accumulatedRE, flags))
	160	return 0;
	161	flags \|= REG_ICASE;
	162	accumulatedRE.clear();
	163	reSize = 0;
	164	}
	165	continue;
	166	} else if (configurationLineWord == plus_i) {
	167	if ((flags & REG_ICASE) == 0) {
	168	/* optimisation of +i ... +i */
	169	debugs(28, 2, "optimisation of +i ... +i");
	170	} else {
	171	debugs(28, 2, "+i");
	172	if (!compileRE(newlist, accumulatedRE, flags))
	173	return 0;
	174	flags &= ~REG_ICASE;
	175	accumulatedRE.clear();
	176	reSize = 0;
	177	}
	178	continue;
	179	}
	180
	181	debugs(28, 2, "adding RE '" << configurationLineWord << "'");
	182	accumulatedRE.push_back(configurationLineWord);
	183	++numREs;
	184	reSize += configurationLineWord.length();
	185
	186	if (reSize > 1024) { // must be < BUFSIZ everything included
	187	debugs(28, 2, "buffer full, generating new optimised RE..." );
	188	if (!compileRE(newlist, accumulatedRE, flags))
	189	return 0;
	190	accumulatedRE.clear();
	191	reSize = 0;
	192	continue; /* do the loop again to add the RE to largeRE */
	193	}
	194	}
	195
	196	if (!compileRE(newlist, accumulatedRE, flags))
	197	return 0;
	198
	199	accumulatedRE.clear();
	200	reSize = 0;
	201
	202	/* all was successful, so put the new list at the tail */
	203	curlist.splice(curlist.end(), newlist);
	204
	205	debugs(28, 2, numREs << " REs are optimised into one RE.");
	206	if (numREs > 100) {
	207	debugs(28, (opt_parse_cfg_only?DBG_IMPORTANT:2), cfg_filename << " line " << config_lineno << ": " << config_input_line);
	208	debugs(28, (opt_parse_cfg_only?DBG_IMPORTANT:2), "WARNING: there are more than 100 regular expressions. " <<
	209	"Consider using less REs or use rules without expressions like 'dstdomain'.");
	210	}
	211
	212	return 1;
	213	}
	214
	215	static void
	216	compileUnoptimisedREs(std::list<RegexPattern> &curlist, const SBufList &sl)
	217	{
	218	int flags = REG_EXTENDED \| REG_NOSUB;
	219
	220	static const SBuf minus_i("-i"), plus_i("+i");
	221	for (auto configurationLineWord : sl) {
	222	if (configurationLineWord == minus_i) {
	223	flags \|= REG_ICASE;
	224	} else if (configurationLineWord == plus_i) {
	225	flags &= ~REG_ICASE;
	226	} else {
	227	if (!compileRE(curlist, configurationLineWord.c_str() , flags))
	228	debugs(28, DBG_CRITICAL, "ERROR: Skipping regular expression. "
	229	"Compile failed: '" << configurationLineWord << "'");
	230	}
	231	}
	232	}
	233
	234	void
	235	ACLRegexData::parse()
	236	{
	237	debugs(28, 2, "new Regex line or file");
	238
	239	SBufList sl;
	240	while (char *t = ConfigParser::RegexStrtokFile()) {
	241	const char *clean = removeUnnecessaryWildcards(t);
	242	if (strlen(clean) > BUFSIZ-1) {
	243	debugs(28, DBG_CRITICAL, cfg_filename << " line " << config_lineno << ": " << config_input_line);
	244	debugs(28, DBG_CRITICAL, "ERROR: Skipping regular expression. Larger than " << BUFSIZ-1 << " characters: '" << clean << "'");
	245	} else {
	246	debugs(28, 3, "buffering RE '" << clean << "'");
	247	sl.emplace_back(clean);
	248	}
	249	}
	250
	251	if (!compileOptimisedREs(data, sl)) {
	252	debugs(28, DBG_IMPORTANT, "WARNING: optimisation of regular expressions failed; using fallback method without optimisation");
	253	compileUnoptimisedREs(data, sl);
	254	}
	255	}
	256
	257	bool
	258	ACLRegexData::empty() const
	259	{
	260	return data.empty();
	261	}
	262
	263	ACLData<char const >
	264	ACLRegexData::clone() const
	265	{
	266	/* Regex's don't clone yet. */
	267	assert(data.empty());
	268	return new ACLRegexData;
	269	}
	270