From: Willy Tarreau Date: Sun, 29 Apr 2012 13:39:40 +0000 (+0200) Subject: MEDIUM: pattern: add the "base" sample fetch method X-Git-Tag: v1.5-dev12~137 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=a7ad50cdb1056c23602770ef62860c30a6f3a016;p=thirdparty%2Fhaproxy.git MEDIUM: pattern: add the "base" sample fetch method This one returns the concatenation of the first Host header entry with the path. It can make content-switching rules easier, help with fighting DDoS on certain URLs and improve shared caches efficiency. --- diff --git a/doc/configuration.txt b/doc/configuration.txt index 0d748e4605..66abe40fba 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -7968,6 +7968,47 @@ application layer (layer 7). Those require that a full HTTP request has been read, and are only evaluated then. They may require slightly more CPU resources than the layer 4 ones, but not much since the request and response are indexed. +base + Returns true when the concatenation of the first Host header and the path + part of the request, which starts at the first slash and ends before the + question mark, equals one of the strings. It may be used to match known + files in virtual hosting environments, such as "www.example.com/favicon.ico". + See also "path" and "uri". + +base_beg + Returns true when the base (see above) begins with one of the strings. This + can be used to send certain directory names to alternative backends. See also + "path_beg". + +base_dir + Returns true when one of the strings is found isolated or delimited with + slashes in the base (see above). Probably of little use, see "url_dir" and + "path_dir" instead. + +base_dom + Returns true when one of the strings is found isolated or delimited with dots + in the base (see above). Probably of little use, see "path_dom" and "url_dom" + instead. + +base_end + Returns true when the base (see above) ends with one of the strings. 
This may + be used to control file name extension, though "path_end" is cheaper. + +base_len + Returns true when the base (see above) length matches the values or ranges + specified. This may be used to detect abusive requests for instance. + +base_reg + Returns true when the base (see above) matches one of the regular + expressions. It can be used any time, but it is important to remember that + regex matching is slower than other methods. See also "path_reg", "url_reg" + and all "base_" criteria. + +base_sub + Returns true when the base (see above) contains one of the strings. It can be + used to detect particular patterns in paths, such as "../" for example. See + also "base_dir". + cook() All "cook*" matching criteria inspect all "Cookie" headers to find a cookie with the name between parenthesis. If multiple occurrences of the cookie are @@ -8209,11 +8250,12 @@ status url Applies to the whole URL passed in the request. The only real use is to match - "*", for which there already is a predefined ACL. + "*", for which there already is a predefined ACL. See also "base". url_beg Returns true when the URL begins with one of the strings. This can be used to - check whether a URL begins with a slash or with a protocol scheme. + check whether a URL begins with a slash or with a protocol scheme. See also + "base_beg". url_dir Returns true when one of the strings is found isolated or delimited with @@ -8248,7 +8290,7 @@ url_port url_reg Returns true when the URL matches one of the regular expressions. It can be used any time, but it is important to remember that regex matching is slower - than other methods. See also "path_reg" and all "url_" criteria. + than other methods. See also "base_reg", "path_reg" and all "url_" criteria. url_sub Returns true when the URL contains one of the strings. It can be used to @@ -8421,6 +8463,14 @@ equivalent used in ACLs. 
The list of currently supported pattern fetch functions is the following : + base This returns the concatenation of the first Host header and the + path part of the request, which starts at the first slash and + ends before the question mark. It can be useful in virtual + hosted environments to detect URL abuses as well as to improve + shared cache efficiency. Using this with a limited size stick + table also allows one to collect statistics about the most commonly + requested objects by host/path. + src This is the source IPv4 address of the client of the session. It is of type IPv4 and works on both IPv4 and IPv6 tables. On IPv6 tables, IPv4 address is mapped to its IPv6 equivalent, diff --git a/src/proto_http.c b/src/proto_http.c index 082b53032c..820f6434f2 100644 --- a/src/proto_http.c +++ b/src/proto_http.c @@ -8041,6 +8041,50 @@ smp_fetch_path(struct proxy *px, struct session *l4, void *l7, unsigned int opt, return 1; } +/* This produces a concatenation of the first occurrence of the Host header + * followed by the path component if it begins with a slash ('/'). This means + * that '*' will not be added, resulting in exactly the first Host entry. + * If no Host header is found, then the path is returned as-is. The returned + * value is stored in the trash so it does not need to be marked constant. 
+ */ +static int +smp_fetch_base(struct proxy *px, struct session *l4, void *l7, unsigned int opt, + const struct arg *args, struct sample *smp) +{ + struct http_txn *txn = l7; + char *ptr, *end, *beg; + struct hdr_ctx ctx; + + CHECK_HTTP_MESSAGE_FIRST(); + + ctx.idx = 0; + if (!http_find_header2("Host", 4, txn->req.buf->p + txn->req.sol, &txn->hdr_idx, &ctx) || + !ctx.vlen) + return smp_fetch_path(px, l4, l7, opt, args, smp); + + /* OK we have the header value in ctx.line+ctx.val for ctx.vlen bytes */ + memcpy(trash, ctx.line + ctx.val, ctx.vlen); + smp->type = SMP_T_STR; + smp->data.str.str = trash; + smp->data.str.len = ctx.vlen; + + /* now retrieve the path */ + end = txn->req.buf->p + txn->req.sol + txn->req.sl.rq.u + txn->req.sl.rq.u_l; + beg = http_get_path(txn); + if (!beg) + beg = end; + + for (ptr = beg; ptr < end && *ptr != '?'; ptr++); + + if (beg < ptr && *beg == '/') { + memcpy(smp->data.str.str + smp->data.str.len, beg, ptr - beg); + smp->data.str.len += ptr - beg; + } + + smp->flags = SMP_F_VOL_1ST; + return 1; +} + static int acl_fetch_proto_http(struct proxy *px, struct session *l4, void *l7, unsigned int opt, const struct arg *args, struct sample *smp) @@ -8530,6 +8574,15 @@ static int val_hdr(struct arg *arg, char **err_msg) * Please take care of keeping this list alphabetically sorted. 
*/ static struct acl_kw_list acl_kws = {{ },{ + { "base", acl_parse_str, smp_fetch_base, acl_match_str, ACL_USE_L7REQ_VOLATILE|ACL_MAY_LOOKUP, 0 }, + { "base_beg", acl_parse_str, smp_fetch_base, acl_match_beg, ACL_USE_L7REQ_VOLATILE, 0 }, + { "base_dir", acl_parse_str, smp_fetch_base, acl_match_dir, ACL_USE_L7REQ_VOLATILE, 0 }, + { "base_dom", acl_parse_str, smp_fetch_base, acl_match_dom, ACL_USE_L7REQ_VOLATILE, 0 }, + { "base_end", acl_parse_str, smp_fetch_base, acl_match_end, ACL_USE_L7REQ_VOLATILE, 0 }, + { "base_len", acl_parse_int, smp_fetch_base, acl_match_len, ACL_USE_L7REQ_VOLATILE, 0 }, + { "base_reg", acl_parse_reg, smp_fetch_base, acl_match_reg, ACL_USE_L7REQ_VOLATILE, 0 }, + { "base_sub", acl_parse_str, smp_fetch_base, acl_match_sub, ACL_USE_L7REQ_VOLATILE, 0 }, + { "cook", acl_parse_str, smp_fetch_cookie, acl_match_str, ACL_USE_L7REQ_VOLATILE|ACL_MAY_LOOKUP, ARG1(0,STR) }, { "cook_beg", acl_parse_str, smp_fetch_cookie, acl_match_beg, ACL_USE_L7REQ_VOLATILE, ARG1(0,STR) }, { "cook_cnt", acl_parse_int, acl_fetch_cookie_cnt, acl_match_int, ACL_USE_L7REQ_VOLATILE, ARG1(0,STR) }, @@ -8627,6 +8680,7 @@ static struct acl_kw_list acl_kws = {{ },{ /* Note: must not be declared as its list will be overwritten */ static struct sample_fetch_kw_list sample_fetch_keywords = {{ },{ { "hdr", smp_fetch_hdr, ARG2(1,STR,SINT), val_hdr, SMP_T_CSTR, SMP_CAP_REQ }, + { "base", smp_fetch_base, 0, NULL, SMP_T_CSTR, SMP_CAP_REQ }, { "path", smp_fetch_path, 0, NULL, SMP_T_CSTR, SMP_CAP_REQ }, { "url", smp_fetch_url, 0, NULL, SMP_T_CSTR, SMP_CAP_REQ }, { "url_ip", smp_fetch_url_ip, 0, NULL, SMP_T_IPV4, SMP_CAP_REQ },